utf8-cleaner 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b6e265c0aefe9f9c6fa6082b86f21c81a92400b6
4
+ data.tar.gz: 12dd12dceef125122d2877dbc9e9e597055ddfac
5
+ SHA512:
6
+ metadata.gz: b794d47e0c9460ef5ec9eadc9340f705a491b0c48370efb82310d2bf21ff7713d655a4fdbb312c4788913879476be5a4c8823e8f494d8a2fe7376ecc4dd58a6c
7
+ data.tar.gz: 0bf1fb1a6d23600a3f5ff4ae7d1f62c7fded5a0d7d0233ebc85103b12c51a849a7ef83708a21568a28bd005e8ea1199ffff6b0d05b0c186a8889624e190f5e19
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
data/Guardfile ADDED
@@ -0,0 +1,9 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :rspec do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/utf8-cleaner/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+ end
9
+
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # UTF8Cleaner
2
2
 
3
+ [<img src="https://secure.travis-ci.org/singlebrook/utf8-cleaner.png" />](http://travis-ci.org/singlebrook/utf8-cleaner)
4
+
3
5
  Removes invalid UTF-8 characters from the environment so that your app doesn't choke
4
6
  on them. This prevents errors like "invalid byte sequence in UTF-8".
5
7
 
data/Rakefile CHANGED
@@ -1 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ task :default do
4
+ sh "rspec spec"
5
+ end
data/lib/utf8-cleaner.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require "utf8-cleaner/version"
2
2
  require "utf8-cleaner/middleware"
3
+ require "utf8-cleaner/uri_string"
3
4
  require "utf8-cleaner/railtie" if defined? Rails
@@ -7,6 +7,7 @@ module UTF8Cleaner
7
7
  "QUERY_STRING",
8
8
  "REQUEST_PATH",
9
9
  "REQUEST_URI",
10
+ "HTTP_COOKIE"
10
11
  ]
11
12
 
12
13
  def initialize(app)
@@ -19,42 +20,22 @@ module UTF8Cleaner
19
20
 
20
21
  private
21
22
 
22
- def sanitize_env(env)
23
- SANITIZE_ENV_KEYS.each do |key|
24
- next unless value = env[key]
25
- value = sanitize_string(URI.decode(value))
26
- env[key] = URI.encode(value)
27
- end
28
- ["HTTP_COOKIE"].each do |key|
29
- next unless value = env[key]
30
- fixed = sanitize_string(value)
31
- env[key] = fixed if fixed
32
- end
33
- env
34
- end
35
-
36
- def sanitize_string(string)
37
- return string unless string.is_a? String
38
-
39
- # Try it as UTF-8 directly
40
- cleaned = string.dup.force_encoding('UTF-8')
41
- if cleaned.valid_encoding?
42
- cleaned
43
- else
44
- utf8clean(string)
45
- end
23
+ def is_valid_utf8(string)
24
+ utf8 = string.dup.force_encoding('UTF-8')
25
+ string == utf8 && utf8.valid_encoding?
46
26
  rescue EncodingError
47
- utf8clean(string)
27
+ false
48
28
  end
49
29
 
50
- def utf8clean(string)
51
- # Force it to UTF-8, throwing out invalid bits
52
- if RUBY_VERSION >= "1.9.3"
53
- # These converters don't exist in 1.9.2
54
- string.encode('UTF-16', 'UTF-8', :invalid => :replace, :replace => '').encode('UTF-8', 'UTF-16')
55
- else
56
- string.chars.select{|i| i.valid_encoding?}.join
30
+ def sanitize_env(env)
31
+ SANITIZE_ENV_KEYS.each do |key|
32
+ next unless value = env[key]
33
+
34
+ if value.include?('%')
35
+ env[key] = URIString.new(value).cleaned
36
+ end
57
37
  end
38
+ env
58
39
  end
59
40
  end
60
41
  end
@@ -0,0 +1,112 @@
1
+ module UTF8Cleaner
2
+ class URIString
3
+ attr_accessor :data
4
+
5
+ def initialize(data)
6
+ self.data = data
7
+ end
8
+
9
+ def cleaned
10
+ if valid?
11
+ data
12
+ else
13
+ encoded_char_array.join
14
+ end
15
+ end
16
+
17
+ def encoded?
18
+ data.include?('%')
19
+ end
20
+
21
+ def valid?
22
+ valid_uri_encoded_utf8(data)
23
+ end
24
+
25
+ private
26
+
27
+ # Returns an array of valid URI-encoded UTF-8 characters.
28
+ def encoded_char_array
29
+ char_array = []
30
+ index = 0
31
+
32
+ while (index < data.length) do
33
+ char = data[index]
34
+
35
+ if char == '%'
36
+ # Skip the next two characters, which are the encoded byte
37
+ # indicated by this %. (We'll change this later for multibyte characters.)
38
+ skip_next = 2
39
+
40
+ # How long is this character?
41
+ first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
42
+ bytes = utf8_char_length_in_bytes(first_byte)
43
+
44
+ # Grab the specified number of encoded bytes
45
+ utf8_char_encoded_bytes = next_n_bytes_from(index, bytes)
46
+
47
+ # Did we get the right number of bytes?
48
+ if utf8_char_encoded_bytes.length == bytes
49
+
50
+ # We did. Is it a valid character?
51
+ utf8_char_encoded = utf8_char_encoded_bytes.join
52
+
53
+ if valid_uri_encoded_utf8(utf8_char_encoded)
54
+ # It's valid!
55
+ char_array << utf8_char_encoded
56
+
57
+ # If we're dealing with a multibyte character, skip more than two
58
+ # of the next characters, which have already been processed.
59
+ skip_next = bytes * 3 - 1
60
+ end
61
+ end
62
+ index += skip_next
63
+ else
64
+ # This was not an encoded character, so just add it and move to the next.
65
+ char_array << char
66
+ end
67
+ index += 1
68
+ end
69
+
70
+ char_array
71
+ end
72
+
73
+ def valid_uri_encoded_utf8(string)
74
+ URI.decode(string).force_encoding('UTF-8').valid_encoding?
75
+ end
76
+
77
+ # Grab the next num_bytes URI-encoded bytes from the raw character array.
78
+ # Returns an array like ['%E2', '%9C', '%93']
79
+ def next_n_bytes_from(index, num_bytes)
80
+ return [] if data.length < index + (3 * num_bytes)
81
+
82
+ num_bytes.times.map do |n|
83
+ # Look for percent signs in the right places
84
+ pct_index = index + (3 * n)
85
+ if data[pct_index] == '%'
86
+ byte = data[pct_index + 1..pct_index + 2]
87
+ else
88
+ # An expected percent sign was missing. The whole character is invalid.
89
+ return []
90
+ end
91
+ '%' + byte
92
+ end
93
+ end
94
+
95
+ # If the first byte is between 0xC0 and 0xDF, the UTF-8 character has two bytes;
96
+ # if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes;
97
+ # and if it is between 0xF0 and 0xFF, the UTF-8 character has 4 bytes.
98
+ # first_byte is a string like "0x13"
99
+ def utf8_char_length_in_bytes(first_byte)
100
+ if first_byte.hex < 'C0'.hex
101
+ 1
102
+ elsif first_byte.hex < 'DF'.hex
103
+ 2
104
+ elsif first_byte.hex < 'EF'.hex
105
+ 3
106
+ else
107
+ 4
108
+ end
109
+ end
110
+
111
+ end
112
+ end
@@ -1,3 +1,3 @@
1
1
  module UTF8Cleaner
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -1,13 +1,12 @@
1
- # -*- encoding: utf-8 -*-
2
1
  require 'spec_helper'
3
2
 
4
- describe 'UTF8Cleaner::Middleware' do
3
+ describe UTF8Cleaner::Middleware do
5
4
  let :env do
6
5
  {
7
- 'PATH_INFO' => 'foo/bar%2e%2fbaz',
6
+ 'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
8
7
  'QUERY_STRING' => 'foo=bar%FF',
9
8
  'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
10
- 'REQUEST_URI' => '%C3%89'
9
+ 'REQUEST_URI' => '%C3%89%E2%9C%93'
11
10
  }
12
11
  end
13
12
 
@@ -15,19 +14,13 @@ describe 'UTF8Cleaner::Middleware' do
15
14
  UTF8Cleaner::Middleware.new(nil).send(:sanitize_env, env)
16
15
  end
17
16
 
18
- it "removes invalid UTF-8 sequences" do
19
- new_env['QUERY_STRING'].should == 'foo=bar'
17
+ describe "removes invalid UTF-8 sequences" do
18
+ it { new_env['QUERY_STRING'].should == 'foo=bar' }
19
+ it { new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++' }
20
20
  end
21
21
 
22
- it "turns valid %-escaped ASCII chars into their ASCII equivalents" do
23
- new_env['PATH_INFO'].should == 'foo/bar./baz'
24
- end
25
-
26
- it "leaves valid %-escaped UTF-8 chars alone" do
27
- new_env['REQUEST_URI'].should == '%C3%89'
28
- end
29
-
30
- it "handles an awful URL" do
31
- new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++'
22
+ describe "leaves all valid characters untouched" do
23
+ it { new_env['PATH_INFO'].should == 'foo/bar%2e%2fbaz%26%3B' }
24
+ it { new_env['REQUEST_URI'].should == '%C3%89%E2%9C%93' }
32
25
  end
33
26
  end
data/spec/spec_helper.rb CHANGED
@@ -2,8 +2,11 @@ require 'rubygems'
2
2
  require 'rspec/autorun'
3
3
 
4
4
  require 'utf8-cleaner'
5
+ require 'uri'
5
6
 
6
7
  RSpec.configure do |config|
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+
7
10
  # Run specs in random order to surface order dependencies. If you find an
8
11
  # order dependency and want to debug it, you can fix the order by providing
9
12
  # the seed, which is printed after each run.
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ module UTF8Cleaner
4
+
5
+ describe URIString do
6
+ let(:invalid_string) { URIString.new('%FF') }
7
+ let(:ascii_string) { URIString.new('foo') }
8
+ let(:encoded_string) { URIString.new('%26') }
9
+ let(:multibyte_string) { URIString.new('%E2%9C%93') }
10
+ let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
11
+ # foo/ bar. / baz& ; ✓ baz
12
+
13
+ describe '#new' do
14
+ it { encoded_string.should be_a URIString }
15
+ end
16
+
17
+ describe '#cleaned' do
18
+ it { invalid_string.cleaned.should eq('') }
19
+ it { ascii_string.cleaned.should eq('foo') }
20
+ it { encoded_string.cleaned.should eq('%26') }
21
+ it { multibyte_string.cleaned.should eq('%E2%9C%93') }
22
+ it { complex_invalid_string.cleaned.should eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
23
+ end
24
+
25
+ describe '#encoded?' do
26
+ it { encoded_string.should be_encoded }
27
+ it { invalid_string.should be_encoded }
28
+ it { multibyte_string.should be_encoded }
29
+ it { complex_invalid_string.should be_encoded }
30
+
31
+ it { ascii_string.should_not be_encoded }
32
+ end
33
+
34
+ describe '#valid?' do
35
+ it { ascii_string.should be_valid }
36
+ it { encoded_string.should be_valid }
37
+ it { multibyte_string.should be_valid }
38
+
39
+ it { invalid_string.should_not be_valid }
40
+ it { complex_invalid_string.should_not be_valid }
41
+ end
42
+
43
+ end
44
+
45
+ end
data/utf8-cleaner.gemspec CHANGED
@@ -17,5 +17,8 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
+ gem.add_development_dependency "rake"
21
+ gem.add_development_dependency "guard"
22
+ gem.add_development_dependency "guard-rspec"
20
23
  gem.add_development_dependency "rspec"
21
24
  end
metadata CHANGED
@@ -1,30 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8-cleaner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
5
- prerelease:
4
+ version: 0.0.6
6
5
  platform: ruby
7
6
  authors:
8
7
  - Leon Miller-Out
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-04-23 00:00:00.000000000 Z
11
+ date: 2013-10-16 00:00:00.000000000 Z
13
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: guard
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: guard-rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
14
55
  - !ruby/object:Gem::Dependency
15
56
  name: rspec
16
57
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
58
  requirements:
19
- - - ! '>='
59
+ - - '>='
20
60
  - !ruby/object:Gem::Version
21
61
  version: '0'
22
62
  type: :development
23
63
  prerelease: false
24
64
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
65
  requirements:
27
- - - ! '>='
66
+ - - '>='
28
67
  - !ruby/object:Gem::Version
29
68
  version: '0'
30
69
  description: Removes invalid UTF8 characters from the URL and other env vars
@@ -35,41 +74,45 @@ extensions: []
35
74
  extra_rdoc_files: []
36
75
  files:
37
76
  - .gitignore
77
+ - .travis.yml
38
78
  - Gemfile
79
+ - Guardfile
39
80
  - LICENSE.txt
40
81
  - README.md
41
82
  - Rakefile
42
83
  - lib/utf8-cleaner.rb
43
84
  - lib/utf8-cleaner/middleware.rb
44
85
  - lib/utf8-cleaner/railtie.rb
86
+ - lib/utf8-cleaner/uri_string.rb
45
87
  - lib/utf8-cleaner/version.rb
46
88
  - spec/middleware_spec.rb
47
89
  - spec/spec_helper.rb
90
+ - spec/uri_string_spec.rb
48
91
  - utf8-cleaner.gemspec
49
92
  homepage: https://github.com/singlebrook/utf8-cleaner
50
93
  licenses: []
94
+ metadata: {}
51
95
  post_install_message:
52
96
  rdoc_options: []
53
97
  require_paths:
54
98
  - lib
55
99
  required_ruby_version: !ruby/object:Gem::Requirement
56
- none: false
57
100
  requirements:
58
- - - ! '>='
101
+ - - '>='
59
102
  - !ruby/object:Gem::Version
60
103
  version: '0'
61
104
  required_rubygems_version: !ruby/object:Gem::Requirement
62
- none: false
63
105
  requirements:
64
- - - ! '>='
106
+ - - '>='
65
107
  - !ruby/object:Gem::Version
66
108
  version: '0'
67
109
  requirements: []
68
110
  rubyforge_project:
69
- rubygems_version: 1.8.23
111
+ rubygems_version: 2.0.3
70
112
  signing_key:
71
- specification_version: 3
113
+ specification_version: 4
72
114
  summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
73
115
  test_files:
74
116
  - spec/middleware_spec.rb
75
117
  - spec/spec_helper.rb
118
+ - spec/uri_string_spec.rb