utf8-cleaner 0.0.4 → 0.0.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b6e265c0aefe9f9c6fa6082b86f21c81a92400b6
4
+ data.tar.gz: 12dd12dceef125122d2877dbc9e9e597055ddfac
5
+ SHA512:
6
+ metadata.gz: b794d47e0c9460ef5ec9eadc9340f705a491b0c48370efb82310d2bf21ff7713d655a4fdbb312c4788913879476be5a4c8823e8f494d8a2fe7376ecc4dd58a6c
7
+ data.tar.gz: 0bf1fb1a6d23600a3f5ff4ae7d1f62c7fded5a0d7d0233ebc85103b12c51a849a7ef83708a21568a28bd005e8ea1199ffff6b0d05b0c186a8889624e190f5e19
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.9.3
4
+ - 2.0.0
data/Guardfile ADDED
@@ -0,0 +1,9 @@
1
+ # A sample Guardfile
2
+ # More info at https://github.com/guard/guard#readme
3
+
4
+ guard :rspec do
5
+ watch(%r{^spec/.+_spec\.rb$})
6
+ watch(%r{^lib/utf8-cleaner/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
7
+ watch('spec/spec_helper.rb') { "spec" }
8
+ end
9
+
data/README.md CHANGED
@@ -1,5 +1,7 @@
1
1
  # UTF8Cleaner
2
2
 
3
+ [<img src="https://secure.travis-ci.org/singlebrook/utf8-cleaner.png" />](http://travis-ci.org/singlebrook/utf8-cleaner)
4
+
3
5
  Removes invalid UTF-8 characters from the environment so that your app doesn't choke
4
6
  on them. This prevents errors like "invalid byte sequence in UTF-8".
5
7
 
data/Rakefile CHANGED
@@ -1 +1,5 @@
1
1
  require "bundler/gem_tasks"
2
+
3
+ task :default do
4
+ sh "rspec spec"
5
+ end
data/lib/utf8-cleaner.rb CHANGED
@@ -1,3 +1,4 @@
1
1
  require "utf8-cleaner/version"
2
2
  require "utf8-cleaner/middleware"
3
+ require "utf8-cleaner/uri_string"
3
4
  require "utf8-cleaner/railtie" if defined? Rails
@@ -7,6 +7,7 @@ module UTF8Cleaner
7
7
  "QUERY_STRING",
8
8
  "REQUEST_PATH",
9
9
  "REQUEST_URI",
10
+ "HTTP_COOKIE"
10
11
  ]
11
12
 
12
13
  def initialize(app)
@@ -19,42 +20,22 @@ module UTF8Cleaner
19
20
 
20
21
  private
21
22
 
22
- def sanitize_env(env)
23
- SANITIZE_ENV_KEYS.each do |key|
24
- next unless value = env[key]
25
- value = sanitize_string(URI.decode(value))
26
- env[key] = URI.encode(value)
27
- end
28
- ["HTTP_COOKIE"].each do |key|
29
- next unless value = env[key]
30
- fixed = sanitize_string(value)
31
- env[key] = fixed if fixed
32
- end
33
- env
34
- end
35
-
36
- def sanitize_string(string)
37
- return string unless string.is_a? String
38
-
39
- # Try it as UTF-8 directly
40
- cleaned = string.dup.force_encoding('UTF-8')
41
- if cleaned.valid_encoding?
42
- cleaned
43
- else
44
- utf8clean(string)
45
- end
23
+ def is_valid_utf8(string)
24
+ utf8 = string.dup.force_encoding('UTF-8')
25
+ string == utf8 && utf8.valid_encoding?
46
26
  rescue EncodingError
47
- utf8clean(string)
27
+ false
48
28
  end
49
29
 
50
- def utf8clean(string)
51
- # Force it to UTF-8, throwing out invalid bits
52
- if RUBY_VERSION >= "1.9.3"
53
- # These converters don't exist in 1.9.2
54
- string.encode('UTF-16', 'UTF-8', :invalid => :replace, :replace => '').encode('UTF-8', 'UTF-16')
55
- else
56
- string.chars.select{|i| i.valid_encoding?}.join
30
+ def sanitize_env(env)
31
+ SANITIZE_ENV_KEYS.each do |key|
32
+ next unless value = env[key]
33
+
34
+ if value.include?('%')
35
+ env[key] = URIString.new(value).cleaned
36
+ end
57
37
  end
38
+ env
58
39
  end
59
40
  end
60
41
  end
@@ -0,0 +1,112 @@
1
+ module UTF8Cleaner
2
+ class URIString
3
+ attr_accessor :data
4
+
5
+ def initialize(data)
6
+ self.data = data
7
+ end
8
+
9
+ def cleaned
10
+ if valid?
11
+ data
12
+ else
13
+ encoded_char_array.join
14
+ end
15
+ end
16
+
17
+ def encoded?
18
+ data.include?('%')
19
+ end
20
+
21
+ def valid?
22
+ valid_uri_encoded_utf8(data)
23
+ end
24
+
25
+ private
26
+
27
+ # Returns an array of valid URI-encoded UTF-8 characters.
28
+ def encoded_char_array
29
+ char_array = []
30
+ index = 0
31
+
32
+ while (index < data.length) do
33
+ char = data[index]
34
+
35
+ if char == '%'
36
+ # Skip the next two characters, which are the encoded byte
37
+ # indicated by this %. (We'll change this later for multibyte characters.)
38
+ skip_next = 2
39
+
40
+ # How long is this character?
41
+ first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
42
+ bytes = utf8_char_length_in_bytes(first_byte)
43
+
44
+ # Grab the specified number of encoded bytes
45
+ utf8_char_encoded_bytes = next_n_bytes_from(index, bytes)
46
+
47
+ # Did we get the right number of bytes?
48
+ if utf8_char_encoded_bytes.length == bytes
49
+
50
+ # We did. Is it a valid character?
51
+ utf8_char_encoded = utf8_char_encoded_bytes.join
52
+
53
+ if valid_uri_encoded_utf8(utf8_char_encoded)
54
+ # It's valid!
55
+ char_array << utf8_char_encoded
56
+
57
+ # If we're dealing with a multibyte character, skip more than two
58
+ # of the next characters, which have already been processed.
59
+ skip_next = bytes * 3 - 1
60
+ end
61
+ end
62
+ index += skip_next
63
+ else
64
+ # This was not an encoded character, so just add it and move to the next.
65
+ char_array << char
66
+ end
67
+ index += 1
68
+ end
69
+
70
+ char_array
71
+ end
72
+
73
+ def valid_uri_encoded_utf8(string)
74
+ URI.decode(string).force_encoding('UTF-8').valid_encoding?
75
+ end
76
+
77
+ # Grab the next num_bytes URI-encoded bytes from the raw character array.
78
+ # Returns an array like ['%E2', '%9C', '%93']
79
+ def next_n_bytes_from(index, num_bytes)
80
+ return [] if data.length < index + (3 * num_bytes)
81
+
82
+ num_bytes.times.map do |n|
83
+ # Look for percent signs in the right places
84
+ pct_index = index + (3 * n)
85
+ if data[pct_index] == '%'
86
+ byte = data[pct_index + 1..pct_index + 2]
87
+ else
88
+ # An expected percent sign was missing. The whole character is invalid.
89
+ return []
90
+ end
91
+ '%' + byte
92
+ end
93
+ end
94
+
95
+ # If the first byte is between 0xC0 and 0xDF, the UTF-8 character has two bytes;
96
+ # if it is between 0xE0 and 0xEF, the UTF-8 character has 3 bytes;
97
+ # and if it is between 0xF0 and 0xFF, the UTF-8 character has 4 bytes.
98
+ # first_byte is a string like "0x13"
99
+ def utf8_char_length_in_bytes(first_byte)
100
+ if first_byte.hex < 'C0'.hex
101
+ 1
102
+ elsif first_byte.hex < 'DF'.hex
103
+ 2
104
+ elsif first_byte.hex < 'EF'.hex
105
+ 3
106
+ else
107
+ 4
108
+ end
109
+ end
110
+
111
+ end
112
+ end
@@ -1,3 +1,3 @@
1
1
  module UTF8Cleaner
2
- VERSION = "0.0.4"
2
+ VERSION = "0.0.6"
3
3
  end
@@ -1,13 +1,12 @@
1
- # -*- encoding: utf-8 -*-
2
1
  require 'spec_helper'
3
2
 
4
- describe 'UTF8Cleaner::Middleware' do
3
+ describe UTF8Cleaner::Middleware do
5
4
  let :env do
6
5
  {
7
- 'PATH_INFO' => 'foo/bar%2e%2fbaz',
6
+ 'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
8
7
  'QUERY_STRING' => 'foo=bar%FF',
9
8
  'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
10
- 'REQUEST_URI' => '%C3%89'
9
+ 'REQUEST_URI' => '%C3%89%E2%9C%93'
11
10
  }
12
11
  end
13
12
 
@@ -15,19 +14,13 @@ describe 'UTF8Cleaner::Middleware' do
15
14
  UTF8Cleaner::Middleware.new(nil).send(:sanitize_env, env)
16
15
  end
17
16
 
18
- it "removes invalid UTF-8 sequences" do
19
- new_env['QUERY_STRING'].should == 'foo=bar'
17
+ describe "removes invalid UTF-8 sequences" do
18
+ it { new_env['QUERY_STRING'].should == 'foo=bar' }
19
+ it { new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++' }
20
20
  end
21
21
 
22
- it "turns valid %-escaped ASCII chars into their ASCII equivalents" do
23
- new_env['PATH_INFO'].should == 'foo/bar./baz'
24
- end
25
-
26
- it "leaves valid %-escaped UTF-8 chars alone" do
27
- new_env['REQUEST_URI'].should == '%C3%89'
28
- end
29
-
30
- it "handles an awful URL" do
31
- new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++'
22
+ describe "leaves all valid characters untouched" do
23
+ it { new_env['PATH_INFO'].should == 'foo/bar%2e%2fbaz%26%3B' }
24
+ it { new_env['REQUEST_URI'].should == '%C3%89%E2%9C%93' }
32
25
  end
33
26
  end
data/spec/spec_helper.rb CHANGED
@@ -2,8 +2,11 @@ require 'rubygems'
2
2
  require 'rspec/autorun'
3
3
 
4
4
  require 'utf8-cleaner'
5
+ require 'uri'
5
6
 
6
7
  RSpec.configure do |config|
8
+ config.treat_symbols_as_metadata_keys_with_true_values = true
9
+
7
10
  # Run specs in random order to surface order dependencies. If you find an
8
11
  # order dependency and want to debug it, you can fix the order by providing
9
12
  # the seed, which is printed after each run.
@@ -0,0 +1,45 @@
1
+ require 'spec_helper'
2
+
3
+ module UTF8Cleaner
4
+
5
+ describe URIString do
6
+ let(:invalid_string) { URIString.new('%FF') }
7
+ let(:ascii_string) { URIString.new('foo') }
8
+ let(:encoded_string) { URIString.new('%26') }
9
+ let(:multibyte_string) { URIString.new('%E2%9C%93') }
10
+ let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
11
+ # foo/ bar. / baz& ; ✓ baz
12
+
13
+ describe '#new' do
14
+ it { encoded_string.should be_a URIString }
15
+ end
16
+
17
+ describe '#cleaned' do
18
+ it { invalid_string.cleaned.should eq('') }
19
+ it { ascii_string.cleaned.should eq('foo') }
20
+ it { encoded_string.cleaned.should eq('%26') }
21
+ it { multibyte_string.cleaned.should eq('%E2%9C%93') }
22
+ it { complex_invalid_string.cleaned.should eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
23
+ end
24
+
25
+ describe '#encoded?' do
26
+ it { encoded_string.should be_encoded }
27
+ it { invalid_string.should be_encoded }
28
+ it { multibyte_string.should be_encoded }
29
+ it { complex_invalid_string.should be_encoded }
30
+
31
+ it { ascii_string.should_not be_encoded }
32
+ end
33
+
34
+ describe '#valid?' do
35
+ it { ascii_string.should be_valid }
36
+ it { encoded_string.should be_valid }
37
+ it { multibyte_string.should be_valid }
38
+
39
+ it { invalid_string.should_not be_valid }
40
+ it { complex_invalid_string.should_not be_valid }
41
+ end
42
+
43
+ end
44
+
45
+ end
data/utf8-cleaner.gemspec CHANGED
@@ -17,5 +17,8 @@ Gem::Specification.new do |gem|
17
17
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
18
18
  gem.require_paths = ["lib"]
19
19
 
20
+ gem.add_development_dependency "rake"
21
+ gem.add_development_dependency "guard"
22
+ gem.add_development_dependency "guard-rspec"
20
23
  gem.add_development_dependency "rspec"
21
24
  end
metadata CHANGED
@@ -1,30 +1,69 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8-cleaner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.4
5
- prerelease:
4
+ version: 0.0.6
6
5
  platform: ruby
7
6
  authors:
8
7
  - Leon Miller-Out
9
8
  autorequire:
10
9
  bindir: bin
11
10
  cert_chain: []
12
- date: 2013-04-23 00:00:00.000000000 Z
11
+ date: 2013-10-16 00:00:00.000000000 Z
13
12
  dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: guard
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: guard-rspec
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :development
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
14
55
  - !ruby/object:Gem::Dependency
15
56
  name: rspec
16
57
  requirement: !ruby/object:Gem::Requirement
17
- none: false
18
58
  requirements:
19
- - - ! '>='
59
+ - - '>='
20
60
  - !ruby/object:Gem::Version
21
61
  version: '0'
22
62
  type: :development
23
63
  prerelease: false
24
64
  version_requirements: !ruby/object:Gem::Requirement
25
- none: false
26
65
  requirements:
27
- - - ! '>='
66
+ - - '>='
28
67
  - !ruby/object:Gem::Version
29
68
  version: '0'
30
69
  description: Removes invalid UTF8 characters from the URL and other env vars
@@ -35,41 +74,45 @@ extensions: []
35
74
  extra_rdoc_files: []
36
75
  files:
37
76
  - .gitignore
77
+ - .travis.yml
38
78
  - Gemfile
79
+ - Guardfile
39
80
  - LICENSE.txt
40
81
  - README.md
41
82
  - Rakefile
42
83
  - lib/utf8-cleaner.rb
43
84
  - lib/utf8-cleaner/middleware.rb
44
85
  - lib/utf8-cleaner/railtie.rb
86
+ - lib/utf8-cleaner/uri_string.rb
45
87
  - lib/utf8-cleaner/version.rb
46
88
  - spec/middleware_spec.rb
47
89
  - spec/spec_helper.rb
90
+ - spec/uri_string_spec.rb
48
91
  - utf8-cleaner.gemspec
49
92
  homepage: https://github.com/singlebrook/utf8-cleaner
50
93
  licenses: []
94
+ metadata: {}
51
95
  post_install_message:
52
96
  rdoc_options: []
53
97
  require_paths:
54
98
  - lib
55
99
  required_ruby_version: !ruby/object:Gem::Requirement
56
- none: false
57
100
  requirements:
58
- - - ! '>='
101
+ - - '>='
59
102
  - !ruby/object:Gem::Version
60
103
  version: '0'
61
104
  required_rubygems_version: !ruby/object:Gem::Requirement
62
- none: false
63
105
  requirements:
64
- - - ! '>='
106
+ - - '>='
65
107
  - !ruby/object:Gem::Version
66
108
  version: '0'
67
109
  requirements: []
68
110
  rubyforge_project:
69
- rubygems_version: 1.8.23
111
+ rubygems_version: 2.0.3
70
112
  signing_key:
71
- specification_version: 3
113
+ specification_version: 4
72
114
  summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
73
115
  test_files:
74
116
  - spec/middleware_spec.rb
75
117
  - spec/spec_helper.rb
118
+ - spec/uri_string_spec.rb