utf8-cleaner 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.travis.yml +4 -0
- data/Guardfile +9 -0
- data/README.md +2 -0
- data/Rakefile +4 -0
- data/lib/utf8-cleaner.rb +1 -0
- data/lib/utf8-cleaner/middleware.rb +13 -32
- data/lib/utf8-cleaner/uri_string.rb +112 -0
- data/lib/utf8-cleaner/version.rb +1 -1
- data/spec/middleware_spec.rb +9 -16
- data/spec/spec_helper.rb +3 -0
- data/spec/uri_string_spec.rb +45 -0
- data/utf8-cleaner.gemspec +3 -0
- metadata +56 -13
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: b6e265c0aefe9f9c6fa6082b86f21c81a92400b6
|
4
|
+
data.tar.gz: 12dd12dceef125122d2877dbc9e9e597055ddfac
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: b794d47e0c9460ef5ec9eadc9340f705a491b0c48370efb82310d2bf21ff7713d655a4fdbb312c4788913879476be5a4c8823e8f494d8a2fe7376ecc4dd58a6c
|
7
|
+
data.tar.gz: 0bf1fb1a6d23600a3f5ff4ae7d1f62c7fded5a0d7d0233ebc85103b12c51a849a7ef83708a21568a28bd005e8ea1199ffff6b0d05b0c186a8889624e190f5e19
|
data/.travis.yml
ADDED
data/Guardfile
ADDED
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# UTF8Cleaner
|
2
2
|
|
3
|
+
[<img src="https://secure.travis-ci.org/singlebrook/utf8-cleaner.png" />](http://travis-ci.org/singlebrook/utf8-cleaner)
|
4
|
+
|
3
5
|
Removes invalid UTF-8 characters from the environment so that your app doesn't choke
|
4
6
|
on them. This prevents errors like "invalid byte sequence in UTF-8".
|
5
7
|
|
data/Rakefile
CHANGED
data/lib/utf8-cleaner.rb
CHANGED
@@ -7,6 +7,7 @@ module UTF8Cleaner
|
|
7
7
|
"QUERY_STRING",
|
8
8
|
"REQUEST_PATH",
|
9
9
|
"REQUEST_URI",
|
10
|
+
"HTTP_COOKIE"
|
10
11
|
]
|
11
12
|
|
12
13
|
def initialize(app)
|
@@ -19,42 +20,22 @@ module UTF8Cleaner
|
|
19
20
|
|
20
21
|
private
|
21
22
|
|
22
|
-
def
|
23
|
-
|
24
|
-
|
25
|
-
value = sanitize_string(URI.decode(value))
|
26
|
-
env[key] = URI.encode(value)
|
27
|
-
end
|
28
|
-
["HTTP_COOKIE"].each do |key|
|
29
|
-
next unless value = env[key]
|
30
|
-
fixed = sanitize_string(value)
|
31
|
-
env[key] = fixed if fixed
|
32
|
-
end
|
33
|
-
env
|
34
|
-
end
|
35
|
-
|
36
|
-
def sanitize_string(string)
|
37
|
-
return string unless string.is_a? String
|
38
|
-
|
39
|
-
# Try it as UTF-8 directly
|
40
|
-
cleaned = string.dup.force_encoding('UTF-8')
|
41
|
-
if cleaned.valid_encoding?
|
42
|
-
cleaned
|
43
|
-
else
|
44
|
-
utf8clean(string)
|
45
|
-
end
|
23
|
+
def is_valid_utf8(string)
|
24
|
+
utf8 = string.dup.force_encoding('UTF-8')
|
25
|
+
string == utf8 && utf8.valid_encoding?
|
46
26
|
rescue EncodingError
|
47
|
-
|
27
|
+
false
|
48
28
|
end
|
49
29
|
|
50
|
-
def
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
30
|
+
def sanitize_env(env)
|
31
|
+
SANITIZE_ENV_KEYS.each do |key|
|
32
|
+
next unless value = env[key]
|
33
|
+
|
34
|
+
if value.include?('%')
|
35
|
+
env[key] = URIString.new(value).cleaned
|
36
|
+
end
|
57
37
|
end
|
38
|
+
env
|
58
39
|
end
|
59
40
|
end
|
60
41
|
end
|
@@ -0,0 +1,112 @@
|
|
1
|
+
module UTF8Cleaner
  # Wraps a URI-encoded (percent-escaped) string and removes escaped byte
  # sequences that do not decode to valid UTF-8, leaving every other
  # character -- including valid escapes -- exactly as it was.
  #
  #   URIString.new('foo/%FFbar%E2%9C%93').cleaned  # => "foo/bar%E2%9C%93"
  class URIString
    # A well-formed percent escape: '%' followed by two hex digits.
    ESCAPED_BYTE = /%[0-9a-fA-F]{2}/

    # The raw (still URI-encoded) string.
    attr_accessor :data

    # data - the URI-encoded String to inspect and clean.
    def initialize(data)
      self.data = data
    end

    # Returns the original string untouched when it already decodes to valid
    # UTF-8; otherwise returns a copy with the invalid escapes stripped.
    def cleaned
      valid? ? data : encoded_char_array.join
    end

    # True when the string contains at least one '%' character.
    def encoded?
      data.include?('%')
    end

    # True when the string, once percent-decoded, is valid UTF-8.
    def valid?
      valid_uri_encoded_utf8(data)
    end

    private

    # Returns an array of literal characters and valid URI-encoded UTF-8
    # characters (e.g. ['f', 'o', 'o', '%E2%9C%93']). Escapes that do not form
    # a valid UTF-8 character are omitted.
    def encoded_char_array
      char_array = []
      index = 0

      while index < data.length
        char = data[index]

        # Only treat '%' as the start of an escape when two more characters
        # actually follow it. A truncated trailing escape ("...%" or "...%A")
        # is kept as literal text instead of raising on a nil lookup.
        if char == '%' && index + 2 < data.length
          # By default skip the two characters of the encoded byte indicated
          # by this '%'. (Widened below for valid multibyte characters.)
          skip_next = 2

          # How long is the character this lead byte announces?
          first_byte = '0x' + data[index + 1, 2].upcase
          bytes = utf8_char_length_in_bytes(first_byte)

          # Grab that many encoded bytes.
          utf8_char_encoded_bytes = next_n_bytes_from(index, bytes)

          # Did we get the right number of bytes?
          if utf8_char_encoded_bytes.length == bytes
            utf8_char_encoded = utf8_char_encoded_bytes.join

            if valid_uri_encoded_utf8(utf8_char_encoded)
              char_array << utf8_char_encoded

              # The whole (possibly multibyte) character has been consumed,
              # so skip all of its remaining encoded characters.
              skip_next = bytes * 3 - 1
            end
          end

          index += skip_next
        else
          # Not an encoded character, so just add it and move to the next.
          char_array << char
        end

        index += 1
      end

      char_array
    end

    # Percent-decodes +string+ and reports whether the resulting bytes are
    # valid UTF-8. Malformed escapes (e.g. "%zz") are left as literal text.
    #
    # Decoding is done locally rather than with URI.decode, which was removed
    # in Ruby 3.0.
    def valid_uri_encoded_utf8(string)
      decoded = string.b.gsub(ESCAPED_BYTE) { |escape| [escape[1, 2]].pack('H2') }
      decoded.force_encoding('UTF-8').valid_encoding?
    end

    # Grab the next num_bytes URI-encoded bytes from the raw string, starting
    # at index. Returns an array like ['%E2', '%9C', '%93'], or [] when the
    # string is too short or an expected percent sign is missing.
    def next_n_bytes_from(index, num_bytes)
      return [] if data.length < index + (3 * num_bytes)

      num_bytes.times.map do |n|
        pct_index = index + (3 * n)

        # An expected percent sign was missing. The whole character is invalid.
        return [] unless data[pct_index] == '%'

        data[pct_index, 3]
      end
    end

    # If the first byte is between 0xC0 and 0xDF, the UTF-8 character has
    # 2 bytes; if it is between 0xE0 and 0xEF, it has 3 bytes; and if it is
    # between 0xF0 and 0xFF, it has 4 bytes. Anything below 0xC0 is treated
    # as a single byte.
    #
    # first_byte is a string like "0x13".
    def utf8_char_length_in_bytes(first_byte)
      value = first_byte.hex

      if value < 0xC0
        1
      elsif value < 0xE0
        2
      elsif value < 0xF0
        3
      else
        4
      end
    end
  end
end
|
data/lib/utf8-cleaner/version.rb
CHANGED
data/spec/middleware_spec.rb
CHANGED
@@ -1,13 +1,12 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
1
|
require 'spec_helper'
|
3
2
|
|
4
|
-
describe
|
3
|
+
describe UTF8Cleaner::Middleware do
|
5
4
|
let :env do
|
6
5
|
{
|
7
|
-
'PATH_INFO' => 'foo
|
6
|
+
'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
|
8
7
|
'QUERY_STRING' => 'foo=bar%FF',
|
9
8
|
'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
|
10
|
-
'REQUEST_URI' => '%C3%89'
|
9
|
+
'REQUEST_URI' => '%C3%89%E2%9C%93'
|
11
10
|
}
|
12
11
|
end
|
13
12
|
|
@@ -15,19 +14,13 @@ describe 'UTF8Cleaner::Middleware' do
|
|
15
14
|
UTF8Cleaner::Middleware.new(nil).send(:sanitize_env, env)
|
16
15
|
end
|
17
16
|
|
18
|
-
|
19
|
-
new_env['QUERY_STRING'].should == 'foo=bar'
|
17
|
+
describe "removes invalid UTF-8 sequences" do
|
18
|
+
it { new_env['QUERY_STRING'].should == 'foo=bar' }
|
19
|
+
it { new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++' }
|
20
20
|
end
|
21
21
|
|
22
|
-
|
23
|
-
new_env['PATH_INFO'].should == 'foo/bar
|
24
|
-
|
25
|
-
|
26
|
-
it "leaves valid %-escaped UTF-8 chars alone" do
|
27
|
-
new_env['REQUEST_URI'].should == '%C3%89'
|
28
|
-
end
|
29
|
-
|
30
|
-
it "handles an awful URL" do
|
31
|
-
new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++'
|
22
|
+
describe "leaves all valid characters untouched" do
|
23
|
+
it { new_env['PATH_INFO'].should == 'foo/bar%2e%2fbaz%26%3B' }
|
24
|
+
it { new_env['REQUEST_URI'].should == '%C3%89%E2%9C%93' }
|
32
25
|
end
|
33
26
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -2,8 +2,11 @@ require 'rubygems'
|
|
2
2
|
require 'rspec/autorun'
|
3
3
|
|
4
4
|
require 'utf8-cleaner'
|
5
|
+
require 'uri'
|
5
6
|
|
6
7
|
RSpec.configure do |config|
|
8
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
9
|
+
|
7
10
|
# Run specs in random order to surface order dependencies. If you find an
|
8
11
|
# order dependency and want to debug it, you can fix the order by providing
|
9
12
|
# the seed, which is printed after each run.
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
module UTF8Cleaner
|
4
|
+
|
5
|
+
describe URIString do
|
6
|
+
let(:invalid_string) { URIString.new('%FF') }
|
7
|
+
let(:ascii_string) { URIString.new('foo') }
|
8
|
+
let(:encoded_string) { URIString.new('%26') }
|
9
|
+
let(:multibyte_string) { URIString.new('%E2%9C%93') }
|
10
|
+
let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
|
11
|
+
# foo/ bar. / baz& ; √ baz
|
12
|
+
|
13
|
+
describe '#new' do
|
14
|
+
it { encoded_string.should be_a URIString }
|
15
|
+
end
|
16
|
+
|
17
|
+
describe '#cleaned' do
|
18
|
+
it { invalid_string.cleaned.should eq('') }
|
19
|
+
it { ascii_string.cleaned.should eq('foo') }
|
20
|
+
it { encoded_string.cleaned.should eq('%26') }
|
21
|
+
it { multibyte_string.cleaned.should eq('%E2%9C%93') }
|
22
|
+
it { complex_invalid_string.cleaned.should eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
|
23
|
+
end
|
24
|
+
|
25
|
+
describe '#encoded?' do
|
26
|
+
it { encoded_string.should be_encoded }
|
27
|
+
it { invalid_string.should be_encoded }
|
28
|
+
it { multibyte_string.should be_encoded }
|
29
|
+
it { complex_invalid_string.should be_encoded }
|
30
|
+
|
31
|
+
it { ascii_string.should_not be_encoded }
|
32
|
+
end
|
33
|
+
|
34
|
+
describe '#valid?' do
|
35
|
+
it { ascii_string.should be_valid }
|
36
|
+
it { encoded_string.should be_valid }
|
37
|
+
it { multibyte_string.should be_valid }
|
38
|
+
|
39
|
+
it { invalid_string.should_not be_valid }
|
40
|
+
it { complex_invalid_string.should_not be_valid }
|
41
|
+
end
|
42
|
+
|
43
|
+
end
|
44
|
+
|
45
|
+
end
|
data/utf8-cleaner.gemspec
CHANGED
@@ -17,5 +17,8 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
18
18
|
gem.require_paths = ["lib"]
|
19
19
|
|
20
|
+
gem.add_development_dependency "rake"
|
21
|
+
gem.add_development_dependency "guard"
|
22
|
+
gem.add_development_dependency "guard-rspec"
|
20
23
|
gem.add_development_dependency "rspec"
|
21
24
|
end
|
metadata
CHANGED
@@ -1,30 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8-cleaner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.6
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Leon Miller-Out
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-10-16 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rake
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - '>='
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - '>='
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: guard
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: guard-rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
14
55
|
- !ruby/object:Gem::Dependency
|
15
56
|
name: rspec
|
16
57
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
58
|
requirements:
|
19
|
-
- -
|
59
|
+
- - '>='
|
20
60
|
- !ruby/object:Gem::Version
|
21
61
|
version: '0'
|
22
62
|
type: :development
|
23
63
|
prerelease: false
|
24
64
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
65
|
requirements:
|
27
|
-
- -
|
66
|
+
- - '>='
|
28
67
|
- !ruby/object:Gem::Version
|
29
68
|
version: '0'
|
30
69
|
description: Removes invalid UTF8 characters from the URL and other env vars
|
@@ -35,41 +74,45 @@ extensions: []
|
|
35
74
|
extra_rdoc_files: []
|
36
75
|
files:
|
37
76
|
- .gitignore
|
77
|
+
- .travis.yml
|
38
78
|
- Gemfile
|
79
|
+
- Guardfile
|
39
80
|
- LICENSE.txt
|
40
81
|
- README.md
|
41
82
|
- Rakefile
|
42
83
|
- lib/utf8-cleaner.rb
|
43
84
|
- lib/utf8-cleaner/middleware.rb
|
44
85
|
- lib/utf8-cleaner/railtie.rb
|
86
|
+
- lib/utf8-cleaner/uri_string.rb
|
45
87
|
- lib/utf8-cleaner/version.rb
|
46
88
|
- spec/middleware_spec.rb
|
47
89
|
- spec/spec_helper.rb
|
90
|
+
- spec/uri_string_spec.rb
|
48
91
|
- utf8-cleaner.gemspec
|
49
92
|
homepage: https://github.com/singlebrook/utf8-cleaner
|
50
93
|
licenses: []
|
94
|
+
metadata: {}
|
51
95
|
post_install_message:
|
52
96
|
rdoc_options: []
|
53
97
|
require_paths:
|
54
98
|
- lib
|
55
99
|
required_ruby_version: !ruby/object:Gem::Requirement
|
56
|
-
none: false
|
57
100
|
requirements:
|
58
|
-
- -
|
101
|
+
- - '>='
|
59
102
|
- !ruby/object:Gem::Version
|
60
103
|
version: '0'
|
61
104
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
62
|
-
none: false
|
63
105
|
requirements:
|
64
|
-
- -
|
106
|
+
- - '>='
|
65
107
|
- !ruby/object:Gem::Version
|
66
108
|
version: '0'
|
67
109
|
requirements: []
|
68
110
|
rubyforge_project:
|
69
|
-
rubygems_version:
|
111
|
+
rubygems_version: 2.0.3
|
70
112
|
signing_key:
|
71
|
-
specification_version:
|
113
|
+
specification_version: 4
|
72
114
|
summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
|
73
115
|
test_files:
|
74
116
|
- spec/middleware_spec.rb
|
75
117
|
- spec/spec_helper.rb
|
118
|
+
- spec/uri_string_spec.rb
|