utf8-cleaner 0.1.1 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/lib/utf8-cleaner/middleware.rb +4 -4
- data/lib/utf8-cleaner/uri_string.rb +26 -7
- data/lib/utf8-cleaner/version.rb +1 -1
- data/spec/middleware_spec.rb +14 -4
- data/spec/uri_string_spec.rb +10 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ef101f548a442f6dff1dcdad93983578b0c9afd
|
4
|
+
data.tar.gz: 7e02ed8d5d55c4bd6dc9033f0740bd6a5fc85cc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 87eb291f74fcc40fff6ccd80567e23e0556d1fee28657a22a089293024459023abe89a7bbeb31db24b380bee9da7095146377346c9a8524fab0f9dc9b3878d00
|
7
|
+
data.tar.gz: 761762367fca098c24fe872708b621c490075bf0d7c7a57cc07f209555c190b178890c6973a34a6b135d4baafbbfbf92b845fc5886b2cbf328b2d98535f1e92b
|
data/CHANGELOG.md
CHANGED
data/lib/utf8-cleaner/middleware.rb
CHANGED
@@ -34,14 +34,14 @@ module UTF8Cleaner
|
|
34
34
|
def sanitize_env_keys(env)
|
35
35
|
SANITIZE_ENV_KEYS.each do |key|
|
36
36
|
next unless value = env[key]
|
37
|
-
env[key] =
|
37
|
+
env[key] = cleaned_string(value)
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
41
|
def sanitize_env_rack_input(env)
|
42
42
|
case env['CONTENT_TYPE']
|
43
43
|
when 'application/x-www-form-urlencoded'
|
44
|
-
cleaned_value =
|
44
|
+
cleaned_value = cleaned_string(env['rack.input'].read)
|
45
45
|
env['rack.input'] = StringIO.new(cleaned_value) if cleaned_value
|
46
46
|
env['rack.input'].rewind
|
47
47
|
when 'multipart/form-data'
|
@@ -51,8 +51,8 @@ module UTF8Cleaner
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
def
|
55
|
-
value = tidy_bytes(value)
|
54
|
+
def cleaned_string(value)
|
55
|
+
value = tidy_bytes(value) unless value.ascii_only?
|
56
56
|
value = URIString.new(value).cleaned if value.include?('%')
|
57
57
|
value
|
58
58
|
end
|
data/lib/utf8-cleaner/uri_string.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
module UTF8Cleaner
|
2
|
+
# Cleans invalid %-encodings from URI-encoded strings.
|
2
3
|
class URIString
|
3
4
|
attr_accessor :data
|
4
5
|
|
6
|
+
HEX_CHARS = '0-9a-fA-F'
|
7
|
+
HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
|
8
|
+
INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
|
9
|
+
|
5
10
|
def initialize(data)
|
6
11
|
self.data = data
|
7
12
|
end
|
@@ -16,11 +21,6 @@ module UTF8Cleaner
|
|
16
21
|
|
17
22
|
def valid?
|
18
23
|
valid_uri_encoded_utf8(data)
|
19
|
-
rescue ArgumentError => e
|
20
|
-
if e.message =~ /invalid byte sequence/
|
21
|
-
return false
|
22
|
-
end
|
23
|
-
raise e
|
24
24
|
end
|
25
25
|
|
26
26
|
private
|
@@ -38,6 +38,19 @@ module UTF8Cleaner
|
|
38
38
|
# indicates by this %. (We'll change this later for multibyte characters.)
|
39
39
|
skip_next = 2
|
40
40
|
|
41
|
+
# If the next character is not a hex char, drop the percent and it
|
42
|
+
unless data[index + 1] =~ HEX_CHARS_REGEX
|
43
|
+
index += 2
|
44
|
+
next
|
45
|
+
end
|
46
|
+
|
47
|
+
# If the character after that is not a hex char, drop the percent and
|
48
|
+
# both of the following chars.
|
49
|
+
unless data[index + 2] =~ HEX_CHARS_REGEX
|
50
|
+
index += 3
|
51
|
+
next
|
52
|
+
end
|
53
|
+
|
41
54
|
# How long is this character?
|
42
55
|
first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
|
43
56
|
bytes = utf8_char_length_in_bytes(first_byte)
|
@@ -72,7 +85,13 @@ module UTF8Cleaner
|
|
72
85
|
end
|
73
86
|
|
74
87
|
def valid_uri_encoded_utf8(string)
|
75
|
-
URI.decode(string).force_encoding('UTF-8').valid_encoding?
|
88
|
+
URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
|
89
|
+
string !~ INVALID_PERCENT_ENCODING_REGEX
|
90
|
+
rescue ArgumentError => e
|
91
|
+
if e.message =~ /invalid byte sequence/
|
92
|
+
return false
|
93
|
+
end
|
94
|
+
raise e
|
76
95
|
end
|
77
96
|
|
78
97
|
# Grab the next num_bytes URI-encoded bytes from the raw character array.
|
@@ -110,4 +129,4 @@ module UTF8Cleaner
|
|
110
129
|
end
|
111
130
|
|
112
131
|
end
|
113
|
-
end
|
132
|
+
end
|
data/lib/utf8-cleaner/version.rb
CHANGED
data/spec/middleware_spec.rb
CHANGED
@@ -13,10 +13,11 @@ module UTF8Cleaner
|
|
13
13
|
'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
|
14
14
|
'QUERY_STRING' => 'foo=bar%FF',
|
15
15
|
'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
|
16
|
-
'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0",
|
16
|
+
'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93",
|
17
17
|
'REQUEST_URI' => '%C3%89%E2%9C%93',
|
18
18
|
'rack.input' => StringIO.new("foo=%FFbar%F8"),
|
19
|
-
'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
|
19
|
+
'CONTENT_TYPE' => 'application/x-www-form-urlencoded',
|
20
|
+
'HTTP_COOKIE' => nil
|
20
21
|
}
|
21
22
|
end
|
22
23
|
|
@@ -27,7 +28,7 @@ module UTF8Cleaner
|
|
27
28
|
end
|
28
29
|
|
29
30
|
describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do
|
30
|
-
it { expect(new_env['HTTP_USER_AGENT']).to eq(
|
31
|
+
it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") }
|
31
32
|
end
|
32
33
|
|
33
34
|
describe "leaves all valid characters untouched" do
|
@@ -74,5 +75,14 @@ module UTF8Cleaner
|
|
74
75
|
expect(new_env['PATH_INFO']).to eq('/this/is/safe')
|
75
76
|
end
|
76
77
|
end
|
78
|
+
|
79
|
+
# Ensure that all cleaned values parse cleanly.
|
80
|
+
# E.g. make sure Rack/Rails won't choke on them
|
81
|
+
after do
|
82
|
+
cleaned = new_env
|
83
|
+
env.keys.reject{|key| key == 'rack.input'}.each do |key|
|
84
|
+
URI.decode_www_form_component(cleaned[key]) if cleaned[key]
|
85
|
+
end
|
86
|
+
end
|
77
87
|
end
|
78
|
-
end
|
88
|
+
end
|
data/spec/uri_string_spec.rb
CHANGED
@@ -9,6 +9,9 @@ module UTF8Cleaner
|
|
9
9
|
let(:multibyte_string) { URIString.new('%E2%9C%93') }
|
10
10
|
let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
|
11
11
|
# foo/ bar. / baz& ; √ baz
|
12
|
+
let(:no_byte_at_all) { URIString.new('%') }
|
13
|
+
let(:not_even_hex_chars1) { URIString.new('%x') }
|
14
|
+
let(:not_even_hex_chars2) { URIString.new('%0zhey') }
|
12
15
|
|
13
16
|
describe '#new' do
|
14
17
|
it { expect(encoded_string).to be_a(URIString) }
|
@@ -20,6 +23,9 @@ module UTF8Cleaner
|
|
20
23
|
it { expect(encoded_string.cleaned).to eq('%26') }
|
21
24
|
it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') }
|
22
25
|
it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
|
26
|
+
it { expect(no_byte_at_all.cleaned).to eq('') }
|
27
|
+
it { expect(not_even_hex_chars1.cleaned).to eq('') }
|
28
|
+
it { expect(not_even_hex_chars2.cleaned).to eq('hey') }
|
23
29
|
end
|
24
30
|
|
25
31
|
describe '#valid?' do
|
@@ -29,8 +35,11 @@ module UTF8Cleaner
|
|
29
35
|
|
30
36
|
it { expect(invalid_string).to_not be_valid }
|
31
37
|
it { expect(complex_invalid_string).to_not be_valid }
|
38
|
+
it { expect(no_byte_at_all).to_not be_valid }
|
39
|
+
it { expect(not_even_hex_chars1).to_not be_valid }
|
40
|
+
it { expect(not_even_hex_chars2).to_not be_valid }
|
32
41
|
end
|
33
42
|
|
34
43
|
end
|
35
44
|
|
36
|
-
end
|
45
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8-cleaner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leon Miller-Out
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -137,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
137
137
|
version: '0'
|
138
138
|
requirements: []
|
139
139
|
rubyforge_project:
|
140
|
-
rubygems_version: 2.
|
140
|
+
rubygems_version: 2.4.5.1
|
141
141
|
signing_key:
|
142
142
|
specification_version: 4
|
143
143
|
summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
|