utf8-cleaner 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -1
- data/lib/utf8-cleaner/middleware.rb +4 -4
- data/lib/utf8-cleaner/uri_string.rb +26 -7
- data/lib/utf8-cleaner/version.rb +1 -1
- data/spec/middleware_spec.rb +14 -4
- data/spec/uri_string_spec.rb +10 -1
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 3ef101f548a442f6dff1dcdad93983578b0c9afd
|
4
|
+
data.tar.gz: 7e02ed8d5d55c4bd6dc9033f0740bd6a5fc85cc0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 87eb291f74fcc40fff6ccd80567e23e0556d1fee28657a22a089293024459023abe89a7bbeb31db24b380bee9da7095146377346c9a8524fab0f9dc9b3878d00
|
7
|
+
data.tar.gz: 761762367fca098c24fe872708b621c490075bf0d7c7a57cc07f209555c190b178890c6973a34a6b135d4baafbbfbf92b845fc5886b2cbf328b2d98535f1e92b
|
data/CHANGELOG.md
CHANGED
data/lib/utf8-cleaner/middleware.rb
CHANGED
@@ -34,14 +34,14 @@ module UTF8Cleaner
|
|
34
34
|
def sanitize_env_keys(env)
|
35
35
|
SANITIZE_ENV_KEYS.each do |key|
|
36
36
|
next unless value = env[key]
|
37
|
-
env[key] =
|
37
|
+
env[key] = cleaned_string(value)
|
38
38
|
end
|
39
39
|
end
|
40
40
|
|
41
41
|
def sanitize_env_rack_input(env)
|
42
42
|
case env['CONTENT_TYPE']
|
43
43
|
when 'application/x-www-form-urlencoded'
|
44
|
-
cleaned_value =
|
44
|
+
cleaned_value = cleaned_string(env['rack.input'].read)
|
45
45
|
env['rack.input'] = StringIO.new(cleaned_value) if cleaned_value
|
46
46
|
env['rack.input'].rewind
|
47
47
|
when 'multipart/form-data'
|
@@ -51,8 +51,8 @@ module UTF8Cleaner
|
|
51
51
|
end
|
52
52
|
end
|
53
53
|
|
54
|
-
def
|
55
|
-
value = tidy_bytes(value)
|
54
|
+
def cleaned_string(value)
|
55
|
+
value = tidy_bytes(value) unless value.ascii_only?
|
56
56
|
value = URIString.new(value).cleaned if value.include?('%')
|
57
57
|
value
|
58
58
|
end
|
data/lib/utf8-cleaner/uri_string.rb
CHANGED
@@ -1,7 +1,12 @@
|
|
1
1
|
module UTF8Cleaner
|
2
|
+
# Cleans invalid %-encodings from URI-encoded strings.
|
2
3
|
class URIString
|
3
4
|
attr_accessor :data
|
4
5
|
|
6
|
+
HEX_CHARS = '0-9a-fA-F'
|
7
|
+
HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
|
8
|
+
INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
|
9
|
+
|
5
10
|
def initialize(data)
|
6
11
|
self.data = data
|
7
12
|
end
|
@@ -16,11 +21,6 @@ module UTF8Cleaner
|
|
16
21
|
|
17
22
|
def valid?
|
18
23
|
valid_uri_encoded_utf8(data)
|
19
|
-
rescue ArgumentError => e
|
20
|
-
if e.message =~ /invalid byte sequence/
|
21
|
-
return false
|
22
|
-
end
|
23
|
-
raise e
|
24
24
|
end
|
25
25
|
|
26
26
|
private
|
@@ -38,6 +38,19 @@ module UTF8Cleaner
|
|
38
38
|
# indicates by this %. (We'll change this later for multibyte characters.)
|
39
39
|
skip_next = 2
|
40
40
|
|
41
|
+
# If the next character is not a hex char, drop the percent and that character.
|
42
|
+
unless data[index + 1] =~ HEX_CHARS_REGEX
|
43
|
+
index += 2
|
44
|
+
next
|
45
|
+
end
|
46
|
+
|
47
|
+
# If the character after that is not a hex char, drop the percent and
|
48
|
+
# both of the following chars.
|
49
|
+
unless data[index + 2] =~ HEX_CHARS_REGEX
|
50
|
+
index += 3
|
51
|
+
next
|
52
|
+
end
|
53
|
+
|
41
54
|
# How long is this character?
|
42
55
|
first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
|
43
56
|
bytes = utf8_char_length_in_bytes(first_byte)
|
@@ -72,7 +85,13 @@ module UTF8Cleaner
|
|
72
85
|
end
|
73
86
|
|
74
87
|
def valid_uri_encoded_utf8(string)
|
75
|
-
URI.decode(string).force_encoding('UTF-8').valid_encoding?
|
88
|
+
URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
|
89
|
+
string !~ INVALID_PERCENT_ENCODING_REGEX
|
90
|
+
rescue ArgumentError => e
|
91
|
+
if e.message =~ /invalid byte sequence/
|
92
|
+
return false
|
93
|
+
end
|
94
|
+
raise e
|
76
95
|
end
|
77
96
|
|
78
97
|
# Grab the next num_bytes URI-encoded bytes from the raw character array.
|
@@ -110,4 +129,4 @@ module UTF8Cleaner
|
|
110
129
|
end
|
111
130
|
|
112
131
|
end
|
113
|
-
end
|
132
|
+
end
|
data/lib/utf8-cleaner/version.rb
CHANGED
data/spec/middleware_spec.rb
CHANGED
@@ -13,10 +13,11 @@ module UTF8Cleaner
|
|
13
13
|
'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
|
14
14
|
'QUERY_STRING' => 'foo=bar%FF',
|
15
15
|
'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
|
16
|
-
'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0",
|
16
|
+
'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93",
|
17
17
|
'REQUEST_URI' => '%C3%89%E2%9C%93',
|
18
18
|
'rack.input' => StringIO.new("foo=%FFbar%F8"),
|
19
|
-
'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
|
19
|
+
'CONTENT_TYPE' => 'application/x-www-form-urlencoded',
|
20
|
+
'HTTP_COOKIE' => nil
|
20
21
|
}
|
21
22
|
end
|
22
23
|
|
@@ -27,7 +28,7 @@ module UTF8Cleaner
|
|
27
28
|
end
|
28
29
|
|
29
30
|
describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do
|
30
|
-
it { expect(new_env['HTTP_USER_AGENT']).to eq(
|
31
|
+
it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") }
|
31
32
|
end
|
32
33
|
|
33
34
|
describe "leaves all valid characters untouched" do
|
@@ -74,5 +75,14 @@ module UTF8Cleaner
|
|
74
75
|
expect(new_env['PATH_INFO']).to eq('/this/is/safe')
|
75
76
|
end
|
76
77
|
end
|
78
|
+
|
79
|
+
# Ensure that all cleaned values parse cleanly.
|
80
|
+
# E.g. make sure Rack/Rails won't choke on them
|
81
|
+
after do
|
82
|
+
cleaned = new_env
|
83
|
+
env.keys.reject{|key| key == 'rack.input'}.each do |key|
|
84
|
+
URI.decode_www_form_component(cleaned[key]) if cleaned[key]
|
85
|
+
end
|
86
|
+
end
|
77
87
|
end
|
78
|
-
end
|
88
|
+
end
|
data/spec/uri_string_spec.rb
CHANGED
@@ -9,6 +9,9 @@ module UTF8Cleaner
|
|
9
9
|
let(:multibyte_string) { URIString.new('%E2%9C%93') }
|
10
10
|
let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
|
11
11
|
# foo/ bar. / baz& ; √ baz
|
12
|
+
let(:no_byte_at_all) { URIString.new('%') }
|
13
|
+
let(:not_even_hex_chars1) { URIString.new('%x') }
|
14
|
+
let(:not_even_hex_chars2) { URIString.new('%0zhey') }
|
12
15
|
|
13
16
|
describe '#new' do
|
14
17
|
it { expect(encoded_string).to be_a(URIString) }
|
@@ -20,6 +23,9 @@ module UTF8Cleaner
|
|
20
23
|
it { expect(encoded_string.cleaned).to eq('%26') }
|
21
24
|
it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') }
|
22
25
|
it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
|
26
|
+
it { expect(no_byte_at_all.cleaned).to eq('') }
|
27
|
+
it { expect(not_even_hex_chars1.cleaned).to eq('') }
|
28
|
+
it { expect(not_even_hex_chars2.cleaned).to eq('hey') }
|
23
29
|
end
|
24
30
|
|
25
31
|
describe '#valid?' do
|
@@ -29,8 +35,11 @@ module UTF8Cleaner
|
|
29
35
|
|
30
36
|
it { expect(invalid_string).to_not be_valid }
|
31
37
|
it { expect(complex_invalid_string).to_not be_valid }
|
38
|
+
it { expect(no_byte_at_all).to_not be_valid }
|
39
|
+
it { expect(not_even_hex_chars1).to_not be_valid }
|
40
|
+
it { expect(not_even_hex_chars2).to_not be_valid }
|
32
41
|
end
|
33
42
|
|
34
43
|
end
|
35
44
|
|
36
|
-
end
|
45
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: utf8-cleaner
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Leon Miller-Out
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-11-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -137,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
137
137
|
version: '0'
|
138
138
|
requirements: []
|
139
139
|
rubyforge_project:
|
140
|
-
rubygems_version: 2.
|
140
|
+
rubygems_version: 2.4.5.1
|
141
141
|
signing_key:
|
142
142
|
specification_version: 4
|
143
143
|
summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
|