utf8-cleaner 0.1.1 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d4f4b8427b2de1cca2aad8cf672164873552352c
4
- data.tar.gz: f4c6e987bd3dcdcdd459b896ca6e8768fe103ad8
3
+ metadata.gz: 3ef101f548a442f6dff1dcdad93983578b0c9afd
4
+ data.tar.gz: 7e02ed8d5d55c4bd6dc9033f0740bd6a5fc85cc0
5
5
  SHA512:
6
- metadata.gz: 5984ac142b4497cc4dfb3626f09b79a5c74c0e27ac449239876a68cf1d73adc49433b163573d037ed5c91963b7c020e9cb8653a68a75c5eb09b13cf006f8533f
7
- data.tar.gz: b9640a1a528a8873338dd08c1bf278994fec361a91c4dfc6562d01c21871f8029b48b120bc35453ff8fc7c948fe4c02df4d3da59dac68504393990d882f7c29e
6
+ metadata.gz: 87eb291f74fcc40fff6ccd80567e23e0556d1fee28657a22a089293024459023abe89a7bbeb31db24b380bee9da7095146377346c9a8524fab0f9dc9b3878d00
7
+ data.tar.gz: 761762367fca098c24fe872708b621c490075bf0d7c7a57cc07f209555c190b178890c6973a34a6b135d4baafbbfbf92b845fc5886b2cbf328b2d98535f1e92b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v0.2.0
4
+
5
+ * Removes invalid %-encodings, i.e. any "%" that is not followed by two hex chars (such as "%x", "%0z", or a bare "%")
6
+
3
7
  ## v0.1.1
4
8
 
5
9
  * Now cleans HTTP_USER_AGENT
@@ -7,4 +11,4 @@
7
11
 
8
12
  ## v0.1.0
9
13
 
10
- Broken.
14
+ Broken.
@@ -34,14 +34,14 @@ module UTF8Cleaner
34
34
  def sanitize_env_keys(env)
35
35
  SANITIZE_ENV_KEYS.each do |key|
36
36
  next unless value = env[key]
37
- env[key] = cleaned_uri_string(value)
37
+ env[key] = cleaned_string(value)
38
38
  end
39
39
  end
40
40
 
41
41
  def sanitize_env_rack_input(env)
42
42
  case env['CONTENT_TYPE']
43
43
  when 'application/x-www-form-urlencoded'
44
- cleaned_value = cleaned_uri_string(env['rack.input'].read)
44
+ cleaned_value = cleaned_string(env['rack.input'].read)
45
45
  env['rack.input'] = StringIO.new(cleaned_value) if cleaned_value
46
46
  env['rack.input'].rewind
47
47
  when 'multipart/form-data'
@@ -51,8 +51,8 @@ module UTF8Cleaner
51
51
  end
52
52
  end
53
53
 
54
- def cleaned_uri_string(value)
55
- value = tidy_bytes(value) if value && !value.ascii_only?
54
+ def cleaned_string(value)
55
+ value = tidy_bytes(value) unless value.ascii_only?
56
56
  value = URIString.new(value).cleaned if value.include?('%')
57
57
  value
58
58
  end
@@ -1,7 +1,12 @@
1
1
  module UTF8Cleaner
2
+ # Cleans invalid %-encodings from URI-encoded strings.
2
3
  class URIString
3
4
  attr_accessor :data
4
5
 
6
+ HEX_CHARS = '0-9a-fA-F'
7
+ HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
8
+ INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
9
+
5
10
  def initialize(data)
6
11
  self.data = data
7
12
  end
@@ -16,11 +21,6 @@ module UTF8Cleaner
16
21
 
17
22
  def valid?
18
23
  valid_uri_encoded_utf8(data)
19
- rescue ArgumentError => e
20
- if e.message =~ /invalid byte sequence/
21
- return false
22
- end
23
- raise e
24
24
  end
25
25
 
26
26
  private
@@ -38,6 +38,19 @@ module UTF8Cleaner
38
38
  # indicates by this %. (We'll change this later for multibyte characters.)
39
39
  skip_next = 2
40
40
 
41
+ # If the next character is not a hex char, drop both the percent and that character.
42
+ unless data[index + 1] =~ HEX_CHARS_REGEX
43
+ index += 2
44
+ next
45
+ end
46
+
47
+ # If the character after that is not a hex char, drop the percent and
48
+ # both of the following chars.
49
+ unless data[index + 2] =~ HEX_CHARS_REGEX
50
+ index += 3
51
+ next
52
+ end
53
+
41
54
  # How long is this character?
42
55
  first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
43
56
  bytes = utf8_char_length_in_bytes(first_byte)
@@ -72,7 +85,13 @@ module UTF8Cleaner
72
85
  end
73
86
 
74
87
  def valid_uri_encoded_utf8(string)
75
- URI.decode(string).force_encoding('UTF-8').valid_encoding?
88
+ URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
89
+ string !~ INVALID_PERCENT_ENCODING_REGEX
90
+ rescue ArgumentError => e
91
+ if e.message =~ /invalid byte sequence/
92
+ return false
93
+ end
94
+ raise e
76
95
  end
77
96
 
78
97
  # Grab the next num_bytes URI-encoded bytes from the raw character array.
@@ -110,4 +129,4 @@ module UTF8Cleaner
110
129
  end
111
130
 
112
131
  end
113
- end
132
+ end
@@ -1,3 +1,3 @@
1
1
  module UTF8Cleaner
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -13,10 +13,11 @@ module UTF8Cleaner
13
13
  'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
14
14
  'QUERY_STRING' => 'foo=bar%FF',
15
15
  'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
16
- 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0",
16
+ 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93",
17
17
  'REQUEST_URI' => '%C3%89%E2%9C%93',
18
18
  'rack.input' => StringIO.new("foo=%FFbar%F8"),
19
- 'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
19
+ 'CONTENT_TYPE' => 'application/x-www-form-urlencoded',
20
+ 'HTTP_COOKIE' => nil
20
21
  }
21
22
  end
22
23
 
@@ -27,7 +28,7 @@ module UTF8Cleaner
27
28
  end
28
29
 
29
30
  describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do
30
- it { expect(new_env['HTTP_USER_AGENT']).to eq('Android Versión/4.0') }
31
+ it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") }
31
32
  end
32
33
 
33
34
  describe "leaves all valid characters untouched" do
@@ -74,5 +75,14 @@ module UTF8Cleaner
74
75
  expect(new_env['PATH_INFO']).to eq('/this/is/safe')
75
76
  end
76
77
  end
78
+
79
+ # Ensure that all cleaned values parse cleanly.
80
+ # E.g. make sure Rack/Rails won't choke on them
81
+ after do
82
+ cleaned = new_env
83
+ env.keys.reject{|key| key == 'rack.input'}.each do |key|
84
+ URI.decode_www_form_component(cleaned[key]) if cleaned[key]
85
+ end
86
+ end
77
87
  end
78
- end
88
+ end
@@ -9,6 +9,9 @@ module UTF8Cleaner
9
9
  let(:multibyte_string) { URIString.new('%E2%9C%93') }
10
10
  let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
11
11
  # foo/ bar. / baz& ; ✓ baz
12
+ let(:no_byte_at_all) { URIString.new('%') }
13
+ let(:not_even_hex_chars1) { URIString.new('%x') }
14
+ let(:not_even_hex_chars2) { URIString.new('%0zhey') }
12
15
 
13
16
  describe '#new' do
14
17
  it { expect(encoded_string).to be_a(URIString) }
@@ -20,6 +23,9 @@ module UTF8Cleaner
20
23
  it { expect(encoded_string.cleaned).to eq('%26') }
21
24
  it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') }
22
25
  it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
26
+ it { expect(no_byte_at_all.cleaned).to eq('') }
27
+ it { expect(not_even_hex_chars1.cleaned).to eq('') }
28
+ it { expect(not_even_hex_chars2.cleaned).to eq('hey') }
23
29
  end
24
30
 
25
31
  describe '#valid?' do
@@ -29,8 +35,11 @@ module UTF8Cleaner
29
35
 
30
36
  it { expect(invalid_string).to_not be_valid }
31
37
  it { expect(complex_invalid_string).to_not be_valid }
38
+ it { expect(no_byte_at_all).to_not be_valid }
39
+ it { expect(not_even_hex_chars1).to_not be_valid }
40
+ it { expect(not_even_hex_chars2).to_not be_valid }
32
41
  end
33
42
 
34
43
  end
35
44
 
36
- end
45
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8-cleaner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leon Miller-Out
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-19 00:00:00.000000000 Z
11
+ date: 2015-11-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -137,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
137
137
  version: '0'
138
138
  requirements: []
139
139
  rubyforge_project:
140
- rubygems_version: 2.2.2
140
+ rubygems_version: 2.4.5.1
141
141
  signing_key:
142
142
  specification_version: 4
143
143
  summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"