utf8-cleaner 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d4f4b8427b2de1cca2aad8cf672164873552352c
4
- data.tar.gz: f4c6e987bd3dcdcdd459b896ca6e8768fe103ad8
3
+ metadata.gz: 3ef101f548a442f6dff1dcdad93983578b0c9afd
4
+ data.tar.gz: 7e02ed8d5d55c4bd6dc9033f0740bd6a5fc85cc0
5
5
  SHA512:
6
- metadata.gz: 5984ac142b4497cc4dfb3626f09b79a5c74c0e27ac449239876a68cf1d73adc49433b163573d037ed5c91963b7c020e9cb8653a68a75c5eb09b13cf006f8533f
7
- data.tar.gz: b9640a1a528a8873338dd08c1bf278994fec361a91c4dfc6562d01c21871f8029b48b120bc35453ff8fc7c948fe4c02df4d3da59dac68504393990d882f7c29e
6
+ metadata.gz: 87eb291f74fcc40fff6ccd80567e23e0556d1fee28657a22a089293024459023abe89a7bbeb31db24b380bee9da7095146377346c9a8524fab0f9dc9b3878d00
7
+ data.tar.gz: 761762367fca098c24fe872708b621c490075bf0d7c7a57cc07f209555c190b178890c6973a34a6b135d4baafbbfbf92b845fc5886b2cbf328b2d98535f1e92b
data/CHANGELOG.md CHANGED
@@ -1,5 +1,9 @@
1
1
  # CHANGELOG
2
2
 
3
+ ## v0.2.0
4
+
5
+ * Removes invalid %-encodings like "%x", "%0z", and "%" if not followed by two hex chars
6
+
3
7
  ## v0.1.1
4
8
 
5
9
  * Now cleans HTTP_USER_AGENT
@@ -7,4 +11,4 @@
7
11
 
8
12
  ## v0.1.0
9
13
 
10
- Broken.
14
+ Broken.
@@ -34,14 +34,14 @@ module UTF8Cleaner
34
34
  def sanitize_env_keys(env)
35
35
  SANITIZE_ENV_KEYS.each do |key|
36
36
  next unless value = env[key]
37
- env[key] = cleaned_uri_string(value)
37
+ env[key] = cleaned_string(value)
38
38
  end
39
39
  end
40
40
 
41
41
  def sanitize_env_rack_input(env)
42
42
  case env['CONTENT_TYPE']
43
43
  when 'application/x-www-form-urlencoded'
44
- cleaned_value = cleaned_uri_string(env['rack.input'].read)
44
+ cleaned_value = cleaned_string(env['rack.input'].read)
45
45
  env['rack.input'] = StringIO.new(cleaned_value) if cleaned_value
46
46
  env['rack.input'].rewind
47
47
  when 'multipart/form-data'
@@ -51,8 +51,8 @@ module UTF8Cleaner
51
51
  end
52
52
  end
53
53
 
54
- def cleaned_uri_string(value)
55
- value = tidy_bytes(value) if value && !value.ascii_only?
54
+ def cleaned_string(value)
55
+ value = tidy_bytes(value) unless value.ascii_only?
56
56
  value = URIString.new(value).cleaned if value.include?('%')
57
57
  value
58
58
  end
@@ -1,7 +1,12 @@
1
1
  module UTF8Cleaner
2
+ # Cleans invalid %-encodings from URI-encoded strings.
2
3
  class URIString
3
4
  attr_accessor :data
4
5
 
6
+ HEX_CHARS = '0-9a-fA-F'
7
+ HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
8
+ INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
9
+
5
10
  def initialize(data)
6
11
  self.data = data
7
12
  end
@@ -16,11 +21,6 @@ module UTF8Cleaner
16
21
 
17
22
  def valid?
18
23
  valid_uri_encoded_utf8(data)
19
- rescue ArgumentError => e
20
- if e.message =~ /invalid byte sequence/
21
- return false
22
- end
23
- raise e
24
24
  end
25
25
 
26
26
  private
@@ -38,6 +38,19 @@ module UTF8Cleaner
38
38
  # indicates by this %. (We'll change this later for multibyte characters.)
39
39
  skip_next = 2
40
40
 
41
+ # If the next character is not a hex char, drop both the percent and that character
42
+ unless data[index + 1] =~ HEX_CHARS_REGEX
43
+ index += 2
44
+ next
45
+ end
46
+
47
+ # If the character after that is not a hex char, drop the percent and
48
+ # both of the following chars.
49
+ unless data[index + 2] =~ HEX_CHARS_REGEX
50
+ index += 3
51
+ next
52
+ end
53
+
41
54
  # How long is this character?
42
55
  first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
43
56
  bytes = utf8_char_length_in_bytes(first_byte)
@@ -72,7 +85,13 @@ module UTF8Cleaner
72
85
  end
73
86
 
74
87
  def valid_uri_encoded_utf8(string)
75
- URI.decode(string).force_encoding('UTF-8').valid_encoding?
88
+ URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
89
+ string !~ INVALID_PERCENT_ENCODING_REGEX
90
+ rescue ArgumentError => e
91
+ if e.message =~ /invalid byte sequence/
92
+ return false
93
+ end
94
+ raise e
76
95
  end
77
96
 
78
97
  # Grab the next num_bytes URI-encoded bytes from the raw character array.
@@ -110,4 +129,4 @@ module UTF8Cleaner
110
129
  end
111
130
 
112
131
  end
113
- end
132
+ end
@@ -1,3 +1,3 @@
1
1
  module UTF8Cleaner
2
- VERSION = "0.1.1"
2
+ VERSION = "0.2.0"
3
3
  end
@@ -13,10 +13,11 @@ module UTF8Cleaner
13
13
  'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
14
14
  'QUERY_STRING' => 'foo=bar%FF',
15
15
  'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
16
- 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0",
16
+ 'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93",
17
17
  'REQUEST_URI' => '%C3%89%E2%9C%93',
18
18
  'rack.input' => StringIO.new("foo=%FFbar%F8"),
19
- 'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
19
+ 'CONTENT_TYPE' => 'application/x-www-form-urlencoded',
20
+ 'HTTP_COOKIE' => nil
20
21
  }
21
22
  end
22
23
 
@@ -27,7 +28,7 @@ module UTF8Cleaner
27
28
  end
28
29
 
29
30
  describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do
30
- it { expect(new_env['HTTP_USER_AGENT']).to eq('Android Versión/4.0') }
31
+ it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") }
31
32
  end
32
33
 
33
34
  describe "leaves all valid characters untouched" do
@@ -74,5 +75,14 @@ module UTF8Cleaner
74
75
  expect(new_env['PATH_INFO']).to eq('/this/is/safe')
75
76
  end
76
77
  end
78
+
79
+ # Ensure that all cleaned values parse cleanly.
80
+ # E.g. make sure Rack/Rails won't choke on them
81
+ after do
82
+ cleaned = new_env
83
+ env.keys.reject{|key| key == 'rack.input'}.each do |key|
84
+ URI.decode_www_form_component(cleaned[key]) if cleaned[key]
85
+ end
86
+ end
77
87
  end
78
- end
88
+ end
@@ -9,6 +9,9 @@ module UTF8Cleaner
9
9
  let(:multibyte_string) { URIString.new('%E2%9C%93') }
10
10
  let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
11
11
  # foo/ bar. / baz& ; ✓ baz
12
+ let(:no_byte_at_all) { URIString.new('%') }
13
+ let(:not_even_hex_chars1) { URIString.new('%x') }
14
+ let(:not_even_hex_chars2) { URIString.new('%0zhey') }
12
15
 
13
16
  describe '#new' do
14
17
  it { expect(encoded_string).to be_a(URIString) }
@@ -20,6 +23,9 @@ module UTF8Cleaner
20
23
  it { expect(encoded_string.cleaned).to eq('%26') }
21
24
  it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') }
22
25
  it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
26
+ it { expect(no_byte_at_all.cleaned).to eq('') }
27
+ it { expect(not_even_hex_chars1.cleaned).to eq('') }
28
+ it { expect(not_even_hex_chars2.cleaned).to eq('hey') }
23
29
  end
24
30
 
25
31
  describe '#valid?' do
@@ -29,8 +35,11 @@ module UTF8Cleaner
29
35
 
30
36
  it { expect(invalid_string).to_not be_valid }
31
37
  it { expect(complex_invalid_string).to_not be_valid }
38
+ it { expect(no_byte_at_all).to_not be_valid }
39
+ it { expect(not_even_hex_chars1).to_not be_valid }
40
+ it { expect(not_even_hex_chars2).to_not be_valid }
32
41
  end
33
42
 
34
43
  end
35
44
 
36
- end
45
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: utf8-cleaner
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.1
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Leon Miller-Out
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-09-19 00:00:00.000000000 Z
11
+ date: 2015-11-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -137,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
137
137
  version: '0'
138
138
  requirements: []
139
139
  rubyforge_project:
140
- rubygems_version: 2.2.2
140
+ rubygems_version: 2.4.5.1
141
141
  signing_key:
142
142
  specification_version: 4
143
143
  summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"