RubyGems - utf8-cleaner - Versions diffs - 0.1.1 → 0.2.0 - Mend

utf8-cleaner 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -1
data/lib/utf8-cleaner/middleware.rb +4 -4
data/lib/utf8-cleaner/uri_string.rb +26 -7
data/lib/utf8-cleaner/version.rb +1 -1
data/spec/middleware_spec.rb +14 -4
data/spec/uri_string_spec.rb +10 -1
metadata +3 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: d4f4b8427b2de1cca2aad8cf672164873552352c
-  data.tar.gz: f4c6e987bd3dcdcdd459b896ca6e8768fe103ad8
+  metadata.gz: 3ef101f548a442f6dff1dcdad93983578b0c9afd
+  data.tar.gz: 7e02ed8d5d55c4bd6dc9033f0740bd6a5fc85cc0
 SHA512:
-  metadata.gz: 5984ac142b4497cc4dfb3626f09b79a5c74c0e27ac449239876a68cf1d73adc49433b163573d037ed5c91963b7c020e9cb8653a68a75c5eb09b13cf006f8533f
-  data.tar.gz: b9640a1a528a8873338dd08c1bf278994fec361a91c4dfc6562d01c21871f8029b48b120bc35453ff8fc7c948fe4c02df4d3da59dac68504393990d882f7c29e
+  metadata.gz: 87eb291f74fcc40fff6ccd80567e23e0556d1fee28657a22a089293024459023abe89a7bbeb31db24b380bee9da7095146377346c9a8524fab0f9dc9b3878d00
+  data.tar.gz: 761762367fca098c24fe872708b621c490075bf0d7c7a57cc07f209555c190b178890c6973a34a6b135d4baafbbfbf92b845fc5886b2cbf328b2d98535f1e92b

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,9 @@
 # CHANGELOG
+## V0.2.0
+* Removes invalid %-encodings like "%x", "%0z", and "%" if not followed by two hex chars
 ## v0.1.1
 * Now cleans HTTP_USER_AGENT
@@ -7,4 +11,4 @@
 ## v0.1.0
-Broken.
+Broken.

data/lib/utf8-cleaner/middleware.rb CHANGED Viewed

@@ -34,14 +34,14 @@ module UTF8Cleaner
     def sanitize_env_keys(env)
       SANITIZE_ENV_KEYS.each do |key|
         next unless value = env[key]
-        env[key] = cleaned_uri_string(value)
+        env[key] = cleaned_string(value)
       end
     end
     def sanitize_env_rack_input(env)
       case env['CONTENT_TYPE']
       when 'application/x-www-form-urlencoded'
-        cleaned_value = cleaned_uri_string(env['rack.input'].read)
+        cleaned_value = cleaned_string(env['rack.input'].read)
         env['rack.input'] = StringIO.new(cleaned_value) if cleaned_value
         env['rack.input'].rewind
       when 'multipart/form-data'
@@ -51,8 +51,8 @@ module UTF8Cleaner
       end
     end
-    def cleaned_uri_string(value)
-      value = tidy_bytes(value) if value && !value.ascii_only?
+    def cleaned_string(value)
+      value = tidy_bytes(value) unless value.ascii_only?
       value = URIString.new(value).cleaned if value.include?('%')
       value
     end

data/lib/utf8-cleaner/uri_string.rb CHANGED Viewed

@@ -1,7 +1,12 @@
 module UTF8Cleaner
+  # Cleans invalid %-encodings from URI-encoded strings.
   class URIString
     attr_accessor :data
+    HEX_CHARS = '0-9a-fA-F'
+    HEX_CHARS_REGEX = /[#{HEX_CHARS}]/
+    INVALID_PERCENT_ENCODING_REGEX = /%(?![#{HEX_CHARS}]{2})/
     def initialize(data)
       self.data = data
     end
@@ -16,11 +21,6 @@ module UTF8Cleaner
     def valid?
       valid_uri_encoded_utf8(data)
-    rescue ArgumentError => e
-      if e.message =~ /invalid byte sequence/
-        return false
-      end
-      raise e
     end
     private
@@ -38,6 +38,19 @@ module UTF8Cleaner
           # indicates by this %. (We'll change this later for multibyte characters.)
           skip_next = 2
+          # If the next character is not a hex char, drop both the percent and that character.
+          unless data[index + 1] =~ HEX_CHARS_REGEX
+            index += 2
+            next
+          end
+          # If the character after that is not a hex char, drop the percent and
+          # both of the following chars.
+          unless data[index + 2] =~ HEX_CHARS_REGEX
+            index += 3
+            next
+          end
           # How long is this character?
           first_byte = '0x' + (data[index + 1] + data[index + 2]).upcase
           bytes = utf8_char_length_in_bytes(first_byte)
@@ -72,7 +85,13 @@ module UTF8Cleaner
     end
     def valid_uri_encoded_utf8(string)
-      URI.decode(string).force_encoding('UTF-8').valid_encoding?
+      URI.decode(string).force_encoding('UTF-8').valid_encoding? &&
+        string !~ INVALID_PERCENT_ENCODING_REGEX
+    rescue ArgumentError => e
+      if e.message =~ /invalid byte sequence/
+        return false
+      end
+      raise e
     end
     # Grab the next num_bytes URI-encoded bytes from the raw character array.
@@ -110,4 +129,4 @@ module UTF8Cleaner
     end
   end
-end
+end

data/lib/utf8-cleaner/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module UTF8Cleaner
-  VERSION = "0.1.1"
+  VERSION = "0.2.0"
 end

data/spec/middleware_spec.rb CHANGED Viewed

@@ -13,10 +13,11 @@ module UTF8Cleaner
           'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
           'QUERY_STRING' => 'foo=bar%FF',
           'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
-          'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0",
+          'HTTP_USER_AGENT' => "Android Versi\xF3n/4.0\x93",
           'REQUEST_URI' => '%C3%89%E2%9C%93',
           'rack.input' => StringIO.new("foo=%FFbar%F8"),
-          'CONTENT_TYPE' => 'application/x-www-form-urlencoded'
+          'CONTENT_TYPE' => 'application/x-www-form-urlencoded',
+          'HTTP_COOKIE' => nil
         }
       end
@@ -27,7 +28,7 @@ module UTF8Cleaner
       end
       describe 'replaces \x-encoded characters from the ISO-8859-1 and CP1252 code pages with their UTF-8 equivalents' do
-        it { expect(new_env['HTTP_USER_AGENT']).to eq('Android Versión/4.0') }
+        it { expect(new_env['HTTP_USER_AGENT']).to eq("Android Versi\u00F3n/4.0\u201C") }
       end
       describe "leaves all valid characters untouched" do
@@ -74,5 +75,14 @@ module UTF8Cleaner
         expect(new_env['PATH_INFO']).to eq('/this/is/safe')
       end
     end
+    # Ensure that all cleaned values parse cleanly.
+    # E.g. make sure Rack/Rails won't choke on them
+    after do
+      cleaned = new_env
+      env.keys.reject{|key| key == 'rack.input'}.each do |key|
+        URI.decode_www_form_component(cleaned[key]) if cleaned[key]
+      end
+    end
   end
-end
+end

data/spec/uri_string_spec.rb CHANGED Viewed

@@ -9,6 +9,9 @@ module UTF8Cleaner
     let(:multibyte_string) { URIString.new('%E2%9C%93') }
     let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
                                                 # foo/   bar.  /  baz&  ;  ✓              baz
+    let(:no_byte_at_all)      { URIString.new('%') }
+    let(:not_even_hex_chars1) { URIString.new('%x') }
+    let(:not_even_hex_chars2) { URIString.new('%0zhey') }
     describe '#new' do
       it { expect(encoded_string).to be_a(URIString) }
@@ -20,6 +23,9 @@ module UTF8Cleaner
       it { expect(encoded_string.cleaned).to eq('%26') }
       it { expect(multibyte_string.cleaned).to eq('%E2%9C%93') }
       it { expect(complex_invalid_string.cleaned).to eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
+      it { expect(no_byte_at_all.cleaned).to eq('') }
+      it { expect(not_even_hex_chars1.cleaned).to eq('') }
+      it { expect(not_even_hex_chars2.cleaned).to eq('hey') }
     end
     describe '#valid?' do
@@ -29,8 +35,11 @@ module UTF8Cleaner
       it { expect(invalid_string).to_not be_valid }
       it { expect(complex_invalid_string).to_not be_valid }
+      it { expect(no_byte_at_all).to_not be_valid }
+      it { expect(not_even_hex_chars1).to_not be_valid }
+      it { expect(not_even_hex_chars2).to_not be_valid }
     end
   end
-end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: utf8-cleaner
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - Leon Miller-Out
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2015-09-19 00:00:00.000000000 Z
+date: 2015-11-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: activesupport
@@ -137,7 +137,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.2.2
+rubygems_version: 2.4.5.1
 signing_key:
 specification_version: 4
 summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"