RubyGems - utf8-cleaner - Versions diffs - 0.0.4 → 0.0.6 - Mend

utf8-cleaner 0.0.4 → 0.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

checksums.yaml +7 -0
data/.travis.yml +4 -0
data/Guardfile +9 -0
data/README.md +2 -0
data/Rakefile +4 -0
data/lib/utf8-cleaner.rb +1 -0
data/lib/utf8-cleaner/middleware.rb +13 -32
data/lib/utf8-cleaner/uri_string.rb +112 -0
data/lib/utf8-cleaner/version.rb +1 -1
data/spec/middleware_spec.rb +9 -16
data/spec/spec_helper.rb +3 -0
data/spec/uri_string_spec.rb +45 -0
data/utf8-cleaner.gemspec +3 -0
metadata +56 -13

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: b6e265c0aefe9f9c6fa6082b86f21c81a92400b6
+  data.tar.gz: 12dd12dceef125122d2877dbc9e9e597055ddfac
+SHA512:
+  metadata.gz: b794d47e0c9460ef5ec9eadc9340f705a491b0c48370efb82310d2bf21ff7713d655a4fdbb312c4788913879476be5a4c8823e8f494d8a2fe7376ecc4dd58a6c
+  data.tar.gz: 0bf1fb1a6d23600a3f5ff4ae7d1f62c7fded5a0d7d0233ebc85103b12c51a849a7ef83708a21568a28bd005e8ea1199ffff6b0d05b0c186a8889624e190f5e19

data/.travis.yml ADDED Viewed

@@ -0,0 +1,4 @@
+language: ruby
+rvm:
+  - 1.9.3
+  - 2.0.0

data/Guardfile ADDED Viewed

@@ -0,0 +1,9 @@
+# A sample Guardfile
+# More info at https://github.com/guard/guard#readme
+guard :rspec do
+  watch(%r{^spec/.+_spec\.rb$})
+  watch(%r{^lib/utf8-cleaner/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
+  watch('spec/spec_helper.rb') { "spec" }
+end

data/README.md CHANGED Viewed

@@ -1,5 +1,7 @@
 # UTF8Cleaner
+[<img src="https://secure.travis-ci.org/singlebrook/utf8-cleaner.png" />](http://travis-ci.org/singlebrook/utf8-cleaner)
 Removes invalid UTF-8 characters from the environment so that your app doesn't choke
 on them. This prevents errors like "invalid byte sequence in UTF-8".

data/Rakefile CHANGED Viewed

@@ -1 +1,5 @@
 require "bundler/gem_tasks"
+task :default do
+  sh "rspec spec"
+end

data/lib/utf8-cleaner.rb CHANGED Viewed

@@ -1,3 +1,4 @@
 require "utf8-cleaner/version"
 require "utf8-cleaner/middleware"
+require "utf8-cleaner/uri_string"
 require "utf8-cleaner/railtie" if defined? Rails

data/lib/utf8-cleaner/middleware.rb CHANGED Viewed

@@ -7,6 +7,7 @@ module UTF8Cleaner
      "QUERY_STRING",
      "REQUEST_PATH",
      "REQUEST_URI",
+     "HTTP_COOKIE"
     ]
     def initialize(app)
@@ -19,42 +20,22 @@ module UTF8Cleaner
     private
-    def sanitize_env(env)
-     SANITIZE_ENV_KEYS.each do |key|
-       next unless value = env[key]
-       value = sanitize_string(URI.decode(value))
-       env[key] = URI.encode(value)
-     end
-     ["HTTP_COOKIE"].each do |key|
-       next unless value = env[key]
-       fixed = sanitize_string(value)
-       env[key] = fixed if fixed
-     end
-     env
-    end
-    def sanitize_string(string)
-      return string unless string.is_a? String
-      # Try it as UTF-8 directly
-      cleaned = string.dup.force_encoding('UTF-8')
-      if cleaned.valid_encoding?
-        cleaned
-      else
-        utf8clean(string)
-      end
+    def is_valid_utf8(string)
+      utf8 = string.dup.force_encoding('UTF-8')
+      string == utf8 && utf8.valid_encoding?
     rescue EncodingError
-      utf8clean(string)
+      false
     end
-    def utf8clean(string)
-      # Force it to UTF-8, throwing out invalid bits
-      if RUBY_VERSION >= "1.9.3"
-        # These converters don't exist in 1.9.2
-        string.encode('UTF-16', 'UTF-8', :invalid => :replace, :replace => '').encode('UTF-8', 'UTF-16')
-      else
-        string.chars.select{|i| i.valid_encoding?}.join
+    def sanitize_env(env)
+      SANITIZE_ENV_KEYS.each do |key|
+        next unless value = env[key]
+        if value.include?('%')
+          env[key] = URIString.new(value).cleaned
+        end
       end
+      env
     end
   end
 end

data/lib/utf8-cleaner/uri_string.rb ADDED Viewed

@@ -0,0 +1,112 @@
module UTF8Cleaner
  # Wraps a URI-encoded (percent-escaped) string and can return a copy in which
  # every escape sequence that does not decode to valid UTF-8 has been removed.
  # Characters that are not part of a percent-escape are passed through as-is.
  #
  #   URIString.new('foo%FFbar%E2%9C%93').cleaned  # => "foobar%E2%9C%93"
  class URIString
    # Characters accepted as hexadecimal digits inside a percent-escape.
    HEX_DIGITS = '0123456789ABCDEFabcdef'.freeze

    # One well-formed percent-escape, e.g. "%E2" or "%2f".
    ESCAPED_BYTE = /%[0-9A-Fa-f]{2}/

    # The raw (still URI-encoded) string.
    attr_accessor :data

    # data - the URI-encoded String to inspect/clean.
    def initialize(data)
      self.data = data
    end

    # Returns the original string when it is already valid; otherwise returns a
    # new String with the invalid percent-escapes dropped.
    def cleaned
      valid? ? data : encoded_char_array.join
    end

    # True when the string contains at least one percent sign.
    def encoded?
      data.include?('%')
    end

    # True when the string, once percent-decoded, is valid UTF-8.
    def valid?
      valid_uri_encoded_utf8(data)
    end

    private

    # Returns an array of the pieces of +data+ that survive cleaning: plain
    # characters and complete, valid URI-encoded UTF-8 characters
    # (e.g. "%26" or "%E2%9C%93"). Joining the array yields the cleaned string.
    def encoded_char_array
      char_array = []
      index = 0

      # A manual index is used because the stride varies: 1 for a plain
      # character, 3 for a rejected escape, 3 * n for an accepted n-byte character.
      while index < data.length
        unless escaped_byte_at?(index)
          # Plain character, or a stray "%" that is not followed by two hex
          # digits (including a "%" at the very end of the string). Percent-
          # decoding leaves these untouched, so they are copied verbatim.
          char_array << data[index]
          index += 1
          next
        end

        # The first escaped byte tells us how many bytes the character claims.
        first_byte = '0x' + data[index + 1, 2].upcase
        bytes = utf8_char_length_in_bytes(first_byte)
        encoded_bytes = next_n_bytes_from(index, bytes)

        if encoded_bytes.length == bytes && valid_uri_encoded_utf8(encoded_bytes.join)
          # A complete, valid character: keep it and step over all of its bytes.
          char_array << encoded_bytes.join
          index += bytes * 3
        else
          # Invalid or truncated: drop only this one escaped byte and re-examine
          # whatever follows it on the next iteration.
          index += 3
        end
      end

      char_array
    end

    # True when +data+ has a "%" at +index+ followed by two hexadecimal digits.
    def escaped_byte_at?(index)
      data[index] == '%' && hex_digit?(data[index + 1]) && hex_digit?(data[index + 2])
    end

    # True when +char+ is a single hexadecimal digit. +nil+ (past the end of the
    # string) is not a digit.
    def hex_digit?(char)
      !char.nil? && HEX_DIGITS.include?(char)
    end

    # Percent-decodes +string+ and reports whether the resulting bytes form
    # valid UTF-8. Decoding is done on a binary copy so that the decoded bytes
    # can be spliced in before the result is interpreted as UTF-8. Malformed
    # escapes (e.g. "%zz") are left as literal text.
    #
    # This does not rely on URI.decode, which was removed in Ruby 3.0.
    def valid_uri_encoded_utf8(string)
      decoded = string.dup.force_encoding('BINARY').gsub(ESCAPED_BYTE) do |escape|
        escape[1, 2].hex.chr
      end
      decoded.force_encoding('UTF-8').valid_encoding?
    end

    # Grab the next num_bytes URI-encoded bytes from the raw string, starting at
    # index. Returns an array like ['%E2', '%9C', '%93'], or an empty array when
    # the string is too short or an expected percent sign is missing.
    def next_n_bytes_from(index, num_bytes)
      return [] if data.length < index + (3 * num_bytes)

      encoded_bytes = []
      num_bytes.times do |n|
        pct_index = index + (3 * n)
        # An expected percent sign was missing. The whole character is invalid.
        return [] unless data[pct_index] == '%'

        encoded_bytes << data[pct_index, 3]
      end
      encoded_bytes
    end

    # Number of bytes a UTF-8 character occupies, judged by its first byte:
    #
    #   below 0xC0     -> 1 (ASCII, or a stray continuation byte, which is
    #                        later rejected by the validity check)
    #   0xC0 .. 0xDF   -> 2
    #   0xE0 .. 0xEF   -> 3
    #   0xF0 .. 0xFF   -> 4
    #
    # first_byte is a string like "0x13".
    def utf8_char_length_in_bytes(first_byte)
      value = first_byte.hex
      if value < 0xC0
        1
      elsif value < 0xE0
        2
      elsif value < 0xF0
        3
      else
        4
      end
    end
  end
end

data/lib/utf8-cleaner/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module UTF8Cleaner
-  VERSION = "0.0.4"
+  VERSION = "0.0.6"
 end

data/spec/middleware_spec.rb CHANGED Viewed

@@ -1,13 +1,12 @@
-# -*- encoding: utf-8 -*-
 require 'spec_helper'
-describe 'UTF8Cleaner::Middleware' do
+describe UTF8Cleaner::Middleware do
   let :env do
     {
-      'PATH_INFO' => 'foo/bar%2e%2fbaz',
+      'PATH_INFO' => 'foo/%FFbar%2e%2fbaz%26%3B',
       'QUERY_STRING' => 'foo=bar%FF',
       'HTTP_REFERER' => 'http://example.com/blog+Result:+%ED%E5+%ED%E0%F8%EB%EE%F1%FC+%F4%EE%F0%EC%FB+%E4%EB%FF+%EE%F2%EF%F0%E0%E2%EA%E8',
-      'REQUEST_URI' => '%C3%89'
+      'REQUEST_URI' => '%C3%89%E2%9C%93'
     }
   end
@@ -15,19 +14,13 @@ describe 'UTF8Cleaner::Middleware' do
     UTF8Cleaner::Middleware.new(nil).send(:sanitize_env, env)
   end
-  it "removes invalid UTF-8 sequences" do
-    new_env['QUERY_STRING'].should == 'foo=bar'
+  describe "removes invalid UTF-8 sequences" do
+    it { new_env['QUERY_STRING'].should == 'foo=bar' }
+    it { new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++' }
   end
-  it "turns valid %-escaped ASCII chars into their ASCII equivalents" do
-    new_env['PATH_INFO'].should == 'foo/bar./baz'
-  end
-  it "leaves valid %-escaped UTF-8 chars alone" do
-    new_env['REQUEST_URI'].should == '%C3%89'
-  end
-  it "handles an awful URL" do
-    new_env['HTTP_REFERER'].should == 'http://example.com/blog+Result:+++++'
+  describe "leaves all valid characters untouched" do
+    it { new_env['PATH_INFO'].should == 'foo/bar%2e%2fbaz%26%3B' }
+    it { new_env['REQUEST_URI'].should == '%C3%89%E2%9C%93' }
   end
 end

data/spec/spec_helper.rb CHANGED Viewed

@@ -2,8 +2,11 @@ require 'rubygems'
 require 'rspec/autorun'
 require 'utf8-cleaner'
+require 'uri'
 RSpec.configure do |config|
+  config.treat_symbols_as_metadata_keys_with_true_values = true
   # Run specs in random order to surface order dependencies. If you find an
   # order dependency and want to debug it, you can fix the order by providing
   # the seed, which is printed after each run.

data/spec/uri_string_spec.rb ADDED Viewed

@@ -0,0 +1,45 @@
+require 'spec_helper'
+module UTF8Cleaner
+  describe URIString do
+    let(:invalid_string)   { URIString.new('%FF') }
+    let(:ascii_string)     { URIString.new('foo') }
+    let(:encoded_string)   { URIString.new('%26') }
+    let(:multibyte_string) { URIString.new('%E2%9C%93') }
+    let(:complex_invalid_string) { URIString.new('foo/%FFbar%2e%2fbaz%26%3B%E2%9C%93%E2%9Cbaz') }
+                                                # foo/   bar.  /  baz&  ;  √              baz
+    describe '#new' do
+      it { encoded_string.should be_a URIString }
+    end
+    describe '#cleaned' do
+      it { invalid_string.cleaned.should eq('') }
+      it { ascii_string.cleaned.should eq('foo') }
+      it { encoded_string.cleaned.should eq('%26') }
+      it { multibyte_string.cleaned.should eq('%E2%9C%93') }
+      it { complex_invalid_string.cleaned.should eq('foo/bar%2e%2fbaz%26%3B%E2%9C%93baz') }
+    end
+    describe '#encoded?' do
+      it { encoded_string.should be_encoded }
+      it { invalid_string.should be_encoded }
+      it { multibyte_string.should be_encoded }
+      it { complex_invalid_string.should be_encoded }
+      it { ascii_string.should_not be_encoded }
+    end
+    describe '#valid?' do
+      it { ascii_string.should be_valid }
+      it { encoded_string.should be_valid }
+      it { multibyte_string.should be_valid }
+      it { invalid_string.should_not be_valid }
+      it { complex_invalid_string.should_not be_valid }
+    end
+  end
+end

data/utf8-cleaner.gemspec CHANGED Viewed

@@ -17,5 +17,8 @@ Gem::Specification.new do |gem|
   gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
   gem.require_paths = ["lib"]
+  gem.add_development_dependency "rake"
+  gem.add_development_dependency "guard"
+  gem.add_development_dependency "guard-rspec"
   gem.add_development_dependency "rspec"
 end

metadata CHANGED Viewed

@@ -1,30 +1,69 @@
 --- !ruby/object:Gem::Specification
 name: utf8-cleaner
 version: !ruby/object:Gem::Version
-  version: 0.0.4
-  prerelease:
+  version: 0.0.6
 platform: ruby
 authors:
 - Leon Miller-Out
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-04-23 00:00:00.000000000 Z
+date: 2013-10-16 00:00:00.000000000 Z
 dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: guard
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: guard-rspec
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - '>='
+      - !ruby/object:Gem::Version
+        version: '0'
 - !ruby/object:Gem::Dependency
   name: rspec
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
   type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
-    - - ! '>='
+    - - '>='
       - !ruby/object:Gem::Version
         version: '0'
 description: Removes invalid UTF8 characters from the URL and other env vars
@@ -35,41 +74,45 @@ extensions: []
 extra_rdoc_files: []
 files:
 - .gitignore
+- .travis.yml
 - Gemfile
+- Guardfile
 - LICENSE.txt
 - README.md
 - Rakefile
 - lib/utf8-cleaner.rb
 - lib/utf8-cleaner/middleware.rb
 - lib/utf8-cleaner/railtie.rb
+- lib/utf8-cleaner/uri_string.rb
 - lib/utf8-cleaner/version.rb
 - spec/middleware_spec.rb
 - spec/spec_helper.rb
+- spec/uri_string_spec.rb
 - utf8-cleaner.gemspec
 homepage: https://github.com/singlebrook/utf8-cleaner
 licenses: []
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
-  - - ! '>='
+  - - '>='
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.8.23
+rubygems_version: 2.0.3
 signing_key:
-specification_version: 3
+specification_version: 4
 summary: Prevent annoying error reports of "invalid byte sequence in UTF-8"
 test_files:
 - spec/middleware_spec.rb
 - spec/spec_helper.rb
+- spec/uri_string_spec.rb