RubyGems - guess_html_encoding - Versions diffs - 0.0.10 → 0.0.11 - Mend

guess_html_encoding 0.0.10 → 0.0.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/lib/guess_html_encoding.rb +281 -5
data/lib/guess_html_encoding/version.rb +1 -1
data/spec/guess_html_encoding_spec.rb +90 -0
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: f24b82e186d3e1a58cd2061c7cb1eef2f5b5d1b0
-  data.tar.gz: cbfd0284000e074ef621763a36ca2be60cbed218
+  metadata.gz: c4a5a5d5cd40292d68650b9dd471adb932f424cd
+  data.tar.gz: b6f65abc65007e6cf570520eb78cf24adeb6d6bf
 SHA512:
-  metadata.gz: 4d68030d7c0af216faa1e1dc029c65b6557287a8349aa89ec2a7a98833de4178a838693d2bf3e866b966edd597951e0d31c26f3a4c33daab30c7afa93692b7a5
-  data.tar.gz: e2ddc685bae62c4cc6e962dd79a4f69863aef512b556957b2ab91113b492c3b07d7315f880c57179c8a9aea30d0279823983a21bbdcc6cbbc697cfc9ef2ada30
+  metadata.gz: 35d9b1b1b2b42b4b17bcaa49cf908143ba566a214f2637f2b335ba0157191ad52d20b632c9d4ee5de82aeecc088ba3bd6c6023b46d994af69c5c1680353deecd
+  data.tar.gz: 91534f086571eac16bd248bb22189e7a88af4b66438dd1ad690c98f4ed8d2122a6572231c6573e87cd7fecd84af0b38e1c1c52764456ddc20708daa848a96593

data/Gemfile.lock CHANGED

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    guess_html_encoding (0.0.9)
+    guess_html_encoding (0.0.11)
 GEM
   remote: http://rubygems.org/

data/lib/guess_html_encoding.rb CHANGED

@@ -19,11 +19,9 @@ module GuessHtmlEncoding
     end
     if out.nil? || out.empty? || !encoding_loaded?(out)
-      if html =~ /<meta[^>]*HTTP-EQUIV=["']?Content-Type["']?[^>]*content=["']([^'"]*)["']/i && $1 =~ /charset=([\w\d-]+);?/i
-        out = $1
-      elsif html =~ /<meta\s+charset=["']([\w\d-]+)?/i
-        out = $1
-      end
+      out = HTMLScanner.new(html[0,2500]).encoding || out
       out.upcase! unless out.nil?
     end
@@ -55,4 +53,282 @@ module GuessHtmlEncoding
   def self.encoding_loaded?(encoding)
     !!Encoding.find(encoding) rescue nil
   end
+  class HTMLScanner
+    def initialize(html)
+      @html = html
+    end
+    # Returns the encoding sniffed from the content of an HTML page, as determined using an
+    # implementation of the algorithm to 'prescan a byte stream to determine its encoding', as
+    # specified by the HTML specification:
+    # http://www.w3.org/html/wg/drafts/html/master/syntax.html#prescan-a-byte-stream-to-determine-its-encoding
+    def encoding
+      position = 0
+      charset = nil
+      length = @html.length
+      done = false
+      while position < length && !done
+        # First look for a standard HTML comment (ie <!-- blah -->)
+        if @html[position, 4] == '<!--'
+          position += 2
+          position += (@html[position, length].index('-->') || length)
+        # Then look for the start of a meta tag
+        elsif  @html[position, 6] =~ /\A\<meta[\s\/]/i
+          charset, position_increment = charset_from_meta(@html[position + 5, length])
+          break if charset
+          position += position_increment
+        # Then look for <! or </ or <?
+        elsif @html[position, 2] =~ /\A\<[\!\/\?]/
+          # Advance position to the first > that appears next in string, or end
+          position += @html[position, length].index('>') || length
+        else
+          # Do nothing. (This is just here to make the algorithm easier to follow)
+        end
+        # Advance position to next character
+        position += 1
+      end
+      charset
+    end
+    private
+    # Given a string which starts with the space or slash following a `<meta`,
+    # looks for a charset and returns it along with the position of the next
+    # character following the closing `>` character
+    def charset_from_meta(string)
+      position = 0
+      attribute_list = {}
+      got_pragma = false
+      need_pragma = nil
+      charset = nil
+      length = string.length
+      while position < length
+        attribute, position_increment = attribute(string[position, length])
+        position += position_increment.to_i
+        if attribute == nil
+          break
+        elsif attribute_list[attribute[:attribute_name]]
+          # Do nothing
+        else
+          # found a new attribute. Add it to the list
+          attribute_list[attribute[:attribute_name]] = attribute[:attribute_value]
+          if attribute[:attribute_name] == 'http-equiv'
+            got_pragma = true
+          elsif attribute[:attribute_name] == 'content'
+            content_charset = charset_from_meta_content(attribute[:attribute_value])
+            if content_charset && charset == nil
+              charset = content_charset
+              need_pragma = true
+            end
+          elsif attribute[:attribute_name] == 'charset'
+            charset = attribute[:attribute_value]
+            need_pragma = false
+          end
+        end
+      end
+      if need_pragma == nil || (need_pragma == true && got_pragma == false)
+        [nil, position]
+      else
+        [charset, position]
+      end
+    end
+    # Given a string representing the 'content' attribute value of a meta tag
+    # with an `http-equiv` attribute, returns the charset specified within that
+    # value, or nil.
+    def charset_from_meta_content(string)
+      charset_match = string.match(/charset\s*\=\s*(.+)/i)
+      if charset_match
+        charset_value = charset_match[1]
+        charset_value[/\A\"(.*)\"/, 1] ||
+        charset_value[/\A\'(.*)\'/, 1] ||
+        charset_value[/(.*)[\s;]/, 1] ||
+        charset_value[/(.*)/, 1]
+      else
+        nil
+      end
+    end
+    # Given a string, returns the first attribute in the string (as a hash), and
+    # the position of the next character in the string
+    def attribute(string)
+      attribute_name = ""
+      attribute_value = ""
+      length = string.length
+      position = 0
+      return [nil, nil] if length == 0
+      while position < (length)
+        # If character matches 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), 0x20 (ASCII space), or 0x2F (ASCII /) then advance position
+        if string[position] =~ /[\u{09}\u{0A}\u{0C}\u{0D}\u{20}\u{2f}]/
+          position += 1
+        elsif string[position] == '>'
+          attribute_name = nil
+          break
+        else
+          while position < length
+            if string[position] == '=' && attribute_name != ''
+              attribute_value, position_increment = attribute_value(string[position + 1, length])
+              position += position_increment + 1
+              break
+            elsif string[position] =~ /[\>\/]/
+              break
+            elsif string[position] =~ /[A-Z]/
+              attribute_name += string[position].downcase
+              position += 1
+            else
+              attribute_name += string[position]
+              position += 1
+            end
+          end
+          break
+        end
+      end
+      if attribute_name
+        [{attribute_name: attribute_name, attribute_value: attribute_value}, position]
+      else
+        [nil, position]
+      end
+    end
+    # Given a string, this returns the attribute value from the start of the string,
+    # and the position of the following character in the string
+    def attribute_value(string)
+      attribute_value = ''
+      position = 0
+      length = string.length
+      while position < length
+        # If the character is 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR), or 0x20 (ASCII space), advance position to the next character and repeat this step.
+        if string[position] =~ /[\u{09}\u{0A}\u{0C}\u{0D}\u{20}]/
+          position += 1
+        elsif string[position] =~ /['"]/
+          attribute_value, position = quoted_value(string[position, length])
+          break
+        elsif string[position] == '>'
+          position += 1
+          break
+        else
+          attribute_value, position = unquoted_value(string[position, length])
+          break
+        end
+      end
+      [attribute_value, position]
+    end
+    # Given a string, at the start of which is a quoted attribute value, returns
+    # that attribute value, and the position of the next character in the string
+    # (following the second matching quote mark)
+    def quoted_value(string)
+      attribute_value = ""
+      quote_type = string[0]
+      position = 1
+      length = string.length
+      while position < length
+        if string[position] == quote_type
+          position += 1
+          break
+        else
+          attribute_value += downcase_A_to_Z_only(string[position])
+          position += 1
+        end
+      end
+      [attribute_value, position]
+    end
+    # Given a string, at the start of which is an unquoted attribute value, returns
+    # that attribute value, and the position of the next character in the string
+    def unquoted_value(string)
+      downcased_value = downcase_A_to_Z_only(string[/\A[^\t\u{0A}\u{0C}\u{0D}\u{20}\>]*/])
+      [downcased_value, downcased_value.length]
+    end
+    # Downcases the A-Z characters only (eg not É -> é)
+    def downcase_A_to_Z_only(string)
+      string.gsub(/([A-Z])/) { |match| match.downcase }
+    end
+  end
 end

data/lib/guess_html_encoding/version.rb CHANGED

@@ -1,3 +1,3 @@
 module GuessHtmlEncoding
-  VERSION = "0.0.10"
+  VERSION = "0.0.11"
 end

data/spec/guess_html_encoding_spec.rb CHANGED

@@ -3,6 +3,96 @@ require 'spec_helper'
 describe "GuessHtmlEncoding" do
   describe "#guess" do
+    it 'should use an uppercased unquoted meta tag' do
+      expect(GuessHtmlEncoding.guess('<META CHARSET=UTF-8>')).to eql('UTF-8')
+    end
+    it 'should use a quoted meta tag' do
+      expect(GuessHtmlEncoding.guess('<meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should use a http-equiv meta tag' do
+      expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8">')).to eql('UTF-8')
+    end
+    it 'should use a http-equiv meta tag with semi-colons in the content value' do
+      expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="text/html; charset=UTF-8;">')).to eql('UTF-8')
+    end
+    it 'should use a http-equiv meta tag with attributes in unusual order' do
+      expect(GuessHtmlEncoding.guess('<meta content="text/html; charset=UTF-8;" http-equiv="content-type">')).to eql('UTF-8')
+    end
+    it 'should use a http-equiv meta tag with attributes in unusual order' do
+      expect(GuessHtmlEncoding.guess('<meta><meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should use the first meta tag with a charset value' do
+      expect(GuessHtmlEncoding.guess('<meta charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-9')
+    end
+    it 'should use a meta http-equiv tag with spaces in the content value' do
+      expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=' text/html ; charset = UTF-8;'>")).to eql('UTF-8')
+    end
+    it 'should use a meta http-equiv tag with newlines in the content value' do
+      expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='\t\ncharset=UTF-8\n'>")).to eql('UTF-8')
+    end
+    it 'should use a meta http-equiv tag with double quotes in the content value' do
+      expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content='text/html; charset=\"UTF-8\">")).to eql('UTF-8')
+    end
+    it 'should use a meta http-equiv tag with single quotes in the content value' do
+      expect(GuessHtmlEncoding.guess("<meta http-equiv='content-type' content=\"text/html; charset='UTF-8'\">")).to eql('UTF-8')
+    end
+    it 'should use the first charset attribute' do
+      expect(GuessHtmlEncoding.guess('<meta charset="UTF-9" charset="UTF-8">>')).to eql('UTF-9')
+    end
+    it 'should use the charset value over the content value' do
+      expect(GuessHtmlEncoding.guess('<meta http-equiv="content-type" content="charset=UTF-8" charset="UTF-9">')).to eql('UTF-9')
+    end
+    it 'should use the charset value if it appears before http-equiv' do
+      expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" charset="UTF-9" http-equiv="content-type" >')).to eql('UTF-9')
+    end
+    it 'should ignore meta tags with content attribute but no http-equiv' do
+      expect(GuessHtmlEncoding.guess('<meta content="charset=UTF-8" ><meta charset="UTF-9">')).to eql('UTF-9')
+    end
+    it 'should ignore a commented-out meta tag' do
+      expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!--<meta charset="UTF-9">--><meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should ignore a minimal comment' do
+      expect(GuessHtmlEncoding.guess('<!DOCTYPE html><html><!--><meta charset="UTF-9"></html>')).to eql('UTF-9')
+    end
+    it 'should ignore an oddly commented out meta tag using <! >' do
+      expect(GuessHtmlEncoding.guess('<!DOCTYPE html><!<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should ignore an oddly commented out meta tag using </ >' do
+      expect(GuessHtmlEncoding.guess('<!DOCTYPE html></<meta charset="UTF-9">><meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should ignore an oddly commented out meta tag using <?  ?>' do
+      expect(GuessHtmlEncoding.guess('<!DOCTYPE html><?<meta charset="UTF-9">?><meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should ignore a <metadata> tag' do
+      expect(GuessHtmlEncoding.guess('<metadata test="yes" charset="UTF-9"><meta charset="UTF-8">')).to eql('UTF-8')
+    end
+    it 'should only search the first 2500 characters' do
+      html = 2500.times.collect { ' ' }.join + '<meta charset="UTF-8">'
+      expect(GuessHtmlEncoding.guess(html)).to eql(nil)
+    end
     it "can use headers" do
       guess = GuessHtmlEncoding.guess("<html><body><div>hi!</div></body></html>",
                                       "Hello: world\nContent-Type: text/html; charset=LATIN1\nFoo: bar")

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: guess_html_encoding
 version: !ruby/object:Gem::Version
-  version: 0.0.10
+  version: 0.0.11
 platform: ruby
 authors:
 - Andrew Cantino (Iteration Labs, LLC)
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-12-14 00:00:00.000000000 Z
+date: 2015-02-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rspec