RubyGems - spk-html5 - Versions diffs - 0.10.1 - Mend

spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

data/History.txt +10 -0
data/Manifest.txt +73 -0
data/README +45 -0
data/Rakefile.rb +33 -0
data/bin/html5 +7 -0
data/lib/html5.rb +13 -0
data/lib/html5/cli.rb +248 -0
data/lib/html5/constants.rb +1061 -0
data/lib/html5/filters/base.rb +10 -0
data/lib/html5/filters/inject_meta_charset.rb +82 -0
data/lib/html5/filters/iso639codes.rb +755 -0
data/lib/html5/filters/optionaltags.rb +198 -0
data/lib/html5/filters/rfc2046.rb +31 -0
data/lib/html5/filters/rfc3987.rb +91 -0
data/lib/html5/filters/sanitizer.rb +15 -0
data/lib/html5/filters/validator.rb +834 -0
data/lib/html5/filters/whitespace.rb +36 -0
data/lib/html5/html5parser.rb +247 -0
data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
data/lib/html5/html5parser/after_body_phase.rb +46 -0
data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
data/lib/html5/html5parser/after_head_phase.rb +55 -0
data/lib/html5/html5parser/before_head_phase.rb +44 -0
data/lib/html5/html5parser/before_html_phase.rb +41 -0
data/lib/html5/html5parser/in_body_phase.rb +636 -0
data/lib/html5/html5parser/in_caption_phase.rb +69 -0
data/lib/html5/html5parser/in_cell_phase.rb +78 -0
data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
data/lib/html5/html5parser/in_head_phase.rb +143 -0
data/lib/html5/html5parser/in_row_phase.rb +96 -0
data/lib/html5/html5parser/in_select_phase.rb +90 -0
data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
data/lib/html5/html5parser/in_table_phase.rb +177 -0
data/lib/html5/html5parser/initial_phase.rb +133 -0
data/lib/html5/html5parser/phase.rb +171 -0
data/lib/html5/inputstream.rb +735 -0
data/lib/html5/liberalxmlparser.rb +158 -0
data/lib/html5/sanitizer.rb +209 -0
data/lib/html5/serializer.rb +2 -0
data/lib/html5/serializer/htmlserializer.rb +179 -0
data/lib/html5/serializer/xhtmlserializer.rb +20 -0
data/lib/html5/sniffer.rb +45 -0
data/lib/html5/tokenizer.rb +1059 -0
data/lib/html5/treebuilders.rb +24 -0
data/lib/html5/treebuilders/base.rb +339 -0
data/lib/html5/treebuilders/hpricot.rb +231 -0
data/lib/html5/treebuilders/rexml.rb +215 -0
data/lib/html5/treebuilders/simpletree.rb +191 -0
data/lib/html5/treewalkers.rb +26 -0
data/lib/html5/treewalkers/base.rb +162 -0
data/lib/html5/treewalkers/hpricot.rb +48 -0
data/lib/html5/treewalkers/rexml.rb +48 -0
data/lib/html5/treewalkers/simpletree.rb +48 -0
data/lib/html5/version.rb +3 -0
data/test/preamble.rb +69 -0
data/test/test_cli.rb +16 -0
data/test/test_encoding.rb +35 -0
data/test/test_input_stream.rb +26 -0
data/test/test_lxp.rb +283 -0
data/test/test_parser.rb +63 -0
data/test/test_sanitizer.rb +173 -0
data/test/test_serializer.rb +67 -0
data/test/test_sniffer.rb +27 -0
data/test/test_stream.rb +71 -0
data/test/test_tokenizer.rb +95 -0
data/test/test_treewalkers.rb +135 -0
data/test/test_validator.rb +31 -0
data/test/tokenizer_test_parser.rb +67 -0
data/test19.rb +38 -0
metadata +198 -0

data/lib/html5/html5parser/initial_phase.rb ADDED

@@ -0,0 +1,133 @@
+require 'html5/html5parser/phase'
+module HTML5
+  class InitialPhase < Phase
+    # This phase deals with error handling as well which is currently not
+    # covered in the specification. The error handling is typically known as
+    # "quirks mode". It is expected that a future version of HTML5 will define this.
+    def process_eof
+      parse_error("expected-doctype-but-got-eof")
+      @parser.phase = @parser.phases[:beforeHtml]
+      @parser.phase.process_eof
+    end
+    def processComment(data)
+      @tree.insert_comment(data, @tree.document)
+    end
+    def processDoctype(name, publicId, systemId, correct)
+      if name.downcase != 'html' or publicId or systemId
+        parse_error("unknown-doctype")
+      end
+      # XXX need to update DOCTYPE tokens
+      @tree.insertDoctype(name, publicId, systemId)
+      publicId = publicId.to_s.upcase
+      if name.downcase != 'html'
+        # XXX quirks mode
+      else
+        if ["+//silmaril//dtd html pro v0r11 19970101//en",
+          "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
+          "-//as//dtd html 3.0 aswedit + extensions//en",
+          "-//ietf//dtd html 2.0 level 1//en",
+          "-//ietf//dtd html 2.0 level 2//en",
+          "-//ietf//dtd html 2.0 strict level 1//en",
+          "-//ietf//dtd html 2.0 strict level 2//en",
+          "-//ietf//dtd html 2.0 strict//en",
+          "-//ietf//dtd html 2.0//en",
+          "-//ietf//dtd html 2.1e//en",
+          "-//ietf//dtd html 3.0//en",
+          "-//ietf//dtd html 3.0//en//",
+          "-//ietf//dtd html 3.2 final//en",
+          "-//ietf//dtd html 3.2//en",
+          "-//ietf//dtd html 3//en",
+          "-//ietf//dtd html level 0//en",
+          "-//ietf//dtd html level 0//en//2.0",
+          "-//ietf//dtd html level 1//en",
+          "-//ietf//dtd html level 1//en//2.0",
+          "-//ietf//dtd html level 2//en",
+          "-//ietf//dtd html level 2//en//2.0",
+          "-//ietf//dtd html level 3//en",
+          "-//ietf//dtd html level 3//en//3.0",
+          "-//ietf//dtd html strict level 0//en",
+          "-//ietf//dtd html strict level 0//en//2.0",
+          "-//ietf//dtd html strict level 1//en",
+          "-//ietf//dtd html strict level 1//en//2.0",
+          "-//ietf//dtd html strict level 2//en",
+          "-//ietf//dtd html strict level 2//en//2.0",
+          "-//ietf//dtd html strict level 3//en",
+          "-//ietf//dtd html strict level 3//en//3.0",
+          "-//ietf//dtd html strict//en",
+          "-//ietf//dtd html strict//en//2.0",
+          "-//ietf//dtd html strict//en//3.0",
+          "-//ietf//dtd html//en",
+          "-//ietf//dtd html//en//2.0",
+          "-//ietf//dtd html//en//3.0",
+          "-//metrius//dtd metrius presentational//en",
+          "-//microsoft//dtd internet explorer 2.0 html strict//en",
+          "-//microsoft//dtd internet explorer 2.0 html//en",
+          "-//microsoft//dtd internet explorer 2.0 tables//en",
+          "-//microsoft//dtd internet explorer 3.0 html strict//en",
+          "-//microsoft//dtd internet explorer 3.0 html//en",
+          "-//microsoft//dtd internet explorer 3.0 tables//en",
+          "-//netscape comm. corp.//dtd html//en",
+          "-//netscape comm. corp.//dtd strict html//en",
+          "-//o'reilly and associates//dtd html 2.0//en",
+          "-//o'reilly and associates//dtd html extended 1.0//en",
+          "-//spyglass//dtd html 2.0 extended//en",
+          "-//sq//dtd html 2.0 hotmetal + extensions//en",
+          "-//sun microsystems corp.//dtd hotjava html//en",
+          "-//sun microsystems corp.//dtd hotjava strict html//en",
+          "-//w3c//dtd html 3 1995-03-24//en",
+          "-//w3c//dtd html 3.2 draft//en",
+          "-//w3c//dtd html 3.2 final//en",
+          "-//w3c//dtd html 3.2//en",
+          "-//w3c//dtd html 3.2s draft//en",
+          "-//w3c//dtd html 4.0 frameset//en",
+          "-//w3c//dtd html 4.0 transitional//en",
+          "-//w3c//dtd html experimental 19960712//en",
+          "-//w3c//dtd html experimental 970421//en",
+          "-//w3c//dtd w3 html//en",
+          "-//w3o//dtd w3 html 3.0//en",
+          "-//w3o//dtd w3 html 3.0//en//",
+          "-//w3o//dtd w3 html strict 3.0//en//",
+          "-//webtechs//dtd mozilla html 2.0//en",
+          "-//webtechs//dtd mozilla html//en",
+          "-/w3c/dtd html 4.0 transitional/en",
+          "html"].include?(publicId) or
+          (systemId == nil and
+           ["-//w3c//dtd html 4.01 frameset//EN",
+             "-//w3c//dtd html 4.01 transitional//EN"].include?(publicId)) or
+           (systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")
+           #XXX quirks mode
+        end
+      end
+      @parser.phase = @parser.phases[:beforeHtml]
+    end
+    def processSpaceCharacters(data)
+    end
+    def processCharacters(data)
+      parse_error("expected-doctype-but-got-chars")
+      @parser.phase = @parser.phases[:beforeHtml]
+      @parser.phase.processCharacters(data)
+    end
+    def processStartTag(name, attributes, self_closing=false)
+      parse_error("expected-doctype-but-got-start-tag", {"name" => name})
+      @parser.phase = @parser.phases[:beforeHtml]
+      @parser.phase.processStartTag(name, attributes)
+    end
+    def processEndTag(name)
+      parse_error("expected-doctype-but-got-end-tag", {"name" => name})
+      @parser.phase = @parser.phases[:beforeHtml]
+      @parser.phase.processEndTag(name)
+    end
+  end
+end

data/lib/html5/html5parser/phase.rb ADDED

@@ -0,0 +1,171 @@
+module HTML5
+  # Base class for helper objects that implement each phase of processing.
+  #
+  # Handler methods should be in the following order (they can be omitted):
+  #
+  #   * EOF
+  #   * Comment
+  #   * Doctype
+  #   * SpaceCharacters
+  #   * Characters
+  #   * StartTag
+  #     - startTag* methods
+  #   * EndTag
+  #     - endTag* methods
+  #
+  class Phase
+    extend Forwardable
+    def_delegators :@parser, :parse_error
+    # The following example call:
+    #
+    #   tag_handlers('startTag', 'html', %w( base link meta ), %w( li dt dd ) => 'ListItem')
+    #
+    # ...would return a hash equal to this:
+    #
+    #   { 'html' => 'startTagHtml',
+    #     'base' => 'startTagBaseLinkMeta',
+    #     'link' => 'startTagBaseLinkMeta',
+    #     'meta' => 'startTagBaseLinkMeta',
+    #     'li'   => 'startTagListItem',
+    #     'dt'   => 'startTagListItem',
+    #     'dd'   => 'startTagListItem'  }
+    #
+    def self.tag_handlers(prefix, *tags)
+      mapping = {}
+      if tags.last.is_a?(Hash)
+        tags.pop.each do |names, handler_method_suffix|
+          handler_method = prefix + handler_method_suffix
+          Array(names).each {|name| mapping[name] = handler_method }
+        end
+      end
+      tags.each do |names|
+        names = Array(names)
+        handler_method = prefix + names.map {|name| name.capitalize }.join
+        names.each {|name| mapping[name] = handler_method }
+      end
+      mapping
+    end
+    def self.start_tag_handlers
+      @start_tag_handlers ||= Hash.new('startTagOther')
+    end
+    # Declare what start tags this Phase handles. Can be called more than once.
+    #
+    # Example usage:
+    #
+    #   handle_start 'html'
+    #   # html start tags will be handled by a method named 'startTagHtml'
+    #
+    #   handle_start %( base link meta )
+    #   # base, link and meta start tags will be handled by a method named 'startTagBaseLinkMeta'
+    #
+    #   handle_start %( li dt dd ) => 'ListItem'
+    #   # li, dt, and dd start tags will be handled by a method named 'startTagListItem'
+    #
+    def self.handle_start(*tags)
+      start_tag_handlers.update tag_handlers('startTag', *tags)
+    end
+    def self.end_tag_handlers
+      @end_tag_handlers ||= Hash.new('endTagOther')
+    end
+    # Declare what end tags this Phase handles. Behaves like handle_start.
+    #
+    def self.handle_end(*tags)
+      end_tag_handlers.update tag_handlers('endTag', *tags)
+    end
+    def initialize(parser, tree)
+      @parser, @tree = parser, tree
+    end
+    def process_eof
+      @tree.generateImpliedEndTags
+      if @tree.open_elements.length > 2
+        parse_error("expected-closing-tag-but-got-eof")
+      elsif @tree.open_elements.length == 2 and @tree.open_elements[1].name != 'body'
+        # This happens for framesets or something?
+        parse_error("expected-closing-tag-but-got-eof")
+      elsif @parser.inner_html and @tree.open_elements.length > 1
+        # XXX This is not what the specification says. Not sure what to do here.
+        parse_error("eof-in-innerhtml")
+      end
+      # Betting ends.
+    end
+    def processComment(data)
+      # For most phases the following is correct. Where it's not it will be
+      # overridden.
+      @tree.insert_comment(data, @tree.open_elements.last)
+    end
+    def processDoctype(name, publicId, systemId, correct)
+      parse_error("unexpected-doctype")
+    end
+    def processSpaceCharacters(data)
+      @tree.insertText(data)
+    end
+    def processStartTag(name, attributes, self_closing=false)
+      if method(self.class.start_tag_handlers[name]).arity == 2
+        send self.class.start_tag_handlers[name], name, attributes
+      else
+        send self.class.start_tag_handlers[name], name, attributes, self_closing
+      end
+    end
+    def startTagHtml(name, attributes)
+      if @parser.first_start_tag == false and name == 'html'
+         parse_error("non-html-root")
+      end
+      # XXX Need a check here to see if the first start tag token emitted is
+      # this token... If it's not, invoke parse_error.
+      attributes.each do |attr, value|
+        unless @tree.open_elements.first.attributes.has_key?(attr)
+          @tree.open_elements.first.attributes[attr] = value
+        end
+      end
+      @parser.first_start_tag = false
+    end
+    def processEndTag(name)
+      send self.class.end_tag_handlers[name], name
+    end
+    def assert(value)
+      throw AssertionError.new unless value
+    end
+    def in_scope?(*args)
+      @tree.elementInScope(*args)
+    end
+    def remove_open_elements_until(name=nil)
+      finished = false
+      until finished || @tree.open_elements.length == 0
+        element = @tree.open_elements.pop
+        finished = name.nil? ? yield(element) : element.name == name
+      end
+      return element
+    end
+    def adjust_mathml_attributes(attributes)
+      attributes.collect do |a|
+        if a.first =='definitionurl'
+          ['definitionURL', a[1]]
+        else
+          a
+        end
+      end
+    end
+    def adjust_foreign_attributes(attributes)
+      attributes
+    end
+  end
+end

data/lib/html5/inputstream.rb ADDED

@@ -0,0 +1,735 @@
+require 'stringio'
+require 'html5/constants'
+module HTML5
+  # Provides a unicode stream of characters to the HTMLTokenizer.
+  # This class takes care of character encoding and removing or replacing
+  # incorrect byte-sequences and also provides column and line tracking.
+  class HTMLInputStream
+    attr_accessor :queue, :char_encoding, :errors
+    # see /usr/lib/ruby/1.9.1/rexml/text.rb
+    VALID_CHAR = [
+      0x9, 0xA, 0xD,
+      (0x20..0xD7FF),
+      (0xE000..0xFFFD),
+      (0x10000..0x10FFFF)
+    ]
+    if String.method_defined? :encode
+      VALID_XML_CHARS = Regexp.new('^['+
+        VALID_CHAR.map { |item|
+          case item
+          when Fixnum
+            [item].pack('U').force_encoding('utf-8')
+          when Range
+            [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
+          end
+        }.join +
+      ']*$')
+    else
+      VALID_XML_CHARS = /^(
+         | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+         |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+         | [\xE1-\xEC\xEE][\x80-\xBF]{2}      # straight 3-byte
+         |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+         |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+         | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+         |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+       )*$/nx;
+    end
+    # Initialises the HTMLInputStream.
+    #
+    # HTMLInputStream(source, [encoding]) -> Normalized stream from source
+    # for use by the HTML5Lib.
+    #
+    # source can be either a file-object, local filename or a string.
+    #
+    # The optional encoding parameter must be a string that indicates
+    # the encoding.  If specified, that encoding will be used,
+    # regardless of any BOM or later declaration (such as in a meta
+    # element)
+    #
+    # parseMeta - Look for a <meta> element containing encoding information
+    def initialize(source, options = {})
+      @encoding   = nil
+      @parse_meta = true
+      @chardet    = true
+      options.each {|name, value| instance_variable_set("@#{name}", value) }
+      # partial Ruby 1.9 support
+      if @encoding and source.respond_to? :force_encoding
+        source.force_encoding(@encoding) rescue nil
+      end
+      # Raw Stream
+      @raw_stream = open_stream(source)
+      # Encoding Information
+      #Number of bytes to use when looking for a meta element with
+      #encoding information
+      @NUM_BYTES_META = 512
+      #Number of bytes to use when using detecting encoding using chardet
+      @NUM_BYTES_CHARDET = 256
+      #Number of bytes to use when reading content
+      @NUM_BYTES_BUFFER = 1024
+      #Encoding to use if no other information can be found
+      @DEFAULT_ENCODING = 'windows-1252'
+      #Detect encoding iff no explicit "transport level" encoding is supplied
+      if @encoding.nil?
+        @char_encoding = detect_encoding
+      else
+        @char_encoding = @encoding
+      end
+      # Read bytes from stream decoding them into Unicode
+      @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
+      if @char_encoding == 'windows-1252'
+        @win1252 = true
+      elsif @char_encoding != 'utf-8'
+        require 'iconv'
+        begin
+          @buffer << @raw_stream.read unless @raw_stream.eof?
+          @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
+        rescue
+          @win1252 = true
+        end
+      end
+      @queue = []
+      @errors = []
+      # Reset position in the list to read from
+      @tell = 0
+      @line = @col = 0
+      @line_lengths = []
+    end
+    # Produces a file object from source.
+    #
+    # source can be either a file object, local filename or a string.
+    def open_stream(source)
+      # Already an IO like object
+      if source.respond_to?(:read)
+        source
+      else
+        # Treat source as a string and wrap in StringIO
+        StringIO.new(source)
+      end
+    end
+    def detect_encoding
+      #First look for a BOM
+      #This will also read past the BOM if present
+      encoding = detect_bom
+      #If there is no BOM need to look for meta elements with encoding
+      #information
+      if encoding.nil? and @parse_meta
+        encoding = detect_encoding_meta
+      end
+      #Guess with chardet, if avaliable
+      if encoding.nil? and @chardet
+        begin
+          require 'rubygems'
+          require 'UniversalDetector' # gem install chardet
+          buffers = []
+          detector = UniversalDetector::Detector.instance
+          detector.reset
+          until @raw_stream.eof?
+            buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
+            break if !buffer or buffer.empty?
+            buffers << buffer
+            detector.feed(buffer)
+            break if detector.instance_eval {@done}
+            detector.instance_eval {
+              @_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
+            }
+          end
+          detector.close
+          encoding = detector.result['encoding']
+          seek(buffers*'', 0)
+        rescue LoadError
+        end
+      end
+      # If all else fails use the default encoding
+      if encoding.nil?
+        encoding = @DEFAULT_ENCODING
+      end
+      #Substitute for equivalent encoding
+      if 'iso-8859-1' == encoding.downcase
+        encoding = 'windows-1252'
+      end
+      encoding
+    end
+    # Attempts to detect at BOM at the start of the stream. If
+    # an encoding can be determined from the BOM return the name of the
+    # encoding otherwise return nil
+    def detect_bom
+      bom_dict = {
+        "\xef\xbb\xbf"     => 'utf-8',
+        "\xff\xfe"         => 'utf-16le',
+        "\xfe\xff"         => 'utf-16be',
+        "\xff\xfe\x00\x00" => 'utf-32le',
+        "\x00\x00\xfe\xff" => 'utf-32be'
+      }
+      # Go to beginning of file and read in 4 bytes
+      string = @raw_stream.read(4)
+      return nil unless string
+      # Try detecting the BOM using bytes from the string
+      encoding = bom_dict[string[0...3]]      # UTF-8
+      seek = 3
+      unless encoding
+        # Need to detect UTF-32 before UTF-16
+        encoding = bom_dict[string]       # UTF-32
+        seek = 4
+        unless encoding
+          encoding = bom_dict[string[0...2]]  # UTF-16
+          seek = 2
+        end
+      end
+      # Set the read position past the BOM if one was found, otherwise
+      # set it to the start of the stream
+      seek(string, encoding ? seek : 0)
+      return encoding
+    end
+    def seek(buffer, n)
+      if @raw_stream.respond_to?(:unget)
+        @raw_stream.unget(buffer[n..-1])
+        return
+      end
+      if @raw_stream.respond_to?(:seek)
+        begin
+          @raw_stream.seek(n)
+          return
+        rescue Errno::ESPIPE
+        end
+      end
+      #TODO: huh?
+      require 'delegate'
+      @raw_stream = SimpleDelegator.new(@raw_stream)
+      class << @raw_stream
+        def read(chars=-1)
+          if chars == -1 or chars > @data.length
+            result = @data
+            @data = ''
+            return result if __getobj__.eof?
+            return result + __getobj__.read if chars == -1
+            return result + __getobj__.read(chars-result.length)
+          elsif @data.empty?
+            return __getobj__.read(chars)
+          else
+            result = @data[1...chars]
+            @data = @data[chars..-1]
+            return result
+          end
+        end
+        def unget(data)
+          if !@data or @data.empty?
+            @data = data
+          else
+            @data += data
+          end
+        end
+      end
+      @raw_stream.unget(buffer[n .. -1])
+    end
+    # Report the encoding declared by the meta element
+    def detect_encoding_meta
+      buffer = @raw_stream.read(@NUM_BYTES_META)
+      parser = EncodingParser.new(buffer)
+      seek(buffer, 0)
+      return parser.get_encoding
+    end
+    # Returns (line, col) of the current position in the stream.
+    def position
+      line, col = @line, @col
+      if @queue and @queue.last != :EOF
+        @queue.reverse.each do |c|
+          if c == "\n"
+            line -= 1
+            raise RuntimeError.new("col=#{col}") unless col == 0
+            col = @line_lengths[line]
+          else
+            col -= 1
+          end
+        end
+      end
+      return [line + 1, col]
+    end
+    # Read one character from the stream or queue if available. Return
+    # EOF when EOF is reached.
+    #
+    # Queued (pushed-back) characters are returned first. Otherwise the next
+    # item of @buffer is taken, normalized, and the line/column counters are
+    # updated. The String branch serves Rubies where indexing a string
+    # yields a one-character string; the integer branches presumably serve
+    # Ruby 1.8, where it yields a byte value.
+    def char
+      unless @queue.empty?
+        return @queue.shift
+      else
+        # Refill when fewer than three items remain and the stream has more.
+        if @tell + 3 > @buffer.length && !@raw_stream.eof?
+          # read next block
+          @buffer = @buffer[@tell..-1] + @raw_stream.read(@NUM_BYTES_BUFFER)
+          @tell = 0
+        end
+        c = @buffer[@tell]
+        @tell += 1
+        case c
+        when String
+          # partial Ruby 1.9 support
+          case c
+          when "\0"
+            @errors.push("null-character")
+            c = "\uFFFD" # null characters are invalid
+          when "\r"
+            # CR and CRLF both become a single LF.
+            @tell += 1 if @buffer[@tell] == "\n"
+            c = "\n"
+          when "\x80" .. "\x9F"
+            # Mapped through the ENTITIES_WINDOWS1252 table.
+            c = ENTITIES_WINDOWS1252[c.ord-0x80].chr('utf-8')
+          when "\xA0" .. "\xFF"
+            if c.encoding == Encoding::ASCII_8BIT
+              c = c.encode('utf-8','iso-8859-1')
+            end
+          end
+          # NOTE(review): "\r" has already been replaced by "\n" above, so
+          # this branch looks unreachable - confirm before removing.
+          if c == "\x0D"
+            # normalize newlines
+            @tell += 1 if @buffer[@tell] == 0x0A
+            c = 0x0A
+          end
+          # update position in stream
+          if c == "\x0a"
+            @line_lengths << @col
+            @line += 1
+            @col = 0
+          else
+            @col += 1
+          end
+          c
+        when 0x01..0x7F
+          if c == 0x0D
+            # normalize newlines
+            @tell += 1 if @buffer[@tell] == 0x0A
+            c = 0x0A
+          end
+          # update position in stream
+          if c == 0x0a
+            @line_lengths << @col
+            @line += 1
+            @col = 0
+          else
+            @col += 1
+          end
+          c.chr
+        when 0x80..0xBF
+          # These byte values are only accepted in windows-1252 mode.
+          if !@win1252
+            [0xFFFD].pack('U') # invalid utf-8
+          elsif c <= 0x9f
+            [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
+          else
+            "\xC2" + c.chr # convert to utf-8
+          end
+        when 0xC0..0xFF
+          if instance_variable_defined?("@win1252") && @win1252
+            "\xC3" + (c - 64).chr # convert to utf-8
+          elsif @buffer[@tell - 1..@tell + 3] =~ VALID_XML_CHARS
+            # Skip the remaining bytes of the matched sequence.
+            @tell += $1.length - 1
+            $1
+          else
+            [0xFFFD].pack('U') # invalid utf-8
+          end
+        when 0x00
+          @errors.push("null-character")
+          [0xFFFD].pack('U') # null characters are invalid
+        else
+          # Indexing past the end of the buffer gave nil: nothing left.
+          :EOF
+        end
+      end
+    end
+    # Returns a string of characters from the stream up to but not
+    # including any character in characters or EOF. characters can be
+    # any container that supports the in method being called on it.
+    def chars_until(characters, opposite=false)
+      char_stack = [char]
+      while char_stack.last != :EOF
+        break unless (characters.include?(char_stack.last)) == opposite
+        char_stack.push(char)
+      end
+      # Put the character stopped on back to the front of the queue
+      # from where it came.
+      c = char_stack.pop
+      @queue.insert(0, c) unless c == :EOF
+      return char_stack.join('')
+    end
+    def unget(characters)
+      return if characters == :EOF
+      if characters.respond_to? :to_a
+        @queue.unshift(*characters.to_a)
+      else
+        characters.reverse.each_char {|c| @queue.unshift(c)}
+      end
+    end
+  end
+  # String-like object with an assosiated position and various extra methods
+  # If the position is ever greater than the string length then an exception is raised
+  class EncodingBytes < String
+    attr_accessor :position
+    def initialize(value)
+      super(value)
+      @position = -1
+    end
+    def each
+      while @position < length
+        @position += 1
+        yield self[@position]
+      end
+    rescue EOF
+    end
+    def current_byte
+      raise EOF if @position >= length
+      return self[@position].chr
+    end
+    # Skip past a list of characters
+    def skip(chars=SPACE_CHARACTERS)
+      while chars.include?(current_byte)
+        @position += 1
+      end
+    end
+    # Look for a sequence of bytes at the start of a string. If the bytes
+    # are found return true and advance the position to the byte after the
+    # match. Otherwise return false and leave the position alone
+    def match_bytes(bytes, lower=false)
+      data = self[position ... position+bytes.length]
+      data.downcase! if lower
+      rv = (data == bytes)
+      @position += bytes.length if rv == true
+      return rv
+    end
+    # Look for the next sequence of bytes matching a given sequence. If
+    # a match is found advance the position to the last byte of the match
+    def jump_to(bytes)
+      new_position = self[position .. -1].index(bytes)
+      if new_position
+        @position += (new_position + bytes.length-1)
+        return true
+      else
+        raise EOF
+      end
+    end
+    # Move the pointer so it points to the next byte in a set of possible
+    # bytes
+    def find_next(byte_list)
+      until byte_list.include?(current_byte)
+        @position += 1
+      end
+    end
+  end
+  # Mini parser for detecting character encoding from meta elements
+  class EncodingParser
+    ASCII_PUNCTUATION = %r{[\x09-\x0D\x20-\x2F\x3A-\x40\x5B-\x60\x7B-\x7E]}
+    # a (hopefully) temporary hack to deal with the fact that ruby doesn't have a built in encodings
+    #   library
+    ENCODINGS = ['euc_jp', 'utf-8', "iso8859-2", "iso-8859-1", "utf-16", "UTF-16LE", "UTF-16BE"].inject({}){|m, v| m[v.downcase.gsub(ASCII_PUNCTUATION, '')] = v; m}
+    # string - the data to work on for encoding detection
+    def initialize(data)
+      @data = EncodingBytes.new(data.to_s)
+      @encoding = nil
+    end
+    @@method_dispatch = [
+      ['<!--', :handle_comment],
+      ['<meta', :handle_meta],
+      ['</', :handle_possible_end_tag],
+      ['<!', :handle_other],
+      ['<?', :handle_other],
+      ['<', :handle_possible_start_tag]
+    ]
+    def get_encoding
+      @data.each do |byte|
+        keep_parsing = true
+        @@method_dispatch.each do |(key, method)|
+          if @data.match_bytes(key, lower = true)
+            keep_parsing = send(method)
+            break
+          end
+        end
+        break unless keep_parsing
+      end
+      unless @encoding.nil?
+        @encoding = @encoding.strip
+        if ["utf16", "utf16be", "utf16le", "utf32", "utf32be", "utf32le"].include?(@encoding.downcase.gsub(ASCII_PUNCTUATION, ''))
+          @encoding = 'utf-8'
+        end
+      end
+      return @encoding
+    end
+    # Skip over comments
+    def handle_comment
+      return @data.jump_to('-->')
+    end
+    def handle_meta
+      # if we have <meta not followed by a space so just keep going
+      return true unless SPACE_CHARACTERS.include?(@data.current_byte)
+      #We have a valid meta element we want to search for attributes
+      while true
+        #Try to find the next attribute after the current position
+        attr = get_attribute
+        return true if attr.nil?
+        if attr[0] == 'charset'
+          tentative_encoding = attr[1]
+          codec = codec_name(tentative_encoding)
+          if codec
+            @encoding = codec
+            return false
+          end
+        elsif attr[0] == 'content'
+          content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
+          tentative_encoding = content_parser.parse
+          codec = codec_name(tentative_encoding)
+          if codec
+            @encoding = codec
+            return false
+          end
+        end
+      end
+    end
+    def handle_possible_start_tag
+      return handle_possible_tag(false)
+    end
+    def handle_possible_end_tag
+      @data.position += 1
+      return handle_possible_tag(true)
+    end
+    def handle_possible_tag(end_tag)
+      unless ASCII_LETTERS.include?(@data.current_byte)
+        #If the next byte is not an ascii letter either ignore this
+        #fragment (possible start tag case) or treat it according to
+        #handleOther
+        if end_tag
+          @data.position -= 1
+          handle_other
+        end
+        return true
+      end
+      @data.find_next(SPACE_CHARACTERS + ['<', '>'])
+      if @data.current_byte == '<'
+        #return to the first step in the overall "two step" algorithm
+        #reprocessing the < byte
+        @data.position -= 1
+      else
+        #Read all attributes
+        {} until get_attribute.nil?
+      end
+      return true
+    end
+    def handle_other
+      return @data.jump_to('>')
+    end
+    # Return a name,value pair for the next attribute in the stream,
+    # if one is found, or nil
+    def get_attribute
+      @data.skip(SPACE_CHARACTERS + ['/'])
+      if @data.current_byte == '<'
+        @data.position -= 1
+        return nil
+      elsif @data.current_byte == '>'
+        return nil
+      end
+      attr_name = []
+      attr_value = []
+      space_found = false
+      #Step 5 attribute name
+      while true
+        if @data.current_byte == '=' and attr_name
+          break
+        elsif SPACE_CHARACTERS.include?(@data.current_byte)
+          space_found = true
+          break
+        elsif ['/', '<', '>'].include?(@data.current_byte)
+          return [attr_name.join(''), '']
+        elsif ASCII_UPPERCASE.include?(@data.current_byte)
+          attr_name.push(@data.current_byte.downcase)
+        else
+          attr_name.push(@data.current_byte)
+        end
+        #Step 6
+        @data.position += 1
+      end
+      #Step 7
+      if space_found
+        @data.skip
+        #Step 8
+        unless @data.current_byte == '='
+          @data.position -= 1
+          return [attr_name.join(''), '']
+        end
+      end
+      #XXX need to advance position in both spaces and value case
+      #Step 9
+      @data.position += 1
+      #Step 10
+      @data.skip
+      #Step 11
+      if ["'", '"'].include?(@data.current_byte)
+        #11.1
+        quote_char = @data.current_byte
+        while true
+          @data.position+=1
+          #11.3
+          if @data.current_byte == quote_char
+            @data.position += 1
+            return [attr_name.join(''), attr_value.join('')]
+          #11.4
+          elsif ASCII_UPPERCASE.include?(@data.current_byte)
+            attr_value.push(@data.current_byte.downcase)
+          #11.5
+          else
+            attr_value.push(@data.current_byte)
+          end
+        end
+      elsif ['>', '<'].include?(@data.current_byte)
+        return [attr_name.join(''), '']
+      elsif ASCII_UPPERCASE.include?(@data.current_byte)
+        attr_value.push(@data.current_byte.downcase)
+      else
+        attr_value.push(@data.current_byte)
+      end
+      while true
+        @data.position += 1
+        if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
+          return [attr_name.join(''), attr_value.join('')]
+        elsif ASCII_UPPERCASE.include?(@data.current_byte)
+          attr_value.push(@data.current_byte.downcase)
+        else
+          attr_value.push(@data.current_byte)
+        end
+      end
+    end
+    def codec_name(encoding)
+      if (!encoding.nil? && encoding.kind_of?(String))
+        canonical_name = encoding.downcase.gsub(ASCII_PUNCTUATION, '')
+        ENCODINGS[canonical_name]
+        # p encoding
+        # encoding
+      else
+        nil
+      end
+    end
+  end
+  class ContentAttrParser
+    def initialize(data)
+      @data = data
+    end
+    def parse
+      begin
+        #Skip to the first ";"
+        @data.position = 0
+        @data.jump_to(';')
+        @data.position += 1
+        @data.skip
+        #Check if the attr name is charset
+        #otherwise return
+        @data.jump_to('charset')
+        @data.position += 1
+        @data.skip
+        unless @data.current_byte == '='
+          #If there is no = sign keep looking for attrs
+          return nil
+        end
+        @data.position += 1
+        @data.skip
+        #Look for an encoding between matching quote marks
+        if ['"', "'"].include?(@data.current_byte)
+          quote_mark = @data.current_byte
+          @data.position += 1
+          old_position = @data.position
+          @data.jump_to(quote_mark)
+          return @data[old_position ... @data.position]
+        else
+          #Unquoted value
+          old_position = @data.position
+          begin
+            @data.find_next(SPACE_CHARACTERS)
+            return @data[old_position ... @data.position]
+          rescue EOF
+            #Return the whole remaining value
+            return @data[old_position .. -1]
+          end
+        end
+      rescue EOF
+        return nil
+      end
+    end
+  end
+end