RubyGems - html5 - Versions diffs - 0.1.0 - Mend

html5 0.1.0

Files changed (59) hide show

data/History.txt +3 -0
data/Manifest.txt +58 -0
data/README +9 -0
data/Rakefile.rb +17 -0
data/lib/html5/constants.rb +818 -0
data/lib/html5/filters/base.rb +10 -0
data/lib/html5/filters/inject_meta_charset.rb +82 -0
data/lib/html5/filters/optionaltags.rb +198 -0
data/lib/html5/filters/sanitizer.rb +15 -0
data/lib/html5/filters/whitespace.rb +36 -0
data/lib/html5/html5parser/after_body_phase.rb +46 -0
data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
data/lib/html5/html5parser/after_head_phase.rb +50 -0
data/lib/html5/html5parser/before_head_phase.rb +41 -0
data/lib/html5/html5parser/in_body_phase.rb +607 -0
data/lib/html5/html5parser/in_caption_phase.rb +68 -0
data/lib/html5/html5parser/in_cell_phase.rb +78 -0
data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
data/lib/html5/html5parser/in_head_phase.rb +138 -0
data/lib/html5/html5parser/in_row_phase.rb +87 -0
data/lib/html5/html5parser/in_select_phase.rb +84 -0
data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
data/lib/html5/html5parser/in_table_phase.rb +110 -0
data/lib/html5/html5parser/initial_phase.rb +134 -0
data/lib/html5/html5parser/phase.rb +158 -0
data/lib/html5/html5parser/root_element_phase.rb +42 -0
data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
data/lib/html5/html5parser.rb +248 -0
data/lib/html5/inputstream.rb +654 -0
data/lib/html5/liberalxmlparser.rb +158 -0
data/lib/html5/sanitizer.rb +188 -0
data/lib/html5/serializer/htmlserializer.rb +180 -0
data/lib/html5/serializer/xhtmlserializer.rb +20 -0
data/lib/html5/serializer.rb +2 -0
data/lib/html5/tokenizer.rb +968 -0
data/lib/html5/treebuilders/base.rb +334 -0
data/lib/html5/treebuilders/hpricot.rb +231 -0
data/lib/html5/treebuilders/rexml.rb +208 -0
data/lib/html5/treebuilders/simpletree.rb +185 -0
data/lib/html5/treebuilders.rb +24 -0
data/lib/html5/treewalkers/base.rb +154 -0
data/lib/html5/treewalkers/hpricot.rb +48 -0
data/lib/html5/treewalkers/rexml.rb +48 -0
data/lib/html5/treewalkers/simpletree.rb +48 -0
data/lib/html5/treewalkers.rb +26 -0
data/lib/html5.rb +13 -0
data/parse.rb +217 -0
data/tests/preamble.rb +82 -0
data/tests/test_encoding.rb +35 -0
data/tests/test_lxp.rb +263 -0
data/tests/test_parser.rb +68 -0
data/tests/test_sanitizer.rb +142 -0
data/tests/test_serializer.rb +68 -0
data/tests/test_stream.rb +62 -0
data/tests/test_tokenizer.rb +94 -0
data/tests/test_treewalkers.rb +116 -0
data/tests/tokenizer_test_parser.rb +63 -0
metadata +120 -0

data/lib/html5/inputstream.rb ADDED Viewed

@@ -0,0 +1,654 @@
+require 'stringio'
+require 'html5/constants'
+module HTML5
+  # Provides a unicode stream of characters to the HTMLTokenizer.
+  # This class takes care of character encoding and removing or replacing
+  # incorrect byte-sequences and also provides column and line tracking.
+  class HTMLInputStream
+    attr_accessor :queue, :char_encoding, :errors
+    # Initialises the HTMLInputStream.
+    #
+    # HTMLInputStream(source, [encoding]) -> Normalized stream from source
+    # for use by the HTML5Lib.
+    #
+    # source can be either a file-object, local filename or a string.
+    #
+    # The optional encoding parameter must be a string that indicates
+    # the encoding.  If specified, that encoding will be used,
+    # regardless of any BOM or later declaration (such as in a meta
+    # element)
+    #
+    # parseMeta - Look for a <meta> element containing encoding information
+    def initialize(source, options = {})
+      @encoding   = nil
+      @parse_meta = true
+      @chardet    = true
+      options.each {|name, value| instance_variable_set("@#{name}", value) }
+      # Raw Stream
+      @raw_stream = open_stream(source)
+      # Encoding Information
+      #Number of bytes to use when looking for a meta element with
+      #encoding information
+      @NUM_BYTES_META = 512
+      #Number of bytes to use when using detecting encoding using chardet
+      @NUM_BYTES_CHARDET = 256
+      #Number of bytes to use when reading content
+      @NUM_BYTES_BUFFER = 1024
+      #Encoding to use if no other information can be found
+      @DEFAULT_ENCODING = 'windows-1252'
+      #Detect encoding iff no explicit "transport level" encoding is supplied
+      if @encoding.nil? or not HTML5.is_valid_encoding(@encoding)
+        @char_encoding = detect_encoding
+      else
+        @char_encoding = @encoding
+      end
+      # Read bytes from stream decoding them into Unicode
+      @buffer = @raw_stream.read(@NUM_BYTES_BUFFER) || ''
+      if @char_encoding == 'windows-1252'
+        @win1252 = true
+      elsif @char_encoding != 'utf-8'
+        begin
+          require 'iconv'
+          begin
+            @buffer << @raw_stream.read unless @raw_stream.eof?
+            @buffer = Iconv.iconv('utf-8', @char_encoding, @buffer).first
+          rescue
+            @win1252 = true
+          end
+        rescue LoadError
+          @win1252 = true
+        end
+      end
+      @queue = []
+      @errors = []
+      # Reset position in the list to read from
+      @tell = 0
+      @line = @col = 0
+      @line_lengths = []
+    end
+    # Produces a file object from source.
+    #
+    # source can be either a file object, local filename or a string.
+    def open_stream(source)
+      # Already an IO like object
+      if source.respond_to?(:read)
+        @stream = source
+      else
+        # Treat source as a string and wrap in StringIO
+        @stream = StringIO.new(source)
+      end
+      return @stream
+    end
+    def detect_encoding
+      #First look for a BOM
+      #This will also read past the BOM if present
+      encoding = detect_bom
+      #If there is no BOM need to look for meta elements with encoding
+      #information
+      if encoding.nil? and @parse_meta
+        encoding = detect_encoding_meta
+      end
+      #Guess with chardet, if avaliable
+      if encoding.nil? and @chardet
+        begin
+          require 'rubygems'
+          require 'UniversalDetector' # gem install chardet
+          buffers = []
+          detector = UniversalDetector::Detector.instance
+          detector.reset
+          until @raw_stream.eof?
+            buffer = @raw_stream.read(@NUM_BYTES_CHARDET)
+            break if !buffer or buffer.empty?
+            buffers << buffer
+            detector.feed(buffer)
+            break if detector.instance_eval {@done}
+            detector.instance_eval {
+              @_mLastChar = @_mLastChar.chr if Fixnum === @_mLastChar
+            }
+          end
+          detector.close
+          encoding = detector.result['encoding']
+          seek(buffers*'', 0)
+        rescue LoadError
+        end
+      end
+      # If all else fails use the default encoding
+      if encoding.nil?
+        encoding = @DEFAULT_ENCODING
+      end
+      #Substitute for equivalent encodings
+      encoding_sub = {'iso-8859-1' => 'windows-1252'}
+      if encoding_sub.has_key?(encoding.downcase)
+        encoding = encoding_sub[encoding.downcase]
+      end
+      return encoding
+    end
+    # Attempts to detect at BOM at the start of the stream. If
+    # an encoding can be determined from the BOM return the name of the
+    # encoding otherwise return nil
+    def detect_bom
+      bom_dict = {
+        "\xef\xbb\xbf" => 'utf-8',
+        "\xff\xfe" => 'utf-16le',
+        "\xfe\xff" => 'utf-16be',
+        "\xff\xfe\x00\x00" => 'utf-32le',
+        "\x00\x00\xfe\xff" => 'utf-32be'
+      }
+      # Go to beginning of file and read in 4 bytes
+      string = @raw_stream.read(4)
+      return nil unless string
+      # Try detecting the BOM using bytes from the string
+      encoding = bom_dict[string[0...3]]      # UTF-8
+      seek = 3
+      unless encoding
+        # Need to detect UTF-32 before UTF-16
+        encoding = bom_dict[string]       # UTF-32
+        seek = 4
+        unless encoding
+          encoding = bom_dict[string[0...2]]  # UTF-16
+          seek = 2
+        end
+      end
+      # Set the read position past the BOM if one was found, otherwise
+      # set it to the start of the stream
+      seek(string, encoding ? seek : 0)
+      return encoding
+    end
+    def seek(buffer, n)
+      if @raw_stream.respond_to?(:unget)
+        @raw_stream.unget(buffer[n..-1])
+        return
+      end
+      if @raw_stream.respond_to?(:seek)
+        begin
+          @raw_stream.seek(n)
+          return
+        rescue Errno::ESPIPE
+        end
+      end
+      require 'delegate'
+      @raw_stream = SimpleDelegator.new(@raw_stream)
+      class << @raw_stream
+        def read(chars=-1)
+          if chars == -1 or chars > @data.length
+            result = @data
+            @data = ''
+            return result if __getobj__.eof?
+            return result + __getobj__.read if chars == -1
+            return result + __getobj__.read(chars-result.length)
+          elsif @data.empty?
+            return __getobj__.read(chars)
+          else
+            result = @data[1...chars]
+            @data = @data[chars..-1]
+            return result
+          end
+        end
+        def unget(data)
+          if !@data or @data.empty?
+            @data = data
+          else
+            @data += data
+          end
+        end
+      end
+      @raw_stream.unget(buffer[n .. -1])
+    end
+    # Report the encoding declared by the meta element
+    def detect_encoding_meta
+      buffer = @raw_stream.read(@NUM_BYTES_META)
+      parser = EncodingParser.new(buffer)
+      seek(buffer, 0)
+      return parser.get_encoding
+    end
+    # Returns (line, col) of the current position in the stream.
+    def position
+      line, col = @line, @col
+      @queue.reverse.each do |c|
+        if c == "\n"
+          line -= 1
+          raise RuntimeError.new("col=#{col}") unless col == 0
+          col = @line_lengths[line]
+        else
+          col -= 1
+        end
+      end
+      return [line+1, col]
+    end
+    # Read one character from the stream or queue if available. Return
+    # EOF when EOF is reached.
+    def char
+      unless @queue.empty?
+        return @queue.shift
+      else
+        if @tell + 3 > @buffer.length and !@raw_stream.eof?
+          # read next block
+          @buffer = @buffer[@tell .. -1] + @raw_stream.read(@NUM_BYTES_BUFFER)
+          @tell = 0
+        end
+        c = @buffer[@tell]
+        @tell += 1
+        case c
+        when 0x01 .. 0x7F
+          if c == 0x0D
+            # normalize newlines
+            @tell += 1 if @buffer[@tell] == 0x0A
+            c = 0x0A
+          end
+          # update position in stream
+          if c == 0x0a
+            @line_lengths << @col
+            @line += 1
+            @col = 0
+          else
+            @col += 1
+          end
+          c.chr
+        when 0x80 .. 0xBF
+          if !@win1252
+            [0xFFFD].pack('U') # invalid utf-8
+          elsif c <= 0x9f
+            [ENTITIES_WINDOWS1252[c-0x80]].pack('U')
+          else
+            "\xC2" + c.chr # convert to utf-8
+          end
+        when 0xC0 .. 0xFF
+          if instance_variable_defined?(:@win1252) && @win1252
+            "\xC3" + (c-64).chr # convert to utf-8
+          elsif @buffer[@tell-1 .. @tell+3] =~ /^
+                ( [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
+                |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
+                | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
+                |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
+                |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
+                | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
+                |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
+                )/x
+            @tell += $1.length - 1
+            $1
+          else
+            [0xFFFD].pack('U') # invalid utf-8
+          end
+        when 0x00
+          @errors.push('null character found in input stream, ' +
+            'replaced with U+FFFD')
+          [0xFFFD].pack('U') # null characters are invalid
+        else
+          :EOF
+        end
+      end
+    end
+    # Returns a string of characters from the stream up to but not
+    # including any character in characters or EOF. characters can be
+    # any container that supports the in method being called on it.
+    def chars_until(characters, opposite=false)
+      char_stack = [char]
+      while char_stack.last != :EOF
+        break unless (characters.include?(char_stack.last)) == opposite
+        char_stack.push(char)
+      end
+      # Put the character stopped on back to the front of the queue
+      # from where it came.
+      c = char_stack.pop
+      @queue.insert(0, c) unless c == :EOF
+      return char_stack.join('')
+    end
+    def unget(characters)
+      @queue.unshift(*characters.to_a) unless characters == :EOF
+    end
+  end
+  # String-like object with an assosiated position and various extra methods
+  # If the position is ever greater than the string length then an exception is raised
+  class EncodingBytes < String
+    attr_accessor :position
+    def initialize(value)
+      super(value)
+      @position = -1
+    end
+    def each
+      while @position < length
+        @position += 1
+        yield self[@position]
+      end
+    rescue EOF
+    end
+    def current_byte
+      raise EOF if @position >= length
+      return self[@position].chr
+    end
+    # Skip past a list of characters
+    def skip(chars=SPACE_CHARACTERS)
+      while chars.include?(current_byte)
+        @position += 1
+      end
+    end
+    # Look for a sequence of bytes at the start of a string. If the bytes
+    # are found return true and advance the position to the byte after the
+    # match. Otherwise return false and leave the position alone
+    def match_bytes(bytes, lower=false)
+      data = self[position ... position+bytes.length]
+      data.downcase! if lower
+      rv = (data == bytes)
+      @position += bytes.length if rv == true
+      return rv
+    end
+    # Look for the next sequence of bytes matching a given sequence. If
+    # a match is found advance the position to the last byte of the match
+    def jump_to(bytes)
+      new_position = self[position .. -1].index(bytes)
+      if new_position
+        @position += (new_position + bytes.length-1)
+        return true
+      else
+        raise EOF
+      end
+    end
+    # Move the pointer so it points to the next byte in a set of possible
+    # bytes
+    def find_next(byte_list)
+      until byte_list.include?(current_byte)
+        @position += 1
+      end
+    end
+  end
+  # Mini parser for detecting character encoding from meta elements
+  class EncodingParser
+    # string - the data to work on for encoding detection
+    def initialize(data)
+      @data = EncodingBytes.new(data.to_s)
+      @encoding = nil
+    end
+    @@method_dispatch = [
+      ['<!--', :handle_comment],
+      ['<meta', :handle_meta],
+      ['</', :handle_possible_end_tag],
+      ['<!', :handle_other],
+      ['<?', :handle_other],
+      ['<', :handle_possible_start_tag]
+    ]
+    def get_encoding
+      @data.each do |byte|
+        keep_parsing = true
+        @@method_dispatch.each do |(key, method)|
+          if @data.match_bytes(key, lower = true)
+            keep_parsing = send(method)
+            break
+          end
+        end
+        break unless keep_parsing
+      end
+      @encoding = @encoding.strip unless @encoding.nil?
+      return @encoding
+    end
+    # Skip over comments
+    def handle_comment
+      return @data.jump_to('-->')
+    end
+    def handle_meta
+      # if we have <meta not followed by a space so just keep going
+      return true unless SPACE_CHARACTERS.include?(@data.current_byte)
+      #We have a valid meta element we want to search for attributes
+      while true
+        #Try to find the next attribute after the current position
+        attr = get_attribute
+        return true if attr.nil?
+        if attr[0] == 'charset'
+          tentative_encoding = attr[1]
+          if HTML5.is_valid_encoding(tentative_encoding)
+            @encoding = tentative_encoding
+            return false
+          end
+        elsif attr[0] == 'content'
+          content_parser = ContentAttrParser.new(EncodingBytes.new(attr[1]))
+          tentative_encoding = content_parser.parse
+          if HTML5.is_valid_encoding(tentative_encoding)
+            @encoding = tentative_encoding
+            return false
+          end
+        end
+      end
+    end
+    def handle_possible_start_tag
+      return handle_possible_tag(false)
+    end
+    def handle_possible_end_tag
+      @data.position += 1
+      return handle_possible_tag(true)
+    end
+    def handle_possible_tag(end_tag)
+      unless ASCII_LETTERS.include?(@data.current_byte)
+        #If the next byte is not an ascii letter either ignore this
+        #fragment (possible start tag case) or treat it according to
+        #handleOther
+        if end_tag
+          @data.position -= 1
+          handle_other
+        end
+        return true
+      end
+      @data.find_next(SPACE_CHARACTERS + ['<', '>'])
+      if @data.current_byte == '<'
+        #return to the first step in the overall "two step" algorithm
+        #reprocessing the < byte
+        @data.position -= 1
+      else
+        #Read all attributes
+        {} until get_attribute.nil?
+      end
+      return true
+    end
+    def handle_other
+      return @data.jump_to('>')
+    end
+    # Return a name,value pair for the next attribute in the stream,
+    # if one is found, or nil
+    def get_attribute
+      @data.skip(SPACE_CHARACTERS + ['/'])
+      if @data.current_byte == '<'
+        @data.position -= 1
+        return nil
+      elsif @data.current_byte == '>'
+        return nil
+      end
+      attr_name = []
+      attr_value = []
+      space_found = false
+      #Step 5 attribute name
+      while true
+        if @data.current_byte == '=' and attr_name
+          break
+        elsif SPACE_CHARACTERS.include?(@data.current_byte)
+          space_found = true
+          break
+        elsif ['/', '<', '>'].include?(@data.current_byte)
+          return [attr_name.join(''), '']
+        elsif ASCII_UPPERCASE.include?(@data.current_byte)
+          attr_name.push(@data.current_byte.downcase)
+        else
+          attr_name.push(@data.current_byte)
+        end
+        #Step 6
+        @data.position += 1
+      end
+      #Step 7
+      if space_found
+        @data.skip
+        #Step 8
+        unless @data.current_byte == '='
+          @data.position -= 1
+          return [attr_name.join(''), '']
+        end
+      end
+      #XXX need to advance position in both spaces and value case
+      #Step 9
+      @data.position += 1
+      #Step 10
+      @data.skip
+      #Step 11
+      if ["'", '"'].include?(@data.current_byte)
+        #11.1
+        quote_char = @data.current_byte
+        while true
+          @data.position+=1
+          #11.3
+          if @data.current_byte == quote_char
+            @data.position += 1
+            return [attr_name.join(''), attr_value.join('')]
+          #11.4
+          elsif ASCII_UPPERCASE.include?(@data.current_byte)
+            attr_value.push(@data.current_byte.downcase)
+          #11.5
+          else
+            attr_value.push(@data.current_byte)
+          end
+        end
+      elsif ['>', '<'].include?(@data.current_byte)
+        return [attr_name.join(''), '']
+      elsif ASCII_UPPERCASE.include?(@data.current_byte)
+        attr_value.push(@data.current_byte.downcase)
+      else
+        attr_value.push(@data.current_byte)
+      end
+      while true
+        @data.position += 1
+        if (SPACE_CHARACTERS + ['>', '<']).include?(@data.current_byte)
+          return [attr_name.join(''), attr_value.join('')]
+        elsif ASCII_UPPERCASE.include?(@data.current_byte)
+          attr_value.push(@data.current_byte.downcase)
+        else
+          attr_value.push(@data.current_byte)
+        end
+      end
+    end
+  end
+  class ContentAttrParser
+    def initialize(data)
+      @data = data
+    end
+    def parse
+      begin
+        #Skip to the first ";"
+        @data.position = 0
+        @data.jump_to(';')
+        @data.position += 1
+        @data.skip
+        #Check if the attr name is charset
+        #otherwise return
+        @data.jump_to('charset')
+        @data.position += 1
+        @data.skip
+        unless @data.current_byte == '='
+          #If there is no = sign keep looking for attrs
+          return nil
+        end
+        @data.position += 1
+        @data.skip
+        #Look for an encoding between matching quote marks
+        if ['"', "'"].include?(@data.current_byte)
+          quote_mark = @data.current_byte
+          @data.position += 1
+          old_position = @data.position
+          @data.jump_to(quote_mark)
+          return @data[old_position ... @data.position]
+        else
+          #Unquoted value
+          old_position = @data.position
+          begin
+            @data.find_next(SPACE_CHARACTERS)
+            return @data[old_position ... @data.position]
+          rescue EOF
+            #Return the whole remaining value
+            return @data[old_position .. -1]
+          end
+        end
+      rescue EOF
+        return nil
+      end
+    end
+  end
+  # Determine if a string is a supported encoding
+  def self.is_valid_encoding(encoding)
+    (not encoding.nil? and encoding.kind_of?(String) and ENCODINGS.include?(encoding.downcase.strip))
+  end
+end