RubyGems - html5 - Versions diffs - 0.1.0 - Mend

html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

data/History.txt +3 -0
data/Manifest.txt +58 -0
data/README +9 -0
data/Rakefile.rb +17 -0
data/lib/html5/constants.rb +818 -0
data/lib/html5/filters/base.rb +10 -0
data/lib/html5/filters/inject_meta_charset.rb +82 -0
data/lib/html5/filters/optionaltags.rb +198 -0
data/lib/html5/filters/sanitizer.rb +15 -0
data/lib/html5/filters/whitespace.rb +36 -0
data/lib/html5/html5parser/after_body_phase.rb +46 -0
data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
data/lib/html5/html5parser/after_head_phase.rb +50 -0
data/lib/html5/html5parser/before_head_phase.rb +41 -0
data/lib/html5/html5parser/in_body_phase.rb +607 -0
data/lib/html5/html5parser/in_caption_phase.rb +68 -0
data/lib/html5/html5parser/in_cell_phase.rb +78 -0
data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
data/lib/html5/html5parser/in_head_phase.rb +138 -0
data/lib/html5/html5parser/in_row_phase.rb +87 -0
data/lib/html5/html5parser/in_select_phase.rb +84 -0
data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
data/lib/html5/html5parser/in_table_phase.rb +110 -0
data/lib/html5/html5parser/initial_phase.rb +134 -0
data/lib/html5/html5parser/phase.rb +158 -0
data/lib/html5/html5parser/root_element_phase.rb +42 -0
data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
data/lib/html5/html5parser.rb +248 -0
data/lib/html5/inputstream.rb +654 -0
data/lib/html5/liberalxmlparser.rb +158 -0
data/lib/html5/sanitizer.rb +188 -0
data/lib/html5/serializer/htmlserializer.rb +180 -0
data/lib/html5/serializer/xhtmlserializer.rb +20 -0
data/lib/html5/serializer.rb +2 -0
data/lib/html5/tokenizer.rb +968 -0
data/lib/html5/treebuilders/base.rb +334 -0
data/lib/html5/treebuilders/hpricot.rb +231 -0
data/lib/html5/treebuilders/rexml.rb +208 -0
data/lib/html5/treebuilders/simpletree.rb +185 -0
data/lib/html5/treebuilders.rb +24 -0
data/lib/html5/treewalkers/base.rb +154 -0
data/lib/html5/treewalkers/hpricot.rb +48 -0
data/lib/html5/treewalkers/rexml.rb +48 -0
data/lib/html5/treewalkers/simpletree.rb +48 -0
data/lib/html5/treewalkers.rb +26 -0
data/lib/html5.rb +13 -0
data/parse.rb +217 -0
data/tests/preamble.rb +82 -0
data/tests/test_encoding.rb +35 -0
data/tests/test_lxp.rb +263 -0
data/tests/test_parser.rb +68 -0
data/tests/test_sanitizer.rb +142 -0
data/tests/test_serializer.rb +68 -0
data/tests/test_stream.rb +62 -0
data/tests/test_tokenizer.rb +94 -0
data/tests/test_treewalkers.rb +116 -0
data/tests/tokenizer_test_parser.rb +63 -0
metadata +120 -0

data/lib/html5/filters/base.rb ADDED Viewed

@@ -0,0 +1,10 @@
+require 'delegate'
+require 'enumerator'
+module HTML5
+  module Filters
+    # Common superclass for the token-stream filters in HTML5::Filters.
+    #
+    # It wraps another object through SimpleDelegator and mixes in
+    # Enumerable. The filter subclasses define #each by iterating over the
+    # wrapped object (__getobj__) and yielding possibly modified tokens, so
+    # every Enumerable method then operates on the filtered stream.
+    # Base defines no #each itself; an un-subclassed instance simply
+    # delegates #each to the wrapped object.
+    class Base < SimpleDelegator
+      include Enumerable
+    end
+  end
+end

data/lib/html5/filters/inject_meta_charset.rb ADDED Viewed

@@ -0,0 +1,82 @@
+require 'html5/filters/base'
+module HTML5
+  module Filters
+    class InjectMetaCharset < Base
+      def initialize(source, encoding)
+        super(source)
+        @encoding = encoding
+      end
+      def each
+        state = :pre_head
+        meta_found = @encoding.nil?
+        pending = []
+        __getobj__.each do |token|
+          case token[:type]
+          when :StartTag
+            state = :in_head if token[:name].downcase == "head"
+          when :EmptyTag
+            if token[:name].downcase == "meta"
+              # replace charset with actual encoding
+              token[:data].each_with_index do |(name, value), index|
+                if name == 'charset'
+                  token[:data][index][1] = @encoding
+                  meta_found = true
+                end
+              end
+              # replace charset with actual encoding
+              has_http_equiv_content_type = false
+              content_index = -1
+              token[:data].each_with_index do |(name, value), i|
+                if name.downcase == 'charset'
+                  token[:data][i] = ['charset', @encoding]
+                  meta_found = true
+                  break
+                elsif name == 'http-equiv' and value.downcase == 'content-type'
+                  has_http_equiv_content_type = true
+                elsif name == 'content'
+                  content_index = i
+                end
+              end
+              if !meta_found
+                if has_http_equiv_content_type && content_index >= 0
+                  token[:data][content_index][1] = 'text/html; charset=%s' % @encoding
+                  meta_found = true
+                end
+              end
+            elsif token[:name].downcase == "head" && !meta_found
+              # insert meta into empty head
+              yield :type => :StartTag, :name => "head", :data => token[:data]
+              yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]]
+              yield :type => :EndTag,   :name => "head"
+              meta_found = true
+              next
+            end
+          when :EndTag
+            if token[:name].downcase == "head" && pending.any?
+              # insert meta into head (if necessary) and flush pending queue
+              yield pending.shift
+              yield :type => :EmptyTag, :name => "meta", :data => [["charset", @encoding]] if !meta_found
+              yield pending.shift while pending.any?
+              meta_found = true
+              state = :post_head
+            end
+          end
+          if state == :in_head
+            pending << token
+          else
+            yield token
+          end
+        end
+      end
+    end
+  end
+end

data/lib/html5/filters/optionaltags.rb ADDED Viewed

@@ -0,0 +1,198 @@
+require 'html5/constants'
+require 'html5/filters/base'
+module HTML5
+  module Filters
+    class OptionalTagFilter < Base
+      def slider
+        previous1 = previous2 = nil
+        __getobj__.each do |token|
+          yield previous2, previous1, token if previous1 != nil
+          previous2 = previous1
+          previous1 = token
+        end
+        yield previous2, previous1, nil
+      end
+      def each
+        slider do |previous, token, nexttok|
+          type = token[:type]
+          if type == :StartTag
+            yield token unless token[:data].empty? and is_optional_start(token[:name], previous, nexttok)
+          elsif type == :EndTag
+            yield token unless is_optional_end(token[:name], nexttok)
+          else
+            yield token
+          end
+        end
+      end
+      def is_optional_start(tagname, previous, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if tagname == 'html'
+          # An html element's start tag may be omitted if the first thing
+          # inside the html element is not a space character or a comment.
+          return ![:Comment, :SpaceCharacters].include?(type)
+        elsif tagname == 'head'
+          # A head element's start tag may be omitted if the first thing
+          # inside the head element is an element.
+          return type == :StartTag
+        elsif tagname == 'body'
+          # A body element's start tag may be omitted if the first thing
+          # inside the body element is not a space character or a comment,
+          # except if the first thing inside the body element is a script
+          # or style element and the node immediately preceding the body
+          # element is a head element whose end tag has been omitted.
+          if [:Comment, :SpaceCharacters].include?(type)
+            return false
+          elsif type == :StartTag
+            # XXX: we do not look at the preceding event, so we never omit
+            # the body element's start tag if it's followed by a script or
+            # a style element.
+            return !%w[script style].include?(nexttok[:name])
+          else
+            return true
+          end
+        elsif tagname == 'colgroup'
+          # A colgroup element's start tag may be omitted if the first thing
+          # inside the colgroup element is a col element, and if the element
+          # is not immediately preceeded by another colgroup element whose
+          # end tag has been omitted.
+          if type == :StartTag
+            # XXX: we do not look at the preceding event, so instead we never
+            # omit the colgroup element's end tag when it is immediately
+            # followed by another colgroup element. See is_optional_end.
+            return nexttok[:name] == "col"
+          else
+            return false
+          end
+        elsif tagname == 'tbody'
+          # A tbody element's start tag may be omitted if the first thing
+          # inside the tbody element is a tr element, and if the element is
+          # not immediately preceeded by a tbody, thead, or tfoot element
+          # whose end tag has been omitted.
+          if type == :StartTag
+            # omit the thead and tfoot elements' end tag when they are
+            # immediately followed by a tbody element. See is_optional_end.
+            if previous and previous[:type] == :EndTag && %w(tbody thead tfoot).include?(previous[:name])
+              return false
+            end
+            return nexttok[:name] == 'tr'
+          else
+            return false
+          end
+        end
+        return false
+      end
+      def is_optional_end(tagname, nexttok)
+        type = nexttok ? nexttok[:type] : nil
+        if %w[html head body].include?(tagname)
+          # An html element's end tag may be omitted if the html element
+          # is not immediately followed by a space character or a comment.
+          return ![:Comment, :SpaceCharacters].include?(type)
+        elsif %w[li optgroup option tr].include?(tagname)
+          # A li element's end tag may be omitted if the li element is
+          # immediately followed by another li element or if there is
+          # no more content in the parent element.
+          # An optgroup element's end tag may be omitted if the optgroup
+          # element is immediately followed by another optgroup element,
+          # or if there is no more content in the parent element.
+          # An option element's end tag may be omitted if the option
+          # element is immediately followed by another option element,
+          # or if there is no more content in the parent element.
+          # A tr element's end tag may be omitted if the tr element is
+          # immediately followed by another tr element, or if there is
+          # no more content in the parent element.
+          if type == :StartTag
+            return nexttok[:name] == tagname
+          else
+            return type == :EndTag || type == nil
+          end
+        elsif %w(dt dd).include?(tagname)
+          # A dt element's end tag may be omitted if the dt element is
+          # immediately followed by another dt element or a dd element.
+          # A dd element's end tag may be omitted if the dd element is
+          # immediately followed by another dd element or a dt element,
+          # or if there is no more content in the parent element.
+          if type == :StartTag
+            return %w(dt dd).include?(nexttok[:name])
+          elsif tagname == 'dd'
+            return type == :EndTag || type == nil
+          else
+            return false
+          end
+        elsif tagname == 'p'
+          # A p element's end tag may be omitted if the p element is
+          # immediately followed by an address, blockquote, dl, fieldset,
+          # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
+          # or ul  element, or if there is no more content in the parent
+          # element.
+          if type == :StartTag
+            return %w(address blockquote dl fieldset form h1 h2 h3 h4 h5
+                      h6 hr menu ol p pre table ul).include?(nexttok[:name])
+          else
+            return type == :EndTag || type == nil
+          end
+        elsif tagname == 'colgroup'
+          # A colgroup element's end tag may be omitted if the colgroup
+          # element is not immediately followed by a space character or
+          # a comment.
+          if [:Comment, :SpaceCharacters].include?(type)
+            return false
+          elsif type == :StartTag
+            # XXX: we also look for an immediately following colgroup
+            # element. See is_optional_start.
+            return nexttok[:name] != 'colgroup'
+          else
+            return true
+          end
+        elsif %w(thead tbody).include? tagname
+          # A thead element's end tag may be omitted if the thead element
+          # is immediately followed by a tbody or tfoot element.
+          # A tbody element's end tag may be omitted if the tbody element
+          # is immediately followed by a tbody or tfoot element, or if
+          # there is no more content in the parent element.
+          # A tfoot element's end tag may be omitted if the tfoot element
+          # is immediately followed by a tbody element, or if there is no
+          # more content in the parent element.
+          # XXX: we never omit the end tag when the following element is
+          # a tbody. See is_optional_start.
+          if type == :StartTag
+            return %w(tbody tfoot).include?(nexttok[:name])
+          elsif tagname == 'tbody'
+            return (type == :EndTag or type == nil)
+          else
+            return false
+          end
+        elsif tagname == 'tfoot'
+          # A tfoot element's end tag may be omitted if the tfoot element
+          # is immediately followed by a tbody element, or if there is no
+          # more content in the parent element.
+          # XXX: we never omit the end tag when the following element is
+          # a tbody. See is_optional_start.
+          if type == :StartTag
+            return nexttok[:name] == 'tbody'
+          else
+            return type == :EndTag || type == nil
+          end
+        elsif %w(td th).include? tagname
+          # A td element's end tag may be omitted if the td element is
+          # immediately followed by a td or th element, or if there is
+          # no more content in the parent element.
+          # A th element's end tag may be omitted if the th element is
+          # immediately followed by a td or th element, or if there is
+          # no more content in the parent element.
+          if type == :StartTag
+            return %w(td th).include?(nexttok[:name])
+          else
+            return type == :EndTag || type == nil
+          end
+        end
+        return false
+      end
+    end
+  end
+end

data/lib/html5/filters/sanitizer.rb ADDED Viewed

@@ -0,0 +1,15 @@
+require 'html5/filters/base'
+require 'html5/sanitizer'
+module HTML5
+  module Filters
+    class HTMLSanitizeFilter < Base
+      include HTMLSanitizeModule
+      def each
+        __getobj__.each do |token|
+          yield(sanitize_token(token))
+        end
+      end
+    end
+  end
+end

data/lib/html5/filters/whitespace.rb ADDED Viewed

@@ -0,0 +1,36 @@
+require 'html5/constants'
+require 'html5/filters/base'
+module HTML5
+  module Filters
+    class WhitespaceFilter < Base
+      SPACE_PRESERVE_ELEMENTS = %w[pre textarea] + RCDATA_ELEMENTS
+      SPACES = /[#{SPACE_CHARACTERS.join('')}]+/m
+      def each
+        preserve = 0
+        __getobj__.each do |token|
+          case token[:type]
+          when :StartTag
+            if preserve > 0 or SPACE_PRESERVE_ELEMENTS.include?(token[:name])
+              preserve += 1
+            end
+          when :EndTag
+            preserve -= 1 if preserve > 0
+          when :SpaceCharacters
+            token[:data] = " " if preserve == 0 && token[:data]
+          when :Characters
+            token[:data] = token[:data].sub(SPACES,' ') if preserve == 0
+          end
+          yield token
+        end
+      end
+    end
+  end
+end

data/lib/html5/html5parser/after_body_phase.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'html5/html5parser/phase'
+module HTML5
+  class AfterBodyPhase < Phase
+    handle_end 'html'
+    def processComment(data)
+      # This is needed because data is to be appended to the <html> element
+      # here and not to whatever is currently open.
+      @tree.insert_comment(data, @tree.open_elements.first)
+    end
+    def processCharacters(data)
+      parse_error(_('Unexpected non-space characters in the after body phase.'))
+      @parser.phase = @parser.phases[:inBody]
+      @parser.phase.processCharacters(data)
+    end
+    def processStartTag(name, attributes)
+      parse_error(_("Unexpected start tag token (#{name}) in the after body phase."))
+      @parser.phase = @parser.phases[:inBody]
+      @parser.phase.processStartTag(name, attributes)
+    end
+    def endTagHtml(name)
+      if @parser.inner_html
+        parse_error
+      else
+        # XXX: This may need to be done, not sure
+        # Don't set last_phase to the current phase but to the inBody phase
+        # instead. No need for extra parse errors if there's something after </html>.
+        # Try "<!doctype html>X</html>X" for instance.
+        @parser.last_phase = @parser.phase
+        @parser.phase      = @parser.phases[:trailingEnd]
+      end
+    end
+    def endTagOther(name)
+      parse_error(_("Unexpected end tag token (#{name}) in the after body phase."))
+      @parser.phase = @parser.phases[:inBody]
+      @parser.phase.processEndTag(name)
+    end
+  end
+end

data/lib/html5/html5parser/after_frameset_phase.rb ADDED Viewed

@@ -0,0 +1,34 @@
+require 'html5/html5parser/phase'
+module HTML5
+  # Parser phase handling tokens seen after a frameset has ended.
+  #
+  # Character data and unexpected start/end tags are reported as parse
+  # errors and otherwise ignored; </html> moves the parser to the
+  # :trailingEnd phase.
+  class AfterFramesetPhase < Phase
+    # http://www.whatwg.org/specs/web-apps/current-work/#after3
+    # NOTE(review): handle_start / handle_end are defined by Phase (not
+    # shown here); presumably they route the listed tag names to the
+    # matching startTagXxx / endTagXxx methods -- confirm against Phase.
+    handle_start 'html', 'noframes'
+    handle_end 'html'
+    # Reports the character data as a parse error; nothing is inserted.
+    def processCharacters(data)
+      parse_error(_('Unexpected non-space characters in the after frameset phase. Ignored.'))
+    end
+    # Hands <noframes> to the :inBody phase's start-tag processing without
+    # changing the parser's current phase.
+    def startTagNoframes(name, attributes)
+      @parser.phases[:inBody].processStartTag(name, attributes)
+    end
+    # Reports the start tag as a parse error; nothing is inserted.
+    def startTagOther(name, attributes)
+      parse_error(_("Unexpected start tag (#{name}) in the after frameset phase. Ignored."))
+    end
+    # Records the current phase as last_phase and switches the parser to
+    # the :trailingEnd phase.
+    def endTagHtml(name)
+      @parser.last_phase = @parser.phase
+      @parser.phase      = @parser.phases[:trailingEnd]
+    end
+    # Reports the end tag as a parse error; nothing else happens.
+    def endTagOther(name)
+      parse_error(_("Unexpected end tag (#{name}) in the after frameset phase. Ignored."))
+    end
+  end
+end

data/lib/html5/html5parser/after_head_phase.rb ADDED Viewed

@@ -0,0 +1,50 @@
+require 'html5/html5parser/phase'
+module HTML5
+  # Parser phase handling tokens seen after the head and before a body or
+  # frameset has been opened.
+  #
+  # <body> and <frameset> open the corresponding element and phase. Most
+  # other input first creates an implied <body> (see anythingElse) and is
+  # then reprocessed by the phase that is current afterwards.
+  class AfterHeadPhase < Phase
+    # NOTE(review): handle_start is defined by Phase (not shown here);
+    # presumably it routes the listed tag names to startTagXxx methods,
+    # with the hash form sending all six head-only tags to
+    # startTagFromHead -- confirm against Phase.
+    handle_start 'html', 'body', 'frameset', %w( base link meta script style title ) => 'FromHead'
+    # End of input: create the implied body, then let the new current
+    # phase handle EOF.
+    def process_eof
+      anythingElse
+      @parser.phase.process_eof
+    end
+    # Character data: create the implied body, then reprocess the data in
+    # the new current phase.
+    def processCharacters(data)
+      anythingElse
+      @parser.phase.processCharacters(data)
+    end
+    # Inserts the <body> element with its attributes and switches to the
+    # :inBody phase.
+    def startTagBody(name, attributes)
+      @tree.insert_element(name, attributes)
+      @parser.phase = @parser.phases[:inBody]
+    end
+    # Inserts the <frameset> element with its attributes and switches to
+    # the :inFrameset phase.
+    def startTagFrameset(name, attributes)
+      @tree.insert_element(name, attributes)
+      @parser.phase = @parser.phases[:inFrameset]
+    end
+    # A head-only element appeared after the head: report a parse error,
+    # switch back to the :inHead phase and reprocess the tag there. The
+    # parser is left in the :inHead phase by this method.
+    def startTagFromHead(name, attributes)
+      parse_error(_("Unexpected start tag (#{name}) that can be in head. Moved."))
+      @parser.phase = @parser.phases[:inHead]
+      @parser.phase.processStartTag(name, attributes)
+    end
+    # Any other start tag: create the implied body, then reprocess the tag
+    # in the new current phase.
+    def startTagOther(name, attributes)
+      anythingElse
+      @parser.phase.processStartTag(name, attributes)
+    end
+    # Every end tag: create the implied body, then reprocess the end tag
+    # in the new current phase.
+    def processEndTag(name)
+      anythingElse
+      @parser.phase.processEndTag(name)
+    end
+    # Inserts a <body> element with no attributes and makes :inBody the
+    # parser's current phase.
+    def anythingElse
+      @tree.insert_element('body', {})
+      @parser.phase = @parser.phases[:inBody]
+    end
+  end
+end

data/lib/html5/html5parser/before_head_phase.rb ADDED Viewed

@@ -0,0 +1,41 @@
+require 'html5/html5parser/phase'
+module HTML5
+  # Parser phase handling tokens seen before the head element is opened.
+  #
+  # A <head> start tag opens the head. EOF, character data, other start
+  # tags and the end tags html/head/body/br/p first create an implied
+  # <head> and are then reprocessed by the phase that is current
+  # afterwards. Any other end tag is reported as a parse error.
+  class BeforeHeadPhase < Phase
+    # NOTE(review): handle_start / handle_end are defined by Phase (not
+    # shown here); presumably they route the listed tag names to
+    # startTagXxx / endTagXxx methods, with the hash form sending the five
+    # end tags to endTagImplyHead -- confirm against Phase.
+    handle_start 'html', 'head'
+    handle_end %w( html head body br p ) => 'ImplyHead'
+    # End of input: create the implied head, then let the new current
+    # phase handle EOF.
+    def process_eof
+      startTagHead('head', {})
+      @parser.phase.process_eof
+    end
+    # Character data: create the implied head, then reprocess the data in
+    # the new current phase.
+    def processCharacters(data)
+      startTagHead('head', {})
+      @parser.phase.processCharacters(data)
+    end
+    # Inserts the head element, records the last open element as the
+    # tree's head_pointer and switches the parser to the :inHead phase.
+    # Also called with ('head', {}) to create an implied head.
+    def startTagHead(name, attributes)
+      @tree.insert_element(name, attributes)
+      @tree.head_pointer = @tree.open_elements[-1]
+      @parser.phase = @parser.phases[:inHead]
+    end
+    # Any other start tag: create the implied head, then reprocess the tag
+    # in the new current phase.
+    def startTagOther(name, attributes)
+      startTagHead('head', {})
+      @parser.phase.processStartTag(name, attributes)
+    end
+    # One of the end tags registered above with 'ImplyHead': create the
+    # implied head, then reprocess the end tag in the new current phase.
+    def endTagImplyHead(name)
+      startTagHead('head', {})
+      @parser.phase.processEndTag(name)
+    end
+    # Any other end tag: report a parse error; nothing else happens.
+    def endTagOther(name)
+      parse_error(_("Unexpected end tag (#{name}) after the (implied) root element."))
+    end
+  end
+end
+end