RubyGems - spk-html5 - Versions diffs - 0.10.1 - Mend

spk-html5 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (74) hide show

data/History.txt +10 -0
data/Manifest.txt +73 -0
data/README +45 -0
data/Rakefile.rb +33 -0
data/bin/html5 +7 -0
data/lib/html5.rb +13 -0
data/lib/html5/cli.rb +248 -0
data/lib/html5/constants.rb +1061 -0
data/lib/html5/filters/base.rb +10 -0
data/lib/html5/filters/inject_meta_charset.rb +82 -0
data/lib/html5/filters/iso639codes.rb +755 -0
data/lib/html5/filters/optionaltags.rb +198 -0
data/lib/html5/filters/rfc2046.rb +31 -0
data/lib/html5/filters/rfc3987.rb +91 -0
data/lib/html5/filters/sanitizer.rb +15 -0
data/lib/html5/filters/validator.rb +834 -0
data/lib/html5/filters/whitespace.rb +36 -0
data/lib/html5/html5parser.rb +247 -0
data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
data/lib/html5/html5parser/after_body_phase.rb +46 -0
data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
data/lib/html5/html5parser/after_head_phase.rb +55 -0
data/lib/html5/html5parser/before_head_phase.rb +44 -0
data/lib/html5/html5parser/before_html_phase.rb +41 -0
data/lib/html5/html5parser/in_body_phase.rb +636 -0
data/lib/html5/html5parser/in_caption_phase.rb +69 -0
data/lib/html5/html5parser/in_cell_phase.rb +78 -0
data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
data/lib/html5/html5parser/in_head_phase.rb +143 -0
data/lib/html5/html5parser/in_row_phase.rb +96 -0
data/lib/html5/html5parser/in_select_phase.rb +90 -0
data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
data/lib/html5/html5parser/in_table_phase.rb +177 -0
data/lib/html5/html5parser/initial_phase.rb +133 -0
data/lib/html5/html5parser/phase.rb +171 -0
data/lib/html5/inputstream.rb +735 -0
data/lib/html5/liberalxmlparser.rb +158 -0
data/lib/html5/sanitizer.rb +209 -0
data/lib/html5/serializer.rb +2 -0
data/lib/html5/serializer/htmlserializer.rb +179 -0
data/lib/html5/serializer/xhtmlserializer.rb +20 -0
data/lib/html5/sniffer.rb +45 -0
data/lib/html5/tokenizer.rb +1059 -0
data/lib/html5/treebuilders.rb +24 -0
data/lib/html5/treebuilders/base.rb +339 -0
data/lib/html5/treebuilders/hpricot.rb +231 -0
data/lib/html5/treebuilders/rexml.rb +215 -0
data/lib/html5/treebuilders/simpletree.rb +191 -0
data/lib/html5/treewalkers.rb +26 -0
data/lib/html5/treewalkers/base.rb +162 -0
data/lib/html5/treewalkers/hpricot.rb +48 -0
data/lib/html5/treewalkers/rexml.rb +48 -0
data/lib/html5/treewalkers/simpletree.rb +48 -0
data/lib/html5/version.rb +3 -0
data/test/preamble.rb +69 -0
data/test/test_cli.rb +16 -0
data/test/test_encoding.rb +35 -0
data/test/test_input_stream.rb +26 -0
data/test/test_lxp.rb +283 -0
data/test/test_parser.rb +63 -0
data/test/test_sanitizer.rb +173 -0
data/test/test_serializer.rb +67 -0
data/test/test_sniffer.rb +27 -0
data/test/test_stream.rb +71 -0
data/test/test_tokenizer.rb +95 -0
data/test/test_treewalkers.rb +135 -0
data/test/test_validator.rb +31 -0
data/test/tokenizer_test_parser.rb +67 -0
data/test19.rb +38 -0
metadata +198 -0

data/test/test_parser.rb ADDED

@@ -0,0 +1,63 @@
+require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+require 'html5/treebuilders'
+require 'html5/html5parser'
+require 'html5/cli'
+$tree_types_to_test = ['simpletree', 'rexml']
+begin
+  require 'hpricot'
+  $tree_types_to_test.push('hpricot')
+rescue LoadError
+end
+class Html5ParserTestCase < Test::Unit::TestCase
+  include HTML5
+  include TestSupport
+  html5_test_files('tree-construction').each do |test_file|
+    test_name = File.basename(test_file).sub('.dat', '')
+    TestData.new(test_file, %w(data errors document-fragment document)).each_with_index do |(input, errors, inner_html, expected), index|
+      next if (input.downcase.include?('<svg')) || input.downcase.include?('xlink:')
+      errors = errors.split("\n")
+      expected = expected.gsub("\n| ","\n")[2..-1]
+      $tree_types_to_test.each do |tree_name|
+        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
+          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
+          if inner_html
+            parser.parse_fragment(input, inner_html)
+          else
+            parser.parse(input)
+          end
+          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
+          assert_equal sortattrs(expected), sortattrs(actual_output), [
+            '', 'Input:', input,
+            '', 'Expected:', expected,
+            '', 'Received:', actual_output
+          ].join("\n")
+          actual_errors = parser.errors.map do |(line, col), message, datavars|
+            message = CLI::PythonicTemplate.new(E[message]).to_s(datavars)
+            "Line: #{line} Col: #{col} #{message}"
+          end
+          assert_equal errors, actual_errors, [
+            '', 'Input', input,
+            '', "Expected errors (#{errors.length}):", errors.join("\n"),
+            '', "Actual errors (#{actual_errors.length}):",
+                 actual_errors.join("\n") + "\n"
+          ].join("\n")
+        end
+      end
+    end
+  end
+end

data/test/test_sanitizer.rb ADDED

@@ -0,0 +1,173 @@
+#!/usr/bin/env ruby
+require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+require 'html5/html5parser'
+require 'html5/liberalxmlparser'
+require 'html5/treewalkers'
+require 'html5/serializer'
+require 'html5/sanitizer'
+class SanitizeTest < Test::Unit::TestCase
+  include HTML5
+  def sanitize_xhtml stream
+    XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).join
+  end
+  def sanitize_html stream
+    HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).join
+  end
+  def sanitize_rexml stream
+    require 'rexml/document'
+    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
+    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
+    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
+      :quote_char => "'",
+      :inject_meta_charset => false,
+      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
+  rescue REXML::ParseException
+    return "Ill-formed XHTML!"
+  end
+  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+    assert_equal htmloutput, sanitize_html(input)
+    assert_equal xhtmloutput, sanitize_xhtml(input)
+    assert_equal rexmloutput, sanitize_rexml(input)
+  end
+  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_allow_#{tag_name}_tag" do
+      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
+      htmloutput  = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
+      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
+      rexmloutput = xhtmloutput
+      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+      elsif tag_name == 'col'
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        rexmloutput = "<col title='1' />"
+      elsif tag_name == 'table'
+        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
+        xhtmloutput = htmloutput
+      elsif tag_name == 'image'
+        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
+      elsif VOID_ELEMENTS.include?(tag_name)
+        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+        xhtmloutput = htmloutput
+        htmloutput += '<br/>' if tag_name == 'br'
+        rexmloutput =  "<#{tag_name} title='1' />"
+      end
+      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+    end
+  end
+  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
+      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    next if attribute_name == 'style'
+    define_method "test_should_allow_#{attribute_name}_attribute" do
+      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
+      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      check_sanitization(input, htmloutput, output, output)
+    end
+  end
+  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
+      output =  "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_#{protocol}_uris" do
+      input = %(<a href="#{protocol}">foo</a>)
+      output = "<a href='#{protocol}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+    define_method "test_should_allow_uppercase_#{protocol}_uris" do
+      input = %(<a href="#{protocol.upcase}">foo</a>)
+      output = "<a href='#{protocol.upcase}'>foo</a>"
+      check_sanitization(input, output, output, output)
+    end
+  end
+  HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+    next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)
+    define_method "test_#{tag_name}_should_allow_local_href" do
+      input = %(<#{tag_name} xlink:href="#foo"/>)
+      output = "<#{tag_name.downcase} xlink:href='#foo'/>"
+      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
+      input = %(<#{tag_name} xlink:href="\n#foo"/>)
+      output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
+      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
+      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
+      output = "<#{tag_name.downcase}/>"
+      xhtmloutput = "<#{tag_name}></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
+      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
+      output = "<#{tag_name.downcase}/>"
+      xhtmloutput = "<#{tag_name}></#{tag_name}>"
+      check_sanitization(input, output, xhtmloutput, xhtmloutput)
+    end
+  end
+  def test_should_handle_astral_plane_characters
+    input = "<p>&#x1d4b5; &#x1d538;</p>"
+    output = "<p>\360\235\222\265 \360\235\224\270</p>"
+    check_sanitization(input, output, output, output)
+    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
+    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
+    check_sanitization(input, output, output, output)
+  end
+# This affects only NS4. Is it worth fixing?
+#  def test_javascript_includes
+#    input = %(<div size="&{alert('XSS')}">foo</div>)
+#    output = "<div>foo</div>"
+#    check_sanitization(input, output, output, output)
+#  end
+  html5_test_files('sanitizer').each do |filename|
+    JSON::parse(open(filename).read).each do |test|
+      define_method "test_#{test['name']}" do
+        check_sanitization(
+          test['input'],
+          test['output'],
+          test['xhtml'] || test['output'],
+          test['rexml'] || test['output']
+        )
+      end
+    end
+  end
+end

data/test/test_serializer.rb ADDED

@@ -0,0 +1,67 @@
+require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+require 'html5/html5parser'
+require 'html5/serializer'
+require 'html5/treewalkers'
+#Run the serialize error checks
+checkSerializeErrors = false
+class JsonWalker < HTML5::TreeWalkers::Base
+  def each
+    @tree.each do |token|
+      case token[0]
+      when 'StartTag'
+        yield start_tag(token[1], token[2])
+      when 'EndTag'
+        yield end_tag(token[1])
+      when 'EmptyTag'
+        yield empty_tag(token[1], token[2])
+      when 'Comment'
+        yield comment(token[1])
+      when 'Characters', 'SpaceCharacters'
+        text(token[1]) {|textToken| yield textToken}
+      when 'Doctype'
+        yield doctype(token[1], token[2], token[3])
+      else
+        raise "Unknown token type: " + token[0]
+      end
+    end
+  end
+end
+class Html5SerializeTestcase < Test::Unit::TestCase
+  html5_test_files('serializer').each do |filename|
+    test_name = File.basename(filename).sub('.test', '')
+    tests = JSON::parse(open(filename).read)
+    tests['tests'].each_with_index do |test, index|
+      define_method "test_#{test_name}_#{index+1}" do
+        if test["options"] and test["options"]["encoding"]
+          test["options"][:encoding] = test["options"]["encoding"]
+        end
+        result = HTML5::HTMLSerializer.
+          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+        expected = test["expected"]
+        if expected.length == 1
+          assert_equal(expected[0], result, test["description"])
+        elsif !expected.include?(result)
+          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
+        end
+        next if test_name == 'optionaltags'
+        result = HTML5::XHTMLSerializer.
+          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+        expected = test["xhtml"] || test["expected"]
+        if expected.length == 1
+          assert_equal(expected[0], result, test["description"])
+        elsif !expected.include?(result)
+          flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
+        end
+      end
+    end
+  end
+end

data/test/test_sniffer.rb ADDED

@@ -0,0 +1,27 @@
+require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+require "html5/sniffer"
+class TestFeedTypeSniffer < Test::Unit::TestCase
+  include HTML5
+  include TestSupport
+  include Sniffer
+  html5_test_files('sniffer').each do |test_file|
+    test_name = File.basename(test_file).sub('.test', '')
+    tests = JSON.parse(File.read(test_file))
+    tests.each_with_index do |data, index|
+      define_method('test_%s_%d' % [test_name, index + 1]) do
+        assert_equal data['type'], html_or_feed(data['input'])
+      end
+    end
+  end
+  # each_with_index do |t, i|
+  #     define_method "test_#{i}" do
+  #       assert_equal t[0], sniff_feed_type(t[1])
+  #     end
+  #   end
+end

data/test/test_stream.rb ADDED

@@ -0,0 +1,71 @@
+require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+require 'html5/inputstream'
+class HTMLInputStreamTest < Test::Unit::TestCase
+  include HTML5
+  def getc stream
+    if String.method_defined? :force_encoding
+      stream.char.force_encoding('binary')
+    else
+      stream.char
+    end
+  end
+  def test_char_ascii
+    stream = HTMLInputStream.new("'", :encoding=>'ascii')
+    assert_equal('ascii', stream.char_encoding)
+    assert_equal("'", stream.char)
+  end
+  def test_char_null
+    stream = HTMLInputStream.new("\x00")
+    assert_equal("\xef\xbf\xbd", getc(stream))
+  end
+  def test_char_utf8
+    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
+    assert_equal('utf-8', stream.char_encoding)
+    assert_equal("\xe2\x80\x98", getc(stream))
+  end
+  def test_char_win1252
+    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
+    assert_equal('windows-1252', stream.char_encoding)
+    assert_equal("\xc2\xa2", getc(stream))
+    assert_equal("\xc3\x85", getc(stream))
+    assert_equal("\xc3\xb1", getc(stream))
+    assert_equal("\xe2\x80\x99", getc(stream))
+    assert_equal("\xe2\x80\xa0", getc(stream))
+  end
+  def test_bom
+    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
+    assert_equal('utf-8', stream.char_encoding)
+    assert_equal("'", stream.char)
+  end
+  begin
+    require 'iconv'
+    def test_utf_16
+      input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
+      stream = HTMLInputStream.new(input)
+      assert('utf-16-le', stream.char_encoding)
+      assert_equal(1025, stream.chars_until(' ', true).length)
+    end
+  rescue LoadError
+    puts "iconv not found, skipping iconv tests"
+  end
+  def test_newlines
+    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
+    assert_equal([1,0], stream.position)
+    assert_equal("a\nbb\n", stream.chars_until('c'))
+    assert_equal([3,0], stream.position)
+    assert_equal("ccc\ndddd", stream.chars_until('x'))
+    assert_equal([4,4], stream.position)
+    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
+  end
+end

data/test/test_tokenizer.rb ADDED

@@ -0,0 +1,95 @@
+require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+require 'html5/tokenizer'
+require 'tokenizer_test_parser'
+class Html5TokenizerTestCase < Test::Unit::TestCase
+  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
+    if !ignoreErrorOrder
+      assert_equal expectedTokens, receivedTokens, message
+    else
+      #Sort the tokens into two groups; non-parse errors and parse errors
+      expected = [[],[]]
+      received = [[],[]]
+      for token in expectedTokens
+        if token != "ParseError"
+          expected[0] << token
+        else
+          expected[1] << token
+        end
+      end
+      for token in receivedTokens
+        if token != "ParseError"
+          received[0] << token
+        else
+          received[1] << token
+        end
+      end
+      assert_equal expected, received, message
+    end
+  end
+  def type_of?(token_name, token)
+    token != 'ParseError' and token_name == token.first
+  end
+  def convert_attribute_arrays_to_hashes(tokens)
+    tokens.inject([]) do |tokens, token|
+      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
+      tokens << token
+    end
+  end
+  def concatenate_consecutive_characters(tokens)
+    tokens.inject([]) do |tokens, token|
+      if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
+        tokens.last[1] = tokens.last[1] + token[1]
+        next tokens
+      end
+      tokens << token
+    end
+  end
+  def tokenizer_test(data)
+    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
+      message = [
+        '', 'Description:', data['description'],
+        '', 'Input:', data['input'],
+        '', 'Content Model Flag:', content_model_flag,
+        '' ] * "\n"
+      assert_nothing_raised message do
+        tokenizer = HTML5::HTMLTokenizer.new(data['input'], :encoding => 'utf-8')
+        tokenizer.content_model_flag = content_model_flag.to_sym
+        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
+        tokens = TokenizerTestParser.new(tokenizer).parse
+        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
+        expected = concatenate_consecutive_characters(data['output'])
+        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
+      end
+    end
+  end
+  html5_test_files('tokenizer').each do |test_file|
+    test_name = File.basename(test_file).sub('.test', '')
+    tests = JSON.parse(File.read(test_file))['tests']
+    if tests != nil
+      tests.each_with_index do |data, index|
+        define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
+      end
+    end
+  end
+end