spk-html5 0.10.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (74) hide show
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
@@ -0,0 +1,63 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+
3
+ require 'html5/treebuilders'
4
+ require 'html5/html5parser'
5
+ require 'html5/cli'
6
+
7
# Names of the tree builders the parser tests are generated for.
# 'simpletree' and 'rexml' are always listed; 'hpricot' is appended only
# when the library can be required.
$tree_types_to_test = %w[simpletree rexml]

begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # hpricot is optional: when it is missing the list is left untouched.
end
14
+
15
# Generates one test method per tree-construction test case and per tree
# builder listed in $tree_types_to_test. Each generated test parses the
# input (as a fragment when the case supplies a container element),
# compares the serialized tree with the expected dump and then compares
# the parse errors reported by the parser with the expected ones.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |path|
    suite = File.basename(path).sub('.dat', '')
    cases = TestData.new(path, %w(data errors document-fragment document))

    cases.each_with_index do |(input, errors, inner_html, expected), index|
      # Cases whose input mentions SVG or xlink are skipped.
      lowered = input.downcase
      next if lowered.include?('<svg') || lowered.include?('xlink:')

      expected_errors = errors.split("\n")
      # Strip the "| " line prefix of the expected dump.
      expected_tree = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        define_method "test_#{suite}_#{index + 1}_#{tree_name}" do
          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
          inner_html ? parser.parse_fragment(input, inner_html) : parser.parse(input)

          dump = parser.tree.testSerializer(parser.tree.document)
          actual_output = convertTreeDump(dump)

          assert_equal(
            sortattrs(expected_tree),
            sortattrs(actual_output),
            ['', 'Input:', input,
             '', 'Expected:', expected_tree,
             '', 'Received:', actual_output].join("\n")
          )

          # Render every reported error the same way the expectations are written.
          actual_errors = parser.errors.map do |(line, col), message, datavars|
            rendered = CLI::PythonicTemplate.new(E[message]).to_s(datavars)
            "Line: #{line} Col: #{col} #{rendered}"
          end

          assert_equal(
            expected_errors,
            actual_errors,
            ['', 'Input', input,
             '', "Expected errors (#{expected_errors.length}):", expected_errors.join("\n"),
             '', "Actual errors (#{actual_errors.length}):",
             actual_errors.join("\n") + "\n"].join("\n")
          )
        end
      end
    end
  end
end
@@ -0,0 +1,173 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
4
+
5
+ require 'html5/html5parser'
6
+ require 'html5/liberalxmlparser'
7
+ require 'html5/treewalkers'
8
+ require 'html5/serializer'
9
+ require 'html5/sanitizer'
10
+
11
+ class SanitizeTest < Test::Unit::TestCase
12
+ include HTML5
13
+
14
# Parses +stream+ as an XHTML fragment with HTMLSanitizer as the tokenizer
# and returns the joined result of the parse.
def sanitize_xhtml(stream)
  options = {
    :tokenizer              => HTMLSanitizer,
    :encoding               => 'utf-8',
    :lowercase_element_name => false,
    :lowercase_attr_name    => false
  }
  fragment = XHTMLParser.parse_fragment(stream, options)
  fragment.join
end
17
+
18
# Parses +stream+ as an HTML fragment with HTMLSanitizer as the tokenizer
# and returns the joined result of the parse.
def sanitize_html(stream)
  options = {
    :tokenizer              => HTMLSanitizer,
    :encoding               => 'utf-8',
    :lowercase_element_name => false,
    :lowercase_attr_name    => false
  }
  fragment = HTMLParser.parse_fragment(stream, options)
  fragment.join
end
21
+
22
# Sanitizes +stream+ by way of REXML: the markup is wrapped in an XHTML
# <div>, parsed with REXML, walked with the 'rexml' tree walker and
# serialized by XHTMLSerializer with sanitizing enabled. The wrapper <div>
# is then stripped from the serialized text.
#
# Returns the sanitized markup, or the String "Ill-formed XHTML!" when
# REXML cannot parse the wrapped input.
def sanitize_rexml(stream)
  # Loaded lazily so that REXML is only needed when this helper is used.
  require 'rexml/document'
  doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
  tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
  serialized = XHTMLSerializer.serialize(tokens, {
    :encoding            => 'utf-8',
    :quote_char          => "'",
    :inject_meta_charset => false,
    :sanitize            => true
  })
  # The dots of the namespace URI are escaped so that only the literal
  # wrapper element is removed (unescaped dots match any character).
  serialized.gsub(%r{\A<div xmlns='http://www\.w3\.org/1999/xhtml'>(.*)</div>\Z}m, '\1')
rescue REXML::ParseException
  "Ill-formed XHTML!"
end
33
+
34
# Runs +input+ through the HTML, XHTML and REXML sanitizing helpers, in
# that order, and asserts that each produces its expected output.
def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  html_result = sanitize_html(input)
  assert_equal(htmloutput, html_result)

  xhtml_result = sanitize_xhtml(input)
  assert_equal(xhtmloutput, xhtml_result)

  rexml_result = sanitize_rexml(input)
  assert_equal(rexmloutput, rexml_result)
end
39
+
40
# One test per allowed element: the element itself must survive
# sanitization while the unknown <bad> child is escaped. The branches
# below adjust the expected outputs for elements the parsers treat
# specially.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
  define_method "test_should_allow_#{tag_name}_tag" do
    escaped_body = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
    lowered = tag_name.downcase

    input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
    htmloutput = "<#{lowered} title='1'>#{escaped_body}</#{lowered}>"
    xhtmloutput = "<#{tag_name} title='1'>#{escaped_body}</#{tag_name}>"
    rexmloutput = xhtmloutput

    case tag_name
    when 'caption', 'colgroup', 'optgroup', 'option', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
      htmloutput = xhtmloutput = escaped_body
    when 'col'
      htmloutput = xhtmloutput = escaped_body
      rexmloutput = "<col title='1' />"
    when 'table'
      htmloutput = xhtmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
    when 'image'
      htmloutput = xhtmloutput = "<img title='1'/>#{escaped_body}"
      rexmloutput = "<image title='1'>#{escaped_body}</image>"
    else
      if VOID_ELEMENTS.include?(tag_name)
        xhtmloutput = "<#{tag_name} title='1'/>#{escaped_body}"
        # Only the HTML expectation for 'br' carries the extra '<br/>'.
        htmloutput = tag_name == 'br' ? xhtmloutput + '<br/>' : xhtmloutput
        rexmloutput = "<#{tag_name} title='1' />"
      end
    end

    check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
  end
end
70
+
71
# One test per allowed element: the upper-cased spelling of the element is
# expected to be escaped by all three sanitizers.
HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
  define_method "test_should_forbid_#{tag_name.upcase}_tag" do
    shouted = tag_name.upcase
    input = "<#{shouted} title='1'>foo <bad>bar</bad> baz</#{shouted}>"
    output = "&lt;#{shouted} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{shouted}&gt;"
    check_sanitization(input, output, output, output)
  end
end
78
+
79
# One test per allowed attribute (except 'style'): the attribute must be
# kept, with its name lower-cased in the HTML expectation.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
  unless attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      escaped_body = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>#{escaped_body}</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>#{escaped_body}</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end
end
88
+
89
# One test per allowed attribute: the upper-cased spelling of the
# attribute is expected to be dropped by all three sanitizers.
HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
  define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
    shouted = attribute_name.upcase
    input = "<p #{shouted}='display: none;'>foo <bad>bar</bad> baz</p>"
    output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
    check_sanitization(input, output, output, output)
  end
end
96
+
97
# One test per allowed protocol: an href consisting of the protocol name
# must be kept (re-quoted with single quotes).
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_#{protocol}_uris" do
    href = protocol
    input = %(<a href="#{href}">foo</a>)
    output = "<a href='#{href}'>foo</a>"
    check_sanitization(input, output, output, output)
  end
end
104
+
105
# One test per allowed protocol: the upper-cased protocol name must be
# kept in an href as well.
HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
  define_method "test_should_allow_uppercase_#{protocol}_uris" do
    href = protocol.upcase
    input = %(<a href="#{href}">foo</a>)
    output = "<a href='#{href}'>foo</a>"
    check_sanitization(input, output, output, output)
  end
end
112
+
113
# For every element that is both in SVG_ALLOW_LOCAL_HREF and in
# ALLOWED_ELEMENTS, four tests are generated: a local xlink:href (with and
# without a leading newline) must be kept, a non-local one must be dropped.
HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
  next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)

  # [test name suffix, href value, whether the attribute survives]
  [
    ['should_allow_local_href',                  "#foo",                  true],
    ['should_allow_local_href_with_newline',     "\n#foo",                true],
    ['should_forbid_nonlocal_href',              "http://bad.com/foo",    false],
    ['should_forbid_nonlocal_href_with_newline', "\nhttp://bad.com/foo",  false]
  ].each do |suffix, href, kept|
    define_method "test_#{tag_name}_#{suffix}" do
      input = %(<#{tag_name} xlink:href="#{href}"/>)
      attribute = kept ? " xlink:href='#{href}'" : ""
      output = "<#{tag_name.downcase}#{attribute}/>"
      xhtmloutput = "<#{tag_name}#{attribute}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end
  end
end
143
+
144
# Checks two things: numeric references to astral-plane characters are
# expected to come out as the corresponding raw bytes, and markup that
# already contains such bytes is expected to pass through unchanged.
def test_should_handle_astral_plane_characters
  referenced = "<p>&#x1d4b5; &#x1d538;</p>"
  decoded = "<p>\360\235\222\265 \360\235\224\270</p>"
  check_sanitization(referenced, decoded, decoded, decoded)

  raw_input = "<p><tspan>\360\235\224\270</tspan> a</p>"
  raw_output = "<p><tspan>\360\235\224\270</tspan> a</p>"
  check_sanitization(raw_input, raw_output, raw_output, raw_output)
end
153
+
154
+ # This affects only NS4. Is it worth fixing?
155
+ # def test_javascript_includes
156
+ # input = %(<div size="&{alert('XSS')}">foo</div>)
157
+ # output = "<div>foo</div>"
158
+ # check_sanitization(input, output, output, output)
159
+ # end
160
+
161
# Generates one test per entry of every JSON sanitizer test file. Each
# entry supplies 'name', 'input' and 'output'; the optional 'xhtml' and
# 'rexml' keys override the expected output for those two sanitizers.
html5_test_files('sanitizer').each do |filename|
  # File.read closes the file again; open(filename).read left the handle
  # open until garbage collection.
  JSON.parse(File.read(filename)).each do |test|
    define_method "test_#{test['name']}" do
      check_sanitization(
        test['input'],
        test['output'],
        test['xhtml'] || test['output'],
        test['rexml'] || test['output']
      )
    end
  end
end
173
+ end
@@ -0,0 +1,67 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/serializer'
5
+ require 'html5/treewalkers'
6
+
7
+ #Run the serialize error checks
8
+ checkSerializeErrors = false
9
+
10
# Tree walker over the token arrays used by the JSON serializer tests.
# Every token is an array whose first element names the token type; the
# remaining elements are handed to the matching helper inherited from
# HTML5::TreeWalkers::Base.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields the walker token(s) for each entry of @tree.
  # Raises for a token type that is not handled below.
  def each
    @tree.each do |token|
      kind = token[0]
      case kind
      when 'Characters', 'SpaceCharacters'
        # text may produce several tokens, so it is given a block.
        text(token[1]) { |text_token| yield text_token }
      when 'StartTag'
        yield start_tag(token[1], token[2])
      when 'EmptyTag'
        yield empty_tag(token[1], token[2])
      when 'EndTag'
        yield end_tag(token[1])
      when 'Comment'
        yield comment(token[1])
      when 'Doctype'
        yield doctype(token[1], token[2], token[3])
      else
        raise "Unknown token type: " + kind
      end
    end
  end
end
32
+
33
# Generates one test per entry of every JSON serializer test file. Each
# generated test serializes the entry's token list with HTMLSerializer
# and, except for the 'optionaltags' file, with XHTMLSerializer, and checks
# the result against the list of acceptable outputs.
class Html5SerializeTestcase < Test::Unit::TestCase
  # Checks +result+ against +expected+ (an Array of acceptable outputs).
  # A single acceptable output is compared with assert_equal, using
  # +description+ as the failure message; with several, the result only
  # has to be one of them.
  def assert_serialization(expected, result, description)
    if expected.length == 1
      assert_equal(expected[0], result, description)
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end

  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file again; open(filename).read left the handle
    # open until garbage collection.
    tests = JSON.parse(File.read(filename))

    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        # JSON keys are strings; the encoding is copied to the :encoding
        # symbol key (presumably the key the serializers read).
        if test["options"] && test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        result = HTML5::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialization(test["expected"], result, test["description"])

        # The optionaltags cases are not run through the XHTML serializer.
        next if test_name == 'optionaltags'

        result = HTML5::XHTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialization(test["xhtml"] || test["expected"], result, test["description"])
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+ require "html5/sniffer"
3
+
4
# Generates one test per entry of every JSON sniffer test file: the value
# html_or_feed returns for the entry's 'input' must equal its 'type'.
class TestFeedTypeSniffer < Test::Unit::TestCase
  include HTML5
  include TestSupport
  include Sniffer

  html5_test_files('sniffer').each do |path|
    suite = File.basename(path).sub('.test', '')
    cases = JSON.parse(File.read(path))

    cases.each_with_index do |data, position|
      define_method("test_#{suite}_#{position + 1}") do
        assert_equal data['type'], html_or_feed(data['input'])
      end
    end
  end
  # each_with_index do |t, i|
  #   define_method "test_#{i}" do
  #     assert_equal t[0], sniff_feed_type(t[1])
  #   end
  # end
end
@@ -0,0 +1,71 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+
3
+ require 'html5/inputstream'
4
+
5
+ class HTMLInputStreamTest < Test::Unit::TestCase
6
+ include HTML5
7
+
8
# Reads the next character from +stream+. When String supports
# force_encoding the character is tagged as binary, so that it can be
# compared with the byte-escaped literals used in the tests.
def getc(stream)
  return stream.char unless String.method_defined?(:force_encoding)

  stream.char.force_encoding('binary')
end
15
+
16
# A stream created with an explicit 'ascii' encoding reports that encoding
# and returns the single apostrophe it was given.
def test_char_ascii
  stream = HTMLInputStream.new("'", :encoding => 'ascii')

  assert_equal 'ascii', stream.char_encoding
  assert_equal "'", stream.char
end
21
+
22
# A NUL byte in the input is expected to be read back as the bytes
# EF BF BD.
def test_char_null
  replacement_bytes = "\xef\xbf\xbd"
  stream = HTMLInputStream.new("\x00")

  assert_equal replacement_bytes, getc(stream)
end
26
+
27
# A stream created with an explicit 'utf-8' encoding reports that encoding
# and returns the three-byte character unchanged.
def test_char_utf8
  stream = HTMLInputStream.new("\xe2\x80\x98", :encoding => 'utf-8')

  assert_equal 'utf-8', stream.char_encoding
  assert_equal "\xe2\x80\x98", getc(stream)
end
32
+
33
+ def test_char_win1252
34
+ stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
35
+ assert_equal('windows-1252', stream.char_encoding)
36
+ assert_equal("\xc2\xa2", getc(stream))
37
+ assert_equal("\xc3\x85", getc(stream))
38
+ assert_equal("\xc3\xb1", getc(stream))
39
+ assert_equal("\xe2\x80\x99", getc(stream))
40
+ assert_equal("\xe2\x80\xa0", getc(stream))
41
+ end
42
+
43
# Input that starts with the bytes EF BB BF is expected to be reported as
# utf-8, and the first character read is the apostrophe that follows them.
def test_bom
  bom = "\xef\xbb\xbf"
  stream = HTMLInputStream.new(bom + "'")

  assert_equal 'utf-8', stream.char_encoding
  assert_equal "'", stream.char
end
48
+
49
+ begin
50
+ require 'iconv'
51
+
52
# Converts 1025 spaces to UTF-16 with Iconv, feeds them to the stream and
# checks both the detected encoding and that all 1025 characters can be
# read back with chars_until.
def test_utf_16
  input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
  stream = HTMLInputStream.new(input)
  # The previous `assert('utf-16-le', stream.char_encoding)` used the string
  # as the (always truthy) condition and the encoding as the failure
  # message, so the detected encoding was never verified.
  # NOTE(review): the byte order Iconv emits for 'utf-16' and the exact
  # spelling HTMLInputStream reports are not visible from this file, so any
  # UTF-16 flavour is accepted here; tighten to assert_equal once confirmed.
  assert_match(/\Autf-16-?(le|be)\z/, stream.char_encoding)
  assert_equal(1025, stream.chars_until(' ', true).length)
end
58
+ rescue LoadError
59
+ puts "iconv not found, skipping iconv tests"
60
+ end
61
+
62
# Reads input that mixes "\n", "\r\n" and "\r" line endings: the text
# returned by chars_until is expected to contain only "\n", and the
# reported position and recorded line lengths must match after each read.
def test_newlines
  bom = "\xef\xbb\xbf"
  text = "a\nbb\r\nccc\rdddd"
  stream = HTMLInputStream.new(bom + text)

  assert_equal [1,0], stream.position

  assert_equal "a\nbb\n", stream.chars_until('c')
  assert_equal [3,0], stream.position

  assert_equal "ccc\ndddd", stream.chars_until('x')
  assert_equal [4,4], stream.position

  assert_equal [1,2,3], stream.instance_variable_get(:@line_lengths)
end
71
+ end
@@ -0,0 +1,95 @@
1
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
2
+
3
+ require 'html5/tokenizer'
4
+
5
+ require 'tokenizer_test_parser'
6
+
7
+ class Html5TokenizerTestCase < Test::Unit::TestCase
8
+
9
# Asserts that the received token list matches the expected one.
#
# expectedTokens   - Array of expected tokens; a parse error is represented
#                    by the String "ParseError".
# receivedTokens   - Array of tokens that were actually produced.
# ignoreErrorOrder - when falsy the two lists must be equal as they are;
#                    otherwise each list is split into its non-error tokens
#                    and its parse errors, and the two groups are compared,
#                    so the position of parse errors among the other tokens
#                    does not matter.
# message          - failure message handed to assert_equal.
def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
  return assert_equal(expectedTokens, receivedTokens, message) unless ignoreErrorOrder

  # partition yields [non-parse-error tokens, parse errors], each in their
  # original relative order.
  group = lambda { |tokens| tokens.partition { |token| token != "ParseError" } }
  assert_equal group.call(expectedTokens), group.call(receivedTokens), message
end
35
+
36
# True when +token+ is an array-style token whose first element equals
# +token_name+. The bare "ParseError" string is never of any type.
def type_of?(token_name, token)
  return false if token == 'ParseError'

  token_name == token.first
end
39
+
40
# Replaces the attribute list (third element) of every StartTag token with
# a Hash built from its name/value pairs and returns the tokens in a new
# Array. The StartTag tokens themselves are updated in place.
#
# The pairs are reversed before the Hash is built so that, for a repeated
# attribute name, the first occurrence is the one that is kept.
def convert_attribute_arrays_to_hashes(tokens)
  tokens.map do |token|
    token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
    token
  end
end
46
+
47
# Returns a new token list in which every run of consecutive Character
# tokens is merged into a single Character token whose text is the
# concatenation of the run. All other tokens are passed through as they
# are.
#
# Neither the given list nor its tokens are modified. This matters because
# tokenizer_test passes the same data['output'] list once per content
# model flag; merging into the caller's token in place made the merged
# text grow again on every later pass.
def concatenate_consecutive_characters(tokens)
  tokens.inject([]) do |merged, token|
    previous = merged.last
    if previous && type_of?('Character', token) && type_of?('Character', previous)
      # Replace the last entry with a merged copy instead of editing it.
      combined = previous.dup
      combined[1] = previous[1] + token[1]
      merged[-1] = combined
    else
      merged << token
    end
    merged
  end
end
56
+
57
# Runs one tokenizer test case once per content model flag listed in the
# case (or once with :PCDATA when none are listed). For each flag the
# input is tokenized and the normalized token list is compared with the
# normalized expected output; any exception raised on the way fails the
# test with a message describing the case.
def tokenizer_test(data)
  flags = data['contentModelFlags'] || [:PCDATA]

  flags.each do |content_model_flag|
    message = [
      '', 'Description:', data['description'],
      '', 'Input:', data['input'],
      '', 'Content Model Flag:', content_model_flag,
      ''
    ].join("\n")

    assert_nothing_raised message do
      tokenizer = HTML5::HTMLTokenizer.new(data['input'], :encoding => 'utf-8')
      tokenizer.content_model_flag = content_model_flag.to_sym

      # Some cases name the start tag the tokenizer should regard as the
      # most recent one.
      if data.key?('lastStartTag')
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']}
      end

      tokens = TokenizerTestParser.new(tokenizer).parse
      with_hashes = convert_attribute_arrays_to_hashes(tokens)
      actual = concatenate_consecutive_characters(with_hashes)
      expected = concatenate_consecutive_characters(data['output'])

      assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
    end
  end
end
82
+
83
# Generates one test per entry under the 'tests' key of every JSON
# tokenizer test file; files without that key contribute no tests.
html5_test_files('tokenizer').each do |path|
  suite = File.basename(path).sub('.test', '')
  cases = JSON.parse(File.read(path))['tests']
  next if cases.nil?

  cases.each_with_index do |data, position|
    define_method("test_#{suite}_#{position + 1}") do
      tokenizer_test(data)
    end
  end
end
93
+
94
+ end
95
+