spk-html5 0.10.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (74)
  1. data/History.txt +10 -0
  2. data/Manifest.txt +73 -0
  3. data/README +45 -0
  4. data/Rakefile.rb +33 -0
  5. data/bin/html5 +7 -0
  6. data/lib/html5.rb +13 -0
  7. data/lib/html5/cli.rb +248 -0
  8. data/lib/html5/constants.rb +1061 -0
  9. data/lib/html5/filters/base.rb +10 -0
  10. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  11. data/lib/html5/filters/iso639codes.rb +755 -0
  12. data/lib/html5/filters/optionaltags.rb +198 -0
  13. data/lib/html5/filters/rfc2046.rb +31 -0
  14. data/lib/html5/filters/rfc3987.rb +91 -0
  15. data/lib/html5/filters/sanitizer.rb +15 -0
  16. data/lib/html5/filters/validator.rb +834 -0
  17. data/lib/html5/filters/whitespace.rb +36 -0
  18. data/lib/html5/html5parser.rb +247 -0
  19. data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
  20. data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
  21. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  22. data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
  23. data/lib/html5/html5parser/after_head_phase.rb +55 -0
  24. data/lib/html5/html5parser/before_head_phase.rb +44 -0
  25. data/lib/html5/html5parser/before_html_phase.rb +41 -0
  26. data/lib/html5/html5parser/in_body_phase.rb +636 -0
  27. data/lib/html5/html5parser/in_caption_phase.rb +69 -0
  28. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  29. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  30. data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
  31. data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
  32. data/lib/html5/html5parser/in_head_phase.rb +143 -0
  33. data/lib/html5/html5parser/in_row_phase.rb +96 -0
  34. data/lib/html5/html5parser/in_select_phase.rb +90 -0
  35. data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
  36. data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
  37. data/lib/html5/html5parser/in_table_phase.rb +177 -0
  38. data/lib/html5/html5parser/initial_phase.rb +133 -0
  39. data/lib/html5/html5parser/phase.rb +171 -0
  40. data/lib/html5/inputstream.rb +735 -0
  41. data/lib/html5/liberalxmlparser.rb +158 -0
  42. data/lib/html5/sanitizer.rb +209 -0
  43. data/lib/html5/serializer.rb +2 -0
  44. data/lib/html5/serializer/htmlserializer.rb +179 -0
  45. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  46. data/lib/html5/sniffer.rb +45 -0
  47. data/lib/html5/tokenizer.rb +1059 -0
  48. data/lib/html5/treebuilders.rb +24 -0
  49. data/lib/html5/treebuilders/base.rb +339 -0
  50. data/lib/html5/treebuilders/hpricot.rb +231 -0
  51. data/lib/html5/treebuilders/rexml.rb +215 -0
  52. data/lib/html5/treebuilders/simpletree.rb +191 -0
  53. data/lib/html5/treewalkers.rb +26 -0
  54. data/lib/html5/treewalkers/base.rb +162 -0
  55. data/lib/html5/treewalkers/hpricot.rb +48 -0
  56. data/lib/html5/treewalkers/rexml.rb +48 -0
  57. data/lib/html5/treewalkers/simpletree.rb +48 -0
  58. data/lib/html5/version.rb +3 -0
  59. data/test/preamble.rb +69 -0
  60. data/test/test_cli.rb +16 -0
  61. data/test/test_encoding.rb +35 -0
  62. data/test/test_input_stream.rb +26 -0
  63. data/test/test_lxp.rb +283 -0
  64. data/test/test_parser.rb +63 -0
  65. data/test/test_sanitizer.rb +173 -0
  66. data/test/test_serializer.rb +67 -0
  67. data/test/test_sniffer.rb +27 -0
  68. data/test/test_stream.rb +71 -0
  69. data/test/test_tokenizer.rb +95 -0
  70. data/test/test_treewalkers.rb +135 -0
  71. data/test/test_validator.rb +31 -0
  72. data/test/tokenizer_test_parser.rb +67 -0
  73. data/test19.rb +38 -0
  74. metadata +198 -0
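
The gem splits its work across a tokenizer, parser phases, pluggable tree builders (simpletree, REXML, Hpricot), tree walkers, serializers, and a sanitizing tokenizer. For orientation, here is a minimal usage sketch assembled only from calls exercised in the bundled tests shown further down (data/test/test_parser.rb and data/test/test_sanitizer.rb); the sample markup is illustrative, not taken from the package.

```ruby
# Minimal sketch of the spk-html5 API as exercised by the gem's own tests;
# the input strings here are made up for illustration.
require 'html5/treebuilders'
require 'html5/html5parser'
require 'html5/sanitizer'

# Parse a full document into a REXML tree ('simpletree' and 'hpricot'
# builders are selected the same way in data/test/test_parser.rb).
parser = HTML5::HTMLParser.new(:tree => HTML5::TreeBuilders['rexml'])
doc = parser.parse("<p>Hello <b>world")

# Sanitize an untrusted fragment by swapping in the sanitizing tokenizer,
# mirroring sanitize_html in data/test/test_sanitizer.rb.
clean = HTML5::HTMLParser.parse_fragment(
  "<p onclick='steal()'>hi</p>",
  :tokenizer => HTML5::HTMLSanitizer,
  :encoding  => 'utf-8'
).join
```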
data/test/test_parser.rb
@@ -0,0 +1,63 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+
+ require 'html5/treebuilders'
+ require 'html5/html5parser'
+ require 'html5/cli'
+
+ $tree_types_to_test = ['simpletree', 'rexml']
+
+ begin
+   require 'hpricot'
+   $tree_types_to_test.push('hpricot')
+ rescue LoadError
+ end
+
+ class Html5ParserTestCase < Test::Unit::TestCase
+   include HTML5
+   include TestSupport
+
+   html5_test_files('tree-construction').each do |test_file|
+
+     test_name = File.basename(test_file).sub('.dat', '')
+
+     TestData.new(test_file, %w(data errors document-fragment document)).each_with_index do |(input, errors, inner_html, expected), index|
+       next if (input.downcase.include?('<svg')) || input.downcase.include?('xlink:')
+       errors = errors.split("\n")
+       expected = expected.gsub("\n| ","\n")[2..-1]
+
+       $tree_types_to_test.each do |tree_name|
+         define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do
+
+           parser = HTMLParser.new(:tree => TreeBuilders[tree_name])
+
+           if inner_html
+             parser.parse_fragment(input, inner_html)
+           else
+             parser.parse(input)
+           end
+
+           actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))
+
+           assert_equal sortattrs(expected), sortattrs(actual_output), [
+             '', 'Input:', input,
+             '', 'Expected:', expected,
+             '', 'Received:', actual_output
+           ].join("\n")
+
+           actual_errors = parser.errors.map do |(line, col), message, datavars|
+             message = CLI::PythonicTemplate.new(E[message]).to_s(datavars)
+             "Line: #{line} Col: #{col} #{message}"
+           end
+
+           assert_equal errors, actual_errors, [
+             '', 'Input', input,
+             '', "Expected errors (#{errors.length}):", errors.join("\n"),
+             '', "Actual errors (#{actual_errors.length}):",
+             actual_errors.join("\n") + "\n"
+           ].join("\n")
+         end
+       end
+     end
+   end
+
+ end
data/test/test_sanitizer.rb
@@ -0,0 +1,173 @@
+ #!/usr/bin/env ruby
+
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+
+ require 'html5/html5parser'
+ require 'html5/liberalxmlparser'
+ require 'html5/treewalkers'
+ require 'html5/serializer'
+ require 'html5/sanitizer'
+
+ class SanitizeTest < Test::Unit::TestCase
+   include HTML5
+
+   def sanitize_xhtml stream
+     XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).join
+   end
+
+   def sanitize_html stream
+     HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).join
+   end
+
+   def sanitize_rexml stream
+     require 'rexml/document'
+     doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
+     tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
+     XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
+       :quote_char => "'",
+       :inject_meta_charset => false,
+       :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
+   rescue REXML::ParseException
+     return "Ill-formed XHTML!"
+   end
+
+   def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+     assert_equal htmloutput, sanitize_html(input)
+     assert_equal xhtmloutput, sanitize_xhtml(input)
+     assert_equal rexmloutput, sanitize_rexml(input)
+   end
+
+   HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+     define_method "test_should_allow_#{tag_name}_tag" do
+       input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
+       htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
+       xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
+       rexmloutput = xhtmloutput
+
+       if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
+         htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+         xhtmloutput = htmloutput
+       elsif tag_name == 'col'
+         htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+         xhtmloutput = htmloutput
+         rexmloutput = "<col title='1' />"
+       elsif tag_name == 'table'
+         htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
+         xhtmloutput = htmloutput
+       elsif tag_name == 'image'
+         htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+         xhtmloutput = htmloutput
+         rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
+       elsif VOID_ELEMENTS.include?(tag_name)
+         htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
+         xhtmloutput = htmloutput
+         htmloutput += '<br/>' if tag_name == 'br'
+         rexmloutput = "<#{tag_name} title='1' />"
+       end
+       check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
+     end
+   end
+
+   HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
+     define_method "test_should_forbid_#{tag_name.upcase}_tag" do
+       input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
+       output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
+       check_sanitization(input, output, output, output)
+     end
+   end
+
+   HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+     next if attribute_name == 'style'
+     define_method "test_should_allow_#{attribute_name}_attribute" do
+       input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
+       output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+       htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+       check_sanitization(input, htmloutput, output, output)
+     end
+   end
+
+   HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
+     define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
+       input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
+       output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
+       check_sanitization(input, output, output, output)
+     end
+   end
+
+   HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+     define_method "test_should_allow_#{protocol}_uris" do
+       input = %(<a href="#{protocol}">foo</a>)
+       output = "<a href='#{protocol}'>foo</a>"
+       check_sanitization(input, output, output, output)
+     end
+   end
+
+   HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
+     define_method "test_should_allow_uppercase_#{protocol}_uris" do
+       input = %(<a href="#{protocol.upcase}">foo</a>)
+       output = "<a href='#{protocol.upcase}'>foo</a>"
+       check_sanitization(input, output, output, output)
+     end
+   end
+
+   HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
+     next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)
+     define_method "test_#{tag_name}_should_allow_local_href" do
+       input = %(<#{tag_name} xlink:href="#foo"/>)
+       output = "<#{tag_name.downcase} xlink:href='#foo'/>"
+       xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
+       check_sanitization(input, output, xhtmloutput, xhtmloutput)
+     end
+
+     define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
+       input = %(<#{tag_name} xlink:href="\n#foo"/>)
+       output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
+       xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
+       check_sanitization(input, output, xhtmloutput, xhtmloutput)
+     end
+
+     define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
+       input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
+       output = "<#{tag_name.downcase}/>"
+       xhtmloutput = "<#{tag_name}></#{tag_name}>"
+       check_sanitization(input, output, xhtmloutput, xhtmloutput)
+     end
+
+     define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
+       input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
+       output = "<#{tag_name.downcase}/>"
+       xhtmloutput = "<#{tag_name}></#{tag_name}>"
+       check_sanitization(input, output, xhtmloutput, xhtmloutput)
+     end
+   end
+
+   def test_should_handle_astral_plane_characters
+     input = "<p>&#x1d4b5; &#x1d538;</p>"
+     output = "<p>\360\235\222\265 \360\235\224\270</p>"
+     check_sanitization(input, output, output, output)
+
+     input = "<p><tspan>\360\235\224\270</tspan> a</p>"
+     output = "<p><tspan>\360\235\224\270</tspan> a</p>"
+     check_sanitization(input, output, output, output)
+   end
+
+   # This affects only NS4. Is it worth fixing?
+   # def test_javascript_includes
+   #   input = %(<div size="&{alert('XSS')}">foo</div>)
+   #   output = "<div>foo</div>"
+   #   check_sanitization(input, output, output, output)
+   # end
+
+   html5_test_files('sanitizer').each do |filename|
+     JSON::parse(open(filename).read).each do |test|
+       define_method "test_#{test['name']}" do
+         check_sanitization(
+           test['input'],
+           test['output'],
+           test['xhtml'] || test['output'],
+           test['rexml'] || test['output']
+         )
+       end
+     end
+   end
+ end
data/test/test_serializer.rb
@@ -0,0 +1,67 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+
+ require 'html5/html5parser'
+ require 'html5/serializer'
+ require 'html5/treewalkers'
+
+ #Run the serialize error checks
+ checkSerializeErrors = false
+
+ class JsonWalker < HTML5::TreeWalkers::Base
+   def each
+     @tree.each do |token|
+       case token[0]
+       when 'StartTag'
+         yield start_tag(token[1], token[2])
+       when 'EndTag'
+         yield end_tag(token[1])
+       when 'EmptyTag'
+         yield empty_tag(token[1], token[2])
+       when 'Comment'
+         yield comment(token[1])
+       when 'Characters', 'SpaceCharacters'
+         text(token[1]) {|textToken| yield textToken}
+       when 'Doctype'
+         yield doctype(token[1], token[2], token[3])
+       else
+         raise "Unknown token type: " + token[0]
+       end
+     end
+   end
+ end
+
+ class Html5SerializeTestcase < Test::Unit::TestCase
+   html5_test_files('serializer').each do |filename|
+     test_name = File.basename(filename).sub('.test', '')
+     tests = JSON::parse(open(filename).read)
+     tests['tests'].each_with_index do |test, index|
+
+       define_method "test_#{test_name}_#{index+1}" do
+         if test["options"] and test["options"]["encoding"]
+           test["options"][:encoding] = test["options"]["encoding"]
+         end
+
+         result = HTML5::HTMLSerializer.
+           serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+         expected = test["expected"]
+         if expected.length == 1
+           assert_equal(expected[0], result, test["description"])
+         elsif !expected.include?(result)
+           flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
+         end
+
+         next if test_name == 'optionaltags'
+
+         result = HTML5::XHTMLSerializer.
+           serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
+         expected = test["xhtml"] || test["expected"]
+         if expected.length == 1
+           assert_equal(expected[0], result, test["description"])
+         elsif !expected.include?(result)
+           flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
+         end
+       end
+
+     end
+   end
+ end
data/test/test_sniffer.rb
@@ -0,0 +1,27 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+ require "html5/sniffer"
+
+ class TestFeedTypeSniffer < Test::Unit::TestCase
+   include HTML5
+   include TestSupport
+   include Sniffer
+
+   html5_test_files('sniffer').each do |test_file|
+     test_name = File.basename(test_file).sub('.test', '')
+
+     tests = JSON.parse(File.read(test_file))
+
+     tests.each_with_index do |data, index|
+       define_method('test_%s_%d' % [test_name, index + 1]) do
+         assert_equal data['type'], html_or_feed(data['input'])
+       end
+     end
+   end
+   # each_with_index do |t, i|
+   #   define_method "test_#{i}" do
+   #     assert_equal t[0], sniff_feed_type(t[1])
+   #   end
+   # end
+
+
+ end
data/test/test_stream.rb
@@ -0,0 +1,71 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+
+ require 'html5/inputstream'
+
+ class HTMLInputStreamTest < Test::Unit::TestCase
+   include HTML5
+
+   def getc stream
+     if String.method_defined? :force_encoding
+       stream.char.force_encoding('binary')
+     else
+       stream.char
+     end
+   end
+
+   def test_char_ascii
+     stream = HTMLInputStream.new("'", :encoding=>'ascii')
+     assert_equal('ascii', stream.char_encoding)
+     assert_equal("'", stream.char)
+   end
+
+   def test_char_null
+     stream = HTMLInputStream.new("\x00")
+     assert_equal("\xef\xbf\xbd", getc(stream))
+   end
+
+   def test_char_utf8
+     stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
+     assert_equal('utf-8', stream.char_encoding)
+     assert_equal("\xe2\x80\x98", getc(stream))
+   end
+
+   def test_char_win1252
+     stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
+     assert_equal('windows-1252', stream.char_encoding)
+     assert_equal("\xc2\xa2", getc(stream))
+     assert_equal("\xc3\x85", getc(stream))
+     assert_equal("\xc3\xb1", getc(stream))
+     assert_equal("\xe2\x80\x99", getc(stream))
+     assert_equal("\xe2\x80\xa0", getc(stream))
+   end
+
+   def test_bom
+     stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
+     assert_equal('utf-8', stream.char_encoding)
+     assert_equal("'", stream.char)
+   end
+
+   begin
+     require 'iconv'
+
+     def test_utf_16
+       input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
+       stream = HTMLInputStream.new(input)
+       assert('utf-16-le', stream.char_encoding)
+       assert_equal(1025, stream.chars_until(' ', true).length)
+     end
+   rescue LoadError
+     puts "iconv not found, skipping iconv tests"
+   end
+
+   def test_newlines
+     stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
+     assert_equal([1,0], stream.position)
+     assert_equal("a\nbb\n", stream.chars_until('c'))
+     assert_equal([3,0], stream.position)
+     assert_equal("ccc\ndddd", stream.chars_until('x'))
+     assert_equal([4,4], stream.position)
+     assert_equal([1,2,3], stream.instance_eval {@line_lengths})
+   end
+ end
data/test/test_tokenizer.rb
@@ -0,0 +1,95 @@
+ require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
+
+ require 'html5/tokenizer'
+
+ require 'tokenizer_test_parser'
+
+ class Html5TokenizerTestCase < Test::Unit::TestCase
+
+   def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
+     if !ignoreErrorOrder
+       assert_equal expectedTokens, receivedTokens, message
+     else
+       #Sort the tokens into two groups; non-parse errors and parse errors
+       expected = [[],[]]
+       received = [[],[]]
+
+       for token in expectedTokens
+         if token != "ParseError"
+           expected[0] << token
+         else
+           expected[1] << token
+         end
+       end
+
+       for token in receivedTokens
+         if token != "ParseError"
+           received[0] << token
+         else
+           received[1] << token
+         end
+       end
+       assert_equal expected, received, message
+     end
+   end
+
+   def type_of?(token_name, token)
+     token != 'ParseError' and token_name == token.first
+   end
+
+   def convert_attribute_arrays_to_hashes(tokens)
+     tokens.inject([]) do |tokens, token|
+       token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
+       tokens << token
+     end
+   end
+
+   def concatenate_consecutive_characters(tokens)
+     tokens.inject([]) do |tokens, token|
+       if type_of?('Character', token) and tokens.any? and type_of?('Character', tokens.last)
+         tokens.last[1] = tokens.last[1] + token[1]
+         next tokens
+       end
+       tokens << token
+     end
+   end
+
+   def tokenizer_test(data)
+     (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
+       message = [
+         '', 'Description:', data['description'],
+         '', 'Input:', data['input'],
+         '', 'Content Model Flag:', content_model_flag,
+         '' ] * "\n"
+
+       assert_nothing_raised message do
+         tokenizer = HTML5::HTMLTokenizer.new(data['input'], :encoding => 'utf-8')
+
+         tokenizer.content_model_flag = content_model_flag.to_sym
+
+         tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')
+
+         tokens = TokenizerTestParser.new(tokenizer).parse
+
+         actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
+
+         expected = concatenate_consecutive_characters(data['output'])
+
+         assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
+       end
+     end
+   end
+
+   html5_test_files('tokenizer').each do |test_file|
+     test_name = File.basename(test_file).sub('.test', '')
+
+     tests = JSON.parse(File.read(test_file))['tests']
+     if tests != nil
+       tests.each_with_index do |data, index|
+         define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
+       end
+     end
+   end
+
+ end
+