spk-html5 0.10.1
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +10 -0
- data/Manifest.txt +73 -0
- data/README +45 -0
- data/Rakefile.rb +33 -0
- data/bin/html5 +7 -0
- data/lib/html5.rb +13 -0
- data/lib/html5/cli.rb +248 -0
- data/lib/html5/constants.rb +1061 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/iso639codes.rb +755 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/rfc2046.rb +31 -0
- data/lib/html5/filters/rfc3987.rb +91 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/validator.rb +834 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser.rb +247 -0
- data/lib/html5/html5parser/after_after_body_phase.rb +43 -0
- data/lib/html5/html5parser/after_after_frameset_phase.rb +32 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +33 -0
- data/lib/html5/html5parser/after_head_phase.rb +55 -0
- data/lib/html5/html5parser/before_head_phase.rb +44 -0
- data/lib/html5/html5parser/before_html_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +636 -0
- data/lib/html5/html5parser/in_caption_phase.rb +69 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_foreign_content_phase.rb +50 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +56 -0
- data/lib/html5/html5parser/in_head_phase.rb +143 -0
- data/lib/html5/html5parser/in_row_phase.rb +96 -0
- data/lib/html5/html5parser/in_select_phase.rb +90 -0
- data/lib/html5/html5parser/in_select_table_phase.rb +35 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +92 -0
- data/lib/html5/html5parser/in_table_phase.rb +177 -0
- data/lib/html5/html5parser/initial_phase.rb +133 -0
- data/lib/html5/html5parser/phase.rb +171 -0
- data/lib/html5/inputstream.rb +735 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +209 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/serializer/htmlserializer.rb +179 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/sniffer.rb +45 -0
- data/lib/html5/tokenizer.rb +1059 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treebuilders/base.rb +339 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +215 -0
- data/lib/html5/treebuilders/simpletree.rb +191 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5/treewalkers/base.rb +162 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/version.rb +3 -0
- data/test/preamble.rb +69 -0
- data/test/test_cli.rb +16 -0
- data/test/test_encoding.rb +35 -0
- data/test/test_input_stream.rb +26 -0
- data/test/test_lxp.rb +283 -0
- data/test/test_parser.rb +63 -0
- data/test/test_sanitizer.rb +173 -0
- data/test/test_serializer.rb +67 -0
- data/test/test_sniffer.rb +27 -0
- data/test/test_stream.rb +71 -0
- data/test/test_tokenizer.rb +95 -0
- data/test/test_treewalkers.rb +135 -0
- data/test/test_validator.rb +31 -0
- data/test/tokenizer_test_parser.rb +67 -0
- data/test19.rb +38 -0
- metadata +198 -0
data/test/test_parser.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
|
2
|
+
|
3
|
+
require 'html5/treebuilders'
|
4
|
+
require 'html5/html5parser'
|
5
|
+
require 'html5/cli'
|
6
|
+
|
7
|
+
# Names of the tree builders that the generated tests are run against.
$tree_types_to_test = %w[simpletree rexml]

# hpricot is optional: it is only added when the library can be loaded.
begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # hpricot is not available; keep the default list.
end
|
14
|
+
|
15
|
+
# Tree-construction tests. One test method is generated per data file, per
# test case in that file, and per tree builder in $tree_types_to_test.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')
    sections = %w(data errors document-fragment document)

    TestData.new(test_file, sections).each_with_index do |(input, errors, inner_html, expected), index|
      # Cases whose input mentions SVG or xlink are skipped.
      lowered_input = input.downcase
      next if lowered_input.include?('<svg') || lowered_input.include?('xlink:')

      expected_errors = errors.split("\n")
      # Remove the "| " marker that follows each newline, then drop the first
      # two characters of the dump.
      expected_tree = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        method_name = format('test_%s_%d_%s', test_name, index + 1, tree_name)

        define_method method_name do
          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])

          # A "document-fragment" section switches the case to fragment parsing.
          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          serialized = parser.tree.testSerializer(parser.tree.document)
          actual_output = convertTreeDump(serialized)

          tree_message = [
            '', 'Input:', input,
            '', 'Expected:', expected_tree,
            '', 'Received:', actual_output
          ].join("\n")
          assert_equal sortattrs(expected_tree), sortattrs(actual_output), tree_message

          # Render each reported error the same way the expected list is written.
          actual_errors = parser.errors.map do |(line, col), code, datavars|
            text = CLI::PythonicTemplate.new(E[code]).to_s(datavars)
            "Line: #{line} Col: #{col} #{text}"
          end

          errors_message = [
            '', 'Input', input,
            '', "Expected errors (#{expected_errors.length}):", expected_errors.join("\n"),
            '', "Actual errors (#{actual_errors.length}):",
            actual_errors.join("\n") + "\n"
          ].join("\n")
          assert_equal expected_errors, actual_errors, errors_message
        end
      end
    end
  end
end
|
@@ -0,0 +1,173 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
|
4
|
+
|
5
|
+
require 'html5/html5parser'
|
6
|
+
require 'html5/liberalxmlparser'
|
7
|
+
require 'html5/treewalkers'
|
8
|
+
require 'html5/serializer'
|
9
|
+
require 'html5/sanitizer'
|
10
|
+
|
11
|
+
# Sanitizer tests. Every input is pushed through three paths (HTML parser,
# XHTML parser, REXML tree + XHTML serializer) and each result is compared
# with its own expected string.
#
# NOTE(review): several expected strings below contain a raw <bad> element.
# Presumably these were entity-escaped (&lt;bad&gt;) before this text was
# extracted -- confirm against the upstream file before relying on them.
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Options shared by the HTML and XHTML fragment parsers. A fresh hash is
  # built on every call so neither parser can see the other's changes.
  def fragment_parser_options
    {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}
  end
  private :fragment_parser_options

  # Parses +stream+ as an XHTML fragment with the sanitizing tokenizer and
  # returns the joined result.
  def sanitize_xhtml stream
    XHTMLParser.parse_fragment(stream, fragment_parser_options).join
  end

  # Parses +stream+ as an HTML fragment with the sanitizing tokenizer and
  # returns the joined result.
  def sanitize_html stream
    HTMLParser.parse_fragment(stream, fragment_parser_options).join
  end

  # Wraps +stream+ in an XHTML <div>, walks the REXML document, serializes it
  # with :sanitize enabled and strips the wrapper again. Returns
  # "Ill-formed XHTML!" when REXML cannot parse the wrapped input.
  def sanitize_rexml stream
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :quote_char => "'",
      :inject_meta_charset => false,
      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    "Ill-formed XHTML!"
  end

  # Asserts the expected output of all three sanitizing paths for +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # One test per allowed element; the branches below override the default
  # expectations for specific tag names.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # One test per allowed element, written in upper case.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per allowed attribute; 'style' is skipped here.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # One test per allowed attribute, written in upper case; the attribute is
  # expected to be missing from every output.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  # Each allowed protocol name must survive as an href value.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Same as above with the protocol name in upper case.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # xlink:href handling for elements that are listed in SVG_ALLOW_LOCAL_HREF
  # and are also allowed elements.
  HTMLSanitizer::SVG_ALLOW_LOCAL_HREF.each do |tag_name|
    next unless HTMLSanitizer::ALLOWED_ELEMENTS.include?(tag_name)
    define_method "test_#{tag_name}_should_allow_local_href" do
      input = %(<#{tag_name} xlink:href="#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='#foo'></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_allow_local_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\n#foo"/>)
      output = "<#{tag_name.downcase} xlink:href='\n#foo'/>"
      xhtmloutput = "<#{tag_name} xlink:href='\n#foo'></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href" do
      input = %(<#{tag_name} xlink:href="http://bad.com/foo"/>)
      output = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end

    define_method "test_#{tag_name}_should_forbid_nonlocal_href_with_newline" do
      input = %(<#{tag_name} xlink:href="\nhttp://bad.com/foo"/>)
      output = "<#{tag_name.downcase}/>"
      xhtmloutput = "<#{tag_name}></#{tag_name}>"
      check_sanitization(input, output, xhtmloutput, xhtmloutput)
    end
  end

  def test_should_handle_astral_plane_characters
    input = "<p>𝒵 𝔸</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # Data-driven cases: each JSON entry supplies 'name', 'input' and 'output',
  # with optional 'xhtml' / 'rexml' overrides that fall back to 'output'.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the file before returning; open(filename).read left
    # the handle open until garbage collection.
    JSON::parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
|
2
|
+
|
3
|
+
require 'html5/html5parser'
|
4
|
+
require 'html5/serializer'
|
5
|
+
require 'html5/treewalkers'
|
6
|
+
|
7
|
+
# Flag for running the serialize error checks.
# NOTE(review): nothing in the visible part of this file reads this local;
# it looks unused -- confirm before relying on or removing it.
|
8
|
+
checkSerializeErrors = false
|
9
|
+
|
10
|
+
# Tree walker over a list of token arrays whose first element is the token
# type name. Each entry is turned into a walker token via the helper methods
# called below (start_tag, end_tag, empty_tag, comment, text, doctype).
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields one walker token per entry of @tree; text entries may yield more
  # than once because text() is given a block.
  #
  # Raises RuntimeError when an entry has an unrecognised type.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield start_tag(token[1], token[2])
      when 'EndTag'
        yield end_tag(token[1])
      when 'EmptyTag'
        yield empty_tag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        text(token[1]) { |text_token| yield text_token }
      when 'Doctype'
        yield doctype(token[1], token[2], token[3])
      else
        # Interpolate instead of concatenating: String#+ raised TypeError for
        # a nil or non-String type and hid this message.
        raise "Unknown token type: #{token[0]}"
      end
    end
  end
end
|
32
|
+
|
33
|
+
# Serializer tests generated from the JSON files under 'serializer'. Each case
# is serialized with HTMLSerializer and, except for the 'optionaltags' file,
# with XHTMLSerializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase
  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file before returning; open(filename).read left
    # the handle open until garbage collection.
    tests = JSON::parse(File.read(filename))

    tests['tests'].each_with_index do |test_case, index|
      define_method "test_#{test_name}_#{index+1}" do
        # Mirror the string "encoding" key under the symbol key as well.
        if test_case["options"] && test_case["options"]["encoding"]
          test_case["options"][:encoding] = test_case["options"]["encoding"]
        end

        assert_serialization(HTML5::HTMLSerializer, test_case, test_case["expected"])

        unless test_name == 'optionaltags'
          assert_serialization(HTML5::XHTMLSerializer, test_case,
                               test_case["xhtml"] || test_case["expected"])
        end
      end
    end
  end

  private

  # Serializes the case's "input" with +serializer+ and checks the result
  # against +expected+, a list of acceptable outputs. A single-element list is
  # compared with assert_equal; otherwise any listed output is accepted.
  def assert_serialization(serializer, test_case, expected)
    result = serializer.serialize(JsonWalker.new(test_case["input"]), (test_case["options"] || {}))
    if expected.length == 1
      assert_equal(expected[0], result, test_case["description"])
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
|
2
|
+
require "html5/sniffer"
|
3
|
+
|
4
|
+
# Sniffer tests generated from the JSON files under 'sniffer'. Each entry
# provides an 'input' and the 'type' that html_or_feed should return for it.
class TestFeedTypeSniffer < Test::Unit::TestCase
  include HTML5
  include TestSupport
  include Sniffer

  html5_test_files('sniffer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')
    cases = JSON.parse(File.read(test_file))

    cases.each_with_index do |data, index|
      method_name = format('test_%s_%d', test_name, index + 1)

      define_method(method_name) do
        assert_equal data['type'], html_or_feed(data['input'])
      end
    end
  end
end
|
data/test/test_stream.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
|
2
|
+
|
3
|
+
require 'html5/inputstream'
|
4
|
+
|
5
|
+
# Tests for HTMLInputStream: reading characters, the reported character
# encoding, BOM handling and newline/position tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  # Reads the next character from +stream+. On Rubies where String has
  # force_encoding the result is retagged as binary so it can be compared
  # with the byte-escape literals used in the assertions.
  def getc stream
    if String.method_defined? :force_encoding
      stream.char.force_encoding('binary')
    else
      stream.char
    end
  end

  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", getc(stream))
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", getc(stream))
  end

  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", getc(stream))
    assert_equal("\xc3\x85", getc(stream))
    assert_equal("\xc3\xb1", getc(stream))
    assert_equal("\xe2\x80\x99", getc(stream))
    assert_equal("\xe2\x80\xa0", getc(stream))
  end

  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      input = Iconv.new('utf-16', 'utf-8').iconv(' '*1025)
      stream = HTMLInputStream.new(input)
      # This used to be assert('utf-16-le', stream.char_encoding), which can
      # never fail: the literal is always truthy and the encoding was only
      # the failure message.
      # NOTE(review): only the "utf-16" prefix is checked because the byte
      # order iconv emits for 'utf-16', and the exact name the stream
      # reports, are not visible here -- tighten once confirmed.
      assert_match(/\Autf-16/i, stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ', true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
|
@@ -0,0 +1,95 @@
|
|
1
|
+
require File.expand_path(File.join(File.dirname(__FILE__), 'preamble'))
|
2
|
+
|
3
|
+
require 'html5/tokenizer'
|
4
|
+
|
5
|
+
require 'tokenizer_test_parser'
|
6
|
+
|
7
|
+
# Tokenizer tests generated from the JSON files under 'tokenizer'. Tokens are
# either the string "ParseError" or arrays whose first element is the token
# type name.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # Compares two token lists. When +ignoreErrorOrder+ is truthy, each list is
  # first split into [non-ParseError tokens, ParseError tokens], so only the
  # number of parse errors has to match, not where they occur.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    unless ignoreErrorOrder
      assert_equal expectedTokens, receivedTokens, message
      return
    end

    expected = expectedTokens.partition { |token| token != "ParseError" }
    received = receivedTokens.partition { |token| token != "ParseError" }
    assert_equal expected, received, message
  end

  # True when +token+ is not a parse error and its first element equals
  # +token_name+.
  def type_of?(token_name, token)
    token != 'ParseError' && token_name == token.first
  end

  # Replaces the attribute list of every StartTag token (index 2) with a Hash.
  # The list is reversed first, so when a name repeats the first occurrence
  # is the one that ends up in the Hash. Tokens are modified in place.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.inject([]) do |converted, token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      converted << token
    end
  end

  # Returns a new token list in which runs of consecutive Character tokens
  # are merged into a single token.
  #
  # Array tokens are copied before being collected. Previously the merged text
  # was written back into the caller's first Character token, so calling this
  # again on the same list (tokenizer_test does so once per content model
  # flag) appended the following token's text a second time.
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      if type_of?('Character', token) && merged.any? && type_of?('Character', merged.last)
        merged.last[1] = merged.last[1] + token[1]
      else
        merged << (token.is_a?(Array) ? token.dup : token)
      end
      merged
    end
  end

  # Runs one JSON test case once per content model flag (default :PCDATA):
  # tokenizes data['input'], normalises both token lists and compares them.
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'], :encoding => 'utf-8')
        tokenizer.content_model_flag = content_model_flag.to_sym

        if data.has_key?('lastStartTag')
          tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']}
        end

        tokens = TokenizerTestParser.new(tokenizer).parse
        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))
        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  # Files without a 'tests' key generate no test methods.
  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')
    tests = JSON.parse(File.read(test_file))['tests']

    unless tests.nil?
      tests.each_with_index do |data, index|
        define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
      end
    end
  end

end
|
95
|
+
|