html5 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +3 -0
- data/Manifest.txt +58 -0
- data/README +9 -0
- data/Rakefile.rb +17 -0
- data/lib/html5/constants.rb +818 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
- data/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +607 -0
- data/lib/html5/html5parser/in_caption_phase.rb +68 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/html5/html5parser/in_row_phase.rb +87 -0
- data/lib/html5/html5parser/in_select_phase.rb +84 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
- data/lib/html5/html5parser/in_table_phase.rb +110 -0
- data/lib/html5/html5parser/initial_phase.rb +134 -0
- data/lib/html5/html5parser/phase.rb +158 -0
- data/lib/html5/html5parser/root_element_phase.rb +42 -0
- data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/html5/html5parser.rb +248 -0
- data/lib/html5/inputstream.rb +654 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +188 -0
- data/lib/html5/serializer/htmlserializer.rb +180 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/tokenizer.rb +968 -0
- data/lib/html5/treebuilders/base.rb +334 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +208 -0
- data/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treewalkers/base.rb +154 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5.rb +13 -0
- data/parse.rb +217 -0
- data/tests/preamble.rb +82 -0
- data/tests/test_encoding.rb +35 -0
- data/tests/test_lxp.rb +263 -0
- data/tests/test_parser.rb +68 -0
- data/tests/test_sanitizer.rb +142 -0
- data/tests/test_serializer.rb +68 -0
- data/tests/test_stream.rb +62 -0
- data/tests/test_tokenizer.rb +94 -0
- data/tests/test_treewalkers.rb +116 -0
- data/tests/tokenizer_test_parser.rb +63 -0
- metadata +120 -0
@@ -0,0 +1,142 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
4
|
+
|
5
|
+
require 'html5/html5parser'
|
6
|
+
require 'html5/liberalxmlparser'
|
7
|
+
require 'html5/treewalkers'
|
8
|
+
require 'html5/serializer'
|
9
|
+
require 'html5/sanitizer'
|
10
|
+
|
11
|
+
# Runs each input through three sanitizing pipelines (HTMLParser, XHTMLParser,
# and a REXML document serialized by XHTMLSerializer with :sanitize enabled)
# and compares every result with its own expected string.
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Options shared by the HTML and XHTML fragment parsers. A new hash is built
  # on every call so that no state is shared between parser invocations.
  def sanitizing_parser_options
    {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}
  end

  # Parses +stream+ as an XHTML fragment with HTMLSanitizer as the tokenizer
  # and returns the result converted with to_s.
  def sanitize_xhtml(stream)
    XHTMLParser.parse_fragment(stream, sanitizing_parser_options).to_s
  end

  # Parses +stream+ as an HTML fragment with HTMLSanitizer as the tokenizer
  # and returns the result converted with to_s.
  def sanitize_html(stream)
    HTMLParser.parse_fragment(stream, sanitizing_parser_options).to_s
  end

  # Wraps +stream+ in an XHTML <div>, parses it with REXML, walks the tree and
  # serializes it with sanitizing turned on, then strips the wrapper <div>
  # again. Returns "Ill-formed XHTML!" when REXML cannot parse the input.
  def sanitize_rexml(stream)
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    serialized = XHTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
                                                    :quote_char => "'",
                                                    :inject_meta_charset => false,
                                                    :sanitize => true})
    serialized.gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    "Ill-formed XHTML!"
  end

  # Asserts the expected output of all three pipelines for one +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # One test per allowed element. The default expectation is adjusted for
  # elements that the pipelines are expected to render differently (table
  # parts, col, table, image and the void elements).
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        # Only the HTML expectation gets the extra <br/>; xhtmloutput was
        # already bound to the string without it.
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # One test per allowed element, written in upper case.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per allowed attribute ('style' is skipped here).
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # One test per allowed attribute, written in upper case; the attribute is
  # expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per allowed protocol used as an href value.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Same as above with the protocol written in upper case.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Characters outside the Basic Multilingual Plane; the expected strings are
  # spelled as octal byte escapes.
  def test_should_handle_astral_plane_characters
    input = "<p>𝒵 𝔸</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # Data-driven tests: each JSON file holds a list of cases with 'name',
  # 'input', 'output' and optional per-pipeline overrides 'xhtml' / 'rexml'.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the file itself; open(filename).read left the handle
    # open until garbage collection.
    JSON::parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/html5parser'
|
4
|
+
require 'html5/serializer'
|
5
|
+
require 'html5/treewalkers'
|
6
|
+
|
7
|
+
#Run the serialize error checks
|
8
|
+
# NOTE(review): this local is assigned here but is not read anywhere in the
# visible part of this file; confirm whether it is still needed.
checkSerializeErrors = false
|
9
|
+
|
10
|
+
# Tree walker over a token list taken from the JSON serializer fixtures.
# Each entry of @tree is an array whose first element names the token type and
# whose remaining elements are that token's arguments.
# NOTE(review): start_tag, end_tag, empty_tag, comment, text and doctype are
# presumably provided by HTML5::TreeWalkers::Base; confirm against that class.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields one walker token per fixture entry, in order.
  #
  # Raises RuntimeError for an entry whose type is not recognised.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield start_tag(token[1], token[2])
      when 'EndTag'
        yield end_tag(token[1])
      when 'EmptyTag'
        yield empty_tag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        # text takes a block and may hand back more than one token.
        text(token[1]) { |text_token| yield text_token }
      when 'Doctype'
        yield doctype(token[1], token[2], token[3])
      else
        # Interpolation keeps the message intact even when the type is nil or
        # not a String; string concatenation would raise TypeError instead.
        raise "Unknown token type: #{token[0]}"
      end
    end
  end
end
|
32
|
+
|
33
|
+
# Data-driven serializer tests. Every case in the JSON fixtures is serialized
# with HTMLSerializer and, except for the 'optionaltags' file, with
# XHTMLSerializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase
  # Checks +result+ against the list of acceptable outputs +expected+.
  # With exactly one acceptable output a normal equality assertion is used so
  # the failure shows a direct comparison; otherwise +result+ only has to be
  # one of the listed alternatives.
  def assert_serialization(expected, result, description)
    if expected.length == 1
      assert_equal(expected[0], result, description)
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end

  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file itself; open(filename).read left the handle
    # open until garbage collection.
    tests = JSON::parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|

      define_method "test_#{test_name}_#{index+1}" do
        # The serializers are handed a symbol-keyed :encoding option, so the
        # string-keyed value from the JSON fixture is copied across.
        if test["options"] and test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        result = HTML5::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialization(test["expected"], result, test["description"])

        # The XHTML pass is skipped for the 'optionaltags' fixture file.
        unless test_name == 'optionaltags'
          result = HTML5::XHTMLSerializer.
            serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
          assert_serialization(test["xhtml"] || test["expected"], result, test["description"])
        end
      end

    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/inputstream'
|
4
|
+
|
5
|
+
# Tests for HTMLInputStream: detected encoding, character-by-character reads,
# byte-order-mark handling and newline/position tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  # An explicitly requested encoding is reported back unchanged.
  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is read back as the bytes EF BF BD (UTF-8 for U+FFFD).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  # A three-byte UTF-8 sequence is returned as one character.
  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Bytes that are not valid UTF-8 are treated as windows-1252 and each one is
  # returned as its UTF-8 encoded equivalent.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A UTF-8 byte order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      # assert(value, message) only checks that value is truthy; the expected
      # encoding was being passed as the failure message. Compare it instead.
      assert_equal('utf-16-le', stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ',true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # LF, CRLF and a lone CR all read back as "\n", and position reports
  # [line, column] after each read.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/tokenizer'
|
4
|
+
|
5
|
+
require 'tokenizer_test_parser'
|
6
|
+
|
7
|
+
class Html5TokenizerTestCase < Test::Unit::TestCase
|
8
|
+
|
9
|
+
# Asserts that the tokenizer produced the expected tokens.
#
# expectedTokens / receivedTokens - token lists; a parse error is represented
#                                   by the plain string "ParseError".
# ignoreErrorOrder - when false or nil the two lists must be equal element by
#                    element. When truthy, each list is split into
#                    [non-error tokens, parse errors] before comparing, so
#                    only the position of the parse errors relative to the
#                    other tokens is ignored.
# message          - failure message handed to assert_equal.
def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
  if !ignoreErrorOrder
    # The strict branch used to return the comparison result without
    # asserting it, so a mismatch could never fail the test.
    assert_equal expectedTokens, receivedTokens, message
  else
    # partition keeps the original order inside each of the two groups.
    expected = expectedTokens.partition { |token| token != "ParseError" }
    received = receivedTokens.partition { |token| token != "ParseError" }
    assert_equal expected, received, message
  end
end
|
35
|
+
|
36
|
+
# Tells whether +token+ is a structured token of kind +token_name+.
# The bare string 'ParseError' never matches; otherwise the first element of
# the token is compared with +token_name+.
def type_of?(token_name, token)
  return false if token == 'ParseError'

  token_name == token.first
end
|
39
|
+
|
40
|
+
# Returns a new list holding the same token objects, after replacing the
# attribute list of every 'StartTag' token (element 2, an array of
# [name, value] pairs) with a Hash. The tokens themselves are modified in
# place. The pairs are reversed before building the Hash so that, for a
# repeated attribute name, the first occurrence is the one that is kept.
def convert_attribute_arrays_to_hashes(tokens)
  converted = []
  tokens.each do |token|
    if type_of?('StartTag', token)
      token[2] = Hash[*token[2].reverse.flatten]
    end
    converted << token
  end
  converted
end
|
46
|
+
|
47
|
+
# Returns a new list in which every run of adjacent 'Character' tokens has
# been merged into the first token of the run. The text of that first token
# (element 1) is replaced by the concatenation; other tokens are passed
# through untouched.
def concatenate_consecutive_characters(tokens)
  merged = []
  tokens.each do |token|
    previous = merged.last
    if type_of?('Character', token) && previous && type_of?('Character', previous)
      # Build a new string rather than appending in place.
      previous[1] = previous[1] + token[1]
    else
      merged << token
    end
  end
  merged
end
|
56
|
+
|
57
|
+
# Runs one tokenizer fixture. The fixture is tokenized once for every entry
# of data['contentModelFlags'] (just :PCDATA when the key is absent) and the
# resulting tokens are compared with data['output'].
def tokenizer_test(data)
  flags = data['contentModelFlags'] || [:PCDATA]

  flags.each do |content_model_flag|
    message = [
      '', 'Description:', data['description'],
      '', 'Input:', data['input'],
      '', 'Content Model Flag:', content_model_flag,
      ''
    ].join("\n")

    assert_nothing_raised(message) do
      tokenizer = HTML5::HTMLTokenizer.new(data['input'])
      tokenizer.content_model_flag = content_model_flag.to_sym

      # Some fixtures specify the start tag the tokenizer should regard as
      # the most recent one.
      if data.has_key?('lastStartTag')
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']}
      end

      raw_tokens = TokenizerTestParser.new(tokenizer).parse
      actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(raw_tokens))
      expected = concatenate_consecutive_characters(data['output'])

      assert_tokens_match(expected, actual, data["ignoreErrorOrder"], message)
    end
  end
end
|
82
|
+
|
83
|
+
# Defines one test method per case of every tokenizer fixture file, named
# test_<file name without '.test'>_<1-based case number>.
html5_test_files('tokenizer').each do |path|
  suite = File.basename(path).sub('.test', '')
  cases = JSON.parse(File.read(path))['tests']

  cases.each_with_index do |data, position|
    define_method("test_#{suite}_#{position + 1}") do
      tokenizer_test(data)
    end
  end
end
|
92
|
+
|
93
|
+
end
|
94
|
+
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/html5parser'
|
4
|
+
require 'html5/treewalkers'
|
5
|
+
require 'html5/treebuilders'
|
6
|
+
|
7
|
+
# Maps each tree implementation name to the builder and walker looked up
# under that name. The global is assigned only after every lookup has
# succeeded.
$tree_types_to_test = %w[simpletree rexml hpricot].inject({}) do |types, tree_name|
  types[tree_name] = {
    :builder => HTML5::TreeBuilders[tree_name],
    :walker => HTML5::TreeWalkers[tree_name]
  }
  types
end

puts "Testing tree walkers: #{$tree_types_to_test.keys.join(', ')}"
|
20
|
+
|
21
|
+
class TestTreeWalkers < Test::Unit::TestCase
|
22
|
+
include HTML5::TestSupport
|
23
|
+
|
24
|
+
# Yields the tokens of +tokens+ in order, except that every run of adjacent
# :Characters / :SpaceCharacters tokens is yielded as one new :Characters
# token whose :data is the concatenation of the run. The input tokens are
# not modified.
def concatenateCharacterTokens(tokens)
  pending = nil

  tokens.each do |token|
    if token[:type] == :Characters || token[:type] == :SpaceCharacters
      if pending
        # += builds a new string, so the data of the input token is untouched.
        pending[:data] += token[:data]
      else
        pending = {:type => :Characters, :data => token[:data]}
      end
      next
    end

    # A non-text token ends the current run: flush it first.
    if pending
      yield pending
      pending = nil
    end
    yield token
  end

  yield pending if pending
end
|
44
|
+
|
45
|
+
# Renders a walker token stream as an indented text outline, one entry per
# line, and returns it as a single string joined with "\n".
#
# Start and empty tags are written as <name>, followed by their attributes
# (sorted, without 'xmlns') one level deeper. Comments, doctypes and text are
# written at the current depth. End tags produce no line and only reduce the
# depth. Tokens of any other type are ignored.
def convertTokens(tokens)
  output = []
  indent = 0
  concatenateCharacterTokens(tokens) do |token|
    case token[:type]
    when :StartTag, :EmptyTag
      output << "#{' '*indent}<#{token[:name]}>"
      indent += 2
      token[:data].to_a.sort.each do |name, value|
        next if name == 'xmlns'
        output << "#{' '*indent}#{name}=\"#{value}\""
      end
      # An empty tag has no matching end tag, so undo the indent right away.
      indent -= 2 if token[:type] == :EmptyTag
    when :EndTag
      indent -= 2
    when :Comment
      output << "#{' '*indent}<!-- #{token[:data]} -->"
    when :Doctype
      # A nil or empty name interpolates to "", which yields "<!DOCTYPE >".
      # The previous String#any? check raises NoMethodError on Ruby 1.9+,
      # where String is no longer Enumerable.
      output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
    when :Characters, :SpaceCharacters
      output << "#{' '*indent}\"#{token[:data]}\""
    else
      # TODO: what to do with errors?
    end
  end
  output.join("\n")
end
|
76
|
+
|
77
|
+
# Defines one test per tree-construction fixture case and per tree type in
# $tree_types_to_test. Each test parses the fixture input with that tree's
# builder, walks the resulting document with the matching walker and compares
# the rendered outline with the fixture's expected document.
html5_test_files('tree-construction').each do |test_file|

  test_name = File.basename(test_file).sub('.dat', '')
  next if test_name == 'tests5' # TODO

  TestData.new(test_file, %w(data errors document-fragment document)).
    each_with_index do |(input, _errors, inner_html, expected), index|

    # Remove the "| " prefix after each newline and the first two characters
    # of the expected document.
    expected = expected.gsub("\n| ","\n")[2..-1]

    $tree_types_to_test.each do |tree_name, tree_class|

      define_method "test_#{test_name}_#{index}_#{tree_name}" do

        parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

        if inner_html
          parser.parse_fragment(input, inner_html)
        else
          parser.parse(input)
        end

        document = parser.tree.get_document

        begin
          output = sortattrs(convertTokens(tree_class[:walker].new(document)))
          # Use a local instead of reassigning the captured `expected`: that
          # variable is shared by the tests of every tree type for this case,
          # so reassigning it re-sorted already sorted text on later runs.
          sorted_expected = sortattrs(expected)
          assert_equal sorted_expected, output, [
            '', 'Input:', input,
            '', 'Expected:', sorted_expected,
            '', 'Received:', output
          ].join("\n")
        rescue NotImplementedError
          # Amnesty for those that confess...
        end
      end
    end
  end
end
|
116
|
+
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
|
3
|
+
# Converts the tokens produced by a tokenizer into the array format used by
# the tokenizer fixtures: ["StartTag", name, data], ["EndTag", name],
# ["Comment", data], ["Character", data], ["DOCTYPE", ...] and the plain
# string "ParseError".
class TokenizerTestParser
  # tokenizer - any object whose #each yields token hashes with a :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes every token and returns the list of converted tokens.
  # Each token is dispatched to the method named "process" + its :type.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is reported as a start tag, preceded by a parse error when
  # the element is not listed in HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    if not HTML5::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag that carries data is reported as a parse error followed by the
  # end tag. A token without :data is treated the same as one with empty data.
  def processEndTag(token)
    data = token[:data]
    processParseError(token) if data && !data.empty?
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Text and whitespace-only text are both reported as "Character" tokens.
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  # Produces no output token.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.2
|
3
|
+
specification_version: 1
|
4
|
+
name: html5
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2007-08-07 00:00:00 -07:00
|
8
|
+
summary: HTML5 parser/tokenizer.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: ryan@theryanking.com
|
12
|
+
homepage: http://code.google.com/p/html5lib
|
13
|
+
rubyforge_project: html5
|
14
|
+
description: A ruby based HTML parser/tokenizer based on the WHATWG HTML5 specification for maximum compatibility with major desktop web browsers.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Ryan King
|
31
|
+
files:
|
32
|
+
- History.txt
|
33
|
+
- Manifest.txt
|
34
|
+
- README
|
35
|
+
- Rakefile.rb
|
36
|
+
- lib/html5.rb
|
37
|
+
- lib/html5/constants.rb
|
38
|
+
- lib/html5/filters/base.rb
|
39
|
+
- lib/html5/filters/inject_meta_charset.rb
|
40
|
+
- lib/html5/filters/optionaltags.rb
|
41
|
+
- lib/html5/filters/sanitizer.rb
|
42
|
+
- lib/html5/filters/whitespace.rb
|
43
|
+
- lib/html5/html5parser.rb
|
44
|
+
- lib/html5/html5parser/after_body_phase.rb
|
45
|
+
- lib/html5/html5parser/after_frameset_phase.rb
|
46
|
+
- lib/html5/html5parser/after_head_phase.rb
|
47
|
+
- lib/html5/html5parser/before_head_phase.rb
|
48
|
+
- lib/html5/html5parser/in_body_phase.rb
|
49
|
+
- lib/html5/html5parser/in_caption_phase.rb
|
50
|
+
- lib/html5/html5parser/in_cell_phase.rb
|
51
|
+
- lib/html5/html5parser/in_column_group_phase.rb
|
52
|
+
- lib/html5/html5parser/in_frameset_phase.rb
|
53
|
+
- lib/html5/html5parser/in_head_phase.rb
|
54
|
+
- lib/html5/html5parser/in_row_phase.rb
|
55
|
+
- lib/html5/html5parser/in_select_phase.rb
|
56
|
+
- lib/html5/html5parser/in_table_body_phase.rb
|
57
|
+
- lib/html5/html5parser/in_table_phase.rb
|
58
|
+
- lib/html5/html5parser/initial_phase.rb
|
59
|
+
- lib/html5/html5parser/phase.rb
|
60
|
+
- lib/html5/html5parser/root_element_phase.rb
|
61
|
+
- lib/html5/html5parser/trailing_end_phase.rb
|
62
|
+
- lib/html5/inputstream.rb
|
63
|
+
- lib/html5/liberalxmlparser.rb
|
64
|
+
- lib/html5/sanitizer.rb
|
65
|
+
- lib/html5/serializer.rb
|
66
|
+
- lib/html5/serializer/htmlserializer.rb
|
67
|
+
- lib/html5/serializer/xhtmlserializer.rb
|
68
|
+
- lib/html5/tokenizer.rb
|
69
|
+
- lib/html5/treebuilders.rb
|
70
|
+
- lib/html5/treebuilders/base.rb
|
71
|
+
- lib/html5/treebuilders/hpricot.rb
|
72
|
+
- lib/html5/treebuilders/rexml.rb
|
73
|
+
- lib/html5/treebuilders/simpletree.rb
|
74
|
+
- lib/html5/treewalkers.rb
|
75
|
+
- lib/html5/treewalkers/base.rb
|
76
|
+
- lib/html5/treewalkers/hpricot.rb
|
77
|
+
- lib/html5/treewalkers/rexml.rb
|
78
|
+
- lib/html5/treewalkers/simpletree.rb
|
79
|
+
- parse.rb
|
80
|
+
- tests/preamble.rb
|
81
|
+
- tests/test_encoding.rb
|
82
|
+
- tests/test_lxp.rb
|
83
|
+
- tests/test_parser.rb
|
84
|
+
- tests/test_sanitizer.rb
|
85
|
+
- tests/test_serializer.rb
|
86
|
+
- tests/test_stream.rb
|
87
|
+
- tests/test_tokenizer.rb
|
88
|
+
- tests/test_treewalkers.rb
|
89
|
+
- tests/tokenizer_test_parser.rb
|
90
|
+
test_files: []
|
91
|
+
|
92
|
+
rdoc_options: []
|
93
|
+
|
94
|
+
extra_rdoc_files: []
|
95
|
+
|
96
|
+
executables: []
|
97
|
+
|
98
|
+
extensions: []
|
99
|
+
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
dependencies:
|
103
|
+
- !ruby/object:Gem::Dependency
|
104
|
+
name: chardet
|
105
|
+
version_requirement:
|
106
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.9.0
|
111
|
+
version:
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: hoe
|
114
|
+
version_requirement:
|
115
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 1.2.0
|
120
|
+
version:
|