html5 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +3 -0
- data/Manifest.txt +58 -0
- data/README +9 -0
- data/Rakefile.rb +17 -0
- data/lib/html5/constants.rb +818 -0
- data/lib/html5/filters/base.rb +10 -0
- data/lib/html5/filters/inject_meta_charset.rb +82 -0
- data/lib/html5/filters/optionaltags.rb +198 -0
- data/lib/html5/filters/sanitizer.rb +15 -0
- data/lib/html5/filters/whitespace.rb +36 -0
- data/lib/html5/html5parser/after_body_phase.rb +46 -0
- data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
- data/lib/html5/html5parser/after_head_phase.rb +50 -0
- data/lib/html5/html5parser/before_head_phase.rb +41 -0
- data/lib/html5/html5parser/in_body_phase.rb +607 -0
- data/lib/html5/html5parser/in_caption_phase.rb +68 -0
- data/lib/html5/html5parser/in_cell_phase.rb +78 -0
- data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
- data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
- data/lib/html5/html5parser/in_head_phase.rb +138 -0
- data/lib/html5/html5parser/in_row_phase.rb +87 -0
- data/lib/html5/html5parser/in_select_phase.rb +84 -0
- data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
- data/lib/html5/html5parser/in_table_phase.rb +110 -0
- data/lib/html5/html5parser/initial_phase.rb +134 -0
- data/lib/html5/html5parser/phase.rb +158 -0
- data/lib/html5/html5parser/root_element_phase.rb +42 -0
- data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
- data/lib/html5/html5parser.rb +248 -0
- data/lib/html5/inputstream.rb +654 -0
- data/lib/html5/liberalxmlparser.rb +158 -0
- data/lib/html5/sanitizer.rb +188 -0
- data/lib/html5/serializer/htmlserializer.rb +180 -0
- data/lib/html5/serializer/xhtmlserializer.rb +20 -0
- data/lib/html5/serializer.rb +2 -0
- data/lib/html5/tokenizer.rb +968 -0
- data/lib/html5/treebuilders/base.rb +334 -0
- data/lib/html5/treebuilders/hpricot.rb +231 -0
- data/lib/html5/treebuilders/rexml.rb +208 -0
- data/lib/html5/treebuilders/simpletree.rb +185 -0
- data/lib/html5/treebuilders.rb +24 -0
- data/lib/html5/treewalkers/base.rb +154 -0
- data/lib/html5/treewalkers/hpricot.rb +48 -0
- data/lib/html5/treewalkers/rexml.rb +48 -0
- data/lib/html5/treewalkers/simpletree.rb +48 -0
- data/lib/html5/treewalkers.rb +26 -0
- data/lib/html5.rb +13 -0
- data/parse.rb +217 -0
- data/tests/preamble.rb +82 -0
- data/tests/test_encoding.rb +35 -0
- data/tests/test_lxp.rb +263 -0
- data/tests/test_parser.rb +68 -0
- data/tests/test_sanitizer.rb +142 -0
- data/tests/test_serializer.rb +68 -0
- data/tests/test_stream.rb +62 -0
- data/tests/test_tokenizer.rb +94 -0
- data/tests/test_treewalkers.rb +116 -0
- data/tests/tokenizer_test_parser.rb +63 -0
- metadata +120 -0
@@ -0,0 +1,142 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
4
|
+
|
5
|
+
require 'html5/html5parser'
|
6
|
+
require 'html5/liberalxmlparser'
|
7
|
+
require 'html5/treewalkers'
|
8
|
+
require 'html5/serializer'
|
9
|
+
require 'html5/sanitizer'
|
10
|
+
|
11
|
+
# Runs each input through three sanitizing pipelines and compares every result
# with its own expected string:
#   * sanitize_html  - HTMLParser.parse_fragment with HTMLSanitizer as tokenizer
#   * sanitize_xhtml - XHTMLParser.parse_fragment with HTMLSanitizer as tokenizer
#   * sanitize_rexml - REXML document, walked and serialized with :sanitize => true
# Most test methods are generated while the class body is evaluated, from the
# HTMLSanitizer allow-lists and from the JSON fixtures returned by
# html5_test_files('sanitizer').
#
# NOTE(review): several expected strings below contain raw "<bad>" markup. If
# this file went through an HTML-decoding step they may originally have been
# entity-escaped -- confirm against the upstream fixture before relying on them.
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Options shared by both fragment parsers. A new hash is returned on every
  # call so neither parser can observe changes made by the other.
  def fragment_parser_options
    {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}
  end

  # Sanitizes +stream+ with the XHTML fragment parser and returns the result as a String.
  def sanitize_xhtml(stream)
    XHTMLParser.parse_fragment(stream, fragment_parser_options).to_s
  end

  # Sanitizes +stream+ with the HTML fragment parser and returns the result as a String.
  def sanitize_html(stream)
    HTMLParser.parse_fragment(stream, fragment_parser_options).to_s
  end

  # Sanitizes +stream+ by parsing it with REXML inside a wrapper <div>, walking
  # the tree and serializing with :sanitize => true. The wrapper is stripped
  # from the serialized text again. Returns "Ill-formed XHTML!" when REXML
  # rejects the input.
  def sanitize_rexml(stream)
    # Loaded lazily so the other two pipelines do not depend on REXML.
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    serialized = XHTMLSerializer.serialize(tokens, {:encoding => 'utf-8',
                                                    :quote_char => "'",
                                                    :inject_meta_charset => false,
                                                    :sanitize => true})
    # Dots are escaped so only the literal namespace URI is matched.
    serialized.gsub(%r{\A<div xmlns='http://www\.w3\.org/1999/xhtml'>(.*)</div>\Z}m, '\1')
  rescue REXML::ParseException
    "Ill-formed XHTML!"
  end

  # Asserts the expected output of all three pipelines for one +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # One test per allowed element. The default expectation is that the element
  # and its title attribute survive; the branches below override it for tags
  # whose expected output differs between pipelines.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput  = "<#{tag_name.downcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        # For these tags the parser pipelines are expected to yield only the content.
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo <bad>bar</bad>baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo <bad>bar</bad> baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo <bad>bar</bad> baz"
        xhtmloutput = htmloutput
        # Only the HTML expectation gains the extra <br/>; xhtmloutput keeps the
        # string assigned above because += builds a new String.
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # Upper-cased element names are not on the allow-list.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "<#{tag_name.upcase} title=\"1\">foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per allowed attribute ('style' is skipped here).
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # Upper-cased attribute names are expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo <bad>bar</bad> baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  # Allowed protocols must be kept in href values ...
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # ... including when they are written in upper case.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  def test_should_handle_astral_plane_characters
    input = "<p>𝒵 𝔸</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # One test per entry of every JSON fixture file. 'xhtml' and 'rexml' fall
  # back to 'output' when the fixture does not provide them.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the handle again; open(filename).read left it open.
    JSON.parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/html5parser'
|
4
|
+
require 'html5/serializer'
|
5
|
+
require 'html5/treewalkers'
|
6
|
+
|
7
|
+
#Run the serialize error checks
|
8
|
+
checkSerializeErrors = false
|
9
|
+
|
10
|
+
# Tree walker that replays a pre-tokenised fixture: @tree is iterated as a list
# of arrays of the form [type, arg1, arg2, ...], and each entry is turned into a
# walker token through the helpers inherited from HTML5::TreeWalkers::Base.
# NOTE(review): @tree is presumably set by Base#initialize from the constructor
# argument -- confirm in treewalkers/base.rb.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields one walker token per fixture entry (text entries may yield several,
  # as produced by the inherited +text+ helper).
  # Raises RuntimeError for an entry whose type is not recognised.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield start_tag(token[1], token[2])
      when 'EndTag'
        yield end_tag(token[1])
      when 'EmptyTag'
        yield empty_tag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        text(token[1]) { |text_token| yield text_token }
      when 'Doctype'
        yield doctype(token[1], token[2], token[3])
      else
        # Interpolation instead of String#+ so a nil or non-String type still
        # produces this message rather than a TypeError.
        raise "Unknown token type: #{token[0]}"
      end
    end
  end
end
|
32
|
+
|
33
|
+
# Generates one test per entry of every serializer fixture file. Each test
# serializes the fixture's token list with HTMLSerializer and, except for the
# 'optionaltags' fixture, with XHTMLSerializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase
  # Serializes test["input"] with +serializer+ and checks the result against
  # +expected+, a list of acceptable outputs. With exactly one acceptable
  # output assert_equal is used; otherwise the result only has to be one of
  # the listed strings.
  def check_serialization(serializer, test, expected)
    result = serializer.serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
    if expected.length == 1
      assert_equal(expected[0], result, test["description"])
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end
  private :check_serialization

  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the handle again; open(filename).read left it open.
    tests = JSON.parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|

      define_method "test_#{test_name}_#{index+1}" do
        # JSON keys are strings; the encoding is copied under the :encoding
        # symbol key as well.
        if test["options"] && test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        check_serialization(HTML5::HTMLSerializer, test, test["expected"])

        # The 'optionaltags' fixture is not run through the XHTML serializer.
        unless test_name == 'optionaltags'
          check_serialization(HTML5::XHTMLSerializer, test, test["xhtml"] || test["expected"])
        end
      end

    end
  end
end
|
@@ -0,0 +1,62 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/inputstream'
|
4
|
+
|
5
|
+
# Tests for HTMLInputStream: encoding selection, byte-order marks, character
# reading and line/column tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  # An explicitly requested encoding is reported back unchanged.
  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is read back as the UTF-8 bytes of U+FFFD.
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Without a declared encoding these bytes are expected to be reported as
  # windows-1252, and each one read back as its UTF-8 byte sequence.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A UTF-8 byte-order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      # Was assert(stream.char_encoding, 'utf-16-le'), which used the expected
      # value as the failure message and so passed for any truthy encoding.
      # NOTE(review): 'utf-16-le' is the spelling the original author wrote;
      # confirm it matches the name HTMLInputStream reports for this BOM.
      assert_equal('utf-16-le', stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ',true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # "\r\n" and "\r" are expected to come back as "\n", and position is
  # expected to follow the consumed text as [line, column].
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
|
@@ -0,0 +1,94 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/tokenizer'
|
4
|
+
|
5
|
+
require 'tokenizer_test_parser'
|
6
|
+
|
7
|
+
# Generates one test per entry of every tokenizer fixture file. Each test
# tokenizes the fixture input once per content model flag and compares the
# resulting token list with the fixture's expected output.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # Asserts that +receivedTokens+ matches +expectedTokens+.
  #
  # When +ignoreErrorOrder+ is false or nil the two lists must be equal as
  # they are. Otherwise each list is split into [non-error tokens,
  # "ParseError" markers] first, so only the position of parse errors relative
  # to other tokens is ignored.
  #
  # The strict branch used to return the comparison result without asserting,
  # so a mismatch could never fail a test.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    if ignoreErrorOrder
      expected = expectedTokens.partition { |token| token != "ParseError" }
      received = receivedTokens.partition { |token| token != "ParseError" }
      assert_equal expected, received, message
    else
      assert_equal expectedTokens, receivedTokens, message
    end
  end

  # True when +token+ is a token array whose first element is +token_name+.
  # The bare "ParseError" string marker never matches.
  def type_of?(token_name, token)
    token != 'ParseError' && token_name == token.first
  end

  # Replaces the attribute pair list of every StartTag token with a Hash, in
  # place, and returns the tokens in a new Array. The pairs are reversed
  # first, so for a repeated attribute name the earliest pair wins.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.map do |token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      token
    end
  end

  # Returns a new token list in which every run of adjacent Character tokens
  # is merged into one token.
  #
  # The merged token is a copy: the previous version appended to the first
  # token of the run in place, which altered the fixture's 'output' list and
  # made the expectation grow on each further pass over the same data.
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      previous = merged.last
      if type_of?('Character', token) && previous && type_of?('Character', previous)
        combined = previous.dup
        combined[1] = previous[1] + token[1]
        merged[-1] = combined
      else
        merged << token
      end
      merged
    end
  end

  # Runs one fixture entry: for each content model flag (:PCDATA when none is
  # listed) tokenizes data['input'] and compares the normalised result with
  # data['output'].
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ].join("\n")

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'])

        tokenizer.content_model_flag = content_model_flag.to_sym

        # Some fixtures specify the start tag to pre-seed as the current token.
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
|
94
|
+
|
@@ -0,0 +1,116 @@
|
|
1
|
+
require File.join(File.dirname(__FILE__), 'preamble')
|
2
|
+
|
3
|
+
require 'html5/html5parser'
|
4
|
+
require 'html5/treewalkers'
|
5
|
+
require 'html5/treebuilders'
|
6
|
+
|
7
|
+
# Maps each tree implementation name to the builder and walker registered
# under that name. Read by TestTreeWalkers when it generates its tests.
$tree_types_to_test = {}
%w[simpletree rexml hpricot].each do |tree_name|
  $tree_types_to_test[tree_name] = {
    :builder => HTML5::TreeBuilders[tree_name],
    :walker => HTML5::TreeWalkers[tree_name]
  }
end

puts 'Testing tree walkers: ' + $tree_types_to_test.keys.join(', ')
|
20
|
+
|
21
|
+
# Generates one test per tree-construction fixture entry and tree type. Each
# test parses the input with the given tree builder, walks the resulting
# document with the matching tree walker, renders the token stream as an
# indented text dump and compares it with the fixture's expected dump.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5::TestSupport

  # Yields the tokens of +tokens+ with every run of adjacent :Characters /
  # :SpaceCharacters tokens merged into one new :Characters token. Other
  # tokens are yielded unchanged.
  def concatenateCharacterTokens(tokens)
    pending = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if pending
          pending[:data] += token[:data]
        else
          pending = {:type => :Characters, :data => token[:data]}
        end
      else
        # A non-text token ends the current run: flush it first.
        if pending
          yield pending
          pending = nil
        end
        yield token
      end
    end
    yield pending if pending
  end

  # Renders a walker token stream as text, one line per node, indented two
  # spaces per open element. Attributes are listed in sorted order on their
  # own lines below the element; 'xmlns' attributes are omitted.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # An empty tag has no matching end tag to undo the indent.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        # String#any? only exists while String is Enumerable (Ruby 1.8);
        # empty? gives the same answer for a String and works on later Rubies.
        if token[:name] && !token[:name].empty?
          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
        else
          output << "#{' '*indent}<!DOCTYPE >"
        end
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    output.join("\n")
  end

  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')
    next if test_name == 'tests5' # TODO

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      # Drop the "| " prefix from every line of the expected dump.
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name, tree_class|

        define_method "test_#{test_name}_#{index}_#{tree_name}" do

          parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          document = parser.tree.get_document

          begin
            output = sortattrs(convertTokens(tree_class[:walker].new(document)))
            # A local is used because +expected+ is shared by the tests
            # generated for every tree type; reassigning it here changed it
            # for the others.
            sorted_expected = sortattrs(expected)
            assert_equal sorted_expected, output, [
              '', 'Input:', input,
              '', 'Expected:', sorted_expected,
              '', 'Received:', output
            ].join("\n")
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'html5/constants'
|
2
|
+
|
3
|
+
# Converts the token hashes produced by a tokenizer into the array form used
# by the tokenizer fixtures, e.g. ["StartTag", name, attributes], with parse
# errors represented by the bare string "ParseError".
class TokenizerTestParser
  # +tokenizer+ is any object whose #each yields token hashes carrying a
  # :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes the whole tokenizer and returns the list of converted tokens.
  # Each token is dispatched to the method named "process" + its :type.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is reported as a StartTag, preceded by a "ParseError" when
  # the element is not listed in HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include?(token[:name])
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag that carries attribute data is reported as a "ParseError"
  # followed by the EndTag itself.
  def processEndTag(token)
    if token[:data].length > 0
      processParseError(token)
    end
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Space-only and ordinary character tokens are both reported as "Character".
  # (A second, identical definition of this method further down was removed.)
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  # NOTE(review): #parse dispatches to "process" + type, so this snake_case
  # name is only reached for a token whose :type is :_eof -- confirm whether
  # the tokenizer ever emits an end-of-file token and under which type name.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
|
metadata
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.2
|
3
|
+
specification_version: 1
|
4
|
+
name: html5
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2007-08-07 00:00:00 -07:00
|
8
|
+
summary: HTML5 parser/tokenizer.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: ryan@theryanking.com
|
12
|
+
homepage: http://code.google.com/p/html5lib
|
13
|
+
rubyforge_project: html5
|
14
|
+
description: A ruby based HTML parser/tokenizer based on the WHATWG HTML5 specification for maximum compatibility with major desktop web browsers.
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Ryan King
|
31
|
+
files:
|
32
|
+
- History.txt
|
33
|
+
- Manifest.txt
|
34
|
+
- README
|
35
|
+
- Rakefile.rb
|
36
|
+
- lib/html5.rb
|
37
|
+
- lib/html5/constants.rb
|
38
|
+
- lib/html5/filters/base.rb
|
39
|
+
- lib/html5/filters/inject_meta_charset.rb
|
40
|
+
- lib/html5/filters/optionaltags.rb
|
41
|
+
- lib/html5/filters/sanitizer.rb
|
42
|
+
- lib/html5/filters/whitespace.rb
|
43
|
+
- lib/html5/html5parser.rb
|
44
|
+
- lib/html5/html5parser/after_body_phase.rb
|
45
|
+
- lib/html5/html5parser/after_frameset_phase.rb
|
46
|
+
- lib/html5/html5parser/after_head_phase.rb
|
47
|
+
- lib/html5/html5parser/before_head_phase.rb
|
48
|
+
- lib/html5/html5parser/in_body_phase.rb
|
49
|
+
- lib/html5/html5parser/in_caption_phase.rb
|
50
|
+
- lib/html5/html5parser/in_cell_phase.rb
|
51
|
+
- lib/html5/html5parser/in_column_group_phase.rb
|
52
|
+
- lib/html5/html5parser/in_frameset_phase.rb
|
53
|
+
- lib/html5/html5parser/in_head_phase.rb
|
54
|
+
- lib/html5/html5parser/in_row_phase.rb
|
55
|
+
- lib/html5/html5parser/in_select_phase.rb
|
56
|
+
- lib/html5/html5parser/in_table_body_phase.rb
|
57
|
+
- lib/html5/html5parser/in_table_phase.rb
|
58
|
+
- lib/html5/html5parser/initial_phase.rb
|
59
|
+
- lib/html5/html5parser/phase.rb
|
60
|
+
- lib/html5/html5parser/root_element_phase.rb
|
61
|
+
- lib/html5/html5parser/trailing_end_phase.rb
|
62
|
+
- lib/html5/inputstream.rb
|
63
|
+
- lib/html5/liberalxmlparser.rb
|
64
|
+
- lib/html5/sanitizer.rb
|
65
|
+
- lib/html5/serializer.rb
|
66
|
+
- lib/html5/serializer/htmlserializer.rb
|
67
|
+
- lib/html5/serializer/xhtmlserializer.rb
|
68
|
+
- lib/html5/tokenizer.rb
|
69
|
+
- lib/html5/treebuilders.rb
|
70
|
+
- lib/html5/treebuilders/base.rb
|
71
|
+
- lib/html5/treebuilders/hpricot.rb
|
72
|
+
- lib/html5/treebuilders/rexml.rb
|
73
|
+
- lib/html5/treebuilders/simpletree.rb
|
74
|
+
- lib/html5/treewalkers.rb
|
75
|
+
- lib/html5/treewalkers/base.rb
|
76
|
+
- lib/html5/treewalkers/hpricot.rb
|
77
|
+
- lib/html5/treewalkers/rexml.rb
|
78
|
+
- lib/html5/treewalkers/simpletree.rb
|
79
|
+
- parse.rb
|
80
|
+
- tests/preamble.rb
|
81
|
+
- tests/test_encoding.rb
|
82
|
+
- tests/test_lxp.rb
|
83
|
+
- tests/test_parser.rb
|
84
|
+
- tests/test_sanitizer.rb
|
85
|
+
- tests/test_serializer.rb
|
86
|
+
- tests/test_stream.rb
|
87
|
+
- tests/test_tokenizer.rb
|
88
|
+
- tests/test_treewalkers.rb
|
89
|
+
- tests/tokenizer_test_parser.rb
|
90
|
+
test_files: []
|
91
|
+
|
92
|
+
rdoc_options: []
|
93
|
+
|
94
|
+
extra_rdoc_files: []
|
95
|
+
|
96
|
+
executables: []
|
97
|
+
|
98
|
+
extensions: []
|
99
|
+
|
100
|
+
requirements: []
|
101
|
+
|
102
|
+
dependencies:
|
103
|
+
- !ruby/object:Gem::Dependency
|
104
|
+
name: chardet
|
105
|
+
version_requirement:
|
106
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.9.0
|
111
|
+
version:
|
112
|
+
- !ruby/object:Gem::Dependency
|
113
|
+
name: hoe
|
114
|
+
version_requirement:
|
115
|
+
version_requirements: !ruby/object:Gem::Version::Requirement
|
116
|
+
requirements:
|
117
|
+
- - ">="
|
118
|
+
- !ruby/object:Gem::Version
|
119
|
+
version: 1.2.0
|
120
|
+
version:
|