html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), 'preamble')
4
+
5
+ require 'html5/html5parser'
6
+ require 'html5/liberalxmlparser'
7
+ require 'html5/treewalkers'
8
+ require 'html5/serializer'
9
+ require 'html5/sanitizer'
10
+
11
# Checks HTMLSanitizer output through three pipelines: the HTML parser, the
# XHTML parser, and a REXML document walked and re-serialized as XHTML.
# Most tests are generated at class-load time from the sanitizer's allow-lists
# and from the JSON fixture files returned by html5_test_files('sanitizer').
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Parses +stream+ as an XHTML fragment with the sanitizing tokenizer and
  # returns the resulting tree as a string.
  def sanitize_xhtml(stream)
    XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Parses +stream+ as an HTML fragment with the sanitizing tokenizer and
  # returns the resulting tree as a string.
  def sanitize_html(stream)
    HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Wraps +stream+ in an XHTML <div>, parses it with REXML, serializes it with
  # :sanitize => true, then strips the wrapper <div> again.
  # Returns "Ill-formed XHTML!" when REXML cannot parse the input.
  def sanitize_rexml(stream)
    require 'rexml/document'
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :quote_char => "'",
      :inject_meta_charset => false,
      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    return "Ill-formed XHTML!"
  end

  # Asserts the expected output of each of the three pipelines for +input+.
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # One test per allowed element: the element survives, the nested <bad>
  # element is escaped. Several elements have parser-specific expectations.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input       = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput  = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        # += builds a new string, so xhtmloutput keeps the value without <br/>.
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # Upper-cased element names are expected to be escaped, not passed through.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per allowed attribute; 'style' is skipped here.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # Upper-cased attribute names are expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  def test_should_handle_astral_plane_characters
    input = "<p>&#x1d4b5; &#x1d538;</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  #  def test_javascript_includes
  #    input = %(<div size="&{alert('XSS')}">foo</div>)
  #    output = "<div>foo</div>"
  #    check_sanitization(input, output, output, output)
  #  end

  # Fixture-driven tests. Each JSON entry supplies 'name', 'input' and
  # 'output'; 'xhtml' and 'rexml' override the expectation per pipeline.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the file; open(filename).read left the handle open.
    JSON.parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/serializer'
5
+ require 'html5/treewalkers'
6
+
7
+ #Run the serialize error checks
8
+ checkSerializeErrors = false
9
+
10
# Tree walker over an already-decoded token list (@tree). Every entry is an
# array whose first element names the token type and whose following elements
# are that token's arguments. The token-building helpers called here
# (start_tag, end_tag, ...) are not defined in this class -- presumably they
# come from HTML5::TreeWalkers::Base; verify there.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields one walker token per entry; text entries may yield several.
  # Raises on an entry whose type name is not recognised.
  def each
    @tree.each do |entry|
      kind = entry[0]
      if kind == 'StartTag'
        yield start_tag(entry[1], entry[2])
      elsif kind == 'EndTag'
        yield end_tag(entry[1])
      elsif kind == 'EmptyTag'
        yield empty_tag(entry[1], entry[2])
      elsif kind == 'Comment'
        yield comment(entry[1])
      elsif kind == 'Characters' || kind == 'SpaceCharacters'
        # text hands its tokens to a block; pass each one on to our caller.
        text(entry[1]) { |chunk| yield chunk }
      elsif kind == 'Doctype'
        yield doctype(entry[1], entry[2], entry[3])
      else
        raise "Unknown token type: " + kind
      end
    end
  end
end
32
+
33
# Generates one test per entry of every serializer fixture file. Each test
# serializes the fixture's token list with HTMLSerializer and, except for the
# 'optionaltags' fixture, with XHTMLSerializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase

  # Serializes test["input"] with +serializer+ and checks the result against
  # +expected+, the list of acceptable outputs. A single acceptable output is
  # compared with assert_equal; with several, any one of them may match.
  def assert_serialization(serializer, test, expected)
    result = serializer.serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
    if expected.length == 1
      assert_equal(expected[0], result, test["description"])
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end

  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file; open(filename).read left the handle open.
    tests = JSON.parse(File.read(filename))

    tests['tests'].each_with_index do |test, index|
      define_method "test_#{test_name}_#{index+1}" do
        # JSON keys are strings; mirror the encoding under the symbol key too.
        if test["options"] and test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        assert_serialization(HTML5::HTMLSerializer, test, test["expected"])

        # The optionaltags fixture is only checked against the HTML serializer.
        unless test_name == 'optionaltags'
          assert_serialization(HTML5::XHTMLSerializer, test, test["xhtml"] || test["expected"])
        end
      end
    end
  end
end
@@ -0,0 +1,62 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/inputstream'
4
+
5
# Tests for HTMLInputStream: encoding detection, character decoding, BOM
# handling and newline/position tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte is expected to come back as U+FFFD (UTF-8 bytes EF BF BD).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Bytes with no declared encoding are expected to be detected as
  # windows-1252 and returned as their UTF-8 equivalents.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A UTF-8 byte order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      # Was assert(stream.char_encoding, 'utf-16-le'), which passed the
      # expected value as the failure message and so could never fail.
      assert_equal('utf-16-le', stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ',true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # "\r\n" and a bare "\r" are both expected to be reported as "\n", with
  # position giving [line, column] after each read.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
@@ -0,0 +1,94 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/tokenizer'
4
+
5
+ require 'tokenizer_test_parser'
6
+
7
# Generates one test per entry of every tokenizer fixture file. Each test
# tokenizes the fixture input once per content model flag and compares the
# tokens with the fixture's expected output.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # Asserts that the received tokens match the expected ones.
  # When +ignoreErrorOrder+ is set, "ParseError" markers are compared
  # separately from the other tokens, so only their count has to agree and
  # not their position in the stream.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    unless ignoreErrorOrder
      # This branch used to return the comparison result without asserting,
      # so a mismatch could never fail the test.
      assert_equal expectedTokens, receivedTokens, message
      return
    end

    # partition yields [non-parse-error tokens, parse-error tokens].
    expected = expectedTokens.partition { |token| token != "ParseError" }
    received = receivedTokens.partition { |token| token != "ParseError" }
    assert_equal expected, received, message
  end

  # True when +token+ is an array token whose type name is +token_name+;
  # the bare "ParseError" string marker never matches.
  def type_of?(token_name, token)
    token != 'ParseError' and token_name == token.first
  end

  # Replaces each StartTag's list of [name, value] pairs with a Hash, in
  # place. The list is reversed first so that, for a duplicated attribute
  # name, the first occurrence wins.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.map do |token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      token
    end
  end

  # Returns a new token list in which runs of adjacent Character tokens are
  # merged into one. Tokens are copied before merging: the caller's arrays
  # (the shared fixture data) must stay intact, because the same fixture is
  # processed again for every content model flag.
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      if type_of?('Character', token) and !merged.empty? and type_of?('Character', merged.last)
        merged.last[1] = merged.last[1] + token[1]
      else
        merged << token.dup
      end
      merged
    end
  end

  # Runs one fixture entry against every content model flag it lists
  # (:PCDATA when it lists none).
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ] * "\n"

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'])

        tokenizer.content_model_flag = content_model_flag.to_sym

        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
94
+
@@ -0,0 +1,116 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/treewalkers'
5
+ require 'html5/treebuilders'
6
+
7
# Tree implementations exercised by the tree walker tests, keyed by name.
# Each entry pairs the builder and the walker looked up under that name.
$tree_types_to_test = %w[simpletree rexml hpricot].inject({}) do |types, tree_name|
  types[tree_name] = {
    :builder => HTML5::TreeBuilders[tree_name],
    :walker  => HTML5::TreeWalkers[tree_name]
  }
  types
end

puts 'Testing tree walkers: ' + $tree_types_to_test.keys.join(', ')
20
+
21
# Generates one test per tree-construction fixture entry and per tree type in
# $tree_types_to_test: the input is parsed with that type's builder, the
# resulting document is walked with its walker, and the token stream is
# rendered as an indented outline and compared with the fixture's expected
# document.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5::TestSupport

  # Yields the tokens of +tokens+ in order, merging each run of adjacent
  # :Characters / :SpaceCharacters tokens into one :Characters token.
  def concatenateCharacterTokens(tokens)
    charactersToken = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if charactersToken.nil?
          charactersToken = {:type => :Characters, :data => token[:data]}
        else
          # += builds a new string, leaving the walker's own token untouched.
          charactersToken[:data] += token[:data]
        end
      else
        unless charactersToken.nil?
          yield charactersToken
          charactersToken = nil
        end
        yield token
      end
    end
    yield charactersToken unless charactersToken.nil?
  end

  # Renders a token stream as an indented outline, one line per node, with
  # attributes sorted by name and listed under their element. 'xmlns'
  # attributes are left out. Unrecognised token types are skipped.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # An empty tag has no matching :EndTag to undo the indent.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        # Was token[:name].any?, which exists on String only in Ruby 1.8.
        if token[:name] and !token[:name].empty?
          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
        else
          output << "#{' '*indent}<!DOCTYPE >"
        end
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    return output.join("\n")
  end

  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')
    next if test_name == 'tests5' # TODO

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      # Drop the "| " prefix from every line of the expected document.
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name, tree_class|

        define_method "test_#{test_name}_#{index}_#{tree_name}" do

          parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          document = parser.tree.get_document

          begin
            output = sortattrs(convertTokens(tree_class[:walker].new(document)))
            # Use a local: +expected+ is shared by the tests generated for
            # every tree type and should not be reassigned by one of them.
            sorted_expected = sortattrs(expected)
            assert_equal sorted_expected, output, [
              '', 'Input:', input,
              '', 'Expected:', sorted_expected,
              '', 'Received:', output
            ].join("\n")
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ require 'html5/constants'
2
+
3
# Converts a tokenizer's stream of token hashes into the array-based token
# format collected in @outputTokens, e.g. ["StartTag", name, data],
# ["Character", data], or the bare string "ParseError".
class TokenizerTestParser
  # +tokenizer+ is any object whose #each yields token hashes with a :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Consumes every token and returns the collected output tokens.
  # Each token is dispatched to the method named "process" followed by its
  # :type, so an unhandled type raises NoMethodError.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty (self-closing) tag is reported as a StartTag, preceded by a
  # ParseError unless the element is listed in HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include?(token[:name])
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag carrying data is reported as a ParseError followed by the
  # EndTag itself.
  def processEndTag(token)
    processParseError(token) unless token[:data].empty?
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Character and whitespace tokens are both reported as "Character".
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  # NOTE(review): parse builds the handler name as "process" + type, so this
  # snake_case method is only reached for a token whose :type is :_eof --
  # confirm which type the tokenizer uses for end of input.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.2
3
+ specification_version: 1
4
+ name: html5
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-08-07 00:00:00 -07:00
8
+ summary: HTML5 parser/tokenizer.
9
+ require_paths:
10
+ - lib
11
+ email: ryan@theryanking.com
12
+ homepage: http://code.google.com/p/html5lib
13
+ rubyforge_project: html5
14
+ description: A ruby based HTML parser/tokenizer based on the WHATWG HTML5 specification for maximum compatibility with major desktop web browsers.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Ryan King
31
+ files:
32
+ - History.txt
33
+ - Manifest.txt
34
+ - README
35
+ - Rakefile.rb
36
+ - lib/html5.rb
37
+ - lib/html5/constants.rb
38
+ - lib/html5/filters/base.rb
39
+ - lib/html5/filters/inject_meta_charset.rb
40
+ - lib/html5/filters/optionaltags.rb
41
+ - lib/html5/filters/sanitizer.rb
42
+ - lib/html5/filters/whitespace.rb
43
+ - lib/html5/html5parser.rb
44
+ - lib/html5/html5parser/after_body_phase.rb
45
+ - lib/html5/html5parser/after_frameset_phase.rb
46
+ - lib/html5/html5parser/after_head_phase.rb
47
+ - lib/html5/html5parser/before_head_phase.rb
48
+ - lib/html5/html5parser/in_body_phase.rb
49
+ - lib/html5/html5parser/in_caption_phase.rb
50
+ - lib/html5/html5parser/in_cell_phase.rb
51
+ - lib/html5/html5parser/in_column_group_phase.rb
52
+ - lib/html5/html5parser/in_frameset_phase.rb
53
+ - lib/html5/html5parser/in_head_phase.rb
54
+ - lib/html5/html5parser/in_row_phase.rb
55
+ - lib/html5/html5parser/in_select_phase.rb
56
+ - lib/html5/html5parser/in_table_body_phase.rb
57
+ - lib/html5/html5parser/in_table_phase.rb
58
+ - lib/html5/html5parser/initial_phase.rb
59
+ - lib/html5/html5parser/phase.rb
60
+ - lib/html5/html5parser/root_element_phase.rb
61
+ - lib/html5/html5parser/trailing_end_phase.rb
62
+ - lib/html5/inputstream.rb
63
+ - lib/html5/liberalxmlparser.rb
64
+ - lib/html5/sanitizer.rb
65
+ - lib/html5/serializer.rb
66
+ - lib/html5/serializer/htmlserializer.rb
67
+ - lib/html5/serializer/xhtmlserializer.rb
68
+ - lib/html5/tokenizer.rb
69
+ - lib/html5/treebuilders.rb
70
+ - lib/html5/treebuilders/base.rb
71
+ - lib/html5/treebuilders/hpricot.rb
72
+ - lib/html5/treebuilders/rexml.rb
73
+ - lib/html5/treebuilders/simpletree.rb
74
+ - lib/html5/treewalkers.rb
75
+ - lib/html5/treewalkers/base.rb
76
+ - lib/html5/treewalkers/hpricot.rb
77
+ - lib/html5/treewalkers/rexml.rb
78
+ - lib/html5/treewalkers/simpletree.rb
79
+ - parse.rb
80
+ - tests/preamble.rb
81
+ - tests/test_encoding.rb
82
+ - tests/test_lxp.rb
83
+ - tests/test_parser.rb
84
+ - tests/test_sanitizer.rb
85
+ - tests/test_serializer.rb
86
+ - tests/test_stream.rb
87
+ - tests/test_tokenizer.rb
88
+ - tests/test_treewalkers.rb
89
+ - tests/tokenizer_test_parser.rb
90
+ test_files: []
91
+
92
+ rdoc_options: []
93
+
94
+ extra_rdoc_files: []
95
+
96
+ executables: []
97
+
98
+ extensions: []
99
+
100
+ requirements: []
101
+
102
+ dependencies:
103
+ - !ruby/object:Gem::Dependency
104
+ name: chardet
105
+ version_requirement:
106
+ version_requirements: !ruby/object:Gem::Version::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: 0.9.0
111
+ version:
112
+ - !ruby/object:Gem::Dependency
113
+ name: hoe
114
+ version_requirement:
115
+ version_requirements: !ruby/object:Gem::Version::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 1.2.0
120
+ version: