html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require File.join(File.dirname(__FILE__), 'preamble')
4
+
5
+ require 'html5/html5parser'
6
+ require 'html5/liberalxmlparser'
7
+ require 'html5/treewalkers'
8
+ require 'html5/serializer'
9
+ require 'html5/sanitizer'
10
+
11
# Checks HTMLSanitizer output along three routes: the HTML parser, the
# XHTML parser, and a REXML document run through the tree walker and the
# XHTML serializer with :sanitize enabled.
#
# Most tests are generated at class-definition time from the sanitizer's
# whitelists (ALLOWED_ELEMENTS, ALLOWED_ATTRIBUTES, ALLOWED_PROTOCOLS) and
# from the JSON fixture files returned by html5_test_files('sanitizer').
class SanitizeTest < Test::Unit::TestCase
  include HTML5

  # Sanitizes +stream+ by parsing it as an XHTML fragment with HTMLSanitizer
  # as the tokenizer; returns the resulting fragment as a string.
  def sanitize_xhtml stream
    XHTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Same as sanitize_xhtml, but goes through HTMLParser.
  def sanitize_html stream
    HTMLParser.parse_fragment(stream, {:tokenizer => HTMLSanitizer, :encoding => 'utf-8', :lowercase_element_name => false, :lowercase_attr_name => false}).to_s
  end

  # Sanitizes +stream+ by wrapping it in an XHTML <div>, parsing that with
  # REXML, and serializing the walked tree with :sanitize => true. The
  # wrapper <div> is stripped from the result again.
  #
  # Returns "Ill-formed XHTML!" when REXML cannot parse the wrapped input.
  def sanitize_rexml stream
    require 'rexml/document' # loaded lazily, only when this route is exercised
    doc = REXML::Document.new("<div xmlns='http://www.w3.org/1999/xhtml'>#{stream}</div>")
    tokens = TreeWalkers.get_tree_walker('rexml').new(doc)
    XHTMLSerializer.serialize(tokens, {:encoding=>'utf-8',
      :quote_char => "'",
      :inject_meta_charset => false,
      :sanitize => true}).gsub(/\A<div xmlns='http:\/\/www.w3.org\/1999\/xhtml'>(.*)<\/div>\Z/m, '\1')
  rescue REXML::ParseException
    return "Ill-formed XHTML!"
  end

  # Asserts that +input+ sanitizes to the given expected string on each of
  # the three routes (HTML parser, XHTML parser, REXML).
  def check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    assert_equal htmloutput, sanitize_html(input)
    assert_equal xhtmloutput, sanitize_xhtml(input)
    assert_equal rexmloutput, sanitize_rexml(input)
  end

  # One test per whitelisted element: the element itself must survive while
  # the nested <bad> element is escaped. The branches below spell out the
  # expected output for elements whose parse result differs from the default.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_allow_#{tag_name}_tag" do
      input = "<#{tag_name} title='1'>foo <bad>bar</bad> baz</#{tag_name}>"
      htmloutput = "<#{tag_name.downcase} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name.downcase}>"
      xhtmloutput = "<#{tag_name} title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</#{tag_name}>"
      rexmloutput = xhtmloutput

      if %w[caption colgroup optgroup option tbody td tfoot th thead tr].include?(tag_name)
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
      elsif tag_name == 'col'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<col title='1' />"
      elsif tag_name == 'table'
        htmloutput = "foo &lt;bad&gt;bar&lt;/bad&gt;baz<table title='1'> </table>"
        xhtmloutput = htmloutput
      elsif tag_name == 'image'
        htmloutput = "<img title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        rexmloutput = "<image title='1'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</image>"
      elsif VOID_ELEMENTS.include?(tag_name)
        htmloutput = "<#{tag_name} title='1'/>foo &lt;bad&gt;bar&lt;/bad&gt; baz"
        xhtmloutput = htmloutput
        htmloutput += '<br/>' if tag_name == 'br'
        rexmloutput = "<#{tag_name} title='1' />"
      end
      check_sanitization(input, htmloutput, xhtmloutput, rexmloutput)
    end
  end

  # One test per whitelisted element, upper-cased: with name lowercasing
  # switched off the upper-case spelling is expected to be escaped.
  HTMLSanitizer::ALLOWED_ELEMENTS.each do |tag_name|
    define_method "test_should_forbid_#{tag_name.upcase}_tag" do
      input = "<#{tag_name.upcase} title='1'>foo <bad>bar</bad> baz</#{tag_name.upcase}>"
      output = "&lt;#{tag_name.upcase} title=\"1\"&gt;foo &lt;bad&gt;bar&lt;/bad&gt; baz&lt;/#{tag_name.upcase}&gt;"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per whitelisted attribute; 'style' is skipped here.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    next if attribute_name == 'style'
    define_method "test_should_allow_#{attribute_name}_attribute" do
      input = "<p #{attribute_name}='foo'>foo <bad>bar</bad> baz</p>"
      output = "<p #{attribute_name}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      htmloutput = "<p #{attribute_name.downcase}='foo'>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, htmloutput, output, output)
    end
  end

  # One test per whitelisted attribute, upper-cased: the attribute is
  # expected to be dropped.
  HTMLSanitizer::ALLOWED_ATTRIBUTES.each do |attribute_name|
    define_method "test_should_forbid_#{attribute_name.upcase}_attribute" do
      input = "<p #{attribute_name.upcase}='display: none;'>foo <bad>bar</bad> baz</p>"
      output = "<p>foo &lt;bad&gt;bar&lt;/bad&gt; baz</p>"
      check_sanitization(input, output, output, output)
    end
  end

  # One test per whitelisted protocol used as an href value.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_#{protocol}_uris" do
      input = %(<a href="#{protocol}">foo</a>)
      output = "<a href='#{protocol}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Same as above with the protocol upper-cased.
  HTMLSanitizer::ALLOWED_PROTOCOLS.each do |protocol|
    define_method "test_should_allow_uppercase_#{protocol}_uris" do
      input = %(<a href="#{protocol.upcase}">foo</a>)
      output = "<a href='#{protocol.upcase}'>foo</a>"
      check_sanitization(input, output, output, output)
    end
  end

  # Characters outside the Basic Multilingual Plane, given both as numeric
  # character references and as raw UTF-8 bytes (octal escapes).
  def test_should_handle_astral_plane_characters
    input = "<p>&#x1d4b5; &#x1d538;</p>"
    output = "<p>\360\235\222\265 \360\235\224\270</p>"
    check_sanitization(input, output, output, output)

    input = "<p><tspan>\360\235\224\270</tspan> a</p>"
    output = "<p><tspan>\360\235\224\270</tspan> a</p>"
    check_sanitization(input, output, output, output)
  end

  # This affects only NS4. Is it worth fixing?
  # def test_javascript_includes
  #   input = %(<div size="&{alert('XSS')}">foo</div>)
  #   output = "<div>foo</div>"
  #   check_sanitization(input, output, output, output)
  # end

  # One test per entry of every JSON fixture file. 'xhtml' and 'rexml'
  # override the expected output for their route and default to 'output'.
  html5_test_files('sanitizer').each do |filename|
    # File.read closes the file again; open(filename).read left the handle
    # open until garbage collection.
    JSON.parse(File.read(filename)).each do |test|
      define_method "test_#{test['name']}" do
        check_sanitization(
          test['input'],
          test['output'],
          test['xhtml'] || test['output'],
          test['rexml'] || test['output']
        )
      end
    end
  end
end
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/serializer'
5
+ require 'html5/treewalkers'
6
+
7
+ #Run the serialize error checks
8
+ checkSerializeErrors = false
9
+
10
# Tree walker that replays a token list taken from the serializer JSON
# fixtures. Each entry of @tree is indexed as an array whose first element
# is the token type name and whose remaining elements are that token's
# arguments.
# NOTE(review): @tree and the start_tag/end_tag/empty_tag/comment/text/doctype
# helpers are presumably provided by HTML5::TreeWalkers::Base -- confirm.
class JsonWalker < HTML5::TreeWalkers::Base
  # Yields one walker token per fixture entry (text entries may yield
  # several, as decided by #text).
  #
  # Raises RuntimeError for an entry whose type is not recognised.
  def each
    @tree.each do |token|
      case token[0]
      when 'StartTag'
        yield start_tag(token[1], token[2])
      when 'EndTag'
        yield end_tag(token[1])
      when 'EmptyTag'
        yield empty_tag(token[1], token[2])
      when 'Comment'
        yield comment(token[1])
      when 'Characters', 'SpaceCharacters'
        text(token[1]) {|textToken| yield textToken}
      when 'Doctype'
        yield doctype(token[1], token[2], token[3])
      else
        # Interpolate rather than concatenate: String + nil (or any other
        # non-String type) would raise TypeError and hide this message.
        raise "Unknown token type: #{token[0]}"
      end
    end
  end
end
32
+
33
# Generates one test per entry of every serializer fixture file. Each test
# feeds the fixture's token list through HTMLSerializer and, except for the
# 'optionaltags' fixture, through XHTMLSerializer as well.
class Html5SerializeTestcase < Test::Unit::TestCase
  # Checks +result+ against +expected+, the list of acceptable
  # serializations. A single acceptable value is compared with assert_equal
  # (using +description+ as the message); with several, +result+ only has to
  # be one of them.
  def assert_serialization(expected, result, description)
    if expected.length == 1
      assert_equal(expected[0], result, description)
    elsif !expected.include?(result)
      flunk("Expected: #{expected.inspect}, Received: #{result.inspect}")
    end
  end
  private :assert_serialization

  html5_test_files('serializer').each do |filename|
    test_name = File.basename(filename).sub('.test', '')
    # File.read closes the file again; open(filename).read left it open.
    tests = JSON.parse(File.read(filename))
    tests['tests'].each_with_index do |test, index|

      define_method "test_#{test_name}_#{index+1}" do
        # The fixture carries the encoding under a string key; copy it to
        # the symbol key as well before handing the options to the serializer.
        if test["options"] && test["options"]["encoding"]
          test["options"][:encoding] = test["options"]["encoding"]
        end

        result = HTML5::HTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialization(test["expected"], result, test["description"])

        # The XHTML half is skipped for the 'optionaltags' fixture.
        return if test_name == 'optionaltags'

        result = HTML5::XHTMLSerializer.
          serialize(JsonWalker.new(test["input"]), (test["options"] || {}))
        assert_serialization(test["xhtml"] || test["expected"], result, test["description"])
      end

    end
  end
end
@@ -0,0 +1,62 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/inputstream'
4
+
5
# Tests for HTMLInputStream: the reported character encoding, the characters
# handed out by #char / #chars_until, and line/column tracking.
class HTMLInputStreamTest < Test::Unit::TestCase
  include HTML5

  # An explicitly supplied encoding is reported back unchanged.
  def test_char_ascii
    stream = HTMLInputStream.new("'", :encoding=>'ascii')
    assert_equal('ascii', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # A NUL byte comes back as the bytes EF BF BD (UTF-8 for U+FFFD).
  def test_char_null
    stream = HTMLInputStream.new("\x00")
    assert_equal("\xef\xbf\xbd", stream.char)
  end

  def test_char_utf8
    stream = HTMLInputStream.new("\xe2\x80\x98", :encoding=>'utf-8')
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("\xe2\x80\x98", stream.char)
  end

  # Without a declared encoding these bytes are reported as windows-1252 and
  # each one is handed out as its UTF-8 equivalent.
  def test_char_win1252
    stream = HTMLInputStream.new("\xa2\xc5\xf1\x92\x86")
    assert_equal('windows-1252', stream.char_encoding)
    assert_equal("\xc2\xa2", stream.char)
    assert_equal("\xc3\x85", stream.char)
    assert_equal("\xc3\xb1", stream.char)
    assert_equal("\xe2\x80\x99", stream.char)
    assert_equal("\xe2\x80\xa0", stream.char)
  end

  # A UTF-8 byte order mark selects utf-8 and is not returned as a character.
  def test_bom
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "'")
    assert_equal('utf-8', stream.char_encoding)
    assert_equal("'", stream.char)
  end

  # The UTF-16 test is only defined when iconv can be loaded.
  begin
    require 'iconv'

    def test_utf_16
      stream = HTMLInputStream.new("\xff\xfe" + " \x00"*1025)
      # Was assert(stream.char_encoding, 'utf-16-le'), which treats the
      # expected value as the failure message and passes for any truthy
      # encoding. Compare the two values instead.
      assert_equal('utf-16-le', stream.char_encoding)
      assert_equal(1025, stream.chars_until(' ',true).length)
    end
  rescue LoadError
    puts "iconv not found, skipping iconv tests"
  end

  # "\r\n" and a lone "\r" are both read back as "\n"; positions are
  # [line, column] pairs.
  def test_newlines
    stream = HTMLInputStream.new("\xef\xbb\xbf" + "a\nbb\r\nccc\rdddd")
    assert_equal([1,0], stream.position)
    assert_equal("a\nbb\n", stream.chars_until('c'))
    assert_equal([3,0], stream.position)
    assert_equal("ccc\ndddd", stream.chars_until('x'))
    assert_equal([4,4], stream.position)
    assert_equal([1,2,3], stream.instance_eval {@line_lengths})
  end
end
@@ -0,0 +1,94 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/tokenizer'
4
+
5
+ require 'tokenizer_test_parser'
6
+
7
# Generates one test per entry of every tokenizer fixture file. Each test
# runs HTML5::HTMLTokenizer over the fixture input, converts the tokens with
# TokenizerTestParser and compares them with the fixture's expected output.
class Html5TokenizerTestCase < Test::Unit::TestCase

  # Asserts that +receivedTokens+ matches +expectedTokens+.
  #
  # With a falsy +ignoreErrorOrder+ the two lists must be equal as they
  # stand. Otherwise each list is split into [non-parse-errors, parse-errors]
  # and the pairs are compared, so only the number of "ParseError" entries
  # matters, not where they sit among the other tokens.
  def assert_tokens_match(expectedTokens, receivedTokens, ignoreErrorOrder, message)
    unless ignoreErrorOrder
      # This branch used to return the result of == without asserting
      # anything, so an order-sensitive mismatch could never fail a test.
      assert_equal expectedTokens, receivedTokens, message
      return
    end

    expected = expectedTokens.partition { |token| token != "ParseError" }
    received = receivedTokens.partition { |token| token != "ParseError" }
    assert_equal expected, received, message
  end

  # True when +token+ is an array token whose first element is +token_name+;
  # the bare "ParseError" string never matches.
  def type_of?(token_name, token)
    token != 'ParseError' && token_name == token.first
  end

  # Replaces the attribute list (index 2) of every StartTag token with a
  # Hash, modifying the tokens in place, and returns the tokens as a new
  # array. The list is reversed first so that, for a repeated attribute
  # name, the first occurrence is the one left in the Hash.
  def convert_attribute_arrays_to_hashes(tokens)
    tokens.map do |token|
      token[2] = Hash[*token[2].reverse.flatten] if type_of?('StartTag', token)
      token
    end
  end

  # Merges each run of adjacent Character tokens into the first token of the
  # run (that token's data is replaced in place) and returns the new list.
  def concatenate_consecutive_characters(tokens)
    tokens.inject([]) do |merged, token|
      if type_of?('Character', token) && merged.any? && type_of?('Character', merged.last)
        merged.last[1] = merged.last[1] + token[1]
        next merged
      end
      merged << token
    end
  end

  # Runs one fixture entry once per content model flag it lists (PCDATA
  # when it lists none).
  def tokenizer_test(data)
    (data['contentModelFlags'] || [:PCDATA]).each do |content_model_flag|
      message = [
        '', 'Description:', data['description'],
        '', 'Input:', data['input'],
        '', 'Content Model Flag:', content_model_flag,
        '' ].join("\n")

      assert_nothing_raised message do
        tokenizer = HTML5::HTMLTokenizer.new(data['input'])

        tokenizer.content_model_flag = content_model_flag.to_sym

        # Some fixtures name a start tag the tokenizer should behave as if
        # it had already seen.
        tokenizer.current_token = {:type => :startTag, :name => data['lastStartTag']} if data.has_key?('lastStartTag')

        tokens = TokenizerTestParser.new(tokenizer).parse

        actual = concatenate_consecutive_characters(convert_attribute_arrays_to_hashes(tokens))

        expected = concatenate_consecutive_characters(data['output'])

        assert_tokens_match expected, actual, data["ignoreErrorOrder"], message
      end
    end
  end

  html5_test_files('tokenizer').each do |test_file|
    test_name = File.basename(test_file).sub('.test', '')

    tests = JSON.parse(File.read(test_file))['tests']

    tests.each_with_index do |data, index|
      define_method('test_%s_%d' % [test_name, index + 1]) { tokenizer_test data }
    end
  end

end
94
+
@@ -0,0 +1,116 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/html5parser'
4
+ require 'html5/treewalkers'
5
+ require 'html5/treebuilders'
6
+
7
+ $tree_types_to_test = {
8
+ 'simpletree' =>
9
+ {:builder => HTML5::TreeBuilders['simpletree'],
10
+ :walker => HTML5::TreeWalkers['simpletree']},
11
+ 'rexml' =>
12
+ {:builder => HTML5::TreeBuilders['rexml'],
13
+ :walker => HTML5::TreeWalkers['rexml']},
14
+ 'hpricot' =>
15
+ {:builder => HTML5::TreeBuilders['hpricot'],
16
+ :walker => HTML5::TreeWalkers['hpricot']},
17
+ }
18
+
19
+ puts 'Testing tree walkers: ' + $tree_types_to_test.keys * ', '
20
+
21
# Generates one test per tree-construction fixture entry and per tree type
# listed in $tree_types_to_test. Each test parses the input with that tree
# type's builder, walks the result with its walker, renders the walker
# tokens as an indented outline and compares that with the fixture.
# NOTE(review): TestData and sortattrs are presumably supplied by
# HTML5::TestSupport -- confirm.
class TestTreeWalkers < Test::Unit::TestCase
  include HTML5::TestSupport

  # Yields the tokens of +tokens+ in order, with every run of adjacent
  # :Characters / :SpaceCharacters tokens replaced by one :Characters token
  # holding the concatenated data.
  def concatenateCharacterTokens(tokens)
    charactersToken = nil
    tokens.each do |token|
      if [:Characters, :SpaceCharacters].include?(token[:type])
        if charactersToken
          charactersToken[:data] += token[:data]
        else
          charactersToken = {:type => :Characters, :data => token[:data]}
        end
      else
        # A non-text token ends the current run: emit the pending text first.
        if charactersToken
          yield charactersToken
          charactersToken = nil
        end
        yield token
      end
    end
    yield charactersToken if charactersToken
  end

  # Renders walker tokens as an outline, one line per node or attribute,
  # indented two spaces per nesting level, and returns it as one string.
  # 'xmlns' attributes are left out; token types without a branch below are
  # ignored.
  def convertTokens(tokens)
    output = []
    indent = 0
    concatenateCharacterTokens(tokens) do |token|
      case token[:type]
      when :StartTag, :EmptyTag
        output << "#{' '*indent}<#{token[:name]}>"
        indent += 2
        token[:data].to_a.sort.each do |name, value|
          next if name=='xmlns'
          output << "#{' '*indent}#{name}=\"#{value}\""
        end
        # An empty tag has no matching :EndTag to undo the indent.
        indent -= 2 if token[:type] == :EmptyTag
      when :EndTag
        indent -= 2
      when :Comment
        output << "#{' '*indent}<!-- #{token[:data]} -->"
      when :Doctype
        # String#any? exists only on Ruby 1.8 (where String is Enumerable);
        # !empty? gives the same answer for strings on every version.
        if token[:name] && !token[:name].empty?
          output << "#{' '*indent}<!DOCTYPE #{token[:name]}>"
        else
          output << "#{' '*indent}<!DOCTYPE >"
        end
      when :Characters, :SpaceCharacters
        output << "#{' '*indent}\"#{token[:data]}\""
      else
        # TODO: what to do with errors?
      end
    end
    return output.join("\n")
  end

  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')
    next if test_name == 'tests5' # TODO

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      # Drop the "| " prefix from every fixture line (the first line's
      # prefix is removed by the [2..-1] slice).
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name, tree_class|

        define_method "test_#{test_name}_#{index}_#{tree_name}" do

          parser = HTML5::HTMLParser.new(:tree => tree_class[:builder])

          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          document = parser.tree.get_document

          begin
            output = sortattrs(convertTokens(tree_class[:walker].new(document)))
            # Use a local: +expected+ is a closure variable shared by the
            # tests generated for every tree type.
            sorted_expected = sortattrs(expected)
            assert_equal sorted_expected, output, [
              '', 'Input:', input,
              '', 'Expected:', sorted_expected,
              '', 'Recieved:', output
            ].join("\n")
          rescue NotImplementedError
            # Amnesty for those that confess...
          end
        end
      end
    end
  end
end
@@ -0,0 +1,63 @@
1
+ require 'html5/constants'
2
+
3
# Converts the token hashes produced by a tokenizer into the flat array
# format used by the tokenizer test fixtures, e.g. ["StartTag", name, attrs],
# ["Character", data] or the bare string "ParseError".
class TokenizerTestParser
  # tokenizer - any object whose #each yields token hashes that have a
  #             :type key.
  def initialize(tokenizer)
    @tokenizer = tokenizer
  end

  # Walks the tokenizer and returns the converted tokens as an array.
  #
  # Each token is dispatched to the method named "process" followed by its
  # :type (e.g. :StartTag -> processStartTag); a token whose type has no
  # such method raises NoMethodError. The output list is reset on every call.
  def parse
    @outputTokens = []

    @tokenizer.each do |token|
      send("process#{token[:type]}", token)
    end

    @outputTokens
  end

  def processDoctype(token)
    @outputTokens.push(["DOCTYPE", token[:name], token[:publicId],
      token[:systemId], token[:correct]])
  end

  def processStartTag(token)
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An empty tag is recorded as a StartTag, preceded by a "ParseError"
  # entry when the element is not listed in HTML5::VOID_ELEMENTS.
  def processEmptyTag(token)
    unless HTML5::VOID_ELEMENTS.include? token[:name]
      @outputTokens.push("ParseError")
    end
    @outputTokens.push(["StartTag", token[:name], token[:data]])
  end

  # An end tag whose :data is non-empty is preceded by a "ParseError" entry.
  def processEndTag(token)
    processParseError(token) if token[:data].length > 0
    @outputTokens.push(["EndTag", token[:name]])
  end

  def processComment(token)
    @outputTokens.push(["Comment", token[:data]])
  end

  # Text of either kind is recorded as a "Character" entry.
  def processCharacters(token)
    @outputTokens.push(["Character", token[:data]])
  end

  alias processSpaceCharacters processCharacters

  # NOTE(review): #parse only dispatches to "process<Type>" names, so this
  # snake_case method is reached only for a token type of :_eof -- confirm
  # whether the tokenizer emits an EOF token that needs handling.
  def process_eof(token)
  end

  def processParseError(token)
    @outputTokens.push("ParseError")
  end
end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.2
3
+ specification_version: 1
4
+ name: html5
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2007-08-07 00:00:00 -07:00
8
+ summary: HTML5 parser/tokenizer.
9
+ require_paths:
10
+ - lib
11
+ email: ryan@theryanking.com
12
+ homepage: http://code.google.com/p/html5lib
13
+ rubyforge_project: html5
14
+ description: A ruby based HTML parser/tokenizer based on the WHATWG HTML5 specification for maximum compatibility with major desktop web browsers.
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Ryan King
31
+ files:
32
+ - History.txt
33
+ - Manifest.txt
34
+ - README
35
+ - Rakefile.rb
36
+ - lib/html5.rb
37
+ - lib/html5/constants.rb
38
+ - lib/html5/filters/base.rb
39
+ - lib/html5/filters/inject_meta_charset.rb
40
+ - lib/html5/filters/optionaltags.rb
41
+ - lib/html5/filters/sanitizer.rb
42
+ - lib/html5/filters/whitespace.rb
43
+ - lib/html5/html5parser.rb
44
+ - lib/html5/html5parser/after_body_phase.rb
45
+ - lib/html5/html5parser/after_frameset_phase.rb
46
+ - lib/html5/html5parser/after_head_phase.rb
47
+ - lib/html5/html5parser/before_head_phase.rb
48
+ - lib/html5/html5parser/in_body_phase.rb
49
+ - lib/html5/html5parser/in_caption_phase.rb
50
+ - lib/html5/html5parser/in_cell_phase.rb
51
+ - lib/html5/html5parser/in_column_group_phase.rb
52
+ - lib/html5/html5parser/in_frameset_phase.rb
53
+ - lib/html5/html5parser/in_head_phase.rb
54
+ - lib/html5/html5parser/in_row_phase.rb
55
+ - lib/html5/html5parser/in_select_phase.rb
56
+ - lib/html5/html5parser/in_table_body_phase.rb
57
+ - lib/html5/html5parser/in_table_phase.rb
58
+ - lib/html5/html5parser/initial_phase.rb
59
+ - lib/html5/html5parser/phase.rb
60
+ - lib/html5/html5parser/root_element_phase.rb
61
+ - lib/html5/html5parser/trailing_end_phase.rb
62
+ - lib/html5/inputstream.rb
63
+ - lib/html5/liberalxmlparser.rb
64
+ - lib/html5/sanitizer.rb
65
+ - lib/html5/serializer.rb
66
+ - lib/html5/serializer/htmlserializer.rb
67
+ - lib/html5/serializer/xhtmlserializer.rb
68
+ - lib/html5/tokenizer.rb
69
+ - lib/html5/treebuilders.rb
70
+ - lib/html5/treebuilders/base.rb
71
+ - lib/html5/treebuilders/hpricot.rb
72
+ - lib/html5/treebuilders/rexml.rb
73
+ - lib/html5/treebuilders/simpletree.rb
74
+ - lib/html5/treewalkers.rb
75
+ - lib/html5/treewalkers/base.rb
76
+ - lib/html5/treewalkers/hpricot.rb
77
+ - lib/html5/treewalkers/rexml.rb
78
+ - lib/html5/treewalkers/simpletree.rb
79
+ - parse.rb
80
+ - tests/preamble.rb
81
+ - tests/test_encoding.rb
82
+ - tests/test_lxp.rb
83
+ - tests/test_parser.rb
84
+ - tests/test_sanitizer.rb
85
+ - tests/test_serializer.rb
86
+ - tests/test_stream.rb
87
+ - tests/test_tokenizer.rb
88
+ - tests/test_treewalkers.rb
89
+ - tests/tokenizer_test_parser.rb
90
+ test_files: []
91
+
92
+ rdoc_options: []
93
+
94
+ extra_rdoc_files: []
95
+
96
+ executables: []
97
+
98
+ extensions: []
99
+
100
+ requirements: []
101
+
102
+ dependencies:
103
+ - !ruby/object:Gem::Dependency
104
+ name: chardet
105
+ version_requirement:
106
+ version_requirements: !ruby/object:Gem::Version::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: 0.9.0
111
+ version:
112
+ - !ruby/object:Gem::Dependency
113
+ name: hoe
114
+ version_requirement:
115
+ version_requirements: !ruby/object:Gem::Version::Requirement
116
+ requirements:
117
+ - - ">="
118
+ - !ruby/object:Gem::Version
119
+ version: 1.2.0
120
+ version: