html5 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,26 @@
1
+ require 'html5/treewalkers/base'
2
+
3
module HTML5
  # Registry that maps a tree type name to its tree-walker class. The file
  # implementing a walker is only required when that walker is asked for.
  module TreeWalkers

    class << self
      # Returns the TreeWalker class registered for +name+.
      #
      # name - anything responding to +to_s+; compared case-insensitively
      #        against 'simpletree', 'rexml' and 'hpricot'.
      #
      # Raises RuntimeError when the name is not one of the known walkers.
      def [](name)
        key = name.to_s.downcase

        if key == 'simpletree'
          require 'html5/treewalkers/simpletree'
          return SimpleTree::TreeWalker
        end

        if key == 'rexml'
          require 'html5/treewalkers/rexml'
          return REXML::TreeWalker
        end

        if key == 'hpricot'
          require 'html5/treewalkers/hpricot'
          return Hpricot::TreeWalker
        end

        raise "Unknown TreeWalker #{name}"
      end

      # Longer, self-describing name for the same lookup.
      alias_method :get_tree_walker, :[]
    end
  end
end
data/lib/html5.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'html5/html5parser'
2
+ require 'html5/version'
3
+
4
# Convenience entry points that forward to the HTMLParser class methods.
module HTML5

  # Parses +stream+ as a complete document.
  #
  # stream  - the markup source, passed through to HTMLParser.parse.
  # options - options hash, passed through unchanged.
  #
  # Returns whatever HTMLParser.parse returns.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a fragment rather than as a whole document.
  #
  # Previously this forwarded to HTMLParser.parse, which made it behave
  # exactly like HTML5.parse.
  # NOTE(review): relies on HTMLParser.parse_fragment(stream, options) being
  # defined in html5/html5parser - confirm against that file.
  #
  # Returns whatever HTMLParser.parse_fragment returns.
  def self.parse_fragment(stream, options={})
    HTMLParser.parse_fragment(stream, options)
  end
end
data/parse.rb ADDED
@@ -0,0 +1,217 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Parse a document to a simpletree tree, with optional profiling
4
+
5
# Put the script's own directory and the lib/ directory next to it at the
# front of the load path. The lib/ entry is joined onto the script directory
# so that it no longer depends on the directory the script is started from.
$:.unshift File.dirname(__FILE__), File.join(File.dirname(__FILE__), 'lib')
6
+
7
# Parses the input named by the last command-line argument and prints the
# result as requested by +opts+.
#
# opts - settings object responding to treebuilder, output, parsemethod,
#        profile and time (plus whatever printOutput reads).
# args - remaining command-line arguments; the last one names the input:
#        an http:// or https:// URL, "-" for standard input, or a file path.
#
# Exits with status 1 when no input argument was given.
def parse(opts, args)
  encoding = nil

  source = args[-1]
  unless source
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    if source =~ %r{\Ahttps?://}
      require 'open-uri'
      source = URI.parse(source).open
      encoding = source.charset
    elsif source == '-'
      source = $stdin
    else
      # File.open, not Kernel#open: the latter would run a subprocess for an
      # argument that starts with "|".
      source = File.open(source)
    end
  rescue
    # Deliberately best-effort: when the argument cannot be opened, +source+
    # is left as the original string and handed to the parser unchanged.
  end

  require 'html5/treebuilders'
  treebuilder = HTML5::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5/liberalxmlparser'
    parser = HTML5::XHTMLParser.new(:tree=>treebuilder)
  else
    require 'html5/html5parser'
    parser = HTML5::HTMLParser.new(:tree=>treebuilder)
  end

  # Fragment parsing takes an extra container element name ('div').
  if opts.parsemethod == :parse
    parse_args = [source, encoding]
  else
    parse_args = [source, 'div', encoding]
  end

  if opts.profile
    # Profiling run: only the profile is reported (on stderr), no document.
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *parse_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    t0 = Time.new
    document = parser.send(opts.parsemethod, *parse_args)
    t1 = Time.new
    printOutput(parser, document, opts)
    t2 = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
  else
    document = parser.send(opts.parsemethod, *parse_args)
    printOutput(parser, document, opts)
  end
end
65
+
66
# Writes the parsed +document+ to standard output in the format selected by
# opts.output, optionally preceded by the detected encoding and followed by
# the list of parse errors.
#
# parser   - the parser that produced +document+; read for tokenizer, tree
#            and errors.
# document - the parse result.
# opts     - settings object responding to encoding, output, treebuilder,
#            serializer and error.
def printOutput(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end

  format = opts.output
  if format == :xml
    print document
  elsif format == :html
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  elsif format == :hilite
    print document.hilite
  elsif format == :tree
    # A single node is wrapped so that both cases can be iterated.
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  # Each error is a (position, message) pair; position feeds the format string.
  report = parser.errors.map do |pos, message|
    ("Line %i Col %i" % pos) + " " + message
  end
  $stdout.write("\nParse errors:\n" + report.join("\n") + "\n")
end
92
+
93
require 'ostruct'

# Default settings; the option handlers below overwrite individual entries.
options = OpenStruct.new(
  :profile     => false,
  :time        => false,
  :output      => :html,
  :treebuilder => 'simpletree',
  :error       => false,
  :encoding    => false,
  :parsemethod => :parse,
  :serializer  => {
    :encoding => 'utf-8',
    :omit_optional_tags => false,
    :inject_meta_charset => false
  }
)

require 'optparse'

# Switches whose only effect is to store their value under a key of
# options.serializer, listed in the order they appear in the help text.
filter_switches = [
  ["--[no-]inject-meta-charset", "inject <meta charset>", :inject_meta_charset],
  ["--[no-]strip-whitespace", "strip unnecessary whitespace", :strip_whitespace],
  ["--[no-]sanitize", "escape unsafe tags", :sanitize]
]

serialization_switches = [
  ["--[no-]omit-optional-tags", "Omit optional tags", :omit_optional_tags],
  ["--[no-]quote-attr-values", "Quote attribute values", :quote_attr_values],
  ["--[no-]use-best-quote-char", "Use best quote character", :use_best_quote_char],
  ["--quote-char C", "Use specified quote character", :quote_char],
  ["--[no-]minimize-boolean-attributes", "Minimize boolean attributes", :minimize_boolean_attributes],
  ["--[no-]use-trailing-solidus", "Use trailing solidus", :use_trailing_solidus],
  ["--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values", :escape_lt_in_attrs],
  ["--[no-]escape-rcdata", "Escape rcdata element values", :escape_rcdata]
]

cli = OptionParser.new do |o|
  o.separator ""
  o.separator "Parse Options:"

  o.on("-b", "--treebuilder NAME") { |name| options.treebuilder = name }
  o.on("-f", "--fragment", "Parse as a fragment") { |_| options.parsemethod = :parse_fragment }

  o.separator ""
  o.separator "Filter Options:"

  filter_switches.each do |switch, description, key|
    o.on(switch, description) { |value| options.serializer[key] = value }
  end

  o.separator ""
  o.separator "Output Options:"

  o.on("--tree", "output as debug tree") { |_| options.output = :tree }

  # XML output also forces the rexml tree builder.
  o.on("-x", "--xml", "output as xml") do |_|
    options.output = :xml
    options.treebuilder = "rexml"
  end

  # --no-html clears the output format, so no document is printed.
  o.on("--[no-]html", "Output as html") { |html| options.output = (html ? :html : nil) }
  o.on("--hilite", "Output as formatted highlighted code.") { |_| options.output = :hilite }
  o.on("-e", "--error", "Print a list of parse errors") { |flag| options.error = flag }

  o.separator ""
  o.separator "Serialization Options:"

  serialization_switches.each do |switch, description, key|
    o.on(switch, description) { |value| options.serializer[key] = value }
  end

  o.separator ""
  o.separator "Other Options:"

  o.on("-p", "--[no-]profile", "Profile the run") { |flag| options.profile = flag }
  o.on("-t", "--[no-]time", "Time the run") { |flag| options.time = flag }
  o.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| options.encoding = flag }

  o.on_tail("-h", "--help", "Show this message") do
    puts o
    exit
  end
end

# parse! removes the recognised switches from ARGV; what is left names the input.
cli.parse!(ARGV)
parse(options, ARGV)
data/tests/preamble.rb ADDED
@@ -0,0 +1,82 @@
1
+ require 'test/unit'
2
+
3
# Directory three levels above this file.
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Prefer a testdata directory under HTML5_BASE; otherwise fall back to one
# located one level above this file's directory.
# File.exist? replaces File.exists?, which newer Ruby versions no longer provide.
if File.exist?(File.join(HTML5_BASE, 'testdata'))
  TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else
  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end

# Make the library under ../lib loadable.
$:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')

# Make the files in this directory loadable.
$:.unshift File.dirname(__FILE__)
14
+
15
# Returns the paths of all files matching '*.*' inside +subdirectory+ of
# TESTDATA_DIR.
def html5_test_files(subdirectory)
  pattern = File.join(TESTDATA_DIR, subdirectory, '*.*')
  Dir.glob(pattern)
end
18
+
19
begin
  require 'rubygems'
  require 'json'
rescue LoadError
  # Minimal stand-in that is only defined when the json library cannot be
  # loaded.
  class JSON
    # Converts a JSON document into Ruby data by rewriting it into Ruby
    # literal syntax and evaluating the result.
    #
    # SECURITY: this uses eval and must only ever be given trusted input
    # (the bundled test fixtures).
    #
    # json - the JSON text; it is no longer modified in place.
    def self.parse json
      # gsub (not gsub!) so that the caller's string is left untouched.
      source = json.gsub(/\$/, "\\$")
      # "key": value  ->  "key"=> value
      source.gsub!(/"\s*:/, '"=>')
      # \uXXXX escapes -> the UTF-8 encoded character
      source.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
      # Lets a bare JSON null evaluate to nil inside the eval below.
      null = nil
      eval source
    end
  end
end
33
+
34
module HTML5
  # Helpers shared by the test cases.
  module TestSupport
    # convert the output of str(document) to the format used in the testcases
    #
    # Drops the first line of +treedump+ and removes the first three
    # characters of every remaining line that starts with "|". An empty dump
    # gives an empty string.
    def convertTreeDump(treedump)
      lines = treedump.split(/\n/)[1..-1] || []
      lines.map { |line| (line.length > 2 && line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, equally indented "name=..." lines so
    # that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

    # Reads a test data file made of "#heading" lines followed by content and
    # enumerates its test cases.
    class TestData
      include Enumerable

      # filename - path of the data file.
      # sections - ordered list of section headings; the first one marks the
      #            start of a new test case.
      def initialize(filename, sections)
        @filename = filename
        @sections = sections
      end

      # Yields one array per test case, holding the content of each section
      # in the order of +sections+ (nil for a section the case lacks).
      #
      # The file is opened for each pass and closed when the pass is done, so
      # the object can be enumerated more than once. A file without any
      # section yields nothing.
      def each
        data = {}
        key = nil
        File.open(@filename) do |file|
          file.each_line do |line|
            heading = line[1..-2]
            if line[0] == ?# && @sections.include?(heading)
              if data.any? && heading == @sections[0]
                data[key].chomp! #Remove trailing newline
                yield normaliseOutput(data)
                data = {}
              end
              key = heading
              data[key] = ''.dup
            elsif key
              data[key] << line
            end
          end
        end
        yield normaliseOutput(data) unless data.empty?
      end

      # Strips one trailing newline from every collected section and returns
      # the section contents in the order of +sections+.
      def normaliseOutput(data)
        #Remove trailing newlines
        data.each_value { |text| text.chomp! }
        @sections.map { |heading| data[heading] }
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/inputstream'
4
+
5
# Tests for the character encoding detection of HTMLInputStream.
class Html5EncodingTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  # The chardet test is only defined when the UniversalDetector library can
  # be loaded. The rescue belongs to this begin block (it used to be attached
  # to the method body), so a missing library skips the test instead of
  # aborting the load of this file.
  begin
    require 'rubygems'
    require 'UniversalDetector'

    def test_chardet
      path = File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt')
      # Block form so the fixture file is closed again.
      File.open(path, 'r') do |file|
        stream = HTML5::HTMLInputStream.new(file, :chardet => true)
        assert_equal 'big5', stream.char_encoding.downcase
      end
    end
  rescue LoadError
    puts "chardet not found, skipping chardet tests"
  end

  # One test method per case in every file of the 'encoding' test data.
  html5_test_files('encoding').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')

    TestData.new(test_file, %w(data encoding)).
      each_with_index do |(input, encoding), index|

      define_method 'test_%s_%d' % [ test_name, index + 1 ] do
        stream = HTML5::HTMLInputStream.new(input, :chardet => false)
        assert_equal encoding.downcase, stream.char_encoding.downcase, input
      end
    end
  end

end
data/tests/test_lxp.rb ADDED
@@ -0,0 +1,263 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/liberalxmlparser'
4
+
5
+ XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/
6
+
7
# Parses +input+ with +parser+ and asserts on the serialised root element.
#
# input    - markup to parse (a trailing newline is removed first).
# expected - exact expected serialisation. When omitted, the input itself is
#            the expectation: decimal character references are decoded and
#            attributes are sorted on both sides before comparing.
# parser   - object whose +parse+ returns something responding to +root+.
#
# Single quotes in the serialised output are turned into double quotes
# before any comparison.
def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
  # Rewrites every element matched by XMLELEM with its attributes sorted.
  sort_attributes = lambda do |markup|
    markup.gsub(XMLELEM) { "<#{$1}#{$2.split.sort.join(' ')}#{$3}>" }
  end

  root = parser.parse(input.chomp, :lowercase_attr_name => false, :lowercase_element_name => false).root
  actual = root.to_s.gsub(/'/,'"')

  if expected
    assert_equal(expected, actual)
  else
    wanted = sort_attributes.call(input.chomp).gsub(/&#(\d+);/) { [$1.to_i].pack('U') }
    assert_equal(wanted, sort_attributes.call(actual))
  end
end
19
+
20
# Same check as assert_xml_equal, but the parser defaults to
# HTML5::XHTMLParser.
def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
  assert_xml_equal input, expected, parser
end
23
+
24
# Checks that small, non-well-formed inputs come out of the XHTML parser as
# a complete, well-formed XHTML document.
class BasicXhtml5Test < Test::Unit::TestCase

  # Mis-nested </b></i> end tags are expected to come out properly nested.
  def test_title_body_mismatched_close
    source = '<title>Xhtml</title><b><i>content</b></i>'
    wanted = '<html xmlns="http://www.w3.org/1999/xhtml">' +
      '<head><title>Xhtml</title></head>' +
      '<body><b><i>content</i></b></body>' +
      '</html>'

    assert_xhtml_equal(source, wanted)
  end

  # A named character reference without its semicolon is expected to be
  # replaced by the character U+00F1.
  def test_title_body_named_charref
    source = '<title>ntilde</title>A &ntilde B'
    wanted = '<html xmlns="http://www.w3.org/1999/xhtml">' +
      '<head><title>ntilde</title></head>' +
      '<body>A '+ [0xF1].pack('U') + ' B</body>' +
      '</html>'

    assert_xhtml_equal(source, wanted)
  end
end
44
+
45
# Basic round-trip and repair checks for the XML parser.
class BasicXmlTest < Test::Unit::TestCase

  # A comment is expected to survive unchanged.
  def test_comment
    assert_xml_equal "<x><!-- foo --></x>"
  end

  # A CDATA section is expected to come out as plain text.
  def test_cdata
    source = "<x><![CDATA[foo]]></x>"
    assert_xml_equal source, "<x>foo</x>"
  end

  def test_simple_text
    source = "<p>foo</p>"
    assert_xml_equal source, "<p>foo</p>"
  end

  # A missing end tag is expected to be supplied.
  def test_optional_close
    source = "<p>foo"
    assert_xml_equal source, "<p>foo</p>"
  end

  # Mis-nested end tags are expected to come out properly nested.
  def test_html_mismatched
    source = "<b><i>foo</b></i>"
    assert_xml_equal source, "<b><i>foo</i></b>"
  end
end
67
+
68
# OPML samples: mixed-case names must be preserved and a bare ampersand in
# an attribute value must be escaped.
class OpmlTest < Test::Unit::TestCase

  # The mixed-case element name ownerName must round-trip unchanged.
  def test_mixedCaseElement
    source = '<opml version="1.0">' +
      '<head><ownerName>Dave Winer</ownerName></head>' +
      '</opml>'

    assert_xml_equal(source)
  end

  # The mixed-case attribute name isComment must round-trip unchanged.
  def test_mixedCaseAttribute
    source = '<opml version="1.0">' +
      '<body><outline isComment="true"/></body>' +
      '</opml>'

    assert_xml_equal(source)
  end

  # An unescaped "&" inside an attribute value is expected to be written
  # back as "&amp;".
  def test_malformed
    source = '<opml version="1.0">' +
      '<body><outline text="Odds & Ends"/></body>' +
      '</opml>'
    wanted = '<opml version="1.0">' +
      '<body><outline text="Odds &amp; Ends"/></body>' +
      '</opml>'

    assert_xml_equal(source, wanted)
  end
end
94
+
95
+ class XhtmlTest < Test::Unit::TestCase
96
+
97
+ def test_mathml
98
+ assert_xhtml_equal <<EOX
99
+ <html xmlns="http://www.w3.org/1999/xhtml">
100
+ <head><title>MathML</title></head>
101
+ <body>
102
+ <math xmlns="http://www.w3.org/1998/Math/MathML">
103
+ <mrow>
104
+ <mi>x</mi>
105
+ <mo>=</mo>
106
+
107
+ <mfrac>
108
+ <mrow>
109
+ <mrow>
110
+ <mo>-</mo>
111
+ <mi>b</mi>
112
+ </mrow>
113
+ <mo>&#177;</mo>
114
+ <msqrt>
115
+
116
+ <mrow>
117
+ <msup>
118
+ <mi>b</mi>
119
+ <mn>2</mn>
120
+ </msup>
121
+ <mo>-</mo>
122
+ <mrow>
123
+
124
+ <mn>4</mn>
125
+ <mo>&#8290;</mo>
126
+ <mi>a</mi>
127
+ <mo>&#8290;</mo>
128
+ <mi>c</mi>
129
+ </mrow>
130
+ </mrow>
131
+
132
+ </msqrt>
133
+ </mrow>
134
+ <mrow>
135
+ <mn>2</mn>
136
+ <mo>&#8290;</mo>
137
+ <mi>a</mi>
138
+ </mrow>
139
+ </mfrac>
140
+
141
+ </mrow>
142
+ </math>
143
+ </body></html>
144
+ EOX
145
+ end
146
+
147
+ def test_svg
148
+ assert_xhtml_equal <<EOX
149
+ <html xmlns="http://www.w3.org/1999/xhtml">
150
+ <head><title>SVG</title></head>
151
+ <body>
152
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
153
+ <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
154
+ c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
155
+ </path>
156
+ <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
157
+ </circle>
158
+
159
+ </svg>
160
+ </body></html>
161
+ EOX
162
+ end
163
+
164
+ def test_xlink
165
+ assert_xhtml_equal <<EOX
166
+ <html xmlns="http://www.w3.org/1999/xhtml">
167
+ <head><title>XLINK</title></head>
168
+ <body>
169
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
170
+ <defs xmlns:l="http://www.w3.org/1999/xlink">
171
+ <radialGradient id="s1" fx=".4" fy=".2" r=".7">
172
+ <stop stop-color="#FE8"/>
173
+ <stop stop-color="#D70" offset="1"/>
174
+ </radialGradient>
175
+ <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
176
+ <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
177
+ <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
178
+ </defs>
179
+ <g stroke="#940">
180
+ <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
181
+ <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
182
+ <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
183
+
184
+ <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
185
+ <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
186
+ <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
187
+ </g>
188
+ </svg>
189
+ </body></html>
190
+ EOX
191
+ end
192
+
193
+ def test_br
194
+ assert_xhtml_equal <<EOX1
195
+ <html xmlns="http://www.w3.org/1999/xhtml">
196
+ <head><title>BR</title></head>
197
+ <body>
198
+ <br/>
199
+ </body></html>
200
+ EOX1
201
+ end
202
+
203
+ def test_strong
204
+ assert_xhtml_equal <<EOX
205
+ <html xmlns="http://www.w3.org/1999/xhtml">
206
+ <head><title>STRONG</title></head>
207
+ <body>
208
+ <strong></strong>
209
+ </body></html>
210
+ EOX
211
+ end
212
+
213
+ def test_script
214
+ assert_xhtml_equal <<EOX
215
+ <html xmlns="http://www.w3.org/1999/xhtml">
216
+ <head><title>SCRIPT</title></head>
217
+ <body>
218
+ <script>1 &lt; 2 &amp; 3</script>
219
+ </body></html>
220
+ EOX
221
+ end
222
+
223
+ def test_script_src
224
+ assert_xhtml_equal <<EOX1, <<EOX2.strip
225
+ <html xmlns="http://www.w3.org/1999/xhtml">
226
+ <head><title>SCRIPT</title><script src="http://example.com"/></head>
227
+ <body>
228
+ <script>1 &lt; 2 &amp; 3</script>
229
+ </body></html>
230
+ EOX1
231
+ <html xmlns="http://www.w3.org/1999/xhtml">
232
+ <head><title>SCRIPT</title><script src="http://example.com"></script></head>
233
+ <body>
234
+ <script>1 &lt; 2 &amp; 3</script>
235
+ </body></html>
236
+ EOX2
237
+ end
238
+
239
+ def test_title
240
+ assert_xhtml_equal <<EOX
241
+ <html xmlns="http://www.w3.org/1999/xhtml">
242
+ <head><title>1 &lt; 2 &amp; 3</title></head>
243
+ <body>
244
+ </body></html>
245
+ EOX
246
+ end
247
+
248
+ def test_prolog
249
+ assert_xhtml_equal <<EOX1, <<EOX2.strip
250
+ <?xml version="1.0" encoding="UTF-8" ?>
251
+ <html xmlns="http://www.w3.org/1999/xhtml">
252
+ <head><title>PROLOG</title></head>
253
+ <body>
254
+ </body></html>
255
+ EOX1
256
+ <html xmlns="http://www.w3.org/1999/xhtml">
257
+ <head><title>PROLOG</title></head>
258
+ <body>
259
+ </body></html>
260
+ EOX2
261
+ end
262
+
263
+ end
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/treebuilders'
4
+ require 'html5/html5parser'
5
+
6
+
7
# Tree builders every parser test is run against; hpricot is only added when
# the library can be loaded.
$tree_types_to_test = %w(simpletree rexml)

begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
  # hpricot is optional; without it only the two built-in builders are tested.
end

# "-p" is removed from the arguments and remembered here.
$CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO

puts "Testing tree builders: #{$tree_types_to_test.join(', ')}"
18
+
19
+
20
# Tree-construction tests: one test method per case in the
# 'tree-construction' test data and per tree builder in $tree_types_to_test.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |test_file|

    test_name = File.basename(test_file).sub('.dat', '')

    TestData.new(test_file, %w(data errors document-fragment document)).
      each_with_index do |(input, errors, inner_html, expected), index|

      # A case without an errors section is treated as expecting no errors
      # instead of failing while the test methods are being defined.
      errors = (errors || '').split("\n")
      # Drop the "| " prefix from every line of the expected tree.
      expected = expected.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        define_method 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ] do

          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])

          # Cases with a document-fragment section are parsed as fragments
          # inside that element.
          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

          assert_equal sortattrs(expected), sortattrs(actual_output), [
            '', 'Input:', input,
            '', 'Expected:', expected,
            '', 'Received:', actual_output
          ].join("\n")

          actual_errors = parser.errors.map do |(line, col), message|
            'Line: %i Col: %i %s' % [line, col, message]
          end
          # Only the number of errors is compared, not their text.
          assert_equal errors.length, parser.errors.length, [
            '', 'Input', input,
            '', "Expected errors (#{errors.length}):", errors.join("\n"),
            '', "Actual errors (#{actual_errors.length}):",
            actual_errors.join("\n")
          ].join("\n")

        end
      end
    end
  end

end