html5 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. data/History.txt +3 -0
  2. data/Manifest.txt +58 -0
  3. data/README +9 -0
  4. data/Rakefile.rb +17 -0
  5. data/lib/html5/constants.rb +818 -0
  6. data/lib/html5/filters/base.rb +10 -0
  7. data/lib/html5/filters/inject_meta_charset.rb +82 -0
  8. data/lib/html5/filters/optionaltags.rb +198 -0
  9. data/lib/html5/filters/sanitizer.rb +15 -0
  10. data/lib/html5/filters/whitespace.rb +36 -0
  11. data/lib/html5/html5parser/after_body_phase.rb +46 -0
  12. data/lib/html5/html5parser/after_frameset_phase.rb +34 -0
  13. data/lib/html5/html5parser/after_head_phase.rb +50 -0
  14. data/lib/html5/html5parser/before_head_phase.rb +41 -0
  15. data/lib/html5/html5parser/in_body_phase.rb +607 -0
  16. data/lib/html5/html5parser/in_caption_phase.rb +68 -0
  17. data/lib/html5/html5parser/in_cell_phase.rb +78 -0
  18. data/lib/html5/html5parser/in_column_group_phase.rb +55 -0
  19. data/lib/html5/html5parser/in_frameset_phase.rb +57 -0
  20. data/lib/html5/html5parser/in_head_phase.rb +138 -0
  21. data/lib/html5/html5parser/in_row_phase.rb +87 -0
  22. data/lib/html5/html5parser/in_select_phase.rb +84 -0
  23. data/lib/html5/html5parser/in_table_body_phase.rb +83 -0
  24. data/lib/html5/html5parser/in_table_phase.rb +110 -0
  25. data/lib/html5/html5parser/initial_phase.rb +134 -0
  26. data/lib/html5/html5parser/phase.rb +158 -0
  27. data/lib/html5/html5parser/root_element_phase.rb +42 -0
  28. data/lib/html5/html5parser/trailing_end_phase.rb +35 -0
  29. data/lib/html5/html5parser.rb +248 -0
  30. data/lib/html5/inputstream.rb +654 -0
  31. data/lib/html5/liberalxmlparser.rb +158 -0
  32. data/lib/html5/sanitizer.rb +188 -0
  33. data/lib/html5/serializer/htmlserializer.rb +180 -0
  34. data/lib/html5/serializer/xhtmlserializer.rb +20 -0
  35. data/lib/html5/serializer.rb +2 -0
  36. data/lib/html5/tokenizer.rb +968 -0
  37. data/lib/html5/treebuilders/base.rb +334 -0
  38. data/lib/html5/treebuilders/hpricot.rb +231 -0
  39. data/lib/html5/treebuilders/rexml.rb +208 -0
  40. data/lib/html5/treebuilders/simpletree.rb +185 -0
  41. data/lib/html5/treebuilders.rb +24 -0
  42. data/lib/html5/treewalkers/base.rb +154 -0
  43. data/lib/html5/treewalkers/hpricot.rb +48 -0
  44. data/lib/html5/treewalkers/rexml.rb +48 -0
  45. data/lib/html5/treewalkers/simpletree.rb +48 -0
  46. data/lib/html5/treewalkers.rb +26 -0
  47. data/lib/html5.rb +13 -0
  48. data/parse.rb +217 -0
  49. data/tests/preamble.rb +82 -0
  50. data/tests/test_encoding.rb +35 -0
  51. data/tests/test_lxp.rb +263 -0
  52. data/tests/test_parser.rb +68 -0
  53. data/tests/test_sanitizer.rb +142 -0
  54. data/tests/test_serializer.rb +68 -0
  55. data/tests/test_stream.rb +62 -0
  56. data/tests/test_tokenizer.rb +94 -0
  57. data/tests/test_treewalkers.rb +116 -0
  58. data/tests/tokenizer_test_parser.rb +63 -0
  59. metadata +120 -0
@@ -0,0 +1,26 @@
1
+ require 'html5/treewalkers/base'
2
+
3
module HTML5
  # Lookup point for the tree walker implementations, keyed by tree type name.
  module TreeWalkers

    class << self
      # Returns the TreeWalker class registered for +name+, requiring its
      # implementation file only when that walker is actually asked for.
      #
      # name - any object; its to_s is compared case-insensitively against
      #        'simpletree', 'rexml' and 'hpricot'.
      #
      # Raises RuntimeError ("Unknown TreeWalker <name>") for any other name.
      def [](name)
        kind = name.to_s.downcase
        if kind == 'simpletree'
          require 'html5/treewalkers/simpletree'
          SimpleTree::TreeWalker
        elsif kind == 'rexml'
          require 'html5/treewalkers/rexml'
          REXML::TreeWalker
        elsif kind == 'hpricot'
          require 'html5/treewalkers/hpricot'
          Hpricot::TreeWalker
        else
          raise "Unknown TreeWalker #{name}"
        end
      end

      # Readable alias for the bracket lookup.
      alias :get_tree_walker :[]
    end
  end
end
data/lib/html5.rb ADDED
@@ -0,0 +1,13 @@
1
+ require 'html5/html5parser'
2
+ require 'html5/version'
3
+
4
module HTML5

  # Parses +stream+ as a complete document by delegating to HTMLParser.parse.
  #
  # stream  - the input handed to the parser unchanged.
  # options - options hash handed to the parser unchanged.
  def self.parse(stream, options={})
    HTMLParser.parse(stream, options)
  end

  # Parses +stream+ as a document fragment.
  #
  # This used to forward to HTMLParser.parse, so fragments were parsed as
  # whole documents; it now forwards to the fragment entry point.
  # NOTE(review): assumes HTMLParser exposes a class-level parse_fragment
  # with the same (stream, options) signature as HTMLParser.parse - confirm.
  def self.parse_fragment(stream, options={})
    HTMLParser.parse_fragment(stream, options)
  end
end
data/parse.rb ADDED
@@ -0,0 +1,217 @@
1
+ #!/usr/bin/env ruby
2
+ #
3
+ # Parse a document to a simpletree tree, with optional profiling
4
+
5
# Put the script's own directory and the lib directory next to it at the
# front of the load path. The lib entry is joined onto the script directory
# so the library is found regardless of the current working directory
# (a bare 'lib' entry only worked when run from the script's directory).
$:.unshift File.dirname(__FILE__), File.join(File.dirname(__FILE__), 'lib')
6
+
7
# Parses the input named by the last command-line argument and prints,
# times or profiles the run according to +opts+.
#
# opts - options object; this method reads treebuilder, output, parsemethod,
#        profile and time from it (printOutput reads further fields).
# args - command-line arguments; the last one names the input: an http:// or
#        https:// URL, '-' for standard input, or a file path.
#
# Writes a message to stderr and exits with status 1 when no input is named.
def parse(opts, args)
  encoding = nil

  source = args[-1]
  unless source
    $stderr.write("No filename provided. Use -h for help\n")
    exit(1)
  end

  begin
    if source =~ %r{\Ahttps?://}
      # https is accepted as well as http; previously an https URL fell
      # through to the file branch.
      require 'open-uri'
      source = URI.parse(source).open
      encoding = source.charset
    elsif source == '-'
      source = $stdin
    else
      # File.open rather than Kernel#open: the latter runs a subprocess when
      # the argument starts with '|'.
      source = File.open(source)
    end
  rescue
    # NOTE(review): failures to open are swallowed, which leaves +source+ as
    # the raw argument string and lets the parser treat it as markup. Kept
    # as-is because it may be deliberate - confirm before tightening.
  end

  require 'html5/treebuilders'
  treebuilder = HTML5::TreeBuilders[opts.treebuilder]

  if opts.output == :xml
    require 'html5/liberalxmlparser'
    parser = HTML5::XHTMLParser.new(:tree=>treebuilder)
  else
    require 'html5/html5parser'
    parser = HTML5::HTMLParser.new(:tree=>treebuilder)
  end

  # Fragment parsing takes an extra container element name ('div').
  parse_args = if opts.parsemethod == :parse
    [source, encoding]
  else
    [source, 'div', encoding]
  end

  if opts.profile
    require 'profiler'
    Profiler__::start_profile
    parser.send(opts.parsemethod, *parse_args)
    Profiler__::stop_profile
    Profiler__::print_profile($stderr)
  elsif opts.time
    require 'time' # TODO: switch to benchmark
    t0 = Time.new
    document = parser.send(opts.parsemethod, *parse_args)
    t1 = Time.new
    printOutput(parser, document, opts)
    t2 = Time.new
    puts "\n\nRun took: %fs (plus %fs to print the output)"%[t1-t0, t2-t1]
  else
    document = parser.send(opts.parsemethod, *parse_args)
    printOutput(parser, document, opts)
  end
end
65
+
66
# Writes the parse result to standard output in the format selected by
# opts.output (:xml, :html, :hilite or :tree; anything else prints nothing),
# preceded by the detected encoding when opts.encoding is set and followed
# by the list of parse errors when opts.error is set.
def printOutput(parser, document, opts)
  if opts.encoding
    puts "Encoding: #{parser.tokenizer.stream.char_encoding}"
  end

  format = opts.output
  if :xml == format
    print document
  elsif :html == format
    require 'html5/treewalkers'
    tokens = HTML5::TreeWalkers[opts.treebuilder].new(document)
    require 'html5/serializer'
    puts HTML5::HTMLSerializer.serialize(tokens, opts.serializer)
  elsif :hilite == format
    print document.hilite
  elsif :tree == format
    # A single node is wrapped so that fragments and documents print alike.
    fragments = document.respond_to?(:each) ? document : [document]
    fragments.each { |fragment| puts parser.tree.testSerializer(fragment) }
  end

  return unless opts.error

  # Each error is a ([line, column], message) pair.
  report = parser.errors.map do |pos, message|
    "Line %i Col %i"%pos + " " + message
  end
  $stdout.write("\nParse errors:\n" + report.join("\n")+"\n")
end
92
+
93
# Default settings; the command-line switches below overwrite them in place.
require 'ostruct'
options = OpenStruct.new(
  :profile     => false,
  :time        => false,
  :output      => :html,
  :treebuilder => 'simpletree',
  :error       => false,
  :encoding    => false,
  :parsemethod => :parse,
  :serializer  => {
    :encoding => 'utf-8',
    :omit_optional_tags => false,
    :inject_meta_charset => false
  }
)

# Command-line definition. Switches are grouped under the same headings the
# help text shows; serializer switches are stored in options.serializer.
require 'optparse'
opts = OptionParser.new do |o|
  o.separator ""
  o.separator "Parse Options:"

  o.on("-b", "--treebuilder NAME") { |name| options.treebuilder = name }
  o.on("-f", "--fragment", "Parse as a fragment") { |_| options.parsemethod = :parse_fragment }

  o.separator ""
  o.separator "Filter Options:"

  o.on("--[no-]inject-meta-charset", "inject <meta charset>") { |flag| options.serializer[:inject_meta_charset] = flag }
  o.on("--[no-]strip-whitespace", "strip unnecessary whitespace") { |flag| options.serializer[:strip_whitespace] = flag }
  o.on("--[no-]sanitize", "escape unsafe tags") { |flag| options.serializer[:sanitize] = flag }

  o.separator ""
  o.separator "Output Options:"

  o.on("--tree", "output as debug tree") { |_| options.output = :tree }

  # XML output always uses the rexml tree builder.
  o.on("-x", "--xml", "output as xml") do |_|
    options.output = :xml
    options.treebuilder = "rexml"
  end

  o.on("--[no-]html", "Output as html") { |flag| options.output = (flag ? :html : nil) }
  o.on("--hilite", "Output as formatted highlighted code.") { |_| options.output = :hilite }
  o.on("-e", "--error", "Print a list of parse errors") { |flag| options.error = flag }

  o.separator ""
  o.separator "Serialization Options:"

  o.on("--[no-]omit-optional-tags", "Omit optional tags") { |flag| options.serializer[:omit_optional_tags] = flag }
  o.on("--[no-]quote-attr-values", "Quote attribute values") { |flag| options.serializer[:quote_attr_values] = flag }
  o.on("--[no-]use-best-quote-char", "Use best quote character") { |flag| options.serializer[:use_best_quote_char] = flag }
  o.on("--quote-char C", "Use specified quote character") { |char| options.serializer[:quote_char] = char }
  o.on("--[no-]minimize-boolean-attributes", "Minimize boolean attributes") { |flag| options.serializer[:minimize_boolean_attributes] = flag }
  o.on("--[no-]use-trailing-solidus", "Use trailing solidus") { |flag| options.serializer[:use_trailing_solidus] = flag }
  o.on("--[no-]escape-lt-in-attrs", "Escape less than signs in attribute values") { |flag| options.serializer[:escape_lt_in_attrs] = flag }
  o.on("--[no-]escape-rcdata", "Escape rcdata element values") { |flag| options.serializer[:escape_rcdata] = flag }

  o.separator ""
  o.separator "Other Options:"

  o.on("-p", "--[no-]profile", "Profile the run") { |flag| options.profile = flag }
  o.on("-t", "--[no-]time", "Time the run") { |flag| options.time = flag }
  o.on("-c", "--[no-]encoding", "Print character encoding used") { |flag| options.encoding = flag }

  o.on_tail("-h", "--help", "Show this message") do
    puts o
    exit
  end
end

# parse! strips the recognised switches from ARGV, leaving the input name.
opts.parse!(ARGV)
parse options, ARGV
data/tests/preamble.rb ADDED
@@ -0,0 +1,82 @@
1
+ require 'test/unit'
2
+
3
# Directory three levels above this file.
HTML5_BASE = File.dirname(File.dirname(File.dirname(File.expand_path(__FILE__))))

# Use HTML5_BASE/testdata when it exists; otherwise fall back to the
# testdata directory two levels above this file.
# File.exist? is used because File.exists? was removed in Ruby 3.2.
if File.exist?(File.join(HTML5_BASE, 'testdata'))
  TESTDATA_DIR = File.join(HTML5_BASE, 'testdata')
else
  TESTDATA_DIR = File.join(File.dirname(File.dirname(File.expand_path(__FILE__))), 'testdata')
end
10
+
11
+ $:.unshift File.join(File.dirname(File.dirname(__FILE__)),'lib')
12
+
13
+ $:.unshift File.dirname(__FILE__)
14
+
15
# Lists the fixture files (names containing a dot) directly inside the given
# subdirectory of TESTDATA_DIR.
def html5_test_files(subdirectory)
  pattern = File.join(TESTDATA_DIR, subdirectory, '*.*')
  Dir.glob(pattern)
end
18
+
19
begin
  require 'rubygems'
  require 'json'
rescue LoadError
  # Minimal stand-in, defined only when the json library cannot be loaded.
  class JSON
    # Turns JSON text into Ruby objects by rewriting it into Ruby hash/array
    # literal syntax and evaluating the result.
    #
    # The rewriting is done on a copy, so the caller's string is left
    # untouched and frozen strings are accepted.
    #
    # NOTE(review): this uses eval, so it must only ever be given trusted
    # fixture data, never external input.
    def self.parse json
      ruby_source = json.gsub(/\$/, "\\$")
      ruby_source.gsub!(/"\s*:/, '"=>')
      ruby_source.gsub!(/\\u[0-9a-fA-F]{4}/) {|x| [x[2..-1].to_i(16)].pack('U')}
      null = nil # lets a JSON null evaluate to nil inside eval
      eval ruby_source
    end
  end
end
33
+
34
module HTML5
  # Helpers mixed into the test cases: tree-dump normalisation and a reader
  # for the sectioned fixture files.
  module TestSupport
    # convert the output of str(document) to the format used in the testcases
    #
    # Drops the first line of the dump and removes the three-character prefix
    # from every following line that starts with '|'.
    def convertTreeDump(treedump)
      body = treedump.split(/\n/)[1..-1]
      body.map { |line| (line.length > 2 && line[0] == ?|) ? line[3..-1] : line }.join("\n")
    end

    # Sorts each run of consecutive, equally indented "name=..." lines so
    # that attribute order does not affect comparisons.
    def sortattrs(output)
      output.gsub(/^(\s+)\w+=.*(\n\1\w+=.*)+/) do |match|
        match.split("\n").sort.join("\n")
      end
    end

    # Enumerates the records of a fixture file. A record is a group of
    # "#heading" sections; each record is yielded as an array holding the
    # text of the requested sections in the requested order (nil for a
    # section the record does not contain).
    class TestData
      include Enumerable

      # filename - path of the fixture file; opened immediately, so a missing
      #            file raises here. The handle stays open with the object.
      # sections - section headings to collect; the first one starts a new
      #            record.
      def initialize(filename, sections)
        @f = File.open(filename)
        @sections = sections
      end

      # Yields one array per record. An empty file, or one without any known
      # heading, yields nothing (it used to yield one all-nil record).
      def each
        @f.rewind # allow the data to be enumerated more than once
        data = {}
        key = nil
        @f.each_line do |line|
          # line[1..-2] is the heading with '#' and the trailing newline removed
          if line[0] == ?# and @sections.include?(line[1..-2])
            heading = line[1..-2]
            if !data.empty? and heading == @sections[0]
              data[key].chomp! #Remove trailing newline
              yield normaliseOutput(data)
              data = {}
            end
            key = heading
            data[key] = ""
          elsif key
            data[key] += line
          end
        end
        yield normaliseOutput(data) unless data.empty?
      end

      # Strips one trailing newline from every collected section and returns
      # the section texts in the order given to the constructor.
      def normaliseOutput(data)
        data.each_value { |text| text.chomp! }
        @sections.map { |heading| data[heading] }
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/inputstream'
4
+
5
# Encoding-detection tests: one optional chardet-based test plus one
# generated test per record in the 'encoding' fixture files.
class Html5EncodingTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  # The chardet test is only defined when the detector library loads. The
  # rescue belongs to this begin block, which is where the require can fail;
  # it used to be attached to the method body, so a missing library raised
  # LoadError while the class was being defined.
  begin
    require 'rubygems'
    require 'UniversalDetector'

    def test_chardet
      path = File.join(TESTDATA_DIR, 'encoding', 'chardet', 'test_big5.txt')
      # Block form closes the fixture file once the assertion has run.
      File.open(path, 'r') do |file|
        stream = HTML5::HTMLInputStream.new(file, :chardet => true)
        assert_equal 'big5', stream.char_encoding.downcase
      end
    end
  rescue LoadError
    puts "chardet not found, skipping chardet tests"
  end

  html5_test_files('encoding').each do |test_file|
    # Test names are built from the fixture file name without '.dat' or dashes.
    test_name = File.basename(test_file).sub('.dat', '').tr('-', '')

    TestData.new(test_file, %w(data encoding)).
      each_with_index do |(input, encoding), index|

      define_method 'test_%s_%d' % [ test_name, index + 1 ] do
        stream = HTML5::HTMLInputStream.new(input, :chardet => false)
        assert_equal encoding.downcase, stream.char_encoding.downcase, input
      end
    end
  end

end
data/tests/test_lxp.rb ADDED
@@ -0,0 +1,263 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/liberalxmlparser'
4
+
5
# Matches a start or empty-element tag that has at least one double-quoted
# attribute: group 1 is the tag name (with trailing whitespace), group 2 the
# attribute list, group 3 an optional '/'.
XMLELEM = /<(\w+\s*)((?:[-:\w]+="[^"]*"\s*)+)(\/?)>/

# Parses +input+ (minus one trailing newline) with +parser+, with element and
# attribute lowercasing turned off, and compares the serialised root element,
# with single quotes turned into double quotes, against the expectation.
#
# With +expected+ given, the comparison is exact. Without it, the input
# itself is the expectation: decimal character references in it are decoded
# and attributes are sorted within each tag on both sides.
def assert_xml_equal(input, expected=nil, parser=HTML5::XMLParser)
  source = input.chomp
  root = parser.parse(source, :lowercase_attr_name => false, :lowercase_element_name => false).root
  actual = root.to_s.gsub(/'/,'"')

  if expected
    assert_equal(expected, actual)
  else
    wanted = source.gsub(XMLELEM) { "<#{$1+$2.split.sort.join(' ')+$3}>" }
    wanted = wanted.gsub(/&#(\d+);/) { [$1.to_i].pack('U') }
    actual = actual.gsub(XMLELEM) { "<#{$1+$2.split.sort.join(' ')+$3}>" }
    assert_equal(wanted, actual)
  end
end

# Same check as assert_xml_equal, defaulting to the XHTML parser.
def assert_xhtml_equal(input, expected=nil, parser=HTML5::XHTMLParser)
  assert_xml_equal input, expected, parser
end
23
+
24
# Checks that the XHTML parser wraps loose markup in html/head/body and
# repairs it, comparing against an explicit expected serialisation.
class BasicXhtml5Test < Test::Unit::TestCase

  # Mis-nested inline end tags come out properly nested.
  def test_title_body_mismatched_close
    markup = '<title>Xhtml</title><b><i>content</b></i>'
    wanted = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>Xhtml</title></head>',
      '<body><b><i>content</i></b></body>',
      '</html>'
    ].join
    assert_xhtml_equal(markup, wanted)
  end

  # A named character reference without its semicolon becomes U+00F1.
  def test_title_body_named_charref
    markup = '<title>ntilde</title>A &ntilde B'
    wanted = [
      '<html xmlns="http://www.w3.org/1999/xhtml">',
      '<head><title>ntilde</title></head>',
      '<body>A ', [0xF1].pack('U'), ' B</body>',
      '</html>'
    ].join
    assert_xhtml_equal(markup, wanted)
  end
end
44
+
45
# Small XML parser checks; each pairs an input with the expected
# serialisation (or, for the comment test, expects the input back).
class BasicXmlTest < Test::Unit::TestCase

  # No explicit expectation: the output is compared with the input.
  def test_comment
    assert_xml_equal "<x><!-- foo --></x>"
  end

  # The CDATA section comes out as plain text.
  def test_cdata
    assert_xml_equal "<x><![CDATA[foo]]></x>", "<x>foo</x>"
  end

  def test_simple_text
    assert_xml_equal "<p>foo</p>", "<p>foo</p>"
  end

  # The missing end tag is supplied.
  def test_optional_close
    assert_xml_equal "<p>foo", "<p>foo</p>"
  end

  # Mis-nested end tags come out properly nested.
  def test_html_mismatched
    assert_xml_equal "<b><i>foo</b></i>", "<b><i>foo</i></b>"
  end
end
67
+
68
# OPML snippets run through the XML parser: mixed-case names are expected to
# survive unchanged and a bare ampersand is expected to be escaped.
class OpmlTest < Test::Unit::TestCase

  # No explicit expectation: the output is compared with the input.
  def test_mixedCaseElement
    markup = [
      '<opml version="1.0">',
      '<head><ownerName>Dave Winer</ownerName></head>',
      '</opml>'
    ].join
    assert_xml_equal(markup)
  end

  # No explicit expectation: the output is compared with the input.
  def test_mixedCaseAttribute
    markup = [
      '<opml version="1.0">',
      '<body><outline isComment="true"/></body>',
      '</opml>'
    ].join
    assert_xml_equal(markup)
  end

  # The unescaped '&' in the attribute value is serialised as '&amp;'.
  def test_malformed
    markup = [
      '<opml version="1.0">',
      '<body><outline text="Odds & Ends"/></body>',
      '</opml>'
    ].join
    wanted = [
      '<opml version="1.0">',
      '<body><outline text="Odds &amp; Ends"/></body>',
      '</opml>'
    ].join
    assert_xml_equal(markup, wanted)
  end
end
94
+
95
# Full XHTML documents passed to assert_xhtml_equal as heredocs. A test with
# a single heredoc supplies no expected output, so the input doubles as the
# expectation; test_script_src and test_prolog supply a second heredoc
# (stripped) as the explicit expected output.
+ class XhtmlTest < Test::Unit::TestCase
96
+
# XHTML document embedding MathML markup with decimal character references.
97
+ def test_mathml
98
+ assert_xhtml_equal <<EOX
99
+ <html xmlns="http://www.w3.org/1999/xhtml">
100
+ <head><title>MathML</title></head>
101
+ <body>
102
+ <math xmlns="http://www.w3.org/1998/Math/MathML">
103
+ <mrow>
104
+ <mi>x</mi>
105
+ <mo>=</mo>
106
+
107
+ <mfrac>
108
+ <mrow>
109
+ <mrow>
110
+ <mo>-</mo>
111
+ <mi>b</mi>
112
+ </mrow>
113
+ <mo>&#177;</mo>
114
+ <msqrt>
115
+
116
+ <mrow>
117
+ <msup>
118
+ <mi>b</mi>
119
+ <mn>2</mn>
120
+ </msup>
121
+ <mo>-</mo>
122
+ <mrow>
123
+
124
+ <mn>4</mn>
125
+ <mo>&#8290;</mo>
126
+ <mi>a</mi>
127
+ <mo>&#8290;</mo>
128
+ <mi>c</mi>
129
+ </mrow>
130
+ </mrow>
131
+
132
+ </msqrt>
133
+ </mrow>
134
+ <mrow>
135
+ <mn>2</mn>
136
+ <mo>&#8290;</mo>
137
+ <mi>a</mi>
138
+ </mrow>
139
+ </mfrac>
140
+
141
+ </mrow>
142
+ </math>
143
+ </body></html>
144
+ EOX
145
+ end
146
+
# XHTML document embedding SVG, including an attribute value that spans two lines.
147
+ def test_svg
148
+ assert_xhtml_equal <<EOX
149
+ <html xmlns="http://www.w3.org/1999/xhtml">
150
+ <head><title>SVG</title></head>
151
+ <body>
152
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
153
+ <path d="M38,38c0-12,24-15,23-2c0,9-16,13-16,23v7h11v-4c0-9,17-12,17-27
154
+ c-2-22-45-22-45,3zM45,70h11v11h-11z" fill="#371">
155
+ </path>
156
+ <circle cx="50" cy="50" r="45" fill="none" stroke="#371" stroke-width="10">
157
+ </circle>
158
+
159
+ </svg>
160
+ </body></html>
161
+ EOX
162
+ end
163
+
# SVG using attributes in a prefixed (xlink) namespace.
164
+ def test_xlink
165
+ assert_xhtml_equal <<EOX
166
+ <html xmlns="http://www.w3.org/1999/xhtml">
167
+ <head><title>XLINK</title></head>
168
+ <body>
169
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100">
170
+ <defs xmlns:l="http://www.w3.org/1999/xlink">
171
+ <radialGradient id="s1" fx=".4" fy=".2" r=".7">
172
+ <stop stop-color="#FE8"/>
173
+ <stop stop-color="#D70" offset="1"/>
174
+ </radialGradient>
175
+ <radialGradient id="s2" fx=".8" fy=".5" l:href="#s1"/>
176
+ <radialGradient id="s3" fx=".5" fy=".9" l:href="#s1"/>
177
+ <radialGradient id="s4" fx=".1" fy=".5" l:href="#s1"/>
178
+ </defs>
179
+ <g stroke="#940">
180
+ <path d="M73,29c-37-40-62-24-52,4l6-7c-8-16,7-26,42,9z" fill="url(#s1)"/>
181
+ <path d="M47,8c33-16,48,21,9,47l-6-5c38-27,20-44,5-37z" fill="url(#s2)"/>
182
+ <path d="M77,32c22,30,10,57-39,51l-1-8c3,3,67,5,36-36z" fill="url(#s3)"/>
183
+
184
+ <path d="M58,84c-4,20-38-4-8-24l-6-5c-36,43,15,56,23,27z" fill="url(#s4)"/>
185
+ <path d="M40,14c-40,37-37,52-9,68l1-8c-16-13-29-21,16-56z" fill="url(#s1)"/>
186
+ <path d="M31,33c19,23,20,7,35,41l-9,1.7c-4-19-8-14-31-37z" fill="url(#s2)"/>
187
+ </g>
188
+ </svg>
189
+ </body></html>
190
+ EOX
191
+ end
192
+
# Empty-element syntax for br.
193
+ def test_br
194
+ assert_xhtml_equal <<EOX1
195
+ <html xmlns="http://www.w3.org/1999/xhtml">
196
+ <head><title>BR</title></head>
197
+ <body>
198
+ <br/>
199
+ </body></html>
200
+ EOX1
201
+ end
202
+
# An element with no content written with separate start and end tags.
203
+ def test_strong
204
+ assert_xhtml_equal <<EOX
205
+ <html xmlns="http://www.w3.org/1999/xhtml">
206
+ <head><title>STRONG</title></head>
207
+ <body>
208
+ <strong></strong>
209
+ </body></html>
210
+ EOX
211
+ end
212
+
# Escaped '<' and '&' inside a script element.
213
+ def test_script
214
+ assert_xhtml_equal <<EOX
215
+ <html xmlns="http://www.w3.org/1999/xhtml">
216
+ <head><title>SCRIPT</title></head>
217
+ <body>
218
+ <script>1 &lt; 2 &amp; 3</script>
219
+ </body></html>
220
+ EOX
221
+ end
222
+
# Input (EOX1) has a self-closed script element; the expected output (EOX2)
# has it written with an explicit end tag.
223
+ def test_script_src
224
+ assert_xhtml_equal <<EOX1, <<EOX2.strip
225
+ <html xmlns="http://www.w3.org/1999/xhtml">
226
+ <head><title>SCRIPT</title><script src="http://example.com"/></head>
227
+ <body>
228
+ <script>1 &lt; 2 &amp; 3</script>
229
+ </body></html>
230
+ EOX1
231
+ <html xmlns="http://www.w3.org/1999/xhtml">
232
+ <head><title>SCRIPT</title><script src="http://example.com"></script></head>
233
+ <body>
234
+ <script>1 &lt; 2 &amp; 3</script>
235
+ </body></html>
236
+ EOX2
237
+ end
238
+
# Escaped '<' and '&' inside the title element.
239
+ def test_title
240
+ assert_xhtml_equal <<EOX
241
+ <html xmlns="http://www.w3.org/1999/xhtml">
242
+ <head><title>1 &lt; 2 &amp; 3</title></head>
243
+ <body>
244
+ </body></html>
245
+ EOX
246
+ end
247
+
# Input (EOX1) starts with an XML declaration; the expected output (EOX2)
# is the same document without it.
248
+ def test_prolog
249
+ assert_xhtml_equal <<EOX1, <<EOX2.strip
250
+ <?xml version="1.0" encoding="UTF-8" ?>
251
+ <html xmlns="http://www.w3.org/1999/xhtml">
252
+ <head><title>PROLOG</title></head>
253
+ <body>
254
+ </body></html>
255
+ EOX1
256
+ <html xmlns="http://www.w3.org/1999/xhtml">
257
+ <head><title>PROLOG</title></head>
258
+ <body>
259
+ </body></html>
260
+ EOX2
261
+ end
262
+
263
+ end
@@ -0,0 +1,68 @@
1
+ require File.join(File.dirname(__FILE__), 'preamble')
2
+
3
+ require 'html5/treebuilders'
4
+ require 'html5/html5parser'
5
+
6
+
7
# Tree builders every generated parser test runs against; hpricot is added
# only when the library can be loaded.
$tree_types_to_test = %w(simpletree rexml)

begin
  require 'hpricot'
  $tree_types_to_test << 'hpricot'
rescue LoadError
end

# '-p' is removed from ARGV here; the flag's value is stored but not
# consulted anywhere in this file yet.
$CHECK_PARSER_ERRORS = ARGV.delete('-p') # TODO

puts "Testing tree builders: #{$tree_types_to_test.join(', ')}"
18
+
19
+
20
# Generates one test method per (fixture record, tree builder) pair from the
# 'tree-construction' fixture files. Each test parses the record's input and
# checks the serialised tree and the number of reported parse errors.
class Html5ParserTestCase < Test::Unit::TestCase
  include HTML5
  include TestSupport

  html5_test_files('tree-construction').each do |test_file|
    test_name = File.basename(test_file).sub('.dat', '')
    records = TestData.new(test_file, %w(data errors document-fragment document))

    records.each_with_index do |(input, error_text, inner_html, tree_text), index|
      errors = error_text.split("\n")
      # Remove the "| " line prefixes; the first line's prefix has no leading
      # newline, so it is cut off by position.
      expected = tree_text.gsub("\n| ","\n")[2..-1]

      $tree_types_to_test.each do |tree_name|
        method_name = 'test_%s_%d_%s' % [ test_name, index + 1, tree_name ]

        define_method method_name do
          parser = HTMLParser.new(:tree => TreeBuilders[tree_name])

          # Records that carry a document-fragment section are parsed as
          # fragments inside that container.
          if inner_html
            parser.parse_fragment(input, inner_html)
          else
            parser.parse(input)
          end

          actual_output = convertTreeDump(parser.tree.testSerializer(parser.tree.document))

          tree_report = [
            '', 'Input:', input,
            '', 'Expected:', expected,
            '', 'Recieved:', actual_output
          ].join("\n")
          assert_equal sortattrs(expected), sortattrs(actual_output), tree_report

          actual_errors = parser.errors.map do |(line, col), message|
            'Line: %i Col: %i %s' % [line, col, message]
          end
          error_report = [
            '', 'Input', input,
            '', "Expected errors (#{errors.length}):", errors.join("\n"),
            '', "Actual errors (#{actual_errors.length}):",
            actual_errors.join("\n")
          ].join("\n")
          # Only the number of errors is compared, not their text.
          assert_equal errors.length, parser.errors.length, error_report
        end
      end
    end
  end

end