htmltools 1.10

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,201 @@
1
+ #!/usr/bin/ruby
2
+ require 'html/tree'
3
+ require 'test/unit'
4
+
5
# Test-only additions to the HTMLTree library: expose two parser instance
# variables through readers and add a short alias for child access so the
# assertions in this file stay compact.
module HTMLTree
  class Parser
    # Readers for @currentNode and @rootNode, used by the tests below.
    attr_reader :currentNode, :rootNode
  end

  module TreeElement
    # +ch+ is shorthand for +children+.
    alias_method :ch, :children
  end
end
15
+
16
# Tests for HTMLTree::Parser. Each test feeds markup to a fresh parser and
# checks the tags, child counts and content of the resulting node tree.
class HTMLTreeParserTestCase < Test::Unit::TestCase
  # The parser under test; the test methods call it as +p.feed(...)+.
  attr_reader :p

  def setup
    @p = HTMLTree::Parser.new(true, false)
  end

  # Shorthand for the parser's root node.
  def rn
    @p.rootNode
  end

  # Shorthand for the parser's current node.
  def cn
    @p.currentNode
  end

  # Before any input the current node is the root, with no parent or children.
  def test_empty
    assert_equal(cn, rn)
    assert_nil(cn.parent)
    assert_equal([], rn.children)
  end

  def test_skeleton
    d = rn
    assert_equal(HTMLTree::Document, d.class)
    p.feed('<html></html>')
    assert_equal(d, cn)
    assert_equal(d, rn)
    assert_equal(rn, p.tree)
    assert_equal(p.html, p.tree.html_node)
    assert_equal(p.html, rn.ch[0])
    assert_equal(1, rn.ch.size)
    assert_equal('', rn.tag)
    assert_equal('html', rn.ch[0].tag)
    assert_equal(d.html_node, rn.ch[0])
  end

  # After +reset+ the root has no children and the parser can be fed again.
  def test_reset
    p.feed('<html><head></head><body attrib1="xxx"></body></html>')
    assert_equal(1, rn.ch.size)
    assert_equal(2, rn.ch[0].ch.size)
    p.reset
    assert_equal(0, rn.ch.size)
    p.feed('<html><head></head><body attrib1="xxx"></body></html>')
    assert_equal(1, rn.ch.size)
    assert_equal(2, rn.ch[0].ch.size)
  end

  def test_skeleton2
    d = rn
    p.feed('<html><head></head><body attrib1="xxx"></body></html>')
    assert_equal(d, cn)
    assert_equal(d, rn)
    assert_equal(1, rn.ch.size)
    h = rn.ch[0] # html
    assert_equal(d.html_node, h)
    assert_equal('html', h.tag)
    assert_equal(2, h.ch.size)
    assert_equal(0, h.ch[1].ch.size)
    assert_equal(0, h.ch[0].ch.size)
    assert_equal('head', h.ch[0].tag)
    assert_equal('body', h.ch[1].tag)
    assert_equal('xxx', h.ch[1].attribute('attrib1'))
  end

  # A <br> with no end tag becomes a single child of <body>.
  def test_empty_tag
    d = rn
    p.feed('<html><head></head><body attrib1="xxx"><br></body></html>')
    assert_equal(d, cn)
    h = rn.ch[0]
    assert_equal('html', h.tag)
    assert_equal(2, h.ch.size)
    assert_equal(0, h.ch[0].ch.size)
    assert_equal(1, h.ch[1].ch.size)
    assert_equal('head', h.ch[0].tag)
    assert_equal('body', h.ch[1].tag)
    assert_equal('br', h.ch[1].ch[0].tag)
  end

  # A self-closed <br /> sits between its two text siblings, with no
  # attributes and no children of its own.
  def test_no_end_tag
    p.feed("<html><body>Foo<br />bar</body></html>")
    h = rn.ch[0]
    assert_equal('html', h.tag)
    assert_equal('body', h.ch[0].tag)
    assert_equal('br', h.ch[0].ch[1].tag)
    assert_equal({}, h.ch[0].ch[1].attributes)
    assert_equal([], h.ch[0].ch[1].ch)
    assert_equal('Foo', h.ch[0].ch[0].content)
    assert_equal('bar', h.ch[0].ch[2].content)
  end

  def test_content
    d = rn
    p.feed('<html><head></head><body attrib1="xxx"><p>stuff</p></body></html>')
    assert_equal(d, rn)
    assert_equal(d, cn)
    h = rn.ch[0]
    assert_equal('html', h.tag)
    assert_equal(2, h.ch.size)             # html => head, body
    assert_equal(0, h.ch[0].ch.size)       # head =>
    assert_equal(1, h.ch[1].ch.size)       # body => p
    assert_equal(1, h.ch[1].ch[0].ch.size) # p => stuff
    assert_equal('head', h.ch[0].tag)
    assert_equal('body', h.ch[1].tag)
    assert_equal('p', h.ch[1].ch[0].tag)
    data = h.ch[1].ch[0].ch[0] # html/body/p/<data>
    assert_equal(true, data.data?)
    assert_equal('', data.tag)
    assert_equal('stuff', data.to_s)
    assert_equal({}, data.attributes)
  end

  # Three <li> items without end tags must all be direct children of <ul>.
  def test_unclosed_li
    p.feed('<html><body><ul><li>Item 1<li>Item 2<li>Item 3</ul></body></html>')

    html = rn.ch[0]
    assert_equal('html', html.tag)

    ul = html.ch[0].ch[0]
    assert_equal('ul', ul.tag)

    assert_equal(3, ul.ch.size)
  end

  # Input without <html>/<body> wrappers: <ul> is the root's first child.
  def test_partial_file
    p.feed("<ul><li>test</li><li>test test</li></ul>")
    ul = rn.ch[0]
    assert_equal('ul', ul.tag)
    assert_equal(2, ul.ch.size)
    assert_equal('li', ul.ch[0].tag)
    assert_equal('li', ul.ch[1].tag)
  end

  def test_break_nesting
    p.feed('<HTML><BODY><p><ul><LI></ul><p></BODY></HTML>')

    expected = tree("html",
                    tree("body", tree("p", tree("ul", tree("li"))), tree("p")))
    expected.assert_matches(p.html)
  end

  def test_meta
    p.feed('<html><head><META NAME="robots" CONTENT="noindex,follow"></head><body></body></html>')

    expected = tree("html",
                    tree("head", tree("meta")),
                    tree("body"))

    expected.assert_matches(p.html)
  end

  # Minimal expected-tree description: a tag name plus child descriptions.
  # +assert_matches+ compares it recursively against a parsed node.
  class VerificationTree
    include Test::Unit::Assertions

    attr_reader :tag

    def initialize(tag)
      @tag = tag
      @children = []
    end

    # Replaces the expected children and returns self so calls can be chained.
    def add_children(children)
      @children = children
      self
    end

    # Asserts that +tree+ has this tag and that its elements carry the
    # expected child tags in order, then recurses into each child.
    # The node's path is used as the failure message for the child-tag check.
    def assert_matches(tree)
      assert_equal(@tag, tree.tag)

      # Expected value first, actual second, so failure output reads correctly.
      expected_tags = @children.collect { |node| node.tag }
      actual_tags = tree.elements.collect { |node| node.tag }
      assert_equal(expected_tags, actual_tags, tree.path)
      assert_children_matches(@children, tree.elements)
    end

    # Pairs each expected child with the parsed element at the same index.
    def assert_children_matches(children, treechildren)
      children.each_with_index do |child, index|
        child.assert_matches(treechildren[index])
      end
    end
  end

  # Builds a VerificationTree for +tag+ with the given child trees.
  def tree(tag, *children)
    VerificationTree.new(tag).add_children(children)
  end
end
@@ -0,0 +1,160 @@
1
+ # $Id: tc_source-parser.rb,v 1.3 2006/07/24 09:28:19 Philip Dorrell Exp $
2
+
3
+ require 'html/stparser'
4
+ require 'test/unit'
5
+
6
# SGMLParser subclass used by the tests below. It keeps a copy of everything
# fed to it and forwards each parser event to the owning test case through
# +callback+, passing the slice of input selected by +src_range+ (warnings
# pass their message instead).
class TestSourceParser < HTML::SGMLParser
  attr_reader :test_case

  def initialize(verbose, test_case)
    super(verbose)
    @test_case = test_case
    @fulldata = ""
  end

  # Appends +data+ to the accumulated input, then parses it as usual.
  def feed(data)
    @fulldata += data
    super(data)
  end

  # The part of the accumulated input indexed by +src_range+.
  # NOTE(review): +src_range+ is not defined in this file; presumably it is
  # provided by the parser base class -- confirm.
  def last_src
    @fulldata[src_range]
  end

  def warn(msg)
    test_case.callback('warn', msg)
  end

  def handle_starttag(tag, method, attrs)
    test_case.callback('starttag', last_src)
  end

  def unknown_starttag(tag, attrs)
    test_case.callback('starttag', last_src)
  end

  def handle_endtag(tag, method)
    test_case.callback('endtag', last_src)
  end

  def unknown_endtag(tag)
    test_case.callback('endtag', last_src)
  end

  def handle_charref(name)
    test_case.callback('charref', last_src)
  end

  def handle_entityref(name)
    test_case.callback('entityref', last_src)
  end

  def handle_data(data)
    test_case.callback('data', last_src)
  end

  def handle_comment(data)
    test_case.callback('comment', last_src)
  end

  def handle_special(data)
    test_case.callback('special', last_src)
  end
end
35
+
36
# Tests that TestSourceParser reports, for each parser event, the source text
# that produced it. Events arrive through +callback+ as [kind, text] arrays.
class SourceParserTestCase < Test::Unit::TestCase
  attr_reader :parser, :callbacks

  def setup
    @parser = TestSourceParser.new(true, self)
    @callbacks = []
  end

  # Invoked by the parser for every event; records the arguments as an array.
  def callback(*stuff)
    @callbacks << stuff
  end

  # run the given block and return the callbacks if any
  def callbacks_from
    @callbacks = []
    yield
    #show_callbacks
    @callbacks
  end

  # Debugging aid: prints the recorded callbacks, one per line.
  def show_callbacks
    puts "Callbacks: "
    @callbacks.each do |kind, text|
      puts " [#{kind}: \"#{text}\"]"
    end
  end

  def test_empty_html
    events = callbacks_from { parser.feed('<html>') }
    assert_callback_count(1, events)
    assert_equal(['starttag', '<html>'], events[0])

    events = callbacks_from { parser.feed('</html>') }
    assert_callback_count(1, events)
    assert_equal(['endtag', '</html>'], events[0])
  end

  def test_attribs
    events = callbacks_from do
      parser.feed('<html><body bgcolor="#ffffff" width="123"><p>Fred</p></body></html>')
    end
    assert_callback_count(7, events)
    assert_equal(['starttag', '<html>'], events[0])
    assert_equal(['starttag', '<body bgcolor="#ffffff" width="123">'], events[1])
    assert_equal(['data', "Fred"], events[3])
    assert_equal(['endtag', '</body>'], events[5])
  end

  # FIXME should we insert <p> tags here?
  def test_no_para_tags_in_body
    events = callbacks_from { parser.feed('<html><body>Fred</body></html>') }
    assert_callback_count(5, events)
    assert_equal(['endtag', '</body>'], events[3])
  end

  def test_empty_tag
    events = callbacks_from { parser.feed('<html><body><img src="whatever"></body></html>') }
    assert_callback_count(5, events)
    assert_equal(["starttag", '<img src="whatever">'], events[2])
  end

  def test_data
    events = callbacks_from { parser.feed('<html><body><p>Data1</p><br><p>More_data</p></body></html>') }
    assert_callback_count(11, events)
    assert_equal(["data", "Data1"], events[3])
    assert_equal(["data", "More_data"], events[7])
  end

  # Surrounding whitespace is reported unchanged, across two separate feeds.
  def test_whitespace_stripping
    events = callbacks_from { parser.feed('<html><body><p> Data1 ') }
    assert_callback_count(4, events)
    assert_equal(["data", " Data1 "], events[3])

    events = callbacks_from { parser.feed('</p><p> Data2 </p></body></html>') }
    assert_callback_count(6, events)
    assert_equal(["data", " Data2 "], events[2])
  end

  def test_script
    source = <<EOS
<html><body>
<script type="text/javascript" language="Java_script">
<!--
var page_name = "Page_view_item";
//-->
</script>
</body></html>
EOS
    events = callbacks_from { parser.feed(source) }
    assert_callback_count(12, events)
    assert_equal(["starttag", '<script type="text/javascript" language="Java_script">'], events[3])
    assert_equal(["endtag", "</script>"], events[7])
  end

  def test_unknown_character
    events = callbacks_from { parser.feed('<html><body>&#12345;</body></html>') }
    assert_callback_count(5, events)
    assert_equal(["charref", "&#12345;"], events[2])
  end

  def test_unknown_entity
    events = callbacks_from { parser.feed('<html><body>&fred;</body></html>') }
    assert_callback_count(5, events)
    assert_equal(["entityref", "&fred;"], events[2])
  end

  def test_comment
    events = callbacks_from { parser.feed('<html><body><!-- comment here --></body></html>') }
    assert_callback_count(5, events)
    assert_equal(["comment", "<!-- comment here -->"], events[2])
  end

  #TODO is this right (w/the !)?
  def test_special
    source = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><body></body></html>
EOS
    events = callbacks_from { parser.feed(source) }
    assert_callback_count(7, events)
    assert_equal(["special", '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'], events[0])
  end

  private

  # Asserts the number of recorded callbacks, dumping them all on failure.
  def assert_callback_count(expected, cbs)
    assert_equal(expected, cbs.size, "cbs is #{cbs.inspect}")
  end
end
@@ -0,0 +1,196 @@
1
+ # Test cases for html-tree.rb
2
+ # Copyright (C) 2002 Ned Konz <ned@bike-nomad.com>
3
+ # License: Ruby's
4
+ # $Id: tc_stacking-parser.rb,v 1.4 2004/02/10 21:36:31 jhannes Exp $
5
+
6
+ require 'html/stparser'
7
+ require 'test/unit'
8
+
9
# StackingParser subclass used by the tests below. Every handler forwards its
# event name and arguments to the owning test case through +callback+.
class TestStackingParser < HTML::StackingParser
  attr_reader :test_case

  def initialize(verbose, test_case)
    super(verbose)
    @test_case = test_case
  end

  def warn(msg)
    test_case.callback('warn', msg)
  end

  # Unlike the other handlers, this one runs the inherited handler first.
  def handle_comment(data)
    super
    test_case.callback('comment', data)
  end

  # Reported under the name 'data', not 'cdata'.
  def handle_cdata(data)
    test_case.callback('data', data)
  end

  def handle_start_tag(tag, attrs)
    test_case.callback('start_tag', tag, attrs)
  end

  def handle_end_tag(tag)
    test_case.callback('end_tag', tag)
  end

  def handle_empty_tag(tag, attrs)
    test_case.callback('empty_tag', tag, attrs)
  end

  def handle_unknown_tag(tag, attrs)
    test_case.callback('unknown_tag', tag, attrs)
  end

  def handle_missing_end_tag(tag)
    test_case.callback('missing_end_tag', tag)
  end

  def handle_extra_end_tag(tag)
    test_case.callback('extra_end_tag', tag)
  end

  def handle_script(data)
    test_case.callback('script', data)
  end

  def handle_unknown_character(name)
    test_case.callback('unknown_character', name)
  end

  def handle_unknown_entity(name)
    test_case.callback('unknown_entity', name)
  end

  def handle_special(data)
    test_case.callback('special', data)
  end
end
31
+
32
# Tests for TestStackingParser: checks the events it reports through
# +callback+ and the state of its tag stack after each feed.
class StackingParserTestCase < Test::Unit::TestCase
  attr_reader :parser, :callbacks

  def setup
    @parser = TestStackingParser.new(true, self)
    @callbacks = []
  end

  # Invoked by the parser for every event; records the arguments as an array.
  def callback(*stuff)
    @callbacks << stuff
  end

  # run the given block and return the callbacks if any
  def callbacks_from
    @callbacks = []
    yield
    @callbacks
  end

  def test_empty_stack
    # test stack empty at first
    assert(parser.stack.empty?)
    # test last_tag and parent_tag don't blow up with empty stack
    assert_equal('html', parser.last_tag)
    assert_equal('html', parser.parent_tag)
  end

  def test_empty_html
    events = callbacks_from { parser.feed('<html>') }
    assert_same(false, parser.stack.empty?)
    assert_equal('html', parser.last_tag)
    assert_equal('html', parser.parent_tag)
    assert_callback_count(1, events)
    assert_equal(['start_tag', 'html', []], events[0])

    events = callbacks_from { parser.feed('</html>') }
    assert(parser.stack.empty?)
    assert_callback_count(1, events)
    assert_equal(['end_tag', 'html'], events[0])
  end

  def test_attribs
    events = callbacks_from do
      parser.feed('<html><body bgcolor="#ffffff" width="123"><p>Fred</p></body></html>')
    end
    assert_same(true, parser.stack.empty?)
    assert_callback_count(7, events)
    assert_equal(['start_tag', 'html', []], events[0])
    assert_equal(["start_tag", "body", [["bgcolor", "#ffffff"], ["width", "123"]]], events[1])
    assert_equal(['data', "Fred"], events[3])
  end

  # FIXME should we insert <p> tags here?
  def test_no_para_tags_in_body
    events = callbacks_from { parser.feed('<html><body>Fred</body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(5, events)
    assert_equal(['data', 'Fred'], events[2])
  end

  def test_empty_tag
    events = callbacks_from { parser.feed('<html><body><img src="whatever"></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(5, events)
    assert_equal(["empty_tag", "img", [["src", "whatever"]]], events[2])
  end

  def test_unknown_tag
    events = callbacks_from { parser.feed('<html><body><froobzle a="b"></froobzle></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(6, events)
    assert_equal(["unknown_tag", "froobzle", [["a", "b"]]], events[2])
  end

  def test_missing_end_tag
    events = callbacks_from { parser.feed('<html><body><div></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(6, events)
    assert_equal(["missing_end_tag", "div"], events[3])
  end

  def test_extra_end_tag
    events = callbacks_from { parser.feed('<html><body></body></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(5, events)
    assert_equal(["extra_end_tag", "body"], events[3])
  end

  def test_data
    events = callbacks_from { parser.feed('<html><body><p>Data1</p><br><p>More_data</p></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(11, events)
    assert_equal(["data", "Data1"], events[3])
    assert_equal(["data", "More_data"], events[7])
  end

  # With stripping off the data keeps its surrounding spaces; with it on the
  # second chunk is reported trimmed.
  def test_whitespace_stripping
    parser.strip_whitespace = false
    events = callbacks_from { parser.feed('<html><body><p> Data1 ') }
    assert_callback_count(4, events)
    assert_equal(["data", " Data1 "], events[3])

    parser.strip_whitespace = true
    events = callbacks_from { parser.feed('</p><p> Data2 </p></body></html>') }
    assert_callback_count(6, events)
    assert_equal(["data", "Data2"], events[2])
    assert_equal(true, parser.stack.empty?)
  end

  def test_script
    parser.strip_whitespace = true
    source = <<EOS
<html><body>
<script type="text/javascript" language="Java_script">
<!--
var page_name = "Page_view_item";
//-->
</script>
</body></html>
EOS
    events = callbacks_from { parser.feed(source) }
    assert_equal([], parser.stack)
    assert_equal(true, parser.stack.empty?)
    assert_callback_count(7, events)
    assert_equal(["start_tag", "script",
                  [["type", "text/javascript"], ["language", "Java_script"]]], events[2])
    assert_equal(["script", "\n<!--\nvar page_name = \"Page_view_item\";\n//-->\n"], events[3])
    assert_equal(["end_tag", "script"], events[4])
  end

  def test_unknown_character
    events = callbacks_from { parser.feed('<html><body>&#12345;</body></html>') }
    assert_callback_count(5, events)
    assert_equal(["unknown_character", "12345"], events[2])
  end

  def test_unknown_entity
    events = callbacks_from { parser.feed('<html><body>&fred;</body></html>') }
    assert_callback_count(5, events)
    assert_equal(["unknown_entity", "fred"], events[2])
  end

  def test_comment
    parser.strip_whitespace = true
    events = callbacks_from { parser.feed('<html><body><!-- comment here --></body></html>') }
    assert_callback_count(5, events)
    assert_equal(["comment", "comment here"], events[2])
  end

  #TODO is this right (w/the !)?
  def test_special
    parser.strip_whitespace = true
    source = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><body></body></html>
EOS
    events = callbacks_from { parser.feed(source) }
    assert_callback_count(5, events)
    assert_equal(["special", '!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'], events[0])
  end

  private

  # Asserts the number of recorded callbacks, dumping them all on failure.
  def assert_callback_count(expected, cbs)
    assert_equal(expected, cbs.size, "cbs is #{cbs.inspect}")
  end
end
195
+
196
# Turn on sync mode for standard output so writes are not buffered.
$stdout.sync = true