htmltools 1.10
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
require 'html/tree'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
# Test-only extensions to the HTMLTree library.
module HTMLTree
  # Expose the parser's current and root nodes so tests can inspect them.
  class Parser
    attr_reader :currentNode
    attr_reader :rootNode
  end

  # Short alias for `children`, keeping nested lookups in the tests compact.
  module TreeElement
    alias :ch :children
  end
end
|
15
|
+
|
16
|
+
class HTMLTreeParserTestCase < Test::Unit::TestCase
|
17
|
+
# Create a fresh parser before each test.
# NOTE(review): the meaning of the two positional flags (true, false) is
# defined by HTMLTree::Parser#initialize, which is not visible here.
def setup
  @p = HTMLTree::Parser.new(true, false)
end
|
20
|
+
|
21
|
+
# Shorthand for the root node of the parser under test.
def rn
  @p.rootNode
end
|
24
|
+
|
25
|
+
# Shorthand for the current node of the parser under test.
def cn
  @p.currentNode
end
|
28
|
+
|
29
|
+
attr_reader :p
|
30
|
+
|
31
|
+
# Before any input is fed, the current node is the root node, and that
# root has neither a parent nor children.
def test_empty
  assert_equal(cn, rn)
  assert_nil(cn.parent)
  assert_equal([], rn.children)
end
|
36
|
+
|
37
|
+
# Feeding a bare <html></html> leaves the document as both root and
# current node, with a single 'html' child.
def test_skeleton
  d = rn
  # Expected value goes first so a failure message reads correctly.
  assert_equal(HTMLTree::Document, d.class)
  p.feed('<html></html>')
  assert_equal(d, cn)
  assert_equal(d, rn)
  assert_equal(rn, @p.tree)
  assert_equal(@p.html, @p.tree.html_node)
  assert_equal(@p.html, rn.ch[0])
  assert_equal(1, rn.ch.size)
  assert_equal('', rn.tag)
  assert_equal('html', rn.ch[0].tag)
  assert_equal(d.html_node, rn.ch[0])
end
|
51
|
+
|
52
|
+
# After reset the root has no children, and the same document can be
# parsed again with the same resulting shape.
def test_reset
  p.feed('<html><head></head><body attrib1="xxx"></body></html>')
  assert_equal(1, rn.ch.size)
  assert_equal(2, rn.ch[0].ch.size)

  p.reset
  assert_equal(0, rn.ch.size)

  p.feed('<html><head></head><body attrib1="xxx"></body></html>')
  assert_equal(1, rn.ch.size)
  assert_equal(2, rn.ch[0].ch.size)
end
|
62
|
+
|
63
|
+
# An html/head/body skeleton yields an 'html' node with empty 'head' and
# 'body' children, and the body keeps its attribute.
def test_skeleton2
  d = rn
  p.feed('<html><head></head><body attrib1="xxx"></body></html>')
  assert_equal(d, cn)
  assert_equal(d, rn)
  assert_equal(1, rn.ch.size)
  h = rn.ch[0] # html
  assert_equal(d.html_node, h)
  assert_equal('html', h.tag)
  assert_equal(2, h.ch.size)
  assert_equal(0, h.ch[1].ch.size)
  assert_equal(0, h.ch[0].ch.size)
  assert_equal('head', h.ch[0].tag)
  assert_equal('body', h.ch[1].tag)
  assert_equal('xxx', h.ch[1].attribute('attrib1'))
end
|
79
|
+
|
80
|
+
# A <br> with no end tag becomes the single child of 'body', and parsing
# still finishes back at the document node.
def test_empty_tag
  d = rn
  p.feed('<html><head></head><body attrib1="xxx"><br></body></html>')
  assert_equal(d, cn)
  h = rn.ch[0]
  assert_equal('html', h.tag)
  assert_equal(2, h.ch.size)
  assert_equal(0, h.ch[0].ch.size)
  assert_equal(1, h.ch[1].ch.size)
  assert_equal('head', h.ch[0].tag)
  assert_equal('body', h.ch[1].tag)
  assert_equal('br', h.ch[1].ch[0].tag)
end
|
93
|
+
|
94
|
+
# A self-closed <br /> sits between its sibling text nodes and has no
# attributes or children of its own.
def test_no_end_tag
  p.feed("<html><body>Foo<br />bar</body></html>")
  h = rn.ch[0]
  assert_equal('html', h.tag)
  assert_equal('body', h.ch[0].tag)
  assert_equal('br', h.ch[0].ch[1].tag)
  assert_equal({}, h.ch[0].ch[1].attributes)
  assert_equal([], h.ch[0].ch[1].ch)
  assert_equal('Foo', h.ch[0].ch[0].content)
  assert_equal('bar', h.ch[0].ch[2].content)
end
|
105
|
+
|
106
|
+
# Text inside <p> becomes a data node: data? is true, the tag is empty,
# there are no attributes, and to_s returns the text.
def test_content
  d = rn
  p.feed('<html><head></head><body attrib1="xxx"><p>stuff</p></body></html>')
  assert_equal(d, rn)
  assert_equal(d, cn)
  h = rn.ch[0]
  assert_equal('html', h.tag)
  assert_equal(2, h.ch.size)             # html => head, body
  assert_equal(0, h.ch[0].ch.size)       # head =>
  assert_equal(1, h.ch[1].ch.size)       # body => p
  assert_equal(1, h.ch[1].ch[0].ch.size) # p => stuff
  assert_equal('head', h.ch[0].tag)
  assert_equal('body', h.ch[1].tag)
  assert_equal('p', h.ch[1].ch[0].tag)
  data = h.ch[1].ch[0].ch[0] # html/body/p/<data>
  assert_equal(true, data.data?)
  assert_equal('', data.tag)
  assert_equal('stuff', data.to_s)
  assert_equal({}, data.attributes)
end
|
126
|
+
|
127
|
+
# Three <li> items without end tags still produce three children of <ul>.
def test_unclosed_li
  p.feed('<html><body><ul><li>Item 1<li>Item 2<li>Item 3</ul></body></html>')

  html = rn.ch[0]
  assert_equal('html', html.tag)

  ul = html.ch[0].ch[0]
  assert_equal('ul', ul.tag)

  assert_equal(3, ul.ch.size)
end
|
138
|
+
|
139
|
+
# A fragment with no <html>/<body> wrapper is attached directly under
# the root node.
def test_partial_file
  p.feed("<ul><li>test</li><li>test test</li></ul>")
  ul = rn.ch[0]
  assert_equal('ul', ul.tag)
  assert_equal(2, ul.ch.size)
  assert_equal('li', ul.ch[0].tag)
  assert_equal('li', ul.ch[1].tag)
end
|
147
|
+
|
148
|
+
# Upper-case tags and unclosed <p>/<LI> elements are expected to parse
# into html > body > (p > ul > li), p.
def test_break_nesting
  p.feed('<HTML><BODY><p><ul><LI></ul><p></BODY></HTML>')

  expected = tree("html",
                  tree("body", tree("p", tree("ul", tree("li"))), tree("p")))
  expected.assert_matches(p.html)
end
|
156
|
+
|
157
|
+
# An unclosed upper-case <META> stays inside 'head' and does not swallow
# the following 'body'.
def test_meta
  p.feed('<html><head><META NAME="robots" CONTENT="noindex,follow"></head><body></body></html>')

  expected = tree("html",
                  tree("head", tree("meta")),
                  tree("body"))

  expected.assert_matches(p.html)
end
|
166
|
+
|
167
|
+
# Expected-shape description of a parsed tree: a tag name plus an ordered
# list of expected child VerificationTrees. assert_matches compares it
# recursively against a parsed node that responds to tag, elements and path.
class VerificationTree
  include Test::Unit::Assertions

  attr_reader :tag

  def initialize(tag)
    @tag = tag
    @children = []
  end

  # Replace the expected children and return self so calls can be chained.
  def add_children(children)
    @children = children
    self
  end

  # Assert that +tree+ has this tag and the same sequence of child tags,
  # then check every child recursively. The node's path is used as the
  # failure message for the child-tag comparison.
  def assert_matches(tree)
    assert_equal(@tag, tree.tag)

    # Expected tags come first so failure output is not reversed.
    expected_tags = @children.collect { |node| node.tag }
    actual_tags = tree.elements.collect { |node| node.tag }
    assert_equal(expected_tags, actual_tags, tree.path)

    assert_children_matches(@children, tree.elements)
  end

  # Pair each expected child with the parsed child at the same index.
  def assert_children_matches(children, treechildren)
    children.each_with_index do |child, index|
      child.assert_matches(treechildren[index])
    end
  end
end
|
195
|
+
|
196
|
+
# Build a VerificationTree for +tag+ whose expected children are the
# remaining arguments. add_children returns the tree itself.
def tree(tag, *children)
  VerificationTree.new(tag).add_children(children)
end
|
201
|
+
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# $Id: tc_source-parser.rb,v 1.3 2006/07/24 09:28:19 Philip Dorrell Exp $
|
2
|
+
|
3
|
+
require 'html/stparser'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
# SGML parser subclass that forwards every parse event to a test case as
# an event name plus the slice of raw input that produced it.
class TestSourceParser < HTML::SGMLParser
  attr_reader :test_case

  def initialize(verbose, test_case)
    super(verbose)
    @test_case = test_case
    @fulldata = ""
  end

  # Keep a copy of everything fed so far so last_src can slice it.
  def feed(data)
    @fulldata += data
    super(data)
  end

  # Raw source text for the most recent event.
  # NOTE(review): src_range is not defined in this class; presumably it is
  # provided by HTML::SGMLParser - confirm.
  def last_src
    @fulldata[src_range]
  end

  def warn(msg); test_case.callback('warn', msg); end

  def handle_starttag(tag, method, attrs); test_case.callback('starttag', last_src); end
  def unknown_starttag(tag, attrs); test_case.callback('starttag', last_src); end
  def handle_endtag(tag, method); test_case.callback('endtag', last_src); end
  def unknown_endtag(tag); test_case.callback('endtag', last_src); end
  def handle_charref(name); test_case.callback('charref', last_src); end
  def handle_entityref(name); test_case.callback('entityref', last_src); end
  def handle_data(data); test_case.callback('data', last_src); end
  def handle_comment(data); test_case.callback('comment', last_src); end
  def handle_special(data); test_case.callback('special', last_src); end
end
|
35
|
+
|
36
|
+
class SourceParserTestCase < Test::Unit::TestCase
|
37
|
+
|
38
|
+
# Create a fresh parser that reports back to this test case, and start
# with an empty callback log.
def setup
  @parser = TestSourceParser.new(true, self)
  @callbacks = []
end
|
42
|
+
|
43
|
+
# Record one parser event; all arguments are stored together as one array.
def callback(*stuff)
  @callbacks << stuff
end
|
46
|
+
|
47
|
+
attr_reader :parser
|
48
|
+
attr_reader :callbacks
|
49
|
+
|
50
|
+
# run the given block and return the callbacks if any
|
51
|
+
# Clear the callback log, run the given block, and return whatever
# callbacks were recorded while it ran.
def callbacks_from
  @callbacks = []
  yield
  # show_callbacks
  @callbacks
end
|
57
|
+
|
58
|
+
# Debug helper: print the first two fields of every recorded callback.
def show_callbacks
  puts "Callbacks: "
  # Destructure each entry rather than reusing the name of the callback method.
  @callbacks.each do |kind, detail|
    puts " [#{kind}: \"#{detail}\"]"
  end
end
|
64
|
+
|
65
|
+
# A start tag and an end tag fed separately each produce exactly one
# callback carrying the raw tag text.
def test_empty_html
  cbs = callbacks_from { parser.feed('<html>') }
  assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['starttag', '<html>'], cbs[0])

  cbs = callbacks_from { parser.feed('</html>') }
  assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['endtag', '</html>'], cbs[0])
end
|
74
|
+
|
75
|
+
# A start tag with attributes is reported with its full source text.
def test_attribs
  cbs = callbacks_from {
    parser.feed('<html><body bgcolor="#ffffff" width="123"><p>Fred</p></body></html>')
  }
  assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['starttag', '<html>'], cbs[0])
  assert_equal(['starttag', '<body bgcolor="#ffffff" width="123">'], cbs[1])
  assert_equal(['data', "Fred"], cbs[3])
  assert_equal(['endtag', '</body>'], cbs[5])
end
|
85
|
+
|
86
|
+
# FIXME should we insert <p> tags here?
|
87
|
+
# Bare text inside <body> produces five callbacks in total; no extra
# paragraph tags are reported.
def test_no_para_tags_in_body
  cbs = callbacks_from { parser.feed('<html><body>Fred</body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['endtag', '</body>'], cbs[3])
end
|
92
|
+
|
93
|
+
# An <img> without an end tag is reported as a single start tag.
def test_empty_tag
  cbs = callbacks_from { parser.feed('<html><body><img src="whatever"></body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["starttag", '<img src="whatever">'], cbs[2])
end
|
98
|
+
|
99
|
+
# Text inside each paragraph is reported as its own data callback.
def test_data
  cbs = callbacks_from { parser.feed('<html><body><p>Data1</p><br><p>More_data</p></body></html>') }
  assert_equal(11, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["data", "Data1"], cbs[3])
  assert_equal(["data", "More_data"], cbs[7])
end
|
105
|
+
|
106
|
+
# Data is reported verbatim, surrounding spaces included, even when the
# document arrives in two separate feed calls.
def test_whitespace_stripping
  cbs = callbacks_from { parser.feed('<html><body><p> Data1 ') }
  assert_equal(4, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["data", " Data1 "], cbs[3])

  cbs = callbacks_from { parser.feed('</p><p> Data2 </p></body></html>') }
  assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["data", " Data2 "], cbs[2])
end
|
115
|
+
|
116
|
+
# A <script> element wrapping an HTML comment is reported as a start tag
# and an end tag, with twelve callbacks for the whole document.
def test_script
  # Heredoc body and terminator stay at column 0 so the input is unchanged.
  source = <<EOS
<html><body>
<script type="text/javascript" language="Java_script">
<!--
var page_name = "Page_view_item";
//-->
</script>
</body></html>
EOS
  cbs = callbacks_from { parser.feed(source) }
  assert_equal(12, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["starttag", '<script type="text/javascript" language="Java_script">'], cbs[3])
  assert_equal(["endtag", "</script>"], cbs[7])
end
|
131
|
+
|
132
|
+
# A numeric character reference is reported as a 'charref' callback
# carrying its raw source text.
def test_unknown_character
  # The input must be the reference itself (&#12345;), not the decoded
  # character; a literal character would not be a character reference.
  cbs = callbacks_from { parser.feed('<html><body>&#12345;</body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["charref", "&#12345;"], cbs[2])
end
|
137
|
+
|
138
|
+
# An entity reference is reported as an 'entityref' callback carrying
# its raw source text.
def test_unknown_entity
  cbs = callbacks_from { parser.feed('<html><body>&fred;</body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["entityref", "&fred;"], cbs[2])
end
|
143
|
+
|
144
|
+
# A comment is reported with its delimiters as part of the source text.
def test_comment
  cbs = callbacks_from { parser.feed('<html><body><!-- comment here --></body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["comment", "<!-- comment here -->"], cbs[2])
end
|
149
|
+
|
150
|
+
#TODO is this right (w/the !)?
|
151
|
+
# A DOCTYPE declaration is reported first, as a 'special' callback
# carrying the complete declaration text.
def test_special
  # Heredoc body and terminator stay at column 0 so the input is unchanged.
  source = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><body></body></html>
EOS
  cbs = callbacks_from { parser.feed(source) }
  assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["special", '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'], cbs[0])
end
|
160
|
+
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
# Test cases for html-tree.rb
|
2
|
+
# Copyright (C) 2002 Ned Konz <ned@bike-nomad.com>
|
3
|
+
# License: Ruby's
|
4
|
+
# $Id: tc_stacking-parser.rb,v 1.4 2004/02/10 21:36:31 jhannes Exp $
|
5
|
+
|
6
|
+
require 'html/stparser'
|
7
|
+
require 'test/unit'
|
8
|
+
|
9
|
+
# Stacking parser subclass that forwards every handler invocation to a
# test case as an event name followed by the handler's arguments.
class TestStackingParser < HTML::StackingParser
  attr_reader :test_case

  def initialize(verbose, test_case)
    super(verbose)
    @test_case = test_case
  end

  def warn(msg); test_case.callback('warn', msg); end

  # handle_comment is the only handler that also runs the superclass behaviour.
  def handle_comment(data); super; test_case.callback('comment', data); end
  def handle_cdata(data); test_case.callback('data', data); end
  def handle_start_tag(tag, attrs); test_case.callback('start_tag', tag, attrs); end
  def handle_end_tag(tag); test_case.callback('end_tag', tag); end
  def handle_empty_tag(tag, attrs); test_case.callback('empty_tag', tag, attrs); end
  def handle_unknown_tag(tag, attrs); test_case.callback('unknown_tag', tag, attrs); end
  def handle_missing_end_tag(tag); test_case.callback('missing_end_tag', tag); end
  def handle_extra_end_tag(tag); test_case.callback('extra_end_tag', tag); end
  def handle_script(data); test_case.callback('script', data); end
  def handle_unknown_character(name); test_case.callback('unknown_character', name); end
  def handle_unknown_entity(name); test_case.callback('unknown_entity', name); end
  def handle_special(data); test_case.callback('special', data); end
end
|
31
|
+
|
32
|
+
class StackingParserTestCase < Test::Unit::TestCase
|
33
|
+
|
34
|
+
# Create a fresh parser that reports back to this test case, and start
# with an empty callback log.
def setup
  @parser = TestStackingParser.new(true, self)
  @callbacks = []
end
|
38
|
+
|
39
|
+
# Record one parser event; all arguments are stored together as one array.
def callback(*stuff)
  @callbacks << stuff
end
|
42
|
+
|
43
|
+
attr_reader :parser
|
44
|
+
attr_reader :callbacks
|
45
|
+
|
46
|
+
# run the given block and return the callbacks if any
|
47
|
+
# Clear the callback log, run the given block, and return whatever
# callbacks were recorded while it ran.
def callbacks_from
  @callbacks = []
  yield
  @callbacks
end
|
52
|
+
|
53
|
+
# A new parser has an empty stack, and last_tag/parent_tag both report
# 'html' rather than failing.
def test_empty_stack
  # test stack empty at first
  assert(parser.stack.empty?)
  # test last_tag and parent_tag don't blow up with empty stack
  assert_equal('html', parser.last_tag)
  assert_equal('html', parser.parent_tag)
end
|
60
|
+
|
61
|
+
# <html> puts an entry on the stack and reports a start_tag; </html>
# empties the stack again and reports an end_tag.
def test_empty_html
  cbs = callbacks_from { parser.feed('<html>') }
  assert_same(false, parser.stack.empty?)
  assert_equal('html', parser.last_tag)
  assert_equal('html', parser.parent_tag)
  assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['start_tag', 'html', []], cbs[0])

  cbs = callbacks_from { parser.feed('</html>') }
  assert(parser.stack.empty?)
  assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['end_tag', 'html'], cbs[0])
end
|
74
|
+
|
75
|
+
# Attributes are reported as an ordered list of [name, value] pairs.
def test_attribs
  cbs = callbacks_from {
    parser.feed('<html><body bgcolor="#ffffff" width="123"><p>Fred</p></body></html>')
  }
  assert_same(true, parser.stack.empty?)
  assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['start_tag', 'html', []], cbs[0])
  assert_equal(["start_tag", "body", [["bgcolor", "#ffffff"], ["width", "123"]]], cbs[1])
  assert_equal(['data', "Fred"], cbs[3])
end
|
85
|
+
|
86
|
+
# FIXME should we insert <p> tags here?
|
87
|
+
# Bare text inside <body> is reported as data right after the body
# start tag; no paragraph tags are inserted.
def test_no_para_tags_in_body
  cbs = callbacks_from { parser.feed('<html><body>Fred</body></html>') }
  assert_equal(true, parser.stack.empty?)
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(['data', 'Fred'], cbs[2])
end
|
93
|
+
|
94
|
+
# An <img> without an end tag is reported as an empty_tag with its attributes.
def test_empty_tag
  cbs = callbacks_from { parser.feed('<html><body><img src="whatever"></body></html>') }
  assert_equal(true, parser.stack.empty?)
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["empty_tag", "img", [["src", "whatever"]]], cbs[2])
end
|
100
|
+
|
101
|
+
# A tag the parser does not recognise is reported as unknown_tag with
# its attributes.
def test_unknown_tag
  cbs = callbacks_from { parser.feed('<html><body><froobzle a="b"></froobzle></body></html>') }
  assert_equal(true, parser.stack.empty?)
  assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["unknown_tag", "froobzle", [["a", "b"]]], cbs[2])
end
|
107
|
+
|
108
|
+
# Closing <body> while <div> is still open reports a missing_end_tag
# for the div.
def test_missing_end_tag
  cbs = callbacks_from { parser.feed('<html><body><div></body></html>') }
  assert_equal(true, parser.stack.empty?)
  assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["missing_end_tag", "div"], cbs[3])
end
|
114
|
+
|
115
|
+
# A second </body> with nothing left to close is reported as an extra_end_tag.
def test_extra_end_tag
  cbs = callbacks_from { parser.feed('<html><body></body></body></html>') }
  assert_equal(true, parser.stack.empty?)
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["extra_end_tag", "body"], cbs[3])
end
|
121
|
+
|
122
|
+
# Text inside each paragraph is reported as its own data callback, and
# the stack is empty once the document is closed.
def test_data
  cbs = callbacks_from { parser.feed('<html><body><p>Data1</p><br><p>More_data</p></body></html>') }
  assert_equal(true, parser.stack.empty?)
  assert_equal(11, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["data", "Data1"], cbs[3])
  assert_equal(["data", "More_data"], cbs[7])
end
|
129
|
+
|
130
|
+
# With strip_whitespace off, data keeps its surrounding spaces; with it
# on, the spaces are removed. The flag is switched between two feeds.
def test_whitespace_stripping
  parser.strip_whitespace = false
  cbs = callbacks_from { parser.feed('<html><body><p> Data1 ') }
  assert_equal(4, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["data", " Data1 "], cbs[3])

  parser.strip_whitespace = true
  cbs = callbacks_from { parser.feed('</p><p> Data2 </p></body></html>') }
  assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["data", "Data2"], cbs[2])
  assert_equal(true, parser.stack.empty?)
end
|
142
|
+
|
143
|
+
# The body of a <script> element, including the HTML comment wrapper, is
# delivered unmodified through a single 'script' callback.
def test_script
  parser.strip_whitespace = true
  # Heredoc body and terminator stay at column 0 so the input is unchanged.
  source = <<EOS
<html><body>
<script type="text/javascript" language="Java_script">
<!--
var page_name = "Page_view_item";
//-->
</script>
</body></html>
EOS
  cbs = callbacks_from { parser.feed(source) }
  assert_equal([], parser.stack)
  assert_equal(true, parser.stack.empty?)
  assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["start_tag", "script",
                [["type", "text/javascript"], ["language", "Java_script"]]], cbs[2])
  assert_equal(["script", "\n<!--\nvar page_name = \"Page_view_item\";\n//-->\n"], cbs[3])
  assert_equal(["end_tag", "script"], cbs[4])
end
|
163
|
+
|
164
|
+
# A numeric character reference is reported as unknown_character with
# its numeric part as the argument.
def test_unknown_character
  # The input must be the reference &#12345; itself; the expected "12345"
  # below is the number taken from that reference.
  cbs = callbacks_from { parser.feed('<html><body>&#12345;</body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["unknown_character", "12345"], cbs[2])
end
|
169
|
+
|
170
|
+
# An entity reference the parser does not know is reported as
# unknown_entity with the bare entity name.
def test_unknown_entity
  cbs = callbacks_from { parser.feed('<html><body>&fred;</body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["unknown_entity", "fred"], cbs[2])
end
|
175
|
+
|
176
|
+
# With strip_whitespace on, a comment is reported without its delimiters
# or surrounding spaces.
def test_comment
  parser.strip_whitespace = true
  cbs = callbacks_from { parser.feed('<html><body><!-- comment here --></body></html>') }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["comment", "comment here"], cbs[2])
end
|
182
|
+
|
183
|
+
#TODO is this right (w/the !)?
|
184
|
+
# A DOCTYPE declaration is reported first, as a 'special' callback whose
# text keeps the leading '!' but not the angle brackets.
def test_special
  parser.strip_whitespace = true
  # Heredoc body and terminator stay at column 0 so the input is unchanged.
  source = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><body></body></html>
EOS
  cbs = callbacks_from { parser.feed(source) }
  assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
  assert_equal(["special", '!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'], cbs[0])
end
|
194
|
+
end
|
195
|
+
|
196
|
+
# Put standard output in sync mode so it is written without buffering.
$stdout.sync = true
|