htmltools 1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +58 -0
- data/README +162 -0
- data/demo/degolive.rb +89 -0
- data/demo/ebaySearch.rb +93 -0
- data/demo/xpath.rb +62 -0
- data/lib/html/element.rb +323 -0
- data/lib/html/rexml-nodepath.rb +49 -0
- data/lib/html/sgml-parser.rb +372 -0
- data/lib/html/stparser.rb +280 -0
- data/lib/html/tags.rb +288 -0
- data/lib/html/tree.rb +140 -0
- data/lib/html/xmltree.rb +173 -0
- data/lib/html/xpath.rb +72 -0
- data/test/suite.rb +5 -0
- data/test/tc_html-element.rb +73 -0
- data/test/tc_html-tree.rb +201 -0
- data/test/tc_source-parser.rb +160 -0
- data/test/tc_stacking-parser.rb +196 -0
- data/test/tc_xpath.rb +87 -0
- metadata +58 -0
@@ -0,0 +1,201 @@
|
|
1
|
+
#!/usr/bin/ruby
|
2
|
+
require 'html/tree'
|
3
|
+
require 'test/unit'
|
4
|
+
|
5
|
+
# Test-only extensions to the HTMLTree library: expose parser internals
# and add a short alias so tree assertions stay compact.
module HTMLTree
  class Parser
    # Read access to the parser's current and root nodes for assertions.
    attr_reader :currentNode, :rootNode
  end

  module TreeElement
    # +ch+ is shorthand for +children+.
    alias_method :ch, :children
  end
end
|
15
|
+
|
16
|
+
# Exercises HTMLTree::Parser: each test feeds an HTML fragment to a fresh
# parser and checks the shape of the resulting tree (tags, child counts,
# attributes and text content).
class HTMLTreeParserTestCase < Test::Unit::TestCase
  # The parser under test, so tests can write +p.feed(...)+.
  # Note that this shadows Kernel#p inside the test case.
  attr_reader :p

  def setup
    # NOTE(review): the two flags are presumably verbose / strip-whitespace;
    # confirm against HTMLTree::Parser#initialize.
    @p = HTMLTree::Parser.new(true, false)
  end

  # Root node of the parser's tree.
  def rn
    @p.rootNode
  end

  # Node the parser is currently positioned at.
  def cn
    @p.currentNode
  end

  # A fresh parser sits on a parentless, childless root node.
  def test_empty
    assert_equal(cn, rn)
    assert_nil(cn.parent)
    assert_equal([], rn.children)
  end

  def test_skeleton
    d = rn # captured before feeding to check the root object is kept
    assert_equal(HTMLTree::Document, d.class)
    p.feed('<html></html>')
    assert_equal(d, cn)
    assert_equal(d, rn)
    assert_equal(rn, @p.tree)
    assert_equal(@p.html, @p.tree.html_node)
    assert_equal(@p.html, rn.ch[0])
    assert_equal(1, rn.ch.size)
    assert_equal('', rn.tag)
    assert_equal('html', rn.ch[0].tag)
    assert_equal(d.html_node, rn.ch[0])
  end

  # After +reset+ the tree is empty and the parser can be fed again.
  def test_reset
    p.feed('<html><head></head><body attrib1="xxx"></body></html>')
    assert_equal(1, rn.ch.size)
    assert_equal(2, rn.ch[0].ch.size)
    p.reset
    assert_equal(0, rn.ch.size)
    p.feed('<html><head></head><body attrib1="xxx"></body></html>')
    assert_equal(1, rn.ch.size)
    assert_equal(2, rn.ch[0].ch.size)
  end

  def test_skeleton2
    d = rn
    p.feed('<html><head></head><body attrib1="xxx"></body></html>')
    assert_equal(d, cn)
    assert_equal(d, rn)
    assert_equal(1, rn.ch.size)
    h = rn.ch[0] # html
    assert_equal(d.html_node, h)
    assert_equal('html', h.tag)
    assert_equal(2, h.ch.size)
    assert_equal(0, h.ch[1].ch.size)
    assert_equal(0, h.ch[0].ch.size)
    assert_equal('head', h.ch[0].tag)
    assert_equal('body', h.ch[1].tag)
    assert_equal('xxx', h.ch[1].attribute('attrib1'))
  end

  # An unclosed <br> becomes a child of <body> and the parser still
  # ends up back at the root.
  def test_empty_tag
    d = rn
    p.feed('<html><head></head><body attrib1="xxx"><br></body></html>')
    assert_equal(d, cn)
    h = rn.ch[0]
    assert_equal('html', h.tag)
    assert_equal(2, h.ch.size)
    assert_equal(0, h.ch[0].ch.size)
    assert_equal(1, h.ch[1].ch.size)
    assert_equal('head', h.ch[0].tag)
    assert_equal('body', h.ch[1].tag)
    assert_equal('br', h.ch[1].ch[0].tag)
  end

  # A self-closed <br /> sits between the two text nodes, with no
  # attributes and no children of its own.
  def test_no_end_tag
    p.feed("<html><body>Foo<br />bar</body></html>")
    h = rn.ch[0]
    assert_equal('html', h.tag)
    assert_equal('body', h.ch[0].tag)
    assert_equal('br', h.ch[0].ch[1].tag)
    assert_equal({}, h.ch[0].ch[1].attributes)
    assert_equal([], h.ch[0].ch[1].ch)
    assert_equal('Foo', h.ch[0].ch[0].content)
    assert_equal('bar', h.ch[0].ch[2].content)
  end

  def test_content
    d = rn
    p.feed('<html><head></head><body attrib1="xxx"><p>stuff</p></body></html>')
    assert_equal(d, rn)
    assert_equal(d, cn)
    h = rn.ch[0]
    assert_equal('html', h.tag)
    assert_equal(2, h.ch.size)             # html => head, body
    assert_equal(0, h.ch[0].ch.size)       # head =>
    assert_equal(1, h.ch[1].ch.size)       # body => p
    assert_equal(1, h.ch[1].ch[0].ch.size) # p => stuff
    assert_equal('head', h.ch[0].tag)
    assert_equal('body', h.ch[1].tag)
    assert_equal('p', h.ch[1].ch[0].tag)
    data = h.ch[1].ch[0].ch[0] # html/body/p/<data>
    assert_equal(true, data.data?)
    assert_equal('', data.tag)
    assert_equal('stuff', data.to_s)
    assert_equal({}, data.attributes)
  end

  # Three <li> elements without end tags end up as three children of <ul>.
  def test_unclosed_li
    p.feed('<html><body><ul><li>Item 1<li>Item 2<li>Item 3</ul></body></html>')

    html = rn.ch[0]
    assert_equal('html', html.tag)

    ul = html.ch[0].ch[0]
    assert_equal('ul', ul.tag)

    assert_equal(3, ul.ch.size)
  end

  # A fragment without <html>/<body> is attached directly under the root.
  def test_partial_file
    p.feed("<ul><li>test</li><li>test test</li></ul>")
    ul = rn.ch[0]
    assert_equal('ul', ul.tag)
    assert_equal(2, ul.ch.size)
    assert_equal('li', ul.ch[0].tag)
    assert_equal('li', ul.ch[1].tag)
  end

  def test_break_nesting
    p.feed('<HTML><BODY><p><ul><LI></ul><p></BODY></HTML>')

    expected = tree("html",
      tree("body", tree("p", tree("ul", tree("li"))), tree("p"))
    )
    expected.assert_matches(p.html)
  end

  def test_meta
    p.feed('<html><head><META NAME="robots" CONTENT="noindex,follow"></head><body></body></html>')

    expected = tree("html",
      tree("head", tree("meta")),
      tree("body"))

    expected.assert_matches(p.html)
  end

  # Expected-tree description: a tag name plus an ordered list of expected
  # child descriptions, which can be compared recursively against a parsed
  # tree node.
  class VerificationTree
    include Test::Unit::Assertions

    attr_reader :tag

    def initialize(tag)
      @tag = tag
      @children = []
    end

    # Replaces the expected children and returns self so calls can chain.
    def add_children(children)
      @children = children
      self
    end

    # Asserts that +tree+ has this tag and that its element children match
    # the expected children, recursively. +tree.path+ is used as the
    # failure message for the child-tag comparison.
    def assert_matches(tree)
      assert_equal(@tag, tree.tag)

      # Expected value goes first so failure output is labelled correctly.
      expected_tags = @children.collect { |node| node.tag }
      actual_tags = tree.elements.collect { |node| node.tag }
      assert_equal(expected_tags, actual_tags, tree.path)
      assert_children_matches(@children, tree.elements)
    end

    # Pairs each expected child with the parsed child at the same index.
    def assert_children_matches(children, treechildren)
      children.each_with_index do |child, i|
        child.assert_matches(treechildren[i])
      end
    end
  end

  # Builds a VerificationTree for +tag+ with the given expected children.
  def tree(tag, *children)
    VerificationTree.new(tag).add_children(children)
  end
end
|
@@ -0,0 +1,160 @@
|
|
1
|
+
# $Id: tc_source-parser.rb,v 1.3 2006/07/24 09:28:19 Philip Dorrell Exp $
|
2
|
+
|
3
|
+
require 'html/stparser'
|
4
|
+
require 'test/unit'
|
5
|
+
|
6
|
+
# SGML parser subclass used by the tests: it keeps every chunk of input it
# has been fed and reports each parser event to the owning test case
# together with the slice of source text selected by +src_range+.
class TestSourceParser < HTML::SGMLParser
  attr_reader :test_case

  def initialize(verbose, test_case)
    super(verbose)
    @fulldata = ""
    @test_case = test_case
  end

  # Remembers +data+ (so +last_src+ can slice it later) and then parses it.
  def feed(data)
    @fulldata += data
    super
  end

  # Slice of everything fed so far, indexed by the parser's +src_range+.
  def last_src
    @fulldata[src_range]
  end

  # Warnings are reported with their message rather than with source text.
  def warn(msg)
    test_case.callback('warn', msg)
  end

  def handle_starttag(tag, method, attrs)
    record_source('starttag')
  end

  def unknown_starttag(tag, attrs)
    record_source('starttag')
  end

  def handle_endtag(tag, method)
    record_source('endtag')
  end

  def unknown_endtag(tag)
    record_source('endtag')
  end

  def handle_charref(name)
    record_source('charref')
  end

  def handle_entityref(name)
    record_source('entityref')
  end

  def handle_data(data)
    record_source('data')
  end

  def handle_comment(data)
    record_source('comment')
  end

  def handle_special(data)
    record_source('special')
  end

  private

  # Reports an event of the given kind along with its source text.
  def record_source(kind)
    test_case.callback(kind, last_src)
  end
end
|
35
|
+
|
36
|
+
# Checks that TestSourceParser reports, for every parser event, the exact
# source text that produced it. Events are collected as
# [kind, source_text] pairs through +callback+.
class SourceParserTestCase < Test::Unit::TestCase
  attr_reader :parser
  attr_reader :callbacks

  def setup
    @parser = TestSourceParser.new(true, self)
    @callbacks = []
  end

  # Invoked by the parser for each event; stores the arguments as one entry.
  def callback(*stuff)
    @callbacks << stuff
  end

  # run the given block and return the callbacks if any
  def callbacks_from
    @callbacks = []
    yield
    #show_callbacks
    @callbacks
  end

  # Debugging aid: prints the callbacks collected so far.
  def show_callbacks
    puts "Callbacks: "
    @callbacks.each do |callback|
      puts "  [#{callback[0]}: \"#{callback[1]}\"]"
    end
  end

  def test_empty_html
    cbs = callbacks_from { parser.feed('<html>') }
    assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['starttag', '<html>'], cbs[0])

    cbs = callbacks_from { parser.feed('</html>') }
    assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['endtag', '</html>'], cbs[0])
  end

  def test_attribs
    cbs = callbacks_from {
      parser.feed('<html><body bgcolor="#ffffff" width="123"><p>Fred</p></body></html>')
    }
    assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['starttag', '<html>'], cbs[0])
    assert_equal(['starttag', '<body bgcolor="#ffffff" width="123">'], cbs[1])
    assert_equal(['data', "Fred"], cbs[3])
    assert_equal(['endtag', '</body>'], cbs[5])
  end

  # FIXME should we insert <p> tags here?
  def test_no_para_tags_in_body
    cbs = callbacks_from { parser.feed('<html><body>Fred</body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['endtag', '</body>'], cbs[3])
  end

  def test_empty_tag
    cbs = callbacks_from { parser.feed('<html><body><img src="whatever"></body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["starttag", '<img src="whatever">'], cbs[2])
  end

  def test_data
    cbs = callbacks_from { parser.feed('<html><body><p>Data1</p><br><p>More_data</p></body></html>') }
    assert_equal(11, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["data", "Data1"], cbs[3])
    assert_equal(["data", "More_data"], cbs[7])
  end

  # The expected data keeps its surrounding spaces, also when the
  # document is fed in two pieces.
  def test_whitespace_stripping
    cbs = callbacks_from { parser.feed('<html><body><p> Data1 ') }
    assert_equal(4, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["data", " Data1 "], cbs[3])

    cbs = callbacks_from { parser.feed('</p><p> Data2 </p></body></html>') }
    assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["data", " Data2 "], cbs[2])
  end

  def test_script
    html = <<EOS
<html><body>
<script type="text/javascript" language="Java_script">
<!--
var page_name = "Page_view_item";
//-->
</script>
</body></html>
EOS
    cbs = callbacks_from { parser.feed(html) }
    assert_equal(12, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["starttag", '<script type="text/javascript" language="Java_script">'], cbs[3])
    assert_equal(["endtag", "</script>"], cbs[7])
  end

  # The input must contain a numeric character reference (written out as
  # &#12345;), not the character it stands for; otherwise the expected
  # 'charref' event has nothing to report.
  def test_unknown_character
    cbs = callbacks_from { parser.feed('<html><body>&#12345;</body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["charref", "&#12345;"], cbs[2])
  end

  def test_unknown_entity
    cbs = callbacks_from { parser.feed('<html><body>&fred;</body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["entityref", "&fred;"], cbs[2])
  end

  def test_comment
    cbs = callbacks_from { parser.feed('<html><body><!-- comment here --></body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["comment", "<!-- comment here -->"], cbs[2])
  end

  #TODO is this right (w/the !)?
  def test_special
    html = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><body></body></html>
EOS
    cbs = callbacks_from { parser.feed(html) }
    assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["special", '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">'], cbs[0])
  end
end
|
@@ -0,0 +1,196 @@
|
|
1
|
+
# Test cases for html/stparser.rb (HTML::StackingParser)
|
2
|
+
# Copyright (C) 2002 Ned Konz <ned@bike-nomad.com>
|
3
|
+
# License: Ruby's
|
4
|
+
# $Id: tc_stacking-parser.rb,v 1.4 2004/02/10 21:36:31 jhannes Exp $
|
5
|
+
|
6
|
+
require 'html/stparser'
|
7
|
+
require 'test/unit'
|
8
|
+
|
9
|
+
# Stacking parser subclass used by the tests: every handler hook forwards
# its event name and arguments to the owning test case's +callback+.
class TestStackingParser < HTML::StackingParser
  attr_reader :test_case

  def initialize(verbose, test_case)
    super(verbose)
    @test_case = test_case
  end

  def warn(msg)
    test_case.callback('warn', msg)
  end

  # Runs the inherited comment handling first, then reports the comment.
  def handle_comment(data)
    super
    test_case.callback('comment', data)
  end

  # Character data is reported under the event name 'data'.
  def handle_cdata(data)
    test_case.callback('data', data)
  end

  def handle_start_tag(tag, attrs)
    test_case.callback('start_tag', tag, attrs)
  end

  def handle_end_tag(tag)
    test_case.callback('end_tag', tag)
  end

  def handle_empty_tag(tag, attrs)
    test_case.callback('empty_tag', tag, attrs)
  end

  def handle_unknown_tag(tag, attrs)
    test_case.callback('unknown_tag', tag, attrs)
  end

  def handle_missing_end_tag(tag)
    test_case.callback('missing_end_tag', tag)
  end

  def handle_extra_end_tag(tag)
    test_case.callback('extra_end_tag', tag)
  end

  def handle_script(data)
    test_case.callback('script', data)
  end

  def handle_unknown_character(name)
    test_case.callback('unknown_character', name)
  end

  def handle_unknown_entity(name)
    test_case.callback('unknown_entity', name)
  end

  def handle_special(data)
    test_case.callback('special', data)
  end
end
|
31
|
+
|
32
|
+
# Checks the events TestStackingParser reports (start/end/empty/unknown
# tags, data, scripts, comments, specials) and that the parser's tag stack
# is empty again once a document has been fully fed.
class StackingParserTestCase < Test::Unit::TestCase
  attr_reader :parser
  attr_reader :callbacks

  def setup
    @parser = TestStackingParser.new(true, self)
    @callbacks = []
  end

  # Invoked by the parser for each event; stores the arguments as one entry.
  def callback(*stuff)
    @callbacks << stuff
  end

  # run the given block and return the callbacks if any
  def callbacks_from
    @callbacks = []
    yield
    @callbacks
  end

  def test_empty_stack
    # test stack empty at first
    assert(parser.stack.empty?)
    # test last_tag and parent_tag don't blow up with empty stack
    assert_equal('html', parser.last_tag)
    assert_equal('html', parser.parent_tag)
  end

  def test_empty_html
    cbs = callbacks_from { parser.feed('<html>') }
    assert_same(false, parser.stack.empty?)
    assert_equal('html', parser.last_tag)
    assert_equal('html', parser.parent_tag)
    assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['start_tag', 'html', []], cbs[0])

    cbs = callbacks_from { parser.feed('</html>') }
    assert(parser.stack.empty?)
    assert_equal(1, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['end_tag', 'html'], cbs[0])
  end

  def test_attribs
    cbs = callbacks_from {
      parser.feed('<html><body bgcolor="#ffffff" width="123"><p>Fred</p></body></html>')
    }
    assert_same(true, parser.stack.empty?)
    assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['start_tag', 'html', []], cbs[0])
    assert_equal(["start_tag", "body", [["bgcolor", "#ffffff"], ["width", "123"]]], cbs[1])
    assert_equal(['data', "Fred"], cbs[3])
  end

  # FIXME should we insert <p> tags here?
  def test_no_para_tags_in_body
    cbs = callbacks_from { parser.feed('<html><body>Fred</body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(['data', 'Fred'], cbs[2])
  end

  def test_empty_tag
    cbs = callbacks_from { parser.feed('<html><body><img src="whatever"></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["empty_tag", "img", [["src", "whatever"]]], cbs[2])
  end

  def test_unknown_tag
    cbs = callbacks_from { parser.feed('<html><body><froobzle a="b"></froobzle></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["unknown_tag", "froobzle", [["a", "b"]]], cbs[2])
  end

  def test_missing_end_tag
    cbs = callbacks_from { parser.feed('<html><body><div></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["missing_end_tag", "div"], cbs[3])
  end

  def test_extra_end_tag
    cbs = callbacks_from { parser.feed('<html><body></body></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["extra_end_tag", "body"], cbs[3])
  end

  def test_data
    cbs = callbacks_from { parser.feed('<html><body><p>Data1</p><br><p>More_data</p></body></html>') }
    assert_equal(true, parser.stack.empty?)
    assert_equal(11, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["data", "Data1"], cbs[3])
    assert_equal(["data", "More_data"], cbs[7])
  end

  # With strip_whitespace off the data keeps its surrounding spaces;
  # switching it on mid-document strips them from later data.
  def test_whitespace_stripping
    parser.strip_whitespace = false
    cbs = callbacks_from { parser.feed('<html><body><p> Data1 ') }
    assert_equal(4, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["data", " Data1 "], cbs[3])

    parser.strip_whitespace = true
    cbs = callbacks_from { parser.feed('</p><p> Data2 </p></body></html>') }
    assert_equal(6, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["data", "Data2"], cbs[2])
    assert_equal(true, parser.stack.empty?)
  end

  def test_script
    parser.strip_whitespace = true
    html = <<EOS
<html><body>
<script type="text/javascript" language="Java_script">
<!--
var page_name = "Page_view_item";
//-->
</script>
</body></html>
EOS
    cbs = callbacks_from { parser.feed(html) }
    assert_equal([], parser.stack)
    assert_equal(true, parser.stack.empty?)
    assert_equal(7, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["start_tag", "script",
      [["type", "text/javascript"], ["language", "Java_script"]]], cbs[2])
    assert_equal(["script", "\n<!--\nvar page_name = \"Page_view_item\";\n//-->\n"], cbs[3])
    assert_equal(["end_tag", "script"], cbs[4])
  end

  # The input must contain the numeric character reference &#12345;
  # itself: the expected event carries the reference's number ("12345"),
  # which an already-decoded character could not provide.
  def test_unknown_character
    cbs = callbacks_from { parser.feed('<html><body>&#12345;</body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["unknown_character", "12345"], cbs[2])
  end

  def test_unknown_entity
    cbs = callbacks_from { parser.feed('<html><body>&fred;</body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["unknown_entity", "fred"], cbs[2])
  end

  def test_comment
    parser.strip_whitespace = true
    cbs = callbacks_from { parser.feed('<html><body><!-- comment here --></body></html>') }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["comment", "comment here"], cbs[2])
  end

  #TODO is this right (w/the !)?
  def test_special
    parser.strip_whitespace = true
    html = <<EOS
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html><body></body></html>
EOS
    cbs = callbacks_from { parser.feed(html) }
    assert_equal(5, cbs.size, "cbs is #{cbs.inspect}")
    assert_equal(["special", '!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"'], cbs[0])
  end
end
|
195
|
+
|
196
|
+
# Put standard output in sync mode so writes are flushed immediately
# instead of being buffered.
$stdout.sync = true
|