webtranslateit-hpricot 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/CHANGELOG +122 -0
- data/COPYING +18 -0
- data/README.md +295 -0
- data/Rakefile +237 -0
- data/ext/fast_xs/FastXsService.java +1123 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +210 -0
- data/ext/hpricot_scan/HpricotCss.java +850 -0
- data/ext/hpricot_scan/HpricotScanService.java +2085 -0
- data/ext/hpricot_scan/MANIFEST +0 -0
- data/ext/hpricot_scan/extconf.rb +9 -0
- data/ext/hpricot_scan/hpricot_common.rl +76 -0
- data/ext/hpricot_scan/hpricot_css.c +3511 -0
- data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
- data/ext/hpricot_scan/hpricot_css.rl +120 -0
- data/ext/hpricot_scan/hpricot_scan.c +6848 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
- data/ext/hpricot_scan/hpricot_scan.rl +911 -0
- data/extras/hpricot.png +0 -0
- data/hpricot.gemspec +18 -0
- data/lib/hpricot/blankslate.rb +63 -0
- data/lib/hpricot/builder.rb +217 -0
- data/lib/hpricot/elements.rb +514 -0
- data/lib/hpricot/htmlinfo.rb +691 -0
- data/lib/hpricot/inspect.rb +103 -0
- data/lib/hpricot/modules.rb +40 -0
- data/lib/hpricot/parse.rb +38 -0
- data/lib/hpricot/tag.rb +219 -0
- data/lib/hpricot/tags.rb +164 -0
- data/lib/hpricot/traverse.rb +839 -0
- data/lib/hpricot/xchar.rb +95 -0
- data/lib/hpricot.rb +26 -0
- data/setup.rb +1585 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/cy0.html +3653 -0
- data/test/files/immob.html +400 -0
- data/test/files/pace_application.html +1320 -0
- data/test/files/tenderlove.html +16 -0
- data/test/files/uswebgen.html +220 -0
- data/test/files/utf8.html +1054 -0
- data/test/files/week9.html +1723 -0
- data/test/files/why.xml +19 -0
- data/test/load_files.rb +7 -0
- data/test/nokogiri-bench.rb +64 -0
- data/test/test_alter.rb +96 -0
- data/test/test_builder.rb +37 -0
- data/test/test_parser.rb +496 -0
- data/test/test_paths.rb +25 -0
- data/test/test_preserved.rb +88 -0
- data/test/test_xml.rb +28 -0
- metadata +106 -0
data/test/test_parser.rb
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#!/usr/bin/env ruby
|
|
3
|
+
|
|
4
|
+
require 'test/unit'
|
|
5
|
+
require 'hpricot'
|
|
6
|
+
require 'load_files'
|
|
7
|
+
|
|
8
|
+
# Parser and selector regression tests for Hpricot. Each test parses either an
# inline snippet or one of the shared TestFiles fixtures and checks the
# resulting document through searches, attribute access or serialization.
class TestParser < Test::Unit::TestCase
  # Setting an attribute on a result set must apply to every matched element.
  def test_set_attr
    @basic = Hpricot.parse(TestFiles::BASIC)
    @basic.search('//p').set('class', 'para')
    assert_equal 4, @basic.search('//p').length
    assert_equal 4, @basic.search('//p').find_all { |x| x['class'] == 'para' }.length
  end

  # Test creating a new element
  def test_new_element
    elem = Hpricot::Elem.new('form')
    assert_not_nil(elem)
    assert_not_nil(elem.attributes)
  end

  # Bare text becomes the first child, with its content intact.
  def test_scan_text
    assert_equal 'FOO', Hpricot.make("FOO").children.first.content
  end

  # An attribute filter must find the link by its href value.
  def test_filter_by_attr
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)

    # this link is escaped in the doc
    link = 'http://www.youtube.com/watch?v=TvSNXyNw26g&search=chris%20ware'
    assert_equal link, @boingboing.at("a[@href='#{link}']")['href']
  end

  # The :contains pseudo-selector matches on element text.
  def test_filter_contains
    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal '<title>Sample XHTML</title>', @basic.search("title:contains('Sample')").to_s
  end

  # get_element_by_id works on the document and on a nested element.
  def test_get_element_by_id
    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal 'link1', @basic.get_element_by_id('link1')['id']
    assert_equal 'link1', @basic.get_element_by_id('body1').get_element_by_id('link1').get_attribute('id')
  end

  # get_elements_by_tag_name results can be indexed and searched further.
  def test_get_element_by_tag_name
    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal 'link1', @basic.get_elements_by_tag_name('a')[0].get_attribute('id')
    assert_equal 'link1', @basic.get_elements_by_tag_name('body')[0].get_element_by_id('link1').get_attribute('id')
  end

  # "*" returns every element; a tag name returns only that tag.
  def test_get_elements_by_tag_name_star
    simple = Hpricot.parse("<div><p id='first'>First</p><p id='second'>Second</p></div>")
    assert_equal 3, simple.get_elements_by_tag_name("*").size
    assert_equal 1, simple.get_elements_by_tag_name("div").size
    assert_equal 2, simple.get_elements_by_tag_name("p").size
  end

  # Re-parsing the serialized document must satisfy the same checks as the
  # original parse (see scan_basic).
  def test_output_basic
    @basic = Hpricot.parse(TestFiles::BASIC)
    @basic2 = Hpricot.parse(@basic.inner_html)
    scan_basic @basic2
  end

  # Runs the shared scan_basic checks against the BASIC fixture.
  def test_scan_basic
    @basic = Hpricot.parse(TestFiles::BASIC)
    scan_basic @basic
  end

  # Shared assertions for a parsed copy of the BASIC fixture: node kinds,
  # CSS and XPath lookups, filters, negation and text() selections.
  #
  # doc - the parsed document to check.
  def scan_basic(doc)
    assert_kind_of Hpricot::XMLDecl, doc.children.first
    assert_not_equal doc.children.first.to_s, doc.children[1].to_s
    assert_equal 'link1', doc.at('#link1')['id']
    assert_equal 'link1', doc.at("p a")['id']
    assert_equal 'link1', (doc/:p/:a).first['id']
    assert_equal 'link1', doc.search('p').at('a').get_attribute('id')
    assert_equal 'link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id')
    # Explicit parentheses: the first argument itself starts with "(".
    assert_equal((doc/'p')[2], (doc/'p').filter(':nth(2)')[0])
    assert_equal((doc/'p')[2], (doc/'p').filter('[3]')[0])
    assert_equal 4, (doc/'p').filter('*').length
    assert_equal 4, (doc/'p').filter('* *').length
    eles = (doc/'p').filter('.ohmy')
    assert_equal 1, eles.length
    assert_equal 'ohmy', eles.first.get_attribute('class')
    assert_equal 3, (doc/'p:not(.ohmy)').length
    assert_equal 3, (doc/'p').not('.ohmy').length
    assert_equal 3, (doc/'p').not(eles.first).length
    assert_equal 2, (doc/'p').filter('[@class]').length
    assert_equal 'last final', (doc/'p[@class~="final"]').first.get_attribute('class')
    assert_equal 1, (doc/'p').filter('[@class~="final"]').length
    assert_equal 2, (doc/'p > a').length
    assert_equal 1, (doc/'p.ohmy > a').length
    assert_equal 2, (doc/'p / a').length
    assert_equal 2, (doc/'link ~ link').length
    assert_equal 3, (doc/'title ~ link').length
    assert_equal 5, (doc/"//p/text()").length
    assert_equal 6, (doc/"//p[a]//text()").length
    assert_equal 2, (doc/"//p/a/text()").length
  end

  # :eq(0), :first and :first() all select the first matching <p>.
  def test_positional
    h = Hpricot( "<div><br/><p>one</p><p>two</p></div>" )
    assert_equal "<p>one</p>", h.search("//div/p:eq(0)").to_s
    assert_equal "<p>one</p>", h.search("//div/p:first").to_s
    assert_equal "<p>one</p>", h.search("//div/p:first()").to_s
  end

  # An unquoted attribute value in a selector still matches the form.
  def test_pace
    doc = Hpricot(TestFiles::PACE_APPLICATION)
    assert_equal 'get', doc.at('form[@name=frmSect11]')['method']
    # assert_equal '2', doc.at('#hdnSpouse')['value']
  end

  # Element, comment and text()-predicate counts for the BOINGBOING fixture.
  def test_scan_boingboing
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    assert_equal 60, (@boingboing/'p.posted').length
    assert_equal 1, @boingboing.search("//a[@name='027906']").length
    assert_equal 10, @boingboing.search("script comment()").length
    assert_equal 3, @boingboing.search("a[text()*='Boing']").length
    assert_equal 1, @boingboing.search("h3[text()='College kids reportedly taking more smart drugs']").length
    assert_equal 0, @boingboing.search("h3[text()='College']").length
    assert_equal 60, @boingboing.search("h3").length
    assert_equal 59, @boingboing.search("h3[text()!='College kids reportedly taking more smart drugs']").length
    assert_equal 17, @boingboing.search("h3[text()$='s']").length
    assert_equal 116, @boingboing.search("p[text()]").length
    assert_equal 211, @boingboing.search("p").length
  end

  # Inserting before a freshly inserted node must leave the original findable.
  def test_reparent
    doc = Hpricot(%{<div id="blurb_1"></div>})
    div1 = doc.search('#blurb_1')
    div1.before('<div id="blurb_0"></div>')

    div0 = doc.search('#blurb_0')
    div0.before('<div id="blurb_a"></div>')

    assert_equal 'div', doc.at('#blurb_1').name
  end

  # next_sibling and previous_sibling are inverses around <title>.
  def test_siblings
    @basic = Hpricot.parse(TestFiles::BASIC)
    t = @basic.at(:title)
    e = t.next_sibling
    assert_equal 'test1.css', e['href']
    assert_equal 'title', e.previous_sibling.name
  end

  # :not(.final) excludes the paragraph carrying the "final" class.
  def test_css_negation
    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal 3, (@basic/'p:not(.final)').length
  end

  # After removing "class" from every <p>, no <p> has a class attribute.
  def test_remove_attribute
    @basic = Hpricot.parse(TestFiles::BASIC)
    (@basic/:p).each { |ele| ele.remove_attribute('class') }
    assert_equal 0, (@basic/'p[@class]').length
  end

  # Absolute XPath expressions, including "*" steps and ".." parent steps.
  def test_abs_xpath
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    assert_equal 60, @boingboing.search("/html/body//p[@class='posted']").length
    assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
    assert_equal 18, @boingboing.search("//script").length
    divs = @boingboing.search("//script/../div")
    assert_equal 2, divs.length
    imgs = @boingboing.search('//div/p/a/img')
    assert_equal 16, imgs.length
    assert_equal 16, @boingboing.search('//div').search('p/a/img').length
    assert imgs.all? { |x| x.name == 'img' }
  end

  # Path predicates select the containing <p>, not the nested <img>.
  def test_predicates
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    assert_equal 2, @boingboing.search('//link[@rel="alternate"]').length
    p_imgs = @boingboing.search('//div/p[/a/img]')
    assert_equal 16, p_imgs.length
    assert p_imgs.all? { |x| x.name == 'p' }
    p_imgs = @boingboing.search('//div/p[a/img]')
    assert_equal 16, p_imgs.length
    assert p_imgs.all? { |x| x.name == 'p' }
    assert_equal 1, @boingboing.search('//input[@checked]').length
  end

  # Lower-case tag searches against the TENDERLOVE fixture.
  def test_tag_case
    @tenderlove = Hpricot.parse(TestFiles::TENDERLOVE)
    assert_equal 2, @tenderlove.search('//a').length
    assert_equal 3, @tenderlove.search('//area').length
    assert_equal 2, @tenderlove.search('//meta').length
  end

  # :last, :eq(n) and :last-of-type positional pseudo-selectors.
  def test_alt_predicates
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    assert_equal 1, @boingboing.search('//table/tr:last').length

    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal "<p>The third paragraph</p>",
                 @basic.search('p:eq(2)').to_html
    assert_equal '<p class="last final"><b>THE FINAL PARAGRAPH</b></p>',
                 @basic.search('p:last').to_html
    assert_equal 'last final', @basic.search('//p:last-of-type').first.get_attribute('class')
  end

  # Several sibling nodes inserted after an element keep their order.
  def test_insert_after # ticket #63
    doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
    (doc/'div').each do |element|
      element.after('<p>Paragraph 1</p><p>Paragraph 2</p>')
    end
    # Expected value first, so a failure message labels the two sides correctly.
    assert_equal '<html><body><div id="a-div"></div><p>Paragraph 1</p><p>Paragraph 2</p></body></html>', doc.to_html
  end

  # Several sibling nodes inserted before an element keep their order.
  def test_insert_before # ticket #61
    doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
    (doc/'div').each do |element|
      element.before('<p>Paragraph 1</p><p>Paragraph 2</p>')
    end
    # Expected value first, so a failure message labels the two sides correctly.
    assert_equal '<html><body><p>Paragraph 1</p><p>Paragraph 2</p><div id="a-div"></div></body></html>', doc.to_html
  end

  # Comma-separated CSS selectors and "|"-separated XPaths combine results.
  def test_many_paths
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
    assert_equal 18, @boingboing.search('//div/p[a/img]|//link[@rel="alternate"]').length
  end

  # Searching a result set yields another Hpricot::Elements.
  def test_stacked_search
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    assert_kind_of Hpricot::Elements, @boingboing.search('//div/p').search('a img')
  end

  # An upper-case HREF in the markup is reachable only under the lower-case
  # key, whether the key is a symbol or a string.
  def test_attr_casing
    doc = Hpricot("<a HREF='a'>A simple <b>test</b> string.</a>")
    assert_equal "a", (doc % :a)[:href]
    assert_nil((doc % :a)[:HREF])
    assert_equal "a", (doc % :a)['href']
    assert_nil((doc % :a)['HREF'])
  end

  # Class selectors against unquoted (and partly mis-quoted) class attributes.
  def test_class_search
    # test case sent by Chih-Chao Lam
    doc = Hpricot("<div class=xyz'>abc</div>")
    assert_equal 1, doc.search(".xyz").length
    doc = Hpricot("<div class=xyz>abc</div><div class=abc>xyz</div>")
    assert_equal 1, doc.search(".xyz").length
    assert_equal 4, doc.search("*").length
  end

  # "*" combined with attribute and class filters.
  def test_kleene_star
    # bug noticed by raja bhatia
    doc = Hpricot("<span class='small'>1</span><div class='large'>2</div><div class='small'>3</div><span class='blue large'>4</span>")
    assert_equal 2, doc.search("*[@class*='small']").length
    assert_equal 2, doc.search("*.small").length
    assert_equal 2, doc.search(".small").length
    assert_equal 2, doc.search(".large").length
  end

  # Empty and whitespace-only comments are still parsed as comment nodes.
  def test_empty_comment
    doc = Hpricot("<p><!----></p>")
    assert doc.children[0].children[0].comment?
    doc = Hpricot("<p><!-- --></p>")
    assert doc.children[0].children[0].comment?
  end

  # Every expected <body> attribute of the IMMOB fixture is read back.
  def test_body_newlines
    @immob = Hpricot.parse(TestFiles::IMMOB)
    body = @immob.at(:body)
    {'background' => '', 'bgcolor' => '#ffffff', 'text' => '#000000', 'marginheight' => '10',
     'marginwidth' => '10', 'leftmargin' => '10', 'topmargin' => '10', 'link' => '#000066',
     'alink' => '#ff6600', 'hlink' => "#ff6600", 'vlink' => "#000000"}.each do |k, v|
      assert_equal v, body[k]
    end
  end

  # A descendant selector with the same tag twice matches only the inner div.
  def test_nested_twins
    @doc = Hpricot("<div>Hi<div>there</div></div>")
    assert_equal 1, (@doc/"div div").length
  end

  # Wildcard with an attribute filter, in CSS and XPath form.
  def test_wildcard
    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal 3, (@basic/"*[@id]").length
    assert_equal 3, (@basic/"//*[@id]").length
  end

  # Markup-looking text inside the first <script> stays in its inner_html.
  def test_javascripts
    @immob = Hpricot.parse(TestFiles::IMMOB)
    assert_equal 3, (@immob/:script)[0].inner_html.scan(/<LINK/).length
  end

  # Counts the links mentioning "GameCenter" in the WEEK9 fixture.
  def test_nested_scripts
    @week9 = Hpricot.parse(TestFiles::WEEK9)
    assert_equal 14, (@week9/"a").find_all { |x| x.inner_html.include? "GameCenter" }.length
  end

  # All links of the USWEBGEN fixture must be found.
  def test_uswebgen
    @uswebgen = Hpricot.parse(TestFiles::USWEBGEN)
    # sent by brent beardsley, hpricot 0.3 had problems with all the links.
    assert_equal 67, (@uswebgen/:a).length
  end

  # Stray "?URL=" fragments inside a tag must not break the form or its action.
  def test_mangled_tags
    [%{<html><form name='loginForm' method='post' action='/units/a/login/1,13088,779-1,00.html'?URL=></form></html>},
     %{<html><form name='loginForm' ?URL= method='post' action='/units/a/login/1,13088,779-1,00.html'></form></html>},
     %{<html><form name='loginForm'?URL= ?URL= method='post' action='/units/a/login/1,13088,779-1,00.html'?URL=></form></html>},
     %{<html><form name='loginForm' method='post' action='/units/a/login/1,13088,779-1,00.html' ?URL=></form></html>}].
      each do |str|
        doc = Hpricot(str)
        assert_equal 1, (doc/:form).length
        assert_equal '/units/a/login/1,13088,779-1,00.html', doc.at("form")['action']
      end
  end

  # Processing instructions expose their target and content.
  def test_procins
    doc = Hpricot("<?php print('hello') ?>\n<?xml blah='blah'?>")
    assert_equal "php", doc.children[0].target
    assert_equal "blah='blah'", doc.children[2].content
  end

  # A very long multi-line attribute value must parse without raising.
  def test_no_buffer_error
    Hpricot(%{<p>\n\n<input type="hidden" name="__VIEWSTATE" value="#{(("X" * 2000) + "\n") * 44}" />\n\n</p>})
  end

  # A <param> is found by its value inside <object>/<embed> markup.
  def test_youtube_attr
    str = <<-edoc
<html><body>
Lorem ipsum. Jolly roger, ding-dong sing-a-long
<object width="425" height="350">
<param name="movie" value="http://www.youtube.com/v/NbDQ4M_cuwA"></param>
<param name="wmode" value="transparent"></param>
<embed src="http://www.youtube.com/v/NbDQ4M_cuwA"
type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
</embed>
</object>
Check out my posting, I have bright mice in large clown cars.
<object width="425" height="350">
<param name="movie" value="http://www.youtube.com/v/foobar"></param>
<param name="wmode" value="transparent"></param>
<embed src="http://www.youtube.com/v/foobar"
type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
</embed>
</object>
</body></html?
    edoc
    doc = Hpricot(str)
    assert_equal "http://www.youtube.com/v/NbDQ4M_cuwA",
                 doc.at("//object/param[@value='http://www.youtube.com/v/NbDQ4M_cuwA']")['value']
  end

  # ticket #84 by jamezilla
  def test_screwed_xmlns
    doc = Hpricot(<<-edoc)
<?xml:namespace prefix = cwi />
<html><body>HAI</body></html>
    edoc
    assert_equal "HAI", doc.at("body").inner_text
  end

  # http://github.com/hpricot/hpricot/issues#issue/28
  def test_invalid_inner_text
    assert_equal "A", Hpricot('A&B;').inner_text[0...1]
  end

  # http://github.com/hpricot/hpricot/issues#issue/25
  def test_encoding_compatibility_error
    Hpricot("<p>\xC3\x9Cber</p><p>M³</p>").inner_text
  end

  # Reported by Jonathan Nichols on the Hpricot list (24 May 2007)
  def test_self_closed_form
    doc = Hpricot(<<-edoc)
<body>
<form action="/loginRegForm" name="regForm" method="POST" />
<input type="button">
</form>
</body>
    edoc
    assert_equal "button", doc.at("//form/input")['type']
  end

  # A trailing backslash must not swallow the closing quote of an attribute.
  def test_escaped_quote
    # Backslash '\' is not an escape character in HTML.
    doc = Hpricot("<div><input type='text' value='C:\\dir\\' /><p id='test_id'>test</p></div>")
    assert_equal "C:\\dir\\", doc.at("input")["value"]
    doc = Hpricot('<div><input type="text" value="C:\\dir\\" /><p id="test_id">test</p></div>')
    assert_equal "C:\\dir\\", doc.at("input")["value"]
  end

  # :parent and :empty pseudo-selectors.
  def test_filters
    @basic = Hpricot.parse(TestFiles::BASIC)
    assert_equal 0, (@basic/"title:parent").size
    assert_equal 3, (@basic/"p:parent").size
    assert_equal 3, (@basic/"link:empty").size
    assert_equal 1, (@basic/"span:empty").size
  end

  # CDATA markers inside <script> survive a parse/serialize round trip.
  def test_keep_cdata
    str = %{<script> /*<![CDATA[*/
/*]]>*/ </script>}
    assert_equal str, Hpricot(str).to_html
  end

  # Namespaced tags are searchable by their prefixed name.
  def test_namespace
    chunk = <<-END
<a xmlns:t="http://www.nexopia.com/dev/template">
<t:sam>hi </t:sam>
</a>
    END
    doc = Hpricot::XML(chunk)
    assert((doc/"//t:sam").size > 0) # at least this should probably work
    # assert (doc/"//sam").size > 0 # this would be nice
  end

  # Text without entities passes through uxs unchanged.
  def test_uxs_ignores_non_entities
    assert_equal 'abc', Hpricot.uxs('abc')
  end

  # The input must be entity-encoded; decoded text would make this test vacuous.
  def test_uxs_handles_gt_lt_amp_quot
    assert_equal '"&<>', Hpricot.uxs('&quot;&amp;&lt;&gt;')
  end

  # Decimal character reference for U+00E9. The branch without String#encoding
  # expects the UTF-8 bytes written as octal escapes.
  def test_uxs_handles_numeric_values
    if String.method_defined? :encoding
      assert_equal "é", Hpricot.uxs('&#233;')
    else
      assert_equal "\303\251", Hpricot.uxs('&#233;')
    end
  end

  # Hexadecimal character reference for U+00E9.
  def test_uxs_handles_hexadecimal_values
    if String.method_defined? :encoding
      assert_equal "é", Hpricot.uxs('&#xe9;')
    else
      assert_equal "\303\251", Hpricot.uxs('&#xe9;')
    end
  end

  # Named character reference for U+00E9.
  def test_uxs_handles_entities
    if String.method_defined? :encoding
      assert_equal "é", Hpricot.uxs('&eacute;')
    else
      assert_equal "\303\251", Hpricot.uxs('&eacute;')
    end
  end

  # inner_text includes CDATA content and skips comments.
  def test_cdata_inner_text
    xml = Hpricot.XML(%{
<peon>
<id>96586</id>
<stdout><![CDATA[This is STDOUT]]></stdout>
<stderr><!-- IGNORE --><![CDATA[This is]]> STDERR</stderr>
</peon>})
    assert_equal "This is STDOUT", (xml/:peon/:stdout).inner_text
    assert_equal "This is STDERR", (xml/:peon/:stderr).inner_text
  end

  # A <meta> after <noscript> in <head> is still reachable by absolute path.
  def test_parsing_html_with_noscript
    doc = Hpricot(<<-edoc)
<html>
<head>
<noscript>
<meta http-equiv="refresh" content="0; url=http://www.yoursite.com/noscripts.html"/>
</noscript>
<meta name="verification" content="7ff5e90iormq5niy6x98j75" />
</head>
<body>
<h1>Testing</h1>
</body>
</html>

    edoc
    assert_equal "7ff5e90iormq5niy6x98j75", doc.at("/html/head/meta[@name='verification']")['content']
  end

  # NOTE(review): no bnqt fixture appears in the package file list; confirm
  # that load_files actually defines TestFiles::BNQT, otherwise this fails
  # with a NameError instead of exercising the parser.
  def test_nil_attr
    # parsing this file was failing on JRuby
    assert_nothing_raised {Hpricot.parse(TestFiles::BNQT)}
  end

  # A tag the parser does not know (<header>) keeps its children when serialized.
  def test_unknown_tag
    header = <<-edoc
<header id="htest">
<div id="dtest">blah</div>
</header>
    edoc
    doc = Hpricot(<<-edoc)
<div>#{header}</div>
    edoc
    assert_equal header.chomp, (doc/"#htest").to_html
  end

  # Unknown tags nested inside each other (<header>, <nav>) round-trip intact.
  def test_nested_unknown_tags
    header =
      %(<header id="htest"><div id="dtest"><nav>blah</nav></div></header>)
    doc = Hpricot(%(<div>#{header}</div>))
    assert_equal header.chomp, (doc/"#htest").to_html
  end
end
|
data/test/test_paths.rb
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'test/unit'
|
|
4
|
+
require 'hpricot'
|
|
5
|
+
require 'load_files'
|
|
6
|
+
|
|
7
|
+
# Path-generation and bracket-escaping tests.
# NOTE(review): this reuses the class name TestParser, which another test file
# in this package also defines; confirm the shared name is intentional.
class TestParser < Test::Unit::TestCase
  # The css_path and xpath generated for an element must both lead back to
  # that same element.
  def test_roundtrip
    @basic = Hpricot.parse(TestFiles::BASIC)
    selectors = ['link', 'link[2]', 'body', '#link1', 'a', 'p.ohmy']
    selectors.each do |selector|
      found = @basic.at(selector)
      assert_equal found, @basic.at(found.css_path)
      assert_equal found, @basic.at(found.xpath)
    end
  end

  # Square brackets inside a quoted attribute value must not end the
  # attribute filter early.
  def test_attr_brackets
    single = Hpricot('<input name="vendor[porkpies]"/>')
    assert_equal 1, (single/'input[@name^="vendor[porkpies]"]').length
    assert_equal 1, (single/'input[@name="vendor[porkpies]"]').length
    assert_equal 0, (single/'input[@name$="]]]]]"]').length

    nested = Hpricot('<input name="vendor[porkpies][meaty]"/>')
    assert_equal 1, (nested/'input[@name^="vendor[porkpies][meaty]"]').length
  end
end
|
data/test/test_preserved.rb
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
#!/usr/bin/env ruby
|
|
3
|
+
|
|
4
|
+
require 'test/unit'
|
|
5
|
+
require 'hpricot'
|
|
6
|
+
require 'load_files'
|
|
7
|
+
|
|
8
|
+
# Compatibility shim: when String has no #lines method, define one that
# returns an enumerator over each_line. It does nothing on interpreters
# that already provide String#lines.
if !"".respond_to?(:lines)
  require 'enumerator'

  String.class_eval do
    # Enumerates the receiver line by line.
    def lines
      Enumerable::Enumerator.new(self, :each_line)
    end
  end
end
|
|
16
|
+
|
|
17
|
+
# Tests that Hpricot can reproduce the original markup (to_original_html)
# for untouched documents, and re-serializes only what was changed.
class TestPreserved < Test::Unit::TestCase
  # Parses +str+, optionally lets the block modify the document, then checks
  # that to_original_html matches +str+ line by line.
  #
  # str - the source markup.
  # Yields the parsed document when a block is given.
  def assert_roundtrip(str)
    doc = Hpricot(str)
    yield doc if block_given?
    str2 = doc.to_original_html
    # Tag the output as UTF-8 on every Ruby that has string encodings,
    # not only on 1.9.x.
    str2.force_encoding('UTF-8') if str2.respond_to?(:force_encoding)
    str.lines.zip(str2.lines).each do |s1, s2|
      assert_equal s1, s2
    end
  end

  # Parses +str2+, optionally lets the block modify the document, then checks
  # that to_original_html equals +str1+.
  #
  # str1 - the expected serialized markup.
  # str2 - the markup to parse.
  # Yields the parsed document when a block is given.
  def assert_html(str1, str2)
    doc = Hpricot(str2)
    yield doc if block_given?
    assert_equal str1, doc.to_original_html
  end

  # Mis-nested markup is preserved as written; a changed attribute is
  # reflected in the output.
  def test_simple
    str = "<p>Hpricot is a <b>you know <i>uh</b> fine thing.</p>"
    assert_html str, str
    assert_html "<p class=\"new\">Hpricot is a <b>you know <i>uh</b> fine thing.</p>", str do |doc|
      (doc/:p).set('class', 'new')
    end
  end

  # Removing, re-attributing and wrapping elements re-serializes only the
  # touched parts.
  def test_parent
    str = "<html><base href='/'><head><title>Test</title></head><body><div id='wrap'><p>Paragraph one.</p><p>Paragraph two.</p></div></body></html>"
    assert_html str, str
    assert_html "<html><base href='/'><body><div id=\"all\"><div><p>Paragraph one.</p></div><div><p>Paragraph two.</p></div></div></body></html>", str do |doc|
      (doc/:head).remove
      (doc/:div).set('id', 'all')
      (doc/:p).wrap('<div></div>')
    end
  end

  # The text node following the image serializes to the expected sentence.
  def test_escaping_of_contents
    doc = Hpricot(TestFiles::BOINGBOING)
    assert_equal "Fukuda’s Automatic Door opens around your body as you pass through it. The idea is to save energy and keep the room clean.", doc.at("img[@alt='200606131240']").next.to_s.strip
  end

  # Whole fixtures must round-trip unchanged.
  def test_files
    assert_roundtrip TestFiles::BASIC
    assert_roundtrip TestFiles::BOINGBOING
    assert_roundtrip TestFiles::CY0
  end

  # <link> is emitted as an empty tag by the HTML parser but keeps its
  # content under the XML parser.
  def test_fixup_link
    doc = %{<?xml version="1.0" encoding="UTF-8"?><rss><channel><link>ht</link></channel></rss>}
    assert_roundtrip doc
    # Expected value first, so a failure message labels the two sides correctly.
    assert_equal %{<?xml version="1.0" encoding="UTF-8"?><rss><channel><link />ht</channel></rss>},
                 Hpricot(doc).to_s
    assert_equal %{<?xml version="1.0" encoding="UTF-8"?><rss><channel><link>ht</link></channel></rss>},
                 Hpricot.XML(doc).to_s
  end

  # Attribute values are unescaped when read and escaped when written.
  # The source markup and the raw/serialized expectations must therefore
  # carry the entity-encoded forms (&amp;, &quot;).
  def test_escaping_of_attrs
    # ampersands in URLs
    str = %{<a href="http://google.com/search?q=hpricot&amp;l=en">Google</a>}
    link = (doc = Hpricot(str)).at(:a)
    assert_equal "http://google.com/search?q=hpricot&l=en", link['href']
    assert_equal "http://google.com/search?q=hpricot&l=en", link.attributes['href']
    assert_equal "http://google.com/search?q=hpricot&l=en", link.get_attribute('href')
    assert_equal "http://google.com/search?q=hpricot&amp;l=en", link.raw_attributes['href']
    assert_equal str, doc.to_html

    # alter the url
    link['href'] = "javascript:alert(\"AGGA-KA-BOO!\")"
    assert_equal %{<a href="javascript:alert(&quot;AGGA-KA-BOO!&quot;)">Google</a>}, doc.to_html
  end
end
|
data/test/test_xml.rb
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
|
|
3
|
+
require 'test/unit'
|
|
4
|
+
require 'hpricot'
|
|
5
|
+
require 'load_files'
|
|
6
|
+
|
|
7
|
+
# Tests for Hpricot's XML parsing mode (Hpricot::XML).
class TestParser < Test::Unit::TestCase
  # normally, the link tags are empty HTML tags.
  # contributed by laudney.
  def test_normally_empty
    feed = Hpricot::XML("<rss><channel><title>this is title</title><link>http://fake.com</link></channel></rss>")
    channel = feed/:rss/:channel
    assert_equal "this is title", (channel/:title).text
    assert_equal "http://fake.com", (channel/:link).text
  end

  # make sure XML doesn't get downcased
  def test_casing
    feed = Hpricot::XML(TestFiles::WHY)
    update_period = feed.at("sy:updatePeriod")
    assert_equal "hourly", update_period.inner_html
    assert_equal 1, (feed/"guid[@isPermaLink]").length
  end

  # be sure tags named "text" are ok
  def test_text_tags
    feed = Hpricot::XML("<feed><title>City Poisoned</title><text>Rita Lee has poisoned Brazil.</text></feed>")
    title = feed/"title"
    assert_equal "City Poisoned", title.text
  end
end
|