webtranslateit-hpricot 0.9.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +15 -0
  3. data/CHANGELOG +122 -0
  4. data/COPYING +18 -0
  5. data/README.md +295 -0
  6. data/Rakefile +237 -0
  7. data/ext/fast_xs/FastXsService.java +1123 -0
  8. data/ext/fast_xs/extconf.rb +4 -0
  9. data/ext/fast_xs/fast_xs.c +210 -0
  10. data/ext/hpricot_scan/HpricotCss.java +850 -0
  11. data/ext/hpricot_scan/HpricotScanService.java +2085 -0
  12. data/ext/hpricot_scan/MANIFEST +0 -0
  13. data/ext/hpricot_scan/extconf.rb +9 -0
  14. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  15. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  16. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  17. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  18. data/ext/hpricot_scan/hpricot_scan.c +6848 -0
  19. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  20. data/ext/hpricot_scan/hpricot_scan.java.rl +1173 -0
  21. data/ext/hpricot_scan/hpricot_scan.rl +911 -0
  22. data/extras/hpricot.png +0 -0
  23. data/hpricot.gemspec +18 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +217 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +95 -0
  35. data/lib/hpricot.rb +26 -0
  36. data/setup.rb +1585 -0
  37. data/test/files/basic.xhtml +17 -0
  38. data/test/files/boingboing.html +2266 -0
  39. data/test/files/cy0.html +3653 -0
  40. data/test/files/immob.html +400 -0
  41. data/test/files/pace_application.html +1320 -0
  42. data/test/files/tenderlove.html +16 -0
  43. data/test/files/uswebgen.html +220 -0
  44. data/test/files/utf8.html +1054 -0
  45. data/test/files/week9.html +1723 -0
  46. data/test/files/why.xml +19 -0
  47. data/test/load_files.rb +7 -0
  48. data/test/nokogiri-bench.rb +64 -0
  49. data/test/test_alter.rb +96 -0
  50. data/test/test_builder.rb +37 -0
  51. data/test/test_parser.rb +496 -0
  52. data/test/test_paths.rb +25 -0
  53. data/test/test_preserved.rb +88 -0
  54. data/test/test_xml.rb +28 -0
  55. metadata +106 -0
@@ -0,0 +1,496 @@
1
+ # -*- coding: utf-8 -*-
2
+ #!/usr/bin/env ruby
3
+
4
+ require 'test/unit'
5
+ require 'hpricot'
6
+ require 'load_files'
7
+
8
+ class TestParser < Test::Unit::TestCase
9
+ def test_set_attr
10
+ @basic = Hpricot.parse(TestFiles::BASIC)
11
+ @basic.search('//p').set('class', 'para')
12
+ assert_equal 4, @basic.search('//p').length
13
+ assert_equal 4, @basic.search('//p').find_all { |x| x['class'] == 'para' }.length
14
+ end
15
+
16
+ # Test creating a new element
17
+ def test_new_element
18
+ elem = Hpricot::Elem.new('form')
19
+ assert_not_nil(elem)
20
+ assert_not_nil(elem.attributes)
21
+ end
22
+
23
+ def test_scan_text
24
+ assert_equal 'FOO', Hpricot.make("FOO").children.first.content
25
+ end
26
+
27
+ def test_filter_by_attr
28
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
29
+
30
+ # this link is escaped in the doc
31
+ link = 'http://www.youtube.com/watch?v=TvSNXyNw26g&search=chris%20ware'
32
+ assert_equal link, @boingboing.at("a[@href='#{link}']")['href']
33
+ end
34
+
35
+ def test_filter_contains
36
+ @basic = Hpricot.parse(TestFiles::BASIC)
37
+ assert_equal '<title>Sample XHTML</title>', @basic.search("title:contains('Sample')").to_s
38
+ end
39
+
40
+ def test_get_element_by_id
41
+ @basic = Hpricot.parse(TestFiles::BASIC)
42
+ assert_equal 'link1', @basic.get_element_by_id('link1')['id']
43
+ assert_equal 'link1', @basic.get_element_by_id('body1').get_element_by_id('link1').get_attribute('id')
44
+ end
45
+
46
+ def test_get_element_by_tag_name
47
+ @basic = Hpricot.parse(TestFiles::BASIC)
48
+ assert_equal 'link1', @basic.get_elements_by_tag_name('a')[0].get_attribute('id')
49
+ assert_equal 'link1', @basic.get_elements_by_tag_name('body')[0].get_element_by_id('link1').get_attribute('id')
50
+ end
51
+
52
+ def test_get_elements_by_tag_name_star
53
+ simple = Hpricot.parse("<div><p id='first'>First</p><p id='second'>Second</p></div>")
54
+ assert_equal 3, simple.get_elements_by_tag_name("*").size
55
+ assert_equal 1, simple.get_elements_by_tag_name("div").size
56
+ assert_equal 2, simple.get_elements_by_tag_name("p").size
57
+ end
58
+
59
+ def test_output_basic
60
+ @basic = Hpricot.parse(TestFiles::BASIC)
61
+ @basic2 = Hpricot.parse(@basic.inner_html)
62
+ scan_basic @basic2
63
+ end
64
+
65
+ def test_scan_basic
66
+ @basic = Hpricot.parse(TestFiles::BASIC)
67
+ scan_basic @basic
68
+ end
69
+
70
+ def scan_basic doc
71
+ assert_kind_of Hpricot::XMLDecl, doc.children.first
72
+ assert_not_equal doc.children.first.to_s, doc.children[1].to_s
73
+ assert_equal 'link1', doc.at('#link1')['id']
74
+ assert_equal 'link1', doc.at("p a")['id']
75
+ assert_equal 'link1', (doc/:p/:a).first['id']
76
+ assert_equal 'link1', doc.search('p').at('a').get_attribute('id')
77
+ assert_equal 'link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id')
78
+ assert_equal (doc/'p')[2], (doc/'p').filter(':nth(2)')[0]
79
+ assert_equal (doc/'p')[2], (doc/'p').filter('[3]')[0]
80
+ assert_equal 4, (doc/'p').filter('*').length
81
+ assert_equal 4, (doc/'p').filter('* *').length
82
+ eles = (doc/'p').filter('.ohmy')
83
+ assert_equal 1, eles.length
84
+ assert_equal 'ohmy', eles.first.get_attribute('class')
85
+ assert_equal 3, (doc/'p:not(.ohmy)').length
86
+ assert_equal 3, (doc/'p').not('.ohmy').length
87
+ assert_equal 3, (doc/'p').not(eles.first).length
88
+ assert_equal 2, (doc/'p').filter('[@class]').length
89
+ assert_equal 'last final', (doc/'p[@class~="final"]').first.get_attribute('class')
90
+ assert_equal 1, (doc/'p').filter('[@class~="final"]').length
91
+ assert_equal 2, (doc/'p > a').length
92
+ assert_equal 1, (doc/'p.ohmy > a').length
93
+ assert_equal 2, (doc/'p / a').length
94
+ assert_equal 2, (doc/'link ~ link').length
95
+ assert_equal 3, (doc/'title ~ link').length
96
+ assert_equal 5, (doc/"//p/text()").length
97
+ assert_equal 6, (doc/"//p[a]//text()").length
98
+ assert_equal 2, (doc/"//p/a/text()").length
99
+ end
100
+
101
+ def test_positional
102
+ h = Hpricot( "<div><br/><p>one</p><p>two</p></div>" )
103
+ assert_equal "<p>one</p>", h.search("//div/p:eq(0)").to_s
104
+ assert_equal "<p>one</p>", h.search("//div/p:first").to_s
105
+ assert_equal "<p>one</p>", h.search("//div/p:first()").to_s
106
+ end
107
+
108
+ def test_pace
109
+ doc = Hpricot(TestFiles::PACE_APPLICATION)
110
+ assert_equal 'get', doc.at('form[@name=frmSect11]')['method']
111
+ # assert_equal '2', doc.at('#hdnSpouse')['value']
112
+ end
113
+
114
+ def test_scan_boingboing
115
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
116
+ assert_equal 60, (@boingboing/'p.posted').length
117
+ assert_equal 1, @boingboing.search("//a[@name='027906']").length
118
+ assert_equal 10, @boingboing.search("script comment()").length
119
+ assert_equal 3, @boingboing.search("a[text()*='Boing']").length
120
+ assert_equal 1, @boingboing.search("h3[text()='College kids reportedly taking more smart drugs']").length
121
+ assert_equal 0, @boingboing.search("h3[text()='College']").length
122
+ assert_equal 60, @boingboing.search("h3").length
123
+ assert_equal 59, @boingboing.search("h3[text()!='College kids reportedly taking more smart drugs']").length
124
+ assert_equal 17, @boingboing.search("h3[text()$='s']").length
125
+ assert_equal 116, @boingboing.search("p[text()]").length
126
+ assert_equal 211, @boingboing.search("p").length
127
+ end
128
+
129
+ def test_reparent
130
+ doc = Hpricot(%{<div id="blurb_1"></div>})
131
+ div1 = doc.search('#blurb_1')
132
+ div1.before('<div id="blurb_0"></div>')
133
+
134
+ div0 = doc.search('#blurb_0')
135
+ div0.before('<div id="blurb_a"></div>')
136
+
137
+ assert_equal 'div', doc.at('#blurb_1').name
138
+ end
139
+
140
+ def test_siblings
141
+ @basic = Hpricot.parse(TestFiles::BASIC)
142
+ t = @basic.at(:title)
143
+ e = t.next_sibling
144
+ assert_equal 'test1.css', e['href']
145
+ assert_equal 'title', e.previous_sibling.name
146
+ end
147
+
148
+ def test_css_negation
149
+ @basic = Hpricot.parse(TestFiles::BASIC)
150
+ assert_equal 3, (@basic/'p:not(.final)').length
151
+ end
152
+
153
+ def test_remove_attribute
154
+ @basic = Hpricot.parse(TestFiles::BASIC)
155
+ (@basic/:p).each { |ele| ele.remove_attribute('class') }
156
+ assert_equal 0, (@basic/'p[@class]').length
157
+ end
158
+
159
+ def test_abs_xpath
160
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
161
+ assert_equal 60, @boingboing.search("/html/body//p[@class='posted']").length
162
+ assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
163
+ assert_equal 18, @boingboing.search("//script").length
164
+ divs = @boingboing.search("//script/../div")
165
+ assert_equal 2, divs.length
166
+ imgs = @boingboing.search('//div/p/a/img')
167
+ assert_equal 16, imgs.length
168
+ assert_equal 16, @boingboing.search('//div').search('p/a/img').length
169
+ assert imgs.all? { |x| x.name == 'img' }
170
+ end
171
+
172
+ def test_predicates
173
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
174
+ assert_equal 2, @boingboing.search('//link[@rel="alternate"]').length
175
+ p_imgs = @boingboing.search('//div/p[/a/img]')
176
+ assert_equal 16, p_imgs.length
177
+ assert p_imgs.all? { |x| x.name == 'p' }
178
+ p_imgs = @boingboing.search('//div/p[a/img]')
179
+ assert_equal 16, p_imgs.length
180
+ assert p_imgs.all? { |x| x.name == 'p' }
181
+ assert_equal 1, @boingboing.search('//input[@checked]').length
182
+ end
183
+
184
+ def test_tag_case
185
+ @tenderlove = Hpricot.parse(TestFiles::TENDERLOVE)
186
+ assert_equal 2, @tenderlove.search('//a').length
187
+ assert_equal 3, @tenderlove.search('//area').length
188
+ assert_equal 2, @tenderlove.search('//meta').length
189
+ end
190
+
191
+ def test_alt_predicates
192
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
193
+ assert_equal 1, @boingboing.search('//table/tr:last').length
194
+
195
+ @basic = Hpricot.parse(TestFiles::BASIC)
196
+ assert_equal "<p>The third paragraph</p>",
197
+ @basic.search('p:eq(2)').to_html
198
+ assert_equal '<p class="last final"><b>THE FINAL PARAGRAPH</b></p>',
199
+ @basic.search('p:last').to_html
200
+ assert_equal 'last final', @basic.search('//p:last-of-type').first.get_attribute('class')
201
+ end
202
+
203
# ticket #63: +after+ is given two sibling paragraphs and both must end up,
# in order, directly behind the target element.
def test_insert_after
  doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
  (doc/'div').each do |element|
    element.after('<p>Paragraph 1</p><p>Paragraph 2</p>')
  end
  # Expected value first, so a failure report labels the two sides correctly.
  assert_equal '<html><body><div id="a-div"></div><p>Paragraph 1</p><p>Paragraph 2</p></body></html>', doc.to_html
end
210
+
211
# ticket #61: +before+ is given two sibling paragraphs and both must end up,
# in order, directly in front of the target element.
def test_insert_before
  doc = Hpricot('<html><body><div id="a-div"></div></body></html>')
  (doc/'div').each do |element|
    element.before('<p>Paragraph 1</p><p>Paragraph 2</p>')
  end
  # Expected value first, so a failure report labels the two sides correctly.
  assert_equal '<html><body><p>Paragraph 1</p><p>Paragraph 2</p><div id="a-div"></div></body></html>', doc.to_html
end
218
+
219
+ def test_many_paths
220
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
221
+ assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
222
+ assert_equal 18, @boingboing.search('//div/p[a/img]|//link[@rel="alternate"]').length
223
+ end
224
+
225
+ def test_stacked_search
226
+ @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
227
+ assert_kind_of Hpricot::Elements, @boingboing.search('//div/p').search('a img')
228
+ end
229
+
230
# An attribute written as HREF in the markup is expected to be readable only
# under its lowercase name, whether the key is a Symbol or a String.
def test_attr_casing
  doc = Hpricot("<a HREF='a'>A simple <b>test</b> string.</a>")
  anchor = doc % :a
  # Expected value first; assert_nil states the "absent" expectation directly.
  assert_equal 'a', anchor[:href]
  assert_nil anchor[:HREF]
  assert_equal 'a', anchor['href']
  assert_nil anchor['HREF']
end
237
+
238
+ def test_class_search
239
+ # test case sent by Chih-Chao Lam
240
+ doc = Hpricot("<div class=xyz'>abc</div>")
241
+ assert_equal 1, doc.search(".xyz").length
242
+ doc = Hpricot("<div class=xyz>abc</div><div class=abc>xyz</div>")
243
+ assert_equal 1, doc.search(".xyz").length
244
+ assert_equal 4, doc.search("*").length
245
+ end
246
+
247
+ def test_kleene_star
248
+ # bug noticed by raja bhatia
249
+ doc = Hpricot("<span class='small'>1</span><div class='large'>2</div><div class='small'>3</div><span class='blue large'>4</span>")
250
+ assert_equal 2, doc.search("*[@class*='small']").length
251
+ assert_equal 2, doc.search("*.small").length
252
+ assert_equal 2, doc.search(".small").length
253
+ assert_equal 2, doc.search(".large").length
254
+ end
255
+
256
+ def test_empty_comment
257
+ doc = Hpricot("<p><!----></p>")
258
+ assert doc.children[0].children[0].comment?
259
+ doc = Hpricot("<p><!-- --></p>")
260
+ assert doc.children[0].children[0].comment?
261
+ end
262
+
263
+ def test_body_newlines
264
+ @immob = Hpricot.parse(TestFiles::IMMOB)
265
+ body = @immob.at(:body)
266
+ {'background' => '', 'bgcolor' => '#ffffff', 'text' => '#000000', 'marginheight' => '10',
267
+ 'marginwidth' => '10', 'leftmargin' => '10', 'topmargin' => '10', 'link' => '#000066',
268
+ 'alink' => '#ff6600', 'hlink' => "#ff6600", 'vlink' => "#000000"}.each do |k, v|
269
+ assert_equal v, body[k]
270
+ end
271
+ end
272
+
273
+ def test_nested_twins
274
+ @doc = Hpricot("<div>Hi<div>there</div></div>")
275
+ assert_equal 1, (@doc/"div div").length
276
+ end
277
+
278
+ def test_wildcard
279
+ @basic = Hpricot.parse(TestFiles::BASIC)
280
+ assert_equal 3, (@basic/"*[@id]").length
281
+ assert_equal 3, (@basic/"//*[@id]").length
282
+ end
283
+
284
+ def test_javascripts
285
+ @immob = Hpricot.parse(TestFiles::IMMOB)
286
+ assert_equal 3, (@immob/:script)[0].inner_html.scan(/<LINK/).length
287
+ end
288
+
289
+ def test_nested_scripts
290
+ @week9 = Hpricot.parse(TestFiles::WEEK9)
291
+ assert_equal 14, (@week9/"a").find_all { |x| x.inner_html.include? "GameCenter" }.length
292
+ end
293
+
294
+ def test_uswebgen
295
+ @uswebgen = Hpricot.parse(TestFiles::USWEBGEN)
296
+ # sent by brent beardsley, hpricot 0.3 had problems with all the links.
297
+ assert_equal 67, (@uswebgen/:a).length
298
+ end
299
+
300
+ def test_mangled_tags
301
+ [%{<html><form name='loginForm' method='post' action='/units/a/login/1,13088,779-1,00.html'?URL=></form></html>},
302
+ %{<html><form name='loginForm' ?URL= method='post' action='/units/a/login/1,13088,779-1,00.html'></form></html>},
303
+ %{<html><form name='loginForm'?URL= ?URL= method='post' action='/units/a/login/1,13088,779-1,00.html'?URL=></form></html>},
304
+ %{<html><form name='loginForm' method='post' action='/units/a/login/1,13088,779-1,00.html' ?URL=></form></html>}].
305
+ each do |str|
306
+ doc = Hpricot(str)
307
+ assert_equal 1, (doc/:form).length
308
+ assert_equal '/units/a/login/1,13088,779-1,00.html', doc.at("form")['action']
309
+ end
310
+ end
311
+
312
+ def test_procins
313
+ doc = Hpricot("<?php print('hello') ?>\n<?xml blah='blah'?>")
314
+ assert_equal "php", doc.children[0].target
315
+ assert_equal "blah='blah'", doc.children[2].content
316
+ end
317
+
318
# Parsing an attribute value of 44 lines, each 2000 characters long, must
# complete without raising.
def test_no_buffer_error
  viewstate = (("X" * 2000) + "\n") * 44
  # Make the "must not raise" expectation explicit, as test_nil_attr does.
  assert_nothing_raised do
    Hpricot(%{<p>\n\n<input type="hidden" name="__VIEWSTATE" value="#{viewstate}" />\n\n</p>})
  end
end
321
+
322
+ def test_youtube_attr
323
+ str = <<-edoc
324
+ <html><body>
325
+ Lorem ipsum. Jolly roger, ding-dong sing-a-long
326
+ <object width="425" height="350">
327
+ <param name="movie" value="http://www.youtube.com/v/NbDQ4M_cuwA"></param>
328
+ <param name="wmode" value="transparent"></param>
329
+ <embed src="http://www.youtube.com/v/NbDQ4M_cuwA"
330
+ type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
331
+ </embed>
332
+ </object>
333
+ Check out my posting, I have bright mice in large clown cars.
334
+ <object width="425" height="350">
335
+ <param name="movie" value="http://www.youtube.com/v/foobar"></param>
336
+ <param name="wmode" value="transparent"></param>
337
+ <embed src="http://www.youtube.com/v/foobar"
338
+ type="application/x-shockwave-flash" wmode="transparent" width="425" height="350">
339
+ </embed>
340
+ </object>
341
+ </body></html?
342
+ edoc
343
+ doc = Hpricot(str)
344
+ assert_equal "http://www.youtube.com/v/NbDQ4M_cuwA",
345
+ doc.at("//object/param[@value='http://www.youtube.com/v/NbDQ4M_cuwA']")['value']
346
+ end
347
+
348
+ # ticket #84 by jamezilla
349
+ def test_screwed_xmlns
350
+ doc = Hpricot(<<-edoc)
351
+ <?xml:namespace prefix = cwi />
352
+ <html><body>HAI</body></html>
353
+ edoc
354
+ assert_equal "HAI", doc.at("body").inner_text
355
+ end
356
+
357
+ # http://github.com/hpricot/hpricot/issues#issue/28
358
+ def test_invalid_inner_text
359
+ assert_equal "A", Hpricot('A&B;').inner_text[0...1]
360
+ end
361
+
362
+ # http://github.com/hpricot/hpricot/issues#issue/25
363
# Collecting inner_text from a document that mixes raw UTF-8 bytes with a
# named entity must complete without raising.
def test_encoding_compatibility_error
  # Make the "must not raise" expectation explicit, as test_nil_attr does.
  assert_nothing_raised do
    Hpricot("<p>\xC3\x9Cber</p><p>M&sup3;</p>").inner_text
  end
end
366
+
367
+ # Reported by Jonathan Nichols on the Hpricot list (24 May 2007)
368
+ def test_self_closed_form
369
+ doc = Hpricot(<<-edoc)
370
+ <body>
371
+ <form action="/loginRegForm" name="regForm" method="POST" />
372
+ <input type="button">
373
+ </form>
374
+ </body>
375
+ edoc
376
+ assert_equal "button", doc.at("//form/input")['type']
377
+ end
378
+
379
+ def test_escaped_quote
380
+ # Backslash '\' is not an escape character in HTML.
381
+ doc = Hpricot("<div><input type='text' value='C:\\dir\\' /><p id='test_id'>test</p></div>")
382
+ assert_equal "C:\\dir\\", doc.at("input")["value"]
383
+ doc = Hpricot('<div><input type="text" value="C:\\dir\\" /><p id="test_id">test</p></div>')
384
+ assert_equal "C:\\dir\\", doc.at("input")["value"]
385
+ end
386
+
387
+ def test_filters
388
+ @basic = Hpricot.parse(TestFiles::BASIC)
389
+ assert_equal 0, (@basic/"title:parent").size
390
+ assert_equal 3, (@basic/"p:parent").size
391
+ assert_equal 3, (@basic/"link:empty").size
392
+ assert_equal 1, (@basic/"span:empty").size
393
+ end
394
+
395
+ def test_keep_cdata
396
+ str = %{<script> /*<![CDATA[*/
397
+ /*]]>*/ </script>}
398
+ assert_equal str, Hpricot(str).to_html
399
+ end
400
+
401
+ def test_namespace
402
+ chunk = <<-END
403
+ <a xmlns:t="http://www.nexopia.com/dev/template">
404
+ <t:sam>hi </t:sam>
405
+ </a>
406
+ END
407
+ doc = Hpricot::XML(chunk)
408
+ assert (doc/"//t:sam").size > 0 # at least this should probably work
409
+ # assert (doc/"//sam").size > 0 # this would be nice
410
+ end
411
+
412
+ def test_uxs_ignores_non_entities
413
+ assert_equal 'abc', Hpricot.uxs('abc')
414
+ end
415
+
416
+ def test_uxs_handles_gt_lt_amp_quot
417
+ assert_equal '"&<>', Hpricot.uxs('&quot;&amp;&lt;&gt;')
418
+ end
419
+
420
+ def test_uxs_handles_numeric_values
421
+ if String.method_defined? :encoding
422
+ assert_equal "é", Hpricot.uxs('&#233;')
423
+ else
424
+ assert_equal "\303\251", Hpricot.uxs('&#233;')
425
+ end
426
+ end
427
+
428
+ def test_uxs_handles_hexadecimal_values
429
+ if String.method_defined? :encoding
430
+ assert_equal "é", Hpricot.uxs('&#xe9;')
431
+ else
432
+ assert_equal "\303\251", Hpricot.uxs('&#xe9;')
433
+ end
434
+ end
435
+
436
+ def test_uxs_handles_entities
437
+ if String.method_defined? :encoding
438
+ assert_equal "é", Hpricot.uxs('&eacute;')
439
+ else
440
+ assert_equal "\303\251", Hpricot.uxs('&eacute;')
441
+ end
442
+ end
443
+
444
+ def test_cdata_inner_text
445
+ xml = Hpricot.XML(%{
446
+ <peon>
447
+ <id>96586</id>
448
+ <stdout><![CDATA[This is STDOUT]]></stdout>
449
+ <stderr><!-- IGNORE --><![CDATA[This is]]> STDERR</stderr>
450
+ </peon>})
451
+ assert_equal "This is STDOUT", (xml/:peon/:stdout).inner_text
452
+ assert_equal "This is STDERR", (xml/:peon/:stderr).inner_text
453
+ end
454
+
455
+ def test_parsing_html_with_noscript
456
+ doc = Hpricot(<<-edoc)
457
+ <html>
458
+ <head>
459
+ <noscript>
460
+ <meta http-equiv="refresh" content="0; url=http://www.yoursite.com/noscripts.html"/>
461
+ </noscript>
462
+ <meta name="verification" content="7ff5e90iormq5niy6x98j75" />
463
+ </head>
464
+ <body>
465
+ <h1>Testing</h1>
466
+ </body>
467
+ </html>
468
+
469
+ edoc
470
+ assert_equal "7ff5e90iormq5niy6x98j75", doc.at("/html/head/meta[@name='verification']")['content']
471
+ end
472
+
473
+ def test_nil_attr
474
+ # parsing this file was failing on JRuby
475
+ assert_nothing_raised {Hpricot.parse(TestFiles::BNQT)}
476
+ end
477
+
478
+ def test_unknown_tag
479
+ header = <<-edoc
480
+ <header id="htest">
481
+ <div id="dtest">blah</div>
482
+ </header>
483
+ edoc
484
+ doc = Hpricot(<<-edoc)
485
+ <div>#{header}</div>
486
+ edoc
487
+ assert_equal header.chomp, (doc/"#htest").to_html
488
+ end
489
+
490
+ def test_nested_unknown_tags
491
+ header =
492
+ %(<header id="htest"><div id="dtest"><nav>blah</nav></div></header>)
493
+ doc = Hpricot(%(<div>#{header}</div>))
494
+ assert_equal header.chomp, (doc/"#htest").to_html
495
+ end
496
+ end
@@ -0,0 +1,25 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require 'hpricot'
5
+ require 'load_files'
6
+
7
# Path tests: elements must be locatable again through the paths they
# generate, and selectors must cope with square brackets inside quoted
# attribute values.
class TestParser < Test::Unit::TestCase
  # For each selector, the element it finds must be found again through that
  # element's own css_path and xpath.
  def test_roundtrip
    @basic = Hpricot.parse(TestFiles::BASIC)
    %w[link link[2] body #link1 a p.ohmy].each do |css_sel|
      ele = @basic.at(css_sel)
      # Report a non-matching selector as a failure naming the selector,
      # rather than as a NoMethodError raised by calling css_path on nil.
      assert_not_nil ele, "no element matched #{css_sel.inspect}"
      assert_equal ele, @basic.at(ele.css_path)
      assert_equal ele, @basic.at(ele.xpath)
    end
  end

  # Attribute predicates whose quoted value itself contains '[' and ']'.
  def test_attr_brackets
    doc = Hpricot('<input name="vendor[porkpies]"/>')
    assert_equal 1, (doc/'input[@name^="vendor[porkpies]"]').length
    assert_equal 1, (doc/'input[@name="vendor[porkpies]"]').length
    assert_equal 0, (doc/'input[@name$="]]]]]"]').length

    doc = Hpricot('<input name="vendor[porkpies][meaty]"/>')
    assert_equal 1, (doc/'input[@name^="vendor[porkpies][meaty]"]').length
  end
end
@@ -0,0 +1,88 @@
1
+ # -*- coding: utf-8 -*-
2
+ #!/usr/bin/env ruby
3
+
4
+ require 'test/unit'
5
+ require 'hpricot'
6
+ require 'load_files'
7
+
8
# Backfill String#lines on Rubies whose String does not provide it.
unless ''.respond_to?(:lines)
  require 'enumerator'

  class String
    # Returns an enumerator that walks the receiver via each_line.
    def lines
      enum_for(:each_line)
    end
  end
end
16
+
17
# Tests that serialising a document with +to_original_html+ keeps the source
# markup, and that text and attribute values are escaped as expected.
class TestPreserved < Test::Unit::TestCase
  # Parses +str+, yields the document to an optional block, and checks that
  # +to_original_html+ matches +str+ line by line.
  #
  # NOTE: lines are paired with +zip+ on the input's lines, so output lines
  # beyond the input's line count are not compared.
  def assert_roundtrip(str)
    doc = Hpricot(str)
    yield doc if block_given?
    str2 = doc.to_original_html
    # Feature-detect rather than match RUBY_VERSION against /^1.9/: that
    # pattern left the dot unescaped and skipped every Ruby after 1.9.
    str2.force_encoding('UTF-8') if str2.respond_to?(:force_encoding)
    str.lines.zip(str2.lines).each do |s1, s2|
      assert_equal s1, s2
    end
  end

  # Parses +str2+, yields the document to an optional block, and expects
  # +to_original_html+ to equal +str1+.
  def assert_html(str1, str2)
    doc = Hpricot(str2)
    yield doc if block_given?
    assert_equal str1, doc.to_original_html
  end

  # Mis-nested markup is kept as written; only the changed start tag differs.
  def test_simple
    str = "<p>Hpricot is a <b>you know <i>uh</b> fine thing.</p>"
    assert_html str, str
    assert_html "<p class=\"new\">Hpricot is a <b>you know <i>uh</b> fine thing.</p>", str do |doc|
      (doc/:p).set('class', 'new')
    end
  end

  # Removing, re-attributing and wrapping elements shows up in the output
  # while the untouched markup keeps its original quoting.
  def test_parent
    str = "<html><base href='/'><head><title>Test</title></head><body><div id='wrap'><p>Paragraph one.</p><p>Paragraph two.</p></div></body></html>"
    assert_html str, str
    assert_html "<html><base href='/'><body><div id=\"all\"><div><p>Paragraph one.</p></div><div><p>Paragraph two.</p></div></div></body></html>", str do |doc|
      (doc/:head).remove
      (doc/:div).set('id', 'all')
      (doc/:p).wrap('<div></div>')
    end
  end

  # Text following the image must come back with its typographic apostrophe.
  def test_escaping_of_contents
    doc = Hpricot(TestFiles::BOINGBOING)
    assert_equal "Fukuda’s Automatic Door opens around your body as you pass through it. The idea is to save energy and keep the room clean.", doc.at("img[@alt='200606131240']").next.to_s.strip
  end

  # Whole fixture files must round-trip unchanged.
  def test_files
    assert_roundtrip TestFiles::BASIC
    assert_roundtrip TestFiles::BOINGBOING
    assert_roundtrip TestFiles::CY0
  end

  # In HTML mode <link> is emitted as an empty tag followed by its text; in
  # XML mode the element keeps its content.
  def test_fixup_link
    doc = %{<?xml version="1.0" encoding="UTF-8"?><rss><channel><link>ht</link></channel></rss>}
    assert_roundtrip doc
    # Expected value first, so a failure report labels the two sides correctly.
    assert_equal %{<?xml version="1.0" encoding="UTF-8"?><rss><channel><link />ht</channel></rss>},
                 Hpricot(doc).to_s
    assert_equal %{<?xml version="1.0" encoding="UTF-8"?><rss><channel><link>ht</link></channel></rss>},
                 Hpricot.XML(doc).to_s
  end

  # Attribute accessors return the unescaped value, raw_attributes the escaped
  # source text, and a newly assigned value is escaped on output.
  def test_escaping_of_attrs
    # ampersands in URLs
    str = %{<a href="http://google.com/search?q=hpricot&amp;l=en">Google</a>}
    doc = Hpricot(str)
    link = doc.at(:a)
    assert_equal "http://google.com/search?q=hpricot&l=en", link['href']
    assert_equal "http://google.com/search?q=hpricot&l=en", link.attributes['href']
    assert_equal "http://google.com/search?q=hpricot&l=en", link.get_attribute('href')
    assert_equal "http://google.com/search?q=hpricot&amp;l=en", link.raw_attributes['href']
    assert_equal str, doc.to_html

    # alter the url
    link['href'] = "javascript:alert(\"AGGA-KA-BOO!\")"
    assert_equal %{<a href="javascript:alert(&quot;AGGA-KA-BOO!&quot;)">Google</a>}, doc.to_html
  end
end
data/test/test_xml.rb ADDED
@@ -0,0 +1,28 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require 'hpricot'
5
+ require 'load_files'
6
+
7
# XML-mode parsing tests.
class TestParser < Test::Unit::TestCase
  # <link> is normally an empty HTML tag; parsed as XML it must keep its text.
  # Contributed by laudney.
  def test_normally_empty
    feed = Hpricot::XML('<rss><channel><title>this is title</title><link>http://fake.com</link></channel></rss>')
    channel = feed/:rss/:channel
    assert_equal('this is title', (channel/:title).text)
    assert_equal('http://fake.com', (channel/:link).text)
  end

  # Mixed-case element and attribute names must stay addressable as written.
  def test_casing
    why = Hpricot::XML(TestFiles::WHY)
    period = why.at('sy:updatePeriod')
    assert_equal('hourly', period.inner_html)
    permalinked = why/'guid[@isPermaLink]'
    assert_equal(1, permalinked.length)
  end

  # A document containing an element named "text" must still parse, and the
  # sibling <title> must still be readable.
  def test_text_tags
    feed = Hpricot::XML('<feed><title>City Poisoned</title><text>Rita Lee has poisoned Brazil.</text></feed>')
    titles = feed/'title'
    assert_equal('City Poisoned', titles.text)
  end
end
+ end