hpricot 0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
# Loads every HTML/XHTML fixture found in the +files+ directory next to this
# file into a constant on TestFiles. The constant is named after the upcased
# file name without its extension (files/basic.xhtml => TestFiles::BASIC) and
# holds the file's full contents as a String.
module TestFiles
  # Resolve the fixtures relative to this file instead of wrapping the glob in
  # Dir.chdir: chdir changes the working directory of the whole process while
  # its block runs, which is an unwanted side effect of merely requiring a
  # test helper.
  fixture_dir = File.join(File.dirname(__FILE__), 'files')

  Dir[File.join(fixture_dir, '*.{html,xhtml}')].each do |path|
    # File.read (rather than IO.read) only ever opens the path as a file.
    # A fixture whose base name is not a valid constant name makes const_set
    # raise a NameError that mentions the offending name.
    const_set File.basename(path, '.*').upcase, File.read(path)
  end
end
@@ -0,0 +1,141 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require 'hpricot'
5
+ require 'load_files'
6
+
7
# Exercises Hpricot's parser and its CSS / XPath searching against the HTML
# fixtures exposed as constants on TestFiles. All expected counts below are
# tied to the contents of those fixture files.
class TestParser < Test::Unit::TestCase
  # Parses each fixture into its own instance variable for the tests to use.
  def setup
    @basic      = Hpricot.parse(TestFiles::BASIC)
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    @immob      = Hpricot.parse(TestFiles::IMMOB)
    @uswebgen   = Hpricot.parse(TestFiles::USWEBGEN)
    # @utf8 = Hpricot.parse(TestFiles::UTF8)
  end

  # Disabled test, kept as found:
  # def test_set_attr
  #   @basic.search('//p').set('class', 'para')
  #   assert_equal '', @basic.search('//p').map { |x| x.attributes }
  # end

  # A bare string handed to Hpricot.make yields a first node whose content
  # is that string.
  def test_scan_text
    assert_equal('FOO', Hpricot.make("FOO").first.content)
  end

  # get_element_by_id works on the document and on an element found by id.
  def test_get_element_by_id
    assert_equal('link1', @basic.get_element_by_id('link1')['id'])
    assert_equal('link1', @basic.get_element_by_id('body1').get_element_by_id('link1').get_attribute('id'))
  end

  # get_elements_by_tag_name returns an indexable collection of elements.
  def test_get_element_by_tag_name
    assert_equal('link1', @basic.get_elements_by_tag_name('a')[0].get_attribute('id'))
    assert_equal('link1', @basic.get_elements_by_tag_name('body')[0].get_element_by_id('link1').get_attribute('id'))
  end

  # Re-parses the document's own inner_html and runs the same checks on it.
  def test_output_basic
    @basic2 = Hpricot.parse(@basic.inner_html)
    scan_basic(@basic2)
  end

  # Runs the shared checks directly on the parsed basic fixture.
  def test_scan_basic
    scan_basic(@basic)
  end

  # Shared assertions for a parsed copy of the basic fixture.
  # Not picked up as a test on its own because its name lacks the test_ prefix.
  def scan_basic(doc)
    # Several routes to the same first link.
    assert_equal('link1', doc.at('#link1')['id'])
    assert_equal('link1', doc.at("p a")['id'])
    assert_equal('link1', (doc/:p/:a).first['id'])
    assert_equal('link1', doc.search('p').at('a').get_attribute('id'))
    assert_equal('link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id'))

    # Filtering an existing result set.
    assert_equal((doc/'p')[2], (doc/'p').filter(':nth(2)')[0])
    assert_equal(4, (doc/'p').filter('*').length)
    assert_equal(4, (doc/'p').filter('* *').length)
    ohmy_paragraphs = (doc/'p').filter('.ohmy')
    assert_equal(1, ohmy_paragraphs.length)
    assert_equal('ohmy', ohmy_paragraphs.first.get_attribute('class'))

    # Negation by selector string and by element.
    assert_equal(3, (doc/'p:not(.ohmy)').length)
    assert_equal(3, (doc/'p').not('.ohmy').length)
    assert_equal(3, (doc/'p').not(ohmy_paragraphs.first).length)

    # Attribute selectors.
    assert_equal(2, (doc/'p').filter('[@class]').length)
    assert_equal('last final', (doc/'p[@class~="final"]').first.get_attribute('class'))
    assert_equal(1, (doc/'p').filter('[@class~="final"]').length)

    # Child and sibling combinators.
    assert_equal(2, (doc/'p > a').length)
    assert_equal(1, (doc/'p.ohmy > a').length)
    assert_equal(2, (doc/'p / a').length)
    assert_equal(2, (doc/'link ~ link').length)
    assert_equal(3, (doc/'title ~ link').length)
  end

  # Class selector and attribute-predicate lookups on the boingboing fixture.
  def test_scan_boingboing
    assert_equal(60, (@boingboing/'p.posted').length)
    assert_equal(1, @boingboing.search("//a[@name='027906']").length)
  end

  # :not() with a class selector.
  def test_css_negation
    assert_equal(3, (@basic/'p:not(.final)').length)
  end

  # After stripping the class attribute from every paragraph, none match.
  def test_remove_attribute
    (@basic/:p).each do |paragraph|
      paragraph.remove_attribute('class')
    end
    assert_equal(0, (@basic/'p[@class]').length)
  end

  # Absolute XPath expressions, parent steps and chained searches.
  def test_abs_xpath
    assert_equal(60, @boingboing.search("/html/body//p[@class='posted']").length)
    assert_equal(60, @boingboing.search("/*/body//p[@class='posted']").length)
    assert_equal(18, @boingboing.search("//script").length)

    script_sibling_divs = @boingboing.search("//script/../div")
    assert_equal(2, script_sibling_divs.length)
    assert_equal(1, script_sibling_divs.search('a').length)

    linked_images = @boingboing.search('//div/p/a/img')
    assert_equal(15, linked_images.length)
    assert_equal(17, @boingboing.search('//div').search('p/a/img').length)
    assert(linked_images.all? { |node| node.name == 'img' })
  end

  # XPath predicates on attributes and on nested paths.
  def test_predicates
    assert_equal(2, @boingboing.search('//link[@rel="alternate"]').length)

    paragraphs = @boingboing.search('//div/p[/a/img]')
    assert_equal(15, paragraphs.length)
    assert(paragraphs.all? { |node| node.name == 'p' })

    paragraphs = @boingboing.search('//div/p[a/img]')
    assert_equal(18, paragraphs.length)
    assert(paragraphs.all? { |node| node.name == 'p' })

    assert_equal(1, @boingboing.search('//input[@checked]').length)
  end

  # Positional pseudo-classes: :last, :eq(n) and :last-of-type.
  def test_alt_predicates
    assert_equal(2, @boingboing.search('//table/tr:last').length)
    assert_equal(
      "<p>The third paragraph</p>",
      @basic.search('p:eq(2)').to_html
    )
    assert_equal(
      '<p class="last final"><b>THE FINAL PARAGRAPH</b></p>',
      @basic.search('p:last').to_html
    )
    assert_equal('last final', @basic.search('//p:last-of-type').first.get_attribute('class'))
  end

  # Several paths in one expression, joined by "," or by "|".
  def test_many_paths
    assert_equal(62, @boingboing.search('p.posted, link[@rel="alternate"]').length)
    assert_equal(20, @boingboing.search('//div/p[a/img]|//link[@rel="alternate"]').length)
  end

  # Every expected attribute of the immob fixture's body tag is read back
  # with the listed value.
  def test_body_newlines
    body_tag = @immob.at(:body)
    expected_attributes = {
      'background'   => '',
      'bgcolor'      => '#ffffff',
      'text'         => '#000000',
      'marginheight' => '10',
      'marginwidth'  => '10',
      'leftmargin'   => '10',
      'topmargin'    => '10',
      'link'         => '#000066',
      'alink'        => '#ff6600',
      'hlink'        => '#ff6600',
      'vlink'        => '#000000'
    }
    expected_attributes.each_pair do |attr_name, attr_value|
      assert_equal(attr_value, body_tag[attr_name])
    end
  end

  # The first script's inner_html contains three literal "<LINK" occurrences.
  def test_javascripts
    first_script = (@immob/:script)[0]
    assert_equal(3, first_script.inner_html.scan(/<LINK/).length)
  end

  def test_uswebgen
    # sent by brent beardsley, hpricot 0.3 had problems with all the links.
    assert_equal(67, (@uswebgen/:a).length)
  end

  # Placeholder with no assertions; the UTF8 fixture load in setup is
  # commented out as well.
  def test_unicode
  end
end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: 1
4
+ name: hpricot
5
+ version: !ruby/object:Gem::Version
6
+ version: "0.4"
7
+ date: 2006-08-11 00:00:00 -06:00
8
+ summary: a swift, liberal HTML parser with a fantastic library
9
+ require_paths:
10
+ - lib
11
+ email: why@ruby-lang.org
12
+ homepage: http://code.whytheluckystiff.net/hpricot/
13
+ rubyforge_project:
14
+ description: a swift, liberal HTML parser with a fantastic library
15
+ autorequire:
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: false
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - why the lucky stiff
31
+ files:
32
+ - COPYING
33
+ - README
34
+ - Rakefile
35
+ - test/files
36
+ - test/load_files.rb
37
+ - test/test_parser.rb
38
+ - test/files/uswebgen.html
39
+ - test/files/boingboing.html
40
+ - test/files/immob.html
41
+ - test/files/basic.xhtml
42
+ - lib/hpricot
43
+ - lib/hpricot_scan.so
44
+ - lib/hpricot.rb
45
+ - lib/hpricot/htmlinfo.rb
46
+ - lib/hpricot/text.rb
47
+ - lib/hpricot/inspect.rb
48
+ - lib/hpricot/modules.rb
49
+ - lib/hpricot/parse.rb
50
+ - lib/hpricot/tag.rb
51
+ - lib/hpricot/traverse.rb
52
+ - lib/hpricot/elements.rb
53
+ - extras/mingw-rbconfig.rb
54
+ - ext/hpricot_scan/hpricot_scan.h
55
+ - ext/hpricot_scan/hpricot_scan.c
56
+ - ext/hpricot_scan/extconf.rb
57
+ - ext/hpricot_scan/hpricot_scan.rl
58
+ - CHANGELOG
59
+ test_files: []
60
+
61
+ rdoc_options: []
62
+
63
+ extra_rdoc_files:
64
+ - README
65
+ - CHANGELOG
66
+ - COPYING
67
+ executables: []
68
+
69
+ extensions:
70
+ - ext/hpricot_scan/extconf.rb
71
+ requirements: []
72
+
73
+ dependencies: []
74
+