hpricot 0.4
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +34 -0
- data/COPYING +18 -0
- data/README +6 -0
- data/Rakefile +166 -0
- data/ext/hpricot_scan/extconf.rb +6 -0
- data/ext/hpricot_scan/hpricot_scan.c +5964 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.rl +300 -0
- data/extras/mingw-rbconfig.rb +176 -0
- data/lib/hpricot.rb +6 -0
- data/lib/hpricot/elements.rb +292 -0
- data/lib/hpricot/htmlinfo.rb +672 -0
- data/lib/hpricot/inspect.rb +90 -0
- data/lib/hpricot/modules.rb +37 -0
- data/lib/hpricot/parse.rb +286 -0
- data/lib/hpricot/tag.rb +146 -0
- data/lib/hpricot/text.rb +115 -0
- data/lib/hpricot/traverse.rb +511 -0
- data/lib/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/immob.html +400 -0
- data/test/files/uswebgen.html +220 -0
- data/test/load_files.rb +7 -0
- data/test/test_parser.rb +141 -0
- metadata +74 -0
data/test/load_files.rb
ADDED
data/test/test_parser.rb
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'hpricot'
|
5
|
+
require 'load_files'
|
6
|
+
|
7
|
+
# Exercises Hpricot parsing and its CSS/XPath-style search API against the
# fixture documents referenced through the TestFiles constants.
class TestParser < Test::Unit::TestCase
  # Test::Unit runs this before every test method, so each test starts from
  # freshly parsed documents (test_remove_attribute mutates @basic).
  def setup
    @basic = Hpricot.parse(TestFiles::BASIC)
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    @immob = Hpricot.parse(TestFiles::IMMOB)
    @uswebgen = Hpricot.parse(TestFiles::USWEBGEN)
    # @utf8 = Hpricot.parse(TestFiles::UTF8)
  end

  # def test_set_attr
  #   @basic.search('//p').set('class', 'para')
  #   assert_equal '', @basic.search('//p').map { |x| x.attributes }
  # end

  # A bare string handed to Hpricot.make yields a first node whose content is
  # that string.
  def test_scan_text
    assert_equal 'FOO', Hpricot.make("FOO").first.content
  end

  # Lookup by id works both from the document and from a nested element.
  def test_get_element_by_id
    assert_equal 'link1', @basic.get_element_by_id('link1')['id']
    assert_equal 'link1', @basic.get_element_by_id('body1').get_element_by_id('link1').get_attribute('id')
  end

  # Lookup by tag name returns an indexable collection of elements.
  def test_get_element_by_tag_name
    assert_equal 'link1', @basic.get_elements_by_tag_name('a')[0].get_attribute('id')
    assert_equal 'link1', @basic.get_elements_by_tag_name('body')[0].get_element_by_id('link1').get_attribute('id')
  end

  # Serialises the basic document with inner_html, parses the result again and
  # requires the re-parsed tree to satisfy the same assertions as the original.
  def test_output_basic
    reparsed = Hpricot.parse(@basic.inner_html)
    scan_basic(reparsed)
  end

  # Runs the shared selector assertions against the directly parsed fixture.
  def test_scan_basic
    scan_basic(@basic)
  end

  # Shared selector assertions for the basic fixture. Not picked up as a test
  # itself because its name does not start with "test".
  #
  # doc - a parsed Hpricot document built from the basic fixture.
  def scan_basic(doc)
    assert_equal 'link1', doc.at('#link1')['id']
    assert_equal 'link1', doc.at("p a")['id']
    assert_equal 'link1', (doc/:p/:a).first['id']
    assert_equal 'link1', doc.search('p').at('a').get_attribute('id')
    assert_equal 'link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id')
    # Explicit call parentheses: `assert_equal (expr)[2], ...` makes Ruby warn
    # about an ambiguous parenthesised first argument.
    assert_equal((doc/'p')[2], (doc/'p').filter(':nth(2)')[0])
    assert_equal 4, (doc/'p').filter('*').length
    assert_equal 4, (doc/'p').filter('* *').length
    eles = (doc/'p').filter('.ohmy')
    assert_equal 1, eles.length
    assert_equal 'ohmy', eles.first.get_attribute('class')
    assert_equal 3, (doc/'p:not(.ohmy)').length
    assert_equal 3, (doc/'p').not('.ohmy').length
    assert_equal 3, (doc/'p').not(eles.first).length
    assert_equal 2, (doc/'p').filter('[@class]').length
    assert_equal 'last final', (doc/'p[@class~="final"]').first.get_attribute('class')
    assert_equal 1, (doc/'p').filter('[@class~="final"]').length
    assert_equal 2, (doc/'p > a').length
    assert_equal 1, (doc/'p.ohmy > a').length
    assert_equal 2, (doc/'p / a').length
    assert_equal 2, (doc/'link ~ link').length
    assert_equal 3, (doc/'title ~ link').length
  end

  # Class selector and attribute-predicate searches on the boingboing fixture.
  def test_scan_boingboing
    assert_equal 60, (@boingboing/'p.posted').length
    assert_equal 1, @boingboing.search("//a[@name='027906']").length
  end

  # :not() inside a selector string.
  def test_css_negation
    assert_equal 3, (@basic/'p:not(.final)').length
  end

  # After removing the class attribute from every <p>, none should match
  # p[@class] any more.
  def test_remove_attribute
    (@basic/:p).each { |ele| ele.remove_attribute('class') }
    assert_equal 0, (@basic/'p[@class]').length
  end

  # Absolute and relative XPath-style expressions, including wildcard and
  # parent ("..") steps.
  def test_abs_xpath
    assert_equal 60, @boingboing.search("/html/body//p[@class='posted']").length
    assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
    assert_equal 18, @boingboing.search("//script").length
    divs = @boingboing.search("//script/../div")
    assert_equal 2, divs.length
    assert_equal 1, divs.search('a').length
    imgs = @boingboing.search('//div/p/a/img')
    assert_equal 15, imgs.length
    assert_equal 17, @boingboing.search('//div').search('p/a/img').length
    assert imgs.all? { |x| x.name == 'img' }
  end

  # Bracketed predicates: attribute equality, attribute presence and nested
  # path predicates (the matched element is the <p>, not the <img>).
  def test_predicates
    assert_equal 2, @boingboing.search('//link[@rel="alternate"]').length
    p_imgs = @boingboing.search('//div/p[/a/img]')
    assert_equal 15, p_imgs.length
    assert p_imgs.all? { |x| x.name == 'p' }
    p_imgs = @boingboing.search('//div/p[a/img]')
    assert_equal 18, p_imgs.length
    assert p_imgs.all? { |x| x.name == 'p' }
    assert_equal 1, @boingboing.search('//input[@checked]').length
  end

  # Colon-style positional filters (:last, :eq(n), :last-of-type).
  def test_alt_predicates
    assert_equal 2, @boingboing.search('//table/tr:last').length
    assert_equal "<p>The third paragraph</p>",
                 @basic.search('p:eq(2)').to_html
    assert_equal '<p class="last final"><b>THE FINAL PARAGRAPH</b></p>',
                 @basic.search('p:last').to_html
    assert_equal 'last final', @basic.search('//p:last-of-type').first.get_attribute('class')
  end

  # Several paths in one search, separated by "," or "|".
  def test_many_paths
    assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
    assert_equal 20, @boingboing.search('//div/p[a/img]|//link[@rel="alternate"]').length
  end

  # Every expected attribute of the immob fixture's <body> tag must be read
  # back with the listed value.
  def test_body_newlines
    body = @immob.at(:body)
    expected = {
      'background' => '', 'bgcolor' => '#ffffff', 'text' => '#000000', 'marginheight' => '10',
      'marginwidth' => '10', 'leftmargin' => '10', 'topmargin' => '10', 'link' => '#000066',
      'alink' => '#ff6600', 'hlink' => "#ff6600", 'vlink' => "#000000"
    }
    expected.each do |k, v|
      assert_equal v, body[k]
    end
  end

  # The first <script> element's inner_html must contain "<LINK" three times.
  def test_javascripts
    assert_equal 3, (@immob/:script)[0].inner_html.scan(/<LINK/).length
  end

  def test_uswebgen
    # sent by brent beardsley, hpricot 0.3 had problems with all the links.
    assert_equal 67, (@uswebgen/:a).length
  end

  # Placeholder with no assertions yet; the UTF-8 fixture load in setup is
  # commented out.
  def test_unicode
  end
end
|
metadata
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: hpricot
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: "0.4"
|
7
|
+
date: 2006-08-11 00:00:00 -06:00
|
8
|
+
summary: a swift, liberal HTML parser with a fantastic library
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: why@ruby-lang.org
|
12
|
+
homepage: http://code.whytheluckystiff.net/hpricot/
|
13
|
+
rubyforge_project:
|
14
|
+
description: a swift, liberal HTML parser with a fantastic library
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- why the lucky stiff
|
31
|
+
files:
|
32
|
+
- COPYING
|
33
|
+
- README
|
34
|
+
- Rakefile
|
35
|
+
- test/files
|
36
|
+
- test/load_files.rb
|
37
|
+
- test/test_parser.rb
|
38
|
+
- test/files/uswebgen.html
|
39
|
+
- test/files/boingboing.html
|
40
|
+
- test/files/immob.html
|
41
|
+
- test/files/basic.xhtml
|
42
|
+
- lib/hpricot
|
43
|
+
- lib/hpricot_scan.so
|
44
|
+
- lib/hpricot.rb
|
45
|
+
- lib/hpricot/htmlinfo.rb
|
46
|
+
- lib/hpricot/text.rb
|
47
|
+
- lib/hpricot/inspect.rb
|
48
|
+
- lib/hpricot/modules.rb
|
49
|
+
- lib/hpricot/parse.rb
|
50
|
+
- lib/hpricot/tag.rb
|
51
|
+
- lib/hpricot/traverse.rb
|
52
|
+
- lib/hpricot/elements.rb
|
53
|
+
- extras/mingw-rbconfig.rb
|
54
|
+
- ext/hpricot_scan/hpricot_scan.h
|
55
|
+
- ext/hpricot_scan/hpricot_scan.c
|
56
|
+
- ext/hpricot_scan/extconf.rb
|
57
|
+
- ext/hpricot_scan/hpricot_scan.rl
|
58
|
+
- CHANGELOG
|
59
|
+
test_files: []
|
60
|
+
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
extra_rdoc_files:
|
64
|
+
- README
|
65
|
+
- CHANGELOG
|
66
|
+
- COPYING
|
67
|
+
executables: []
|
68
|
+
|
69
|
+
extensions:
|
70
|
+
- ext/hpricot_scan/extconf.rb
|
71
|
+
requirements: []
|
72
|
+
|
73
|
+
dependencies: []
|
74
|
+
|