hpricot 0.4-mswin32
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +34 -0
- data/COPYING +18 -0
- data/README +6 -0
- data/Rakefile +166 -0
- data/ext/hpricot_scan/extconf.rb +6 -0
- data/ext/hpricot_scan/hpricot_scan.c +5964 -0
- data/ext/hpricot_scan/hpricot_scan.h +79 -0
- data/ext/hpricot_scan/hpricot_scan.rl +300 -0
- data/extras/mingw-rbconfig.rb +176 -0
- data/lib/hpricot.rb +6 -0
- data/lib/hpricot/elements.rb +292 -0
- data/lib/hpricot/htmlinfo.rb +672 -0
- data/lib/hpricot/inspect.rb +90 -0
- data/lib/hpricot/modules.rb +37 -0
- data/lib/hpricot/parse.rb +286 -0
- data/lib/hpricot/tag.rb +146 -0
- data/lib/hpricot/text.rb +115 -0
- data/lib/hpricot/traverse.rb +511 -0
- data/lib/hpricot_scan.so +0 -0
- data/test/files/basic.xhtml +17 -0
- data/test/files/boingboing.html +2266 -0
- data/test/files/immob.html +400 -0
- data/test/files/uswebgen.html +220 -0
- data/test/load_files.rb +7 -0
- data/test/test_parser.rb +141 -0
- metadata +72 -0
data/test/load_files.rb
ADDED
data/test/test_parser.rb
ADDED
@@ -0,0 +1,141 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'hpricot'
|
5
|
+
require 'load_files'
|
6
|
+
|
7
|
+
# Regression tests for Hpricot's parser and its CSS/XPath search API.
#
# Every test works on documents parsed from the fixture strings exposed as
# constants on TestFiles (presumably defined by the required 'load_files'
# script -- confirm there). The expected counts below are tied to the exact
# content of those fixtures.
class TestParser < Test::Unit::TestCase
  # Parses each fixture into its own instance variable before every test.
  def setup
    @basic = Hpricot.parse(TestFiles::BASIC)
    @boingboing = Hpricot.parse(TestFiles::BOINGBOING)
    @immob = Hpricot.parse(TestFiles::IMMOB)
    @uswebgen = Hpricot.parse(TestFiles::USWEBGEN)
    # @utf8 = Hpricot.parse(TestFiles::UTF8)
  end

  # Disabled in the original source; kept verbatim for reference.
  # def test_set_attr
  #   @basic.search('//p').set('class', 'para')
  #   assert_equal '', @basic.search('//p').map { |x| x.attributes }
  # end

  # A bare string handed to Hpricot.make yields a first node whose content
  # is that string.
  def test_scan_text
    assert_equal 'FOO', Hpricot.make("FOO").first.content
  end

  # get_element_by_id works on the document and on an element found by id.
  def test_get_element_by_id
    assert_equal 'link1', @basic.get_element_by_id('link1')['id']
    assert_equal 'link1', @basic.get_element_by_id('body1').get_element_by_id('link1').get_attribute('id')
  end

  # get_elements_by_tag_name returns an indexable collection whose members
  # can be searched further.
  def test_get_element_by_tag_name
    assert_equal 'link1', @basic.get_elements_by_tag_name('a')[0].get_attribute('id')
    assert_equal 'link1', @basic.get_elements_by_tag_name('body')[0].get_element_by_id('link1').get_attribute('id')
  end

  # Round trip: serialise the basic document with inner_html, parse that
  # output again, and require the same selector results as the original.
  def test_output_basic
    reparsed = Hpricot.parse(@basic.inner_html)
    scan_basic(reparsed)
  end

  # Runs the shared selector checks against the freshly parsed fixture.
  def test_scan_basic
    scan_basic(@basic)
  end

  # Shared selector assertions for the basic fixture. Not picked up as a
  # test on its own (no test_ prefix); called by test_scan_basic and
  # test_output_basic.
  #
  # doc - a parsed document built from the basic fixture.
  def scan_basic(doc)
    assert_equal 'link1', doc.at('#link1')['id']
    assert_equal 'link1', doc.at("p a")['id']
    assert_equal 'link1', (doc/:p/:a).first['id']
    assert_equal 'link1', doc.search('p').at('a').get_attribute('id')
    assert_equal 'link2', (doc/'p').filter('.ohmy').search('a').first.get_attribute('id')
    # Explicit call parentheses: `assert_equal (..)[2], ...` makes Ruby warn
    # that the parenthesised expression is being read as a grouped argument.
    assert_equal((doc/'p')[2], (doc/'p').filter(':nth(2)')[0])
    assert_equal 4, (doc/'p').filter('*').length
    assert_equal 4, (doc/'p').filter('* *').length
    eles = (doc/'p').filter('.ohmy')
    assert_equal 1, eles.length
    assert_equal 'ohmy', eles.first.get_attribute('class')
    assert_equal 3, (doc/'p:not(.ohmy)').length
    assert_equal 3, (doc/'p').not('.ohmy').length
    assert_equal 3, (doc/'p').not(eles.first).length
    assert_equal 2, (doc/'p').filter('[@class]').length
    assert_equal 'last final', (doc/'p[@class~="final"]').first.get_attribute('class')
    assert_equal 1, (doc/'p').filter('[@class~="final"]').length
    assert_equal 2, (doc/'p > a').length
    assert_equal 1, (doc/'p.ohmy > a').length
    assert_equal 2, (doc/'p / a').length
    assert_equal 2, (doc/'link ~ link').length
    assert_equal 3, (doc/'title ~ link').length
  end

  # Class selector and attribute-predicate search on the boingboing fixture.
  def test_scan_boingboing
    assert_equal 60, (@boingboing/'p.posted').length
    assert_equal 1, @boingboing.search("//a[@name='027906']").length
  end

  # :not() with a class selector on the basic fixture.
  def test_css_negation
    assert_equal 3, (@basic/'p:not(.final)').length
  end

  # After remove_attribute('class') on every <p>, no <p> matches [@class].
  def test_remove_attribute
    (@basic/:p).each { |ele| ele.remove_attribute('class') }
    assert_equal 0, (@basic/'p[@class]').length
  end

  # Absolute and descendant XPath expressions, parent steps (..), and
  # chained searches on a result set.
  def test_abs_xpath
    assert_equal 60, @boingboing.search("/html/body//p[@class='posted']").length
    assert_equal 60, @boingboing.search("/*/body//p[@class='posted']").length
    assert_equal 18, @boingboing.search("//script").length
    divs = @boingboing.search("//script/../div")
    assert_equal 2, divs.length
    assert_equal 1, divs.search('a').length
    imgs = @boingboing.search('//div/p/a/img')
    assert_equal 15, imgs.length
    assert_equal 17, @boingboing.search('//div').search('p/a/img').length
    assert imgs.all? { |x| x.name == 'img' }
  end

  # Bracketed predicates: attribute equality, nested path predicates (with
  # and without a leading slash) and bare attribute presence.
  def test_predicates
    assert_equal 2, @boingboing.search('//link[@rel="alternate"]').length
    p_imgs = @boingboing.search('//div/p[/a/img]')
    assert_equal 15, p_imgs.length
    assert p_imgs.all? { |x| x.name == 'p' }
    p_imgs = @boingboing.search('//div/p[a/img]')
    assert_equal 18, p_imgs.length
    assert p_imgs.all? { |x| x.name == 'p' }
    assert_equal 1, @boingboing.search('//input[@checked]').length
  end

  # Colon-style positional filters: :last, :eq(n) and :last-of-type.
  def test_alt_predicates
    assert_equal 2, @boingboing.search('//table/tr:last').length
    assert_equal "<p>The third paragraph</p>",
                 @basic.search('p:eq(2)').to_html
    assert_equal '<p class="last final"><b>THE FINAL PARAGRAPH</b></p>',
                 @basic.search('p:last').to_html
    assert_equal 'last final', @basic.search('//p:last-of-type').first.get_attribute('class')
  end

  # Several expressions in one search, joined with "," or "|".
  def test_many_paths
    assert_equal 62, @boingboing.search('p.posted, link[@rel="alternate"]').length
    assert_equal 20, @boingboing.search('//div/p[a/img]|//link[@rel="alternate"]').length
  end

  # Checks every attribute read from the immob fixture's <body> tag.
  # NOTE(review): judging by the test name, that tag's attributes span
  # several lines in the fixture -- confirm against test/files/immob.html.
  def test_body_newlines
    body = @immob.at(:body)
    expected = {
      'background' => '', 'bgcolor' => '#ffffff', 'text' => '#000000', 'marginheight' => '10',
      'marginwidth' => '10', 'leftmargin' => '10', 'topmargin' => '10', 'link' => '#000066',
      'alink' => '#ff6600', 'hlink' => "#ff6600", 'vlink' => "#000000"
    }
    expected.each do |k, v|
      # Name the attribute in the failure message so a mismatch inside the
      # loop can be traced to a key.
      assert_equal v, body[k], "body attribute #{k.inspect}"
    end
  end

  # The first <script> element's inner_html must contain "<LINK" three times.
  def test_javascripts
    assert_equal 3, (@immob/:script)[0].inner_html.scan(/<LINK/).length
  end

  def test_uswebgen
    # sent by brent beardsley, hpricot 0.3 had problems with all the links.
    assert_equal 67, (@uswebgen/:a).length
  end

  # TODO: placeholder with no assertions; the matching @utf8 fixture is
  # still commented out in setup.
  def test_unicode
  end
end
|
metadata
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: 1
|
4
|
+
name: hpricot
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: "0.4"
|
7
|
+
date: 2006-08-11 00:00:00 -06:00
|
8
|
+
summary: a swift, liberal HTML parser with a fantastic library
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: why@ruby-lang.org
|
12
|
+
homepage: http://code.whytheluckystiff.net/hpricot/
|
13
|
+
rubyforge_project:
|
14
|
+
description: a swift, liberal HTML parser with a fantastic library
|
15
|
+
autorequire:
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: false
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: mswin32
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- why the lucky stiff
|
31
|
+
files:
|
32
|
+
- test/load_files.rb
|
33
|
+
- test/test_parser.rb
|
34
|
+
- test/files/uswebgen.html
|
35
|
+
- test/files/boingboing.html
|
36
|
+
- test/files/immob.html
|
37
|
+
- test/files/basic.xhtml
|
38
|
+
- lib/hpricot.rb
|
39
|
+
- lib/hpricot/htmlinfo.rb
|
40
|
+
- lib/hpricot/text.rb
|
41
|
+
- lib/hpricot/inspect.rb
|
42
|
+
- lib/hpricot/modules.rb
|
43
|
+
- lib/hpricot/parse.rb
|
44
|
+
- lib/hpricot/tag.rb
|
45
|
+
- lib/hpricot/traverse.rb
|
46
|
+
- lib/hpricot/elements.rb
|
47
|
+
- ext/hpricot_scan/hpricot_scan.c
|
48
|
+
- ext/hpricot_scan/extconf.rb
|
49
|
+
- ext/hpricot_scan/hpricot_scan.h
|
50
|
+
- ext/hpricot_scan/hpricot_scan.rl
|
51
|
+
- CHANGELOG
|
52
|
+
- README
|
53
|
+
- Rakefile
|
54
|
+
- COPYING
|
55
|
+
- extras/mingw-rbconfig.rb
|
56
|
+
- lib/hpricot_scan.so
|
57
|
+
test_files: []
|
58
|
+
|
59
|
+
rdoc_options: []
|
60
|
+
|
61
|
+
extra_rdoc_files:
|
62
|
+
- README
|
63
|
+
- CHANGELOG
|
64
|
+
- COPYING
|
65
|
+
executables: []
|
66
|
+
|
67
|
+
extensions: []
|
68
|
+
|
69
|
+
requirements: []
|
70
|
+
|
71
|
+
dependencies: []
|
72
|
+
|