scrapi 1.1.2 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +8 -0
- data/Rakefile +3 -4
- data/lib/scraper/base.rb +28 -8
- data/lib/scraper/microformats.rb +93 -0
- data/test/scraper_test.rb +178 -172
- metadata +6 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
Version 1.2.0 (August 27, 2006)
|
2
|
+
|
3
|
+
* Added: collect() method called just before result().
|
4
|
+
* Changed: Elements are not skipped when processed unless :skip=>true.
|
5
|
+
! Pay attention to this one, it could affect some scrapers.
|
6
|
+
* Fixed: Declaring an array immediately creates an accessor for it.
|
7
|
+
* Added: Scraper::Microformat for scraping hAtom and basic hCard.
|
8
|
+
|
1
9
|
Version 1.1.2 (August 13, 2006)
|
2
10
|
|
3
11
|
* Changed: Allows multiple :not pseudo classes to be used with the same
|
data/Rakefile
CHANGED
@@ -41,14 +41,13 @@ gem_spec = Gem::Specification.new do |spec|
|
|
41
41
|
|
42
42
|
spec.name = "scrapi"
|
43
43
|
spec.version = version
|
44
|
-
spec.summary = "scrAPI toolkit for Ruby"
|
44
|
+
spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
|
45
45
|
spec.description = <<-EOF
|
46
|
-
|
47
|
-
select => extract => store processing rules.
|
46
|
+
scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
48
47
|
EOF
|
49
48
|
spec.author = "Assaf Arkin"
|
50
49
|
spec.email = "assaf.arkin@gmail.com"
|
51
|
-
spec.homepage = "http://labnotes.org/"
|
50
|
+
spec.homepage = "http://blog.labnotes.org/category/scrapi/"
|
52
51
|
|
53
52
|
spec.files = FileList["{test,lib}/**/*", "README", "CHANGELOG", "Rakefile", "MIT-LICENSE"].to_a
|
54
53
|
spec.require_path = "lib"
|
data/lib/scraper/base.rb
CHANGED
@@ -472,11 +472,19 @@ module Scraper
|
|
472
472
|
# process "a[href]", "urls[]"=>"@href"
|
473
473
|
def array(*symbols)
|
474
474
|
@arrays ||= []
|
475
|
-
symbols.each
|
475
|
+
symbols.each do |symbol|
|
476
|
+
symbol = symbol.to_sym
|
477
|
+
@arrays << symbol
|
478
|
+
begin
|
479
|
+
self.instance_method(symbol)
|
480
|
+
rescue NameError
|
481
|
+
attr_accessor symbol
|
482
|
+
end
|
483
|
+
end
|
476
484
|
end
|
477
485
|
|
478
486
|
|
479
|
-
|
487
|
+
private
|
480
488
|
|
481
489
|
|
482
490
|
# Called by #process and #process_first, see there for
|
@@ -655,6 +663,7 @@ module Scraper
|
|
655
663
|
begin
|
656
664
|
self.instance_method(target)
|
657
665
|
rescue NameError
|
666
|
+
instance = "@#{target}".to_sym
|
658
667
|
attr_accessor target
|
659
668
|
end
|
660
669
|
reader = "#{target}=".to_sym
|
@@ -740,8 +749,9 @@ module Scraper
|
|
740
749
|
prepare document
|
741
750
|
# Retrieve the document. This may raise HTTPError or HTMLParseError.
|
742
751
|
case document
|
743
|
-
when Array
|
744
|
-
|
752
|
+
when Array
|
753
|
+
stack = @document.reverse # see below
|
754
|
+
when HTML::Node
|
745
755
|
# If a root element is specified, start selecting from there.
|
746
756
|
# The stack is empty if we can't find any root element (makes
|
747
757
|
# sense). However, the node we're going to process may be
|
@@ -750,7 +760,8 @@ module Scraper
|
|
750
760
|
root_element = option(:root_element)
|
751
761
|
root = root_element ? @document.find(:tag=>root_element) : @document
|
752
762
|
stack = root ? (root.tag? ? [root] : root.children.reverse) : []
|
753
|
-
else
|
763
|
+
else
|
764
|
+
return
|
754
765
|
end
|
755
766
|
# @skip stores all the elements we want to skip (see #skip).
|
756
767
|
# rules stores all the rules we want to process with this
|
@@ -801,9 +812,10 @@ module Scraper
|
|
801
812
|
# If it returns true, skip the element and if
|
802
813
|
# the current element, don't process any more
|
803
814
|
# rules. Again, pay attention to descendants.
|
804
|
-
|
805
|
-
if (skip || @skip.delete(true)) && @skip.delete(false).nil?
|
815
|
+
if extractor.bind(self).call(element)
|
806
816
|
@extracted = true
|
817
|
+
end
|
818
|
+
if @skip.delete(true)
|
807
819
|
if element.equal?(node)
|
808
820
|
skip_this = true
|
809
821
|
else
|
@@ -824,6 +836,7 @@ module Scraper
|
|
824
836
|
ensure
|
825
837
|
@skip = nil
|
826
838
|
end
|
839
|
+
collect
|
827
840
|
return result
|
828
841
|
end
|
829
842
|
|
@@ -895,7 +908,7 @@ module Scraper
|
|
895
908
|
case elements
|
896
909
|
when Array: @skip.concat elements
|
897
910
|
when HTML::Node: @skip << elements
|
898
|
-
when nil: @skip <<
|
911
|
+
when nil: @skip << true
|
899
912
|
when true, false: @skip << elements
|
900
913
|
end
|
901
914
|
# Calling skip(element) as the last statement is
|
@@ -920,6 +933,13 @@ module Scraper
|
|
920
933
|
end
|
921
934
|
|
922
935
|
|
936
|
+
# Called by #scrape after scraping the document, and before calling #result.
|
937
|
+
# Typically used to run any validation, post-processing steps,
|
938
|
+
# resolving referenced elements, etc.
|
939
|
+
def collect()
|
940
|
+
end
|
941
|
+
|
942
|
+
|
923
943
|
# Returns the result of a successful scrape.
|
924
944
|
#
|
925
945
|
# This method is called by #scrape after running all the rules on the
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require "time"
|
2
|
+
|
3
|
+
|
4
|
+
module Scraper
|
5
|
+
|
6
|
+
module Microformats
|
7
|
+
|
8
|
+
class HCard < Scraper::Base
|
9
|
+
|
10
|
+
process ".fn", :fn=>:text
|
11
|
+
process ".given-name", :given_name=>:text
|
12
|
+
process ".family-name", :family_name=>:text
|
13
|
+
process "img.photo", :photo=>"@src"
|
14
|
+
process "a.url", :url=>"@href"
|
15
|
+
|
16
|
+
result :fn, :given_name, :family_name, :photo, :url
|
17
|
+
|
18
|
+
def collect()
|
19
|
+
unless fn
|
20
|
+
if self.fn = given_name
|
21
|
+
self.given_name << " #{family_name}" if family_name
|
22
|
+
else
|
23
|
+
self.fn = family_name
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
class HAtom < Scraper::Base
|
32
|
+
|
33
|
+
class Entry < Scraper::Base
|
34
|
+
|
35
|
+
array :content, :tags
|
36
|
+
|
37
|
+
process ".entry-title", :title=>:text
|
38
|
+
process ".entry-content", :content=>:element
|
39
|
+
process ".entry-summary", :summary=>:element
|
40
|
+
process "a[rel~=bookmark]", :permalink=>["@href"]
|
41
|
+
process ".author.vcard, .author .vcard", :author=>HCard
|
42
|
+
process ".published", :published=>["abbr@title", :text]
|
43
|
+
process ".updated", :updated=>["abbr@title", :text]
|
44
|
+
process "a[rel~=tag]", :tags=>:text
|
45
|
+
|
46
|
+
def collect()
|
47
|
+
self.published = Time.parse(published)
|
48
|
+
self.updated = updated ? Time.parse(updated) : published
|
49
|
+
end
|
50
|
+
|
51
|
+
result :title, :content, :summary, :permalink, :author, :published, :updated, :tags
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
class Feed < Scraper::Base
|
56
|
+
|
57
|
+
array :entries
|
58
|
+
|
59
|
+
process ".hentry", :entries=>Entry
|
60
|
+
|
61
|
+
def result()
|
62
|
+
entries
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
66
|
+
|
67
|
+
array :feeds, :entries
|
68
|
+
|
69
|
+
# Skip feeds, so we don't process them twice.
|
70
|
+
process ".hfeed", :skip=>true, :feeds=>Feed
|
71
|
+
# And so we can collect unwrapped entries into a separate feed.
|
72
|
+
process ".hentry", :skip=>true, :entries=>Entry
|
73
|
+
# And collect the first remaining hcard as the default author.
|
74
|
+
process ".vcard", :hcard=>HCard
|
75
|
+
|
76
|
+
def collect()
|
77
|
+
@feeds ||= []
|
78
|
+
@feeds << entries if entries
|
79
|
+
for feed in feeds
|
80
|
+
for entry in feed
|
81
|
+
entry.author = hcard unless entry.author
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
result :feeds
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
data/test/scraper_test.rb
CHANGED
@@ -14,6 +14,20 @@ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
|
|
14
14
|
|
15
15
|
class ScraperTest < Test::Unit::TestCase
|
16
16
|
|
17
|
+
DIVS123 = <<-EOF
|
18
|
+
<div id="1"></div>
|
19
|
+
<div id="2"></div>
|
20
|
+
<div id="3"></div>
|
21
|
+
EOF
|
22
|
+
|
23
|
+
DIVS1_23 = <<-EOF
|
24
|
+
<div id="1">
|
25
|
+
<div id="2"></div>
|
26
|
+
<div id="3"></div>
|
27
|
+
</div>
|
28
|
+
EOF
|
29
|
+
|
30
|
+
|
17
31
|
def setup
|
18
32
|
Net::HTTP.reset_on_get
|
19
33
|
end
|
@@ -28,8 +42,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
28
42
|
#
|
29
43
|
|
30
44
|
def test_define_selectors
|
31
|
-
|
32
|
-
scraper = new_scraper(html) do
|
45
|
+
scraper = new_scraper(DIVS123) do
|
33
46
|
selector :test, "div"
|
34
47
|
end
|
35
48
|
assert_equal 3, scraper.test(scraper.document).size
|
@@ -40,8 +53,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
40
53
|
|
41
54
|
|
42
55
|
def test_selector_blocks
|
43
|
-
|
44
|
-
scraper = new_scraper(html) do
|
56
|
+
scraper = new_scraper(DIVS123) do
|
45
57
|
selector :test, "div" do |elements|
|
46
58
|
return elements[0..-2]
|
47
59
|
elements[0..-2]
|
@@ -52,18 +64,16 @@ class ScraperTest < Test::Unit::TestCase
|
|
52
64
|
|
53
65
|
|
54
66
|
def test_array_selectors
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
|
67
|
+
scraper = new_scraper(DIVS123) do
|
68
|
+
selector :test, "#?", "2"
|
69
|
+
end
|
70
|
+
assert_equal 1, scraper.test(scraper.document).size
|
71
|
+
assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
|
61
72
|
end
|
62
73
|
|
63
74
|
|
64
75
|
def test_object_selectors
|
65
|
-
|
66
|
-
scraper = new_scraper(html) do
|
76
|
+
scraper = new_scraper(DIVS123) do
|
67
77
|
selector :test, HTML::Selector.new("div")
|
68
78
|
end
|
69
79
|
assert_equal 3, scraper.test(scraper.document).size
|
@@ -71,8 +81,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
71
81
|
|
72
82
|
|
73
83
|
def test_selector_returns_array
|
74
|
-
|
75
|
-
scraper = new_scraper(html) do
|
84
|
+
scraper = new_scraper(DIVS123) do
|
76
85
|
selector :test0, "#4"
|
77
86
|
selector :test1, "#1"
|
78
87
|
selector :test3, "div"
|
@@ -84,26 +93,24 @@ class ScraperTest < Test::Unit::TestCase
|
|
84
93
|
|
85
94
|
|
86
95
|
def test_select_in_document_order
|
87
|
-
|
88
|
-
scraper = new_scraper(html) do
|
96
|
+
scraper = new_scraper(DIVS123) do
|
89
97
|
selector :test, "#2,#1"
|
90
98
|
end
|
91
|
-
assert_equal 2,
|
99
|
+
assert_equal 2, scraper.test(scraper.document).size
|
92
100
|
assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
|
93
101
|
assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
|
94
102
|
end
|
95
103
|
|
96
104
|
|
97
105
|
def test_selecting_first_element
|
98
|
-
|
99
|
-
scraper = new_scraper(html) do
|
106
|
+
scraper = new_scraper(DIVS123) do
|
100
107
|
selector :test, "div"
|
101
108
|
end
|
102
|
-
assert_equal 3,
|
109
|
+
assert_equal 3, scraper.test(scraper.document).size
|
103
110
|
assert scraper.first_test(scraper.document)
|
104
111
|
assert_equal "1", scraper.first_test(scraper.document).attributes["id"]
|
105
112
|
|
106
|
-
scraper = new_scraper(
|
113
|
+
scraper = new_scraper(DIVS123) do
|
107
114
|
selector :test, "div" do |element|
|
108
115
|
element[0].attributes["id"]
|
109
116
|
end
|
@@ -118,8 +125,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
118
125
|
#
|
119
126
|
|
120
127
|
def test_processing_rule
|
121
|
-
|
122
|
-
scraper = new_scraper(html) do
|
128
|
+
scraper = new_scraper(DIVS123) do
|
123
129
|
process "div" do |element|
|
124
130
|
@count = (@count || 0) + 1
|
125
131
|
end
|
@@ -131,8 +137,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
131
137
|
|
132
138
|
|
133
139
|
def test_processing_rule_with_array
|
134
|
-
|
135
|
-
scraper = new_scraper(html) do
|
140
|
+
scraper = new_scraper(DIVS123) do
|
136
141
|
process "#?", "1" do |element|
|
137
142
|
@count = (@count || 0) + 1
|
138
143
|
end
|
@@ -144,8 +149,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
144
149
|
|
145
150
|
|
146
151
|
def test_processing_rule_with_selector
|
147
|
-
|
148
|
-
scraper = new_scraper(html) do
|
152
|
+
scraper = new_scraper(DIVS123) do
|
149
153
|
process HTML::Selector.new("div") do |element|
|
150
154
|
@count = (@count || 0) + 1
|
151
155
|
end
|
@@ -157,8 +161,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
157
161
|
|
158
162
|
|
159
163
|
def test_extracting_in_code
|
160
|
-
|
161
|
-
scraper = new_scraper(html) do
|
164
|
+
scraper = new_scraper(DIVS123) do
|
162
165
|
process "div" do |element|
|
163
166
|
@concat = (@concat || "") << element.attributes["id"]
|
164
167
|
end
|
@@ -170,8 +173,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
170
173
|
|
171
174
|
|
172
175
|
def test_processing_in_document_order
|
173
|
-
|
174
|
-
scraper = new_scraper(html) do
|
176
|
+
scraper = new_scraper(DIVS123) do
|
175
177
|
process "#2,#1" do |element|
|
176
178
|
@concat = (@concat || "") << element.attributes["id"]
|
177
179
|
end
|
@@ -182,68 +184,28 @@ class ScraperTest < Test::Unit::TestCase
|
|
182
184
|
end
|
183
185
|
|
184
186
|
|
185
|
-
def test_skip_if_extractor_returns_true
|
186
|
-
html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
|
187
|
-
scraper = new_scraper(html) do
|
188
|
-
process "#1" do |element|
|
189
|
-
@first = true
|
190
|
-
false
|
191
|
-
end
|
192
|
-
process "#1" do |element|
|
193
|
-
@second = true
|
194
|
-
end
|
195
|
-
attr :first
|
196
|
-
attr :second
|
197
|
-
end
|
198
|
-
scraper.scrape
|
199
|
-
assert_equal true, scraper.first
|
200
|
-
assert_equal true, scraper.second
|
201
|
-
scraper = new_scraper(html) do
|
202
|
-
process "#1" do |element|
|
203
|
-
@first = true
|
204
|
-
true
|
205
|
-
end
|
206
|
-
process "#1" do |element|
|
207
|
-
@second = true
|
208
|
-
end
|
209
|
-
attr :first
|
210
|
-
attr :second
|
211
|
-
end
|
212
|
-
scraper.scrape
|
213
|
-
assert_equal true, scraper.first
|
214
|
-
assert_equal nil, scraper.second
|
215
|
-
end
|
216
|
-
|
217
|
-
|
218
187
|
def test_process_once_if_skipped
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
@first = true
|
223
|
-
skip element
|
224
|
-
false
|
188
|
+
scraper = new_scraper(DIVS123) do
|
189
|
+
def prepare(document)
|
190
|
+
@found = []
|
225
191
|
end
|
226
|
-
process
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
192
|
+
process("#1") { |element| @found[0] = true }
|
193
|
+
process("#1") { |element| @found[1] = true ; skip element }
|
194
|
+
process("#1") { |element| @found[2] = true }
|
195
|
+
process("#2", :skip=>true){ |element| @found[3] = true }
|
196
|
+
process("#2") { |element| @found[4] = true }
|
197
|
+
attr_reader :found
|
231
198
|
end
|
232
199
|
scraper.scrape
|
233
|
-
assert_equal true, scraper.
|
234
|
-
assert_equal nil, scraper.second
|
200
|
+
assert_equal [true, true, nil, true], scraper.found
|
235
201
|
end
|
236
202
|
|
237
203
|
|
238
204
|
def test_skip_children
|
239
|
-
|
240
|
-
scraper = new_scraper(html) do
|
205
|
+
scraper = new_scraper(DIVS1_23) do
|
241
206
|
process "div" do |element|
|
242
207
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
243
|
-
|
244
|
-
skip to_skip
|
245
|
-
end
|
246
|
-
false
|
208
|
+
skip id2(element)
|
247
209
|
end
|
248
210
|
selector :id2, "#2"
|
249
211
|
attr :concat
|
@@ -254,68 +216,68 @@ class ScraperTest < Test::Unit::TestCase
|
|
254
216
|
|
255
217
|
|
256
218
|
def test_skip_descendants
|
257
|
-
|
258
|
-
scraper = new_scraper(
|
219
|
+
# Root, child of root, grandchild of root.
|
220
|
+
scraper = new_scraper(DIVS1_23) do
|
259
221
|
process "div" do |element|
|
260
222
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
261
|
-
false
|
262
223
|
end
|
263
224
|
attr :concat
|
264
225
|
end
|
265
226
|
scraper.scrape
|
266
|
-
# Root, child of root, grandchild of root.
|
267
227
|
assert_equal "123", scraper.concat
|
268
|
-
|
228
|
+
|
229
|
+
# Stop at root.
|
230
|
+
scraper = new_scraper(DIVS1_23) do
|
269
231
|
process "div" do |element|
|
270
232
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
271
|
-
|
233
|
+
skip
|
272
234
|
end
|
273
235
|
attr :concat
|
274
236
|
end
|
275
237
|
scraper.scrape
|
276
|
-
# Stop at root.
|
277
238
|
assert_equal "1", scraper.concat
|
278
239
|
|
279
|
-
scraper
|
240
|
+
scraper.scrape
|
241
|
+
# Child of root, and child of root's child
|
242
|
+
scraper = new_scraper(DIVS1_23) do
|
280
243
|
process "div>div" do |element|
|
281
244
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
282
|
-
false
|
283
245
|
end
|
284
246
|
attr :concat
|
285
247
|
end
|
286
248
|
scraper.scrape
|
287
|
-
# Child of root, and child of root's child
|
288
249
|
assert_equal "23", scraper.concat
|
289
|
-
|
250
|
+
|
251
|
+
# Stop at child of root.
|
252
|
+
scraper = new_scraper(DIVS1_23) do
|
290
253
|
process "div>div" do |element|
|
291
254
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
292
|
-
|
255
|
+
skip element.next_element
|
293
256
|
end
|
294
257
|
attr :concat
|
295
258
|
end
|
296
259
|
scraper.scrape
|
297
|
-
# Stop at child of root.
|
298
260
|
assert_equal "2", scraper.concat
|
299
261
|
|
300
|
-
|
262
|
+
# Child of root, the child of child of root.
|
263
|
+
scraper = new_scraper(DIVS1_23) do
|
301
264
|
process "div div" do |element|
|
302
265
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
303
|
-
false
|
304
266
|
end
|
305
267
|
attr :concat
|
306
268
|
end
|
307
269
|
scraper.scrape
|
308
|
-
# Child of root, the child of child of root.
|
309
270
|
assert_equal "23", scraper.concat
|
310
|
-
|
271
|
+
|
272
|
+
# Child of root.
|
273
|
+
scraper = new_scraper(DIVS1_23) do
|
311
274
|
process "div div" do |element|
|
312
275
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
313
|
-
|
276
|
+
skip element.next_element
|
314
277
|
end
|
315
278
|
attr :concat
|
316
279
|
end
|
317
280
|
scraper.scrape
|
318
|
-
# Child of root.
|
319
281
|
assert_equal "2", scraper.concat
|
320
282
|
end
|
321
283
|
|
@@ -328,15 +290,15 @@ class ScraperTest < Test::Unit::TestCase
|
|
328
290
|
end
|
329
291
|
scraper.scrape
|
330
292
|
assert_equal "this", scraper.this1
|
331
|
-
assert_equal
|
293
|
+
assert_equal "this", scraper.this2
|
332
294
|
|
333
295
|
scraper = new_scraper(html) do
|
334
296
|
process "#1", :this1=>:text, :skip=>false
|
335
297
|
process "#1", :this2=>:text
|
336
298
|
end
|
337
299
|
scraper.scrape
|
338
|
-
|
339
|
-
|
300
|
+
assert_equal "this", scraper.this1
|
301
|
+
assert_equal "this", scraper.this2
|
340
302
|
|
341
303
|
scraper = new_scraper(html) do
|
342
304
|
process "#1", :this1=>:text, :skip=>true do
|
@@ -346,13 +308,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
346
308
|
end
|
347
309
|
scraper.scrape
|
348
310
|
assert_equal "this", scraper.this1
|
349
|
-
assert_equal nil,
|
311
|
+
assert_equal nil, scraper.this2
|
350
312
|
end
|
351
313
|
|
352
314
|
|
353
315
|
def test_stop
|
354
|
-
|
355
|
-
scraper = new_scraper(html) do
|
316
|
+
scraper = new_scraper(DIVS123) do
|
356
317
|
process "div" do |element|
|
357
318
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
358
319
|
stop
|
@@ -365,18 +326,14 @@ class ScraperTest < Test::Unit::TestCase
|
|
365
326
|
|
366
327
|
|
367
328
|
def test_process_first
|
368
|
-
|
369
|
-
scraper = new_scraper(html) do
|
329
|
+
scraper = new_scraper(DIVS123) do
|
370
330
|
process "div" do |element|
|
371
331
|
@all = (@all || 0) + 1
|
372
|
-
false
|
373
332
|
end
|
374
333
|
process_first "div" do |element|
|
375
334
|
@first = (@first || 0) + 1
|
376
|
-
false
|
377
335
|
end
|
378
|
-
|
379
|
-
attr :first
|
336
|
+
attr_accessor :all, :first
|
380
337
|
end
|
381
338
|
scraper.scrape
|
382
339
|
assert_equal 3, scraper.all
|
@@ -391,14 +348,17 @@ class ScraperTest < Test::Unit::TestCase
|
|
391
348
|
response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
|
392
349
|
response["Last-Modified"] = time
|
393
350
|
response["ETag"] = "etag"
|
394
|
-
[response,
|
395
|
-
<html>
|
396
|
-
<head>
|
397
|
-
|
398
|
-
</head>
|
399
|
-
<body
|
400
|
-
|
401
|
-
|
351
|
+
[response, <<-EOF
|
352
|
+
<html>
|
353
|
+
<head>
|
354
|
+
<meta http-equiv="content-type" value="text/html; charset=other-encoding">
|
355
|
+
</head>
|
356
|
+
<body>
|
357
|
+
<div id="x"/>
|
358
|
+
</body>
|
359
|
+
</html>
|
360
|
+
EOF
|
361
|
+
]
|
402
362
|
else
|
403
363
|
response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
|
404
364
|
response["Location"] = "http://localhost/redirect"
|
@@ -417,11 +377,15 @@ class ScraperTest < Test::Unit::TestCase
|
|
417
377
|
|
418
378
|
def test_scraping_end_to_end
|
419
379
|
Net::HTTP.on_get do |address, path, headers|
|
420
|
-
[Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"),
|
421
|
-
<html>
|
422
|
-
<body
|
423
|
-
|
424
|
-
|
380
|
+
[Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
|
381
|
+
<html>
|
382
|
+
<body>
|
383
|
+
<div id="1"/>
|
384
|
+
<div id="2"/>
|
385
|
+
</body>
|
386
|
+
</html>
|
387
|
+
EOF
|
388
|
+
]
|
425
389
|
end
|
426
390
|
scraper = new_scraper(URI.parse("http://localhost/")) do
|
427
391
|
process "div" do |element|
|
@@ -475,7 +439,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
475
439
|
|
476
440
|
|
477
441
|
def test_extractors_objects
|
478
|
-
html =
|
442
|
+
html = <<-EOF
|
443
|
+
<h1 class="header"></h1>
|
444
|
+
<h2 class="header"></h2>
|
445
|
+
EOF
|
479
446
|
# Extract both elements based on class, return the second one.
|
480
447
|
scraper = new_scraper(html) do
|
481
448
|
process ".header", extractor(:header=>:element)
|
@@ -484,7 +451,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
484
451
|
scraper.scrape
|
485
452
|
assert_equal "h2", scraper.header.name
|
486
453
|
# Extracting a specific element skips the second match.
|
487
|
-
html =
|
454
|
+
html = <<-EOF
|
455
|
+
<h1 class="header"></h1>
|
456
|
+
<h2 class="header"></h2>
|
457
|
+
EOF
|
488
458
|
scraper = new_scraper(html) do
|
489
459
|
process ".header", extractor(:header=>"h1")
|
490
460
|
attr :header
|
@@ -496,7 +466,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
496
466
|
|
497
467
|
def test_attribute_extractors
|
498
468
|
# Extracting the attribute skips the second match.
|
499
|
-
html =
|
469
|
+
html = <<-EOF
|
470
|
+
<abbr title="foo">bar</div>
|
471
|
+
<abbr>foo</abbr>
|
472
|
+
EOF
|
500
473
|
scraper = new_scraper(html) do
|
501
474
|
process "abbr", extractor(:title=>"@title")
|
502
475
|
attr :title
|
@@ -504,7 +477,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
504
477
|
scraper.scrape
|
505
478
|
assert_equal "foo", scraper.title
|
506
479
|
# Extracting a specific element skips the second match.
|
507
|
-
html =
|
480
|
+
html = <<-EOF
|
481
|
+
<h1 class="header" id="1"></h1>
|
482
|
+
<h2 class="header" id="2"></h2>
|
483
|
+
EOF
|
508
484
|
scraper = new_scraper(html) do
|
509
485
|
process ".header", extractor(:header=>"h1@id")
|
510
486
|
attr :header
|
@@ -522,7 +498,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
522
498
|
attr :h1
|
523
499
|
attr :h2
|
524
500
|
end
|
525
|
-
html =
|
501
|
+
html = <<-EOF
|
502
|
+
<div>
|
503
|
+
<h1>first</h1>
|
504
|
+
<h2>second</h2>
|
505
|
+
</div>
|
506
|
+
EOF
|
526
507
|
scraper = new_scraper(html) do
|
527
508
|
process "div", extractor(:headers=>headers)
|
528
509
|
attr :headers
|
@@ -535,7 +516,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
535
516
|
|
536
517
|
|
537
518
|
def test_array_extractors
|
538
|
-
html =
|
519
|
+
html = <<-EOF
|
520
|
+
<div>
|
521
|
+
<h1>first</h1>
|
522
|
+
<h1>second</h1>
|
523
|
+
</div>
|
524
|
+
EOF
|
539
525
|
scraper = new_scraper(html) do
|
540
526
|
process "h1", extractor("headers[]"=>:text)
|
541
527
|
attr :headers
|
@@ -543,13 +529,17 @@ class ScraperTest < Test::Unit::TestCase
|
|
543
529
|
scraper.scrape
|
544
530
|
assert scraper.headers.is_a?(Array)
|
545
531
|
assert_equal 2, scraper.headers.size
|
546
|
-
assert_equal "first",
|
547
|
-
assert_equal "second",
|
532
|
+
assert_equal "first", scraper.headers[0]
|
533
|
+
assert_equal "second", scraper.headers[1]
|
548
534
|
end
|
549
535
|
|
550
536
|
|
551
537
|
def test_hash_extractors
|
552
|
-
html =
|
538
|
+
html = <<-EOF
|
539
|
+
<div>
|
540
|
+
<h1 id="1" class="header">first</h1>
|
541
|
+
</div>
|
542
|
+
EOF
|
553
543
|
scraper = new_scraper(html) do
|
554
544
|
process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
|
555
545
|
attr :header
|
@@ -558,21 +548,25 @@ class ScraperTest < Test::Unit::TestCase
|
|
558
548
|
assert scraper.header.is_a?(Hash)
|
559
549
|
assert_equal 3, scraper.header.size
|
560
550
|
assert_equal "1", scraper.header[:id]
|
561
|
-
assert_equal "header",
|
562
|
-
assert_equal "first",
|
551
|
+
assert_equal "header", scraper.header[:class]
|
552
|
+
assert_equal "first", scraper.header[:text]
|
563
553
|
end
|
564
554
|
|
565
555
|
|
566
556
|
def test_multi_value_extractors
|
567
|
-
html =
|
557
|
+
html = <<-EOF
|
558
|
+
<div>
|
559
|
+
<h1 id="1" class="header">first</h1>
|
560
|
+
</div>
|
561
|
+
EOF
|
568
562
|
scraper = new_scraper(html) do
|
569
563
|
process "h1", [:text, :kls]=>Scraper.define {
|
570
564
|
process "*", :text=>:text, :kls=>"@class"
|
571
565
|
}
|
572
566
|
end
|
573
567
|
result = scraper.scrape
|
574
|
-
assert "first",
|
575
|
-
assert "header",
|
568
|
+
assert "first", result.text
|
569
|
+
assert "header", result.kls
|
576
570
|
end
|
577
571
|
|
578
572
|
|
@@ -581,7 +575,13 @@ class ScraperTest < Test::Unit::TestCase
|
|
581
575
|
# if not found look for class attribute (first
|
582
576
|
# two headers), otherwise just get text (third
|
583
577
|
# header).
|
584
|
-
html =
|
578
|
+
html = <<-EOF
|
579
|
+
<div>
|
580
|
+
<h1 class="foo">first</h1>
|
581
|
+
<h1 class="foo" id="bar">second</h1>
|
582
|
+
<h1>third</h1>
|
583
|
+
</div>
|
584
|
+
EOF
|
585
585
|
scraper = new_scraper(html) do
|
586
586
|
process "h1", extractor("headers[]"=>["@id", "@class", :text])
|
587
587
|
attr :headers
|
@@ -589,37 +589,41 @@ class ScraperTest < Test::Unit::TestCase
|
|
589
589
|
scraper.scrape
|
590
590
|
assert scraper.headers.is_a?(Array)
|
591
591
|
assert_equal 3, scraper.headers.size
|
592
|
-
assert_equal "foo",
|
593
|
-
assert_equal "bar",
|
592
|
+
assert_equal "foo", scraper.headers[0]
|
593
|
+
assert_equal "bar", scraper.headers[1]
|
594
594
|
assert_equal "third", scraper.headers[2]
|
595
595
|
end
|
596
596
|
|
597
597
|
|
598
|
+
DIVS_ST_ND = <<-EOF
|
599
|
+
<div id="1">first</div>
|
600
|
+
<div id="2">second</div>
|
601
|
+
EOF
|
602
|
+
|
598
603
|
def test_accessors_from_extractor
|
599
|
-
|
600
|
-
scraper = new_scraper(html) do
|
604
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
601
605
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
602
606
|
result :div_id
|
603
607
|
end
|
604
608
|
value = scraper.scrape
|
605
609
|
assert_equal "1", value
|
606
610
|
|
607
|
-
scraper = new_scraper(
|
611
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
608
612
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
609
613
|
result :div_id, :div_text
|
610
614
|
end
|
611
615
|
value = scraper.scrape
|
612
|
-
assert_equal "1",
|
616
|
+
assert_equal "1", value.div_id
|
613
617
|
assert_equal "first", value.div_text
|
614
618
|
|
615
|
-
scraper = new_scraper(
|
619
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
616
620
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
617
621
|
end
|
618
622
|
value = scraper.scrape
|
619
|
-
assert_equal "1",
|
623
|
+
assert_equal "1", value.div_id
|
620
624
|
assert_equal "first", value.div_text
|
621
625
|
|
622
|
-
scraper = new_scraper(
|
626
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
623
627
|
attr_accessor :div_class
|
624
628
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
625
629
|
result :div_id, :div_class
|
@@ -628,7 +632,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
628
632
|
assert_equal "1", value.div_id
|
629
633
|
assert_raise(NoMethodError) { value.div_text }
|
630
634
|
|
631
|
-
scraper = new_scraper(
|
635
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
632
636
|
process "div", "div_ids[]"=>"@id"
|
633
637
|
result :div_ids
|
634
638
|
end
|
@@ -639,8 +643,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
639
643
|
|
640
644
|
|
641
645
|
def test_array_accessors
|
642
|
-
|
643
|
-
scraper = new_scraper(html) do
|
646
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
644
647
|
array :div_id, :div_text
|
645
648
|
process "div", :div_id=>"@id", :div_text=>:text
|
646
649
|
result :div_id, :div_text
|
@@ -650,8 +653,8 @@ class ScraperTest < Test::Unit::TestCase
|
|
650
653
|
assert_equal 2, value.div_text.size
|
651
654
|
assert_equal "1", value.div_id[0]
|
652
655
|
assert_equal "2", value.div_id[1]
|
653
|
-
assert_equal "first",
|
654
|
-
assert_equal "second",
|
656
|
+
assert_equal "first", value.div_text[0]
|
657
|
+
assert_equal "second", value.div_text[1]
|
655
658
|
end
|
656
659
|
|
657
660
|
|
@@ -659,9 +662,17 @@ class ScraperTest < Test::Unit::TestCase
|
|
659
662
|
# Root element tests.
|
660
663
|
#
|
661
664
|
|
665
|
+
HTML_EMPTY = <<-EOF
|
666
|
+
<html>
|
667
|
+
<head>
|
668
|
+
</head>
|
669
|
+
<body>
|
670
|
+
</body>
|
671
|
+
</html>
|
672
|
+
EOF
|
673
|
+
|
662
674
|
def test_scrape_body_by_default
|
663
|
-
|
664
|
-
scraper = Class.new(Scraper::Base).new(html)
|
675
|
+
scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
|
665
676
|
scraper.class.instance_eval do
|
666
677
|
process "head" do |element| @head = element end
|
667
678
|
attr :head
|
@@ -675,17 +686,16 @@ class ScraperTest < Test::Unit::TestCase
|
|
675
686
|
|
676
687
|
|
677
688
|
def test_changing_root_element
|
678
|
-
|
679
|
-
only_header = new_scraper(html) do
|
689
|
+
only_header = new_scraper(HTML_EMPTY) do
|
680
690
|
root_element "head"
|
681
691
|
process "head" do |element| @head = element end
|
682
692
|
attr :head
|
683
693
|
process "body" do |element| @body = element end
|
684
694
|
attr :body
|
685
695
|
end
|
686
|
-
only_body = Class.new(only_header.class).new(
|
696
|
+
only_body = Class.new(only_header.class).new(HTML_EMPTY)
|
687
697
|
only_body.class.root_element "body"
|
688
|
-
both_parts = Class.new(only_body.class).new(
|
698
|
+
both_parts = Class.new(only_body.class).new(HTML_EMPTY)
|
689
699
|
both_parts.class.root_element nil
|
690
700
|
# We set this scraper to begin with the head element,
|
691
701
|
# so we can see the head element, but not the body.
|
@@ -709,8 +719,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
709
719
|
|
710
720
|
def test_prepare_and_result
|
711
721
|
# Extracting the attribute skips the second match.
|
712
|
-
|
713
|
-
scraper = new_scraper(html) do
|
722
|
+
scraper = new_scraper(DIVS123) do
|
714
723
|
process("div") { |element| @count +=1 }
|
715
724
|
define_method(:prepare) { @count = 1 }
|
716
725
|
define_method(:result) { @count }
|
@@ -722,8 +731,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
722
731
|
|
723
732
|
def test_changing_document_from_prepare
|
724
733
|
# Extracting the attribute skips the second match.
|
725
|
-
|
726
|
-
scraper = new_scraper(html) do
|
734
|
+
scraper = new_scraper(DIVS123) do
|
727
735
|
selector :divs, "div"
|
728
736
|
define_method :prepare do |document|
|
729
737
|
@document = divs(document)[1]
|
@@ -739,13 +747,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
739
747
|
|
740
748
|
|
741
749
|
def test_anonymous_scrapers
|
742
|
-
html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
|
743
750
|
scraper = Scraper.define do
|
744
751
|
array :ids
|
745
752
|
process "div", :ids=>"@id"
|
746
753
|
result :ids
|
747
754
|
end
|
748
|
-
result = scraper.scrape(
|
755
|
+
result = scraper.scrape(DIVS123)
|
749
756
|
assert_equal "1", result[0]
|
750
757
|
assert_equal "2", result[1]
|
751
758
|
assert_equal "3", result[2]
|
@@ -753,14 +760,13 @@ class ScraperTest < Test::Unit::TestCase
|
|
753
760
|
|
754
761
|
|
755
762
|
def test_named_rules
|
756
|
-
html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
|
757
763
|
scraper = Scraper.define do
|
758
764
|
array :ids1, :ids2
|
759
765
|
process :main, "div", :ids1=>"@id"
|
760
766
|
process :main, "div", :ids2=>"@id"
|
761
767
|
result :ids1, :ids2
|
762
768
|
end
|
763
|
-
result = scraper.scrape(
|
769
|
+
result = scraper.scrape(DIVS123)
|
764
770
|
assert_equal nil, result.ids1
|
765
771
|
assert_equal 3, result.ids2.size
|
766
772
|
assert_equal "1", result.ids2[0]
|
@@ -775,7 +781,7 @@ protected
|
|
775
781
|
cls = Class.new(Scraper::Base)
|
776
782
|
cls.root_element nil
|
777
783
|
cls.parser :html_parser
|
778
|
-
cls.
|
784
|
+
cls.class_eval &block if block
|
779
785
|
cls.new(what)
|
780
786
|
end
|
781
787
|
|
@@ -791,7 +797,7 @@ protected
|
|
791
797
|
cls = Class.new(Scraper::Base)
|
792
798
|
cls.root_element nil
|
793
799
|
cls.parser :tidy
|
794
|
-
cls.
|
800
|
+
cls.class_eval &block if block
|
795
801
|
cls.new(what)
|
796
802
|
end
|
797
803
|
|
metadata
CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrapi
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date: 2006-08-
|
8
|
-
summary: scrAPI toolkit for Ruby
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2006-08-27 00:00:00 -07:00
|
8
|
+
summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: assaf.arkin@gmail.com
|
12
|
-
homepage: http://labnotes.org/
|
12
|
+
homepage: http://blog.labnotes.org/category/scrapi/
|
13
13
|
rubyforge_project: scrapi
|
14
|
-
description:
|
14
|
+
description: scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
15
15
|
autorequire: scrapi.rb
|
16
16
|
default_executable:
|
17
17
|
bindir: bin
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- lib/html
|
41
41
|
- lib/scraper/reader.rb
|
42
42
|
- lib/scraper/base.rb
|
43
|
+
- lib/scraper/microformats.rb
|
43
44
|
- lib/tidy/libtidy.so
|
44
45
|
- lib/tidy/libtidy.dll
|
45
46
|
- lib/html/node_ext.rb
|