scrapi 1.1.2 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +8 -0
- data/Rakefile +3 -4
- data/lib/scraper/base.rb +28 -8
- data/lib/scraper/microformats.rb +93 -0
- data/test/scraper_test.rb +178 -172
- metadata +6 -5
data/CHANGELOG
CHANGED
@@ -1,3 +1,11 @@
|
|
1
|
+
Version 1.2.0 (August 27, 2006)
|
2
|
+
|
3
|
+
* Added: collect() method called just before result().
|
4
|
+
* Changed: Elements are not skipped when processed unless :skip=>true.
|
5
|
+
! Pay attention to this one, it could affect some scrapers.
|
6
|
+
* Fixed: Declaring an array immediately creates an accessor for it.
|
7
|
+
* Added: Scraper::Microformat for scraping hAtom and basic hCard.
|
8
|
+
|
1
9
|
Version 1.1.2 (August 13, 2006)
|
2
10
|
|
3
11
|
* Changed: Allows multiple :not pseudo classes to be used with the same
|
data/Rakefile
CHANGED
@@ -41,14 +41,13 @@ gem_spec = Gem::Specification.new do |spec|
|
|
41
41
|
|
42
42
|
spec.name = "scrapi"
|
43
43
|
spec.version = version
|
44
|
-
spec.summary = "scrAPI toolkit for Ruby"
|
44
|
+
spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
|
45
45
|
spec.description = <<-EOF
|
46
|
-
|
47
|
-
select => extract => store processing rules.
|
46
|
+
scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
48
47
|
EOF
|
49
48
|
spec.author = "Assaf Arkin"
|
50
49
|
spec.email = "assaf.arkin@gmail.com"
|
51
|
-
spec.homepage = "http://labnotes.org/"
|
50
|
+
spec.homepage = "http://blog.labnotes.org/category/scrapi/"
|
52
51
|
|
53
52
|
spec.files = FileList["{test,lib}/**/*", "README", "CHANGELOG", "Rakefile", "MIT-LICENSE"].to_a
|
54
53
|
spec.require_path = "lib"
|
data/lib/scraper/base.rb
CHANGED
@@ -472,11 +472,19 @@ module Scraper
|
|
472
472
|
# process "a[href]", "urls[]"=>"@href"
|
473
473
|
def array(*symbols)
|
474
474
|
@arrays ||= []
|
475
|
-
symbols.each
|
475
|
+
symbols.each do |symbol|
|
476
|
+
symbol = symbol.to_sym
|
477
|
+
@arrays << symbol
|
478
|
+
begin
|
479
|
+
self.instance_method(symbol)
|
480
|
+
rescue NameError
|
481
|
+
attr_accessor symbol
|
482
|
+
end
|
483
|
+
end
|
476
484
|
end
|
477
485
|
|
478
486
|
|
479
|
-
|
487
|
+
private
|
480
488
|
|
481
489
|
|
482
490
|
# Called by #process and #process_first, see there for
|
@@ -655,6 +663,7 @@ module Scraper
|
|
655
663
|
begin
|
656
664
|
self.instance_method(target)
|
657
665
|
rescue NameError
|
666
|
+
instance = "@#{target}".to_sym
|
658
667
|
attr_accessor target
|
659
668
|
end
|
660
669
|
reader = "#{target}=".to_sym
|
@@ -740,8 +749,9 @@ module Scraper
|
|
740
749
|
prepare document
|
741
750
|
# Retrieve the document. This may raise HTTPError or HTMLParseError.
|
742
751
|
case document
|
743
|
-
when Array
|
744
|
-
|
752
|
+
when Array
|
753
|
+
stack = @document.reverse # see below
|
754
|
+
when HTML::Node
|
745
755
|
# If a root element is specified, start selecting from there.
|
746
756
|
# The stack is empty if we can't find any root element (makes
|
747
757
|
# sense). However, the node we're going to process may be
|
@@ -750,7 +760,8 @@ module Scraper
|
|
750
760
|
root_element = option(:root_element)
|
751
761
|
root = root_element ? @document.find(:tag=>root_element) : @document
|
752
762
|
stack = root ? (root.tag? ? [root] : root.children.reverse) : []
|
753
|
-
else
|
763
|
+
else
|
764
|
+
return
|
754
765
|
end
|
755
766
|
# @skip stores all the elements we want to skip (see #skip).
|
756
767
|
# rules stores all the rules we want to process with this
|
@@ -801,9 +812,10 @@ module Scraper
|
|
801
812
|
# If it returns true, skip the element and if
|
802
813
|
# the current element, don't process any more
|
803
814
|
# rules. Again, pay attention to descendants.
|
804
|
-
|
805
|
-
if (skip || @skip.delete(true)) && @skip.delete(false).nil?
|
815
|
+
if extractor.bind(self).call(element)
|
806
816
|
@extracted = true
|
817
|
+
end
|
818
|
+
if @skip.delete(true)
|
807
819
|
if element.equal?(node)
|
808
820
|
skip_this = true
|
809
821
|
else
|
@@ -824,6 +836,7 @@ module Scraper
|
|
824
836
|
ensure
|
825
837
|
@skip = nil
|
826
838
|
end
|
839
|
+
collect
|
827
840
|
return result
|
828
841
|
end
|
829
842
|
|
@@ -895,7 +908,7 @@ module Scraper
|
|
895
908
|
case elements
|
896
909
|
when Array: @skip.concat elements
|
897
910
|
when HTML::Node: @skip << elements
|
898
|
-
when nil: @skip <<
|
911
|
+
when nil: @skip << true
|
899
912
|
when true, false: @skip << elements
|
900
913
|
end
|
901
914
|
# Calling skip(element) as the last statement is
|
@@ -920,6 +933,13 @@ module Scraper
|
|
920
933
|
end
|
921
934
|
|
922
935
|
|
936
|
+
# Called by #scrape after scraping the document, and before calling #result.
|
937
|
+
# Typically used to run any validation, post-processing steps,
|
938
|
+
# resolving referenced elements, etc.
|
939
|
+
def collect()
|
940
|
+
end
|
941
|
+
|
942
|
+
|
923
943
|
# Returns the result of a successful scrape.
|
924
944
|
#
|
925
945
|
# This method is called by #scrape after running all the rules on the
|
@@ -0,0 +1,93 @@
|
|
1
|
+
require "time"
|
2
|
+
|
3
|
+
|
4
|
+
module Scraper
|
5
|
+
|
6
|
+
module Microformats
|
7
|
+
|
8
|
+
class HCard < Scraper::Base
|
9
|
+
|
10
|
+
process ".fn", :fn=>:text
|
11
|
+
process ".given-name", :given_name=>:text
|
12
|
+
process ".family-name", :family_name=>:text
|
13
|
+
process "img.photo", :photo=>"@src"
|
14
|
+
process "a.url", :url=>"@href"
|
15
|
+
|
16
|
+
result :fn, :given_name, :family_name, :photo, :url
|
17
|
+
|
18
|
+
def collect()
|
19
|
+
unless fn
|
20
|
+
if self.fn = given_name
|
21
|
+
self.given_name << " #{family_name}" if family_name
|
22
|
+
else
|
23
|
+
self.fn = family_name
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
end
|
29
|
+
|
30
|
+
|
31
|
+
class HAtom < Scraper::Base
|
32
|
+
|
33
|
+
class Entry < Scraper::Base
|
34
|
+
|
35
|
+
array :content, :tags
|
36
|
+
|
37
|
+
process ".entry-title", :title=>:text
|
38
|
+
process ".entry-content", :content=>:element
|
39
|
+
process ".entry-summary", :summary=>:element
|
40
|
+
process "a[rel~=bookmark]", :permalink=>["@href"]
|
41
|
+
process ".author.vcard, .author .vcard", :author=>HCard
|
42
|
+
process ".published", :published=>["abbr@title", :text]
|
43
|
+
process ".updated", :updated=>["abbr@title", :text]
|
44
|
+
process "a[rel~=tag]", :tags=>:text
|
45
|
+
|
46
|
+
def collect()
|
47
|
+
self.published = Time.parse(published)
|
48
|
+
self.updated = updated ? Time.parse(updated) : published
|
49
|
+
end
|
50
|
+
|
51
|
+
result :title, :content, :summary, :permalink, :author, :published, :updated, :tags
|
52
|
+
|
53
|
+
end
|
54
|
+
|
55
|
+
class Feed < Scraper::Base
|
56
|
+
|
57
|
+
array :entries
|
58
|
+
|
59
|
+
process ".hentry", :entries=>Entry
|
60
|
+
|
61
|
+
def result()
|
62
|
+
entries
|
63
|
+
end
|
64
|
+
|
65
|
+
end
|
66
|
+
|
67
|
+
array :feeds, :entries
|
68
|
+
|
69
|
+
# Skip feeds, so we don't process them twice.
|
70
|
+
process ".hfeed", :skip=>true, :feeds=>Feed
|
71
|
+
# And so we can collect unwrapped entries into a separate feed.
|
72
|
+
process ".hentry", :skip=>true, :entries=>Entry
|
73
|
+
# And collect the first remaining hcard as the default author.
|
74
|
+
process ".vcard", :hcard=>HCard
|
75
|
+
|
76
|
+
def collect()
|
77
|
+
@feeds ||= []
|
78
|
+
@feeds << entries if entries
|
79
|
+
for feed in feeds
|
80
|
+
for entry in feed
|
81
|
+
entry.author = hcard unless entry.author
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
result :feeds
|
87
|
+
|
88
|
+
end
|
89
|
+
|
90
|
+
end
|
91
|
+
|
92
|
+
end
|
93
|
+
|
data/test/scraper_test.rb
CHANGED
@@ -14,6 +14,20 @@ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
|
|
14
14
|
|
15
15
|
class ScraperTest < Test::Unit::TestCase
|
16
16
|
|
17
|
+
DIVS123 = <<-EOF
|
18
|
+
<div id="1"></div>
|
19
|
+
<div id="2"></div>
|
20
|
+
<div id="3"></div>
|
21
|
+
EOF
|
22
|
+
|
23
|
+
DIVS1_23 = <<-EOF
|
24
|
+
<div id="1">
|
25
|
+
<div id="2"></div>
|
26
|
+
<div id="3"></div>
|
27
|
+
</div>
|
28
|
+
EOF
|
29
|
+
|
30
|
+
|
17
31
|
def setup
|
18
32
|
Net::HTTP.reset_on_get
|
19
33
|
end
|
@@ -28,8 +42,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
28
42
|
#
|
29
43
|
|
30
44
|
def test_define_selectors
|
31
|
-
|
32
|
-
scraper = new_scraper(html) do
|
45
|
+
scraper = new_scraper(DIVS123) do
|
33
46
|
selector :test, "div"
|
34
47
|
end
|
35
48
|
assert_equal 3, scraper.test(scraper.document).size
|
@@ -40,8 +53,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
40
53
|
|
41
54
|
|
42
55
|
def test_selector_blocks
|
43
|
-
|
44
|
-
scraper = new_scraper(html) do
|
56
|
+
scraper = new_scraper(DIVS123) do
|
45
57
|
selector :test, "div" do |elements|
|
46
58
|
return elements[0..-2]
|
47
59
|
elements[0..-2]
|
@@ -52,18 +64,16 @@ class ScraperTest < Test::Unit::TestCase
|
|
52
64
|
|
53
65
|
|
54
66
|
def test_array_selectors
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
|
67
|
+
scraper = new_scraper(DIVS123) do
|
68
|
+
selector :test, "#?", "2"
|
69
|
+
end
|
70
|
+
assert_equal 1, scraper.test(scraper.document).size
|
71
|
+
assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
|
61
72
|
end
|
62
73
|
|
63
74
|
|
64
75
|
def test_object_selectors
|
65
|
-
|
66
|
-
scraper = new_scraper(html) do
|
76
|
+
scraper = new_scraper(DIVS123) do
|
67
77
|
selector :test, HTML::Selector.new("div")
|
68
78
|
end
|
69
79
|
assert_equal 3, scraper.test(scraper.document).size
|
@@ -71,8 +81,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
71
81
|
|
72
82
|
|
73
83
|
def test_selector_returns_array
|
74
|
-
|
75
|
-
scraper = new_scraper(html) do
|
84
|
+
scraper = new_scraper(DIVS123) do
|
76
85
|
selector :test0, "#4"
|
77
86
|
selector :test1, "#1"
|
78
87
|
selector :test3, "div"
|
@@ -84,26 +93,24 @@ class ScraperTest < Test::Unit::TestCase
|
|
84
93
|
|
85
94
|
|
86
95
|
def test_select_in_document_order
|
87
|
-
|
88
|
-
scraper = new_scraper(html) do
|
96
|
+
scraper = new_scraper(DIVS123) do
|
89
97
|
selector :test, "#2,#1"
|
90
98
|
end
|
91
|
-
assert_equal 2,
|
99
|
+
assert_equal 2, scraper.test(scraper.document).size
|
92
100
|
assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
|
93
101
|
assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
|
94
102
|
end
|
95
103
|
|
96
104
|
|
97
105
|
def test_selecting_first_element
|
98
|
-
|
99
|
-
scraper = new_scraper(html) do
|
106
|
+
scraper = new_scraper(DIVS123) do
|
100
107
|
selector :test, "div"
|
101
108
|
end
|
102
|
-
assert_equal 3,
|
109
|
+
assert_equal 3, scraper.test(scraper.document).size
|
103
110
|
assert scraper.first_test(scraper.document)
|
104
111
|
assert_equal "1", scraper.first_test(scraper.document).attributes["id"]
|
105
112
|
|
106
|
-
scraper = new_scraper(
|
113
|
+
scraper = new_scraper(DIVS123) do
|
107
114
|
selector :test, "div" do |element|
|
108
115
|
element[0].attributes["id"]
|
109
116
|
end
|
@@ -118,8 +125,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
118
125
|
#
|
119
126
|
|
120
127
|
def test_processing_rule
|
121
|
-
|
122
|
-
scraper = new_scraper(html) do
|
128
|
+
scraper = new_scraper(DIVS123) do
|
123
129
|
process "div" do |element|
|
124
130
|
@count = (@count || 0) + 1
|
125
131
|
end
|
@@ -131,8 +137,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
131
137
|
|
132
138
|
|
133
139
|
def test_processing_rule_with_array
|
134
|
-
|
135
|
-
scraper = new_scraper(html) do
|
140
|
+
scraper = new_scraper(DIVS123) do
|
136
141
|
process "#?", "1" do |element|
|
137
142
|
@count = (@count || 0) + 1
|
138
143
|
end
|
@@ -144,8 +149,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
144
149
|
|
145
150
|
|
146
151
|
def test_processing_rule_with_selector
|
147
|
-
|
148
|
-
scraper = new_scraper(html) do
|
152
|
+
scraper = new_scraper(DIVS123) do
|
149
153
|
process HTML::Selector.new("div") do |element|
|
150
154
|
@count = (@count || 0) + 1
|
151
155
|
end
|
@@ -157,8 +161,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
157
161
|
|
158
162
|
|
159
163
|
def test_extracting_in_code
|
160
|
-
|
161
|
-
scraper = new_scraper(html) do
|
164
|
+
scraper = new_scraper(DIVS123) do
|
162
165
|
process "div" do |element|
|
163
166
|
@concat = (@concat || "") << element.attributes["id"]
|
164
167
|
end
|
@@ -170,8 +173,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
170
173
|
|
171
174
|
|
172
175
|
def test_processing_in_document_order
|
173
|
-
|
174
|
-
scraper = new_scraper(html) do
|
176
|
+
scraper = new_scraper(DIVS123) do
|
175
177
|
process "#2,#1" do |element|
|
176
178
|
@concat = (@concat || "") << element.attributes["id"]
|
177
179
|
end
|
@@ -182,68 +184,28 @@ class ScraperTest < Test::Unit::TestCase
|
|
182
184
|
end
|
183
185
|
|
184
186
|
|
185
|
-
def test_skip_if_extractor_returns_true
|
186
|
-
html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
|
187
|
-
scraper = new_scraper(html) do
|
188
|
-
process "#1" do |element|
|
189
|
-
@first = true
|
190
|
-
false
|
191
|
-
end
|
192
|
-
process "#1" do |element|
|
193
|
-
@second = true
|
194
|
-
end
|
195
|
-
attr :first
|
196
|
-
attr :second
|
197
|
-
end
|
198
|
-
scraper.scrape
|
199
|
-
assert_equal true, scraper.first
|
200
|
-
assert_equal true, scraper.second
|
201
|
-
scraper = new_scraper(html) do
|
202
|
-
process "#1" do |element|
|
203
|
-
@first = true
|
204
|
-
true
|
205
|
-
end
|
206
|
-
process "#1" do |element|
|
207
|
-
@second = true
|
208
|
-
end
|
209
|
-
attr :first
|
210
|
-
attr :second
|
211
|
-
end
|
212
|
-
scraper.scrape
|
213
|
-
assert_equal true, scraper.first
|
214
|
-
assert_equal nil, scraper.second
|
215
|
-
end
|
216
|
-
|
217
|
-
|
218
187
|
def test_process_once_if_skipped
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
@first = true
|
223
|
-
skip element
|
224
|
-
false
|
188
|
+
scraper = new_scraper(DIVS123) do
|
189
|
+
def prepare(document)
|
190
|
+
@found = []
|
225
191
|
end
|
226
|
-
process
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
192
|
+
process("#1") { |element| @found[0] = true }
|
193
|
+
process("#1") { |element| @found[1] = true ; skip element }
|
194
|
+
process("#1") { |element| @found[2] = true }
|
195
|
+
process("#2", :skip=>true){ |element| @found[3] = true }
|
196
|
+
process("#2") { |element| @found[4] = true }
|
197
|
+
attr_reader :found
|
231
198
|
end
|
232
199
|
scraper.scrape
|
233
|
-
assert_equal true, scraper.
|
234
|
-
assert_equal nil, scraper.second
|
200
|
+
assert_equal [true, true, nil, true], scraper.found
|
235
201
|
end
|
236
202
|
|
237
203
|
|
238
204
|
def test_skip_children
|
239
|
-
|
240
|
-
scraper = new_scraper(html) do
|
205
|
+
scraper = new_scraper(DIVS1_23) do
|
241
206
|
process "div" do |element|
|
242
207
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
243
|
-
|
244
|
-
skip to_skip
|
245
|
-
end
|
246
|
-
false
|
208
|
+
skip id2(element)
|
247
209
|
end
|
248
210
|
selector :id2, "#2"
|
249
211
|
attr :concat
|
@@ -254,68 +216,68 @@ class ScraperTest < Test::Unit::TestCase
|
|
254
216
|
|
255
217
|
|
256
218
|
def test_skip_descendants
|
257
|
-
|
258
|
-
scraper = new_scraper(
|
219
|
+
# Root, child of root, grandchild of root.
|
220
|
+
scraper = new_scraper(DIVS1_23) do
|
259
221
|
process "div" do |element|
|
260
222
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
261
|
-
false
|
262
223
|
end
|
263
224
|
attr :concat
|
264
225
|
end
|
265
226
|
scraper.scrape
|
266
|
-
# Root, child of root, grandchild of root.
|
267
227
|
assert_equal "123", scraper.concat
|
268
|
-
|
228
|
+
|
229
|
+
# Stop at root.
|
230
|
+
scraper = new_scraper(DIVS1_23) do
|
269
231
|
process "div" do |element|
|
270
232
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
271
|
-
|
233
|
+
skip
|
272
234
|
end
|
273
235
|
attr :concat
|
274
236
|
end
|
275
237
|
scraper.scrape
|
276
|
-
# Stop at root.
|
277
238
|
assert_equal "1", scraper.concat
|
278
239
|
|
279
|
-
scraper
|
240
|
+
scraper.scrape
|
241
|
+
# Child of root, and child of root's child
|
242
|
+
scraper = new_scraper(DIVS1_23) do
|
280
243
|
process "div>div" do |element|
|
281
244
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
282
|
-
false
|
283
245
|
end
|
284
246
|
attr :concat
|
285
247
|
end
|
286
248
|
scraper.scrape
|
287
|
-
# Child of root, and child of root's child
|
288
249
|
assert_equal "23", scraper.concat
|
289
|
-
|
250
|
+
|
251
|
+
# Stop at child of root.
|
252
|
+
scraper = new_scraper(DIVS1_23) do
|
290
253
|
process "div>div" do |element|
|
291
254
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
292
|
-
|
255
|
+
skip element.next_element
|
293
256
|
end
|
294
257
|
attr :concat
|
295
258
|
end
|
296
259
|
scraper.scrape
|
297
|
-
# Stop at child of root.
|
298
260
|
assert_equal "2", scraper.concat
|
299
261
|
|
300
|
-
|
262
|
+
# Child of root, the child of child of root.
|
263
|
+
scraper = new_scraper(DIVS1_23) do
|
301
264
|
process "div div" do |element|
|
302
265
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
303
|
-
false
|
304
266
|
end
|
305
267
|
attr :concat
|
306
268
|
end
|
307
269
|
scraper.scrape
|
308
|
-
# Child of root, the child of child of root.
|
309
270
|
assert_equal "23", scraper.concat
|
310
|
-
|
271
|
+
|
272
|
+
# Child of root.
|
273
|
+
scraper = new_scraper(DIVS1_23) do
|
311
274
|
process "div div" do |element|
|
312
275
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
313
|
-
|
276
|
+
skip element.next_element
|
314
277
|
end
|
315
278
|
attr :concat
|
316
279
|
end
|
317
280
|
scraper.scrape
|
318
|
-
# Child of root.
|
319
281
|
assert_equal "2", scraper.concat
|
320
282
|
end
|
321
283
|
|
@@ -328,15 +290,15 @@ class ScraperTest < Test::Unit::TestCase
|
|
328
290
|
end
|
329
291
|
scraper.scrape
|
330
292
|
assert_equal "this", scraper.this1
|
331
|
-
assert_equal
|
293
|
+
assert_equal "this", scraper.this2
|
332
294
|
|
333
295
|
scraper = new_scraper(html) do
|
334
296
|
process "#1", :this1=>:text, :skip=>false
|
335
297
|
process "#1", :this2=>:text
|
336
298
|
end
|
337
299
|
scraper.scrape
|
338
|
-
|
339
|
-
|
300
|
+
assert_equal "this", scraper.this1
|
301
|
+
assert_equal "this", scraper.this2
|
340
302
|
|
341
303
|
scraper = new_scraper(html) do
|
342
304
|
process "#1", :this1=>:text, :skip=>true do
|
@@ -346,13 +308,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
346
308
|
end
|
347
309
|
scraper.scrape
|
348
310
|
assert_equal "this", scraper.this1
|
349
|
-
assert_equal nil,
|
311
|
+
assert_equal nil, scraper.this2
|
350
312
|
end
|
351
313
|
|
352
314
|
|
353
315
|
def test_stop
|
354
|
-
|
355
|
-
scraper = new_scraper(html) do
|
316
|
+
scraper = new_scraper(DIVS123) do
|
356
317
|
process "div" do |element|
|
357
318
|
@concat = (@concat || "") << (element.attributes["id"] || "")
|
358
319
|
stop
|
@@ -365,18 +326,14 @@ class ScraperTest < Test::Unit::TestCase
|
|
365
326
|
|
366
327
|
|
367
328
|
def test_process_first
|
368
|
-
|
369
|
-
scraper = new_scraper(html) do
|
329
|
+
scraper = new_scraper(DIVS123) do
|
370
330
|
process "div" do |element|
|
371
331
|
@all = (@all || 0) + 1
|
372
|
-
false
|
373
332
|
end
|
374
333
|
process_first "div" do |element|
|
375
334
|
@first = (@first || 0) + 1
|
376
|
-
false
|
377
335
|
end
|
378
|
-
|
379
|
-
attr :first
|
336
|
+
attr_accessor :all, :first
|
380
337
|
end
|
381
338
|
scraper.scrape
|
382
339
|
assert_equal 3, scraper.all
|
@@ -391,14 +348,17 @@ class ScraperTest < Test::Unit::TestCase
|
|
391
348
|
response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
|
392
349
|
response["Last-Modified"] = time
|
393
350
|
response["ETag"] = "etag"
|
394
|
-
[response,
|
395
|
-
<html>
|
396
|
-
<head>
|
397
|
-
|
398
|
-
</head>
|
399
|
-
<body
|
400
|
-
|
401
|
-
|
351
|
+
[response, <<-EOF
|
352
|
+
<html>
|
353
|
+
<head>
|
354
|
+
<meta http-equiv="content-type" value="text/html; charset=other-encoding">
|
355
|
+
</head>
|
356
|
+
<body>
|
357
|
+
<div id="x"/>
|
358
|
+
</body>
|
359
|
+
</html>
|
360
|
+
EOF
|
361
|
+
]
|
402
362
|
else
|
403
363
|
response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
|
404
364
|
response["Location"] = "http://localhost/redirect"
|
@@ -417,11 +377,15 @@ class ScraperTest < Test::Unit::TestCase
|
|
417
377
|
|
418
378
|
def test_scraping_end_to_end
|
419
379
|
Net::HTTP.on_get do |address, path, headers|
|
420
|
-
[Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"),
|
421
|
-
<html>
|
422
|
-
<body
|
423
|
-
|
424
|
-
|
380
|
+
[Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
|
381
|
+
<html>
|
382
|
+
<body>
|
383
|
+
<div id="1"/>
|
384
|
+
<div id="2"/>
|
385
|
+
</body>
|
386
|
+
</html>
|
387
|
+
EOF
|
388
|
+
]
|
425
389
|
end
|
426
390
|
scraper = new_scraper(URI.parse("http://localhost/")) do
|
427
391
|
process "div" do |element|
|
@@ -475,7 +439,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
475
439
|
|
476
440
|
|
477
441
|
def test_extractors_objects
|
478
|
-
html =
|
442
|
+
html = <<-EOF
|
443
|
+
<h1 class="header"></h1>
|
444
|
+
<h2 class="header"></h2>
|
445
|
+
EOF
|
479
446
|
# Extract both elements based on class, return the second one.
|
480
447
|
scraper = new_scraper(html) do
|
481
448
|
process ".header", extractor(:header=>:element)
|
@@ -484,7 +451,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
484
451
|
scraper.scrape
|
485
452
|
assert_equal "h2", scraper.header.name
|
486
453
|
# Extracting a specific element skips the second match.
|
487
|
-
html =
|
454
|
+
html = <<-EOF
|
455
|
+
<h1 class="header"></h1>
|
456
|
+
<h2 class="header"></h2>
|
457
|
+
EOF
|
488
458
|
scraper = new_scraper(html) do
|
489
459
|
process ".header", extractor(:header=>"h1")
|
490
460
|
attr :header
|
@@ -496,7 +466,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
496
466
|
|
497
467
|
def test_attribute_extractors
|
498
468
|
# Extracting the attribute skips the second match.
|
499
|
-
html =
|
469
|
+
html = <<-EOF
|
470
|
+
<abbr title="foo">bar</div>
|
471
|
+
<abbr>foo</abbr>
|
472
|
+
EOF
|
500
473
|
scraper = new_scraper(html) do
|
501
474
|
process "abbr", extractor(:title=>"@title")
|
502
475
|
attr :title
|
@@ -504,7 +477,10 @@ class ScraperTest < Test::Unit::TestCase
|
|
504
477
|
scraper.scrape
|
505
478
|
assert_equal "foo", scraper.title
|
506
479
|
# Extracting a specific element skips the second match.
|
507
|
-
html =
|
480
|
+
html = <<-EOF
|
481
|
+
<h1 class="header" id="1"></h1>
|
482
|
+
<h2 class="header" id="2"></h2>
|
483
|
+
EOF
|
508
484
|
scraper = new_scraper(html) do
|
509
485
|
process ".header", extractor(:header=>"h1@id")
|
510
486
|
attr :header
|
@@ -522,7 +498,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
522
498
|
attr :h1
|
523
499
|
attr :h2
|
524
500
|
end
|
525
|
-
html =
|
501
|
+
html = <<-EOF
|
502
|
+
<div>
|
503
|
+
<h1>first</h1>
|
504
|
+
<h2>second</h2>
|
505
|
+
</div>
|
506
|
+
EOF
|
526
507
|
scraper = new_scraper(html) do
|
527
508
|
process "div", extractor(:headers=>headers)
|
528
509
|
attr :headers
|
@@ -535,7 +516,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
535
516
|
|
536
517
|
|
537
518
|
def test_array_extractors
|
538
|
-
html =
|
519
|
+
html = <<-EOF
|
520
|
+
<div>
|
521
|
+
<h1>first</h1>
|
522
|
+
<h1>second</h1>
|
523
|
+
</div>
|
524
|
+
EOF
|
539
525
|
scraper = new_scraper(html) do
|
540
526
|
process "h1", extractor("headers[]"=>:text)
|
541
527
|
attr :headers
|
@@ -543,13 +529,17 @@ class ScraperTest < Test::Unit::TestCase
|
|
543
529
|
scraper.scrape
|
544
530
|
assert scraper.headers.is_a?(Array)
|
545
531
|
assert_equal 2, scraper.headers.size
|
546
|
-
assert_equal "first",
|
547
|
-
assert_equal "second",
|
532
|
+
assert_equal "first", scraper.headers[0]
|
533
|
+
assert_equal "second", scraper.headers[1]
|
548
534
|
end
|
549
535
|
|
550
536
|
|
551
537
|
def test_hash_extractors
|
552
|
-
html =
|
538
|
+
html = <<-EOF
|
539
|
+
<div>
|
540
|
+
<h1 id="1" class="header">first</h1>
|
541
|
+
</div>
|
542
|
+
EOF
|
553
543
|
scraper = new_scraper(html) do
|
554
544
|
process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
|
555
545
|
attr :header
|
@@ -558,21 +548,25 @@ class ScraperTest < Test::Unit::TestCase
|
|
558
548
|
assert scraper.header.is_a?(Hash)
|
559
549
|
assert_equal 3, scraper.header.size
|
560
550
|
assert_equal "1", scraper.header[:id]
|
561
|
-
assert_equal "header",
|
562
|
-
assert_equal "first",
|
551
|
+
assert_equal "header", scraper.header[:class]
|
552
|
+
assert_equal "first", scraper.header[:text]
|
563
553
|
end
|
564
554
|
|
565
555
|
|
566
556
|
def test_multi_value_extractors
|
567
|
-
html =
|
557
|
+
html = <<-EOF
|
558
|
+
<div>
|
559
|
+
<h1 id="1" class="header">first</h1>
|
560
|
+
</div>
|
561
|
+
EOF
|
568
562
|
scraper = new_scraper(html) do
|
569
563
|
process "h1", [:text, :kls]=>Scraper.define {
|
570
564
|
process "*", :text=>:text, :kls=>"@class"
|
571
565
|
}
|
572
566
|
end
|
573
567
|
result = scraper.scrape
|
574
|
-
assert "first",
|
575
|
-
assert "header",
|
568
|
+
assert "first", result.text
|
569
|
+
assert "header", result.kls
|
576
570
|
end
|
577
571
|
|
578
572
|
|
@@ -581,7 +575,13 @@ class ScraperTest < Test::Unit::TestCase
|
|
581
575
|
# if not found look for class attribute (first
|
582
576
|
# two headers), otherwise just get text (third
|
583
577
|
# header).
|
584
|
-
html =
|
578
|
+
html = <<-EOF
|
579
|
+
<div>
|
580
|
+
<h1 class="foo">first</h1>
|
581
|
+
<h1 class="foo" id="bar">second</h1>
|
582
|
+
<h1>third</h1>
|
583
|
+
</div>
|
584
|
+
EOF
|
585
585
|
scraper = new_scraper(html) do
|
586
586
|
process "h1", extractor("headers[]"=>["@id", "@class", :text])
|
587
587
|
attr :headers
|
@@ -589,37 +589,41 @@ class ScraperTest < Test::Unit::TestCase
|
|
589
589
|
scraper.scrape
|
590
590
|
assert scraper.headers.is_a?(Array)
|
591
591
|
assert_equal 3, scraper.headers.size
|
592
|
-
assert_equal "foo",
|
593
|
-
assert_equal "bar",
|
592
|
+
assert_equal "foo", scraper.headers[0]
|
593
|
+
assert_equal "bar", scraper.headers[1]
|
594
594
|
assert_equal "third", scraper.headers[2]
|
595
595
|
end
|
596
596
|
|
597
597
|
|
598
|
+
DIVS_ST_ND = <<-EOF
|
599
|
+
<div id="1">first</div>
|
600
|
+
<div id="2">second</div>
|
601
|
+
EOF
|
602
|
+
|
598
603
|
def test_accessors_from_extractor
|
599
|
-
|
600
|
-
scraper = new_scraper(html) do
|
604
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
601
605
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
602
606
|
result :div_id
|
603
607
|
end
|
604
608
|
value = scraper.scrape
|
605
609
|
assert_equal "1", value
|
606
610
|
|
607
|
-
scraper = new_scraper(
|
611
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
608
612
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
609
613
|
result :div_id, :div_text
|
610
614
|
end
|
611
615
|
value = scraper.scrape
|
612
|
-
assert_equal "1",
|
616
|
+
assert_equal "1", value.div_id
|
613
617
|
assert_equal "first", value.div_text
|
614
618
|
|
615
|
-
scraper = new_scraper(
|
619
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
616
620
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
617
621
|
end
|
618
622
|
value = scraper.scrape
|
619
|
-
assert_equal "1",
|
623
|
+
assert_equal "1", value.div_id
|
620
624
|
assert_equal "first", value.div_text
|
621
625
|
|
622
|
-
scraper = new_scraper(
|
626
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
623
627
|
attr_accessor :div_class
|
624
628
|
process_first "div", :div_id=>"@id", :div_text=>:text
|
625
629
|
result :div_id, :div_class
|
@@ -628,7 +632,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
628
632
|
assert_equal "1", value.div_id
|
629
633
|
assert_raise(NoMethodError) { value.div_text }
|
630
634
|
|
631
|
-
scraper = new_scraper(
|
635
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
632
636
|
process "div", "div_ids[]"=>"@id"
|
633
637
|
result :div_ids
|
634
638
|
end
|
@@ -639,8 +643,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
639
643
|
|
640
644
|
|
641
645
|
def test_array_accessors
|
642
|
-
|
643
|
-
scraper = new_scraper(html) do
|
646
|
+
scraper = new_scraper(DIVS_ST_ND) do
|
644
647
|
array :div_id, :div_text
|
645
648
|
process "div", :div_id=>"@id", :div_text=>:text
|
646
649
|
result :div_id, :div_text
|
@@ -650,8 +653,8 @@ class ScraperTest < Test::Unit::TestCase
|
|
650
653
|
assert_equal 2, value.div_text.size
|
651
654
|
assert_equal "1", value.div_id[0]
|
652
655
|
assert_equal "2", value.div_id[1]
|
653
|
-
assert_equal "first",
|
654
|
-
assert_equal "second",
|
656
|
+
assert_equal "first", value.div_text[0]
|
657
|
+
assert_equal "second", value.div_text[1]
|
655
658
|
end
|
656
659
|
|
657
660
|
|
@@ -659,9 +662,17 @@ class ScraperTest < Test::Unit::TestCase
|
|
659
662
|
# Root element tests.
|
660
663
|
#
|
661
664
|
|
665
|
+
HTML_EMPTY = <<-EOF
|
666
|
+
<html>
|
667
|
+
<head>
|
668
|
+
</head>
|
669
|
+
<body>
|
670
|
+
</body>
|
671
|
+
</html>
|
672
|
+
EOF
|
673
|
+
|
662
674
|
def test_scrape_body_by_default
|
663
|
-
|
664
|
-
scraper = Class.new(Scraper::Base).new(html)
|
675
|
+
scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
|
665
676
|
scraper.class.instance_eval do
|
666
677
|
process "head" do |element| @head = element end
|
667
678
|
attr :head
|
@@ -675,17 +686,16 @@ class ScraperTest < Test::Unit::TestCase
|
|
675
686
|
|
676
687
|
|
677
688
|
def test_changing_root_element
|
678
|
-
|
679
|
-
only_header = new_scraper(html) do
|
689
|
+
only_header = new_scraper(HTML_EMPTY) do
|
680
690
|
root_element "head"
|
681
691
|
process "head" do |element| @head = element end
|
682
692
|
attr :head
|
683
693
|
process "body" do |element| @body = element end
|
684
694
|
attr :body
|
685
695
|
end
|
686
|
-
only_body = Class.new(only_header.class).new(
|
696
|
+
only_body = Class.new(only_header.class).new(HTML_EMPTY)
|
687
697
|
only_body.class.root_element "body"
|
688
|
-
both_parts = Class.new(only_body.class).new(
|
698
|
+
both_parts = Class.new(only_body.class).new(HTML_EMPTY)
|
689
699
|
both_parts.class.root_element nil
|
690
700
|
# We set this scraper to begin with the head element,
|
691
701
|
# so we can see the head element, but not the body.
|
@@ -709,8 +719,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
709
719
|
|
710
720
|
def test_prepare_and_result
|
711
721
|
# Extracting the attribute skips the second match.
|
712
|
-
|
713
|
-
scraper = new_scraper(html) do
|
722
|
+
scraper = new_scraper(DIVS123) do
|
714
723
|
process("div") { |element| @count +=1 }
|
715
724
|
define_method(:prepare) { @count = 1 }
|
716
725
|
define_method(:result) { @count }
|
@@ -722,8 +731,7 @@ class ScraperTest < Test::Unit::TestCase
|
|
722
731
|
|
723
732
|
def test_changing_document_from_prepare
|
724
733
|
# Extracting the attribute skips the second match.
|
725
|
-
|
726
|
-
scraper = new_scraper(html) do
|
734
|
+
scraper = new_scraper(DIVS123) do
|
727
735
|
selector :divs, "div"
|
728
736
|
define_method :prepare do |document|
|
729
737
|
@document = divs(document)[1]
|
@@ -739,13 +747,12 @@ class ScraperTest < Test::Unit::TestCase
|
|
739
747
|
|
740
748
|
|
741
749
|
def test_anonymous_scrapers
|
742
|
-
html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
|
743
750
|
scraper = Scraper.define do
|
744
751
|
array :ids
|
745
752
|
process "div", :ids=>"@id"
|
746
753
|
result :ids
|
747
754
|
end
|
748
|
-
result = scraper.scrape(
|
755
|
+
result = scraper.scrape(DIVS123)
|
749
756
|
assert_equal "1", result[0]
|
750
757
|
assert_equal "2", result[1]
|
751
758
|
assert_equal "3", result[2]
|
@@ -753,14 +760,13 @@ class ScraperTest < Test::Unit::TestCase
|
|
753
760
|
|
754
761
|
|
755
762
|
def test_named_rules
|
756
|
-
html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
|
757
763
|
scraper = Scraper.define do
|
758
764
|
array :ids1, :ids2
|
759
765
|
process :main, "div", :ids1=>"@id"
|
760
766
|
process :main, "div", :ids2=>"@id"
|
761
767
|
result :ids1, :ids2
|
762
768
|
end
|
763
|
-
result = scraper.scrape(
|
769
|
+
result = scraper.scrape(DIVS123)
|
764
770
|
assert_equal nil, result.ids1
|
765
771
|
assert_equal 3, result.ids2.size
|
766
772
|
assert_equal "1", result.ids2[0]
|
@@ -775,7 +781,7 @@ protected
|
|
775
781
|
cls = Class.new(Scraper::Base)
|
776
782
|
cls.root_element nil
|
777
783
|
cls.parser :html_parser
|
778
|
-
cls.
|
784
|
+
cls.class_eval &block if block
|
779
785
|
cls.new(what)
|
780
786
|
end
|
781
787
|
|
@@ -791,7 +797,7 @@ protected
|
|
791
797
|
cls = Class.new(Scraper::Base)
|
792
798
|
cls.root_element nil
|
793
799
|
cls.parser :tidy
|
794
|
-
cls.
|
800
|
+
cls.class_eval &block if block
|
795
801
|
cls.new(what)
|
796
802
|
end
|
797
803
|
|
metadata
CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
|
|
3
3
|
specification_version: 1
|
4
4
|
name: scrapi
|
5
5
|
version: !ruby/object:Gem::Version
|
6
|
-
version: 1.
|
7
|
-
date: 2006-08-
|
8
|
-
summary: scrAPI toolkit for Ruby
|
6
|
+
version: 1.2.0
|
7
|
+
date: 2006-08-27 00:00:00 -07:00
|
8
|
+
summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
|
9
9
|
require_paths:
|
10
10
|
- lib
|
11
11
|
email: assaf.arkin@gmail.com
|
12
|
-
homepage: http://labnotes.org/
|
12
|
+
homepage: http://blog.labnotes.org/category/scrapi/
|
13
13
|
rubyforge_project: scrapi
|
14
|
-
description:
|
14
|
+
description: scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
|
15
15
|
autorequire: scrapi.rb
|
16
16
|
default_executable:
|
17
17
|
bindir: bin
|
@@ -40,6 +40,7 @@ files:
|
|
40
40
|
- lib/html
|
41
41
|
- lib/scraper/reader.rb
|
42
42
|
- lib/scraper/base.rb
|
43
|
+
- lib/scraper/microformats.rb
|
43
44
|
- lib/tidy/libtidy.so
|
44
45
|
- lib/tidy/libtidy.dll
|
45
46
|
- lib/html/node_ext.rb
|