scrapi 1.1.2 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,11 @@
1
+ Version 1.2.0 (August 27, 2006)
2
+
3
+ * Added: collect() method called just before result().
4
+ * Changed: Elements are not skipped when processed unless :skip=>true.
5
+ ! Pay attention to this one, it could affect some scrapers.
6
+ * Fixed: Declaring an array immediately creates an accessor for it.
7
+ * Added: Scraper::Microformat for scraping hAtom and basic hCard.
8
+
1
9
  Version 1.1.2 (August 13, 2006)
2
10
 
3
11
  * Changed: Allows multiple :not pseudo classes to be used with the same
data/Rakefile CHANGED
@@ -41,14 +41,13 @@ gem_spec = Gem::Specification.new do |spec|
41
41
 
42
42
  spec.name = "scrapi"
43
43
  spec.version = version
44
- spec.summary = "scrAPI toolkit for Ruby"
44
+ spec.summary = "scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules."
45
45
  spec.description = <<-EOF
46
- A framework for writing scrapers using CSS selectors and simple
47
- select => extract => store processing rules.
46
+ scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
48
47
  EOF
49
48
  spec.author = "Assaf Arkin"
50
49
  spec.email = "assaf.arkin@gmail.com"
51
- spec.homepage = "http://labnotes.org/"
50
+ spec.homepage = "http://blog.labnotes.org/category/scrapi/"
52
51
 
53
52
  spec.files = FileList["{test,lib}/**/*", "README", "CHANGELOG", "Rakefile", "MIT-LICENSE"].to_a
54
53
  spec.require_path = "lib"
data/lib/scraper/base.rb CHANGED
@@ -472,11 +472,19 @@ module Scraper
472
472
  # process "a[href]", "urls[]"=>"@href"
473
473
  def array(*symbols)
474
474
  @arrays ||= []
475
- symbols.each { |sym| @arrays << sym.to_sym }
475
+ symbols.each do |symbol|
476
+ symbol = symbol.to_sym
477
+ @arrays << symbol
478
+ begin
479
+ self.instance_method(symbol)
480
+ rescue NameError
481
+ attr_accessor symbol
482
+ end
483
+ end
476
484
  end
477
485
 
478
486
 
479
- private
487
+ private
480
488
 
481
489
 
482
490
  # Called by #process and #process_first, see there for
@@ -655,6 +663,7 @@ module Scraper
655
663
  begin
656
664
  self.instance_method(target)
657
665
  rescue NameError
666
+ instance = "@#{target}".to_sym
658
667
  attr_accessor target
659
668
  end
660
669
  reader = "#{target}=".to_sym
@@ -740,8 +749,9 @@ module Scraper
740
749
  prepare document
741
750
  # Retrieve the document. This may raise HTTPError or HTMLParseError.
742
751
  case document
743
- when Array: stack = @document.reverse # see below
744
- when HTML::Node:
752
+ when Array
753
+ stack = @document.reverse # see below
754
+ when HTML::Node
745
755
  # If a root element is specified, start selecting from there.
746
756
  # The stack is empty if we can't find any root element (makes
747
757
  # sense). However, the node we're going to process may be
@@ -750,7 +760,8 @@ module Scraper
750
760
  root_element = option(:root_element)
751
761
  root = root_element ? @document.find(:tag=>root_element) : @document
752
762
  stack = root ? (root.tag? ? [root] : root.children.reverse) : []
753
- else return
763
+ else
764
+ return
754
765
  end
755
766
  # @skip stores all the elements we want to skip (see #skip).
756
767
  # rules stores all the rules we want to process with this
@@ -801,9 +812,10 @@ module Scraper
801
812
  # If it returns true, skip the element and if
802
813
  # the current element, don't process any more
803
814
  # rules. Again, pay attention to descendants.
804
- skip = extractor.bind(self).call(element)
805
- if (skip || @skip.delete(true)) && @skip.delete(false).nil?
815
+ if extractor.bind(self).call(element)
806
816
  @extracted = true
817
+ end
818
+ if @skip.delete(true)
807
819
  if element.equal?(node)
808
820
  skip_this = true
809
821
  else
@@ -824,6 +836,7 @@ module Scraper
824
836
  ensure
825
837
  @skip = nil
826
838
  end
839
+ collect
827
840
  return result
828
841
  end
829
842
 
@@ -895,7 +908,7 @@ module Scraper
895
908
  case elements
896
909
  when Array: @skip.concat elements
897
910
  when HTML::Node: @skip << elements
898
- when nil: @skip << self.element
911
+ when nil: @skip << true
899
912
  when true, false: @skip << elements
900
913
  end
901
914
  # Calling skip(element) as the last statement is
@@ -920,6 +933,13 @@ module Scraper
920
933
  end
921
934
 
922
935
 
936
+ # Called by #scrape after scraping the document, and before calling #result.
937
+ # Typically used to run any validation, post-processing steps,
938
+ # resolving referenced elements, etc.
939
+ def collect()
940
+ end
941
+
942
+
923
943
  # Returns the result of a successful scrape.
924
944
  #
925
945
  # This method is called by #scrape after running all the rules on the
@@ -0,0 +1,93 @@
1
+ require "time"
2
+
3
+
4
+ module Scraper
5
+
6
+ module Microformats
7
+
8
+ class HCard < Scraper::Base
9
+
10
+ process ".fn", :fn=>:text
11
+ process ".given-name", :given_name=>:text
12
+ process ".family-name", :family_name=>:text
13
+ process "img.photo", :photo=>"@src"
14
+ process "a.url", :url=>"@href"
15
+
16
+ result :fn, :given_name, :family_name, :photo, :url
17
+
18
+ def collect()
19
+ unless fn
20
+ if self.fn = given_name
21
+ self.given_name << " #{family_name}" if family_name
22
+ else
23
+ self.fn = family_name
24
+ end
25
+ end
26
+ end
27
+
28
+ end
29
+
30
+
31
+ class HAtom < Scraper::Base
32
+
33
+ class Entry < Scraper::Base
34
+
35
+ array :content, :tags
36
+
37
+ process ".entry-title", :title=>:text
38
+ process ".entry-content", :content=>:element
39
+ process ".entry-summary", :summary=>:element
40
+ process "a[rel~=bookmark]", :permalink=>["@href"]
41
+ process ".author.vcard, .author .vcard", :author=>HCard
42
+ process ".published", :published=>["abbr@title", :text]
43
+ process ".updated", :updated=>["abbr@title", :text]
44
+ process "a[rel~=tag]", :tags=>:text
45
+
46
+ def collect()
47
+ self.published = Time.parse(published)
48
+ self.updated = updated ? Time.parse(updated) : published
49
+ end
50
+
51
+ result :title, :content, :summary, :permalink, :author, :published, :updated, :tags
52
+
53
+ end
54
+
55
+ class Feed < Scraper::Base
56
+
57
+ array :entries
58
+
59
+ process ".hentry", :entries=>Entry
60
+
61
+ def result()
62
+ entries
63
+ end
64
+
65
+ end
66
+
67
+ array :feeds, :entries
68
+
69
+ # Skip feeds, so we don't process them twice.
70
+ process ".hfeed", :skip=>true, :feeds=>Feed
71
+ # And so we can collect unwrapped entries into a separate feed.
72
+ process ".hentry", :skip=>true, :entries=>Entry
73
+ # And collect the first remaining hcard as the default author.
74
+ process ".vcard", :hcard=>HCard
75
+
76
+ def collect()
77
+ @feeds ||= []
78
+ @feeds << entries if entries
79
+ for feed in feeds
80
+ for entry in feed
81
+ entry.author = hcard unless entry.author
82
+ end
83
+ end
84
+ end
85
+
86
+ result :feeds
87
+
88
+ end
89
+
90
+ end
91
+
92
+ end
93
+
data/test/scraper_test.rb CHANGED
@@ -14,6 +14,20 @@ require File.join(File.dirname(__FILE__), "../lib", "scrapi")
14
14
 
15
15
  class ScraperTest < Test::Unit::TestCase
16
16
 
17
+ DIVS123 = <<-EOF
18
+ <div id="1"></div>
19
+ <div id="2"></div>
20
+ <div id="3"></div>
21
+ EOF
22
+
23
+ DIVS1_23 = <<-EOF
24
+ <div id="1">
25
+ <div id="2"></div>
26
+ <div id="3"></div>
27
+ </div>
28
+ EOF
29
+
30
+
17
31
  def setup
18
32
  Net::HTTP.reset_on_get
19
33
  end
@@ -28,8 +42,7 @@ class ScraperTest < Test::Unit::TestCase
28
42
  #
29
43
 
30
44
  def test_define_selectors
31
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
32
- scraper = new_scraper(html) do
45
+ scraper = new_scraper(DIVS123) do
33
46
  selector :test, "div"
34
47
  end
35
48
  assert_equal 3, scraper.test(scraper.document).size
@@ -40,8 +53,7 @@ class ScraperTest < Test::Unit::TestCase
40
53
 
41
54
 
42
55
  def test_selector_blocks
43
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
44
- scraper = new_scraper(html) do
56
+ scraper = new_scraper(DIVS123) do
45
57
  selector :test, "div" do |elements|
46
58
  return elements[0..-2]
47
59
  elements[0..-2]
@@ -52,18 +64,16 @@ class ScraperTest < Test::Unit::TestCase
52
64
 
53
65
 
54
66
  def test_array_selectors
55
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
56
- scraper = new_scraper(html) do
57
- selector :test, "#?", "2"
58
- end
59
- assert_equal 1, scraper.test(scraper.document).size
60
- assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
67
+ scraper = new_scraper(DIVS123) do
68
+ selector :test, "#?", "2"
69
+ end
70
+ assert_equal 1, scraper.test(scraper.document).size
71
+ assert_equal "2", scraper.test(scraper.document)[0].attributes["id"]
61
72
  end
62
73
 
63
74
 
64
75
  def test_object_selectors
65
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
66
- scraper = new_scraper(html) do
76
+ scraper = new_scraper(DIVS123) do
67
77
  selector :test, HTML::Selector.new("div")
68
78
  end
69
79
  assert_equal 3, scraper.test(scraper.document).size
@@ -71,8 +81,7 @@ class ScraperTest < Test::Unit::TestCase
71
81
 
72
82
 
73
83
  def test_selector_returns_array
74
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
75
- scraper = new_scraper(html) do
84
+ scraper = new_scraper(DIVS123) do
76
85
  selector :test0, "#4"
77
86
  selector :test1, "#1"
78
87
  selector :test3, "div"
@@ -84,26 +93,24 @@ class ScraperTest < Test::Unit::TestCase
84
93
 
85
94
 
86
95
  def test_select_in_document_order
87
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
88
- scraper = new_scraper(html) do
96
+ scraper = new_scraper(DIVS123) do
89
97
  selector :test, "#2,#1"
90
98
  end
91
- assert_equal 2, scraper.test(scraper.document).size
99
+ assert_equal 2, scraper.test(scraper.document).size
92
100
  assert_equal "1", scraper.test(scraper.document)[0].attributes["id"]
93
101
  assert_equal "2", scraper.test(scraper.document)[1].attributes["id"]
94
102
  end
95
103
 
96
104
 
97
105
  def test_selecting_first_element
98
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
99
- scraper = new_scraper(html) do
106
+ scraper = new_scraper(DIVS123) do
100
107
  selector :test, "div"
101
108
  end
102
- assert_equal 3, scraper.test(scraper.document).size
109
+ assert_equal 3, scraper.test(scraper.document).size
103
110
  assert scraper.first_test(scraper.document)
104
111
  assert_equal "1", scraper.first_test(scraper.document).attributes["id"]
105
112
 
106
- scraper = new_scraper(html) do
113
+ scraper = new_scraper(DIVS123) do
107
114
  selector :test, "div" do |element|
108
115
  element[0].attributes["id"]
109
116
  end
@@ -118,8 +125,7 @@ class ScraperTest < Test::Unit::TestCase
118
125
  #
119
126
 
120
127
  def test_processing_rule
121
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
122
- scraper = new_scraper(html) do
128
+ scraper = new_scraper(DIVS123) do
123
129
  process "div" do |element|
124
130
  @count = (@count || 0) + 1
125
131
  end
@@ -131,8 +137,7 @@ class ScraperTest < Test::Unit::TestCase
131
137
 
132
138
 
133
139
  def test_processing_rule_with_array
134
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
135
- scraper = new_scraper(html) do
140
+ scraper = new_scraper(DIVS123) do
136
141
  process "#?", "1" do |element|
137
142
  @count = (@count || 0) + 1
138
143
  end
@@ -144,8 +149,7 @@ class ScraperTest < Test::Unit::TestCase
144
149
 
145
150
 
146
151
  def test_processing_rule_with_selector
147
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
148
- scraper = new_scraper(html) do
152
+ scraper = new_scraper(DIVS123) do
149
153
  process HTML::Selector.new("div") do |element|
150
154
  @count = (@count || 0) + 1
151
155
  end
@@ -157,8 +161,7 @@ class ScraperTest < Test::Unit::TestCase
157
161
 
158
162
 
159
163
  def test_extracting_in_code
160
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
161
- scraper = new_scraper(html) do
164
+ scraper = new_scraper(DIVS123) do
162
165
  process "div" do |element|
163
166
  @concat = (@concat || "") << element.attributes["id"]
164
167
  end
@@ -170,8 +173,7 @@ class ScraperTest < Test::Unit::TestCase
170
173
 
171
174
 
172
175
  def test_processing_in_document_order
173
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
174
- scraper = new_scraper(html) do
176
+ scraper = new_scraper(DIVS123) do
175
177
  process "#2,#1" do |element|
176
178
  @concat = (@concat || "") << element.attributes["id"]
177
179
  end
@@ -182,68 +184,28 @@ class ScraperTest < Test::Unit::TestCase
182
184
  end
183
185
 
184
186
 
185
- def test_skip_if_extractor_returns_true
186
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
187
- scraper = new_scraper(html) do
188
- process "#1" do |element|
189
- @first = true
190
- false
191
- end
192
- process "#1" do |element|
193
- @second = true
194
- end
195
- attr :first
196
- attr :second
197
- end
198
- scraper.scrape
199
- assert_equal true, scraper.first
200
- assert_equal true, scraper.second
201
- scraper = new_scraper(html) do
202
- process "#1" do |element|
203
- @first = true
204
- true
205
- end
206
- process "#1" do |element|
207
- @second = true
208
- end
209
- attr :first
210
- attr :second
211
- end
212
- scraper.scrape
213
- assert_equal true, scraper.first
214
- assert_equal nil, scraper.second
215
- end
216
-
217
-
218
187
  def test_process_once_if_skipped
219
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
220
- scraper = new_scraper(html) do
221
- process "#1" do |element|
222
- @first = true
223
- skip element
224
- false
188
+ scraper = new_scraper(DIVS123) do
189
+ def prepare(document)
190
+ @found = []
225
191
  end
226
- process "#1" do |element|
227
- @second = true
228
- end
229
- attr :first
230
- attr :second
192
+ process("#1") { |element| @found[0] = true }
193
+ process("#1") { |element| @found[1] = true ; skip element }
194
+ process("#1") { |element| @found[2] = true }
195
+ process("#2", :skip=>true){ |element| @found[3] = true }
196
+ process("#2") { |element| @found[4] = true }
197
+ attr_reader :found
231
198
  end
232
199
  scraper.scrape
233
- assert_equal true, scraper.first
234
- assert_equal nil, scraper.second
200
+ assert_equal [true, true, nil, true], scraper.found
235
201
  end
236
202
 
237
203
 
238
204
  def test_skip_children
239
- html = %Q{<div><div id="1"></div><div id="2"></div><div id="3"></div></div>}
240
- scraper = new_scraper(html) do
205
+ scraper = new_scraper(DIVS1_23) do
241
206
  process "div" do |element|
242
207
  @concat = (@concat || "") << (element.attributes["id"] || "")
243
- if to_skip = id2(element)
244
- skip to_skip
245
- end
246
- false
208
+ skip id2(element)
247
209
  end
248
210
  selector :id2, "#2"
249
211
  attr :concat
@@ -254,68 +216,68 @@ class ScraperTest < Test::Unit::TestCase
254
216
 
255
217
 
256
218
  def test_skip_descendants
257
- html = %Q{<div id="1"><div id="2"><div id="3"></div></div</div>}
258
- scraper = new_scraper(html) do
219
+ # Root, child of root, grandchild of root.
220
+ scraper = new_scraper(DIVS1_23) do
259
221
  process "div" do |element|
260
222
  @concat = (@concat || "") << (element.attributes["id"] || "")
261
- false
262
223
  end
263
224
  attr :concat
264
225
  end
265
226
  scraper.scrape
266
- # Root, child of root, grandchild of root.
267
227
  assert_equal "123", scraper.concat
268
- scraper = new_scraper(html) do
228
+
229
+ # Stop at root.
230
+ scraper = new_scraper(DIVS1_23) do
269
231
  process "div" do |element|
270
232
  @concat = (@concat || "") << (element.attributes["id"] || "")
271
- true
233
+ skip
272
234
  end
273
235
  attr :concat
274
236
  end
275
237
  scraper.scrape
276
- # Stop at root.
277
238
  assert_equal "1", scraper.concat
278
239
 
279
- scraper = new_scraper(html) do
240
+ scraper.scrape
241
+ # Child of root, and child of root's child
242
+ scraper = new_scraper(DIVS1_23) do
280
243
  process "div>div" do |element|
281
244
  @concat = (@concat || "") << (element.attributes["id"] || "")
282
- false
283
245
  end
284
246
  attr :concat
285
247
  end
286
248
  scraper.scrape
287
- # Child of root, and child of root's child
288
249
  assert_equal "23", scraper.concat
289
- scraper = new_scraper(html) do
250
+
251
+ # Stop at child of root.
252
+ scraper = new_scraper(DIVS1_23) do
290
253
  process "div>div" do |element|
291
254
  @concat = (@concat || "") << (element.attributes["id"] || "")
292
- true
255
+ skip element.next_element
293
256
  end
294
257
  attr :concat
295
258
  end
296
259
  scraper.scrape
297
- # Stop at child of root.
298
260
  assert_equal "2", scraper.concat
299
261
 
300
- scraper = new_scraper(html) do
262
+ # Child of root, the child of child of root.
263
+ scraper = new_scraper(DIVS1_23) do
301
264
  process "div div" do |element|
302
265
  @concat = (@concat || "") << (element.attributes["id"] || "")
303
- false
304
266
  end
305
267
  attr :concat
306
268
  end
307
269
  scraper.scrape
308
- # Child of root, the child of child of root.
309
270
  assert_equal "23", scraper.concat
310
- scraper = new_scraper(html) do
271
+
272
+ # Child of root.
273
+ scraper = new_scraper(DIVS1_23) do
311
274
  process "div div" do |element|
312
275
  @concat = (@concat || "") << (element.attributes["id"] || "")
313
- true
276
+ skip element.next_element
314
277
  end
315
278
  attr :concat
316
279
  end
317
280
  scraper.scrape
318
- # Child of root.
319
281
  assert_equal "2", scraper.concat
320
282
  end
321
283
 
@@ -328,15 +290,15 @@ class ScraperTest < Test::Unit::TestCase
328
290
  end
329
291
  scraper.scrape
330
292
  assert_equal "this", scraper.this1
331
- assert_equal nil, scraper.this2
293
+ assert_equal "this", scraper.this2
332
294
 
333
295
  scraper = new_scraper(html) do
334
296
  process "#1", :this1=>:text, :skip=>false
335
297
  process "#1", :this2=>:text
336
298
  end
337
299
  scraper.scrape
338
- #assert_equal "this", scraper.this1
339
- #assert_equal "this", scraper.this2
300
+ assert_equal "this", scraper.this1
301
+ assert_equal "this", scraper.this2
340
302
 
341
303
  scraper = new_scraper(html) do
342
304
  process "#1", :this1=>:text, :skip=>true do
@@ -346,13 +308,12 @@ class ScraperTest < Test::Unit::TestCase
346
308
  end
347
309
  scraper.scrape
348
310
  assert_equal "this", scraper.this1
349
- assert_equal nil, scraper.this2
311
+ assert_equal nil, scraper.this2
350
312
  end
351
313
 
352
314
 
353
315
  def test_stop
354
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
355
- scraper = new_scraper(html) do
316
+ scraper = new_scraper(DIVS123) do
356
317
  process "div" do |element|
357
318
  @concat = (@concat || "") << (element.attributes["id"] || "")
358
319
  stop
@@ -365,18 +326,14 @@ class ScraperTest < Test::Unit::TestCase
365
326
 
366
327
 
367
328
  def test_process_first
368
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
369
- scraper = new_scraper(html) do
329
+ scraper = new_scraper(DIVS123) do
370
330
  process "div" do |element|
371
331
  @all = (@all || 0) + 1
372
- false
373
332
  end
374
333
  process_first "div" do |element|
375
334
  @first = (@first || 0) + 1
376
- false
377
335
  end
378
- attr :all
379
- attr :first
336
+ attr_accessor :all, :first
380
337
  end
381
338
  scraper.scrape
382
339
  assert_equal 3, scraper.all
@@ -391,14 +348,17 @@ class ScraperTest < Test::Unit::TestCase
391
348
  response = Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK")
392
349
  response["Last-Modified"] = time
393
350
  response["ETag"] = "etag"
394
- [response, %Q{
395
- <html>
396
- <head>
397
- <meta http-equiv="content-type" value="text/html; charset=other-encoding">
398
- </head>
399
- <body><div id="x"/></body>
400
- </html>
401
- }]
351
+ [response, <<-EOF
352
+ <html>
353
+ <head>
354
+ <meta http-equiv="content-type" value="text/html; charset=other-encoding">
355
+ </head>
356
+ <body>
357
+ <div id="x"/>
358
+ </body>
359
+ </html>
360
+ EOF
361
+ ]
402
362
  else
403
363
  response = Net::HTTPMovedPermanently.new(Net::HTTP.version_1_2, 300, "Moved")
404
364
  response["Location"] = "http://localhost/redirect"
@@ -417,11 +377,15 @@ class ScraperTest < Test::Unit::TestCase
417
377
 
418
378
  def test_scraping_end_to_end
419
379
  Net::HTTP.on_get do |address, path, headers|
420
- [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), %Q{
421
- <html>
422
- <body><div id="1"/><div id="2"/></body>
423
- </html>
424
- }]
380
+ [Net::HTTPSuccess.new(Net::HTTP.version_1_2, 200, "OK"), <<-EOF
381
+ <html>
382
+ <body>
383
+ <div id="1"/>
384
+ <div id="2"/>
385
+ </body>
386
+ </html>
387
+ EOF
388
+ ]
425
389
  end
426
390
  scraper = new_scraper(URI.parse("http://localhost/")) do
427
391
  process "div" do |element|
@@ -475,7 +439,10 @@ class ScraperTest < Test::Unit::TestCase
475
439
 
476
440
 
477
441
  def test_extractors_objects
478
- html = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
442
+ html = <<-EOF
443
+ <h1 class="header"></h1>
444
+ <h2 class="header"></h2>
445
+ EOF
479
446
  # Extract both elements based on class, return the second one.
480
447
  scraper = new_scraper(html) do
481
448
  process ".header", extractor(:header=>:element)
@@ -484,7 +451,10 @@ class ScraperTest < Test::Unit::TestCase
484
451
  scraper.scrape
485
452
  assert_equal "h2", scraper.header.name
486
453
  # Extracting a specific element skips the second match.
487
- html = %Q{<h1 class="header"></h1><h2 class="header"></h2>}
454
+ html = <<-EOF
455
+ <h1 class="header"></h1>
456
+ <h2 class="header"></h2>
457
+ EOF
488
458
  scraper = new_scraper(html) do
489
459
  process ".header", extractor(:header=>"h1")
490
460
  attr :header
@@ -496,7 +466,10 @@ class ScraperTest < Test::Unit::TestCase
496
466
 
497
467
  def test_attribute_extractors
498
468
  # Extracting the attribute skips the second match.
499
- html = %Q{<abbr title="foo">bar</div><abbr>foo</abbr>}
469
+ html = <<-EOF
470
+ <abbr title="foo">bar</div>
471
+ <abbr>foo</abbr>
472
+ EOF
500
473
  scraper = new_scraper(html) do
501
474
  process "abbr", extractor(:title=>"@title")
502
475
  attr :title
@@ -504,7 +477,10 @@ class ScraperTest < Test::Unit::TestCase
504
477
  scraper.scrape
505
478
  assert_equal "foo", scraper.title
506
479
  # Extracting a specific element skips the second match.
507
- html = %Q{<h1 class="header" id="1"></h1><h2 class="header" id="2"></h2>}
480
+ html = <<-EOF
481
+ <h1 class="header" id="1"></h1>
482
+ <h2 class="header" id="2"></h2>
483
+ EOF
508
484
  scraper = new_scraper(html) do
509
485
  process ".header", extractor(:header=>"h1@id")
510
486
  attr :header
@@ -522,7 +498,12 @@ class ScraperTest < Test::Unit::TestCase
522
498
  attr :h1
523
499
  attr :h2
524
500
  end
525
- html = %Q{<div><h1>first</h1><h2>second</h2></div>}
501
+ html = <<-EOF
502
+ <div>
503
+ <h1>first</h1>
504
+ <h2>second</h2>
505
+ </div>
506
+ EOF
526
507
  scraper = new_scraper(html) do
527
508
  process "div", extractor(:headers=>headers)
528
509
  attr :headers
@@ -535,7 +516,12 @@ class ScraperTest < Test::Unit::TestCase
535
516
 
536
517
 
537
518
  def test_array_extractors
538
- html = %Q{<div><h1>first</h1><h1>second</h1></div>}
519
+ html = <<-EOF
520
+ <div>
521
+ <h1>first</h1>
522
+ <h1>second</h1>
523
+ </div>
524
+ EOF
539
525
  scraper = new_scraper(html) do
540
526
  process "h1", extractor("headers[]"=>:text)
541
527
  attr :headers
@@ -543,13 +529,17 @@ class ScraperTest < Test::Unit::TestCase
543
529
  scraper.scrape
544
530
  assert scraper.headers.is_a?(Array)
545
531
  assert_equal 2, scraper.headers.size
546
- assert_equal "first", scraper.headers[0]
547
- assert_equal "second", scraper.headers[1]
532
+ assert_equal "first", scraper.headers[0]
533
+ assert_equal "second", scraper.headers[1]
548
534
  end
549
535
 
550
536
 
551
537
  def test_hash_extractors
552
- html = %Q{<div><h1 id="1" class="header">first</h1></div>}
538
+ html = <<-EOF
539
+ <div>
540
+ <h1 id="1" class="header">first</h1>
541
+ </div>
542
+ EOF
553
543
  scraper = new_scraper(html) do
554
544
  process "h1", extractor("header"=>{:id=>"@id", :class=>"@class", :text=>:text})
555
545
  attr :header
@@ -558,21 +548,25 @@ class ScraperTest < Test::Unit::TestCase
558
548
  assert scraper.header.is_a?(Hash)
559
549
  assert_equal 3, scraper.header.size
560
550
  assert_equal "1", scraper.header[:id]
561
- assert_equal "header", scraper.header[:class]
562
- assert_equal "first", scraper.header[:text]
551
+ assert_equal "header", scraper.header[:class]
552
+ assert_equal "first", scraper.header[:text]
563
553
  end
564
554
 
565
555
 
566
556
  def test_multi_value_extractors
567
- html = %Q{<div><h1 id="1" class="header">first</h1></div>}
557
+ html = <<-EOF
558
+ <div>
559
+ <h1 id="1" class="header">first</h1>
560
+ </div>
561
+ EOF
568
562
  scraper = new_scraper(html) do
569
563
  process "h1", [:text, :kls]=>Scraper.define {
570
564
  process "*", :text=>:text, :kls=>"@class"
571
565
  }
572
566
  end
573
567
  result = scraper.scrape
574
- assert "first", result.text
575
- assert "header", result.kls
568
+ assert "first", result.text
569
+ assert "header", result.kls
576
570
  end
577
571
 
578
572
 
@@ -581,7 +575,13 @@ class ScraperTest < Test::Unit::TestCase
581
575
  # if not found look for class attribute (first
582
576
  # two headers), otherwise just get text (third
583
577
  # header).
584
- html = %Q{<div><h1 class="foo">first</h1><h1 class="foo" id="bar">second</h1><h1>third</h1></div>}
578
+ html = <<-EOF
579
+ <div>
580
+ <h1 class="foo">first</h1>
581
+ <h1 class="foo" id="bar">second</h1>
582
+ <h1>third</h1>
583
+ </div>
584
+ EOF
585
585
  scraper = new_scraper(html) do
586
586
  process "h1", extractor("headers[]"=>["@id", "@class", :text])
587
587
  attr :headers
@@ -589,37 +589,41 @@ class ScraperTest < Test::Unit::TestCase
589
589
  scraper.scrape
590
590
  assert scraper.headers.is_a?(Array)
591
591
  assert_equal 3, scraper.headers.size
592
- assert_equal "foo", scraper.headers[0]
593
- assert_equal "bar", scraper.headers[1]
592
+ assert_equal "foo", scraper.headers[0]
593
+ assert_equal "bar", scraper.headers[1]
594
594
  assert_equal "third", scraper.headers[2]
595
595
  end
596
596
 
597
597
 
598
+ DIVS_ST_ND = <<-EOF
599
+ <div id="1">first</div>
600
+ <div id="2">second</div>
601
+ EOF
602
+
598
603
  def test_accessors_from_extractor
599
- html = %Q{<div id="1">first</div><div id="2">second</div>}
600
- scraper = new_scraper(html) do
604
+ scraper = new_scraper(DIVS_ST_ND) do
601
605
  process_first "div", :div_id=>"@id", :div_text=>:text
602
606
  result :div_id
603
607
  end
604
608
  value = scraper.scrape
605
609
  assert_equal "1", value
606
610
 
607
- scraper = new_scraper(html) do
611
+ scraper = new_scraper(DIVS_ST_ND) do
608
612
  process_first "div", :div_id=>"@id", :div_text=>:text
609
613
  result :div_id, :div_text
610
614
  end
611
615
  value = scraper.scrape
612
- assert_equal "1", value.div_id
616
+ assert_equal "1", value.div_id
613
617
  assert_equal "first", value.div_text
614
618
 
615
- scraper = new_scraper(html) do
619
+ scraper = new_scraper(DIVS_ST_ND) do
616
620
  process_first "div", :div_id=>"@id", :div_text=>:text
617
621
  end
618
622
  value = scraper.scrape
619
- assert_equal "1", value.div_id
623
+ assert_equal "1", value.div_id
620
624
  assert_equal "first", value.div_text
621
625
 
622
- scraper = new_scraper(html) do
626
+ scraper = new_scraper(DIVS_ST_ND) do
623
627
  attr_accessor :div_class
624
628
  process_first "div", :div_id=>"@id", :div_text=>:text
625
629
  result :div_id, :div_class
@@ -628,7 +632,7 @@ class ScraperTest < Test::Unit::TestCase
628
632
  assert_equal "1", value.div_id
629
633
  assert_raise(NoMethodError) { value.div_text }
630
634
 
631
- scraper = new_scraper(html) do
635
+ scraper = new_scraper(DIVS_ST_ND) do
632
636
  process "div", "div_ids[]"=>"@id"
633
637
  result :div_ids
634
638
  end
@@ -639,8 +643,7 @@ class ScraperTest < Test::Unit::TestCase
639
643
 
640
644
 
641
645
  def test_array_accessors
642
- html = %Q{<div id="1">first</div><div id="2">second</div>}
643
- scraper = new_scraper(html) do
646
+ scraper = new_scraper(DIVS_ST_ND) do
644
647
  array :div_id, :div_text
645
648
  process "div", :div_id=>"@id", :div_text=>:text
646
649
  result :div_id, :div_text
@@ -650,8 +653,8 @@ class ScraperTest < Test::Unit::TestCase
650
653
  assert_equal 2, value.div_text.size
651
654
  assert_equal "1", value.div_id[0]
652
655
  assert_equal "2", value.div_id[1]
653
- assert_equal "first", value.div_text[0]
654
- assert_equal "second", value.div_text[1]
656
+ assert_equal "first", value.div_text[0]
657
+ assert_equal "second", value.div_text[1]
655
658
  end
656
659
 
657
660
 
@@ -659,9 +662,17 @@ class ScraperTest < Test::Unit::TestCase
659
662
  # Root element tests.
660
663
  #
661
664
 
665
+ HTML_EMPTY = <<-EOF
666
+ <html>
667
+ <head>
668
+ </head>
669
+ <body>
670
+ </body>
671
+ </html>
672
+ EOF
673
+
662
674
  def test_scrape_body_by_default
663
- html = %Q{<html><head></head><body></body></html>}
664
- scraper = Class.new(Scraper::Base).new(html)
675
+ scraper = Class.new(Scraper::Base).new(HTML_EMPTY)
665
676
  scraper.class.instance_eval do
666
677
  process "head" do |element| @head = element end
667
678
  attr :head
@@ -675,17 +686,16 @@ class ScraperTest < Test::Unit::TestCase
675
686
 
676
687
 
677
688
  def test_changing_root_element
678
- html = %Q{<html><head></head><body></body></html>}
679
- only_header = new_scraper(html) do
689
+ only_header = new_scraper(HTML_EMPTY) do
680
690
  root_element "head"
681
691
  process "head" do |element| @head = element end
682
692
  attr :head
683
693
  process "body" do |element| @body = element end
684
694
  attr :body
685
695
  end
686
- only_body = Class.new(only_header.class).new(html)
696
+ only_body = Class.new(only_header.class).new(HTML_EMPTY)
687
697
  only_body.class.root_element "body"
688
- both_parts = Class.new(only_body.class).new(html)
698
+ both_parts = Class.new(only_body.class).new(HTML_EMPTY)
689
699
  both_parts.class.root_element nil
690
700
  # We set this scraper to begin with the head element,
691
701
  # so we can see the head element, but not the body.
@@ -709,8 +719,7 @@ class ScraperTest < Test::Unit::TestCase
709
719
 
710
720
  def test_prepare_and_result
711
721
  # Extracting the attribute skips the second match.
712
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
713
- scraper = new_scraper(html) do
722
+ scraper = new_scraper(DIVS123) do
714
723
  process("div") { |element| @count +=1 }
715
724
  define_method(:prepare) { @count = 1 }
716
725
  define_method(:result) { @count }
@@ -722,8 +731,7 @@ class ScraperTest < Test::Unit::TestCase
722
731
 
723
732
  def test_changing_document_from_prepare
724
733
  # Extracting the attribute skips the second match.
725
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
726
- scraper = new_scraper(html) do
734
+ scraper = new_scraper(DIVS123) do
727
735
  selector :divs, "div"
728
736
  define_method :prepare do |document|
729
737
  @document = divs(document)[1]
@@ -739,13 +747,12 @@ class ScraperTest < Test::Unit::TestCase
739
747
 
740
748
 
741
749
  def test_anonymous_scrapers
742
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
743
750
  scraper = Scraper.define do
744
751
  array :ids
745
752
  process "div", :ids=>"@id"
746
753
  result :ids
747
754
  end
748
- result = scraper.scrape(html)
755
+ result = scraper.scrape(DIVS123)
749
756
  assert_equal "1", result[0]
750
757
  assert_equal "2", result[1]
751
758
  assert_equal "3", result[2]
@@ -753,14 +760,13 @@ class ScraperTest < Test::Unit::TestCase
753
760
 
754
761
 
755
762
  def test_named_rules
756
- html = %Q{<div id="1"></div><div id="2"></div><div id="3"></div>}
757
763
  scraper = Scraper.define do
758
764
  array :ids1, :ids2
759
765
  process :main, "div", :ids1=>"@id"
760
766
  process :main, "div", :ids2=>"@id"
761
767
  result :ids1, :ids2
762
768
  end
763
- result = scraper.scrape(html)
769
+ result = scraper.scrape(DIVS123)
764
770
  assert_equal nil, result.ids1
765
771
  assert_equal 3, result.ids2.size
766
772
  assert_equal "1", result.ids2[0]
@@ -775,7 +781,7 @@ protected
775
781
  cls = Class.new(Scraper::Base)
776
782
  cls.root_element nil
777
783
  cls.parser :html_parser
778
- cls.instance_eval &block if block
784
+ cls.class_eval &block if block
779
785
  cls.new(what)
780
786
  end
781
787
 
@@ -791,7 +797,7 @@ protected
791
797
  cls = Class.new(Scraper::Base)
792
798
  cls.root_element nil
793
799
  cls.parser :tidy
794
- cls.instance_eval &block if block
800
+ cls.class_eval &block if block
795
801
  cls.new(what)
796
802
  end
797
803
 
metadata CHANGED
@@ -3,15 +3,15 @@ rubygems_version: 0.9.0
3
3
  specification_version: 1
4
4
  name: scrapi
5
5
  version: !ruby/object:Gem::Version
6
- version: 1.1.2
7
- date: 2006-08-15 00:00:00 -07:00
8
- summary: scrAPI toolkit for Ruby
6
+ version: 1.2.0
7
+ date: 2006-08-27 00:00:00 -07:00
8
+ summary: scrAPI toolkit for Ruby. Uses CSS selectors to write easy, maintainable HTML scraping rules.
9
9
  require_paths:
10
10
  - lib
11
11
  email: assaf.arkin@gmail.com
12
- homepage: http://labnotes.org/
12
+ homepage: http://blog.labnotes.org/category/scrapi/
13
13
  rubyforge_project: scrapi
14
- description: A framework for writing scrapers using CSS selectors and simple select => extract => store processing rules.
14
+ description: scrAPI is an HTML scraping toolkit for Ruby. It uses CSS selectors to write easy, maintainable scraping rules to select, extract and store data from HTML content.
15
15
  autorequire: scrapi.rb
16
16
  default_executable:
17
17
  bindir: bin
@@ -40,6 +40,7 @@ files:
40
40
  - lib/html
41
41
  - lib/scraper/reader.rb
42
42
  - lib/scraper/base.rb
43
+ - lib/scraper/microformats.rb
43
44
  - lib/tidy/libtidy.so
44
45
  - lib/tidy/libtidy.dll
45
46
  - lib/html/node_ext.rb