spidey-mongo 0.0.3 → 0.0.4

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -46,12 +46,15 @@ With persistent storage of the URL-crawling queue, it's now possible to stop cra
46
46
 
47
47
  ### Recording Results
48
48
 
49
- By default, invocations of `record(data)` by the spider simply insert new documents into the result collection. If corresponding results may already exist in the collection and should instead be updated, use the `set_result_key` helper (with a proc or method symbol) to specify how to find the document to update:
49
+ By default, each call to `record(data)` by the spider simply inserts a new document into the result collection. If a corresponding result may already exist in the collection and should be updated instead, define a `result_key` method. It is called with the hash of data being recorded and returns the value used to look up the existing document by its `key` field; if it returns `nil`, the data is inserted as a new document:
50
50
 
51
51
  class EbaySpider < Spidey::AbstractSpider
52
52
  include Spidey::Strategies::Mongo
53
- set_result_key ->(data) { data[:auction_id] }
54
53
 
54
+ def result_key(data)
55
+ data[:detail_url]
56
+ end
57
+
55
58
  # ...
56
59
  end
57
60
 
@@ -1,5 +1,5 @@
1
1
  module Spidey
2
2
  module Mongo
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
@@ -2,24 +2,6 @@ module Spidey::Strategies
2
2
  module Mongo
3
3
  attr_accessor :url_collection, :result_collection, :error_collection
4
4
 
5
- module ClassMethods
6
- def set_result_key(callback)
7
- @result_key = callback
8
- end
9
-
10
- def result_key(spider, data)
11
- case @result_key
12
- when Symbol then spider.send(@result_key, data)
13
- when Proc then @result_key.call(data)
14
- else nil
15
- end
16
- end
17
- end
18
-
19
- def self.included(base)
20
- base.extend ClassMethods
21
- end
22
-
23
5
  def initialize(attrs = {})
24
6
  self.url_collection = attrs.delete(:url_collection)
25
7
  self.result_collection = attrs.delete(:result_collection)
@@ -44,7 +26,7 @@ module Spidey::Strategies
44
26
 
45
27
  def record(data)
46
28
  $stderr.puts "Recording #{data.inspect.truncate(500)}" if verbose
47
- if key = self.class.result_key(self, data)
29
+ if respond_to?(:result_key) && key = result_key(data)
48
30
  result_collection.update({'key' => key}, {'$set' => data}, upsert: true)
49
31
  else
50
32
  result_collection.insert data
@@ -3,8 +3,11 @@ require 'spec_helper'
3
3
  describe Spidey::Strategies::Mongo do
4
4
  class TestSpider < Spidey::AbstractSpider
5
5
  include Spidey::Strategies::Mongo
6
- set_result_key ->(data) { data[:detail_url] }
7
6
  handle "http://www.cnn.com", :process_home
7
+
8
+ def result_key(data)
9
+ data[:detail_url]
10
+ end
8
11
  end
9
12
 
10
13
  before(:each) do
@@ -15,6 +18,10 @@ describe Spidey::Strategies::Mongo do
15
18
  error_collection: @db['errors'])
16
19
  end
17
20
 
21
+ after(:each) do
22
+ %w{ urls results errors }.each{ |col| @db[col].drop }
23
+ end
24
+
18
25
  it "should add initial URLs to collection" do
19
26
  doc = @db['urls'].find_one(url: "http://www.cnn.com")
20
27
  doc['handler'].should == :process_home
@@ -33,6 +40,12 @@ describe Spidey::Strategies::Mongo do
33
40
  doc['foo'].should == 'bar'
34
41
  end
35
42
 
43
+ it "should update existing result" do
44
+ @db['results'].insert key: 'http://foo.bar', detail_url: 'http://foo.bar'
45
+ @spider.record detail_url: 'http://foo.bar', foo: 'bar'
46
+ @db['results'].count.should == 1
47
+ end
48
+
36
49
  it "should add error" do
37
50
  @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
38
51
  doc = @db['errors'].find_one
data/spidey-mongo.gemspec CHANGED
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
21
21
 
22
22
  s.add_development_dependency "rake"
23
23
  s.add_development_dependency "rspec"
24
+ s.add_development_dependency "ruby-debug19"
24
25
 
25
26
  s.add_runtime_dependency "spidey"
26
27
  s.add_runtime_dependency "mongo"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidey-mongo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-06-27 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &70227312727160 !ruby/object:Gem::Requirement
16
+ requirement: &70312341584000 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70227312727160
24
+ version_requirements: *70312341584000
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rspec
27
- requirement: &70227312726140 !ruby/object:Gem::Requirement
27
+ requirement: &70312341583440 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,21 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70227312726140
35
+ version_requirements: *70312341583440
36
+ - !ruby/object:Gem::Dependency
37
+ name: ruby-debug19
38
+ requirement: &70312341582900 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70312341582900
36
47
  - !ruby/object:Gem::Dependency
37
48
  name: spidey
38
- requirement: &70227312725220 !ruby/object:Gem::Requirement
49
+ requirement: &70312341582320 !ruby/object:Gem::Requirement
39
50
  none: false
40
51
  requirements:
41
52
  - - ! '>='
@@ -43,10 +54,10 @@ dependencies:
43
54
  version: '0'
44
55
  type: :runtime
45
56
  prerelease: false
46
- version_requirements: *70227312725220
57
+ version_requirements: *70312341582320
47
58
  - !ruby/object:Gem::Dependency
48
59
  name: mongo
49
- requirement: &70227312724420 !ruby/object:Gem::Requirement
60
+ requirement: &70312341581640 !ruby/object:Gem::Requirement
50
61
  none: false
51
62
  requirements:
52
63
  - - ! '>='
@@ -54,10 +65,10 @@ dependencies:
54
65
  version: '0'
55
66
  type: :runtime
56
67
  prerelease: false
57
- version_requirements: *70227312724420
68
+ version_requirements: *70312341581640
58
69
  - !ruby/object:Gem::Dependency
59
70
  name: bson_ext
60
- requirement: &70227312717760 !ruby/object:Gem::Requirement
71
+ requirement: &70312341580620 !ruby/object:Gem::Requirement
61
72
  none: false
62
73
  requirements:
63
74
  - - ! '>='
@@ -65,7 +76,7 @@ dependencies:
65
76
  version: '0'
66
77
  type: :runtime
67
78
  prerelease: false
68
- version_requirements: *70227312717760
79
+ version_requirements: *70312341580620
69
80
  description: Implements a MongoDB back-end for Spidey, a framework for crawling and
70
81
  scraping web sites.
71
82
  email:
@@ -100,7 +111,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
100
111
  version: '0'
101
112
  segments:
102
113
  - 0
103
- hash: 128629480123059091
114
+ hash: -1659482461320296287
104
115
  required_rubygems_version: !ruby/object:Gem::Requirement
105
116
  none: false
106
117
  requirements:
@@ -109,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
120
  version: '0'
110
121
  segments:
111
122
  - 0
112
- hash: 128629480123059091
123
+ hash: -1659482461320296287
113
124
  requirements: []
114
125
  rubyforge_project: spidey-mongo
115
126
  rubygems_version: 1.8.10