spidey-mongo 0.0.3 → 0.0.4

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -46,12 +46,15 @@ With persistent storage of the URL-crawling queue, it's now possible to stop cra
46
46
 
47
47
  ### Recording Results
48
48
 
49
- By default, invocations of `record(data)` by the spider simply insert new documents into the result collection. If corresponding results may already exist in the collection and should instead be updated, use the `set_result_key` helper (with a proc or method symbol) to specify how to find the document to update:
49
+ By default, invocations of `record(data)` by the spider simply insert new documents into the result collection. If corresponding results may already exist in the collection and should instead be updated, define a `result_key` method that returns a key by which to find the corresponding document. The method is called with a hash of the data being recorded:
50
50
 
51
51
  class EbaySpider < Spidey::AbstractSpider
52
52
  include Spidey::Strategies::Mongo
53
- set_result_key ->(data) { data[:auction_id] }
54
53
 
54
+ def result_key(data)
55
+ data[:detail_url]
56
+ end
57
+
55
58
  # ...
56
59
  end
57
60
 
@@ -1,5 +1,5 @@
1
1
  module Spidey
2
2
  module Mongo
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.4"
4
4
  end
5
5
  end
@@ -2,24 +2,6 @@ module Spidey::Strategies
2
2
  module Mongo
3
3
  attr_accessor :url_collection, :result_collection, :error_collection
4
4
 
5
- module ClassMethods
6
- def set_result_key(callback)
7
- @result_key = callback
8
- end
9
-
10
- def result_key(spider, data)
11
- case @result_key
12
- when Symbol then spider.send(@result_key, data)
13
- when Proc then @result_key.call(data)
14
- else nil
15
- end
16
- end
17
- end
18
-
19
- def self.included(base)
20
- base.extend ClassMethods
21
- end
22
-
23
5
  def initialize(attrs = {})
24
6
  self.url_collection = attrs.delete(:url_collection)
25
7
  self.result_collection = attrs.delete(:result_collection)
@@ -44,7 +26,7 @@ module Spidey::Strategies
44
26
 
45
27
  def record(data)
46
28
  $stderr.puts "Recording #{data.inspect.truncate(500)}" if verbose
47
- if key = self.class.result_key(self, data)
29
+ if respond_to?(:result_key) && key = result_key(data)
48
30
  result_collection.update({'key' => key}, {'$set' => data}, upsert: true)
49
31
  else
50
32
  result_collection.insert data
@@ -3,8 +3,11 @@ require 'spec_helper'
3
3
  describe Spidey::Strategies::Mongo do
4
4
  class TestSpider < Spidey::AbstractSpider
5
5
  include Spidey::Strategies::Mongo
6
- set_result_key ->(data) { data[:detail_url] }
7
6
  handle "http://www.cnn.com", :process_home
7
+
8
+ def result_key(data)
9
+ data[:detail_url]
10
+ end
8
11
  end
9
12
 
10
13
  before(:each) do
@@ -15,6 +18,10 @@ describe Spidey::Strategies::Mongo do
15
18
  error_collection: @db['errors'])
16
19
  end
17
20
 
21
+ after(:each) do
22
+ %w{ urls results errors }.each{ |col| @db[col].drop }
23
+ end
24
+
18
25
  it "should add initial URLs to collection" do
19
26
  doc = @db['urls'].find_one(url: "http://www.cnn.com")
20
27
  doc['handler'].should == :process_home
@@ -33,6 +40,12 @@ describe Spidey::Strategies::Mongo do
33
40
  doc['foo'].should == 'bar'
34
41
  end
35
42
 
43
+ it "should update existing result" do
44
+ @db['results'].insert key: 'http://foo.bar', detail_url: 'http://foo.bar'
45
+ @spider.record detail_url: 'http://foo.bar', foo: 'bar'
46
+ @db['results'].count.should == 1
47
+ end
48
+
36
49
  it "should add error" do
37
50
  @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
38
51
  doc = @db['errors'].find_one
data/spidey-mongo.gemspec CHANGED
@@ -21,6 +21,7 @@ Gem::Specification.new do |s|
21
21
 
22
22
  s.add_development_dependency "rake"
23
23
  s.add_development_dependency "rspec"
24
+ s.add_development_dependency "ruby-debug19"
24
25
 
25
26
  s.add_runtime_dependency "spidey"
26
27
  s.add_runtime_dependency "mongo"
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: spidey-mongo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.3
4
+ version: 0.0.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-06-27 00:00:00.000000000Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake
16
- requirement: &70227312727160 !ruby/object:Gem::Requirement
16
+ requirement: &70312341584000 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :development
23
23
  prerelease: false
24
- version_requirements: *70227312727160
24
+ version_requirements: *70312341584000
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rspec
27
- requirement: &70227312726140 !ruby/object:Gem::Requirement
27
+ requirement: &70312341583440 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,21 @@ dependencies:
32
32
  version: '0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *70227312726140
35
+ version_requirements: *70312341583440
36
+ - !ruby/object:Gem::Dependency
37
+ name: ruby-debug19
38
+ requirement: &70312341582900 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :development
45
+ prerelease: false
46
+ version_requirements: *70312341582900
36
47
  - !ruby/object:Gem::Dependency
37
48
  name: spidey
38
- requirement: &70227312725220 !ruby/object:Gem::Requirement
49
+ requirement: &70312341582320 !ruby/object:Gem::Requirement
39
50
  none: false
40
51
  requirements:
41
52
  - - ! '>='
@@ -43,10 +54,10 @@ dependencies:
43
54
  version: '0'
44
55
  type: :runtime
45
56
  prerelease: false
46
- version_requirements: *70227312725220
57
+ version_requirements: *70312341582320
47
58
  - !ruby/object:Gem::Dependency
48
59
  name: mongo
49
- requirement: &70227312724420 !ruby/object:Gem::Requirement
60
+ requirement: &70312341581640 !ruby/object:Gem::Requirement
50
61
  none: false
51
62
  requirements:
52
63
  - - ! '>='
@@ -54,10 +65,10 @@ dependencies:
54
65
  version: '0'
55
66
  type: :runtime
56
67
  prerelease: false
57
- version_requirements: *70227312724420
68
+ version_requirements: *70312341581640
58
69
  - !ruby/object:Gem::Dependency
59
70
  name: bson_ext
60
- requirement: &70227312717760 !ruby/object:Gem::Requirement
71
+ requirement: &70312341580620 !ruby/object:Gem::Requirement
61
72
  none: false
62
73
  requirements:
63
74
  - - ! '>='
@@ -65,7 +76,7 @@ dependencies:
65
76
  version: '0'
66
77
  type: :runtime
67
78
  prerelease: false
68
- version_requirements: *70227312717760
79
+ version_requirements: *70312341580620
69
80
  description: Implements a MongoDB back-end for Spidey, a framework for crawling and
70
81
  scraping web sites.
71
82
  email:
@@ -100,7 +111,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
100
111
  version: '0'
101
112
  segments:
102
113
  - 0
103
- hash: 128629480123059091
114
+ hash: -1659482461320296287
104
115
  required_rubygems_version: !ruby/object:Gem::Requirement
105
116
  none: false
106
117
  requirements:
@@ -109,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
120
  version: '0'
110
121
  segments:
111
122
  - 0
112
- hash: 128629480123059091
123
+ hash: -1659482461320296287
113
124
  requirements: []
114
125
  rubyforge_project: spidey-mongo
115
126
  rubygems_version: 1.8.10