spidey-mongo 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +5 -2
- data/lib/spidey-mongo/version.rb +1 -1
- data/lib/spidey/strategies/mongo.rb +1 -19
- data/spec/spidey/strategies/mongo_spec.rb +14 -1
- data/spidey-mongo.gemspec +1 -0
- metadata +24 -13
data/README.md
CHANGED
@@ -46,12 +46,15 @@ With persistent storage of the URL-crawling queue, it's now possible to stop cra
|
|
46
46
|
|
47
47
|
### Recording Results
|
48
48
|
|
49
|
-
By default, invocations of `record(data)` by the spider simply insert new documents into the result collection. If corresponding results may already exist in the collection and should instead be updated,
|
49
|
+
By default, invocations of `record(data)` by the spider simply insert new documents into the result collection. If corresponding results may already exist in the collection and should instead be updated, define a `result_key` method that returns a key by which to find the corresponding document. The method is called with a hash of the data being recorded:
|
50
50
|
|
51
51
|
class EbaySpider < Spidey::AbstractSpider
|
52
52
|
include Spidey::Strategies::Mongo
|
53
|
-
set_result_key ->(data) { data[:auction_id] }
|
54
53
|
|
54
|
+
def result_key(data)
|
55
|
+
data[:detail_url]
|
56
|
+
end
|
57
|
+
|
55
58
|
# ...
|
56
59
|
end
|
57
60
|
|
data/lib/spidey-mongo/version.rb
CHANGED
data/lib/spidey/strategies/mongo.rb
CHANGED
@@ -2,24 +2,6 @@ module Spidey::Strategies
|
|
2
2
|
module Mongo
|
3
3
|
attr_accessor :url_collection, :result_collection, :error_collection
|
4
4
|
|
5
|
-
module ClassMethods
|
6
|
-
def set_result_key(callback)
|
7
|
-
@result_key = callback
|
8
|
-
end
|
9
|
-
|
10
|
-
def result_key(spider, data)
|
11
|
-
case @result_key
|
12
|
-
when Symbol then spider.send(@result_key, data)
|
13
|
-
when Proc then @result_key.call(data)
|
14
|
-
else nil
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def self.included(base)
|
20
|
-
base.extend ClassMethods
|
21
|
-
end
|
22
|
-
|
23
5
|
def initialize(attrs = {})
|
24
6
|
self.url_collection = attrs.delete(:url_collection)
|
25
7
|
self.result_collection = attrs.delete(:result_collection)
|
@@ -44,7 +26,7 @@ module Spidey::Strategies
|
|
44
26
|
|
45
27
|
def record(data)
|
46
28
|
$stderr.puts "Recording #{data.inspect.truncate(500)}" if verbose
|
47
|
-
if key = self.class.result_key(self, data)
|
29
|
+
if respond_to?(:result_key) && key = result_key(data)
|
48
30
|
result_collection.update({'key' => key}, {'$set' => data}, upsert: true)
|
49
31
|
else
|
50
32
|
result_collection.insert data
|
data/spec/spidey/strategies/mongo_spec.rb
CHANGED
@@ -3,8 +3,11 @@ require 'spec_helper'
|
|
3
3
|
describe Spidey::Strategies::Mongo do
|
4
4
|
class TestSpider < Spidey::AbstractSpider
|
5
5
|
include Spidey::Strategies::Mongo
|
6
|
-
set_result_key ->(data) { data[:detail_url] }
|
7
6
|
handle "http://www.cnn.com", :process_home
|
7
|
+
|
8
|
+
def result_key(data)
|
9
|
+
data[:detail_url]
|
10
|
+
end
|
8
11
|
end
|
9
12
|
|
10
13
|
before(:each) do
|
@@ -15,6 +18,10 @@ describe Spidey::Strategies::Mongo do
|
|
15
18
|
error_collection: @db['errors'])
|
16
19
|
end
|
17
20
|
|
21
|
+
after(:each) do
|
22
|
+
%w{ urls results errors }.each{ |col| @db[col].drop }
|
23
|
+
end
|
24
|
+
|
18
25
|
it "should add initial URLs to collection" do
|
19
26
|
doc = @db['urls'].find_one(url: "http://www.cnn.com")
|
20
27
|
doc['handler'].should == :process_home
|
@@ -33,6 +40,12 @@ describe Spidey::Strategies::Mongo do
|
|
33
40
|
doc['foo'].should == 'bar'
|
34
41
|
end
|
35
42
|
|
43
|
+
it "should update existing result" do
|
44
|
+
@db['results'].insert key: 'http://foo.bar', detail_url: 'http://foo.bar'
|
45
|
+
@spider.record detail_url: 'http://foo.bar', foo: 'bar'
|
46
|
+
@db['results'].count.should == 1
|
47
|
+
end
|
48
|
+
|
36
49
|
it "should add error" do
|
37
50
|
@spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
|
38
51
|
doc = @db['errors'].find_one
|
data/spidey-mongo.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidey-mongo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.0.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-06-27 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &70312341584000 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70312341584000
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &70312341583440 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,21 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70312341583440
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: ruby-debug19
|
38
|
+
requirement: &70312341582900 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70312341582900
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: spidey
|
38
|
-
requirement: &
|
49
|
+
requirement: &70312341582320 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ! '>='
|
@@ -43,10 +54,10 @@ dependencies:
|
|
43
54
|
version: '0'
|
44
55
|
type: :runtime
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *70312341582320
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: mongo
|
49
|
-
requirement: &
|
60
|
+
requirement: &70312341581640 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - ! '>='
|
@@ -54,10 +65,10 @@ dependencies:
|
|
54
65
|
version: '0'
|
55
66
|
type: :runtime
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *70312341581640
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: bson_ext
|
60
|
-
requirement: &
|
71
|
+
requirement: &70312341580620 !ruby/object:Gem::Requirement
|
61
72
|
none: false
|
62
73
|
requirements:
|
63
74
|
- - ! '>='
|
@@ -65,7 +76,7 @@ dependencies:
|
|
65
76
|
version: '0'
|
66
77
|
type: :runtime
|
67
78
|
prerelease: false
|
68
|
-
version_requirements: *
|
79
|
+
version_requirements: *70312341580620
|
69
80
|
description: Implements a MongoDB back-end for Spidey, a framework for crawling and
|
70
81
|
scraping web sites.
|
71
82
|
email:
|
@@ -100,7 +111,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
100
111
|
version: '0'
|
101
112
|
segments:
|
102
113
|
- 0
|
103
|
-
hash:
|
114
|
+
hash: -1659482461320296287
|
104
115
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
@@ -109,7 +120,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
120
|
version: '0'
|
110
121
|
segments:
|
111
122
|
- 0
|
112
|
-
hash:
|
123
|
+
hash: -1659482461320296287
|
113
124
|
requirements: []
|
114
125
|
rubyforge_project: spidey-mongo
|
115
126
|
rubygems_version: 1.8.10
|