spidey-mongo 0.1.0 → 0.2.0
- data/README.md +21 -2
- data/lib/spidey/strategies/mongo.rb +8 -8
- data/lib/spidey/strategies/moped.rb +62 -0
- data/lib/spidey-mongo/version.rb +1 -1
- data/lib/spidey-mongo.rb +2 -2
- data/spec/spidey/strategies/mongo_spec.rb +15 -14
- data/spec/spidey/strategies/moped_spec.rb +63 -0
- data/spidey-mongo.gemspec +4 -3
- metadata +31 -12
data/README.md CHANGED

@@ -5,7 +5,7 @@ This gem implements a [MongoDB](http://www.mongodb.org/) back-end for [Spidey](https://github.com/joeyAghion/spidey)…
 
 See [Spidey](https://github.com/joeyAghion/spidey)'s documentation for a basic example spider class.
 
-The default implementation stores the queue of URLs being crawled, any generated results, and errors as attributes on the spider instance (i.e., in memory). By including this gem's…
+The default implementation stores the queue of URLs being crawled, any generated results, and errors as attributes on the spider instance (i.e., in memory). By including this gem's module, spider implementations can store them in a MongoDB database instead.
 
 Usage
 -----
@@ -15,6 +15,16 @@ Usage
     gem install spidey-mongo
 
 
+### `mongo` versus `moped`
+
+Spidey-Mongo provides two strategies:
+
+* `Spidey::Strategies::Mongo`: Compatible with 10gen's [`mongo`](https://github.com/mongodb/mongo-ruby-driver) gem
+* `Spidey::Strategies::Moped`: Compatible with the [`moped`](https://github.com/mongoid/moped) gem, e.g., for use with Mongoid 3.x
+
+You can include either strategy in your classes, as appropriate. All the examples in this README assume `Spidey::Strategies::Mongo`.
+
+
 ### Example spider class
 
     class EbaySpider < Spidey::AbstractSpider
@@ -65,6 +75,15 @@ Testing
 
     bundle exec rspec
 
+Contributors
+------------
+
+[Joey Aghion](https://github.com/joeyAghion), [Frank Macreery](https://github.com/fancyremarker)
+
+To Do
+-----
+* Extract behaviors shared by `Mongo` and `Moped` strategies.
+
 Copyright
 ---------
-Copyright (c) 2012 Joey Aghion,…
+Copyright (c) 2012, 2013 Joey Aghion, Artsy Inc. See [LICENSE.txt](LICENSE.txt) for further details.
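For reference, here is how the new `Moped` strategy is wired up end to end — a minimal sketch modeled on the new moped_spec.rb below, assuming a local mongod on the default port. The class, database, and handler names are illustrative, not part of the gem:

    require 'spidey'
    require 'spidey-mongo'
    require 'moped'

    class ExampleSpider < Spidey::AbstractSpider  # hypothetical class
      include Spidey::Strategies::Moped
      handle "http://www.cnn.com", :process_home

      def process_home(page, default_data = {})
        # call record(...) and handle(...) here as in any Spidey spider
      end
    end

    session = Moped::Session.new(['127.0.0.1:27017'])
    session.use 'example-crawl'
    spider = ExampleSpider.new(
      url_collection: session['urls'],
      result_collection: session['results'],
      error_collection: session['errors'])
    spider.crawl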
data/lib/spidey/strategies/mongo.rb CHANGED

@@ -1,20 +1,20 @@
 module Spidey::Strategies
   module Mongo
     attr_accessor :url_collection, :result_collection, :error_collection
-
+
     def initialize(attrs = {})
       self.url_collection = attrs.delete(:url_collection)
       self.result_collection = attrs.delete(:result_collection)
       self.error_collection = attrs.delete(:error_collection)
       super attrs
     end
-
+
     def crawl(options = {})
       @crawl_started_at = Time.now
       @until = Time.now + options[:crawl_for] if options[:crawl_for]
       super options
     end
-
+
     def handle(url, handler, default_data = {})
       Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
       url_collection.update(
@@ -23,7 +23,7 @@ module Spidey::Strategies
         upsert: true
       )
     end
-
+
     def record(data)
       doc = data.merge('spider' => self.class.name)
       Spidey.logger.info "Recording #{doc.inspect[0..500]}..."
@@ -33,7 +33,7 @@ module Spidey::Strategies
         result_collection.insert doc
       end
     end
-
+
     def each_url(&block)
       while url = get_next_url
         break if url['last_crawled_at'] && url['last_crawled_at'] >= @crawl_started_at # crawled already in this batch
@@ -41,14 +41,14 @@ module Spidey::Strategies
         yield url['url'], url['handler'], url['default_data'].symbolize_keys
       end
     end
-
+
     def add_error(attrs)
       error = attrs.delete(:error)
       doc = attrs.merge(created_at: Time.now, error: error.class.name, message: error.message, spider: self.class.name)
       error_collection.insert doc
       Spidey.logger.error "Error on #{attrs[:url]}. #{error.class}: #{error.message}"
     end
-
+
     private
 
     def get_next_url
@@ -57,6 +57,6 @@ module Spidey::Strategies
         sort: [[:last_crawled_at, ::Mongo::ASCENDING], [:_id, ::Mongo::ASCENDING]]
       })
     end
-
+
   end
 end
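The substantive behavior here is the time bound: `crawl` records `@crawl_started_at` and, given a `:crawl_for` duration in seconds, sets a deadline in `@until`; `get_next_url` stops returning URLs once the deadline passes, and `each_url` also stops when it reaches a URL already visited in the current batch. A sketch of a bounded crawl, assuming a spider wired up with collections as elsewhere in this diff (the duration is illustrative):

    spider.crawl crawl_for: 600  # stop dequeueing URLs after 10 minutes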
data/lib/spidey/strategies/moped.rb ADDED

@@ -0,0 +1,62 @@
+module Spidey::Strategies
+  module Moped
+    attr_accessor :url_collection, :result_collection, :error_collection
+
+    def initialize(attrs = {})
+      self.url_collection = attrs.delete(:url_collection)
+      self.result_collection = attrs.delete(:result_collection)
+      self.error_collection = attrs.delete(:error_collection)
+      super attrs
+    end
+
+    def crawl(options = {})
+      @crawl_started_at = Time.now
+      @until = Time.now + options[:crawl_for] if options[:crawl_for]
+      super options
+    end
+
+    def handle(url, handler, default_data = {})
+      Spidey.logger.info "Queueing #{url.inspect[0..200]}..."
+      url_collection.find(
+        {'spider' => self.class.name, 'url' => url}
+      ).upsert(
+        {'$set' => {'handler' => handler, 'default_data' => default_data}}
+      )
+    end
+
+    def record(data)
+      doc = data.merge('spider' => self.class.name)
+      Spidey.logger.info "Recording #{doc.inspect[0..500]}..."
+      if respond_to?(:result_key) && key = result_key(doc)
+        result_collection.find({'key' => key}).upsert({'$set' => doc})
+      else
+        result_collection.insert doc
+      end
+    end
+
+    def each_url(&block)
+      while url = get_next_url
+        break if url['last_crawled_at'] && url['last_crawled_at'] >= @crawl_started_at # crawled already in this batch
+        url_collection.find({'_id' => url['_id']}).update('$set' => {last_crawled_at: Time.now})
+        yield url['url'], url['handler'], url['default_data'].symbolize_keys
+      end
+    end
+
+    def add_error(attrs)
+      error = attrs.delete(:error)
+      doc = attrs.merge(created_at: Time.now, error: error.class.name, message: error.message, spider: self.class.name)
+      error_collection.insert doc
+      Spidey.logger.error "Error on #{attrs[:url]}. #{error.class}: #{error.message}"
+    end
+
+    private
+
+    def get_next_url
+      return nil if (@until && Time.now >= @until) # exceeded time bound
+      url_collection.find({spider: self.class.name}).sort({
+        'last_crawled_at' => 1, '_id' => 1
+      }).first
+    end
+
+  end
+end
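As in the Mongo strategy, `record` upserts when the spider defines a `result_key`, so re-crawls update an existing document (matched on its `key` field) instead of inserting a duplicate. A sketch with illustrative names:

    class ExampleSpider < Spidey::AbstractSpider  # hypothetical class
      include Spidey::Strategies::Moped

      # One result document per detail URL; omit result_key to insert every result.
      def result_key(data)
        data[:detail_url]
      end
    end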
data/lib/spidey-mongo/version.rb CHANGED
data/lib/spidey-mongo.rb CHANGED
data/spec/spidey/strategies/mongo_spec.rb CHANGED

@@ -1,53 +1,54 @@
 require 'spec_helper'
+require 'mongo'
 
 describe Spidey::Strategies::Mongo do
-  class …
+  class TestMongoSpider < Spidey::AbstractSpider
     include Spidey::Strategies::Mongo
     handle "http://www.cnn.com", :process_home
-
+
     def result_key(data)
       data[:detail_url]
     end
   end
-
+
   before(:each) do
     @db = Mongo::Connection.new['spidey-mongo-test']
-    @spider = …
+    @spider = TestMongoSpider.new(
       url_collection: @db['urls'],
       result_collection: @db['results'],
       error_collection: @db['errors'])
   end
-
+
   after(:each) do
     %w{ urls results errors }.each{ |col| @db[col].drop }
   end
-
+
   it "should add initial URLs to collection" do
     doc = @db['urls'].find_one(url: "http://www.cnn.com")
     doc['handler'].should == :process_home
-    doc['spider'].should == '…
+    doc['spider'].should == 'TestMongoSpider'
   end
-
+
   it "should not add duplicate URLs" do
     @spider.send :handle, "http://www.cnn.com", :process_home
     @db['urls'].find(url: "http://www.cnn.com").count.should == 1
   end
-
+
   it "should add results" do
     @spider.record detail_url: 'http://www.cnn.com', foo: 'bar'
     @db['results'].count.should == 1
     doc = @db['results'].find_one
     doc['detail_url'].should == 'http://www.cnn.com'
     doc['foo'].should == 'bar'
-    doc['spider'].should == '…
+    doc['spider'].should == 'TestMongoSpider'
   end
-
+
   it "should update existing result" do
     @db['results'].insert key: 'http://foo.bar', detail_url: 'http://foo.bar'
     @spider.record detail_url: 'http://foo.bar', foo: 'bar'
     @db['results'].count.should == 1
   end
-
+
   it "should add error" do
     @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
     doc = @db['errors'].find_one
@@ -55,7 +56,7 @@ describe Spidey::Strategies::Mongo do
     doc['url'].should == 'http://www.cnn.com'
     doc['handler'].should == :blah
     doc['message'].should == 'WTF'
-    doc['spider'].should == '…
+    doc['spider'].should == 'TestMongoSpider'
   end
-
+
 end
data/spec/spidey/strategies/moped_spec.rb ADDED

@@ -0,0 +1,63 @@
+require 'spec_helper'
+require 'moped'
+
+describe Spidey::Strategies::Moped do
+  class TestMopedSpider < Spidey::AbstractSpider
+    include Spidey::Strategies::Moped
+    handle "http://www.cnn.com", :process_home
+
+    def result_key(data)
+      data[:detail_url]
+    end
+  end
+
+  before(:each) do
+    @db = Moped::Session.new(['127.0.0.1:27017'])
+    @db.use 'spidey-mongo-test'
+    @spider = TestMopedSpider.new(
+      url_collection: @db['urls'],
+      result_collection: @db['results'],
+      error_collection: @db['errors'])
+  end
+
+  after(:each) do
+    %w{ urls results errors }.each{ |col| @db[col].drop }
+  end
+
+  it "should add initial URLs to collection" do
+    doc = @db['urls'].find(url: "http://www.cnn.com").first
+    doc['handler'].should == :process_home
+    doc['spider'].should == 'TestMopedSpider'
+  end
+
+  it "should not add duplicate URLs" do
+    @spider.send :handle, "http://www.cnn.com", :process_home
+    @db['urls'].find(url: "http://www.cnn.com").count.should == 1
+  end
+
+  it "should add results" do
+    @spider.record detail_url: 'http://www.cnn.com', foo: 'bar'
+    @db['results'].find.count.should == 1
+    doc = @db['results'].find.first
+    doc['detail_url'].should == 'http://www.cnn.com'
+    doc['foo'].should == 'bar'
+    doc['spider'].should == 'TestMopedSpider'
+  end
+
+  it "should update existing result" do
+    @db['results'].insert key: 'http://foo.bar', detail_url: 'http://foo.bar'
+    @spider.record detail_url: 'http://foo.bar', foo: 'bar'
+    @db['results'].find.count.should == 1
+  end
+
+  it "should add error" do
+    @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
+    doc = @db['errors'].find.first
+    doc['error'].should == 'Exception'
+    doc['url'].should == 'http://www.cnn.com'
+    doc['handler'].should == :blah
+    doc['message'].should == 'WTF'
+    doc['spider'].should == 'TestMopedSpider'
+  end
+
+end
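The two specs also make the drivers' query-API difference explicit: the `mongo` driver returns a document directly from `find_one`, while `moped`'s `find` returns a query object whose first document you take. The same lookup in each, as the specs do it:

    doc = @db['urls'].find_one(url: "http://www.cnn.com")     # mongo driver (Mongo::DB)
    doc = @db['urls'].find(url: "http://www.cnn.com").first   # moped (Moped::Session)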
data/spidey-mongo.gemspec CHANGED

@@ -21,8 +21,9 @@ Gem::Specification.new do |s|
 
   s.add_development_dependency "rake"
   s.add_development_dependency "rspec"
-
+  s.add_development_dependency "mongo"
+  s.add_development_dependency "bson_ext"
+  s.add_development_dependency "moped"
+
   s.add_runtime_dependency "spidey", ">= 0.1.0"
-  s.add_runtime_dependency "mongo"
-  s.add_runtime_dependency "bson_ext"
 end
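Because `mongo` and `bson_ext` moved from runtime to development dependencies (with `moped` added alongside them), installing the gem no longer pulls in a driver; an application now declares whichever one it uses. A hypothetical Gemfile for such an app:

    gem 'spidey-mongo'

    # pick one driver:
    gem 'mongo'      # for Spidey::Strategies::Mongo
    gem 'bson_ext'   # optional C extension speedups for the mongo driver
    # gem 'moped'    # for Spidey::Strategies::Moped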
metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: spidey-mongo
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.2.0
 prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2013-…
+date: 2013-08-02 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -44,30 +44,30 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: …
+  name: mongo
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
-        version: 0…
-  type: :…
+        version: '0'
+  type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
-        version: 0…
+        version: '0'
 - !ruby/object:Gem::Dependency
-  name: …
+  name: bson_ext
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-  type: :…
+  type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     none: false
@@ -76,14 +76,14 @@ dependencies:
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name: …
+  name: moped
   requirement: !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
        version: '0'
-  type: :…
+  type: :development
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     none: false
@@ -91,6 +91,22 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: spidey
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: 0.1.0
 description: Implements a MongoDB back-end for Spidey, a framework for crawling and
   scraping web sites.
 email:
@@ -107,8 +123,10 @@ files:
 - lib/spidey-mongo.rb
 - lib/spidey-mongo/version.rb
 - lib/spidey/strategies/mongo.rb
+- lib/spidey/strategies/moped.rb
 - spec/spec_helper.rb
 - spec/spidey/strategies/mongo_spec.rb
+- spec/spidey/strategies/moped_spec.rb
 - spidey-mongo.gemspec
 homepage: https://github.com/joeyAghion/spidey-mongo
 licenses:
@@ -125,7 +143,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
     version: '0'
   segments:
   - 0
-  hash: …
+  hash: 987129952958952365
 required_rubygems_version: !ruby/object:Gem::Requirement
   none: false
   requirements:
@@ -134,7 +152,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
   segments:
   - 0
-  hash: …
+  hash: 987129952958952365
 requirements: []
 rubyforge_project: spidey-mongo
 rubygems_version: 1.8.25
@@ -145,3 +163,4 @@ summary: Implements a MongoDB back-end for Spidey, a framework for crawling and…
 test_files:
 - spec/spec_helper.rb
 - spec/spidey/strategies/mongo_spec.rb
+- spec/spidey/strategies/moped_spec.rb