spidey-mongo 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +5 -0
- data/Rakefile +1 -0
- data/lib/spidey/strategies/mongo.rb +78 -0
- data/lib/spidey-mongo/version.rb +5 -0
- data/lib/spidey-mongo.rb +5 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/spidey/strategies/mongo_spec.rb +45 -0
- data/spidey-mongo.gemspec +27 -0
- metadata +119 -0
data/Gemfile
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Spidey::Strategies
  # MongoDB persistence strategy for Spidey spiders. Mix into a subclass of
  # Spidey::AbstractSpider to store the URL queue, scraped results, and errors
  # in three Mongo collections (legacy mongo 1.x driver API: update/insert/find_one).
  module Mongo
    # Collections injected at construction time via the attrs hash.
    attr_accessor :url_collection, :result_collection, :error_collection

    module ClassMethods
      # Declare how to derive a unique key for each recorded result.
      # callback may be a Symbol (instance method name) or a Proc taking the data hash.
      def set_result_key(callback)
        @result_key = callback
      end

      # Resolve the result key for a given spider instance and data hash.
      # Returns nil when no key strategy was configured (results are then inserted, not upserted).
      def result_key(spider, data)
        case @result_key
        when Symbol then spider.send(@result_key, data)
        when Proc then @result_key.call(data)
        else nil
        end
      end
    end

    def self.included(base)
      base.extend ClassMethods
    end

    # Extracts the three collection objects from attrs before delegating the
    # remaining options to the spider superclass initializer.
    def initialize(attrs = {})
      self.url_collection = attrs.delete(:url_collection)
      self.result_collection = attrs.delete(:result_collection)
      self.error_collection = attrs.delete(:error_collection)
      super attrs
    end

    # Records the batch start time (used to detect already-crawled URLs) and an
    # optional absolute deadline (options[:crawl_for] seconds), then delegates.
    def crawl(options = {})
      @crawl_started_at = Time.now
      @until = Time.now + options[:crawl_for] if options[:crawl_for]
      super options
    end

    # Queue a URL for crawling. Upserts on (spider class name, url) so duplicate
    # handle calls do not create duplicate queue entries.
    # NOTE(review): String#truncate is ActiveSupport, not core Ruby — presumably
    # pulled in by the spidey dependency; confirm.
    def handle(url, handler, default_data = {})
      $stderr.puts "Queueing #{url.inspect.truncate(500)}" if verbose
      url_collection.update(
        {'spider' => self.class.name, 'url' => url},
        {'$set' => {'handler' => handler, 'default_data' => default_data}},
        upsert: true
      )
    end

    # Persist a scraped result. If the class configured a result key, upsert on
    # that key (deduplicating across crawls); otherwise blind-insert.
    def record(data)
      $stderr.puts "Recording #{data.inspect.truncate(500)}" if verbose
      if key = self.class.result_key(self, data)
        result_collection.update({'key' => key}, {'$set' => data}, upsert: true)
      else
        result_collection.insert data
      end
    end

    # Iterate queued URLs oldest-first, marking each crawled before yielding
    # (url, handler, default_data) to the spider loop.
    def each_url(&block)
      while url = get_next_url
        break if url['last_crawled_at'] && url['last_crawled_at'] >= @crawl_started_at # crawled already in this batch
        url_collection.update({'_id' => url['_id']}, '$set' => {last_crawled_at: Time.now})
        yield url['url'], url['handler'], url['default_data'].symbolize_keys
      end
    end

    # Persist an error record (exception class name + message, timestamped),
    # consuming attrs[:error] in the process.
    def add_error(attrs)
      error = attrs.delete(:error)
      error_collection.insert attrs.merge(created_at: Time.now, error: error.class.name, message: error.message)
      $stderr.puts "Error on #{attrs[:url]}. #{error.class}: #{error.message}" if verbose
    end

    private

    # Fetch the least-recently-crawled URL for this spider class, or nil once
    # the optional @until deadline has passed. Sorting ascending on
    # last_crawled_at puts never-crawled docs (missing field) first.
    def get_next_url
      return nil if (@until && Time.now >= @until) # exceeded time bound
      url_collection.find_one({spider: self.class.name}, {
        sort: [[:last_crawled_at, ::Mongo::ASCENDING], [:_id, ::Mongo::ASCENDING]]
      })
    end

  end
end
|
data/lib/spidey-mongo.rb
ADDED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
require 'spec_helper'

# Integration-style specs for the Mongo strategy; they require a running
# MongoDB server (legacy Mongo::Connection API) and use old RSpec `should` syntax.
describe Spidey::Strategies::Mongo do
  # Minimal spider exercising the mixin: result key derived from :detail_url,
  # one seed URL queued at class-definition time.
  class TestSpider < Spidey::AbstractSpider
    include Spidey::Strategies::Mongo
    set_result_key ->(data) { data[:detail_url] }
    handle "http://www.cnn.com", :process_home
  end

  before(:each) do
    # NOTE(review): collections are never dropped between runs — specs rely on
    # upsert semantics to stay idempotent; confirm this is intentional.
    @db = Mongo::Connection.new['spidey-mongo-test']
    @spider = TestSpider.new(
      url_collection: @db['urls'],
      result_collection: @db['results'],
      error_collection: @db['errors'])
  end

  it "should add initial URLs to collection" do
    doc = @db['urls'].find_one(url: "http://www.cnn.com")
    doc['handler'].should == :process_home
  end

  it "should not add duplicate URLs" do
    # handle is private on the spider; upsert keeps the count at 1.
    @spider.send :handle, "http://www.cnn.com", :process_home
    @db['urls'].find(url: "http://www.cnn.com").count.should == 1
  end

  it "should add results" do
    @spider.record detail_url: 'http://www.cnn.com', foo: 'bar'
    @db['results'].count.should == 1
    doc = @db['results'].find_one
    doc['detail_url'].should == 'http://www.cnn.com'
    doc['foo'].should == 'bar'
  end

  it "should add error" do
    @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
    doc = @db['errors'].find_one
    doc['error'].should == 'Exception'
    doc['url'].should == 'http://www.cnn.com'
    doc['handler'].should == :blah
    doc['message'].should == 'WTF'
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
$:.push File.expand_path("../lib", __FILE__)
require "spidey-mongo/version"

# Gem specification for spidey-mongo: a MongoDB persistence back-end for the
# Spidey crawling/scraping framework. File lists are derived from git, so the
# gem must be built from a git checkout.
Gem::Specification.new do |s|
  s.name        = "spidey-mongo"
  s.version     = Spidey::Mongo::VERSION
  s.authors     = ["Joey Aghion"]
  s.email       = ["joey@aghion.com"]
  s.homepage    = "https://github.com/joeyAghion/spidey-mongo"
  s.summary     = %q{Implements a MongoDB back-end for Spidey, a framework for crawling and scraping web sites.}
  s.description = %q{Implements a MongoDB back-end for Spidey, a framework for crawling and scraping web sites.}

  s.rubyforge_project = "spidey-mongo"

  # Package every tracked file; tests and executables are split out by path.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rake"
  s.add_development_dependency "rspec"

  # Runtime: the spidey framework plus the legacy mongo driver and its C BSON extension.
  s.add_runtime_dependency "spidey"
  s.add_runtime_dependency "mongo"
  s.add_runtime_dependency "bson_ext"
end
|
metadata
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: spidey-mongo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Joey Aghion
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-06-27 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rake
|
16
|
+
requirement: &70361603997680 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70361603997680
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: rspec
|
27
|
+
requirement: &70361603997220 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70361603997220
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: spidey
|
38
|
+
requirement: &70361603996320 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :runtime
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *70361603996320
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: mongo
|
49
|
+
requirement: &70361603995160 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :runtime
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *70361603995160
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: bson_ext
|
60
|
+
requirement: &70361603994540 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :runtime
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70361603994540
|
69
|
+
description: Implements a MongoDB back-end for Spidey, a framework for crawling and
|
70
|
+
scraping web sites.
|
71
|
+
email:
|
72
|
+
- joey@aghion.com
|
73
|
+
executables: []
|
74
|
+
extensions: []
|
75
|
+
extra_rdoc_files: []
|
76
|
+
files:
|
77
|
+
- .gitignore
|
78
|
+
- Gemfile
|
79
|
+
- Rakefile
|
80
|
+
- lib/spidey-mongo.rb
|
81
|
+
- lib/spidey-mongo/version.rb
|
82
|
+
- lib/spidey/strategies/mongo.rb
|
83
|
+
- spec/spec_helper.rb
|
84
|
+
- spec/spidey/strategies/mongo_spec.rb
|
85
|
+
- spidey-mongo.gemspec
|
86
|
+
homepage: https://github.com/joeyAghion/spidey-mongo
|
87
|
+
licenses: []
|
88
|
+
post_install_message:
|
89
|
+
rdoc_options: []
|
90
|
+
require_paths:
|
91
|
+
- lib
|
92
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
93
|
+
none: false
|
94
|
+
requirements:
|
95
|
+
- - ! '>='
|
96
|
+
- !ruby/object:Gem::Version
|
97
|
+
version: '0'
|
98
|
+
segments:
|
99
|
+
- 0
|
100
|
+
hash: 3377333768066102144
|
101
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
102
|
+
none: false
|
103
|
+
requirements:
|
104
|
+
- - ! '>='
|
105
|
+
- !ruby/object:Gem::Version
|
106
|
+
version: '0'
|
107
|
+
segments:
|
108
|
+
- 0
|
109
|
+
hash: 3377333768066102144
|
110
|
+
requirements: []
|
111
|
+
rubyforge_project: spidey-mongo
|
112
|
+
rubygems_version: 1.8.10
|
113
|
+
signing_key:
|
114
|
+
specification_version: 3
|
115
|
+
summary: Implements a MongoDB back-end for Spidey, a framework for crawling and scraping
|
116
|
+
web sites.
|
117
|
+
test_files:
|
118
|
+
- spec/spec_helper.rb
|
119
|
+
- spec/spidey/strategies/mongo_spec.rb
|