spidey-mongo 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .rspec
data/Gemfile ADDED
@@ -0,0 +1,5 @@
# Fetch gems over HTTPS rather than plain HTTP so that downloads cannot be
# tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in spidey-mongo.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,78 @@
module Spidey
  module Strategies
    # Spider strategy that keeps the URL queue, the recorded results and the
    # crawl errors in three collection objects supplied by the caller.
    #
    # Include it in a spider class and pass :url_collection,
    # :result_collection and :error_collection to +new+. Including the module
    # also extends the class with ClassMethods (see +set_result_key+).
    #
    # The including class is expected to provide +verbose+ and superclass
    # implementations of +initialize+ and +crawl+.
    module Mongo
      attr_accessor :url_collection, :result_collection, :error_collection

      # Class-level configuration added to every including class.
      module ClassMethods
        # Registers how a result's unique key is derived.
        #
        # callback - a Symbol naming a spider instance method that takes the
        #            data hash, or a Proc called with the data hash.
        def set_result_key(callback)
          @result_key = callback
        end

        # Computes the key for +data+ using the registered callback.
        #
        # Returns nil when no callback (or an unsupported kind of callback)
        # has been registered.
        def result_key(spider, data)
          case @result_key
          when Symbol then spider.send(@result_key, data)
          when Proc then @result_key.call(data)
          end
        end
      end

      def self.included(base)
        base.extend ClassMethods
      end

      # Extracts the three collection options and forwards every remaining
      # attribute to the superclass initializer.
      #
      # The caller's hash is copied first, so it is left unmodified.
      def initialize(attrs = {})
        remaining = attrs.dup
        self.url_collection = remaining.delete(:url_collection)
        self.result_collection = remaining.delete(:result_collection)
        self.error_collection = remaining.delete(:error_collection)
        super remaining
      end

      # Starts a crawl batch.
      #
      # options - forwarded unchanged to the superclass; :crawl_for (seconds),
      #           when present, bounds how long URLs keep being handed out.
      def crawl(options = {})
        @crawl_started_at = Time.now
        duration = options[:crawl_for]
        # Always reassign so a deadline from an earlier time-boxed crawl does
        # not cut a later, unbounded crawl short.
        @until = duration ? @crawl_started_at + duration : nil
        super options
      end

      # Queues +url+ for this spider class, creating the queue entry if it is
      # missing (upsert) and otherwise overwriting its handler/default data.
      def handle(url, handler, default_data = {})
        $stderr.puts "Queueing #{shorten_for_log(url.inspect)}" if verbose
        url_collection.update(
          {'spider' => self.class.name, 'url' => url},
          {'$set' => {'handler' => handler, 'default_data' => default_data}},
          upsert: true
        )
      end

      # Stores a result. When the class has a result key and it yields a
      # truthy value, the document with that 'key' is upserted; otherwise the
      # data is inserted as a new document.
      def record(data)
        $stderr.puts "Recording #{shorten_for_log(data.inspect)}" if verbose
        key = self.class.result_key(self, data)
        if key
          result_collection.update({'key' => key}, {'$set' => data}, upsert: true)
        else
          result_collection.insert data
        end
      end

      # Yields (url, handler, default_data) for each queued URL, stamping each
      # entry with last_crawled_at before it is yielded. Stops when the next
      # candidate was already crawled in the current batch or when the time
      # bound set by +crawl+ has passed.
      def each_url(&block)
        # Fall back to "now" when called without a preceding #crawl; comparing
        # a Time against nil would raise.
        @crawl_started_at ||= Time.now
        while (entry = get_next_url)
          crawled_at = entry['last_crawled_at']
          break if crawled_at && crawled_at >= @crawl_started_at # crawled already in this batch
          url_collection.update({'_id' => entry['_id']}, '$set' => {last_crawled_at: Time.now})
          yield entry['url'], entry['handler'], symbolize_data_keys(entry['default_data'])
        end
      end

      # Persists an error document built from +attrs+.
      #
      # attrs - hash whose :error entry is the exception; it is stored as the
      #         exception's class name plus message, together with created_at
      #         and every other entry. The caller's hash is not modified.
      def add_error(attrs)
        details = attrs.dup
        error = details.delete(:error)
        error_collection.insert details.merge(created_at: Time.now, error: error.class.name, message: error.message)
        $stderr.puts "Error on #{details[:url]}. #{error.class}: #{error.message}" if verbose
      end

    private

      # Next queue entry for this spider class, ordered by last_crawled_at and
      # then _id (both ascending); nil once the time bound has been exceeded.
      def get_next_url
        return nil if @until && Time.now >= @until # exceeded time bound
        url_collection.find_one({spider: self.class.name}, {
          sort: [[:last_crawled_at, ::Mongo::ASCENDING], [:_id, ::Mongo::ASCENDING]]
        })
      end

      # Caps +text+ at +limit+ characters, ending in "..." when it was cut.
      # Produces the same output as ActiveSupport's String#truncate without
      # requiring ActiveSupport to be loaded.
      def shorten_for_log(text, limit = 500)
        return text unless text.length > limit
        "#{text[0, limit - 3]}..."
      end

      # Copies +data+ into a new Hash with symbol keys; nil becomes {}.
      def symbolize_data_keys(data)
        (data || {}).each_with_object({}) do |(key, value), result|
          result[key.respond_to?(:to_sym) ? key.to_sym : key] = value
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
module Spidey
  module Mongo
    # Gem version string; spidey-mongo.gemspec assigns it to the spec's version.
    VERSION = '0.0.1'
  end
end
@@ -0,0 +1,5 @@
1
+ require 'spidey'
2
+ require 'mongo'
3
+
4
+ require 'spidey-mongo/version'
5
+ require 'spidey/strategies/mongo'
@@ -0,0 +1,8 @@
# Put the gem's lib/ directory at the front of the load path so that
# `require 'spidey-mongo'` searches it first.
lib_dir = File.dirname(__FILE__) + '/../lib'
$LOAD_PATH.unshift(lib_dir)
require 'spidey-mongo'

# RSpec settings; the last two appear to support `:focus` filtering with a
# fallback to running everything (confirm against the RSpec version in use).
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus
end
@@ -0,0 +1,45 @@
require 'spec_helper'

# Specs for the MongoDB strategy. They open a connection with
# Mongo::Connection.new (driver defaults) and use the 'spidey-mongo-test'
# database, so a reachable MongoDB server is required.
describe Spidey::Strategies::Mongo do
  class TestSpider < Spidey::AbstractSpider
    include Spidey::Strategies::Mongo
    set_result_key ->(data) { data[:detail_url] }
    handle "http://www.cnn.com", :process_home
  end

  before(:each) do
    @db = Mongo::Connection.new['spidey-mongo-test']
    # Empty the collections before building the spider so documents left
    # behind by earlier examples or earlier runs cannot influence the counts
    # and find_one lookups below.
    %w[urls results errors].each { |name| @db[name].remove }
    @spider = TestSpider.new(
      url_collection: @db['urls'],
      result_collection: @db['results'],
      error_collection: @db['errors'])
  end

  it "should add initial URLs to collection" do
    doc = @db['urls'].find_one(url: "http://www.cnn.com")
    doc['handler'].should == :process_home
  end

  it "should not add duplicate URLs" do
    @spider.send :handle, "http://www.cnn.com", :process_home
    @db['urls'].find(url: "http://www.cnn.com").count.should == 1
  end

  it "should add results" do
    @spider.record detail_url: 'http://www.cnn.com', foo: 'bar'
    @db['results'].count.should == 1
    doc = @db['results'].find_one
    doc['detail_url'].should == 'http://www.cnn.com'
    doc['foo'].should == 'bar'
  end

  it "should add error" do
    @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
    doc = @db['errors'].find_one
    doc['error'].should == 'Exception'
    doc['url'].should == 'http://www.cnn.com'
    doc['handler'].should == :blah
    doc['message'].should == 'WTF'
  end

end
@@ -0,0 +1,27 @@
# -*- encoding: utf-8 -*-
# Make lib/ loadable so the version constant can be read below.
$LOAD_PATH.push File.expand_path("../lib", __FILE__)
require "spidey-mongo/version"

# Gem specification for spidey-mongo. The file lists are taken from
# `git ls-files`, so building requires git and a git checkout.
Gem::Specification.new do |spec|
  spec.name        = "spidey-mongo"
  spec.version     = Spidey::Mongo::VERSION
  spec.authors     = ["Joey Aghion"]
  spec.email       = ["joey@aghion.com"]
  spec.homepage    = "https://github.com/joeyAghion/spidey-mongo"
  spec.summary     = %q{Implements a MongoDB back-end for Spidey, a framework for crawling and scraping web sites.}
  spec.description = %q{Implements a MongoDB back-end for Spidey, a framework for crawling and scraping web sites.}

  spec.rubyforge_project = "spidey-mongo"

  spec.files         = `git ls-files`.split("\n")
  spec.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are listed by bare file name, without the bin/ prefix.
  spec.executables   = `git ls-files -- bin/*`.split("\n").map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Needed only for development and running the specs.
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"

  # Needed at runtime.
  spec.add_runtime_dependency "spidey"
  spec.add_runtime_dependency "mongo"
  spec.add_runtime_dependency "bson_ext"
end
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spidey-mongo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Joey Aghion
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-27 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: &70361603997680 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70361603997680
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ requirement: &70361603997220 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70361603997220
36
+ - !ruby/object:Gem::Dependency
37
+ name: spidey
38
+ requirement: &70361603996320 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *70361603996320
47
+ - !ruby/object:Gem::Dependency
48
+ name: mongo
49
+ requirement: &70361603995160 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70361603995160
58
+ - !ruby/object:Gem::Dependency
59
+ name: bson_ext
60
+ requirement: &70361603994540 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *70361603994540
69
+ description: Implements a MongoDB back-end for Spidey, a framework for crawling and
70
+ scraping web sites.
71
+ email:
72
+ - joey@aghion.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .gitignore
78
+ - Gemfile
79
+ - Rakefile
80
+ - lib/spidey-mongo.rb
81
+ - lib/spidey-mongo/version.rb
82
+ - lib/spidey/strategies/mongo.rb
83
+ - spec/spec_helper.rb
84
+ - spec/spidey/strategies/mongo_spec.rb
85
+ - spidey-mongo.gemspec
86
+ homepage: https://github.com/joeyAghion/spidey-mongo
87
+ licenses: []
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ segments:
99
+ - 0
100
+ hash: 3377333768066102144
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ! '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ segments:
108
+ - 0
109
+ hash: 3377333768066102144
110
+ requirements: []
111
+ rubyforge_project: spidey-mongo
112
+ rubygems_version: 1.8.10
113
+ signing_key:
114
+ specification_version: 3
115
+ summary: Implements a MongoDB back-end for Spidey, a framework for crawling and scraping
116
+ web sites.
117
+ test_files:
118
+ - spec/spec_helper.rb
119
+ - spec/spidey/strategies/mongo_spec.rb