spidey-mongo 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ .rspec
data/Gemfile ADDED
@@ -0,0 +1,5 @@
1
# Fetch gems over HTTPS so downloads cannot be read or altered in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in spidey-mongo.gemspec
gemspec
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,78 @@
1
module Spidey::Strategies
  # Spider strategy that keeps the URL queue, the recorded results and the
  # crawl errors in MongoDB collections supplied by the caller.
  #
  # Include it in a spider class and pass +:url_collection+,
  # +:result_collection+ and +:error_collection+ to +new+.
  module Mongo
    attr_accessor :url_collection, :result_collection, :error_collection

    # Class-level configuration added to every class that includes the strategy.
    module ClassMethods
      # Registers how the unique key of a result is derived.
      #
      # @param callback [Symbol, Proc] a Symbol names a spider method that is
      #   sent the result data; a Proc is called with the result data.
      def set_result_key(callback)
        @result_key = callback
      end

      # Computes the key for +data+ using the registered callback.
      #
      # @param spider [Object] receiver of the message when the callback is a Symbol
      # @param data [Hash] the result being recorded
      # @return [Object, nil] the key, or nil when no Symbol/Proc callback is registered
      def result_key(spider, data)
        case @result_key
        when Symbol then spider.send(@result_key, data)
        when Proc then @result_key.call(data)
        end
      end
    end

    # Adds the ClassMethods to any class that includes this strategy.
    def self.included(base)
      base.extend ClassMethods
    end

    # Extracts the three collection options and forwards the rest to +super+.
    #
    # @param attrs [Hash] spider attributes; may contain +:url_collection+,
    #   +:result_collection+ and +:error_collection+
    def initialize(attrs = {})
      attrs = attrs.dup # work on a copy so the caller's hash is left untouched
      self.url_collection = attrs.delete(:url_collection)
      self.result_collection = attrs.delete(:result_collection)
      self.error_collection = attrs.delete(:error_collection)
      super attrs
    end

    # Starts a crawl, optionally bounded in time.
    #
    # @param options [Hash] forwarded to +super+; +:crawl_for+ (a duration added
    #   to the start time) stops the crawl from fetching new URLs once it elapses
    def crawl(options = {})
      @crawl_started_at = Time.now
      crawl_for = options[:crawl_for]
      # Always reassign: a deadline left over from an earlier crawl must not
      # cut short a later crawl that was started without :crawl_for.
      @until = crawl_for ? @crawl_started_at + crawl_for : nil
      super options
    end

    # Queues +url+ by upserting a document identified by spider class name and URL.
    #
    # @param url [String] the URL to crawl
    # @param handler [Symbol] stored with the URL and yielded back by #each_url
    # @param default_data [Hash] stored with the URL and yielded back by #each_url
    def handle(url, handler, default_data = {})
      $stderr.puts "Queueing #{url.inspect.truncate(500)}" if verbose
      url_collection.update(
        {'spider' => self.class.name, 'url' => url},
        {'$set' => {'handler' => handler, 'default_data' => default_data}},
        upsert: true
      )
    end

    # Stores a result. When the class yields a result key for +data+, the
    # document matching that key is upserted; otherwise +data+ is inserted.
    #
    # @param data [Hash] the result to store
    def record(data)
      $stderr.puts "Recording #{data.inspect.truncate(500)}" if verbose
      key = self.class.result_key(self, data)
      if key
        result_collection.update({'key' => key}, {'$set' => data}, upsert: true)
      else
        result_collection.insert data
      end
    end

    # Yields queued URLs, least recently crawled first, until the time bound
    # is exceeded, the queue is empty, or the next URL was already crawled
    # during the current crawl.
    #
    # @yield [url, handler, default_data] default_data has symbolized keys
    def each_url(&block)
      while (url = get_next_url)
        last_crawled_at = url['last_crawled_at']
        break if last_crawled_at && last_crawled_at >= @crawl_started_at # crawled already in this batch
        url_collection.update({'_id' => url['_id']}, '$set' => {last_crawled_at: Time.now})
        # Documents stored without default_data yield an empty hash instead of failing.
        yield url['url'], url['handler'], (url['default_data'] || {}).symbolize_keys
      end
    end

    # Persists an error document built from +attrs+ plus the error's class
    # name, message and a creation timestamp.
    #
    # @param attrs [Hash] must contain +:error+ (the exception); the remaining
    #   entries (e.g. +:url+) are stored as given
    def add_error(attrs)
      attrs = attrs.dup # do not strip :error from the caller's hash
      error = attrs.delete(:error)
      error_collection.insert attrs.merge(created_at: Time.now, error: error.class.name, message: error.message)
      $stderr.puts "Error on #{attrs[:url]}. #{error.class}: #{error.message}" if verbose
    end

    private

    # Returns this spider's next URL document ordered by last_crawled_at and
    # then _id (both ascending), or nil once the time bound has passed.
    def get_next_url
      return nil if @until && Time.now >= @until # exceeded time bound
      url_collection.find_one({spider: self.class.name}, {
        sort: [[:last_crawled_at, ::Mongo::ASCENDING], [:_id, ::Mongo::ASCENDING]]
      })
    end
  end
end
@@ -0,0 +1,5 @@
1
module Spidey
  module Mongo
    # Release version of the spidey-mongo gem; frozen so the constant's
    # String cannot be mutated in place at runtime.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,5 @@
1
+ require 'spidey'
2
+ require 'mongo'
3
+
4
+ require 'spidey-mongo/version'
5
+ require 'spidey/strategies/mongo'
@@ -0,0 +1,8 @@
1
# Put the gem's lib directory at the front of the load path, then load the gem.
lib_dir = File.dirname(__FILE__) + '/../lib'
$LOAD_PATH.unshift(lib_dir)
require 'spidey-mongo'

# Order matters: symbol metadata must be enabled before :focus is used as a filter.
RSpec.configure do |rspec|
  rspec.treat_symbols_as_metadata_keys_with_true_values = true
  rspec.run_all_when_everything_filtered = true
  rspec.filter_run :focus
end
@@ -0,0 +1,45 @@
1
require 'spec_helper'

# Integration specs for the Mongo strategy; they use the 'spidey-mongo-test'
# database on the default Mongo connection.
describe Spidey::Strategies::Mongo do
  class TestSpider < Spidey::AbstractSpider
    include Spidey::Strategies::Mongo
    set_result_key ->(data) { data[:detail_url] }
    handle "http://www.cnn.com", :process_home
  end

  before(:each) do
    @db = Mongo::Connection.new['spidey-mongo-test']
    # Empty the collections first so documents left by earlier examples or
    # earlier runs cannot influence counts and find_one results.
    %w[urls results errors].each { |name| @db[name].remove }
    @spider = TestSpider.new(
      url_collection: @db['urls'],
      result_collection: @db['results'],
      error_collection: @db['errors'])
  end

  it "should add initial URLs to collection" do
    doc = @db['urls'].find_one(url: "http://www.cnn.com")
    doc['handler'].should == :process_home
  end

  it "should not add duplicate URLs" do
    @spider.send :handle, "http://www.cnn.com", :process_home
    @db['urls'].find(url: "http://www.cnn.com").count.should == 1
  end

  it "should add results" do
    @spider.record detail_url: 'http://www.cnn.com', foo: 'bar'
    @db['results'].count.should == 1
    doc = @db['results'].find_one
    doc['detail_url'].should == 'http://www.cnn.com'
    doc['foo'].should == 'bar'
  end

  it "should add error" do
    @spider.add_error error: Exception.new("WTF"), url: "http://www.cnn.com", handler: :blah
    doc = @db['errors'].find_one
    doc['error'].should == 'Exception'
    doc['url'].should == 'http://www.cnn.com'
    doc['handler'].should == :blah
    doc['message'].should == 'WTF'
  end
end
@@ -0,0 +1,27 @@
1
# -*- encoding: utf-8 -*-
# Make lib/ loadable so the version constant can be required below.
lib_path = File.expand_path("../lib", __FILE__)
$LOAD_PATH.push lib_path
require "spidey-mongo/version"

Gem::Specification.new do |spec|
  # Identity and authorship
  spec.name        = "spidey-mongo"
  spec.version     = Spidey::Mongo::VERSION
  spec.authors     = ["Joey Aghion"]
  spec.email       = ["joey@aghion.com"]
  spec.homepage    = "https://github.com/joeyAghion/spidey-mongo"
  spec.summary     = "Implements a MongoDB back-end for Spidey, a framework for crawling and scraping web sites."
  spec.description = "Implements a MongoDB back-end for Spidey, a framework for crawling and scraping web sites."

  spec.rubyforge_project = "spidey-mongo"

  # Packaged files are whatever git tracks at build time.
  tracked_files = `git ls-files`.split("\n")
  tracked_tests = `git ls-files -- {test,spec,features}/*`.split("\n")
  tracked_bins  = `git ls-files -- bin/*`.split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  # Development-only tooling
  spec.add_development_dependency "rake"
  spec.add_development_dependency "rspec"

  # Runtime dependencies
  spec.add_runtime_dependency "spidey"
  spec.add_runtime_dependency "mongo"
  spec.add_runtime_dependency "bson_ext"
end
metadata ADDED
@@ -0,0 +1,119 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: spidey-mongo
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Joey Aghion
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2012-06-27 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rake
16
+ requirement: &70361603997680 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *70361603997680
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ requirement: &70361603997220 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :development
34
+ prerelease: false
35
+ version_requirements: *70361603997220
36
+ - !ruby/object:Gem::Dependency
37
+ name: spidey
38
+ requirement: &70361603996320 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *70361603996320
47
+ - !ruby/object:Gem::Dependency
48
+ name: mongo
49
+ requirement: &70361603995160 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *70361603995160
58
+ - !ruby/object:Gem::Dependency
59
+ name: bson_ext
60
+ requirement: &70361603994540 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *70361603994540
69
+ description: Implements a MongoDB back-end for Spidey, a framework for crawling and
70
+ scraping web sites.
71
+ email:
72
+ - joey@aghion.com
73
+ executables: []
74
+ extensions: []
75
+ extra_rdoc_files: []
76
+ files:
77
+ - .gitignore
78
+ - Gemfile
79
+ - Rakefile
80
+ - lib/spidey-mongo.rb
81
+ - lib/spidey-mongo/version.rb
82
+ - lib/spidey/strategies/mongo.rb
83
+ - spec/spec_helper.rb
84
+ - spec/spidey/strategies/mongo_spec.rb
85
+ - spidey-mongo.gemspec
86
+ homepage: https://github.com/joeyAghion/spidey-mongo
87
+ licenses: []
88
+ post_install_message:
89
+ rdoc_options: []
90
+ require_paths:
91
+ - lib
92
+ required_ruby_version: !ruby/object:Gem::Requirement
93
+ none: false
94
+ requirements:
95
+ - - ! '>='
96
+ - !ruby/object:Gem::Version
97
+ version: '0'
98
+ segments:
99
+ - 0
100
+ hash: 3377333768066102144
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ none: false
103
+ requirements:
104
+ - - ! '>='
105
+ - !ruby/object:Gem::Version
106
+ version: '0'
107
+ segments:
108
+ - 0
109
+ hash: 3377333768066102144
110
+ requirements: []
111
+ rubyforge_project: spidey-mongo
112
+ rubygems_version: 1.8.10
113
+ signing_key:
114
+ specification_version: 3
115
+ summary: Implements a MongoDB back-end for Spidey, a framework for crawling and scraping
116
+ web sites.
117
+ test_files:
118
+ - spec/spec_helper.rb
119
+ - spec/spidey/strategies/mongo_spec.rb