spidey 0.0.2

data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.gem
+ .bundle
+ Gemfile.lock
+ pkg/*
+ .rspec
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "http://rubygems.org"
+
+ # Specify your gem's dependencies in spidey.gemspec
+ gemspec
data/README.md ADDED
@@ -0,0 +1,27 @@
+ Spidey
+ ======
+
+
+
+ Example
+ -------
+
+
+ Strategies
+ ----------
+
+ ### In-Memory
+
+ ### Mongo
+
+
+ Contributing
+ ------------
+
+ To Do
+ -----
+ * Add examples
+
+
+ License
+ -------
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/lib/spidey.rb ADDED
@@ -0,0 +1,2 @@
+ require "spidey/version"
+ require "spidey/abstract_spider"
data/lib/spidey/abstract_spider.rb ADDED
@@ -0,0 +1,102 @@
+ # encoding: utf-8
+ require 'mechanize'
+
+ module Spidey
+   class AbstractSpider
+     attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
+
+     DEFAULT_REQUEST_INTERVAL = 3 # seconds
+
+     def self.handle(url, handler, default_data = {})
+       start_urls << url
+       handlers[url] = [handler, default_data]
+     end
+
+     # Accepts:
+     #   request_interval: number of seconds to wait between requests (default: 3)
+     #   verbose: prints debugging and progress information if true
+     def initialize(attrs = {})
+       @urls = []
+       @handlers = {}
+       @results = []
+       self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
+       @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
+       @verbose = !!attrs[:verbose]
+     end
+
+     # Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
+     #   max_urls: maximum number of URLs to crawl before returning (optional)
+     def crawl(options = {})
+       @errors = []
+       i = 0
+       each_url do |url, handler, default_data|
+         break if options[:max_urls] && i >= options[:max_urls]
+         begin
+           page = agent.get(url)
+           $stderr.puts "Handling #{url.inspect}" if verbose
+           send handler, page, default_data
+         rescue => ex
+           add_error url: url, handler: handler, error: ex
+         end
+         sleep request_interval if request_interval > 0
+         i += 1
+       end
+     end
+
+     protected
+
+     # Override this for custom queueing of crawled URLs.
+     def handle(url, handler, default_data = {})
+       unless @handlers[url]
+         @urls << url
+         @handlers[url] = [handler, default_data]
+       end
+     end
+
+     # Override this for custom storage or prioritization of crawled URLs.
+     # Iterates through the URL queue, yielding the URL, handler, and default data.
+     def each_url(&block)
+       urls.each do |url|
+         yield url, handlers[url].first, handlers[url].last
+       end
+     end
+
+     # Override this for custom result storage.
+     def record(data)
+       results << data
+       $stderr.puts "Recording #{data.inspect}" if verbose
+     end
+
+     # Override this for custom error handling.
+     def add_error(attrs)
+       @errors << attrs
+       $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose
+     end
+
+     def resolve_url(href, page)
+       agent.agent.resolve(href, page).to_s
+     end
+
+     # Replaces all Unicode whitespace with ASCII spaces, trims the ends, and collapses internal runs to single spaces.
+     def clean(str)
+       return nil unless str
+       str.gsub(/\p{Space}/, ' ').strip.squeeze(' ')
+     end
+
+     private
+
+     def agent
+       @agent ||= Mechanize.new
+     end
+
+     def self.start_urls
+       @start_urls ||= []
+     end
+
+     def self.handlers
+       @handlers ||= {}
+     end
+
+   end
+
+ end
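
The class above is the gem's whole public surface: subclasses declare starting URLs with `handle`, queue follow-ups and save results from their handler methods, and override `handle`/`each_url`/`record`/`add_error` to swap in other storage strategies. A minimal sketch of a subclass (the WikipediaSpider name, URL, and CSS selector are illustrative, not part of the gem):

    require 'spidey'

    class WikipediaSpider < Spidey::AbstractSpider
      handle "https://en.wikipedia.org/wiki/Web_crawler", :process_article

      def process_article(page, default_data = {})
        # Save the article title, then queue every in-page link for the same handler.
        record default_data.merge(title: clean(page.search('h1').text))
        page.links.each { |link| handle resolve_url(link.href, page), :process_article }
      end
    end

    spider = WikipediaSpider.new verbose: true, request_interval: 1
    spider.crawl max_urls: 10
    spider.results  # array of hashes passed to `record`
    spider.errors   # array of { url:, handler:, error: } hashes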
data/lib/spidey/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Spidey
+   VERSION = "0.0.2"
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,8 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+ require 'spidey'
+
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+ end
data/spec/spidey/abstract_spider_spec.rb ADDED
@@ -0,0 +1,76 @@
+ require 'spec_helper'
+
+ describe Spidey::AbstractSpider do
+   describe '#crawl' do
+     before(:each) do
+       @agent = double('agent')
+       @page = double('page')
+       @agent.stub(get: @page)
+       Mechanize.stub(new: @agent)
+     end
+
+     describe "with handle declaration" do
+       class TestSpider < Spidey::AbstractSpider
+         handle "http://www.yahoo.com", :process_home
+       end
+
+       it "invokes handler" do
+         @agent.should_receive(:get).with("http://www.yahoo.com").and_return(@page)
+         spider = TestSpider.new request_interval: 0
+         spider.should_receive(:process_home).with(@page, {})
+         spider.crawl
+       end
+
+       it "records errors" do
+         spider = TestSpider.new request_interval: 0
+         spider.should_receive(:process_home).and_raise("Whoops - some error")
+         spider.crawl
+         spider.errors.size.should == 1
+         spider.errors.last[:url].should == "http://www.yahoo.com"
+         spider.errors.last[:handler].should == :process_home
+         spider.errors.last[:error].message.should == "Whoops - some error"
+       end
+
+       describe "with follow-up URL handlers" do
+         class TestSpider < Spidey::AbstractSpider
+           def process_home(page, default_data = {})
+             handle "http://www.yahoo.com/deep_page.html", :process_deep_page
+           end
+         end
+
+         it "invokes configured handlers on follow-up URLs" do
+           spider = TestSpider.new request_interval: 0
+           page2 = double('page')
+           @agent.should_receive(:get).with("http://www.yahoo.com/deep_page.html").and_return(page2)
+           spider.should_receive(:process_deep_page).with(page2, {})
+           spider.crawl
+         end
+       end
+     end
+
+     describe "with default_data" do
+       class TestSpiderWithData < Spidey::AbstractSpider
+         handle "http://www.yahoo.com", :process_home, a: 1, b: 2
+       end
+
+       it "passes default data through" do
+         spider = TestSpiderWithData.new request_interval: 0
+         spider.should_receive(:process_home).with(@page, a: 1, b: 2)
+         spider.crawl
+       end
+     end
+   end
+
+   describe '#clean' do
+     {
+       "Untitled, " => "Untitled,",
+       " Pahk the Cah" => "Pahk the Cah",
+       " Untitled 1999 " => "Untitled 1999",
+       nil => nil
+     }.each do |original, cleaned|
+       it "replaces '#{original}' with '#{cleaned}'" do
+         Spidey::AbstractSpider.new(request_interval: 0).send(:clean, original).should == cleaned
+       end
+     end
+   end
+ end
data/spidey.gemspec ADDED
@@ -0,0 +1,25 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "spidey/version"
+
+ Gem::Specification.new do |s|
+   s.name        = "spidey"
+   s.version     = Spidey::VERSION
+   s.authors     = ["Joey Aghion"]
+   s.email       = ["joey@aghion.com"]
+   s.homepage    = "https://github.com/joeyAghion/spidey"
+   s.summary     = %q{A loose framework for crawling and scraping web sites.}
+   s.description = %q{A loose framework for crawling and scraping web sites.}
+
+   s.rubyforge_project = "spidey"
+
+   s.files         = `git ls-files`.split("\n")
+   s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.add_development_dependency "rake"
+   s.add_development_dependency "rspec"
+
+   s.add_runtime_dependency "mechanize"
+ end
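
For reference, pulling this release into another project takes a one-line Gemfile entry (the version pin is illustrative):

    # Gemfile of a consuming application
    source "http://rubygems.org"

    gem "spidey", "0.0.2"

Bundler then resolves the mechanize runtime dependency declared above automatically.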
metadata ADDED
@@ -0,0 +1,96 @@
+ --- !ruby/object:Gem::Specification
+ name: spidey
+ version: !ruby/object:Gem::Version
+   version: 0.0.2
+   prerelease:
+ platform: ruby
+ authors:
+ - Joey Aghion
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-06-26 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: &70136044260380 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: *70136044260380
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: &70136044256960 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: *70136044256960
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: &70136044255140 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70136044255140
+ description: A loose framework for crawling and scraping web sites.
+ email:
+ - joey@aghion.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - README.md
+ - Rakefile
+ - lib/spidey.rb
+ - lib/spidey/abstract_spider.rb
+ - lib/spidey/version.rb
+ - spec/spec_helper.rb
+ - spec/spidey/abstract_spider_spec.rb
+ - spidey.gemspec
+ homepage: https://github.com/joeyAghion/spidey
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3586547064482976006
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3586547064482976006
+ requirements: []
+ rubyforge_project: spidey
+ rubygems_version: 1.8.10
+ signing_key:
+ specification_version: 3
+ summary: A loose framework for crawling and scraping web sites.
+ test_files:
+ - spec/spec_helper.rb
+ - spec/spidey/abstract_spider_spec.rb