spidey 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.gem
+ .bundle
+ Gemfile.lock
+ pkg/*
+ .rspec
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source "http://rubygems.org"
+
+ # Specify your gem's dependencies in spidey.gemspec
+ gemspec
data/README.md ADDED
@@ -0,0 +1,27 @@
+ Spidey
+ ======
+
+
+
+ Example
+ -------
+
+
+ Strategies
+ ----------
+
+ ### In-Memory
+
+ ### Mongo
+
+
+ Contributing
+ ------------
+
+ To Do
+ -----
+ * Add examples
+
+
+ License
+ -------
data/Rakefile ADDED
@@ -0,0 +1 @@
+ require "bundler/gem_tasks"
data/lib/spidey.rb ADDED
@@ -0,0 +1,2 @@
+ require "spidey/version"
+ require "spidey/abstract_spider"
data/lib/spidey/abstract_spider.rb ADDED
@@ -0,0 +1,102 @@
+ # encoding: utf-8
+ require 'mechanize'
+
+ module Spidey
+   class AbstractSpider
+     attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
+
+     DEFAULT_REQUEST_INTERVAL = 3 # seconds
+
+     def self.handle(url, handler, default_data = {})
+       start_urls << url
+       handlers[url] = [handler, default_data]
+     end
+
+     # Accepts:
+     #   request_interval: number of seconds to wait between requests (default: 3)
+     #   verbose: prints debugging and progress information if true
+     def initialize(attrs = {})
+       @urls = []
+       @handlers = {}
+       @results = []
+       self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
+       @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
+       @verbose = !!attrs[:verbose]
+     end
+
+     # Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
+     #   max_urls: maximum number of URLs to crawl before returning (optional)
+     def crawl(options = {})
+       @errors = []
+       i = 0
+       each_url do |url, handler, default_data|
+         break if options[:max_urls] && i >= options[:max_urls]
+         begin
+           page = agent.get(url)
+           $stderr.puts "Handling #{url.inspect}" if verbose
+           send handler, page, default_data
+         rescue => ex
+           add_error url: url, handler: handler, error: ex
+         end
+         sleep request_interval if request_interval > 0
+         i += 1
+       end
+     end
+
+     protected
+
+     # Override this for custom queueing of crawled URLs.
+     def handle(url, handler, default_data = {})
+       unless @handlers[url]
+         @urls << url
+         @handlers[url] = [handler, default_data]
+       end
+     end
+
+     # Override this for custom storage or prioritization of crawled URLs.
+     # Iterates through URL queue, yielding the URL, handler, and default data.
+     def each_url(&block)
+       urls.each do |url|
+         yield url, handlers[url].first, handlers[url].last
+       end
+     end
+
+     # Override this for custom result storage.
+     def record(data)
+       results << data
+       $stderr.puts "Recording #{data.inspect}" if verbose
+     end
+
+     # Override this for custom error-handling.
+     def add_error(attrs)
+       @errors << attrs
+       $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose
+     end
+
+     def resolve_url(href, page)
+       agent.agent.resolve(href, page).to_s
+     end
+
+     # Strips ASCII/Unicode whitespace from ends and substitutes ASCII for Unicode internal spaces.
+     def clean(str)
+       return nil unless str
+       str.gsub(/\p{Space}/, ' ').strip.squeeze(' ')
+     end
+
+     private
+
+     def agent
+       @agent ||= Mechanize.new
+     end
+
+     def self.start_urls
+       @start_urls ||= []
+     end
+
+     def self.handlers
+       @handlers ||= {}
+     end
+
+   end
+
+ end
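For orientation, here is a hypothetical subclass sketch (not part of the released gem) showing how the pieces above fit together: the class-level handle declaration seeds the crawl, per-page handler methods queue follow-up URLs through the protected handle and resolve_url helpers, and record accumulates scraped data. The class name, URLs, and selectors are illustrative assumptions; the page arguments are Mechanize pages as returned by agent.get.

    # Hypothetical subclass sketch, assuming Mechanize page objects.
    class ExampleSpider < Spidey::AbstractSpider
      # Seed URL (hypothetical); :process_index is called with the fetched page.
      handle "http://www.example.com/catalog", :process_index

      def process_index(page, default_data = {})
        # Queue item pages discovered on the index; resolve_url makes hrefs absolute.
        page.links_with(href: /item/).each do |link|
          handle resolve_url(link.href, page), :process_item, category: "catalog"
        end
      end

      def process_item(page, default_data = {})
        title = page.at("h1")
        # record appends a hash to #results (subclasses may override for other storage).
        record default_data.merge(title: clean(title && title.text))
      end
    end

    spider = ExampleSpider.new(verbose: true, request_interval: 1)
    spider.crawl max_urls: 50
    spider.results  # => array of recorded hashes
    spider.errors   # => array of { url: ..., handler: ..., error: ... } hashes

Because crawl rescues exceptions per URL and collects them into errors, a long crawl continues past individual failed pages; inspect spider.errors afterward.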
data/lib/spidey/version.rb ADDED
@@ -0,0 +1,3 @@
+ module Spidey
+   VERSION = "0.0.2"
+ end
data/spec/spec_helper.rb ADDED
@@ -0,0 +1,8 @@
+ $:.unshift(File.dirname(__FILE__) + '/../lib')
+ require 'spidey'
+
+ RSpec.configure do |config|
+   config.treat_symbols_as_metadata_keys_with_true_values = true
+   config.run_all_when_everything_filtered = true
+   config.filter_run :focus
+ end
data/spec/spidey/abstract_spider_spec.rb ADDED
@@ -0,0 +1,76 @@
+ require 'spec_helper'
+
+ describe Spidey::AbstractSpider do
+   describe '#crawl' do
+     before(:each) do
+       @agent = double('agent')
+       @page = double('page')
+       @agent.stub(get: @page)
+       Mechanize.stub(new: @agent)
+     end
+
+     describe "with handle declaration" do
+       class TestSpider < Spidey::AbstractSpider
+         handle "http://www.yahoo.com", :process_home
+       end
+
+       it "invokes handler" do
+         @agent.should_receive(:get).with("http://www.yahoo.com").and_return(@page)
+         spider = TestSpider.new request_interval: 0
+         spider.should_receive(:process_home).with(@page, {})
+         spider.crawl
+       end
+
+       it "records errors" do
+         spider = TestSpider.new request_interval: 0
+         spider.should_receive(:process_home).and_raise("Whoops - some error")
+         spider.crawl
+         spider.errors.size.should == 1
+         spider.errors.last[:url].should == "http://www.yahoo.com"
+         spider.errors.last[:handler].should == :process_home
+         spider.errors.last[:error].message.should == "Whoops - some error"
+       end
+
+       describe "with follow-up URL handlers" do
+         class TestSpider < Spidey::AbstractSpider
+           def process_home(page, default_data = {})
+             handle "http://www.yahoo.com/deep_page.html", :process_deep_page
+           end
+         end
+
+         it "invokes configured handlers on follow-up URLs" do
+           spider = TestSpider.new request_interval: 0
+           page2 = double('page')
+           @agent.should_receive(:get).with("http://www.yahoo.com/deep_page.html").and_return(page2)
+           spider.should_receive(:process_deep_page).with(page2, {})
+           spider.crawl
+         end
+       end
+     end
+
+     describe "with default_data" do
+       class TestSpiderWithData < Spidey::AbstractSpider
+         handle "http://www.yahoo.com", :process_home, a: 1, b: 2
+       end
+
+       it "passes default data through" do
+         spider = TestSpiderWithData.new request_interval: 0
+         spider.should_receive(:process_home).with(@page, a: 1, b: 2)
+         spider.crawl
+       end
+     end
+   end
+
+   describe '#clean' do
+     {
+       "Untitled, " => "Untitled,",
+       " Pahk the Cah" => "Pahk the Cah",
+       " Untitled 1999 " => "Untitled 1999",
+       nil => nil
+     }.each do |original, cleaned|
+       it "replaces '#{original}' with '#{cleaned}'" do
+         Spidey::AbstractSpider.new(request_interval: 0).send(:clean, original).should == cleaned
+       end
+     end
+   end
+ end
data/spidey.gemspec ADDED
@@ -0,0 +1,25 @@
+ # -*- encoding: utf-8 -*-
+ $:.push File.expand_path("../lib", __FILE__)
+ require "spidey/version"
+
+ Gem::Specification.new do |s|
+   s.name = "spidey"
+   s.version = Spidey::VERSION
+   s.authors = ["Joey Aghion"]
+   s.email = ["joey@aghion.com"]
+   s.homepage = "https://github.com/joeyAghion/spidey"
+   s.summary = %q{A loose framework for crawling and scraping web sites.}
+   s.description = %q{A loose framework for crawling and scraping web sites.}
+
+   s.rubyforge_project = "spidey"
+
+   s.files = `git ls-files`.split("\n")
+   s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
+   s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+   s.require_paths = ["lib"]
+
+   s.add_development_dependency "rake"
+   s.add_development_dependency "rspec"
+
+   s.add_runtime_dependency "mechanize"
+ end
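For completeness, a downstream project would typically consume this release through its own Gemfile. A minimal sketch follows; the consuming project is hypothetical, and the source mirrors the one used in the gem's own Gemfile above.

    # Hypothetical consumer Gemfile (not part of this package).
    source "http://rubygems.org"

    gem "spidey", "0.0.2"  # pulls in mechanize as a runtime dependency per the gemspec above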
metadata ADDED
@@ -0,0 +1,96 @@
+ --- !ruby/object:Gem::Specification
+ name: spidey
+ version: !ruby/object:Gem::Version
+   version: 0.0.2
+ prerelease:
+ platform: ruby
+ authors:
+ - Joey Aghion
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-06-26 00:00:00.000000000Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rake
+   requirement: &70136044260380 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: *70136044260380
+ - !ruby/object:Gem::Dependency
+   name: rspec
+   requirement: &70136044256960 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :development
+   prerelease: false
+   version_requirements: *70136044256960
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: &70136044255140 !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: *70136044255140
+ description: A loose framework for crawling and scraping web sites.
+ email:
+ - joey@aghion.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - .gitignore
+ - Gemfile
+ - README.md
+ - Rakefile
+ - lib/spidey.rb
+ - lib/spidey/abstract_spider.rb
+ - lib/spidey/version.rb
+ - spec/spec_helper.rb
+ - spec/spidey/abstract_spider_spec.rb
+ - spidey.gemspec
+ homepage: https://github.com/joeyAghion/spidey
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3586547064482976006
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+       segments:
+       - 0
+       hash: 3586547064482976006
+ requirements: []
+ rubyforge_project: spidey
+ rubygems_version: 1.8.10
+ signing_key:
+ specification_version: 3
+ summary: A loose framework for crawling and scraping web sites.
+ test_files:
+ - spec/spec_helper.rb
+ - spec/spidey/abstract_spider_spec.rb