spidey 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +5 -0
- data/Gemfile +4 -0
- data/README.md +27 -0
- data/Rakefile +1 -0
- data/lib/spidey.rb +2 -0
- data/lib/spidey/abstract_spider.rb +102 -0
- data/lib/spidey/version.rb +3 -0
- data/spec/spec_helper.rb +8 -0
- data/spec/spidey/abstract_spider_spec.rb +76 -0
- data/spidey.gemspec +25 -0
- metadata +96 -0
data/Gemfile
ADDED
data/README.md
ADDED
data/Rakefile
ADDED
@@ -0,0 +1 @@
+require "bundler/gem_tasks"
data/lib/spidey/abstract_spider.rb
ADDED
@@ -0,0 +1,102 @@
+# encoding: utf-8
+require 'mechanize'
+
+module Spidey
+  class AbstractSpider
+    attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
+
+    DEFAULT_REQUEST_INTERVAL = 3  # seconds
+
+    def self.handle(url, handler, default_data = {})
+      start_urls << url
+      handlers[url] = [handler, default_data]
+    end
+
+    # Accepts:
+    #   request_interval: number of seconds to wait between requests (default: 3)
+    #   verbose: prints debugging and progress information if true
+    def initialize(attrs = {})
+      @urls = []
+      @handlers = {}
+      @results = []
+      self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
+      @request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
+      @verbose = !!attrs[:verbose]
+    end
+
+    # Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
+    #   max_urls: maximum number of URLs to crawl before returning (optional)
+    def crawl(options = {})
+      @errors = []
+      i = 0
+      each_url do |url, handler, default_data|
+        break if options[:max_urls] && i >= options[:max_urls]
+        begin
+          page = agent.get(url)
+          $stderr.puts "Handling #{url.inspect}" if verbose
+          send handler, page, default_data
+        rescue => ex
+          add_error url: url, handler: handler, error: ex
+        end
+        sleep request_interval if request_interval > 0
+        i += 1
+      end
+    end
+
+    protected
+
+    # Override this for custom queueing of crawled URLs.
+    def handle(url, handler, default_data = {})
+      unless @handlers[url]
+        @urls << url
+        @handlers[url] = [handler, default_data]
+      end
+    end
+
+    # Override this for custom storage or prioritization of crawled URLs.
+    # Iterates through URL queue, yielding the URL, handler, and default data.
+    def each_url(&block)
+      urls.each do |url|
+        yield url, handlers[url].first, handlers[url].last
+      end
+    end
+
+    # Override this for custom result storage.
+    def record(data)
+      results << data
+      $stderr.puts "Recording #{data.inspect}" if verbose
+    end
+
+    # Override this for custom error-handling.
+    def add_error(attrs)
+      @errors << attrs
+      $stderr.puts "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}" if verbose
+    end
+
+    def resolve_url(href, page)
+      agent.agent.resolve(href, page).to_s
+    end
+
+    # Strips ASCII/Unicode whitespace from ends and substitutes ASCII for Unicode internal spaces.
+    def clean(str)
+      return nil unless str
+      str.gsub(/\p{Space}/, ' ').strip.squeeze(' ')
+    end
+
+    private
+
+    def agent
+      @agent ||= Mechanize.new
+    end
+
+    def self.start_urls
+      @start_urls ||= []
+    end
+
+    def self.handlers
+      @handlers ||= {}
+    end
+
+  end
+end
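For orientation, a minimal concrete spider built on this class might look like the sketch below. The `EbaySpider` name, URL, and selectors are purely illustrative; `handle`, `record`, `resolve_url`, `clean`, and `crawl` are the hooks defined in the file above.

    require 'spidey'

    # Hypothetical subclass; the site, link pattern, and field names are made up.
    class EbaySpider < Spidey::AbstractSpider
      handle "http://www.ebay.com", :process_home

      def process_home(page, default_data = {})
        page.links_with(href: /sch/).each do |link|
          # Queue each matching page; crawl will visit it with :process_list.
          handle resolve_url(link.href, page), :process_list
        end
      end

      def process_list(page, default_data = {})
        # record appends to #results; clean normalizes stray whitespace.
        record default_data.merge(title: clean(page.search('h1').text))
      end
    end

    spider = EbaySpider.new(verbose: true, request_interval: 1)
    spider.crawl max_urls: 100
    spider.results  # => array of recorded hashes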
data/spec/spidey/abstract_spider_spec.rb
ADDED
@@ -0,0 +1,76 @@
+require 'spec_helper'
+
+describe Spidey::AbstractSpider do
+  describe '#crawl' do
+    before(:each) do
+      @agent = double('agent')
+      @page = double('page')
+      @agent.stub(get: @page)
+      Mechanize.stub(new: @agent)
+    end
+
+    describe "with handle declaration" do
+      class TestSpider < Spidey::AbstractSpider
+        handle "http://www.yahoo.com", :process_home
+      end
+
+      it "invokes handler" do
+        @agent.should_receive(:get).with("http://www.yahoo.com").and_return(@page)
+        spider = TestSpider.new request_interval: 0
+        spider.should_receive(:process_home).with(@page, {})
+        spider.crawl
+      end
+
+      it "records errors" do
+        spider = TestSpider.new request_interval: 0
+        spider.should_receive(:process_home).and_raise("Whoops - some error")
+        spider.crawl
+        spider.errors.size.should == 1
+        spider.errors.last[:url].should == "http://www.yahoo.com"
+        spider.errors.last[:handler].should == :process_home
+        spider.errors.last[:error].message.should == "Whoops - some error"
+      end
+
+      describe "with follow-up URL handlers" do
+        class TestSpider < Spidey::AbstractSpider
+          def process_home(page, default_data = {})
+            handle "http://www.yahoo.com/deep_page.html", :process_deep_page
+          end
+        end
+
+        it "invokes configured handlers on follow-up URLs" do
+          spider = TestSpider.new request_interval: 0
+          page2 = double('page')
+          @agent.should_receive(:get).with("http://www.yahoo.com/deep_page.html").and_return(page2)
+          spider.should_receive(:process_deep_page).with(page2, {})
+          spider.crawl
+        end
+      end
+    end
+
+    describe "with default_data" do
+      class TestSpiderWithData < Spidey::AbstractSpider
+        handle "http://www.yahoo.com", :process_home, a: 1, b: 2
+      end
+
+      it "passes default data through" do
+        spider = TestSpiderWithData.new request_interval: 0
+        spider.should_receive(:process_home).with(@page, a: 1, b: 2)
+        spider.crawl
+      end
+    end
+  end
+
+  describe '#clean' do
+    {
+      "Untitled, " => "Untitled,",
+      " Pahk the Cah" => "Pahk the Cah",
+      " Untitled 1999 " => "Untitled 1999",
+      nil => nil
+    }.each do |original, cleaned|
+      it "replaces '#{original}' with '#{cleaned}'" do
+        Spidey::AbstractSpider.new(request_interval: 0).send(:clean, original).should == cleaned
+      end
+    end
+  end
+end
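These specs use the `should`/`stub` expectation syntax that was current in RSpec 2 when this version shipped. For readers on RSpec 3, the "invokes handler" example would read roughly as follows (an equivalent sketch, not part of the gem):

    let(:page)  { double('page') }
    let(:agent) { double('agent', get: page) }

    it "invokes handler" do
      allow(Mechanize).to receive(:new).and_return(agent)
      expect(agent).to receive(:get).with("http://www.yahoo.com").and_return(page)
      spider = TestSpider.new request_interval: 0
      expect(spider).to receive(:process_home).with(page, {})
      spider.crawl
    end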
data/spidey.gemspec
ADDED
@@ -0,0 +1,25 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "spidey/version"
+
+Gem::Specification.new do |s|
+  s.name        = "spidey"
+  s.version     = Spidey::VERSION
+  s.authors     = ["Joey Aghion"]
+  s.email       = ["joey@aghion.com"]
+  s.homepage    = "https://github.com/joeyAghion/spidey"
+  s.summary     = %q{A loose framework for crawling and scraping web sites.}
+  s.description = %q{A loose framework for crawling and scraping web sites.}
+
+  s.rubyforge_project = "spidey"
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.add_development_dependency "rake"
+  s.add_development_dependency "rspec"
+
+  s.add_runtime_dependency "mechanize"
+end
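The gemspec declares mechanize as the only runtime dependency, so a consuming project needs just one line in its Gemfile (the version constraint here is illustrative):

    # Gemfile of a consuming project
    gem 'spidey', '~> 0.0.2'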
metadata
ADDED
@@ -0,0 +1,96 @@
+--- !ruby/object:Gem::Specification
+name: spidey
+version: !ruby/object:Gem::Version
+  version: 0.0.2
+  prerelease:
+platform: ruby
+authors:
+- Joey Aghion
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-06-26 00:00:00.000000000Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: &70136044260380 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *70136044260380
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &70136044256960 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: *70136044256960
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: &70136044255140 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70136044255140
+description: A loose framework for crawling and scraping web sites.
+email:
+- joey@aghion.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- .gitignore
+- Gemfile
+- README.md
+- Rakefile
+- lib/spidey.rb
+- lib/spidey/abstract_spider.rb
+- lib/spidey/version.rb
+- spec/spec_helper.rb
+- spec/spidey/abstract_spider_spec.rb
+- spidey.gemspec
+homepage: https://github.com/joeyAghion/spidey
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 3586547064482976006
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+      segments:
+      - 0
+      hash: 3586547064482976006
+requirements: []
+rubyforge_project: spidey
+rubygems_version: 1.8.10
+signing_key:
+specification_version: 3
+summary: A loose framework for crawling and scraping web sites.
+test_files:
+- spec/spec_helper.rb
+- spec/spidey/abstract_spider_spec.rb