spidey 0.0.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/.travis.yml +9 -0
- data/README.md +8 -2
- data/Rakefile +6 -0
- data/lib/spidey.rb +10 -0
- data/lib/spidey/abstract_spider.rb +7 -9
- data/lib/spidey/railtie.rb +7 -0
- data/lib/spidey/version.rb +1 -1
- data/spidey.gemspec +4 -5
- metadata +7 -21
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
Spidey
|
1
|
+
Spidey [![Build Status](https://travis-ci.org/joeyAghion/spidey.png?branch=master)](https://travis-ci.org/joeyAghion/spidey)
|
2
2
|
======
|
3
3
|
|
4
4
|
Spidey provides a bare-bones framework for crawling and scraping web sites. Its goal is to keep boilerplate scraping logic out of your code.
|
@@ -55,6 +55,12 @@ By default, the lists of URLs being crawled, results scraped, and errors encount
|
|
55
55
|
Add the [spidey-mongo](https://github.com/joeyAghion/spidey-mongo) gem and include `Spidey::Strategies::Mongo` in your spider to instead use MongoDB to persist these data. [See the docs](https://github.com/joeyAghion/spidey-mongo) for more information. Or, you can implement your own strategy by overriding the appropriate methods from `AbstractSpider`.
|
56
56
|
|
57
57
|
|
58
|
+
Logging
|
59
|
+
-------
|
60
|
+
|
61
|
+
You may set `Spidey.logger` to a logger of your choosing. When used in a Rails environment, the logger defaults to the Rails logger. Otherwise, it's directed to STDOUT.
|
62
|
+
|
63
|
+
|
58
64
|
Contributing
|
59
65
|
------------
|
60
66
|
|
@@ -68,4 +74,4 @@ To Do
|
|
68
74
|
|
69
75
|
Copyright
|
70
76
|
---------
|
71
|
-
Copyright (c) 2012 Joey Aghion,
|
77
|
+
Copyright (c) 2012 [Joey Aghion](http://halfamind.aghion.com), [Artsy Inc](http://artsy.net). See [LICENSE.txt](LICENSE.txt) for further details.
|
data/Rakefile
CHANGED
data/lib/spidey.rb
CHANGED
data/lib/spidey/abstract_spider.rb
CHANGED
@@ -3,7 +3,7 @@ require 'mechanize'
|
|
3
3
|
|
4
4
|
module Spidey
|
5
5
|
class AbstractSpider
|
6
|
-
attr_accessor :urls, :handlers, :results, :request_interval, :verbose, :errors
|
6
|
+
attr_accessor :urls, :handlers, :results, :request_interval, :errors
|
7
7
|
|
8
8
|
DEFAULT_REQUEST_INTERVAL = 3 # seconds
|
9
9
|
|
@@ -14,14 +14,12 @@ module Spidey
|
|
14
14
|
|
15
15
|
# Accepts:
|
16
16
|
# request_interval: number of seconds to wait between requests (default: 3)
|
17
|
-
# verbose: prints debugging and progress information if true
|
18
17
|
def initialize(attrs = {})
|
19
18
|
@urls = []
|
20
19
|
@handlers = {}
|
21
20
|
@results = []
|
22
21
|
self.class.start_urls.each { |url| handle url, *self.class.handlers[url] }
|
23
22
|
@request_interval = attrs[:request_interval] || DEFAULT_REQUEST_INTERVAL
|
24
|
-
@verbose = !!attrs[:verbose]
|
25
23
|
end
|
26
24
|
|
27
25
|
# Iterates through URLs queued for handling, including any that are added in the course of crawling. Accepts:
|
@@ -33,7 +31,7 @@ module Spidey
|
|
33
31
|
break if options[:max_urls] && i >= options[:max_urls]
|
34
32
|
begin
|
35
33
|
page = agent.get(url)
|
36
|
-
|
34
|
+
Spidey.logger.info "Handling #{url.inspect}"
|
37
35
|
send handler, page, default_data
|
38
36
|
rescue => ex
|
39
37
|
add_error url: url, handler: handler, error: ex
|
@@ -60,17 +58,17 @@ module Spidey
|
|
60
58
|
yield url, handlers[url].first, handlers[url].last
|
61
59
|
end
|
62
60
|
end
|
63
|
-
|
61
|
+
|
64
62
|
# Override this for custom result storage.
|
65
63
|
def record(data)
|
66
64
|
results << data
|
67
|
-
|
65
|
+
Spidey.logger.info "Recording #{data.inspect}"
|
68
66
|
end
|
69
|
-
|
67
|
+
|
70
68
|
# Override this for custom error-handling.
|
71
69
|
def add_error(attrs)
|
72
70
|
@errors << attrs
|
73
|
-
|
71
|
+
Spidey.logger.error "Error on #{attrs[:url]}. #{attrs[:error].class}: #{attrs[:error].message}"
|
74
72
|
end
|
75
73
|
|
76
74
|
def resolve_url(href, page)
|
@@ -98,5 +96,5 @@ module Spidey
|
|
98
96
|
end
|
99
97
|
|
100
98
|
end
|
101
|
-
|
99
|
+
|
102
100
|
end
|
data/lib/spidey/version.rb
CHANGED
data/spidey.gemspec
CHANGED
@@ -11,17 +11,16 @@ Gem::Specification.new do |s|
|
|
11
11
|
s.summary = %q{A loose framework for crawling and scraping web sites.}
|
12
12
|
s.description = %q{A loose framework for crawling and scraping web sites.}
|
13
13
|
s.license = 'MIT'
|
14
|
-
|
14
|
+
|
15
15
|
s.rubyforge_project = "spidey"
|
16
|
-
|
16
|
+
|
17
17
|
s.files = `git ls-files`.split("\n")
|
18
18
|
s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
|
19
19
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
20
20
|
s.require_paths = ["lib"]
|
21
|
-
|
21
|
+
|
22
22
|
s.add_development_dependency "rake"
|
23
23
|
s.add_development_dependency "rspec"
|
24
|
-
|
25
|
-
|
24
|
+
|
26
25
|
s.add_runtime_dependency "mechanize"
|
27
26
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: spidey
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.4
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2013-05-03 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
@@ -43,22 +43,6 @@ dependencies:
|
|
43
43
|
- - ! '>='
|
44
44
|
- !ruby/object:Gem::Version
|
45
45
|
version: '0'
|
46
|
-
- !ruby/object:Gem::Dependency
|
47
|
-
name: ruby-debug19
|
48
|
-
requirement: !ruby/object:Gem::Requirement
|
49
|
-
none: false
|
50
|
-
requirements:
|
51
|
-
- - ! '>='
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0'
|
54
|
-
type: :development
|
55
|
-
prerelease: false
|
56
|
-
version_requirements: !ruby/object:Gem::Requirement
|
57
|
-
none: false
|
58
|
-
requirements:
|
59
|
-
- - ! '>='
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
46
|
- !ruby/object:Gem::Dependency
|
63
47
|
name: mechanize
|
64
48
|
requirement: !ruby/object:Gem::Requirement
|
@@ -83,6 +67,7 @@ extensions: []
|
|
83
67
|
extra_rdoc_files: []
|
84
68
|
files:
|
85
69
|
- .gitignore
|
70
|
+
- .travis.yml
|
86
71
|
- Gemfile
|
87
72
|
- LICENSE.txt
|
88
73
|
- README.md
|
@@ -90,6 +75,7 @@ files:
|
|
90
75
|
- examples/ebay_pet_supplies_spider.rb
|
91
76
|
- lib/spidey.rb
|
92
77
|
- lib/spidey/abstract_spider.rb
|
78
|
+
- lib/spidey/railtie.rb
|
93
79
|
- lib/spidey/version.rb
|
94
80
|
- spec/spec_helper.rb
|
95
81
|
- spec/spidey/abstract_spider_spec.rb
|
@@ -109,7 +95,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
109
95
|
version: '0'
|
110
96
|
segments:
|
111
97
|
- 0
|
112
|
-
hash:
|
98
|
+
hash: 101937807007286306
|
113
99
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
114
100
|
none: false
|
115
101
|
requirements:
|
@@ -118,10 +104,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
118
104
|
version: '0'
|
119
105
|
segments:
|
120
106
|
- 0
|
121
|
-
hash:
|
107
|
+
hash: 101937807007286306
|
122
108
|
requirements: []
|
123
109
|
rubyforge_project: spidey
|
124
|
-
rubygems_version: 1.8.
|
110
|
+
rubygems_version: 1.8.25
|
125
111
|
signing_key:
|
126
112
|
specification_version: 3
|
127
113
|
summary: A loose framework for crawling and scraping web sites.
|