spider 0.3.0 → 0.4.0
- data/CHANGES +3 -0
- data/README +90 -17
- data/doc/classes/IncludedInMemcached.html +217 -0
- data/doc/classes/Spider.html +10 -8
- data/doc/classes/SpiderInstance.html +96 -45
- data/doc/created.rid +1 -1
- data/doc/files/README.html +95 -21
- data/doc/{classes/Net.html → files/lib/included_in_memcached_rb.html} +23 -16
- data/doc/files/lib/spider_instance_rb.html +118 -0
- data/doc/files/lib/spider_rb.html +95 -32
- data/doc/fr_class_index.html +1 -0
- data/doc/fr_file_index.html +2 -0
- data/doc/fr_method_index.html +11 -7
- data/lib/included_in_memcached.rb +22 -0
- data/lib/spider.rb +4 -246
- data/lib/spider_instance.rb +290 -0
- data/spec/included_in_memcached_spec.rb +44 -0
- data/spec/spider_instance_spec.rb +46 -4
- data/spider.gemspec +1 -1
- metadata +8 -8
- data/doc/classes/Net/HTTPRedirection.html +0 -144
- data/doc/classes/Net/HTTPResponse.html +0 -166
- data/doc/classes/Net/HTTPSuccess.html +0 -144
- data/doc/classes/NilClass.html +0 -144
data/doc/fr_class_index.html CHANGED
@@ -20,6 +20,7 @@
 <div id="index">
 <h1 class="section-bar">Classes</h1>
 <div id="index-entries">
+<a href="classes/IncludedInMemcached.html">IncludedInMemcached</a><br />
 <a href="classes/Spider.html">Spider</a><br />
 <a href="classes/SpiderInstance.html">SpiderInstance</a><br />
 </div>
data/doc/fr_file_index.html CHANGED
@@ -21,7 +21,9 @@
 <h1 class="section-bar">Files</h1>
 <div id="index-entries">
 <a href="files/README.html">README</a><br />
+<a href="files/lib/included_in_memcached_rb.html">lib/included_in_memcached.rb</a><br />
 <a href="files/lib/spider_rb.html">lib/spider.rb</a><br />
+<a href="files/lib/spider_instance_rb.html">lib/spider_instance.rb</a><br />
 </div>
 </div>
 </body>
data/doc/fr_method_index.html CHANGED
@@ -20,13 +20,17 @@
 <div id="index">
 <h1 class="section-bar">Methods</h1>
 <div id="index-entries">
-<a href="classes/
-<a href="classes/SpiderInstance.html#
-<a href="classes/SpiderInstance.html#M000005">
-<a href="classes/SpiderInstance.html#
-<a href="classes/SpiderInstance.html#
-<a href="classes/
-<a href="classes/
+<a href="classes/IncludedInMemcached.html#M000002"><< (IncludedInMemcached)</a><br />
+<a href="classes/SpiderInstance.html#M000004">add_url_check (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000005">check_already_seen_with (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000010">clear_headers (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000009">headers (SpiderInstance)</a><br />
+<a href="classes/IncludedInMemcached.html#M000003">include? (IncludedInMemcached)</a><br />
+<a href="classes/IncludedInMemcached.html#M000001">new (IncludedInMemcached)</a><br />
+<a href="classes/SpiderInstance.html#M000006">on (SpiderInstance)</a><br />
+<a href="classes/SpiderInstance.html#M000007">setup (SpiderInstance)</a><br />
+<a href="classes/Spider.html#M000011">start_at (Spider)</a><br />
+<a href="classes/SpiderInstance.html#M000008">teardown (SpiderInstance)</a><br />
 </div>
 </div>
 </body>
data/lib/included_in_memcached.rb ADDED
@@ -0,0 +1,22 @@
+require 'memcache'
+
+# A specialized class using memcached to track items stored. It supports
+# three operations: new, <<, and include? . Together these can be used to
+# add items to the memcache, then determine whether the item has been added.
+class IncludedInMemcached
+  # Construct a new IncludedInMemcached instance. All arguments here are
+  # passed to MemCache (part of the memcache-client gem).
+  def initialize(*a)
+    @c = MemCache.new(*a)
+  end
+
+  # Add an item to the memcache.
+  def <<(v)
+    @c.add(v.to_s, v)
+  end
+
+  # True if the item is in the memcache.
+  def include?(v)
+    @c.get(v.to_s) == v
+  end
+end
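
Together with spider_instance.rb below, this new file lets a crawl keep its visited-URL set out of process. A minimal usage sketch, assuming a memcached server is listening on localhost:11211 and using the require path given in the new RDoc comments (the file itself lands at lib/included_in_memcached.rb, so the bare path may also work):

    require 'spider'
    require 'spider/included_in_memcached'

    Spider.start_at('http://mike-burns.com/') do |s|
      # Track visited URLs in memcached instead of the default in-memory Array.
      s.check_already_seen_with IncludedInMemcached.new('localhost:11211')
    end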
data/lib/spider.rb CHANGED
@@ -9,7 +9,7 @@
 #       notice, this list of conditions and the following disclaimer in the
 #       documentation and/or other materials provided with the distribution.
 #     * Neither the name Mike Burns nor the
-#       names of its contributors may be used to endorse or promote products
+#       names of his contributors may be used to endorse or promote products
 #       derived from this software without specific prior written permission.
 #
 # THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
@@ -23,35 +23,15 @@
 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-require 'robot_rules'
-require 'open-uri'
-require 'uri'
-require 'net/http'
-require 'net/https'
-
-module Net #:nodoc:
-  class HTTPResponse #:nodoc:
-    def success?; false; end
-    def redirect?; false; end
-  end
-  class HTTPSuccess #:nodoc:
-    def success?; true; end
-  end
-  class HTTPRedirection #:nodoc:
-    def redirect?; true; end
-  end
-end
-
-class NilClass #:nodoc:
-  def merge(h); h; end
-end
+require File.dirname(__FILE__)+'/spider_instance'

 # A spidering library for Ruby. Handles robots.txt, scraping, finding more
 # links, and doing it all over again.
 class Spider
   # Runs the spider starting at the given URL. Also takes a block that is given
   # the SpiderInstance. Use the block to define the rules and handlers for
-  # the discovered Web pages.
+  # the discovered Web pages. See SpiderInstance for the possible rules and
+  # handlers.
   #
   #  Spider.start_at('http://mike-burns.com/') do |s|
   #    s.add_url_check do |a_url|
@@ -78,225 +58,3 @@ class Spider
     a_spider.start!
   end
 end
-
-class SpiderInstance
-  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
-    @url_checks = []
-    @cache = :memory
-    @callbacks = {}
-    @next_urls = next_urls
-    @seen = seen
-    @rules = rules || RobotRules.new('Ruby Spider 1.0')
-    @robots_seen = robots_seen
-    @headers = {}
-    @setup = nil
-    @teardown = nil
-  end
-
-  # Add a predicate that determines whether to continue down this URL's path.
-  # All predicates must be true in order for a URL to proceed.
-  #
-  # Takes a block that takes a string and produces a boolean. For example, this
-  # will ensure that the URL starts with 'http://mike-burns.com':
-  #
-  #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*}
-  def add_url_check(&block)
-    @url_checks << block
-  end
-
-  def use_cache(cache_type) #:nodoc:
-    @cache = cache_type
-  end
-
-  # Add a response handler. A response handler's trigger can be :every,
-  # :success, :failure, or any HTTP status code. The handler itself can be
-  # either a Proc or a block.
-  #
-  # The arguments to the block are: the URL as a string, an instance of
-  # Net::HTTPResponse, and the prior URL as a string.
-  #
-  #
-  # For example:
-  #
-  #  on 404 do |a_url, resp, prior_url|
-  #    puts "URL not found: #{a_url}"
-  #  end
-  #
-  #  on :success do |a_url, resp, prior_url|
-  #    puts a_url
-  #    puts resp.body
-  #  end
-  #
-  #  on :every do |a_url, resp, prior_url|
-  #    puts "Given this code: #{resp.code}"
-  #  end
-  def on(code, p = nil, &block)
-    f = p ? p : block
-    case code
-    when Fixnum
-      @callbacks[code] = f
-    else
-      @callbacks[code.to_sym] = f
-    end
-  end
-
-  # Run before the HTTP request. Given the URL as a string.
-  #  setup do |a_url|
-  #    headers['Cookies'] = 'user_id=1;admin=true'
-  #  end
-  def setup(p = nil, &block)
-    @setup = p ? p : block
-  end
-
-  # Run last, once for each page. Given the URL as a string.
-  def teardown(p = nil, &block)
-    @teardown = p ? p : block
-  end
-
-  # Use like a hash:
-  #  headers['Cookies'] = 'user_id=1;password=btrross3'
-  def headers
-    HeaderSetter.new(self)
-  end
-
-  def raw_headers #:nodoc:
-    @headers
-  end
-  def raw_headers=(v) #:nodoc:
-    @headers = v
-  end
-
-  # Reset the headers hash.
-  def clear_headers
-    @headers = {}
-  end
-
-  def start! #:nodoc:
-    next_urls = @next_urls
-    begin
-      tmp_n_u = {}
-      next_urls.each do |prior_url, urls|
-        urls.map do |a_url|
-          [a_url, (URI.parse(a_url) rescue nil)]
-        end.select do |a_url, parsed_url|
-          allowable_url?(a_url, parsed_url)
-        end.each do |a_url, parsed_url|
-          @setup.call(a_url) unless @setup.nil?
-          get_page(parsed_url) do |response|
-            do_callbacks(a_url, response, prior_url)
-            tmp_n_u[a_url] = generate_next_urls(a_url, response)
-          end
-          @teardown.call(a_url) unless @teardown.nil?
-        end
-      end
-      next_urls = tmp_n_u
-    end while !next_urls.empty?
-  end
-
-  def success_or_failure(code) #:nodoc:
-    if code > 199 && code < 300
-      :success
-    else
-      :failure
-    end
-  end
-
-  def allowable_url?(a_url, parsed_url) #:nodoc:
-    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
-      @url_checks.map{|url_check|url_check.call(a_url)}.all?
-  end
-
-  # True if the robots.txt for that URL allows access to it.
-  def allowed?(a_url, parsed_url) # :nodoc:
-    u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
-    begin
-      unless @robots_seen.include?(u)
-        open(u, 'User-Agent' => 'Ruby Spider',
-             'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
-          @rules.parse(u, url.read)
-        end
-        @robots_seen << u
-      end
-      @rules.allowed?(a_url)
-    rescue OpenURI::HTTPError
-      true # No robots.txt
-    rescue Exception, Timeout::Error # to keep it from crashing
-      false
-    end
-  end
-
-  def get_page(parsed_url, &block) #:nodoc:
-    @seen << parsed_url
-    begin
-      http = Net::HTTP.new(parsed_url.host, parsed_url.port)
-      http.use_ssl = parsed_url.scheme == 'https'
-      # Uses start because http.finish cannot be called.
-      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
-                                                       @headers))}
-      if r.redirect?
-        get_page(URI.parse(r['Location']), &block)
-      else
-        block.call(r)
-      end
-    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
-      p e
-      nil
-    end
-  end
-
-  def do_callbacks(a_url, resp, prior_url) #:nodoc:
-    cbs = [@callbacks[:every],
-           resp.success? ? @callbacks[:success] : @callbacks[:failure],
-           @callbacks[resp.code]]
-
-    cbs.each do |cb|
-      cb.call(a_url, resp, prior_url) if cb
-    end
-  end
-
-  def generate_next_urls(a_url, resp) #:nodoc:
-    web_page = resp.body
-    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
-                [a_url[0,a_url.rindex('/')]])[0]
-    base_url = remove_trailing_slash(base_url)
-    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
-      begin
-        parsed_link = URI.parse(link)
-        if parsed_link.fragment == '#'
-          nil
-        else
-          case parsed_link.scheme
-          when 'http'
-            link
-          when nil
-            u = URI.parse(base_url)
-            if link[0].chr == '/'
-              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
-            elsif u.path.nil? || u.path == ''
-              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
-            else
-              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
-            end
-          else
-            nil
-          end
-        end
-      rescue
-        nil
-      end
-    end.compact
-  end
-
-  def remove_trailing_slash(s) #:nodoc:
-    s.sub(%r{/*$},'')
-  end
-
-  class HeaderSetter #:nodoc:
-    def initialize(si)
-      @si = si
-    end
-    def []=(k,v)
-      @si.raw_headers = @si.raw_headers.merge({k => v})
-    end
-  end
-end
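
After this change Spider itself keeps only the start_at entry point; all rules and handlers hang off the SpiderInstance yielded to the block. A sketch of the resulting API, composed from the doc comments above (the host and output lines are illustrative):

    require 'spider'

    Spider.start_at('http://mike-burns.com/') do |s|
      # Every registered predicate must return true for a URL to be followed.
      s.add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*} }

      # Handlers trigger on :every, :success, :failure, or an HTTP status code.
      s.on(:success) { |a_url, resp, prior_url| puts a_url }
      s.on(404) { |a_url, resp, prior_url| puts "URL not found: #{a_url}" }
    end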
data/lib/spider_instance.rb ADDED
@@ -0,0 +1,290 @@
+# Copyright 2007 Mike Burns
+
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * Neither the name Mike Burns nor the
+#       names of his contributors may be used to endorse or promote products
+#       derived from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY Mike Burns ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL Mike Burns BE LIABLE FOR ANY
+# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+require 'robot_rules'
+require 'open-uri'
+require 'uri'
+require 'net/http'
+require 'net/https'
+
+module Net #:nodoc:
+  class HTTPResponse #:nodoc:
+    def success?; false; end
+    def redirect?; false; end
+  end
+  class HTTPSuccess #:nodoc:
+    def success?; true; end
+  end
+  class HTTPRedirection #:nodoc:
+    def redirect?; true; end
+  end
+end
+
+class NilClass #:nodoc:
+  def merge(h); h; end
+end
+
+class SpiderInstance
+  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
+    @url_checks = []
+    @cache = :memory
+    @callbacks = {}
+    @next_urls = next_urls
+    @seen = seen
+    @rules = rules || RobotRules.new('Ruby Spider 1.0')
+    @robots_seen = robots_seen
+    @headers = {}
+    @setup = nil
+    @teardown = nil
+  end
+
+  # Add a predicate that determines whether to continue down this URL's path.
+  # All predicates must be true in order for a URL to proceed.
+  #
+  # Takes a block that takes a string and produces a boolean. For example, this
+  # will ensure that the URL starts with 'http://mike-burns.com':
+  #
+  #  add_url_check { |a_url| a_url =~ %r{^http://mike-burns.com.*}
+  def add_url_check(&block)
+    @url_checks << block
+  end
+
+  # The Web is a graph; to avoid cycles we store the nodes (URLs) already
+  # visited. The Web is a really, really, really big graph; as such, this list
+  # of visited nodes grows really, really, really big.
+  #
+  # Change the object used to store these seen nodes with this. The default
+  # object is an instance of Array. Available with Spider is a wrapper of
+  # memcached.
+  #
+  # You can implement a custom class for this; any object passed to
+  # check_already_seen_with must understand just << and included? .
+  #
+  #  # default
+  #  check_already_seen_with Array.new
+  #
+  #  # memcached
+  #  require 'spider/included_in_memcached'
+  #  check_already_seen_with IncludedInMemcached.new('localhost:11211')
+  def check_already_seen_with(cacher)
+    if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
+      @seen = cacher
+    else
+      raise ArgumentError, 'expected something that responds to << and included?'
+    end
+  end
+
+  # Add a response handler. A response handler's trigger can be :every,
+  # :success, :failure, or any HTTP status code. The handler itself can be
+  # either a Proc or a block.
+  #
+  # The arguments to the block are: the URL as a string, an instance of
+  # Net::HTTPResponse, and the prior URL as a string.
+  #
+  #
+  # For example:
+  #
+  #  on 404 do |a_url, resp, prior_url|
+  #    puts "URL not found: #{a_url}"
+  #  end
+  #
+  #  on :success do |a_url, resp, prior_url|
+  #    puts a_url
+  #    puts resp.body
+  #  end
+  #
+  #  on :every do |a_url, resp, prior_url|
+  #    puts "Given this code: #{resp.code}"
+  #  end
+  def on(code, p = nil, &block)
+    f = p ? p : block
+    case code
+    when Fixnum
+      @callbacks[code] = f
+    else
+      @callbacks[code.to_sym] = f
+    end
+  end
+
+  # Run before the HTTP request. Given the URL as a string.
+  #  setup do |a_url|
+  #    headers['Cookies'] = 'user_id=1;admin=true'
+  #  end
+  def setup(p = nil, &block)
+    @setup = p ? p : block
+  end
+
+  # Run last, once for each page. Given the URL as a string.
+  def teardown(p = nil, &block)
+    @teardown = p ? p : block
+  end
+
+  # Use like a hash:
+  #  headers['Cookies'] = 'user_id=1;password=btrross3'
+  def headers
+    HeaderSetter.new(self)
+  end
+
+  def raw_headers #:nodoc:
+    @headers
+  end
+  def raw_headers=(v) #:nodoc:
+    @headers = v
+  end
+
+  # Reset the headers hash.
+  def clear_headers
+    @headers = {}
+  end
+
+  def start! #:nodoc:
+    next_urls = @next_urls
+    begin
+      tmp_n_u = {}
+      next_urls.each do |prior_url, urls|
+        urls.map do |a_url|
+          [a_url, (URI.parse(a_url) rescue nil)]
+        end.select do |a_url, parsed_url|
+          allowable_url?(a_url, parsed_url)
+        end.each do |a_url, parsed_url|
+          @setup.call(a_url) unless @setup.nil?
+          get_page(parsed_url) do |response|
+            do_callbacks(a_url, response, prior_url)
+            tmp_n_u[a_url] = generate_next_urls(a_url, response)
+          end
+          @teardown.call(a_url) unless @teardown.nil?
+        end
+      end
+      next_urls = tmp_n_u
+    end while !next_urls.empty?
+  end
+
+  def success_or_failure(code) #:nodoc:
+    if code > 199 && code < 300
+      :success
+    else
+      :failure
+    end
+  end
+
+  def allowable_url?(a_url, parsed_url) #:nodoc:
+    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
+      @url_checks.map{|url_check|url_check.call(a_url)}.all?
+  end
+
+  # True if the robots.txt for that URL allows access to it.
+  def allowed?(a_url, parsed_url) # :nodoc:
+    u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
+    begin
+      unless @robots_seen.include?(u)
+        open(u, 'User-Agent' => 'Ruby Spider',
+             'Accept' => 'text/html,text/xml,application/xml,text/plain') do |url|
+          @rules.parse(u, url.read)
+        end
+        @robots_seen << u
+      end
+      @rules.allowed?(a_url)
+    rescue OpenURI::HTTPError
+      true # No robots.txt
+    rescue Exception, Timeout::Error # to keep it from crashing
+      false
+    end
+  end
+
+  def get_page(parsed_url, &block) #:nodoc:
+    @seen << parsed_url
+    begin
+      http = Net::HTTP.new(parsed_url.host, parsed_url.port)
+      http.use_ssl = parsed_url.scheme == 'https'
+      # Uses start because http.finish cannot be called.
+      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri,
+                                                       @headers))}
+      if r.redirect?
+        get_page(URI.parse(r['Location']), &block)
+      else
+        block.call(r)
+      end
+    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
+      p e
+      nil
+    end
+  end
+
+  def do_callbacks(a_url, resp, prior_url) #:nodoc:
+    cbs = [@callbacks[:every],
+           resp.success? ? @callbacks[:success] : @callbacks[:failure],
+           @callbacks[resp.code]]
+
+    cbs.each do |cb|
+      cb.call(a_url, resp, prior_url) if cb
+    end
+  end
+
+  def generate_next_urls(a_url, resp) #:nodoc:
+    web_page = resp.body
+    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
+                [a_url[0,a_url.rindex('/')]])[0]
+    base_url = remove_trailing_slash(base_url)
+    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
+      begin
+        parsed_link = URI.parse(link)
+        if parsed_link.fragment == '#'
+          nil
+        else
+          case parsed_link.scheme
+          when 'http'
+            link
+          when nil
+            u = URI.parse(base_url)
+            if link[0].chr == '/'
+              "#{u.scheme}://#{u.host}:#{u.port}#{link}"
+            elsif u.path.nil? || u.path == ''
+              "#{u.scheme}://#{u.host}:#{u.port}/#{link}"
+            else
+              "#{u.scheme}://#{u.host}:#{u.port}/#{u.path}/#{link}"
+            end
+          else
+            nil
+          end
+        end
+      rescue
+        nil
+      end
+    end.compact
+  end
+
+  def remove_trailing_slash(s) #:nodoc:
+    s.sub(%r{/*$},'')
+  end
+
+  class HeaderSetter #:nodoc:
+    def initialize(si)
+      @si = si
+    end
+    def []=(k,v)
+      @si.raw_headers = @si.raw_headers.merge({k => v})
+    end
+  end
+end
+
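
Because check_already_seen_with only verifies that its argument responds to << and include?, any object with those two methods can stand in for the Array or memcached store. A hypothetical size-capped store, for bounding memory on long crawls (BoundedSeen is not part of the gem):

    class BoundedSeen
      def initialize(limit)
        @limit = limit
        @seen = []
      end

      # Drop the oldest entry once the cap is reached, then record the new one.
      def <<(v)
        @seen.shift if @seen.size >= @limit
        @seen << v
      end

      def include?(v)
        @seen.include?(v)
      end
    end

    Spider.start_at('http://mike-burns.com/') do |s|
      s.check_already_seen_with BoundedSeen.new(100_000)
    end

The trade-off: forgetting old URLs means a long crawl may revisit pages, but the seen set can no longer grow without bound.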