polipus 0.2.2 → 0.3.0
- checksums.yaml +8 -8
- data/AUTHORS.md +4 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +2 -2
- data/examples/error_handling.rb +22 -0
- data/examples/robots_txt_handling.rb +13 -0
- data/lib/polipus.rb +54 -21
- data/lib/polipus/http.rb +29 -14
- data/lib/polipus/page.rb +15 -6
- data/lib/polipus/robotex.rb +154 -0
- data/lib/polipus/version.rb +1 -1
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/gzipped_on.yml +80 -70
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4091 -7461
- data/spec/http_spec.rb +32 -2
- data/spec/page_spec.rb +19 -0
- data/spec/polipus_spec.rb +18 -0
- data/spec/robotex_spec.rb +86 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -4
- data/AUTHORS +0 -2
- data/README.rdoc +0 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    ZTc3MjQ1OWQwNzVhMWFhMGQ2NTdlYjM3ZTkyZDQ3ZDAwZDExZWQ1Mw==
   data.tar.gz: !binary |-
-
+    OTUzYTE5M2U4YTQ3ZGVmZTAzMzdiYjJmZWYzM2Q3MTU0NDMyYzAwMQ==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    MDk5MGQ2MzBkYzU2MjJlNDg1YTkwYTU1YjJjYWQ0YjAyNDY5OTZkNWJlZDIw
+    NDAwNjY2ZjMwMGUxZWE0NTNiNzc5YmIzZTg2NjcwNjFjZTMyNzIxZjZlYzZm
+    N2ZjMTk2ZjRkYjU0M2VjZDk0NWMxYzk0MjE4MWRkOWFiY2M3YTA=
   data.tar.gz: !binary |-
-
-
-
+    OTgwMTI5MWFhNWQ5Mjk4OWNmZTk3ZGE0MTMyYzM5NDlkMWJhMjFiMWQ4NDQ4
+    OGI1NDU3ZDQ0ZTkzNWFkMzAyZjg3YmRiNDlmN2I0ZDNlNWRlZmVkMjIzMWQ2
+    MGY0NGQ4YTQ1ZmEyMGQ0M2VkNzE2YzIyOGMxOGE4MDQzMWFkZjU=
data/AUTHORS.md
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,20 @@
+# Changelog
+
+## 0.3.0 (2015-06-02)
+
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+
+* Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+  [#24](https://github.com/taganaka/polipus/pull/24)
+* Introduce new block `PolipusCrawler#on_page_error` which runs when there was an error (`Page#error`).
+  For example a connectivity error.
+  See `/examples/error_handling.rb`
+  [#15](https://github.com/taganaka/polipus/issues/15)
+* Add `Page#success?` which returns true if HTTP code is something in between 200 and 206.
+* Polipus supports now `robots.txt` directives.
+  Set the option `:obey_robots_txt` to `true`.
+  See `/examples/robots_txt_handling.rb`
+  [#30](https://github.com/taganaka/polipus/pull/30)
+* Add support for GZIP and deflate compressed HTTP requests
+  [#26](https://github.com/taganaka/polipus/pull/26)
+* Minor improvements to code style
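Taken together, the 0.3.0 additions cover error handling, re-queuing, robots.txt and compressed responses. A minimal sketch combining them, based on the examples shipped in this release (the collection name, seed URL and user agent below are placeholders, not taken from the gem):

require 'polipus'

options = {
  obey_robots_txt: true,      # new in 0.3.0: honour /robots.txt
  user_agent: 'MyCrawler/1.0' # placeholder user agent
}

Polipus.crawler('sketch', 'http://example.com/', options) do |crawler|
  # New in 0.3.0: runs only when Page#error is set (e.g. a connectivity error)
  crawler.on_page_error do |page|
    page.storable = false      # don't persist the failed fetch
    crawler.add_to_queue(page) # retry it later (also new in 0.3.0)
  end

  crawler.on_page_downloaded do |page|
    next unless page.success?  # new helper: HTTP code in 200..206
    puts "#{page.url} -> #{page.code}"
  end
end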
data/Gemfile
CHANGED
@@ -1,3 +1,3 @@
-source
+source 'https://rubygems.org'
 
-gemspec
+gemspec
data/examples/error_handling.rb
ADDED
@@ -0,0 +1,22 @@
+require 'polipus'
+
+Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+  # Handle connectivity errors
+  # Only runs when there is an error
+  crawler.on_page_error do |page|
+    # Don't store the page
+    page.storable = false
+    # Add the URL again to the queue
+    crawler.add_to_queue(page)
+  end
+
+  # In-place page processing
+  # Runs also when there was an error in the page
+  crawler.on_page_downloaded do |page|
+    # Skip block if there is an error
+    return if page.error
+
+    # A nokogiri object
+    puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+  end
+end
data/examples/robots_txt_handling.rb
ADDED
@@ -0,0 +1,13 @@
+require 'polipus'
+
+options = {
+  user_agent: 'Googlebot', # Act as Google bot
+  obey_robots_txt: true # Follow /robots.txt rules if any
+}
+
+Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+  crawler.on_page_downloaded do |page|
+    puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+  end
+end
data/lib/polipus.rb
CHANGED
@@ -8,6 +8,7 @@ require "polipus/storage"
 require "polipus/url_tracker"
 require "polipus/plugin"
 require "polipus/queue_overflow"
+require "polipus/robotex"
 require "thread"
 require "logger"
 require "json"
@@ -62,6 +63,7 @@ module Polipus
       :stats_enabled => false,
       # Cookies strategy
       :cookie_jar => nil,
+      # whether or not accept cookies
       :accept_cookies => false,
       # A set of hosts that should be considered parts of the same domain
       # Eg It can be used to follow links with and without 'www' domain
@@ -69,7 +71,9 @@ module Polipus
       # Mark a connection as staled after connection_max_hits request
       :connection_max_hits => nil,
       # Page TTL: mark a page as expired after ttl_page seconds
-      :ttl_page => nil
+      :ttl_page => nil,
+      # don't obey the robots exclusion protocol
+      :obey_robots_txt => false
     }
 
     attr_reader :storage
@@ -110,6 +114,7 @@ module Polipus
       @skip_links_like = []
      @on_page_downloaded = []
      @on_before_save = []
+      @on_page_error = []
      @focus_crawl_block = nil
      @on_crawl_end = []
      @redis_factory = nil
@@ -122,8 +127,8 @@ module Polipus
 
      @urls = [urls].flatten.map{ |url| URI(url) }
      @urls.each{ |url| url.path = '/' if url.path.empty? }
-
      @internal_queue = queue_factory
+      @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
 
      execute_plugin 'on_initialize'
 
@@ -139,14 +144,10 @@ module Polipus
      PolipusSignalHandler.enable
      overflow_items_controller if queue_overflow_adapter
 
-      q = queue_factory
      @urls.each do |u|
-        page
-        page.user_data.p_seeded = true
-        q << page.to_json
+        add_url(u) { |page| page.user_data.p_seeded = true }
      end
-
-      return if q.empty?
+      return if @internal_queue.empty?
 
      execute_plugin 'on_crawl_start'
      @options[:workers].times do |worker_number|
@@ -194,27 +195,28 @@ module Polipus
            page = pages.last
          end
 
-          # Execute on_before_save blocks
-          @on_before_save.each {|e| e.call(page)} unless page.nil?
          execute_plugin 'on_after_download'
 
-
+          if page.error
+            @logger.warn {"Page #{page.url} has error: #{page.error}"}
+            incr_error
+            @on_page_error.each {|e| e.call(page)}
+          end
 
-
+          # Execute on_before_save blocks
+          @on_before_save.each {|e| e.call(page)}
 
-          if page
+          if page.storable?
            @storage.add page
          end
 
-
-
-
-          end
-
+          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+          @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+
          incr_pages
 
          # Execute on_page_downloaded blocks
-          @on_page_downloaded.each {|e| e.call(page)}
+          @on_page_downloaded.each {|e| e.call(page)}
 
          if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
            links_for(page).each do |url_to_visit|
@@ -264,6 +266,7 @@ module Polipus
      self
    end
 
+    # A block of code will be executed when crawl session is over
    def on_crawl_end(&block)
      @on_crawl_end << block
      self
@@ -276,6 +279,12 @@ module Polipus
      self
    end
 
+    # A block of code will be executed whether a page contains an error
+    def on_page_error(&block)
+      @on_page_error << block
+      self
+    end
+
    # A block of code will be executed
    # on every page downloaded. The code is used to extract urls to visit
    # see links_for method
@@ -313,9 +322,18 @@ module Polipus
      @redis ||= redis_factory_adapter
    end
 
+    def add_to_queue(page)
+      if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+        add_url(page.url, referer: page.referer, depth: page.depth)
+      else
+        add_url(page)
+      end
+    end
+
    # Enqueue an url, no matter what
-    def add_url
-      page = Page.new(url)
+    def add_url(url, params = {})
+      page = Page.new(url, params)
+      yield(page) if block_given?
      @internal_queue << page.to_json
    end
 
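A hedged sketch of how the reworked queue helpers might be called from user code (the collection name and URLs are placeholders; the block form of `add_url` is the same one the seeding change above relies on):

require 'polipus'

Polipus.crawler('sketch', 'http://example.com/') do |crawler|
  crawler.on_page_downloaded do |page|
    # add_url now accepts :referer/:depth params and an optional block
    crawler.add_url('http://example.com/extra', referer: page.url.to_s, depth: page.depth + 1) do |extra|
      extra.user_data.p_manual = true # flag the page before it is serialized to the queue
    end
  end

  crawler.on_page_error do |page|
    # add_to_queue re-enqueues anything that responds to url/referer/depth
    crawler.add_to_queue(page)
  end
end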
@@ -329,7 +347,11 @@ module Polipus
    private
    # URLs enqueue policy
    def should_be_visited?(url, with_tracker = true)
+
      case
+      # robots.txt
+      when !allowed_by_robot?(url)
+        false
      # Check against whitelist pattern matching
      when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
        false
@@ -368,6 +390,17 @@ module Polipus
      @storage.exists?(page) && !page_expired?(page)
    end
 
+    #
+    # Returns +true+ if we are obeying robots.txt and the link
+    # is granted access in it. Always returns +true+ when we are
+    # not obeying robots.txt.
+    #
+    def allowed_by_robot?(link)
+      return true if @robots.nil?
+      @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+    end
+
+
    # The url is enqueued for a later visit
    def enqueue url_to_visit, current_page, queue
      page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
data/lib/polipus/http.rb
CHANGED
@@ -7,6 +7,21 @@ module Polipus
  class HTTP
    # Maximum number of redirects to follow on each get_response
    REDIRECT_LIMIT = 5
+    RESCUABLE_ERRORS = [
+      EOFError,
+      Errno::ECONNREFUSED,
+      Errno::ECONNRESET,
+      Errno::EHOSTUNREACH,
+      Errno::EINVAL,
+      Errno::EPIPE,
+      Errno::ETIMEDOUT,
+      Net::HTTPBadResponse,
+      Net::HTTPHeaderSyntaxError,
+      Net::ProtocolError,
+      SocketError,
+      Timeout::Error,
+      Zlib::DataError
+    ]
 
    def initialize(opts = {})
      @connections = {}
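The constant replaces the old bare `rescue` clauses, so only known network-level failures are swallowed. A self-contained sketch of the splat-rescue idiom it enables (the error list here is deliberately abbreviated, not the gem's full constant):

require 'socket'
require 'timeout'

# Abbreviated list for illustration only
NETWORK_ERRORS = [Errno::ECONNREFUSED, Errno::ETIMEDOUT, SocketError, Timeout::Error]

def fetch
  yield
rescue *NETWORK_ERRORS => e
  # Listed errors are handled; anything else (NoMethodError, etc.) still raises
  warn "fetch failed: #{e.class}: #{e.message}"
  nil
end

fetch { TCPSocket.new('localhost', 1) } # most likely raises Errno::ECONNREFUSED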
@@ -30,13 +45,8 @@ module Polipus
      url = URI(url)
      pages = []
      get(url, referer) do |response, code, location, redirect_to, response_time|
-
-
-          gzip = Zlib::GzipReader.new(StringIO.new(body))
-          body = gzip.read
-        end
-
-        pages << Page.new(location, :body => body,
+        handle_compression response
+        pages << Page.new(location, :body => response.body,
          :code => code,
          :headers => response.to_hash,
          :referer => referer,
@@ -47,13 +57,13 @@ module Polipus
      end
 
      pages
-    rescue
+    rescue *RESCUABLE_ERRORS => e
      if verbose?
        puts e.inspect
        puts e.backtrace
      end
 
-      [Page.new(url, :
+      [Page.new(url, error: e, referer: referer, depth: depth)]
    end
 
    #
@@ -154,7 +164,7 @@ module Polipus
      opts['User-Agent'] = user_agent if user_agent
      opts['Referer'] = referer.to_s if referer
      opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
-      opts['Accept-Encoding'] = 'gzip'
+      opts['Accept-Encoding'] = 'gzip,deflate'
 
 
      retries = 0
@@ -169,8 +179,7 @@ module Polipus
        response_time = ((finish - start) * 1000).round
        cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
        return response, response_time
-      rescue
-
+      rescue *RESCUABLE_ERRORS => e
        puts e.inspect if verbose?
        refresh_connection(url)
        retries += 1
@@ -229,8 +238,14 @@ module Polipus
      to_url.host.nil? || (to_url.host == from_url.host)
    end
 
-    def
-
+    def handle_compression response
+      case response["content-encoding"]
+      when "gzip", "x-gzip"
+        body_io = StringIO.new(response.body)
+        response.body.replace Zlib::GzipReader.new(body_io).read
+      when "deflate"
+        response.body.replace Zlib::Inflate.inflate(response.body)
+      end
    end
 
  end
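The same decompression logic can be exercised in isolation. A sketch assuming a minimal response stand-in (`FakeResponse` is not part of the gem; `Zlib.gzip` needs Ruby 2.4+), round-tripping gzip and deflate bodies:

require 'zlib'
require 'stringio'

# Minimal stand-in for the pieces of Net::HTTPResponse the method touches
FakeResponse = Struct.new(:body, :content_encoding) do
  def [](key)
    key == 'content-encoding' ? content_encoding : nil
  end
end

def handle_compression(response)
  case response['content-encoding']
  when 'gzip', 'x-gzip'
    body_io = StringIO.new(response.body)
    response.body.replace Zlib::GzipReader.new(body_io).read
  when 'deflate'
    response.body.replace Zlib::Inflate.inflate(response.body)
  end
end

html = '<html><title>hello</title></html>'

gzip_response = FakeResponse.new(Zlib.gzip(html), 'gzip')
handle_compression(gzip_response)
puts gzip_response.body == html # => true

deflate_response = FakeResponse.new(Zlib::Deflate.deflate(html), 'deflate')
handle_compression(deflate_response)
puts deflate_response.body == html # => true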
data/lib/polipus/page.rb
CHANGED
@@ -17,8 +17,7 @@ module Polipus
    attr_reader :error
    # Integer response code of the page
    attr_accessor :code
-    # Depth of this page from the root of the crawl.
-    # shortest path; use PageStore#shortest_paths! to find that value.
+    # Depth of this page from the root of the crawl.
    attr_accessor :depth
    # URL of the page that brought us to this page
    attr_accessor :referer
@@ -41,7 +40,7 @@ module Polipus
    # Create a new page
    #
    def initialize(url, params = {})
-      @url =
+      @url = URI(url)
      @code = params[:code]
      @headers = params[:headers] || {}
      @headers['content-type'] ||= ['']
@@ -130,6 +129,14 @@ module Polipus
      (300..307).include?(@code)
    end
 
+    #
+    # Returns +true+ if the page is a HTTP success, returns +false+
+    # otherwise.
+    #
+    def success?
+      (200..206).include?(@code)
+    end
+
    #
    # Returns +true+ if the page was not found (returned 404 code),
    # returns +false+ otherwise.
@@ -192,7 +199,8 @@ module Polipus
      'response_time' => @response_time,
      'fetched' => @fetched,
      'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
-      'fetched_at' => @fetched_at
+      'fetched_at' => @fetched_at,
+      'error' => @error
    }
  end
 
@@ -230,7 +238,8 @@ module Polipus
      '@response_time' => hash['response_time'].to_i,
      '@fetched' => hash['fetched'],
      '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
-      '@fetched_at' => hash['fetched_at']
+      '@fetched_at' => hash['fetched_at'],
+      '@error' => hash['error']
    }.each do |var, value|
      page.instance_variable_set(var, value)
    end
@@ -242,4 +251,4 @@ module Polipus
      self.from_hash hash
    end
  end
-end
+end
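A hedged sketch of the new Page behavior, assuming `Page#initialize` also stores the `:error` param (as the `get_pages` rescue above implies) and that the `to_hash`/`from_hash` keys match this diff:

require 'polipus'

ok   = Polipus::Page.new('http://example.com/', code: 200)
down = Polipus::Page.new('http://example.com/down', error: 'connection refused')

puts ok.success?    # => true  (code in 200..206)
puts ok.redirect?   # => false
puts down.success?  # => false (no code was set)

# 0.3.0 serializes the error too, so it survives the queue round trip
restored = Polipus::Page.from_hash(down.to_hash)
puts restored.error # => "connection refused"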
data/lib/polipus/robotex.rb
ADDED
@@ -0,0 +1,154 @@
+require 'open-uri'
+require 'uri'
+require 'timeout'
+module Polipus
+
+  # Original code taken from
+  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
+
+  class Robotex
+
+    DEFAULT_TIMEOUT = 3
+    VERSION = '1.0.0'
+
+    attr_reader :user_agent
+
+    class ParsedRobots
+
+      def initialize(uri, user_agent)
+        io = Robotex.get_robots_txt(uri, user_agent)
+        if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+          io = StringIO.new("User-agent: *\nAllow: /\n")
+        end
+
+        @disallows = {}
+        @allows = {}
+        @delays = {}
+        agent = /.*/
+        io.each do |line|
+          next if line =~ /^\s*(#.*|$)/
+          arr = line.split(":")
+          key = arr.shift
+          value = arr.join(":").strip
+          value.strip!
+          case key.downcase
+          when "user-agent"
+            agent = to_regex(value)
+          when "allow"
+            unless value.empty?
+              @allows[agent] ||= []
+              @allows[agent] << to_regex(value)
+            end
+          when "disallow"
+            unless value.empty?
+              @disallows[agent] ||= []
+              @disallows[agent] << to_regex(value)
+            end
+          when "crawl-delay"
+            @delays[agent] = value.to_i
+          end
+        end
+        @parsed = true
+      end
+
+      def allowed?(uri, user_agent)
+        return true unless @parsed
+        allowed = true
+        uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+        path = uri.request_uri
+
+        @allows.each do |key, value|
+          unless allowed
+            if user_agent =~ key
+              value.each do |rule|
+                if path =~ rule
+                  allowed = true
+                end
+              end
+            end
+          end
+        end
+
+        @disallows.each do |key, value|
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = false
+              end
+            end
+          end
+        end
+
+        return allowed
+      end
+
+      def delay(user_agent)
+        @delays.each do |agent, delay|
+          return delay if agent =~ user_agent
+        end
+        nil
+      end
+
+      protected
+
+      def to_regex(pattern)
+        pattern = Regexp.escape(pattern)
+        pattern.gsub!(Regexp.escape("*"), ".*")
+        Regexp.compile("^#{pattern}")
+      end
+    end
+
+    def self.get_robots_txt(uri, user_agent)
+      begin
+        Timeout::timeout(Robotex.timeout) do
+          URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+        end
+      rescue Timeout::Error
+        STDERR.puts "robots.txt request timed out"
+      end
+    end
+
+    def self.timeout=(t)
+      @timeout = t
+    end
+
+    def self.timeout
+      @timeout || DEFAULT_TIMEOUT
+    end
+
+    def initialize(user_agent = nil)
+      user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+      @user_agent = user_agent
+      @last_accessed = Time.at(1)
+      @parsed = {}
+    end
+
+    def parse_host(uri)
+      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+      @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+    end
+
+    #
+    # Download the server's robots.txt, and return try if we are allowed to acces the url, false otherwise
+    #
+    def allowed?(uri)
+      parse_host(uri).allowed?(uri, @user_agent)
+    end
+
+    #
+    # Return the value of the Crawl-Delay directive, or nil if none
+    def delay(uri)
+      parse_host(uri).delay(@user_agent)
+    end
+
+    #
+    # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+    #
+    def delay!(uri)
+      delay = delay(uri)
+      sleep delay - (Time.now - @last_accessed) if !!delay
+      @last_accessed = Time.now
+    end
+
+  end
+end
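The vendored class can also be used on its own, outside the crawler. A short hedged sketch (the user agent and URL are placeholders):

require 'polipus'
require 'polipus/robotex'

robotex = Polipus::Robotex.new('MyBot/1.0')
url = 'http://example.com/some/page'

if robotex.allowed?(url)
  robotex.delay!(url) # sleeps if robots.txt declares a Crawl-Delay
  puts "fetching #{url}"
else
  puts "#{url} is disallowed by robots.txt"
end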