polipus 0.2.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/AUTHORS.md +4 -0
- data/CHANGELOG.md +20 -0
- data/Gemfile +2 -2
- data/examples/error_handling.rb +22 -0
- data/examples/robots_txt_handling.rb +13 -0
- data/lib/polipus.rb +54 -21
- data/lib/polipus/http.rb +29 -14
- data/lib/polipus/page.rb +15 -6
- data/lib/polipus/robotex.rb +154 -0
- data/lib/polipus/version.rb +1 -1
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/gzipped_on.yml +80 -70
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4091 -7461
- data/spec/http_spec.rb +32 -2
- data/spec/page_spec.rb +19 -0
- data/spec/polipus_spec.rb +18 -0
- data/spec/robotex_spec.rb +86 -0
- data/spec/spec_helper.rb +1 -0
- metadata +15 -4
- data/AUTHORS +0 -2
- data/README.rdoc +0 -3
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
 ---
 !binary "U0hBMQ==":
   metadata.gz: !binary |-
-
+    ZTc3MjQ1OWQwNzVhMWFhMGQ2NTdlYjM3ZTkyZDQ3ZDAwZDExZWQ1Mw==
   data.tar.gz: !binary |-
-
+    OTUzYTE5M2U4YTQ3ZGVmZTAzMzdiYjJmZWYzM2Q3MTU0NDMyYzAwMQ==
 SHA512:
   metadata.gz: !binary |-
-
-
-
+    MDk5MGQ2MzBkYzU2MjJlNDg1YTkwYTU1YjJjYWQ0YjAyNDY5OTZkNWJlZDIw
+    NDAwNjY2ZjMwMGUxZWE0NTNiNzc5YmIzZTg2NjcwNjFjZTMyNzIxZjZlYzZm
+    N2ZjMTk2ZjRkYjU0M2VjZDk0NWMxYzk0MjE4MWRkOWFiY2M3YTA=
   data.tar.gz: !binary |-
-
-
-
+    OTgwMTI5MWFhNWQ5Mjk4OWNmZTk3ZGE0MTMyYzM5NDlkMWJhMjFiMWQ4NDQ4
+    OGI1NDU3ZDQ0ZTkzNWFkMzAyZjg3YmRiNDlmN2I0ZDNlNWRlZmVkMjIzMWQ2
+    MGY0NGQ4YTQ1ZmEyMGQ0M2VkNzE2YzIyOGMxOGE4MDQzMWFkZjU=
data/AUTHORS.md
ADDED
data/CHANGELOG.md
ADDED
@@ -0,0 +1,20 @@
+# Changelog
+
+## 0.3.0 (2015-06-02)
+
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
+
+* Add `PolipusCrawler#add_to_queue` to add a page back to the queue
+  [#24](https://github.com/taganaka/polipus/pull/24)
+* Introduce new block `PolipusCrawler#on_page_error` which runs when there was an error (`Page#error`).
+  For example a connectivity error.
+  See `/examples/error_handling.rb`
+  [#15](https://github.com/taganaka/polipus/issues/15)
+* Add `Page#success?` which returns true if HTTP code is something in between 200 and 206.
+* Polipus supports now `robots.txt` directives.
+  Set the option `:obey_robots_txt` to `true`.
+  See `/examples/robots_txt_handling.rb`
+  [#30](https://github.com/taganaka/polipus/pull/30)
+* Add support for GZIP and deflate compressed HTTP requests
+  [#26](https://github.com/taganaka/polipus/pull/26)
+* Minor improvements to code style
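
For a quick feel of the new `Page#success?` helper announced above, here is a minimal sketch (not part of the diff; the seed URL and output are illustrative):

```ruby
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  crawler.on_page_downloaded do |page|
    # Page#success? is true when the HTTP code is within 200..206
    next unless page.success?
    puts "#{page.url} fetched (code #{page.code})"
  end
end
```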
data/Gemfile
CHANGED
@@ -1,3 +1,3 @@
-source
+source 'https://rubygems.org'
 
-gemspec
+gemspec
data/examples/error_handling.rb
ADDED
@@ -0,0 +1,22 @@
+require 'polipus'
+
+Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+  # Handle connectivity errors
+  # Only runs when there is an error
+  crawler.on_page_error do |page|
+    # Don't store the page
+    page.storable = false
+    # Add the URL again to the queue
+    crawler.add_to_queue(page)
+  end
+
+  # In-place page processing
+  # Runs also when there was an error in the page
+  crawler.on_page_downloaded do |page|
+    # Skip block if there is an error
+    return if page.error
+
+    # A nokogiri object
+    puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+  end
+end
data/examples/robots_txt_handling.rb
ADDED
@@ -0,0 +1,13 @@
+require 'polipus'
+
+options = {
+  user_agent: 'Googlebot',  # Act as Google bot
+  obey_robots_txt: true     # Follow /robots.txt rules if any
+}
+
+Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+  crawler.on_page_downloaded do |page|
+    puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+  end
+end
data/lib/polipus.rb
CHANGED
@@ -8,6 +8,7 @@ require "polipus/storage"
 require "polipus/url_tracker"
 require "polipus/plugin"
 require "polipus/queue_overflow"
+require "polipus/robotex"
 require "thread"
 require "logger"
 require "json"
@@ -62,6 +63,7 @@ module Polipus
       :stats_enabled => false,
       # Cookies strategy
       :cookie_jar => nil,
+      # whether or not accept cookies
       :accept_cookies => false,
       # A set of hosts that should be considered parts of the same domain
       # Eg It can be used to follow links with and without 'www' domain
@@ -69,7 +71,9 @@ module Polipus
       # Mark a connection as staled after connection_max_hits request
       :connection_max_hits => nil,
       # Page TTL: mark a page as expired after ttl_page seconds
-      :ttl_page => nil
+      :ttl_page => nil,
+      # don't obey the robots exclusion protocol
+      :obey_robots_txt => false
     }

     attr_reader :storage
@@ -110,6 +114,7 @@ module Polipus
       @skip_links_like = []
       @on_page_downloaded = []
       @on_before_save = []
+      @on_page_error = []
       @focus_crawl_block = nil
       @on_crawl_end = []
       @redis_factory = nil
@@ -122,8 +127,8 @@ module Polipus

       @urls = [urls].flatten.map{ |url| URI(url) }
       @urls.each{ |url| url.path = '/' if url.path.empty? }
-
       @internal_queue = queue_factory
+      @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]

       execute_plugin 'on_initialize'

@@ -139,14 +144,10 @@ module Polipus
       PolipusSignalHandler.enable
       overflow_items_controller if queue_overflow_adapter

-      q = queue_factory
       @urls.each do |u|
-        page
-        page.user_data.p_seeded = true
-        q << page.to_json
+        add_url(u) { |page| page.user_data.p_seeded = true }
       end
-
-      return if q.empty?
+      return if @internal_queue.empty?

       execute_plugin 'on_crawl_start'
       @options[:workers].times do |worker_number|
@@ -194,27 +195,28 @@ module Polipus
             page = pages.last
           end

-          # Execute on_before_save blocks
-          @on_before_save.each {|e| e.call(page)} unless page.nil?
           execute_plugin 'on_after_download'

-
+          if page.error
+            @logger.warn {"Page #{page.url} has error: #{page.error}"}
+            incr_error
+            @on_page_error.each {|e| e.call(page)}
+          end

-
+          # Execute on_before_save blocks
+          @on_before_save.each {|e| e.call(page)}

-          if page
+          if page.storable?
             @storage.add page
           end

-
-
-
-          end
-
+          @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+          @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+
           incr_pages

           # Execute on_page_downloaded blocks
-          @on_page_downloaded.each {|e| e.call(page)}
+          @on_page_downloaded.each {|e| e.call(page)}

           if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
             links_for(page).each do |url_to_visit|
@@ -264,6 +266,7 @@ module Polipus
       self
     end

+    # A block of code will be executed when crawl session is over
     def on_crawl_end(&block)
       @on_crawl_end << block
       self
@@ -276,6 +279,12 @@ module Polipus
       self
     end

+    # A block of code will be executed whether a page contains an error
+    def on_page_error(&block)
+      @on_page_error << block
+      self
+    end
+
     # A block of code will be executed
     # on every page downloaded. The code is used to extract urls to visit
     # see links_for method
@@ -313,9 +322,18 @@ module Polipus
       @redis ||= redis_factory_adapter
     end

+    def add_to_queue(page)
+      if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+        add_url(page.url, referer: page.referer, depth: page.depth)
+      else
+        add_url(page)
+      end
+    end
+
     # Enqueue an url, no matter what
-    def add_url
-      page = Page.new(url)
+    def add_url(url, params = {})
+      page = Page.new(url, params)
+      yield(page) if block_given?
       @internal_queue << page.to_json
     end

@@ -329,7 +347,11 @@ module Polipus
     private
     # URLs enqueue policy
     def should_be_visited?(url, with_tracker = true)
+
       case
+      # robots.txt
+      when !allowed_by_robot?(url)
+        false
       # Check against whitelist pattern matching
       when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
         false
@@ -368,6 +390,17 @@ module Polipus
       @storage.exists?(page) && !page_expired?(page)
     end

+    #
+    # Returns +true+ if we are obeying robots.txt and the link
+    # is granted access in it. Always returns +true+ when we are
+    # not obeying robots.txt.
+    #
+    def allowed_by_robot?(link)
+      return true if @robots.nil?
+      @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+    end
+
+
     # The url is enqueued for a later visit
     def enqueue url_to_visit, current_page, queue
       page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
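
The reworked `add_url` now takes params and an optional block, and the new `add_to_queue` re-enqueues a `Page` while preserving its url, referer and depth. A rough usage sketch (not code from the gem; the extra seed URL and the `custom_seed` field are made up for illustration):

```ruby
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  # Re-enqueue pages that failed with a connectivity error
  crawler.on_page_error do |page|
    crawler.add_to_queue(page)  # keeps url, referer and depth
  end

  # add_url accepts extra params and yields the Page before it is enqueued
  crawler.add_url('http://rubygems.org/gems', depth: 0) do |page|
    page.user_data.custom_seed = true
  end
end
```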
data/lib/polipus/http.rb
CHANGED
@@ -7,6 +7,21 @@ module Polipus
   class HTTP
     # Maximum number of redirects to follow on each get_response
     REDIRECT_LIMIT = 5
+    RESCUABLE_ERRORS = [
+      EOFError,
+      Errno::ECONNREFUSED,
+      Errno::ECONNRESET,
+      Errno::EHOSTUNREACH,
+      Errno::EINVAL,
+      Errno::EPIPE,
+      Errno::ETIMEDOUT,
+      Net::HTTPBadResponse,
+      Net::HTTPHeaderSyntaxError,
+      Net::ProtocolError,
+      SocketError,
+      Timeout::Error,
+      Zlib::DataError
+    ]

     def initialize(opts = {})
       @connections = {}
@@ -30,13 +45,8 @@ module Polipus
       url = URI(url)
       pages = []
       get(url, referer) do |response, code, location, redirect_to, response_time|
-
-
-          gzip = Zlib::GzipReader.new(StringIO.new(body))
-          body = gzip.read
-        end
-
-        pages << Page.new(location, :body => body,
+        handle_compression response
+        pages << Page.new(location, :body => response.body,
          :code => code,
          :headers => response.to_hash,
          :referer => referer,
@@ -47,13 +57,13 @@ module Polipus
       end

       pages
-    rescue
+    rescue *RESCUABLE_ERRORS => e
       if verbose?
         puts e.inspect
         puts e.backtrace
       end

-      [Page.new(url, :
+      [Page.new(url, error: e, referer: referer, depth: depth)]
     end

     #
@@ -154,7 +164,7 @@ module Polipus
       opts['User-Agent'] = user_agent if user_agent
       opts['Referer'] = referer.to_s if referer
       opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
-      opts['Accept-Encoding'] = 'gzip'
+      opts['Accept-Encoding'] = 'gzip,deflate'


       retries = 0
@@ -169,8 +179,7 @@ module Polipus
         response_time = ((finish - start) * 1000).round
         cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
         return response, response_time
-      rescue
-
+      rescue *RESCUABLE_ERRORS => e
         puts e.inspect if verbose?
         refresh_connection(url)
         retries += 1
@@ -229,8 +238,14 @@ module Polipus
       to_url.host.nil? || (to_url.host == from_url.host)
     end

-    def
-
+    def handle_compression response
+      case response["content-encoding"]
+      when "gzip", "x-gzip"
+        body_io = StringIO.new(response.body)
+        response.body.replace Zlib::GzipReader.new(body_io).read
+      when "deflate"
+        response.body.replace Zlib::Inflate.inflate(response.body)
+      end
     end

   end
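
The new `handle_compression` inflates gzip or deflate bodies in place before the `Page` is built. A standalone sketch of the same decoding logic with plain `Net::HTTP` (the URL is illustrative and this is not code from the gem):

```ruby
require 'net/http'
require 'stringio'
require 'zlib'

uri = URI('http://rubygems.org/')
request = Net::HTTP::Get.new(uri)
# Setting the header ourselves disables Net::HTTP's transparent decompression,
# which is the situation Polipus handles
request['Accept-Encoding'] = 'gzip,deflate'

response = Net::HTTP.start(uri.host, uri.port) { |http| http.request(request) }

body = response.body
case response['content-encoding']
when 'gzip', 'x-gzip'
  body = Zlib::GzipReader.new(StringIO.new(body)).read
when 'deflate'
  body = Zlib::Inflate.inflate(body)
end

puts "Decoded #{body.bytesize} bytes"
```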
data/lib/polipus/page.rb
CHANGED
@@ -17,8 +17,7 @@ module Polipus
     attr_reader :error
     # Integer response code of the page
     attr_accessor :code
-    # Depth of this page from the root of the crawl.
-    # shortest path; use PageStore#shortest_paths! to find that value.
+    # Depth of this page from the root of the crawl.
     attr_accessor :depth
     # URL of the page that brought us to this page
     attr_accessor :referer
@@ -41,7 +40,7 @@ module Polipus
     # Create a new page
     #
     def initialize(url, params = {})
-      @url =
+      @url = URI(url)
       @code = params[:code]
       @headers = params[:headers] || {}
       @headers['content-type'] ||= ['']
@@ -130,6 +129,14 @@ module Polipus
       (300..307).include?(@code)
     end

+    #
+    # Returns +true+ if the page is a HTTP success, returns +false+
+    # otherwise.
+    #
+    def success?
+      (200..206).include?(@code)
+    end
+
     #
     # Returns +true+ if the page was not found (returned 404 code),
     # returns +false+ otherwise.
@@ -192,7 +199,8 @@ module Polipus
         'response_time' => @response_time,
         'fetched' => @fetched,
         'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
-        'fetched_at' => @fetched_at
+        'fetched_at' => @fetched_at,
+        'error' => @error
       }
     end

@@ -230,7 +238,8 @@ module Polipus
         '@response_time' => hash['response_time'].to_i,
         '@fetched' => hash['fetched'],
         '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
-        '@fetched_at' => hash['fetched_at']
+        '@fetched_at' => hash['fetched_at'],
+        '@error' => hash['error']
       }.each do |var, value|
         page.instance_variable_set(var, value)
       end
@@ -242,4 +251,4 @@ module Polipus
       self.from_hash hash
     end
   end
-end
+end
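
`Page` now exposes `success?` and carries `error` through `to_hash`/`from_hash`, so an error can survive a round trip through the queue. A rough sketch, assuming `Page.new` accepts an `:error` param (as the HTTP layer above passes) and that the `to_json`/`from_json` helpers behave as in 0.2.2 (values are illustrative):

```ruby
require 'polipus'

page = Polipus::Page.new('http://rubygems.org/', code: 200, depth: 1)
puts page.success?  # => true, 200 falls within 200..206

failed = Polipus::Page.new('http://rubygems.org/down', error: 'connection refused')
restored = Polipus::Page.from_json(failed.to_json)
puts restored.error # => "connection refused", the error survives serialization
```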
data/lib/polipus/robotex.rb
ADDED
@@ -0,0 +1,154 @@
+require 'open-uri'
+require 'uri'
+require 'timeout'
+module Polipus
+
+  # Original code taken from
+  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
+
+  class Robotex
+
+    DEFAULT_TIMEOUT = 3
+    VERSION = '1.0.0'
+
+    attr_reader :user_agent
+
+    class ParsedRobots
+
+      def initialize(uri, user_agent)
+        io = Robotex.get_robots_txt(uri, user_agent)
+        if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
+          io = StringIO.new("User-agent: *\nAllow: /\n")
+        end
+
+        @disallows = {}
+        @allows = {}
+        @delays = {}
+        agent = /.*/
+        io.each do |line|
+          next if line =~ /^\s*(#.*|$)/
+          arr = line.split(":")
+          key = arr.shift
+          value = arr.join(":").strip
+          value.strip!
+          case key.downcase
+          when "user-agent"
+            agent = to_regex(value)
+          when "allow"
+            unless value.empty?
+              @allows[agent] ||= []
+              @allows[agent] << to_regex(value)
+            end
+          when "disallow"
+            unless value.empty?
+              @disallows[agent] ||= []
+              @disallows[agent] << to_regex(value)
+            end
+          when "crawl-delay"
+            @delays[agent] = value.to_i
+          end
+        end
+        @parsed = true
+      end
+
+      def allowed?(uri, user_agent)
+        return true unless @parsed
+        allowed = true
+        uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+        path = uri.request_uri
+
+        @allows.each do |key, value|
+          unless allowed
+            if user_agent =~ key
+              value.each do |rule|
+                if path =~ rule
+                  allowed = true
+                end
+              end
+            end
+          end
+        end
+
+        @disallows.each do |key, value|
+          if user_agent =~ key
+            value.each do |rule|
+              if path =~ rule
+                allowed = false
+              end
+            end
+          end
+        end
+
+        return allowed
+      end
+
+      def delay(user_agent)
+        @delays.each do |agent, delay|
+          return delay if agent =~ user_agent
+        end
+        nil
+      end
+
+      protected
+
+      def to_regex(pattern)
+        pattern = Regexp.escape(pattern)
+        pattern.gsub!(Regexp.escape("*"), ".*")
+        Regexp.compile("^#{pattern}")
+      end
+    end
+
+    def self.get_robots_txt(uri, user_agent)
+      begin
+        Timeout::timeout(Robotex.timeout) do
+          URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
+        end
+      rescue Timeout::Error
+        STDERR.puts "robots.txt request timed out"
+      end
+    end
+
+    def self.timeout=(t)
+      @timeout = t
+    end
+
+    def self.timeout
+      @timeout || DEFAULT_TIMEOUT
+    end
+
+    def initialize(user_agent = nil)
+      user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
+      @user_agent = user_agent
+      @last_accessed = Time.at(1)
+      @parsed = {}
+    end
+
+    def parse_host(uri)
+      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
+      @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
+    end
+
+    #
+    # Download the server's robots.txt, and return try if we are allowed to acces the url, false otherwise
+    #
+    def allowed?(uri)
+      parse_host(uri).allowed?(uri, @user_agent)
+    end
+
+    #
+    # Return the value of the Crawl-Delay directive, or nil if none
+    def delay(uri)
+      parse_host(uri).delay(@user_agent)
+    end
+
+    #
+    # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
+    #
+    def delay!(uri)
+      delay = delay(uri)
+      sleep delay - (Time.now - @last_accessed) if !!delay
+      @last_accessed = Time.now
+    end

+  end
+end
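
To close, a short sketch of using the vendored `Robotex` class directly, outside the crawler (the user agent and URL are illustrative):

```ruby
require 'polipus'

robotex = Polipus::Robotex.new('MyCrawler/1.0')
url = 'http://rubygems.org/gems'

if robotex.allowed?(url)
  robotex.delay!(url)  # sleeps to honour a Crawl-Delay directive, if any
  puts "Fetching #{url}"
else
  puts "Blocked by robots.txt: #{url}"
end

puts robotex.delay(url).inspect  # Crawl-Delay value for this host, or nil
```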