parallel588_polipus 0.4.0
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
data/examples/robots_txt_handling.rb
ADDED
@@ -0,0 +1,14 @@
# encoding: UTF-8
require 'polipus'

options = {
  user_agent: 'Googlebot',  # Act as Google bot
  obey_robots_txt: true     # Follow /robots.txt rules, if any
}

Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|

  crawler.on_page_downloaded do |page|
    puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
  end
end
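The user_agent option is not limited to a single string: as the initialize code in lib/polipus.rb below shows, Polipus calls .sample on the value when building its robots.txt parser, so an array of agents also works and one is picked at random. A minimal sketch; the agent strings here are made up for illustration:

# encoding: UTF-8
require 'polipus'

options = {
  # Hypothetical agent strings, for illustration only;
  # one of them is sampled for the robots.txt parser
  user_agent: ['Googlebot', 'AcmeCrawler/1.0'],
  obey_robots_txt: true
}

Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "Fetched: #{page.url}"
  end
end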
data/examples/survival.rb
ADDED
@@ -0,0 +1,10 @@
# encoding: UTF-8
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  # In-place page processing
  crawler.on_page_downloaded do |page|
    # page.doc is a Nokogiri object
    puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
  end
end
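Beyond on_page_downloaded, the crawler exposes link-selection hooks, defined further down in lib/polipus.rb: follow_links_like and skip_links_like filter candidate URLs by pattern (matched against the URL path), while focus_crawl replaces the default link extraction entirely. A minimal sketch; the patterns are illustrative, not part of the examples shipped with the gem:

# encoding: UTF-8
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  # Whitelist/blacklist filtering on the URL path
  crawler.follow_links_like(/\/gems\//)
  crawler.skip_links_like(/\.pdf$/)

  # Return the links worth visiting (here: at most 10 per page)
  crawler.focus_crawl do |page|
    page.links.first(10)
  end

  crawler.on_page_downloaded do |page|
    puts "Fetched: #{page.url}"
  end
end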
data/lib/polipus.rb
ADDED
@@ -0,0 +1,488 @@
# encoding: UTF-8
require 'redis'
require 'redis/connection/hiredis'
require 'redis-queue'
require 'polipus/version'
require 'polipus/http'
require 'polipus/storage'
require 'polipus/url_tracker'
require 'polipus/plugin'
require 'polipus/queue_overflow'
require 'polipus/robotex'
require 'polipus/signal_handler'
require 'thread'
require 'logger'
require 'json'

module Polipus
  def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
    PolipusCrawler.crawl(job_name, urls, options, &block)
  end

  class PolipusCrawler
    OPTS = {
      # run 4 threads
      workers: 4,
      # identify self as Polipus/VERSION
      user_agent: "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
      # by default, don't limit the depth of the crawl
      depth_limit: false,
      # number of times HTTP redirects will be followed
      redirect_limit: 5,
      # storage engine defaults to DevNull
      storage: nil,
      # proxy server hostname
      proxy_host: nil,
      # proxy server port number
      proxy_port: false,
      # proxy server username
      proxy_user: nil,
      # proxy server password
      proxy_pass: nil,
      # HTTP read timeout in seconds
      read_timeout: 30,
      # HTTP open connection timeout in seconds
      open_timeout: 10,
      # Time to wait for new messages on Redis
      # After this timeout, the current crawling session is marked as terminated
      queue_timeout: 30,
      # A URL tracker instance. Default is a Redis-backed Bloom filter
      url_tracker: nil,
      # A Redis options hash that will be passed directly to Redis.new
      redis_options: {},
      # An instance of logger
      logger: nil,
      # A logger level
      logger_level: nil,
      # whether the query string should be included in the saved page
      include_query_string_in_saved_page: true,
      # Max number of items to keep on Redis
      queue_items_limit: 2_000_000,
      # The adapter used to store items that exceed queue_items_limit
      queue_overflow_adapter: nil,
      # Every x seconds, the main queue is checked for overflowed items
      queue_overflow_manager_check_time: 60,
      # If true, each page downloaded will increment a counter on Redis
      stats_enabled: false,
      # Cookies strategy
      cookie_jar: nil,
      # whether or not to accept cookies
      accept_cookies: false,
      # A set of hosts that should be considered parts of the same domain
      # E.g. it can be used to follow links with and without the 'www' prefix
      domain_aliases: [],
      # Mark a connection as stale after connection_max_hits requests
      connection_max_hits: nil,
      # Page TTL: mark a page as expired after ttl_page seconds
      ttl_page: nil,
      # don't obey the robots exclusion protocol
      obey_robots_txt: false,
      # If true, signal handling strategy is enabled.
      # INT and TERM signals will stop polipus gracefully.
      # Disable it if polipus will run as part of a Resque or DelayedJob-like system
      enable_signal_handler: true
    }

    attr_reader :storage
    attr_reader :job_name
    attr_reader :logger
    attr_reader :options
    attr_reader :crawler_name

    OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @options[key.to_sym] = value
      end
      define_method "#{key}" do
        @options[key.to_sym]
      end
    end

    def initialize(job_name = 'polipus', urls = [], options = {})
      @job_name = job_name
      @options = OPTS.merge(options)
      @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
      @logger = @options[:logger] ||= Logger.new(nil)

      unless @logger.class.to_s == 'Log4r::Logger'
        @logger.level = @options[:logger_level] ||= Logger::INFO
      end

      @storage = @options[:storage] ||= Storage.dev_null

      @workers_pool = []

      @follow_links_like = []
      @skip_links_like = []
      @on_page_downloaded = []
      @on_before_save = []
      @on_page_error = []
      @focus_crawl_block = nil
      @on_crawl_start = []
      @on_crawl_end = []
      @redis_factory = nil

      @overflow_manager = nil
      @crawler_name = `hostname`.strip + "-#{@job_name}"

      @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]

      @urls = [urls].flatten.map { |url| URI(url) }
      @urls.each { |url| url.path = '/' if url.path.empty? }
      if @options[:obey_robots_txt]
        @robots =
          if @options[:user_agent].respond_to?(:sample)
            Polipus::Robotex.new(@options[:user_agent].sample)
          else
            Polipus::Robotex.new(@options[:user_agent])
          end
      end
      # Attach signal handling if enabled
      SignalHandler.enable if @options[:enable_signal_handler]

      if queue_overflow_adapter
        @on_crawl_start << lambda do |_|
          Thread.new do
            Thread.current[:name] = :overflow_items_controller
            overflow_items_controller.run
          end
        end
      end

      @on_crawl_end << lambda do |_|
        Thread.list.select { |thread| thread.status && thread[:name] == :overflow_items_controller }.each(&:kill)
      end

      execute_plugin 'on_initialize'

      yield self if block_given?
    end

    def self.crawl(*args, &block)
      new(*args, &block).takeover
    end

    def takeover
      @urls.each do |u|
        add_url(u) { |page| page.user_data.p_seeded = true }
      end
      return if internal_queue.empty?

      @on_crawl_start.each { |e| e.call(self) }

      execute_plugin 'on_crawl_start'
      @options[:workers].times do |worker_number|
        @workers_pool << Thread.new do
          @logger.debug { "Start worker #{worker_number}" }
          http = HTTP.new(@options)
          queue = queue_factory
          queue.process(false, @options[:queue_timeout]) do |message|

            next if message.nil?

            execute_plugin 'on_message_received'

            page = Page.from_json message

            unless should_be_visited?(page.url, false)
              @logger.info { "[worker ##{worker_number}] Page (#{page.url}) is no longer welcome." }
              queue.commit
              next
            end

            if page_exists? page
              @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
              queue.commit
              next
            end

            url = page.url.to_s
            @logger.debug { "[worker ##{worker_number}] Fetching page: [#{page.url}] Referer: #{page.referer} Depth: #{page.depth}" }

            execute_plugin 'on_before_download'

            pages = http.fetch_pages(url, page.referer, page.depth)
            if pages.count > 1
              rurls = pages.map { |e| e.url.to_s }.join(' --> ')
              @logger.info { "Got redirects! #{rurls}" }
              page = pages.pop
              page.aliases = pages.map { |e| e.url }
              if page_exists? page
                @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
                queue.commit
                next
              end
            else
              page = pages.last
            end

            execute_plugin 'on_after_download'

            if page.error
              @logger.warn { "Page #{page.url} has error: #{page.error}" }
              incr_error
              @on_page_error.each { |e| e.call(page) }
            end

            # Execute on_before_save blocks
            @on_before_save.each { |e| e.call(page) }

            page.storable? && @storage.add(page)

            @logger.debug { "[worker ##{worker_number}] Fetched page: [#{page.url}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]" }
            @logger.info { "[worker ##{worker_number}] Page (#{page.url}) downloaded" }

            incr_pages

            # Execute on_page_downloaded blocks
            @on_page_downloaded.each { |e| e.call(page) }

            if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
              links_for(page).each do |url_to_visit|
                next unless should_be_visited?(url_to_visit)
                enqueue url_to_visit, page
              end
            else
              @logger.info { "[worker ##{worker_number}] Depth limit reached #{page.depth}" }
            end

            @logger.debug { "[worker ##{worker_number}] Queue size: #{queue.size}" }
            @overflow_manager.perform if @overflow_manager && queue.empty?
            execute_plugin 'on_message_processed'

            if SignalHandler.terminated?
              @logger.info { 'About to exit! Thanks for using Polipus' }
              queue.commit
              break
            end
            true
          end
        end
      end

      @workers_pool.each { |w| w.join }
      @on_crawl_end.each { |e| e.call(self) }
      execute_plugin 'on_crawl_end'
    end

    # A pattern or an array of patterns can be passed as argument
    # A URL will be discarded unless it matches one of the patterns
    def follow_links_like(*patterns)
      @follow_links_like += patterns.uniq.compact
      self
    end

    # A pattern or an array of patterns can be passed as argument
    # A URL will be discarded if it matches a pattern
    def skip_links_like(*patterns)
      @skip_links_like += patterns.uniq.compact
      self
    end

    # A block of code will be executed on every page downloaded
    # The block takes the page as argument
    def on_page_downloaded(&block)
      @on_page_downloaded << block
      self
    end

    # A block of code will be executed when the crawl session is over
    def on_crawl_end(&block)
      @on_crawl_end << block
      self
    end

    # A block of code will be executed when the crawl session is starting
    def on_crawl_start(&block)
      @on_crawl_start << block
      self
    end

    # A block of code will be executed on every page downloaded
    # before it is saved in the registered storage
    def on_before_save(&block)
      @on_before_save << block
      self
    end

    # A block of code will be executed whenever a page contains an error
    def on_page_error(&block)
      @on_page_error << block
      self
    end

    # A block of code will be executed on every page downloaded.
    # The code is used to extract the urls to visit
    # see the links_for method
    def focus_crawl(&block)
      @focus_crawl_block = block
      self
    end

    def redis_options
      @options[:redis_options]
    end

    def queue_size
      internal_queue.size
    end

    def stats_reset!
      ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each { |e| redis.del e }
    end

    def redis_factory(&block)
      @redis_factory = block
      self
    end

    def url_tracker
      @url_tracker ||=
        @options[:url_tracker] ||=
          UrlTracker.bloomfilter(key_name: "polipus_bf_#{job_name}",
                                 redis: redis_factory_adapter,
                                 driver: 'lua')
    end

    def redis
      @redis ||= redis_factory_adapter
    end

    def add_to_queue(page)
      if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
        add_url(page.url, referer: page.referer, depth: page.depth)
      else
        add_url(page)
      end
    end

    # Enqueue a URL, no matter what
    def add_url(url, params = {})
      page = Page.new(url, params)
      yield(page) if block_given?
      internal_queue << page.to_json
    end

    # Request Polipus to stop its work (gracefully)
    # clear_queue = true if you want to delete all of the pending urls to visit
    def stop!(clear_queue = false)
      SignalHandler.terminate
      internal_queue.clear(true) if clear_queue
    end

    private

    # URL enqueue policy
    def should_be_visited?(url, with_tracker = true)
      case
      # robots.txt
      when !allowed_by_robot?(url)
        false
      # Check against whitelist pattern matching
      when !@follow_links_like.empty? && @follow_links_like.none? { |p| url.path =~ p }
        false
      # Check against blacklist pattern matching
      when @skip_links_like.any? { |p| url.path =~ p }
        false
      # Page is marked as expired
      when page_expired?(Page.new(url))
        true
      # Check against url tracker
      when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/, ''))
        false
      else
        true
      end
    end

    # It extracts URLs from the page
    def links_for(page)
      page.domain_aliases = domain_aliases
      @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
    end

    # whether a page is expired or not
    def page_expired?(page)
      return false if @options[:ttl_page].nil?
      stored_page = @storage.get(page)
      r = stored_page && stored_page.expired?(@options[:ttl_page])
      @logger.debug { "Page #{page.url} marked as expired" } if r
      r
    end

    # whether a page exists or not
    def page_exists?(page)
      return false if page.user_data && page.user_data.p_seeded
      @storage.exists?(page) && !page_expired?(page)
    end

    #
    # Returns +true+ if we are obeying robots.txt and the link
    # is granted access in it. Always returns +true+ when we are
    # not obeying robots.txt.
    #
    def allowed_by_robot?(link)
      return true if @robots.nil?
      @options[:obey_robots_txt] ? @robots.allowed?(link) : true
    end

    # The url is enqueued for a later visit
    def enqueue(url_to_visit, current_page)
      page_to_visit = Page.new(url_to_visit.to_s, referer: current_page.url.to_s, depth: current_page.depth + 1)
      internal_queue << page_to_visit.to_json
      to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/, '')
      url_tracker.visit to_track
      @logger.debug { "Added (#{url_to_visit}) to the queue" }
    end

    # It creates a redis client
    def redis_factory_adapter
      if @redis_factory
        @redis_factory.call(redis_options)
      else
        Redis.new(redis_options)
      end
    end

    # It creates a new distributed queue
    def queue_factory
      Redis::Queue.new("polipus_queue_#{@job_name}", "bp_polipus_queue_#{@job_name}", redis: redis_factory_adapter)
    end

    # If stats are enabled, it increments the error counter
    def incr_error
      redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
    end

    # If stats are enabled, it increments the pages counter
    def incr_pages
      redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
    end

    # It handles the overflow item policy (if any)
    def overflow_items_controller
      @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])

      # Over time, the URL policy may change, so the policy is re-evaluated
      @overflow_manager.url_filter do |page|
        should_be_visited?(page.url, false)
      end

      QueueOverflow::Worker.new(@overflow_manager)
    end

    def internal_queue
      @internal_queue ||= queue_factory
    end

    # It invokes a plugin method if any
    def execute_plugin(method)
      Polipus::Plugin.plugins.each do |k, p|
        next unless p.respond_to?(method)
        @logger.info { "Running plugin method #{method} on #{k}" }
        ret_val = p.send(method, self)
        instance_eval(&ret_val) if ret_val.kind_of? Proc
      end
    end
  end
end
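When stats_enabled is true, incr_pages and incr_error above keep per-job counters under the Redis keys polipus:<job_name>:pages and polipus:<job_name>:errors, and stats_reset! deletes them. A minimal sketch of reading them back at the end of a run, assuming a reachable Redis on the default port:

# encoding: UTF-8
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/', stats_enabled: true) do |crawler|
  crawler.on_crawl_end do |instance|
    # Key names mirror incr_pages / incr_error above
    pages  = instance.redis.get("polipus:#{instance.job_name}:pages").to_i
    errors = instance.redis.get("polipus:#{instance.job_name}:errors").to_i
    puts "Downloaded #{pages} pages (#{errors} errors)"
    instance.stats_reset!  # clear the counters for the next run
  end
end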