parallel588_polipus 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
data/examples/robots_txt_handling.rb ADDED
@@ -0,0 +1,14 @@
+ # encoding: UTF-8
+ require 'polipus'
+
+ options = {
+   user_agent: 'Googlebot', # Act as Google bot
+   obey_robots_txt: true # Follow /robots.txt rules if any
+ }
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+   crawler.on_page_downloaded do |page|
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
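
Per the defaults declared in lib/polipus.rb in this diff, obey_robots_txt is false unless set, and user_agent may also be given as an array: when the value responds to #sample, the Robotex instance is built with a randomly picked agent. A minimal sketch of that array variant, assuming the gem is installed and a Redis server is reachable with default settings (the agent strings are illustrative only):

    # encoding: UTF-8
    require 'polipus'

    options = {
      # When an array is given, a sampled element is used for robots.txt checks
      user_agent: ['Googlebot', 'Polipus-example-bot'],
      obey_robots_txt: true
    }

    Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
      crawler.on_page_downloaded do |page|
        puts "#{page.url} fetched with code #{page.code}"
      end
    end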
data/examples/survival.rb ADDED
@@ -0,0 +1,10 @@
+ # encoding: UTF-8
+ require 'polipus'
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # A nokogiri object
+     puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+   end
+ end
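
lib/polipus.rb (next) also exposes follow_links_like, skip_links_like and focus_crawl for steering which URLs get enqueued: whitelist and blacklist patterns are matched against url.path on every dequeued URL (seeds included), and the focus_crawl block replaces page.links as the source of candidate URLs. A short sketch under the same assumptions as above (gem installed, local Redis); the patterns and limits are illustrative only:

    # encoding: UTF-8
    require 'polipus'

    Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
      # Whitelist: keep the seed ('/') and anything under /gems/
      crawler.follow_links_like(/\A\/\z/, /\A\/gems\//)
      # Blacklist: never enqueue gem archives
      crawler.skip_links_like(/\.gem\z/)

      # Limit how many links each page contributes to the queue
      crawler.focus_crawl do |page|
        page.links.first(20)
      end

      crawler.on_page_downloaded do |page|
        puts "#{page.url} (depth #{page.depth})"
      end
    end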
data/lib/polipus.rb ADDED
@@ -0,0 +1,488 @@
+ # encoding: UTF-8
+ require 'redis'
+ require 'redis/connection/hiredis'
+ require 'redis-queue'
+ require 'polipus/version'
+ require 'polipus/http'
+ require 'polipus/storage'
+ require 'polipus/url_tracker'
+ require 'polipus/plugin'
+ require 'polipus/queue_overflow'
+ require 'polipus/robotex'
+ require 'polipus/signal_handler'
+ require 'thread'
+ require 'logger'
+ require 'json'
+
+ module Polipus
+   def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
+     PolipusCrawler.crawl(job_name, urls, options, &block)
+   end
+
+   class PolipusCrawler
+     OPTS = {
+       # run 4 threads
+       workers: 4,
+       # identify self as Polipus/VERSION
+       user_agent: "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
+       # by default, don't limit the depth of the crawl
+       depth_limit: false,
+       # number of times HTTP redirects will be followed
+       redirect_limit: 5,
+       # storage engine defaults to DevNull
+       storage: nil,
+       # proxy server hostname
+       proxy_host: nil,
+       # proxy server port number
+       proxy_port: false,
+       # proxy server username
+       proxy_user: nil,
+       # proxy server password
+       proxy_pass: nil,
+       # HTTP read timeout in seconds
+       read_timeout: 30,
+       # HTTP open connection timeout in seconds
+       open_timeout: 10,
+       # Time to wait for new messages on Redis
+       # After this timeout, current crawling session is marked as terminated
+       queue_timeout: 30,
+       # An URL tracker instance. default is Bloomfilter based on redis
+       url_tracker: nil,
+       # A Redis options {} that will be passed directly to Redis.new
+       redis_options: {},
+       # An instance of logger
+       logger: nil,
+       # A logger level
+       logger_level: nil,
+       # whether the query string should be included in the saved page
+       include_query_string_in_saved_page: true,
+       # Max number of items to keep on redis
+       queue_items_limit: 2_000_000,
+       # The adapter used to store exceed (queue_items_limit) redis items
+       queue_overflow_adapter: nil,
+       # Every x seconds, the main queue is checked for overflowed items
+       queue_overflow_manager_check_time: 60,
+       # If true, each page downloaded will increment a counter on redis
+       stats_enabled: false,
+       # Cookies strategy
+       cookie_jar: nil,
+       # whether or not accept cookies
+       accept_cookies: false,
+       # A set of hosts that should be considered parts of the same domain
+       # Eg It can be used to follow links with and without 'www' domain
+       domain_aliases: [],
+       # Mark a connection as staled after connection_max_hits request
+       connection_max_hits: nil,
+       # Page TTL: mark a page as expired after ttl_page seconds
+       ttl_page: nil,
+       # don't obey the robots exclusion protocol
+       obey_robots_txt: false,
+       # If true, signal handling strategy is enabled.
+       # INT and TERM signal will stop polipus gracefully
+       # Disable it if polipus will run as a part of Resque or DelayedJob-like system
+       enable_signal_handler: true
+     }
+
+     attr_reader :storage
+     attr_reader :job_name
+     attr_reader :logger
+     attr_reader :options
+     attr_reader :crawler_name
+
+     OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @options[key.to_sym] = value
+       end
+       define_method "#{key}" do
+         @options[key.to_sym]
+       end
+     end
+
+     def initialize(job_name = 'polipus', urls = [], options = {})
+       @job_name = job_name
+       @options = OPTS.merge(options)
+       @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
+       @logger = @options[:logger] ||= Logger.new(nil)
+
+       unless @logger.class.to_s == 'Log4r::Logger'
+         @logger.level = @options[:logger_level] ||= Logger::INFO
+       end
+
+       @storage = @options[:storage] ||= Storage.dev_null
+
+       @workers_pool = []
+
+       @follow_links_like = []
+       @skip_links_like = []
+       @on_page_downloaded = []
+       @on_before_save = []
+       @on_page_error = []
+       @focus_crawl_block = nil
+       @on_crawl_start = []
+       @on_crawl_end = []
+       @redis_factory = nil
+
+       @overflow_manager = nil
+       @crawler_name = `hostname`.strip + "-#{@job_name}"
+
+       @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
+
+       @urls = [urls].flatten.map { |url| URI(url) }
+       @urls.each { |url| url.path = '/' if url.path.empty? }
+       if @options[:obey_robots_txt]
+         @robots =
+           if @options[:user_agent].respond_to?(:sample)
+             Polipus::Robotex.new(@options[:user_agent].sample)
+           else
+             Polipus::Robotex.new(@options[:user_agent])
+           end
+       end
+       # Attach signal handling if enabled
+       SignalHandler.enable if @options[:enable_signal_handler]
+
+       if queue_overflow_adapter
+         @on_crawl_start << lambda do |_|
+           Thread.new do
+             Thread.current[:name] = :overflow_items_controller
+             overflow_items_controller.run
+           end
+         end
+       end
+
+       @on_crawl_end << lambda do |_|
+         Thread.list.select { |thread| thread.status && Thread.current[:name] == :overflow_items_controller }.each(&:kill)
+       end
+
+       execute_plugin 'on_initialize'
+
+       yield self if block_given?
+     end
+
+     def self.crawl(*args, &block)
+       new(*args, &block).takeover
+     end
+
+     def takeover
+       @urls.each do |u|
+         add_url(u) { |page| page.user_data.p_seeded = true }
+       end
+       return if internal_queue.empty?
+
+       @on_crawl_start.each { |e| e.call(self) }
+
+       execute_plugin 'on_crawl_start'
+       @options[:workers].times do |worker_number|
+         @workers_pool << Thread.new do
+           @logger.debug { "Start worker #{worker_number}" }
+           http = HTTP.new(@options)
+           queue = queue_factory
+           queue.process(false, @options[:queue_timeout]) do |message|
+
+             next if message.nil?
+
+             execute_plugin 'on_message_received'
+
+             page = Page.from_json message
+
+             unless should_be_visited?(page.url, false)
+               @logger.info { "[worker ##{worker_number}] Page (#{page.url}) is no more welcome." }
+               queue.commit
+               next
+             end
+
+             if page_exists? page
+               @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
+               queue.commit
+               next
+             end
+
+             url = page.url.to_s
+             @logger.debug { "[worker ##{worker_number}] Fetching page: [#{page.url}] Referer: #{page.referer} Depth: #{page.depth}" }
+
+             execute_plugin 'on_before_download'
+
+             pages = http.fetch_pages(url, page.referer, page.depth)
+             if pages.count > 1
+               rurls = pages.map { |e| e.url.to_s }.join(' --> ')
+               @logger.info { "Got redirects! #{rurls}" }
+               page = pages.pop
+               page.aliases = pages.map { |e| e.url }
+               if page_exists? page
+                 @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
+                 queue.commit
+                 next
+               end
+             else
+               page = pages.last
+             end
+
+             execute_plugin 'on_after_download'
+
+             if page.error
+               @logger.warn { "Page #{page.url} has error: #{page.error}" }
+               incr_error
+               @on_page_error.each { |e| e.call(page) }
+             end
+
+             # Execute on_before_save blocks
+             @on_before_save.each { |e| e.call(page) }
+
+             page.storable? && @storage.add(page)
+
+             @logger.debug { "[worker ##{worker_number}] Fetched page: [#{page.url}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]" }
+             @logger.info { "[worker ##{worker_number}] Page (#{page.url}) downloaded" }
+
+             incr_pages
+
+             # Execute on_page_downloaded blocks
+             @on_page_downloaded.each { |e| e.call(page) }
+
+             if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
+               links_for(page).each do |url_to_visit|
+                 next unless should_be_visited?(url_to_visit)
+                 enqueue url_to_visit, page
+               end
+             else
+               @logger.info { "[worker ##{worker_number}] Depth limit reached #{page.depth}" }
+             end
+
+             @logger.debug { "[worker ##{worker_number}] Queue size: #{queue.size}" }
+             @overflow_manager.perform if @overflow_manager && queue.empty?
+             execute_plugin 'on_message_processed'
+
+             if SignalHandler.terminated?
+               @logger.info { 'About to exit! Thanks for using Polipus' }
+               queue.commit
+               break
+             end
+             true
+           end
+         end
+       end
+
+       @workers_pool.each { |w| w.join }
+       @on_crawl_end.each { |e| e.call(self) }
+       execute_plugin 'on_crawl_end'
+     end
+
+     # A pattern or an array of patterns can be passed as argument
+     # An url will be discarded if it doesn't match patterns
+     def follow_links_like(*patterns)
+       @follow_links_like = @follow_links_like += patterns.uniq.compact
+       self
+     end
+
+     # A pattern or an array of patterns can be passed as argument
+     # An url will be discarded if it matches a pattern
+     def skip_links_like(*patterns)
+       @skip_links_like = @skip_links_like += patterns.uniq.compact
+       self
+     end
+
+     # A block of code will be executed on every page downloaded
+     # The block takes the page as argument
+     def on_page_downloaded(&block)
+       @on_page_downloaded << block
+       self
+     end
+
+     # A block of code will be executed when crawl session is over
+     def on_crawl_end(&block)
+       @on_crawl_end << block
+       self
+     end
+
+     # A block of code will be executed when crawl session is starting
+     def on_crawl_start(&block)
+       @on_crawl_start << block
+       self
+     end
+
+     # A block of code will be executed on every page downloaded
+     # before being saved in the registered storage
+     def on_before_save(&block)
+       @on_before_save << block
+       self
+     end
+
+     # A block of code will be executed whether a page contains an error
+     def on_page_error(&block)
+       @on_page_error << block
+       self
+     end
+
+     # A block of code will be executed
+     # on every page downloaded. The code is used to extract urls to visit
+     # see links_for method
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     def redis_options
+       @options[:redis_options]
+     end
+
+     def queue_size
+       internal_queue.size
+     end
+
+     def stats_reset!
+       ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each { |e| redis.del e }
+     end
+
+     def redis_factory(&block)
+       @redis_factory = block
+       self
+     end
+
+     def url_tracker
+       @url_tracker ||=
+         @options[:url_tracker] ||=
+           UrlTracker.bloomfilter(key_name: "polipus_bf_#{job_name}",
+                                  redis: redis_factory_adapter,
+                                  driver: 'lua')
+     end
+
+     def redis
+       @redis ||= redis_factory_adapter
+     end
+
+     def add_to_queue(page)
+       if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+         add_url(page.url, referer: page.referer, depth: page.depth)
+       else
+         add_url(page)
+       end
+     end
+
+     # Enqueue an url, no matter what
+     def add_url(url, params = {})
+       page = Page.new(url, params)
+       yield(page) if block_given?
+       internal_queue << page.to_json
+     end
+
+     # Request to Polipus to stop its work (gracefully)
+     # cler_queue = true if you want to delete all of the pending urls to visit
+     def stop!(cler_queue = false)
+       SignalHandler.terminate
+       internal_queue.clear(true) if cler_queue
+     end
+
+     private
+
+     # URLs enqueue policy
+     def should_be_visited?(url, with_tracker = true)
+       case
+       # robots.txt
+       when !allowed_by_robot?(url)
+         false
+       # Check against whitelist pattern matching
+       when !@follow_links_like.empty? && @follow_links_like.none? { |p| url.path =~ p }
+         false
+       # Check against blacklist pattern matching
+       when @skip_links_like.any? { |p| url.path =~ p }
+         false
+       # Page is marked as expired
+       when page_expired?(Page.new(url))
+         true
+       # Check against url tracker
+       when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/, ''))
+         false
+       else
+         true
+       end
+     end
+
+     # It extracts URLs from the page
+     def links_for(page)
+       page.domain_aliases = domain_aliases
+       @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
+     end
+
+     # whether a page is expired or not
+     def page_expired?(page)
+       return false if @options[:ttl_page].nil?
+       stored_page = @storage.get(page)
+       r = stored_page && stored_page.expired?(@options[:ttl_page])
+       @logger.debug { "Page #{page.url} marked as expired" } if r
+       r
+     end
+
+     # whether a page exists or not
+     def page_exists?(page)
+       return false if page.user_data && page.user_data.p_seeded
+       @storage.exists?(page) && !page_expired?(page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed_by_robot?(link)
+       return true if @robots.nil?
+       @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+     end
+
+     # The url is enqueued for a later visit
+     def enqueue(url_to_visit, current_page)
+       page_to_visit = Page.new(url_to_visit.to_s, referer: current_page.url.to_s, depth: current_page.depth + 1)
+       internal_queue << page_to_visit.to_json
+       to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/, '')
+       url_tracker.visit to_track
+       @logger.debug { "Added (#{url_to_visit}) to the queue" }
+     end
+
+     # It creates a redis client
+     def redis_factory_adapter
+       if @redis_factory
+         @redis_factory.call(redis_options)
+       else
+         Redis.new(redis_options)
+       end
+     end
+
+     # It creates a new distributed queue
+     def queue_factory
+       Redis::Queue.new("polipus_queue_#{@job_name}", "bp_polipus_queue_#{@job_name}", redis: redis_factory_adapter)
+     end
+
+     # If stats enabled, it increments errors found
+     def incr_error
+       redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
+     end
+
+     # If stats enabled, it increments pages downloaded
+     def incr_pages
+       redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
+     end
+
+     # It handles the overflow item policy (if any)
+     def overflow_items_controller
+       @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])
+
+       # In the time, url policy may change so policy is re-evaluated
+       @overflow_manager.url_filter do |page|
+         should_be_visited?(page.url, false)
+       end
+
+       QueueOverflow::Worker.new(@overflow_manager)
+     end
+
+     def internal_queue
+       @internal_queue ||= queue_factory
+     end
+
+     # It invokes a plugin method if any
+     def execute_plugin(method)
+       Polipus::Plugin.plugins.each do |k, p|
+         next unless p.respond_to?(method)
+         @logger.info { "Running plugin method #{method} on #{k}" }
+         ret_val = p.send(method, self)
+         instance_eval(&ret_val) if ret_val.kind_of? Proc
+       end
+     end
+   end
+ end
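
When stats_enabled is true, incr_pages and incr_error above bump the Redis keys polipus:<job_name>:pages and polipus:<job_name>:errors, and stop! asks the signal handler to terminate so each worker commits its current message and exits. A small sketch that stops a crawl after a fixed page count and then reads the counters; the default Redis connection and the 500-page cutoff are assumptions for illustration, not part of this release:

    # encoding: UTF-8
    require 'polipus'
    require 'redis'

    job = 'rubygems'

    Polipus.crawler(job, 'http://rubygems.org/', stats_enabled: true) do |crawler|
      crawler.on_page_downloaded do |page|
        # Ask for a graceful stop once 500 pages have been counted
        crawler.stop! if crawler.redis.get("polipus:#{job}:pages").to_i >= 500
      end
    end

    redis = Redis.new
    puts "pages downloaded: #{redis.get("polipus:#{job}:pages")}"
    puts "errors:           #{redis.get("polipus:#{job}:errors")}"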