parallel588_polipus 0.4.0

Files changed (72)
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
data/examples/robots_txt_handling.rb ADDED
@@ -0,0 +1,14 @@
+ # encoding: UTF-8
+ require 'polipus'
+
+ options = {
+   user_agent: 'Googlebot', # Act as Google bot
+   obey_robots_txt: true # Follow /robots.txt rules if any
+ }
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
+
+   crawler.on_page_downloaded do |page|
+     puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
+   end
+ end
data/examples/survival.rb ADDED
@@ -0,0 +1,10 @@
+ # encoding: UTF-8
+ require 'polipus'
+
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # A nokogiri object
+     puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+   end
+ end
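
The two examples above show the smallest possible setup. As a rough sketch (the path patterns below are illustrative assumptions, not part of the gem), the same crawl can be narrowed with the follow_links_like / skip_links_like filters defined in lib/polipus.rb further down:

    require 'polipus'

    Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
      # Patterns are matched against each candidate URL's path before it is enqueued
      crawler.follow_links_like(/\/gems\//)  # whitelist pattern (illustrative)
      crawler.skip_links_like(/\.json$/)     # blacklist pattern (illustrative)

      crawler.on_page_downloaded do |page|
        puts "Fetched #{page.url} (HTTP #{page.code})"
      end
    end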
data/lib/polipus.rb ADDED
@@ -0,0 +1,488 @@
+ # encoding: UTF-8
+ require 'redis'
+ require 'redis/connection/hiredis'
+ require 'redis-queue'
+ require 'polipus/version'
+ require 'polipus/http'
+ require 'polipus/storage'
+ require 'polipus/url_tracker'
+ require 'polipus/plugin'
+ require 'polipus/queue_overflow'
+ require 'polipus/robotex'
+ require 'polipus/signal_handler'
+ require 'thread'
+ require 'logger'
+ require 'json'
+
+ module Polipus
+   def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
+     PolipusCrawler.crawl(job_name, urls, options, &block)
+   end
+
+   class PolipusCrawler
+     OPTS = {
+       # run 4 threads
+       workers: 4,
+       # identify self as Polipus/VERSION
+       user_agent: "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
+       # by default, don't limit the depth of the crawl
+       depth_limit: false,
+       # number of times HTTP redirects will be followed
+       redirect_limit: 5,
+       # storage engine defaults to DevNull
+       storage: nil,
+       # proxy server hostname
+       proxy_host: nil,
+       # proxy server port number
+       proxy_port: false,
+       # proxy server username
+       proxy_user: nil,
+       # proxy server password
+       proxy_pass: nil,
+       # HTTP read timeout in seconds
+       read_timeout: 30,
+       # HTTP open connection timeout in seconds
+       open_timeout: 10,
+       # Time to wait for new messages on Redis
+       # After this timeout, current crawling session is marked as terminated
+       queue_timeout: 30,
+       # An URL tracker instance. default is Bloomfilter based on redis
+       url_tracker: nil,
+       # A Redis options {} that will be passed directly to Redis.new
+       redis_options: {},
+       # An instance of logger
+       logger: nil,
+       # A logger level
+       logger_level: nil,
+       # whether the query string should be included in the saved page
+       include_query_string_in_saved_page: true,
+       # Max number of items to keep on redis
+       queue_items_limit: 2_000_000,
+       # The adapter used to store exceed (queue_items_limit) redis items
+       queue_overflow_adapter: nil,
+       # Every x seconds, the main queue is checked for overflowed items
+       queue_overflow_manager_check_time: 60,
+       # If true, each page downloaded will increment a counter on redis
+       stats_enabled: false,
+       # Cookies strategy
+       cookie_jar: nil,
+       # whether or not accept cookies
+       accept_cookies: false,
+       # A set of hosts that should be considered parts of the same domain
+       # Eg It can be used to follow links with and without 'www' domain
+       domain_aliases: [],
+       # Mark a connection as staled after connection_max_hits request
+       connection_max_hits: nil,
+       # Page TTL: mark a page as expired after ttl_page seconds
+       ttl_page: nil,
+       # don't obey the robots exclusion protocol
+       obey_robots_txt: false,
+       # If true, signal handling strategy is enabled.
+       # INT and TERM signal will stop polipus gracefully
+       # Disable it if polipus will run as a part of Resque or DelayedJob-like system
+       enable_signal_handler: true
+     }
+
+     attr_reader :storage
+     attr_reader :job_name
+     attr_reader :logger
+     attr_reader :options
+     attr_reader :crawler_name
+
+     OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @options[key.to_sym] = value
+       end
+       define_method "#{key}" do
+         @options[key.to_sym]
+       end
+     end
+
+     def initialize(job_name = 'polipus', urls = [], options = {})
+       @job_name = job_name
+       @options = OPTS.merge(options)
+       @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
+       @logger = @options[:logger] ||= Logger.new(nil)
+
+       unless @logger.class.to_s == 'Log4r::Logger'
+         @logger.level = @options[:logger_level] ||= Logger::INFO
+       end
+
+       @storage = @options[:storage] ||= Storage.dev_null
+
+       @workers_pool = []
+
+       @follow_links_like = []
+       @skip_links_like = []
+       @on_page_downloaded = []
+       @on_before_save = []
+       @on_page_error = []
+       @focus_crawl_block = nil
+       @on_crawl_start = []
+       @on_crawl_end = []
+       @redis_factory = nil
+
+       @overflow_manager = nil
+       @crawler_name = `hostname`.strip + "-#{@job_name}"
+
+       @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
+
+       @urls = [urls].flatten.map { |url| URI(url) }
+       @urls.each { |url| url.path = '/' if url.path.empty? }
+       if @options[:obey_robots_txt]
+         @robots =
+           if @options[:user_agent].respond_to?(:sample)
+             Polipus::Robotex.new(@options[:user_agent].sample)
+           else
+             Polipus::Robotex.new(@options[:user_agent])
+           end
+       end
+       # Attach signal handling if enabled
+       SignalHandler.enable if @options[:enable_signal_handler]
+
+       if queue_overflow_adapter
+         @on_crawl_start << lambda do |_|
+           Thread.new do
+             Thread.current[:name] = :overflow_items_controller
+             overflow_items_controller.run
+           end
+         end
+       end
+
+       @on_crawl_end << lambda do |_|
+         Thread.list.select { |thread| thread.status && Thread.current[:name] == :overflow_items_controller }.each(&:kill)
+       end
+
+       execute_plugin 'on_initialize'
+
+       yield self if block_given?
+     end
+
+     def self.crawl(*args, &block)
+       new(*args, &block).takeover
+     end
+
+     def takeover
+       @urls.each do |u|
+         add_url(u) { |page| page.user_data.p_seeded = true }
+       end
+       return if internal_queue.empty?
+
+       @on_crawl_start.each { |e| e.call(self) }
+
+       execute_plugin 'on_crawl_start'
+       @options[:workers].times do |worker_number|
+         @workers_pool << Thread.new do
+           @logger.debug { "Start worker #{worker_number}" }
+           http = HTTP.new(@options)
+           queue = queue_factory
+           queue.process(false, @options[:queue_timeout]) do |message|
+
+             next if message.nil?
+
+             execute_plugin 'on_message_received'
+
+             page = Page.from_json message
+
+             unless should_be_visited?(page.url, false)
+               @logger.info { "[worker ##{worker_number}] Page (#{page.url}) is no more welcome." }
+               queue.commit
+               next
+             end
+
+             if page_exists? page
+               @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
+               queue.commit
+               next
+             end
+
+             url = page.url.to_s
+             @logger.debug { "[worker ##{worker_number}] Fetching page: [#{page.url}] Referer: #{page.referer} Depth: #{page.depth}" }
+
+             execute_plugin 'on_before_download'
+
+             pages = http.fetch_pages(url, page.referer, page.depth)
+             if pages.count > 1
+               rurls = pages.map { |e| e.url.to_s }.join(' --> ')
+               @logger.info { "Got redirects! #{rurls}" }
+               page = pages.pop
+               page.aliases = pages.map { |e| e.url }
+               if page_exists? page
+                 @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
+                 queue.commit
+                 next
+               end
+             else
+               page = pages.last
+             end
+
+             execute_plugin 'on_after_download'
+
+             if page.error
+               @logger.warn { "Page #{page.url} has error: #{page.error}" }
+               incr_error
+               @on_page_error.each { |e| e.call(page) }
+             end
+
+             # Execute on_before_save blocks
+             @on_before_save.each { |e| e.call(page) }
+
+             page.storable? && @storage.add(page)
+
+             @logger.debug { "[worker ##{worker_number}] Fetched page: [#{page.url}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]" }
+             @logger.info { "[worker ##{worker_number}] Page (#{page.url}) downloaded" }
+
+             incr_pages
+
+             # Execute on_page_downloaded blocks
+             @on_page_downloaded.each { |e| e.call(page) }
+
+             if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
+               links_for(page).each do |url_to_visit|
+                 next unless should_be_visited?(url_to_visit)
+                 enqueue url_to_visit, page
+               end
+             else
+               @logger.info { "[worker ##{worker_number}] Depth limit reached #{page.depth}" }
+             end
+
+             @logger.debug { "[worker ##{worker_number}] Queue size: #{queue.size}" }
+             @overflow_manager.perform if @overflow_manager && queue.empty?
+             execute_plugin 'on_message_processed'
+
+             if SignalHandler.terminated?
+               @logger.info { 'About to exit! Thanks for using Polipus' }
+               queue.commit
+               break
+             end
+             true
+           end
+         end
+       end
+
+       @workers_pool.each { |w| w.join }
+       @on_crawl_end.each { |e| e.call(self) }
+       execute_plugin 'on_crawl_end'
+     end
+
+     # A pattern or an array of patterns can be passed as argument
+     # An url will be discarded if it doesn't match patterns
+     def follow_links_like(*patterns)
+       @follow_links_like = @follow_links_like += patterns.uniq.compact
+       self
+     end
+
+     # A pattern or an array of patterns can be passed as argument
+     # An url will be discarded if it matches a pattern
+     def skip_links_like(*patterns)
+       @skip_links_like = @skip_links_like += patterns.uniq.compact
+       self
+     end
+
+     # A block of code will be executed on every page downloaded
+     # The block takes the page as argument
+     def on_page_downloaded(&block)
+       @on_page_downloaded << block
+       self
+     end
+
+     # A block of code will be executed when crawl session is over
+     def on_crawl_end(&block)
+       @on_crawl_end << block
+       self
+     end
+
+     # A block of code will be executed when crawl session is starting
+     def on_crawl_start(&block)
+       @on_crawl_start << block
+       self
+     end
+
+     # A block of code will be executed on every page downloaded
+     # before being saved in the registered storage
+     def on_before_save(&block)
+       @on_before_save << block
+       self
+     end
+
+     # A block of code will be executed whether a page contains an error
+     def on_page_error(&block)
+       @on_page_error << block
+       self
+     end
+
+     # A block of code will be executed
+     # on every page downloaded. The code is used to extract urls to visit
+     # see links_for method
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     def redis_options
+       @options[:redis_options]
+     end
+
+     def queue_size
+       internal_queue.size
+     end
+
+     def stats_reset!
+       ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each { |e| redis.del e }
+     end
+
+     def redis_factory(&block)
+       @redis_factory = block
+       self
+     end
+
+     def url_tracker
+       @url_tracker ||=
+         @options[:url_tracker] ||=
+           UrlTracker.bloomfilter(key_name: "polipus_bf_#{job_name}",
+                                  redis: redis_factory_adapter,
+                                  driver: 'lua')
+     end
+
+     def redis
+       @redis ||= redis_factory_adapter
+     end
+
+     def add_to_queue(page)
+       if [:url, :referer, :depth].all? { |method| page.respond_to?(method) }
+         add_url(page.url, referer: page.referer, depth: page.depth)
+       else
+         add_url(page)
+       end
+     end
+
+     # Enqueue an url, no matter what
+     def add_url(url, params = {})
+       page = Page.new(url, params)
+       yield(page) if block_given?
+       internal_queue << page.to_json
+     end
+
+     # Request to Polipus to stop its work (gracefully)
+     # cler_queue = true if you want to delete all of the pending urls to visit
+     def stop!(cler_queue = false)
+       SignalHandler.terminate
+       internal_queue.clear(true) if cler_queue
+     end
+
+     private
+
+     # URLs enqueue policy
+     def should_be_visited?(url, with_tracker = true)
+       case
+       # robots.txt
+       when !allowed_by_robot?(url)
+         false
+       # Check against whitelist pattern matching
+       when !@follow_links_like.empty? && @follow_links_like.none? { |p| url.path =~ p }
+         false
+       # Check against blacklist pattern matching
+       when @skip_links_like.any? { |p| url.path =~ p }
+         false
+       # Page is marked as expired
+       when page_expired?(Page.new(url))
+         true
+       # Check against url tracker
+       when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/, ''))
+         false
+       else
+         true
+       end
+     end
+
+     # It extracts URLs from the page
+     def links_for(page)
+       page.domain_aliases = domain_aliases
+       @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
+     end
+
+     # whether a page is expired or not
+     def page_expired?(page)
+       return false if @options[:ttl_page].nil?
+       stored_page = @storage.get(page)
+       r = stored_page && stored_page.expired?(@options[:ttl_page])
+       @logger.debug { "Page #{page.url} marked as expired" } if r
+       r
+     end
+
+     # whether a page exists or not
+     def page_exists?(page)
+       return false if page.user_data && page.user_data.p_seeded
+       @storage.exists?(page) && !page_expired?(page)
+     end
+
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed_by_robot?(link)
+       return true if @robots.nil?
+       @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+     end
+
+     # The url is enqueued for a later visit
+     def enqueue(url_to_visit, current_page)
+       page_to_visit = Page.new(url_to_visit.to_s, referer: current_page.url.to_s, depth: current_page.depth + 1)
+       internal_queue << page_to_visit.to_json
+       to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/, '')
+       url_tracker.visit to_track
+       @logger.debug { "Added (#{url_to_visit}) to the queue" }
+     end
+
+     # It creates a redis client
+     def redis_factory_adapter
+       if @redis_factory
+         @redis_factory.call(redis_options)
+       else
+         Redis.new(redis_options)
+       end
+     end
+
+     # It creates a new distributed queue
+     def queue_factory
+       Redis::Queue.new("polipus_queue_#{@job_name}", "bp_polipus_queue_#{@job_name}", redis: redis_factory_adapter)
+     end
+
+     # If stats enabled, it increments errors found
+     def incr_error
+       redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
+     end
+
+     # If stats enabled, it increments pages downloaded
+     def incr_pages
+       redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
+     end
+
+     # It handles the overflow item policy (if any)
+     def overflow_items_controller
+       @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])
+
+       # In the time, url policy may change so policy is re-evaluated
+       @overflow_manager.url_filter do |page|
+         should_be_visited?(page.url, false)
+       end
+
+       QueueOverflow::Worker.new(@overflow_manager)
+     end
+
+     def internal_queue
+       @internal_queue ||= queue_factory
+     end
+
+     # It invokes a plugin method if any
+     def execute_plugin(method)
+       Polipus::Plugin.plugins.each do |k, p|
+         next unless p.respond_to?(method)
+         @logger.info { "Running plugin method #{method} on #{k}" }
+         ret_val = p.send(method, self)
+         instance_eval(&ret_val) if ret_val.kind_of? Proc
+       end
+     end
+   end
+ end
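
For reference, here is a rough sketch of how several of the options and callbacks defined in this file fit together. The option values, the focus_crawl pattern, and the Redis settings below are illustrative assumptions, not defaults shipped with the gem:

    require 'polipus'

    options = {
      workers: 8,                      # more crawler threads than the default 4
      depth_limit: 3,                  # stop enqueuing links beyond this depth
      queue_timeout: 20,               # seconds to wait on Redis before the session ends
      stats_enabled: true,             # keep pages/errors counters on Redis
      redis_options: { host: 'localhost', db: 1 }
    }

    Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
      # focus_crawl decides which extracted links are considered for enqueuing
      crawler.focus_crawl do |page|
        page.links.select { |url| url.path =~ /\/gems\// }
      end

      crawler.on_before_save do |page|
        # runs on every downloaded page before it reaches the configured storage
      end

      crawler.on_page_downloaded do |page|
        puts "#{page.url} downloaded; queue size is now #{crawler.queue_size}"
      end

      crawler.on_crawl_end do |_|
        puts 'Crawl finished'
      end
    end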