polipus 0.0.1

Files changed (55)
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
checksums.yaml ADDED
@@ -0,0 +1,15 @@
+ ---
+ !binary "U0hBMQ==":
+   metadata.gz: !binary |-
+     MjAwM2Q2OWZlZDc3ODkxMzE0YzZjY2UzNzcwODBmZDQ0NDdkYWY4Mg==
+   data.tar.gz: !binary |-
+     ZjUxNGNhY2RmODI3ZWIzNGQzODkwZjgwZjU3Njk1Njk2OTIwYWRjMw==
+ !binary "U0hBNTEy":
+   metadata.gz: !binary |-
+     YTU3N2U4YmJjZjZjNzhiZTE2ZjIwY2YwOTc1MjNkMTVmMzdjZmY1NTlhZWQ4
+     MTFjZGIyYjNmY2IwMmM1ZjFiZjFjZGE3NjA4ZWJjNDEyOTA4MmM5MDU4NDYx
+     NTI1NGYwNzgwNzExNWI5NzBkMTY5ZDJiMmYzYmE2ZWNmMjIxNjU=
+   data.tar.gz: !binary |-
+     NWQyYjAyYzZiNDE4ZDM5ZmJlNWI1YjVlNTU3Mzk5ZGI3MDRhYTc2YTFlOWE1
+     ZTk3YTc4ZTZjYWJkZDg1OTgzYTA0MjYwYzBlOTEwOWQyMDY3YTg0YTAxZjg5
+     ZDYyN2QwNzU0MDk0NWMwZDQ4NjczZDFhMzk0MTVkNGY2ODk3YWE=
data/.document ADDED
@@ -0,0 +1,5 @@
+ lib/**/*.rb
+ bin/*
+ -
+ features/**/*.feature
+ LICENSE.txt
data/.gitignore ADDED
@@ -0,0 +1,53 @@
+ # rcov generated
+ coverage
+ coverage.data
+
+ # rdoc generated
+ rdoc
+
+ # yard generated
+ doc
+ .yardoc
+
+ # bundler
+ .bundle
+
+ # jeweler generated
+ pkg
+
+ # Have editor/IDE/OS specific files you need to ignore? Consider using a global gitignore:
+ #
+ # * Create a file at ~/.gitignore
+ # * Include files you want ignored
+ # * Run: git config --global core.excludesfile ~/.gitignore
+ #
+ # After doing this, these files will be ignored in all your git projects,
+ # saving you from having to 'pollute' every project you touch with them
+ #
+ # Not sure what needs to be ignored for particular editors/OSes? Here's some ideas to get you started. (Remember, remove the leading # of the line)
+ #
+ # For MacOS:
+ #
+ .DS_Store
+
+ # For TextMate
+ #*.tmproj
+ #tmtags
+
+ # For emacs:
+ #*~
+ #\#*
+ #.\#*
+
+ # For vim:
+ #*.swp
+
+ # For redcar:
+ #.redcar
+
+ # For rubinius:
+ #*.rbc
+
+ Gemfile.lock
+
+ my_test/
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,3 @@
+ source "http://rubygems.org"
+
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2013 Francesco Laurita
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,55 @@
+ # Polipus #
+
+ A distributed web crawler written in Ruby, backed by Redis.
+ This project was presented at RubyDay 2013:
+ http://www.slideshare.net/francescolaurita/roll-your-own-web-crawler-rubyday
+
+ ## Features ##
+
+ * Easy to use
+ * Distributed and scalable
+ * It uses a smart/fast and space-efficient probabilistic data structure to determine whether a URL should be visited or not
+ * It doesn't exhaust your Redis server
+ * Plays nicely with MongoDB, even though it is not strictly required
+ * Easy to write your own page storage strategy
+ * Focus crawling made easy
+ * Heavily inspired by Anemone https://github.com/chriskite/anemone/
+
+ ## Survival code example
+
+ ```ruby
+ require "polipus"
+
+ Polipus.crawler("rubygems","http://rubygems.org/") do |crawler|
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # A nokogiri object
+     puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+   end
+ end
+ ```
+
+ ## Installation
+
+     $ gem install polipus
+
+ ## Testing
+
+     $ bundle install
+     $ rake
+
+ ## Contributing to polipus ##
+
+ * Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet.
+ * Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it.
+ * Fork the project.
+ * Start a feature/bugfix branch.
+ * Commit and push until you are happy with your contribution.
+ * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+ * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or it is otherwise necessary, that is fine, but please isolate it to its own commit so I can cherry-pick around it.
+
+ ## Copyright ##
+
+ Copyright (c) 2013 Francesco Laurita. See LICENSE.txt for
+ further details.
+
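Editorial sketch: the feature list above claims that writing your own page storage strategy is easy. As a rough, hypothetical illustration (not one of the gem's files), the snippet below defines an in-memory store that responds to the calls lib/polipus.rb makes on its storage object later in this diff: `exists?(page)`, `add(page)` and the `include_query_string_in_uuid` setter. The real contract is defined in data/lib/polipus/storage/base.rb, whose contents are not shown here, so additional methods may be required.

```ruby
require "polipus"

# Hypothetical storage adapter that keeps pages in a Hash keyed by URL.
# The method names mirror what lib/polipus.rb calls on @storage; the actual
# Polipus::Storage::Base interface may expect more than this.
class MemoryStore
  attr_accessor :include_query_string_in_uuid

  def initialize
    @pages = {}
  end

  def exists?(page)
    @pages.key?(page.url.to_s)
  end

  def add(page)
    @pages[page.url.to_s] = page
  end
end

Polipus.crawler("rubygems", "http://rubygems.org/", :storage => MemoryStore.new) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "Stored #{page.url}"
  end
end
```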
data/README.rdoc ADDED
@@ -0,0 +1,3 @@
+ = polipus
+
+ Visit https://github.com/taganaka/polipus for further details.
data/Rakefile ADDED
@@ -0,0 +1,9 @@
+ require "bundler/gem_tasks"
+ require "rspec/core/rake_task"
+
+ RSpec::Core::RakeTask.new(:spec) do |spec|
+   spec.pattern = 'spec/*_spec.rb'
+ end
+
+ task :default => :spec
+ task :test => :spec
data/examples/basic.rb ADDED
@@ -0,0 +1,58 @@
+ require "polipus"
+ require "mongo"
+
+ # Define a Mongo connection
+ mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+
+ # Override some default options
+ options = {
+   # Redis connection
+   :redis_options => {
+     :host => 'localhost',
+     :db => 5,
+     :driver => 'hiredis'
+   },
+   # Page storage: pages is the name of the collection where
+   # pages will be stored
+   :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+   # Use your custom user agent
+   :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+   # Use 5 threads
+   :workers => 5,
+   # Logs go to the crawler.log file
+   :logger => Logger.new("/tmp/crawler.log")
+ }
+
+ starting_urls = ["http://rubygems.org/gems"]
+
+ # Crawl the entire rubygems site
+ # Polipus.crawler('polipus-rubygems', starting_urls, options)
+
+ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
+   # Ignore URLs pointing to a gem file
+   crawler.skip_links_like(/\.gem$/)
+   # Ignore URLs pointing to an atom feed
+   crawler.skip_links_like(/\.atom$/)
+   # Ignore URLs containing the /versions/ path
+   crawler.skip_links_like(/\/versions\//)
+
+   # Add some metadata to a page
+   # The metadata will be stored in MongoDB
+   crawler.on_before_save do |page|
+     page.user_data.processed = false
+   end
+
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # A nokogiri object
+     puts "Page title: #{page.doc.css('title').text}"
+   end
+
+   # Do nifty stuff at the end of the crawling session
+   crawler.on_crawl_end do
+     # Gong.bang(:loudly)
+   end
+ end
+
data/examples/survival.rb ADDED
@@ -0,0 +1,9 @@
+ require "polipus"
+
+ Polipus.crawler("rubygems","http://rubygems.org/") do |crawler|
+   # In-place page processing
+   crawler.on_page_downloaded do |page|
+     # A nokogiri object
+     puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
+   end
+ end
data/lib/polipus.rb ADDED
@@ -0,0 +1,451 @@
+ # encoding: UTF-8
+ require "redis"
+ require "redis/connection/hiredis"
+ require "redis-queue"
+ require "polipus/version"
+ require "polipus/http"
+ require "polipus/storage"
+ require "polipus/url_tracker"
+ require "polipus/plugin"
+ require "polipus/queue_overflow"
+ require "thread"
+ require "logger"
+ require "json"
+ require "singleton"
+
+ module Polipus
+
+   def Polipus.crawler(job_name = 'polipus', urls = [], options = {}, &block)
+     PolipusCrawler.crawl(job_name, urls, options, &block)
+   end
+
+   class PolipusCrawler
+     OPTS = {
+       # run 4 threads
+       :workers => 4,
+       # identify self as Polipus/VERSION
+       :user_agent => "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
+       # by default, don't limit the depth of the crawl
+       :depth_limit => false,
+       # number of times HTTP redirects will be followed
+       :redirect_limit => 5,
+       # storage engine defaults to DevNull
+       :storage => nil,
+       # proxy server hostname
+       :proxy_host => nil,
+       # proxy server port number
+       :proxy_port => false,
+       # HTTP read timeout in seconds
+       :read_timeout => 30,
+       # A URL tracker instance. Default is a Redis-backed bloom filter
+       :url_tracker => nil,
+       # A Redis options hash that will be passed directly to Redis.new
+       :redis_options => {},
+       # An instance of logger
+       :logger => nil,
+       # whether the query string should be included in the saved page
+       :include_query_string_in_saved_page => true,
+       # Max number of items to keep on Redis
+       :queue_items_limit => 2_000_000,
+       # The adapter used to store Redis items that exceed queue_items_limit
+       :queue_overflow_adapter => nil,
+       # Every x seconds, the main queue is checked for overflowed items
+       :queue_overflow_manager_check_time => 60,
+       # If true, each page downloaded will increment a counter on Redis
+       :stats_enabled => false,
+       # Cookies strategy
+       :cookie_jar => nil,
+       :accept_cookies => false,
+       # A set of hosts that should be considered parts of the same domain,
+       # e.g. it can be used to follow links with and without the 'www' prefix
+       :domain_aliases => []
+     }
+
+     attr_reader :storage
+     attr_reader :job_name
+     attr_reader :logger
+     attr_reader :overflow_adapter
+     attr_reader :options
+     attr_reader :crawler_name
+
+     OPTS.keys.each do |key|
+       define_method "#{key}=" do |value|
+         @options[key.to_sym] = value
+       end
+       define_method "#{key}" do
+         @options[key.to_sym]
+       end
+     end
+
+     def initialize(job_name = 'polipus', urls = [], options = {})
+
+       @job_name = job_name
+       @options = OPTS.merge(options)
+       @logger = @options[:logger] ||= Logger.new(nil)
+       @logger.level = @options[:logger_level] ||= Logger::INFO
+       @storage = @options[:storage] ||= Storage.dev_null
+
+       @http_pool = []
+       @workers_pool = []
+       @queues_pool = []
+
+       @follow_links_like = []
+       @skip_links_like = []
+       @on_page_downloaded = []
+       @on_before_save = []
+       @focus_crawl_block = nil
+       @on_crawl_end = []
+       @redis_factory = nil
+
+       @overflow_manager = nil
+       @crawler_name = `hostname`.strip + "-#{@job_name}"
+
+       @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
+
+       @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
+       @urls.each{ |url| url.path = '/' if url.path.empty? }
+       execute_plugin 'on_initialize'
+
+       yield self if block_given?
+
+     end
+
+     def self.crawl(job_name, urls, opts = {})
+       self.new(job_name, urls, opts) do |polipus|
+         yield polipus if block_given?
+         polipus.takeover
+       end
+     end
+
+     def takeover
+       PolipusSignalHandler.enable
+       overflow_items_controller if queue_overflow_adapter
+
+       q = queue_factory
+       @urls.each do |u|
+         next if url_tracker.visited?(u.to_s)
+         q << Page.new(u.to_s, :referer => '').to_json
+       end
+
+       return if q.empty?
+
+       execute_plugin 'on_crawl_start'
+       @options[:workers].times do |worker_number|
+         @workers_pool << Thread.new do
+           @logger.debug {"Start worker #{worker_number}"}
+           http = @http_pool[worker_number] ||= HTTP.new(@options)
+           queue = @queues_pool[worker_number] ||= queue_factory
+           queue.process(false, @options[:read_timeout]) do |message|
+
+             next if message.nil?
+
+             execute_plugin 'on_message_received'
+
+             page = Page.from_json message
+
+             unless should_be_visited?(page.url, false)
+               @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] is no more welcome."}
+               queue.commit
+               next
+             end
+
+             if @storage.exists? page
+               @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
+               queue.commit
+               next
+             end
+
+             url = page.url.to_s
+             @logger.debug {"[worker ##{worker_number}] Fetching page: [#{page.url.to_s}] Referer: #{page.referer} Depth: #{page.depth}"}
+
+             execute_plugin 'on_before_download'
+
+             pages = http.fetch_pages(url, page.referer, page.depth)
+             if pages.count > 1
+               rurls = pages.map { |e| e.url.to_s }.join(' --> ')
+               @logger.info {"Got redirects! #{rurls}"}
+               page = pages.pop
+               page.aliases = pages.collect { |e| e.url }
+               if @storage.exists?(page)
+                 @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
+                 queue.commit
+                 next
+               end
+             else
+               page = pages.last
+             end
+
+             # Execute on_before_save blocks
+             @on_before_save.each {|e| e.call(page)} unless page.nil?
+             execute_plugin 'on_after_download'
+
+             @logger.warn {"Page #{page.url} has error: #{page.error}"} if page.error
+             incr_error if page.error
+
+             @storage.add page unless page.nil?
+
+             @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
+             @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
+
+             incr_pages
+
+             # Execute on_page_downloaded blocks
+             @on_page_downloaded.each {|e| e.call(page)} unless page.nil?
+
+             if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
+               links_for(page).each do |url_to_visit|
+                 next unless should_be_visited?(url_to_visit)
+                 enqueue url_to_visit, page, queue
+               end
+             else
+               @logger.info {"[worker ##{worker_number}] Depth limit reached #{page.depth}"}
+             end
+
+             @logger.debug {"[worker ##{worker_number}] Queue size: #{queue.size}"}
+             @overflow_manager.perform if @overflow_manager && queue.empty?
+             execute_plugin 'on_message_processed'
+
+             if PolipusSignalHandler.terminated?
+               @logger.info {"About to exit! Thanks for using Polipus"}
+               queue.commit
+               break
+             end
+             true
+           end
+         end
+       end
+       @workers_pool.each {|w| w.join}
+       @on_crawl_end.each {|e| e.call(self)}
+       execute_plugin 'on_crawl_end'
+     end
+
+     # A pattern or an array of patterns can be passed as argument
+     # A URL will be discarded if it doesn't match the patterns
+     def follow_links_like(*patterns)
+       @follow_links_like += patterns.uniq.compact
+       self
+     end
+
+     # A pattern or an array of patterns can be passed as argument
+     # A URL will be discarded if it matches a pattern
+     def skip_links_like(*patterns)
+       @skip_links_like += patterns.uniq.compact
+       self
+     end
+
+     # A block of code that will be executed on every page downloaded
+     # The block takes the page as argument
+     def on_page_downloaded(&block)
+       @on_page_downloaded << block
+       self
+     end
+
+     def on_crawl_end(&block)
+       @on_crawl_end << block
+       self
+     end
+
+     # A block of code that will be executed on every page downloaded,
+     # before it is saved in the registered storage
+     def on_before_save(&block)
+       @on_before_save << block
+       self
+     end
+
+     # A block of code that will be executed on every page downloaded.
+     # The block is used to extract the URLs to visit
+     # see the links_for method
+     def focus_crawl(&block)
+       @focus_crawl_block = block
+       self
+     end
+
+     def redis_options
+       @options[:redis_options]
+     end
+
+     def overflow_adapter
+       @options[:overflow_adapter]
+     end
+
+     def queue_size
+       @internal_queue ||= queue_factory
+       @internal_queue.size
+     end
+
+     def stats_reset!
+       ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each {|e| redis.del e}
+     end
+
+     def redis_factory(&block)
+       @redis_factory = block
+       self
+     end
+
+     def url_tracker
+       if @url_tracker.nil?
+         @url_tracker = @options[:url_tracker] ||= UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}", :redis => redis_factory_adapter, :driver => 'lua')
+       end
+       @url_tracker
+     end
+
+     def redis
+       if @redis.nil?
+         @redis = redis_factory_adapter
+       end
+       @redis
+     end
+
+     def add_url url
+       @url_tracker.remove url.to_s
+       page = Page.new(url)
+       queue_factory << page.to_json
+     end
+
+     # Request Polipus to stop its work (gracefully)
+     # clear_queue = true if you want to delete all of the pending URLs to visit
+     def stop!(clear_queue = false)
+       PolipusSignalHandler.terminate
+       queue_factory.clear(true) if clear_queue
+     end
+
+     private
+
+     # URL enqueue policy
+     def should_be_visited?(url, with_tracker = true)
+       # Check against whitelist pattern matching
+       unless @follow_links_like.empty?
+         return false unless @follow_links_like.any?{|p| url.path =~ p}
+       end
+
+       # Check against blacklist pattern matching
+       unless @skip_links_like.empty?
+         return false if @skip_links_like.any?{|p| url.path =~ p}
+       end
+
+       # Check against the URL tracker
+       if with_tracker
+         return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
+       end
+       true
+     end
+
+     # It extracts URLs from the page
+     def links_for page
+       page.domain_aliases = domain_aliases
+       @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
+     end
+
+     # The URL is enqueued for a later visit
+     def enqueue url_to_visit, current_page, queue
+       page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
+       queue << page_to_visit.to_json
+       to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/,'')
+       url_tracker.visit to_track
+       @logger.debug {"Added [#{url_to_visit.to_s}] to the queue"}
+     end
+
+     # It creates a Redis client
+     def redis_factory_adapter
+       unless @redis_factory.nil?
+         return @redis_factory.call(redis_options)
+       end
+       Redis.new(redis_options)
+     end
+
+     # It creates a new distributed queue
+     def queue_factory
+       Redis::Queue.new("polipus_queue_#{@job_name}","bp_polipus_queue_#{@job_name}", :redis => redis_factory_adapter)
+     end
+
+     # If stats are enabled, it increments the errors counter
+     def incr_error
+       redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
+     end
+
+     # If stats are enabled, it increments the pages-downloaded counter
+     def incr_pages
+       redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
+     end
+
+     # It handles the overflow item policy (if any)
+     def overflow_items_controller
+       @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])
+
+       # Over time the URL policy may change, so the policy is re-evaluated
+       @overflow_manager.url_filter do |page|
+         should_be_visited?(page.url, false)
+       end
+
+       Thread.new do
+         redis_lock = redis_factory_adapter
+         op_timeout = @options[:queue_overflow_manager_check_time]
+
+         while true
+           lock = redis_lock.setnx "polipus_queue_overflow-#{@job_name}.lock", 1
+
+           if lock
+             redis_lock.expire "polipus_queue_overflow-#{@job_name}.lock", op_timeout + 350
+             removed, restored = @overflow_manager.perform
+             @logger.info {"Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}"}
+             redis_lock.del "polipus_queue_overflow-#{@job_name}.lock"
+           else
+             @logger.info {"Lock not acquired"}
+           end
+
+           sleep @options[:queue_overflow_manager_check_time]
+         end
+       end
+     end
+
+     # It invokes a plugin method if any
+     def execute_plugin method
+       Polipus::Plugin.plugins.each do |k,p|
+         if p.respond_to? method
+           @logger.info("Running plugin method #{method} on #{k}")
+           ret_val = p.send(method, self)
+           instance_eval(&ret_val) if ret_val.kind_of? Proc
+         end
+       end
+     end
+
+   end
+
+   class PolipusSignalHandler
+     include Singleton
+     attr_accessor :terminated
+
+     def initialize
+       self.terminated = false
+     end
+
+     def self.enable
+       trap(:INT) {
+         puts "Got INT signal"
+         self.terminate
+       }
+       trap(:TERM) {
+         puts "Got TERM signal"
+         self.terminate
+       }
+     end
+
+     def self.terminate
+       self.instance.terminated = true
+     end
+
+     def self.terminated?
+       self.instance.terminated
+     end
+   end
+
+ end
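Editorial sketch: beyond what the bundled examples exercise, the PolipusCrawler DSL above also exposes follow_links_like, focus_crawl and the Redis-backed stats counters. The snippet below is a rough, hedged illustration of how they might be combined; the job name, seed URL and patterns are assumptions for illustration only.

```ruby
require "polipus"

# Sketch of a focused crawl built on the DSL defined in lib/polipus.rb above.
Polipus.crawler("rubygems-focused", "http://rubygems.org/gems", :stats_enabled => true) do |crawler|
  # Whitelist: only follow links whose path contains /gems
  crawler.follow_links_like(/\/gems/)
  # Blacklist: still skip the .gem archives themselves
  crawler.skip_links_like(/\.gem$/)

  # focus_crawl replaces the default link extraction (see links_for above):
  # whatever the block returns is what gets enqueued
  crawler.focus_crawl do |page|
    page.links.reject { |uri| uri.path.to_s.end_with?(".atom") }
  end

  crawler.on_page_downloaded do |page|
    puts "#{page.url} [#{page.code}]"
  end
end
```

With :stats_enabled => true, each downloaded page increments the polipus:rubygems-focused:pages counter in Redis (see incr_pages above), so progress can be read back out of Redis while the crawl runs.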