polipus 0.3.0 → 0.3.1

Files changed (51)
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
  ---
  !binary "U0hBMQ==":
    metadata.gz: !binary |-
-     ZTc3MjQ1OWQwNzVhMWFhMGQ2NTdlYjM3ZTkyZDQ3ZDAwZDExZWQ1Mw==
+     MWJhN2NlNmRiZTcxODdhNGIzMWJmZWJhMDgwN2JhZTNhNjFlMGE2ZA==
    data.tar.gz: !binary |-
-     OTUzYTE5M2U4YTQ3ZGVmZTAzMzdiYjJmZWYzM2Q3MTU0NDMyYzAwMQ==
+     ZmE0NThkNzkwNDQ4MDQ5ZGRhZGViMzNmYzAwNWRlMzgyZDAwZmUyNg==
  SHA512:
    metadata.gz: !binary |-
-     MDk5MGQ2MzBkYzU2MjJlNDg1YTkwYTU1YjJjYWQ0YjAyNDY5OTZkNWJlZDIw
-     NDAwNjY2ZjMwMGUxZWE0NTNiNzc5YmIzZTg2NjcwNjFjZTMyNzIxZjZlYzZm
-     N2ZjMTk2ZjRkYjU0M2VjZDk0NWMxYzk0MjE4MWRkOWFiY2M3YTA=
+     YTNhOThhMzk4M2RhOGE4NzQ4NDM0OTBjOTlkNjYwNmI2YTlkZmU3MDNjMDQ2
+     NDlmZDVmZjQ0ZWJmZDFjYjJkYzFhYzJiMmYyYzRlOTc0N2RmY2NlMTU1ZDIy
+     YTZjNDU4NzZkYmQ3ZmI1ZjNjZTVmYTllOTE5OTkzNDI1ZjZjMzI=
    data.tar.gz: !binary |-
-     OTgwMTI5MWFhNWQ5Mjk4OWNmZTk3ZGE0MTMyYzM5NDlkMWJhMjFiMWQ4NDQ4
-     OGI1NDU3ZDQ0ZTkzNWFkMzAyZjg3YmRiNDlmN2I0ZDNlNWRlZmVkMjIzMWQ2
-     MGY0NGQ4YTQ1ZmEyMGQ0M2VkNzE2YzIyOGMxOGE4MDQzMWFkZjU=
+     MTUwYWRjY2VmZDk4Mzk5MWI5ZGNjMjFmZjViZWM2YjA2ZmZjZDViYTIzZGE5
+     OGRmY2U4MjNmZDBiNjBkMmNiZDZkNmM5MGNjYzNmODJlNDk0Nzk5OGFhNTdl
+     YjZjNzIyYzNjZjY1YzExNTU4YjBiYzAyM2VhYWI3YTY4NTA5N2M=
data/.rubocop.yml ADDED
@@ -0,0 +1,17 @@
+ inherit_from: .rubocop_todo.yml
+ AllCops:
+   Exclude:
+     - my_test/**/*
+     - examples/**/*
+
+ Style/LineLength:
+   Enabled: false
+
+ Style/TrivialAccessors:
+   Enabled: false
+
+ Style/ClassLength:
+   Enabled: false
+
+ Style/MethodLength:
+   Enabled: false
data/.rubocop_todo.yml ADDED
@@ -0,0 +1,37 @@
+ # This configuration was generated by `rubocop --auto-gen-config`
+ # on 2014-06-08 11:25:39 -0700 using RuboCop version 0.23.0.
+ # The point is for the user to remove these configuration records
+ # one by one as the offenses are removed from the code base.
+ # Note that changes in the inspected code, or installation of new
+ # versions of RuboCop, may require this file to be generated again.
+
+ # Offense count: 1
+ Style/ClassVars:
+   Enabled: false
+
+ # Offense count: 10
+ Style/CyclomaticComplexity:
+   Max: 16
+
+ # Offense count: 26
+ Style/Documentation:
+   Enabled: false
+
+ # Offense count: 38
+ # Configuration parameters: EnforcedStyle, SupportedStyles.
+ Style/Encoding:
+   Enabled: false
+
+ # Offense count: 2
+ # Configuration parameters: EnforcedStyle, SupportedStyles.
+ Style/Next:
+   Enabled: false
+
+ # Offense count: 5
+ # Configuration parameters: MaxSlashes.
+ Style/RegexpLiteral:
+   Enabled: false
+
+ # Offense count: 4
+ Style/RescueModifier:
+   Enabled: false
data/.travis.yml CHANGED
@@ -3,7 +3,8 @@ rvm:
    - jruby
    - 1.9.3
    - 2.0.0
-   - 2.1.1
+   - 2.1.2
+   - rbx-2

  services:
    - mongodb
data/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
  # Changelog

+ ## 0.3.1 (2015-06-17)
+
+ [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.0...0.3.1)
+
+ * Major Code-Style changes and cleanup
+   [#35](https://github.com/taganaka/polipus/pull/35)
+ * BugFix: proper initialization of internal_queue
+   [#38](https://github.com/taganaka/polipus/pull/38)
+ * Better INT / TERM Signal handling [#34](https://github.com/taganaka/polipus/pull/34)
+
+   New option added:
+   ```ruby
+   enable_signal_handler: true / false
+   ```
+
+ * Zlib::GzipFile::Error handling
+   [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
+ * Faster and easier overflow management
+   [#39](https://github.com/taganaka/polipus/pull/39)
+
  ## 0.3.0 (2015-06-02)

  [Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
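
For orientation, here is a minimal sketch (not part of the diff; the job name and seed URL are illustrative) of how the new `enable_signal_handler` option from this release can be passed to the crawler:

```ruby
require 'polipus'

options = {
  # Let a surrounding job system (e.g. Resque) own INT/TERM handling
  # instead of Polipus' built-in handler, which defaults to enabled.
  enable_signal_handler: false
}

Polipus.crawler('my-crawler', 'http://rubygems.org/', options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts page.url
  end
end
```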
data/README.md CHANGED
@@ -21,6 +21,15 @@ http://www.slideshare.net/francescolaurita/roll-your-own-web-crawler-rubyday
  * Focus crawling made easy
  * Heavily inspired to Anemone https://github.com/chriskite/anemone/

+ ## Supported Ruby Interpreters
+
+ * MRI 1.9.x >= 1.9.1
+ * MRI 2.0.0
+ * MRI 2.1.2
+ * JRuby 1.9 mode
+ * Rubinius
+
+
  ## Survival code example

  ```ruby
@@ -52,6 +61,7 @@ end
  * Start a feature/bugfix branch.
  * Commit and push until you are happy with your contribution.
  * Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+ * Install [Rubocop](https://github.com/bbatsov/rubocop) and make sure it is happy
  * Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.

  ## Copyright ##
data/Rakefile CHANGED
@@ -1,9 +1,9 @@
- require "bundler/gem_tasks"
- require "rspec/core/rake_task"
+ require 'bundler/gem_tasks'
+ require 'rspec/core/rake_task'

  RSpec::Core::RakeTask.new(:spec) do |spec|
    spec.pattern = 'spec/*_spec.rb'
  end

- task :default => :spec
- task :test => :spec
+ task default: :spec
+ task test: :spec
data/examples/basic.rb CHANGED
@@ -1,29 +1,29 @@
- require "polipus"
- require "mongo"
- require "polipus/plugins/cleaner"
+ require 'polipus'
+ require 'mongo'
+ require 'polipus/plugins/cleaner'

  # Define a Mongo connection
- mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+ mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')

  # Override some default options
  options = {
-   #Redis connection
-   :redis_options => {
-     :host => 'localhost',
-     :db => 5,
-     :driver => 'hiredis'
+   # Redis connection
+   redis_options: {
+     host: 'localhost',
+     db: 5,
+     driver: 'hiredis'
    },
    # Page storage: pages is the name of the collection where
    # pages will be stored
-   :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+   storage: Polipus::Storage.mongo_store(mongo, 'pages'),
    # Use your custom user agent
-   :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+   user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
    # Use 5 threads
-   :workers => 5,
+   workers: 5,
    # Logs goes to the crawler.log file
-   :logger => Logger.new(STDOUT)
+   logger: Logger.new(STDOUT)
  }
- Polipus::Plugin.register Polipus::Plugin::Cleaner, reset:true
- starting_urls = ["http://rubygems.org/gems"]
+ Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
+ starting_urls = ['http://rubygems.org/gems']

  # Crawl the entire rubygems's site
  # Polipus.crawler('polipus-rubygems', starting_urls, options)
@@ -49,10 +49,7 @@ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
    end

    # Do a nifty stuff at the end of the crawling session
-   crawler.on_crawl_end do
+   crawler.on_crawl_end do
      # Gong.bang(:loudly)
    end
  end
-
-
-
data/examples/incremental.rb CHANGED
@@ -1,36 +1,36 @@
- require "polipus"
- require "mongo"
+ require 'polipus'
+ require 'mongo'

  # Define a Mongo connection
- mongo = Mongo::Connection.new(:pool_size => 15, :pool_timeout => 5).db('crawler')
+ mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')
  # Override some default options
  options = {
-   #Redis connection
-   :redis_options => {
-     :host => 'localhost',
-     :db => 5,
-     :driver => 'hiredis'
+   # Redis connection
+   redis_options: {
+     host: 'localhost',
+     db: 5,
+     driver: 'hiredis'
    },
    # Page storage: pages is the name of the collection where
    # pages will be stored
-   :storage => Polipus::Storage.mongo_store(mongo, 'pages'),
+   storage: Polipus::Storage.mongo_store(mongo, 'pages'),
    # Use your custom user agent
-   :user_agent => "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71",
+   user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
    # Use 10 threads
-   :workers => 20,
+   workers: 20,
    # Logs goes to the crawler.log file
-   :logger => Logger.new(STDOUT),
+   logger: Logger.new(STDOUT),
    # Do not go deeper than 2 levels
-   :depth_limit => 5,
+   depth_limit: 5,

    # Incremental download:
    # Set a ttl for each stored page
    # If a previous stored page is now expired, it will re-downloaded
    # Mark a page expired after 60s
-   :ttl_page => 60
+   ttl_page: 60
  }

- starting_urls = ["http://rubygems.org/gems"]
+ starting_urls = ['http://rubygems.org/gems']

  # Crawl the entire rubygems's site
  # Polipus.crawler('polipus-rubygems', starting_urls, options)
@@ -52,11 +52,11 @@ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
    # In-place page processing
    crawler.on_page_downloaded do |page|
      # A nokogiri object
-     puts "Page title: #{page.doc.css('title').text}" rescue "ERROR"
+     puts "Page title: #{page.doc.css('title').text}" rescue 'ERROR'
    end

    # Do a nifty stuff at the end of the crawling session
-   crawler.on_crawl_end do
+   crawler.on_crawl_end do
      # Gong.bang(:loudly)
    end
  end
data/examples/robots_txt_handling.rb CHANGED
@@ -6,7 +6,7 @@ options = {
  }

  Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
-
+
    crawler.on_page_downloaded do |page|
      puts "Page title: '#{page.doc.at_css('title').content}' Page url: #{page.url}"
    end
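
As a side note, a minimal sketch (not part of the diff; job name and URL are illustrative) of turning on the robots.txt support this example exercises, via the `obey_robots_txt` option defined in lib/polipus.rb (default `false`):

```ruby
require 'polipus'

# With obey_robots_txt enabled, URLs disallowed by a site's robots.txt
# are filtered out by should_be_visited? before being enqueued.
options = { obey_robots_txt: true }

Polipus.crawler('my-crawler', 'http://rubygems.org/', options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts page.url
  end
end
```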
data/examples/survival.rb CHANGED
@@ -1,9 +1,9 @@
- require "polipus"
+ require 'polipus'

- Polipus.crawler("rubygems","http://rubygems.org/") do |crawler|
+ Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
    # In-place page processing
    crawler.on_page_downloaded do |page|
      # A nokogiri object
      puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
    end
- end
+ end
data/lib/polipus.rb CHANGED
@@ -1,21 +1,20 @@
  # encoding: UTF-8
- require "redis"
- require "redis/connection/hiredis"
- require "redis-queue"
- require "polipus/version"
- require "polipus/http"
- require "polipus/storage"
- require "polipus/url_tracker"
- require "polipus/plugin"
- require "polipus/queue_overflow"
- require "polipus/robotex"
- require "thread"
- require "logger"
- require "json"
- require "singleton"
+ require 'redis'
+ require 'redis/connection/hiredis'
+ require 'redis-queue'
+ require 'polipus/version'
+ require 'polipus/http'
+ require 'polipus/storage'
+ require 'polipus/url_tracker'
+ require 'polipus/plugin'
+ require 'polipus/queue_overflow'
+ require 'polipus/robotex'
+ require 'polipus/signal_handler'
+ require 'thread'
+ require 'logger'
+ require 'json'

  module Polipus
-
    def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
      PolipusCrawler.crawl(job_name, urls, options, &block)
    end
@@ -23,57 +22,61 @@ module Polipus
    class PolipusCrawler
      OPTS = {
        # run 4 threads
-       :workers => 4,
+       workers: 4,
        # identify self as Polipus/VERSION
-       :user_agent => "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
+       user_agent: "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
        # by default, don't limit the depth of the crawl
-       :depth_limit => false,
+       depth_limit: false,
        # number of times HTTP redirects will be followed
-       :redirect_limit => 5,
-       # storage engine defaults to DevNull
-       :storage => nil,
-       # proxy server hostname
-       :proxy_host => nil,
+       redirect_limit: 5,
+       # storage engine defaults to DevNull
+       storage: nil,
+       # proxy server hostname
+       proxy_host: nil,
        # proxy server port number
-       :proxy_port => false,
+       proxy_port: false,
        # HTTP read timeout in seconds
-       :read_timeout => 30,
+       read_timeout: 30,
        # HTTP open connection timeout in seconds
-       :open_timeout => 10,
+       open_timeout: 10,
        # Time to wait for new messages on Redis
        # After this timeout, current crawling session is marked as terminated
-       :queue_timeout => 30,
+       queue_timeout: 30,
        # An URL tracker instance. default is Bloomfilter based on redis
-       :url_tracker => nil,
+       url_tracker: nil,
        # A Redis options {} that will be passed directly to Redis.new
-       :redis_options => {},
+       redis_options: {},
        # An instance of logger
-       :logger => nil,
+       logger: nil,
        # A logger level
-       :logger_level => nil,
+       logger_level: nil,
        # whether the query string should be included in the saved page
-       :include_query_string_in_saved_page => true,
+       include_query_string_in_saved_page: true,
        # Max number of items to keep on redis
-       :queue_items_limit => 2_000_000,
+       queue_items_limit: 2_000_000,
        # The adapter used to store exceed (queue_items_limit) redis items
-       :queue_overflow_adapter => nil,
+       queue_overflow_adapter: nil,
        # Every x seconds, the main queue is checked for overflowed items
-       :queue_overflow_manager_check_time => 60,
+       queue_overflow_manager_check_time: 60,
        # If true, each page downloaded will increment a counter on redis
-       :stats_enabled => false,
+       stats_enabled: false,
        # Cookies strategy
-       :cookie_jar => nil,
-       # whether or not accept cookies
-       :accept_cookies => false,
+       cookie_jar: nil,
+       # whether or not accept cookies
+       accept_cookies: false,
        # A set of hosts that should be considered parts of the same domain
        # Eg It can be used to follow links with and without 'www' domain
-       :domain_aliases => [],
+       domain_aliases: [],
        # Mark a connection as staled after connection_max_hits request
-       :connection_max_hits => nil,
+       connection_max_hits: nil,
        # Page TTL: mark a page as expired after ttl_page seconds
-       :ttl_page => nil,
+       ttl_page: nil,
        # don't obey the robots exclusion protocol
-       :obey_robots_txt => false
+       obey_robots_txt: false,
+       # If true, signal handling strategy is enabled.
+       # INT and TERM signal will stop polipus gracefully
+       # Disable it if polipus will run as a part of Resque or DelayedJob-like system
+       enable_signal_handler: true
      }

      attr_reader :storage
@@ -82,7 +85,6 @@ module Polipus
      attr_reader :options
      attr_reader :crawler_name

-
      OPTS.keys.each do |key|
        define_method "#{key}=" do |value|
          @options[key.to_sym] = value
@@ -93,13 +95,12 @@ module Polipus
      end

      def initialize(job_name = 'polipus', urls = [], options = {})
-
        @job_name = job_name
        @options = OPTS.merge(options)
        @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
        @logger = @options[:logger] ||= Logger.new(nil)
-
-       unless @logger.class.to_s == "Log4r::Logger"
+
+       unless @logger.class.to_s == 'Log4r::Logger'
          @logger.level = @options[:logger_level] ||= Logger::INFO
        end
@@ -108,8 +109,7 @@ module Polipus
        @http_pool = []
        @workers_pool = []
        @queues_pool = []
-
-
+
        @follow_links_like = []
        @skip_links_like = []
        @on_page_downloaded = []
@@ -119,21 +119,19 @@ module Polipus
        @on_crawl_end = []
        @redis_factory = nil

-
        @overflow_manager = nil
        @crawler_name = `hostname`.strip + "-#{@job_name}"

        @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]

-       @urls = [urls].flatten.map{ |url| URI(url) }
-       @urls.each{ |url| url.path = '/' if url.path.empty? }
-       @internal_queue = queue_factory
+       @urls = [urls].flatten.map { |url| URI(url) }
+       @urls.each { |url| url.path = '/' if url.path.empty? }
        @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
-
+       # Attach signal handling if enabled
+       SignalHandler.enable if @options[:enable_signal_handler]
        execute_plugin 'on_initialize'

        yield self if block_given?
-
      end

      def self.crawl(*args, &block)
@@ -141,18 +139,17 @@ module Polipus
      end

      def takeover
-       PolipusSignalHandler.enable
        overflow_items_controller if queue_overflow_adapter

        @urls.each do |u|
          add_url(u) { |page| page.user_data.p_seeded = true }
        end
-       return if @internal_queue.empty?
+       return if internal_queue.empty?

        execute_plugin 'on_crawl_start'
        @options[:workers].times do |worker_number|
          @workers_pool << Thread.new do
-           @logger.debug {"Start worker #{worker_number}"}
+           @logger.debug { "Start worker #{worker_number}" }
            http = @http_pool[worker_number] ||= HTTP.new(@options)
            queue = @queues_pool[worker_number] ||= queue_factory
            queue.process(false, @options[:queue_timeout]) do |message|
@@ -164,75 +161,73 @@ module Polipus
              next if message.nil?

              execute_plugin 'on_message_received'

              page = Page.from_json message

              unless should_be_visited?(page.url, false)
-               @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) is no more welcome."}
+               @logger.info { "[worker ##{worker_number}] Page (#{page.url}) is no more welcome." }
                queue.commit
                next
              end

              if page_exists? page
-               @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
+               @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
                queue.commit
                next
              end
-
+
              url = page.url.to_s
-             @logger.debug {"[worker ##{worker_number}] Fetching page: [#{page.url.to_s}] Referer: #{page.referer} Depth: #{page.depth}"}
+             @logger.debug { "[worker ##{worker_number}] Fetching page: [#{page.url}] Referer: #{page.referer} Depth: #{page.depth}" }

              execute_plugin 'on_before_download'

              pages = http.fetch_pages(url, page.referer, page.depth)
              if pages.count > 1
                rurls = pages.map { |e| e.url.to_s }.join(' --> ')
-               @logger.info {"Got redirects! #{rurls}"}
+               @logger.info { "Got redirects! #{rurls}" }
                page = pages.pop
-               page.aliases = pages.collect { |e| e.url }
+               page.aliases = pages.map { |e| e.url }
                if page_exists? page
-                 @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
+                 @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
                  queue.commit
                  next
                end
              else
                page = pages.last
              end
-
+
              execute_plugin 'on_after_download'
-
+
              if page.error
-               @logger.warn {"Page #{page.url} has error: #{page.error}"}
+               @logger.warn { "Page #{page.url} has error: #{page.error}" }
                incr_error
-               @on_page_error.each {|e| e.call(page)}
+               @on_page_error.each { |e| e.call(page) }
              end

              # Execute on_before_save blocks
-             @on_before_save.each {|e| e.call(page)}
+             @on_before_save.each { |e| e.call(page) }

-             if page.storable?
-               @storage.add page
-             end
-
-             @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
-             @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+             page.storable? && @storage.add(page)
+
+             @logger.debug { "[worker ##{worker_number}] Fetched page: [#{page.url}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]" }
+             @logger.info { "[worker ##{worker_number}] Page (#{page.url}) downloaded" }

              incr_pages

              # Execute on_page_downloaded blocks
-             @on_page_downloaded.each {|e| e.call(page)}
+             @on_page_downloaded.each { |e| e.call(page) }

-             if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
+             if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
                links_for(page).each do |url_to_visit|
                  next unless should_be_visited?(url_to_visit)
                  enqueue url_to_visit, page, queue
                end
              else
-               @logger.info {"[worker ##{worker_number}] Depth limit reached #{page.depth}"}
+               @logger.info { "[worker ##{worker_number}] Depth limit reached #{page.depth}" }
              end

-             @logger.debug {"[worker ##{worker_number}] Queue size: #{queue.size}"}
+             @logger.debug { "[worker ##{worker_number}] Queue size: #{queue.size}" }
              @overflow_manager.perform if @overflow_manager && queue.empty?
              execute_plugin 'on_message_processed'

-             if PolipusSignalHandler.terminated?
-               @logger.info {"About to exit! Thanks for using Polipus"}
+             if SignalHandler.terminated?
+               @logger.info { 'About to exit! Thanks for using Polipus' }
                queue.commit
                break
              end
@@ -240,11 +235,11 @@ module Polipus
            end
          end
        end
-       @workers_pool.each {|w| w.join}
-       @on_crawl_end.each {|e| e.call(self)}
+       @workers_pool.each { |w| w.join }
+       @on_crawl_end.each { |e| e.call(self) }
        execute_plugin 'on_crawl_end'
      end
-
+
      # A pattern or an array of patterns can be passed as argument
      # An url will be discarded if it doesn't match patterns
      def follow_links_like(*patterns)
@@ -298,11 +293,11 @@ module Polipus
      end

      def queue_size
-       @internal_queue.size
+       internal_queue.size
      end

      def stats_reset!
-       ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each {|e| redis.del e}
+       ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each { |e| redis.del e }
      end

      def redis_factory(&block)
@@ -313,9 +308,9 @@ module Polipus
      def url_tracker
        @url_tracker ||=
          @options[:url_tracker] ||=
-           UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}",
-                                  :redis => redis_factory_adapter,
-                                  :driver => 'lua')
+           UrlTracker.bloomfilter(key_name: "polipus_bf_#{job_name}",
+                                  redis: redis_factory_adapter,
+                                  driver: 'lua')
      end

      def redis
@@ -334,176 +329,138 @@ module Polipus
      def add_url(url, params = {})
        page = Page.new(url, params)
        yield(page) if block_given?
-       @internal_queue << page.to_json
+       internal_queue << page.to_json
      end

      # Request to Polipus to stop its work (gracefully)
      # cler_queue = true if you want to delete all of the pending urls to visit
      def stop!(cler_queue = false)
-       PolipusSignalHandler.terminate
-       @internal_queue.clear(true) if cler_queue
+       SignalHandler.terminate
+       internal_queue.clear(true) if cler_queue
      end

      private
-     # URLs enqueue policy
-     def should_be_visited?(url, with_tracker = true)
-
-       case
-       # robots.txt
-       when !allowed_by_robot?(url)
-         false
-       # Check against whitelist pattern matching
-       when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
-         false
-       # Check against blacklist pattern matching
-       when @skip_links_like.any?{ |p| url.path =~ p }
-         false
-       # Page is marked as expired
-       when page_expired?(Page.new(url))
-         true
-       # Check against url tracker
-       when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
-         false
-       else
-         true
-       end
-     end
-
-     # It extracts URLs from the page
-     def links_for page
-       page.domain_aliases = domain_aliases
-       @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
-     end
-
-     # whether a page is expired or not
-     def page_expired? page
-       return false if @options[:ttl_page].nil?
-       stored_page = @storage.get(page)
-       r = stored_page && stored_page.expired?(@options[:ttl_page])
-       @logger.debug {"Page #{page.url.to_s} marked as expired"} if r
-       r
-     end
-
-     # whether a page exists or not
-     def page_exists? page
-       return false if page.user_data && page.user_data.p_seeded
-       @storage.exists?(page) && !page_expired?(page)
-     end

-     #
-     # Returns +true+ if we are obeying robots.txt and the link
-     # is granted access in it. Always returns +true+ when we are
-     # not obeying robots.txt.
-     #
-     def allowed_by_robot?(link)
-       return true if @robots.nil?
-       @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+     # URLs enqueue policy
+     def should_be_visited?(url, with_tracker = true)
+       case
+       # robots.txt
+       when !allowed_by_robot?(url)
+         false
+       # Check against whitelist pattern matching
+       when !@follow_links_like.empty? && @follow_links_like.none? { |p| url.path =~ p }
+         false
+       # Check against blacklist pattern matching
+       when @skip_links_like.any? { |p| url.path =~ p }
+         false
+       # Page is marked as expired
+       when page_expired?(Page.new(url))
+         true
+       # Check against url tracker
+       when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/, ''))
+         false
+       else
+         true
        end
+     end

+     # It extracts URLs from the page
+     def links_for(page)
+       page.domain_aliases = domain_aliases
+       @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
+     end

-     # The url is enqueued for a later visit
-     def enqueue url_to_visit, current_page, queue
-       page_to_visit = Page.new(url_to_visit.to_s, :referer => current_page.url.to_s, :depth => current_page.depth + 1)
-       queue << page_to_visit.to_json
-       to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/,'')
-       url_tracker.visit to_track
-       @logger.debug {"Added [#{url_to_visit.to_s}] to the queue"}
-     end
+     # whether a page is expired or not
+     def page_expired?(page)
+       return false if @options[:ttl_page].nil?
+       stored_page = @storage.get(page)
+       r = stored_page && stored_page.expired?(@options[:ttl_page])
+       @logger.debug { "Page #{page.url} marked as expired" } if r
+       r
+     end

-     # It creates a redis client
-     def redis_factory_adapter
-       if @redis_factory
-         @redis_factory.call(redis_options)
-       else
-         Redis.new(redis_options)
-       end
-     end
+     # whether a page exists or not
+     def page_exists?(page)
+       return false if page.user_data && page.user_data.p_seeded
+       @storage.exists?(page) && !page_expired?(page)
+     end

-     # It creates a new distributed queue
-     def queue_factory
-       Redis::Queue.new("polipus_queue_#{@job_name}","bp_polipus_queue_#{@job_name}", :redis => redis_factory_adapter)
-     end
+     #
+     # Returns +true+ if we are obeying robots.txt and the link
+     # is granted access in it. Always returns +true+ when we are
+     # not obeying robots.txt.
+     #
+     def allowed_by_robot?(link)
+       return true if @robots.nil?
+       @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+     end

-     # If stats enable, it increments errors found
-     def incr_error
-       redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
-     end
+     # The url is enqueued for a later visit
+     def enqueue(url_to_visit, current_page, queue)
+       page_to_visit = Page.new(url_to_visit.to_s, referer: current_page.url.to_s, depth: current_page.depth + 1)
+       queue << page_to_visit.to_json
+       to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/, '')
+       url_tracker.visit to_track
+       @logger.debug { "Added [#{url_to_visit}] to the queue" }
+     end

-     # If stats enable, it increments pages downloaded
-     def incr_pages
-       redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
+     # It creates a redis client
+     def redis_factory_adapter
+       if @redis_factory
+         @redis_factory.call(redis_options)
+       else
+         Redis.new(redis_options)
       end
+     end

-     # It handles the overflow item policy (if any)
-     def overflow_items_controller
-       @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])
-
-       # In the time, url policy may change so policy is re-evaluated
-       @overflow_manager.url_filter do |page|
-         should_be_visited?(page.url, false)
-       end
+     # It creates a new distributed queue
+     def queue_factory
+       Redis::Queue.new("polipus_queue_#{@job_name}", "bp_polipus_queue_#{@job_name}", redis: redis_factory_adapter)
+     end

-     Thread.new do
-
-       redis_lock = redis_factory_adapter
-       op_timeout = @options[:queue_overflow_manager_check_time]
+     # If stats enabled, it increments errors found
+     def incr_error
+       redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
+     end

-       while true
-         lock = redis_lock.setnx "polipus_queue_overflow-#{@job_name}.lock", 1
+     # If stats enabled, it increments pages downloaded
+     def incr_pages
+       redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
+     end

-         if lock
-           redis_lock.expire "polipus_queue_overflow-#{@job_name}.lock", op_timeout + 350
-           removed, restored = @overflow_manager.perform
-           @logger.info {"Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}"}
-           redis_lock.del "polipus_queue_overflow-#{@job_name}.lock"
-         else
-           @logger.info {"Lock not acquired"}
-         end
+     # It handles the overflow item policy (if any)
+     def overflow_items_controller
+       @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])

-         sleep @options[:queue_overflow_manager_check_time]
-       end
-     end
+       # In the time, url policy may change so policy is re-evaluated
+       @overflow_manager.url_filter do |page|
+         should_be_visited?(page.url, false)
       end

-     # It invokes a plugin method if any
-     def execute_plugin method
+       Thread.new do

-       Polipus::Plugin.plugins.each do |k,p|
-         if p.respond_to? method
-           @logger.info("Running plugin method #{method} on #{k}")
-           ret_val = p.send(method, self)
-           instance_eval(&ret_val) if ret_val.kind_of? Proc
-         end
+         loop do
+           @logger.info { 'Overflow Manager: cycle started' }
+           removed, restored = @overflow_manager.perform
+           @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
+           sleep @options[:queue_overflow_manager_check_time]
         end
-       end

-     end
-
-     class PolipusSignalHandler
-       include Singleton
-       attr_accessor :terminated
-       def initialize
-         self.terminated = false
-       end
-
-       def self.enable
-         trap(:INT) {
-           puts "Got INT signal"
-           self.terminate
-         }
-         trap(:TERM) {
-           puts "Got TERM signal"
-           self.terminate
-         }
+       end
      end

-     def self.terminate
-       self.instance.terminated = true
+     def internal_queue
+       @internal_queue ||= queue_factory
      end

-     def self.terminated?
-       self.instance.terminated
+     # It invokes a plugin method if any
+     def execute_plugin(method)
+       Polipus::Plugin.plugins.each do |k, p|
+         next unless p.respond_to?(method)
+         @logger.info { "Running plugin method #{method} on #{k}" }
+         ret_val = p.send(method, self)
+         instance_eval(&ret_val) if ret_val.kind_of? Proc
+       end
      end
    end
-
  end
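
To close, a minimal sketch (not part of the diff; the stop condition is illustrative) of the graceful-stop path refactored above: `stop!` flags termination through the new `SignalHandler`, and each worker checks `SignalHandler.terminated?` after processing a message:

```ruby
require 'polipus'

Polipus.crawler('my-crawler', 'http://rubygems.org/') do |crawler|
  crawler.on_page_downloaded do |page|
    # Ask Polipus to wind down gracefully once some condition is met;
    # stop!(true) would also clear the pending URL queue.
    crawler.stop! if page.code == 503
  end
end
```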