polipus 0.3.0 → 0.3.1
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
---
!binary "U0hBMQ==":
  metadata.gz: !binary |-
-
+    MWJhN2NlNmRiZTcxODdhNGIzMWJmZWJhMDgwN2JhZTNhNjFlMGE2ZA==
  data.tar.gz: !binary |-
-
+    ZmE0NThkNzkwNDQ4MDQ5ZGRhZGViMzNmYzAwNWRlMzgyZDAwZmUyNg==
SHA512:
  metadata.gz: !binary |-
-
-
-
+    YTNhOThhMzk4M2RhOGE4NzQ4NDM0OTBjOTlkNjYwNmI2YTlkZmU3MDNjMDQ2
+    NDlmZDVmZjQ0ZWJmZDFjYjJkYzFhYzJiMmYyYzRlOTc0N2RmY2NlMTU1ZDIy
+    YTZjNDU4NzZkYmQ3ZmI1ZjNjZTVmYTllOTE5OTkzNDI1ZjZjMzI=
  data.tar.gz: !binary |-
-
-
-
+    MTUwYWRjY2VmZDk4Mzk5MWI5ZGNjMjFmZjViZWM2YjA2ZmZjZDViYTIzZGE5
+    OGRmY2U4MjNmZDBiNjBkMmNiZDZkNmM5MGNjYzNmODJlNDk0Nzk5OGFhNTdl
+    YjZjNzIyYzNjZjY1YzExNTU4YjBiYzAyM2VhYWI3YTY4NTA5N2M=
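For context, checksums.yaml is RubyGems' integrity data for the two archives packed inside the .gem file (metadata.gz and data.tar.gz). Each value appears to be a Base64-wrapped hex digest: the first SHA1 entry above decodes to the 40-character string 1ba7ce6dbe7187a4b31bfeba0807bae3a61e0a6d. A minimal sketch of reproducing such values locally, assuming the two members have already been extracted from the downloaded .gem archive into the current directory:

```ruby
require 'digest'
require 'base64'

# Hypothetical local copies of the two members a .gem archive contains;
# checksums.yaml seems to store Base64-encoded hex digests for each of them.
%w(metadata.gz data.tar.gz).each do |member|
  sha1   = Digest::SHA1.file(member).hexdigest
  sha512 = Digest::SHA512.file(member).hexdigest
  puts "#{member} SHA1:   #{Base64.strict_encode64(sha1)}"
  puts "#{member} SHA512: #{Base64.strict_encode64(sha512)}"
end
```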
data/.rubocop.yml
ADDED
@@ -0,0 +1,17 @@
+inherit_from: .rubocop_todo.yml
+AllCops:
+  Exclude:
+    - my_test/**/*
+    - examples/**/*
+
+Style/LineLength:
+  Enabled: false
+
+Style/TrivialAccessors:
+  Enabled: false
+
+Style/ClassLength:
+  Enabled: false
+
+Style/MethodLength:
+  Enabled: false
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,37 @@
+# This configuration was generated by `rubocop --auto-gen-config`
+# on 2014-06-08 11:25:39 -0700 using RuboCop version 0.23.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+
+# Offense count: 1
+Style/ClassVars:
+  Enabled: false
+
+# Offense count: 10
+Style/CyclomaticComplexity:
+  Max: 16
+
+# Offense count: 26
+Style/Documentation:
+  Enabled: false
+
+# Offense count: 38
+# Configuration parameters: EnforcedStyle, SupportedStyles.
+Style/Encoding:
+  Enabled: false
+
+# Offense count: 2
+# Configuration parameters: EnforcedStyle, SupportedStyles.
+Style/Next:
+  Enabled: false
+
+# Offense count: 5
+# Configuration parameters: MaxSlashes.
+Style/RegexpLiteral:
+  Enabled: false
+
+# Offense count: 4
+Style/RescueModifier:
+  Enabled: false
data/.travis.yml
CHANGED
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,25 @@
# Changelog

+## 0.3.1 (2015-06-17)
+
+[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.3.0...0.3.1)
+
+  * Major Code-Style changes and cleanup
+    [#35](https://github.com/taganaka/polipus/pull/35)
+  * BugFix: proper initialization of internal_queue
+    [#38](https://github.com/taganaka/polipus/pull/38)
+  * Better INT / TERM Signal handling [#34](https://github.com/taganaka/polipus/pull/34)
+
+    New option added:
+    ```ruby
+    enable_signal_handler: true / false
+    ```
+
+  * Zlib::GzipFile::Error handling
+    [da3b927](https://github.com/taganaka/polipus/commit/da3b927acb1b50c26276ed458da0a365c22fd98b)
+  * Faster and easier overflow management
+    [#39](https://github.com/taganaka/polipus/pull/39)
+
## 0.3.0 (2015-06-02)

[Compare changes in gem](https://github.com/taganaka/polipus/compare/0.2.2...0.3.0)
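Of these changes, the new enable_signal_handler option is the one most likely to affect existing deployments, so a short usage sketch may help (the option values here are illustrative, not taken from the gem's own examples). With the flag left at its default of true, an INT or TERM signal lets each worker finish and commit the message it is processing and then exit; the option's own comment in the diff below suggests turning it off when Polipus runs inside a job system such as Resque or DelayedJob that installs its own signal traps.

```ruby
require 'polipus'

options = {
  workers: 2,
  # New in 0.3.1: graceful INT/TERM handling, enabled by default.
  # Set it to false when another framework owns the signal traps.
  enable_signal_handler: false
}

Polipus.crawler('rubygems', 'http://rubygems.org/', options) do |crawler|
  crawler.on_page_downloaded do |page|
    puts "Fetched #{page.url}"
  end
end
```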
data/README.md
CHANGED
@@ -21,6 +21,15 @@ http://www.slideshare.net/francescolaurita/roll-your-own-web-crawler-rubyday
* Focus crawling made easy
* Heavily inspired to Anemone https://github.com/chriskite/anemone/

+## Supported Ruby Interpreters
+
+* MRI 1.9.x >= 1.9.1
+* MRI 2.0.0
+* MRI 2.1.2
+* JRuby 1.9 mode
+* Rubinius
+
+
## Survival code example

```ruby
@@ -52,6 +61,7 @@ end
* Start a feature/bugfix branch.
* Commit and push until you are happy with your contribution.
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
+* Install [Rubocop](https://github.com/bbatsov/rubocop) and make sure it is happy
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or is otherwise necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.

## Copyright ##
data/Rakefile
CHANGED
@@ -1,9 +1,9 @@
-require
-require
+require 'bundler/gem_tasks'
+require 'rspec/core/rake_task'

RSpec::Core::RakeTask.new(:spec) do |spec|
  spec.pattern = 'spec/*_spec.rb'
end

-task :
-task :
+task default: :spec
+task test: :spec
data/examples/basic.rb
CHANGED
@@ -1,29 +1,29 @@
-require
-require
-require
+require 'polipus'
+require 'mongo'
+require 'polipus/plugins/cleaner'
# Define a Mongo connection
-mongo = Mongo::Connection.new(:
+mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')

# Override some default options
options = {
-  #Redis connection
-  :
-  :
-  :
-  :
+  # Redis connection
+  redis_options: {
+    host: 'localhost',
+    db: 5,
+    driver: 'hiredis'
  },
  # Page storage: pages is the name of the collection where
  # pages will be stored
-  :
+  storage: Polipus::Storage.mongo_store(mongo, 'pages'),
  # Use your custom user agent
-  :
+  user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
  # Use 5 threads
-  :
+  workers: 5,
  # Logs goes to the crawler.log file
-  :
+  logger: Logger.new(STDOUT)
}
-Polipus::Plugin.register Polipus::Plugin::Cleaner, reset:true
-starting_urls = [
+Polipus::Plugin.register Polipus::Plugin::Cleaner, reset: true
+starting_urls = ['http://rubygems.org/gems']

# Crawl the entire rubygems's site
# Polipus.crawler('polipus-rubygems', starting_urls, options)
@@ -49,10 +49,7 @@ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
  end

  # Do a nifty stuff at the end of the crawling session
-  crawler.on_crawl_end do
+  crawler.on_crawl_end do
    # Gong.bang(:loudly)
  end
end
-
-
-
data/examples/incremental.rb
CHANGED
@@ -1,36 +1,36 @@
-require
-require
+require 'polipus'
+require 'mongo'

# Define a Mongo connection
-mongo = Mongo::Connection.new(:
+mongo = Mongo::Connection.new(pool_size: 15, pool_timeout: 5).db('crawler')
# Override some default options
options = {
-  #Redis connection
-  :
-  :
-  :
-  :
+  # Redis connection
+  redis_options: {
+    host: 'localhost',
+    db: 5,
+    driver: 'hiredis'
  },
  # Page storage: pages is the name of the collection where
  # pages will be stored
-  :
+  storage: Polipus::Storage.mongo_store(mongo, 'pages'),
  # Use your custom user agent
-  :
+  user_agent: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9) AppleWebKit/537.71 (KHTML, like Gecko) Version/7.0 Safari/537.71',
  # Use 10 threads
-  :
+  workers: 20,
  # Logs goes to the crawler.log file
-  :
+  logger: Logger.new(STDOUT),
  # Do not go deeper than 2 levels
-  :
+  depth_limit: 5,

  # Incremental download:
  # Set a ttl for each stored page
  # If a previous stored page is now expired, it will re-downloaded
  # Mark a page expired after 60s
-  :
+  ttl_page: 60
}

-starting_urls = [
+starting_urls = ['http://rubygems.org/gems']

# Crawl the entire rubygems's site
# Polipus.crawler('polipus-rubygems', starting_urls, options)
@@ -52,11 +52,11 @@ Polipus.crawler('polipus-rubygems', starting_urls, options) do |crawler|
  # In-place page processing
  crawler.on_page_downloaded do |page|
    # A nokogiri object
-    puts "Page title: #{page.doc.css('title').text}" rescue
+    puts "Page title: #{page.doc.css('title').text}" rescue 'ERROR'
  end

  # Do a nifty stuff at the end of the crawling session
-  crawler.on_crawl_end do
+  crawler.on_crawl_end do
    # Gong.bang(:loudly)
  end
end
data/examples/survival.rb
CHANGED
@@ -1,9 +1,9 @@
-require
+require 'polipus'

-Polipus.crawler(
+Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  # In-place page processing
  crawler.on_page_downloaded do |page|
    # A nokogiri object
    puts "Page title: '#{page.doc.css('title').text}' Page url: #{page.url}"
  end
-end
+end
data/lib/polipus.rb
CHANGED
@@ -1,21 +1,20 @@
# encoding: UTF-8
-require
-require
-require
-require
-require
-require
-require
-require
-require
-require
-require
-require
-require
-require
+require 'redis'
+require 'redis/connection/hiredis'
+require 'redis-queue'
+require 'polipus/version'
+require 'polipus/http'
+require 'polipus/storage'
+require 'polipus/url_tracker'
+require 'polipus/plugin'
+require 'polipus/queue_overflow'
+require 'polipus/robotex'
+require 'polipus/signal_handler'
+require 'thread'
+require 'logger'
+require 'json'

module Polipus
-
  def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
    PolipusCrawler.crawl(job_name, urls, options, &block)
  end
@@ -23,57 +22,61 @@ module Polipus
  class PolipusCrawler
    OPTS = {
      # run 4 threads
-      :
+      workers: 4,
      # identify self as Polipus/VERSION
-      :
+      user_agent: "Polipus - #{Polipus::VERSION} - #{Polipus::HOMEPAGE}",
      # by default, don't limit the depth of the crawl
-      :
+      depth_limit: false,
      # number of times HTTP redirects will be followed
-      :
-      # storage engine defaults to DevNull
-      :
-      # proxy server hostname
-      :
+      redirect_limit: 5,
+      # storage engine defaults to DevNull
+      storage: nil,
+      # proxy server hostname
+      proxy_host: nil,
      # proxy server port number
-      :
+      proxy_port: false,
      # HTTP read timeout in seconds
-      :
+      read_timeout: 30,
      # HTTP open connection timeout in seconds
-      :
+      open_timeout: 10,
      # Time to wait for new messages on Redis
      # After this timeout, current crawling session is marked as terminated
-      :
+      queue_timeout: 30,
      # An URL tracker instance. default is Bloomfilter based on redis
-      :
+      url_tracker: nil,
      # A Redis options {} that will be passed directly to Redis.new
-      :
+      redis_options: {},
      # An instance of logger
-      :
+      logger: nil,
      # A logger level
-      :
+      logger_level: nil,
      # whether the query string should be included in the saved page
-      :
+      include_query_string_in_saved_page: true,
      # Max number of items to keep on redis
-      :
+      queue_items_limit: 2_000_000,
      # The adapter used to store exceed (queue_items_limit) redis items
-      :
+      queue_overflow_adapter: nil,
      # Every x seconds, the main queue is checked for overflowed items
-      :
+      queue_overflow_manager_check_time: 60,
      # If true, each page downloaded will increment a counter on redis
-      :
+      stats_enabled: false,
      # Cookies strategy
-      :
-      # whether or not accept cookies
-      :
+      cookie_jar: nil,
+      # whether or not accept cookies
+      accept_cookies: false,
      # A set of hosts that should be considered parts of the same domain
      # Eg It can be used to follow links with and without 'www' domain
-      :
+      domain_aliases: [],
      # Mark a connection as staled after connection_max_hits request
-      :
+      connection_max_hits: nil,
      # Page TTL: mark a page as expired after ttl_page seconds
-      :
+      ttl_page: nil,
      # don't obey the robots exclusion protocol
-      :
+      obey_robots_txt: false,
+      # If true, signal handling strategy is enabled.
+      # INT and TERM signal will stop polipus gracefully
+      # Disable it if polipus will run as a part of Resque or DelayedJob-like system
+      enable_signal_handler: true
    }

    attr_reader :storage
@@ -82,7 +85,6 @@ module Polipus
    attr_reader :options
    attr_reader :crawler_name

-
    OPTS.keys.each do |key|
      define_method "#{key}=" do |value|
        @options[key.to_sym] = value
@@ -93,13 +95,12 @@ module Polipus
    end

    def initialize(job_name = 'polipus', urls = [], options = {})
-
      @job_name = job_name
      @options = OPTS.merge(options)
      @options[:queue_timeout] = 1 if @options[:queue_timeout] <= 0
      @logger = @options[:logger] ||= Logger.new(nil)
-
-      unless @logger.class.to_s ==
+
+      unless @logger.class.to_s == 'Log4r::Logger'
        @logger.level = @options[:logger_level] ||= Logger::INFO
      end

@@ -108,8 +109,7 @@ module Polipus
      @http_pool = []
      @workers_pool = []
      @queues_pool = []
-
-
+
      @follow_links_like = []
      @skip_links_like = []
      @on_page_downloaded = []
@@ -119,21 +119,19 @@ module Polipus
      @on_crawl_end = []
      @redis_factory = nil

-
      @overflow_manager = nil
      @crawler_name = `hostname`.strip + "-#{@job_name}"

      @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]

-      @urls = [urls].flatten.map{ |url| URI(url) }
-      @urls.each{ |url| url.path = '/' if url.path.empty? }
-      @internal_queue = queue_factory
+      @urls = [urls].flatten.map { |url| URI(url) }
+      @urls.each { |url| url.path = '/' if url.path.empty? }
      @robots = Polipus::Robotex.new(@options[:user_agent]) if @options[:obey_robots_txt]
-
+      # Attach signal handling if enabled
+      SignalHandler.enable if @options[:enable_signal_handler]
      execute_plugin 'on_initialize'

      yield self if block_given?
-
    end

    def self.crawl(*args, &block)
@@ -141,18 +139,17 @@ module Polipus
    end

    def takeover
-      PolipusSignalHandler.enable
      overflow_items_controller if queue_overflow_adapter

      @urls.each do |u|
        add_url(u) { |page| page.user_data.p_seeded = true }
      end
-      return if
+      return if internal_queue.empty?

      execute_plugin 'on_crawl_start'
      @options[:workers].times do |worker_number|
        @workers_pool << Thread.new do
-          @logger.debug {"Start worker #{worker_number}"}
+          @logger.debug { "Start worker #{worker_number}" }
          http = @http_pool[worker_number] ||= HTTP.new(@options)
          queue = @queues_pool[worker_number] ||= queue_factory
          queue.process(false, @options[:queue_timeout]) do |message|
@@ -164,75 +161,73 @@ module Polipus
            page = Page.from_json message

            unless should_be_visited?(page.url, false)
-              @logger.info {"[worker ##{worker_number}] Page (#{page.url
+              @logger.info { "[worker ##{worker_number}] Page (#{page.url}) is no more welcome." }
              queue.commit
              next
            end

            if page_exists? page
-              @logger.info {"[worker ##{worker_number}] Page (#{page.url
+              @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
              queue.commit
              next
            end
-
+
            url = page.url.to_s
-            @logger.debug {"[worker ##{worker_number}] Fetching page: [#{page.url
+            @logger.debug { "[worker ##{worker_number}] Fetching page: [#{page.url}] Referer: #{page.referer} Depth: #{page.depth}" }

            execute_plugin 'on_before_download'

            pages = http.fetch_pages(url, page.referer, page.depth)
            if pages.count > 1
              rurls = pages.map { |e| e.url.to_s }.join(' --> ')
-              @logger.info {"Got redirects! #{rurls}"}
+              @logger.info { "Got redirects! #{rurls}" }
              page = pages.pop
-              page.aliases = pages.
+              page.aliases = pages.map { |e| e.url }
              if page_exists? page
-                @logger.info {"[worker ##{worker_number}] Page (#{page.url
+                @logger.info { "[worker ##{worker_number}] Page (#{page.url}) already stored." }
                queue.commit
                next
              end
            else
              page = pages.last
            end
-
+
            execute_plugin 'on_after_download'
-
+
            if page.error
-              @logger.warn {"Page #{page.url} has error: #{page.error}"}
+              @logger.warn { "Page #{page.url} has error: #{page.error}" }
              incr_error
-              @on_page_error.each {|e| e.call(page)}
+              @on_page_error.each { |e| e.call(page) }
            end

            # Execute on_before_save blocks
-            @on_before_save.each {|e| e.call(page)}
+            @on_before_save.each { |e| e.call(page) }

-
-
-
-
-            @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
-            @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
+            page.storable? && @storage.add(page)
+
+            @logger.debug { "[worker ##{worker_number}] Fetched page: [#{page.url}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]" }
+            @logger.info { "[worker ##{worker_number}] Page (#{page.url}) downloaded" }

            incr_pages

            # Execute on_page_downloaded blocks
-            @on_page_downloaded.each {|e| e.call(page)}
+            @on_page_downloaded.each { |e| e.call(page) }

-            if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
+            if @options[:depth_limit] == false || @options[:depth_limit] > page.depth
              links_for(page).each do |url_to_visit|
                next unless should_be_visited?(url_to_visit)
                enqueue url_to_visit, page, queue
              end
            else
-              @logger.info {"[worker ##{worker_number}] Depth limit reached #{page.depth}"}
+              @logger.info { "[worker ##{worker_number}] Depth limit reached #{page.depth}" }
            end

-            @logger.debug {"[worker ##{worker_number}] Queue size: #{queue.size}"}
+            @logger.debug { "[worker ##{worker_number}] Queue size: #{queue.size}" }
            @overflow_manager.perform if @overflow_manager && queue.empty?
            execute_plugin 'on_message_processed'

-            if
-              @logger.info {
+            if SignalHandler.terminated?
+              @logger.info { 'About to exit! Thanks for using Polipus' }
              queue.commit
              break
            end
@@ -240,11 +235,11 @@ module Polipus
          end
        end
      end
-      @workers_pool.each {|w| w.join}
-      @on_crawl_end.each {|e| e.call(self)}
+      @workers_pool.each { |w| w.join }
+      @on_crawl_end.each { |e| e.call(self) }
      execute_plugin 'on_crawl_end'
    end
-
+
    # A pattern or an array of patterns can be passed as argument
    # An url will be discarded if it doesn't match patterns
    def follow_links_like(*patterns)
@@ -298,11 +293,11 @@ module Polipus
    end

    def queue_size
-
+      internal_queue.size
    end

    def stats_reset!
-      ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each {|e| redis.del e}
+      ["polipus:#{@job_name}:errors", "polipus:#{@job_name}:pages"].each { |e| redis.del e }
    end

    def redis_factory(&block)
@@ -313,9 +308,9 @@ module Polipus
    def url_tracker
      @url_tracker ||=
        @options[:url_tracker] ||=
-          UrlTracker.bloomfilter(:
-          :
-          :
+          UrlTracker.bloomfilter(key_name: "polipus_bf_#{job_name}",
+                                 redis: redis_factory_adapter,
+                                 driver: 'lua')
    end

    def redis
@@ -334,176 +329,138 @@ module Polipus
    def add_url(url, params = {})
      page = Page.new(url, params)
      yield(page) if block_given?
-
+      internal_queue << page.to_json
    end

    # Request to Polipus to stop its work (gracefully)
    # cler_queue = true if you want to delete all of the pending urls to visit
    def stop!(cler_queue = false)
-
-
+      SignalHandler.terminate
+      internal_queue.clear(true) if cler_queue
    end

    private
-    # URLs enqueue policy
-    def should_be_visited?(url, with_tracker = true)
-
-      case
-      # robots.txt
-      when !allowed_by_robot?(url)
-        false
-      # Check against whitelist pattern matching
-      when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
-        false
-      # Check against blacklist pattern matching
-      when @skip_links_like.any?{ |p| url.path =~ p }
-        false
-      # Page is marked as expired
-      when page_expired?(Page.new(url))
-        true
-      # Check against url tracker
-      when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
-        false
-      else
-        true
-      end
-    end
-
-    # It extracts URLs from the page
-    def links_for page
-      page.domain_aliases = domain_aliases
-      @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
-    end
-
-    # whether a page is expired or not
-    def page_expired? page
-      return false if @options[:ttl_page].nil?
-      stored_page = @storage.get(page)
-      r = stored_page && stored_page.expired?(@options[:ttl_page])
-      @logger.debug {"Page #{page.url.to_s} marked as expired"} if r
-      r
-    end
-
-    # whether a page exists or not
-    def page_exists? page
-      return false if page.user_data && page.user_data.p_seeded
-      @storage.exists?(page) && !page_expired?(page)
-    end

-
-
-
-    #
-
-
-
-
+    # URLs enqueue policy
+    def should_be_visited?(url, with_tracker = true)
+      case
+      # robots.txt
+      when !allowed_by_robot?(url)
+        false
+      # Check against whitelist pattern matching
+      when !@follow_links_like.empty? && @follow_links_like.none? { |p| url.path =~ p }
+        false
+      # Check against blacklist pattern matching
+      when @skip_links_like.any? { |p| url.path =~ p }
+        false
+      # Page is marked as expired
+      when page_expired?(Page.new(url))
+        true
+      # Check against url tracker
+      when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/, ''))
+        false
+      else
+        true
      end
+    end

+    # It extracts URLs from the page
+    def links_for(page)
+      page.domain_aliases = domain_aliases
+      @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
+    end

-
-
-
-
-
-
-
-
+    # whether a page is expired or not
+    def page_expired?(page)
+      return false if @options[:ttl_page].nil?
+      stored_page = @storage.get(page)
+      r = stored_page && stored_page.expired?(@options[:ttl_page])
+      @logger.debug { "Page #{page.url} marked as expired" } if r
+      r
+    end

-
-
-
-
-
-        Redis.new(redis_options)
-      end
-    end
+    # whether a page exists or not
+    def page_exists?(page)
+      return false if page.user_data && page.user_data.p_seeded
+      @storage.exists?(page) && !page_expired?(page)
+    end

-
-
-
-
+    #
+    # Returns +true+ if we are obeying robots.txt and the link
+    # is granted access in it. Always returns +true+ when we are
+    # not obeying robots.txt.
+    #
+    def allowed_by_robot?(link)
+      return true if @robots.nil?
+      @options[:obey_robots_txt] ? @robots.allowed?(link) : true
+    end

-
-
-
-
+    # The url is enqueued for a later visit
+    def enqueue(url_to_visit, current_page, queue)
+      page_to_visit = Page.new(url_to_visit.to_s, referer: current_page.url.to_s, depth: current_page.depth + 1)
+      queue << page_to_visit.to_json
+      to_track = @options[:include_query_string_in_saved_page] ? url_to_visit.to_s : url_to_visit.to_s.gsub(/\?.*$/, '')
+      url_tracker.visit to_track
+      @logger.debug { "Added [#{url_to_visit}] to the queue" }
+    end

-
-
-
+    # It creates a redis client
+    def redis_factory_adapter
+      if @redis_factory
+        @redis_factory.call(redis_options)
+      else
+        Redis.new(redis_options)
      end
+    end

-
-
-
-
-      # In the time, url policy may change so policy is re-evaluated
-      @overflow_manager.url_filter do |page|
-        should_be_visited?(page.url, false)
-      end
+    # It creates a new distributed queue
+    def queue_factory
+      Redis::Queue.new("polipus_queue_#{@job_name}", "bp_polipus_queue_#{@job_name}", redis: redis_factory_adapter)
+    end

-
-
-
-
+    # If stats enabled, it increments errors found
+    def incr_error
+      redis.incr "polipus:#{@job_name}:errors" if @options[:stats_enabled]
+    end

-
-
+    # If stats enabled, it increments pages downloaded
+    def incr_pages
+      redis.incr "polipus:#{@job_name}:pages" if @options[:stats_enabled]
+    end

-
-
-
-            @logger.info {"Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}"}
-            redis_lock.del "polipus_queue_overflow-#{@job_name}.lock"
-          else
-            @logger.info {"Lock not acquired"}
-          end
+    # It handles the overflow item policy (if any)
+    def overflow_items_controller
+      @overflow_manager = QueueOverflow::Manager.new(self, queue_factory, @options[:queue_items_limit])

-
-
-
+      # In the time, url policy may change so policy is re-evaluated
+      @overflow_manager.url_filter do |page|
+        should_be_visited?(page.url, false)
      end

-
-    def execute_plugin method
+      Thread.new do

-
-
-
-
-
-      end
+        loop do
+          @logger.info { 'Overflow Manager: cycle started' }
+          removed, restored = @overflow_manager.perform
+          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{queue_overflow_adapter.size}" }
+          sleep @options[:queue_overflow_manager_check_time]
        end
-    end

-
-
-  class PolipusSignalHandler
-    include Singleton
-    attr_accessor :terminated
-    def initialize
-      self.terminated = false
-    end
-
-    def self.enable
-      trap(:INT) {
-        puts "Got INT signal"
-        self.terminate
-      }
-      trap(:TERM) {
-        puts "Got TERM signal"
-        self.terminate
-      }
+      end
    end

-    def
-
+    def internal_queue
+      @internal_queue ||= queue_factory
    end

-
-
+    # It invokes a plugin method if any
+    def execute_plugin(method)
+      Polipus::Plugin.plugins.each do |k, p|
+        next unless p.respond_to?(method)
+        @logger.info { "Running plugin method #{method} on #{k}" }
+        ret_val = p.send(method, self)
+        instance_eval(&ret_val) if ret_val.kind_of? Proc
+      end
    end
  end
-
end
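One consequence of the signal-handling rework visible in this diff is that a graceful shutdown can also be requested from code: stop! now delegates to SignalHandler.terminate and, on request, clears the lazily built internal_queue (the #38 fix). A short sketch of how that might look from a callback; the counter and the limit of 100 pages are illustrative, not part of the gem:

```ruby
require 'polipus'

Polipus.crawler('rubygems', 'http://rubygems.org/') do |crawler|
  pages_seen = 0 # hypothetical counter, just for this sketch (not synchronized across workers)

  crawler.on_page_downloaded do |page|
    pages_seen += 1
    # stop!(true) flips the shared termination flag that every worker checks
    # after each processed message, and also drops the pending URLs queued in Redis.
    crawler.stop!(true) if pages_seen >= 100
  end
end
```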