parallel588_polipus 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Plugin
    # Plugin that wipes all crawler state (URL tracker, storage, queue and,
    # when configured, the overflow adapter) at startup. It is a no-op
    # unless :reset => true is passed, as a safety latch.
    class Cleaner
      # options[:reset] must be explicitly truthy to arm the cleaner.
      # Note: the ||= intentionally writes :reset => false back into the
      # caller's options hash, matching the plugin's historical behaviour.
      def initialize(options = {})
        @reset = options[:reset] ||= false
      end

      # Called by the crawler during boot. Returns nil when disabled, or a
      # proc — presumably instance_eval'd in the crawler's context (it uses
      # the crawler's url_tracker/storage/queue_factory/@options) — that
      # clears every persistent store.
      def on_initialize(crawler)
        crawler.logger.info { 'Cleaner plugin loaded' }
        if @reset
          crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
          proc do
            url_tracker.clear
            storage.clear
            queue_factory.clear
            @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
          end
        else
          crawler.logger.info { 'Cleaner plugin is disabled, add :reset => true to the plugin if you really know what you are doing' }
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Plugin
    # Minimal example plugin: logs every crawler configuration entry.
    class Sample
      def initialize(_options = {}); end

      # Returns a proc for the crawler to run in its own context —
      # inside it, @options and @logger belong to the crawler, not to
      # this plugin instance.
      def on_initialize(_crawler)
        proc { @options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } } }
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Plugin
    # Throttling plugin: forces the crawler down to a single worker and
    # pauses after every processed message.
    class Sleeper
      # options[:delay] — seconds to sleep after each request (default 1).
      # The ||= deliberately writes the default back into the caller's hash.
      def initialize(options = {})
        @delay = options[:delay] ||= 1
      end

      # Returns a proc (presumably instance_eval'd by the crawler) that
      # pins the worker count so the per-message delay is globally effective.
      def on_initialize(crawler)
        crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
        # Set to 1 the number of threads
        proc { @options[:workers] = 1 }
      end

      # Crawler callback: pause between messages.
      def on_message_processed(_crawler)
        sleep @delay
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'polipus/queue_overflow/manager'
|
3
|
+
require 'polipus/queue_overflow/worker'
|
4
|
+
module Polipus
  # Factory helpers for overflow-queue backends. When the main queue grows
  # past its limit, excess messages are spilled into one of these adapters
  # (see QueueOverflow::Manager / QueueOverflow::Worker).
  module QueueOverflow
    # Build a FIFO MongoDB-backed overflow queue.
    # mongo_db may be nil, in which case a default localhost:27017
    # connection to the 'polipus' db is created. Raises unless the result
    # is a Mongo::DB (legacy mongo driver API — TODO confirm driver version).
    def self.mongo_queue(mongo_db, queue_name, options = {})
      require 'polipus/queue_overflow/mongo_queue'
      mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
      fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
      self::MongoQueue.new mongo_db, queue_name, options
    end

    # Same as .mongo_queue but size-capped: once options[:max] documents
    # (default 1,000,000) are stored, the oldest entries get trimmed.
    def self.mongo_queue_capped(mongo_db, queue_name, options = {})
      require 'polipus/queue_overflow/mongo_queue_capped'
      mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
      fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
      options[:max] = 1_000_000 if options[:max].nil?
      self::MongoQueueCapped.new mongo_db, queue_name, options
    end

    # Overflow queue that silently discards everything (no persistence).
    def self.dev_null_queue(_options = {})
      require 'polipus/queue_overflow/dev_null_queue'
      self::DevNullQueue.new
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'thread'
|
3
|
+
module Polipus
  module QueueOverflow
    # A no-op overflow queue: everything pushed into it is discarded and it
    # always reports itself as empty. Useful when overflowed URLs should
    # simply be dropped rather than persisted anywhere.
    class DevNullQueue
      def initialize; end

      # There are never any items.
      def length
        0
      end
      alias_method :size, :length

      # Always empty by definition.
      def empty?
        true
      end

      # Nothing to remove.
      def clear
        nil
      end

      # Swallow the data without storing it.
      def push(_data)
        nil
      end
      alias_method :enc, :push
      alias_method :<<, :push

      # There is never anything to hand back. The Boolean argument is
      # accepted only for Queue#pop signature compatibility.
      def pop(_ = false)
        nil
      end
      alias_method :dec, :pop
      alias_method :shift, :pop
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module QueueOverflow
    # Keeps the main (Redis-backed) queue close to item_limit by shuttling
    # messages to/from the configured overflow adapter.
    class Manager
      # NOTE(review): the reader generated by this accessor is shadowed by
      # the block-taking #url_filter method below, so calling url_filter
      # without a block resets the filter to nil.
      attr_accessor :url_filter
      attr_reader :polipus
      def initialize(polipus, main_q, item_limit)
        @polipus = polipus
        @main_q = main_q
        @adapter = @polipus.queue_overflow_adapter
        @item_limit = item_limit
        @redis = @polipus.redis
      end

      # Install a predicate (page -> truthy/falsy) deciding whether a
      # message may be transferred; rejected pages are dropped.
      def url_filter(&block)
        @url_filter = block
      end

      # Run one balancing cycle:
      # - main queue above the limit  -> offload excess into the adapter
      # - main queue below the limit  -> restore items from the adapter
      # Returns [removed, restored] counters.
      def perform
        removed = 0
        restored = 0
        main_q_size = @main_q.size
        if main_q_size > @item_limit
          @polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
          removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
        elsif main_q_size < @item_limit && !@adapter.empty?
          @polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
          restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
        end
        [removed, restored]
      end

      private

      # Drain messages from source into dest while the caller's block stays
      # truthy. Messages whose page already exists in storage, or which the
      # url_filter rejects, are popped but NOT transferred (i.e. dropped).
      # Returns the number of messages actually moved.
      def rotate(source, dest)
        performed = 0
        loop do
          # pop(true) — presumably a non-blocking pop; nil/false when the
          # source is exhausted. TODO confirm adapter semantics.
          message = source.pop(true)
          if message
            page = Page.from_json message
            unless @polipus.storage.exists?(page)
              allowed = @url_filter.nil? ? true : @url_filter.call(page)
              if allowed
                dest << message
                performed += 1
              end
            end
          end
          # Some queue implementations require an explicit commit after a
          # pop; honoured only when the source supports it.
          source.commit if source.respond_to? :commit
          break if !message || source.empty?
          break unless yield source, dest
        end
        performed
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'thread'
|
3
|
+
require 'mongo'
|
4
|
+
module Polipus
  module QueueOverflow
    # Overflow queue persisted in a MongoDB collection named
    # "polipus_q_overflow_<queue_name>". Messages are stored as
    # { payload: <data> } documents and popped in insertion (_id) order.
    class MongoQueue
      def initialize(mongo_db, queue_name, options = {})
        @mongo_db = mongo_db
        @collection_name = "polipus_q_overflow_#{queue_name}"
        # Guards pop: find + remove is not atomic, so serialize within
        # this process (does not protect against other processes).
        @semaphore = Mutex.new
        @options = options
        @options[:ensure_uniq] ||= false
        # With :ensure_uniq, a unique index on payload de-duplicates pushes.
        @options[:ensure_uniq] && ensure_index
      end

      # Number of queued documents.
      def length
        @mongo_db[@collection_name].count
      end

      def empty?
        !(length > 0)
      end

      # Drop the whole collection (re-creating the unique index if needed).
      def clear
        @mongo_db[@collection_name].drop
        @options[:ensure_uniq] && ensure_index
      end

      # Append data. With :ensure_uniq this is an upsert keyed on payload,
      # otherwise a plain insert. Always returns true.
      def push(data)
        if @options[:ensure_uniq]
          @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
        else
          @mongo_db[@collection_name].insert(payload: data)
        end
        true
      end

      # Remove and return the oldest payload, or nil when the queue is
      # empty. The Boolean argument is ignored (Queue#pop compatibility).
      def pop(_ = false)
        @semaphore.synchronize do
          doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
          return nil if doc.nil?
          @mongo_db[@collection_name].remove(_id: doc['_id'])
          doc && doc['payload'] ? doc['payload'] : nil
        end
      end

      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push

      protected

      # NOTE(review): ensure_index and drop_dups are legacy (mongo gem 1.x)
      # driver APIs — confirm the installed driver still supports them.
      def ensure_index
        @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'polipus/queue_overflow/mongo_queue'
|
3
|
+
module Polipus
  module QueueOverflow
    # MongoQueue variant with a maximum size: after each push, the oldest
    # documents beyond options[:max] are deleted (FIFO trimming).
    class MongoQueueCapped < MongoQueue
      def initialize(mongo_db, queue_name, options = {})
        super
        # Maximum number of documents kept in the collection.
        @max = @options[:max]
      end

      # Push via MongoQueue#push, then trim any overflow, oldest first.
      # NOTE(review): the size check runs after the insert and outside any
      # cross-process lock, so the cap is approximate under concurrency.
      def push(data)
        super
        @semaphore.synchronize do
          s = size
          if s > @max
            docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
            # '$isolated' => 1 — legacy MongoDB operator intended to keep the
            # multi-document remove from interleaving with other writes;
            # TODO confirm it is still honoured by the target server.
            @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
          end
        end
      end

      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module QueueOverflow
    # Background loop that periodically runs the overflow Manager, moving
    # items between the main queue and the overflow adapter.
    class Worker
      def initialize(manager)
        @logger = manager.polipus.logger
        # Seconds to wait between two overflow-management cycles.
        @delay = manager.polipus.options[:queue_overflow_manager_check_time]
        @adapter = manager.polipus.queue_overflow_adapter
        @manager = manager
      end

      # Run the management loop until the crawler is signalled to stop.
      # Blocks the calling thread.
      def run
        @logger.info { 'Overflow::Worker::run' }
        loop do
          @logger.info { 'Overflow Manager: cycle started' }
          removed, restored = @manager.perform
          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
          # NOTE(review): the sleep happens before the termination check,
          # so shutdown can lag by up to @delay seconds.
          sleep @delay
          break if SignalHandler.terminated?
        end
      end
    end
  end
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'open-uri'
|
3
|
+
require 'uri'
|
4
|
+
require 'timeout'
|
5
|
+
module Polipus
  # Original code taken from
  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
  #
  # Fetches, caches (per host) and evaluates robots.txt files.
  class Robotex
    DEFAULT_TIMEOUT = 3
    VERSION = '1.0.0'

    attr_reader :user_agent

    # Parsed representation of a single host's robots.txt.
    class ParsedRobots
      # Fetch and parse robots.txt for +uri+. Falls back to an
      # "allow everything" document when the file is missing, not
      # text/plain, or not served with a 200 OK.
      def initialize(uri, user_agent)
        io = Robotex.get_robots_txt(uri, user_agent)
        if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
          io = StringIO.new("User-agent: *\nAllow: /\n")
        end

        @disallows = {}
        @allows = {}
        @delays = {}
        agent = /.*/
        io.each do |line|
          next if line =~ /^\s*(#.*|$)/ # skip comments and blank lines
          arr = line.split(':')
          key = arr.shift
          # Re-join with ':' so values that contain colons survive.
          value = arr.join(':').strip
          case key.downcase
          when 'user-agent'
            agent = to_regex(value)
          when 'allow'
            unless value.empty?
              @allows[agent] ||= []
              @allows[agent] << to_regex(value)
            end
          when 'disallow'
            unless value.empty?
              @disallows[agent] ||= []
              @disallows[agent] << to_regex(value)
            end
          when 'crawl-delay'
            @delays[agent] = value.to_i
          end
        end
        @parsed = true
      end

      # True if +user_agent+ may fetch +uri+.
      #
      # BUGFIX: Disallow rules are applied first and Allow rules second,
      # so a more specific Allow can re-permit a path inside a disallowed
      # subtree (matching the upstream robotex behaviour). Previously the
      # Allow pass ran first, while `allowed` was still true, making its
      # `unless allowed` guard dead code — Allow directives never applied.
      def allowed?(uri, user_agent)
        return true unless @parsed
        allowed = true
        uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
        path = uri.request_uri

        @disallows.each do |key, value|
          if user_agent =~ key
            value.each do |rule|
              allowed = false if path =~ rule
            end
          end
        end

        @allows.each do |key, value|
          next if allowed
          if user_agent =~ key
            value.each do |rule|
              allowed = true if path =~ rule
            end
          end
        end

        allowed
      end

      # Crawl-Delay (seconds) applying to +user_agent+, or nil when none.
      def delay(user_agent)
        @delays.each do |agent, delay|
          return delay if agent =~ user_agent
        end
        nil
      end

      protected

      # Translate a robots.txt path pattern into a prefix-anchored Regexp;
      # '*' is the only supported wildcard.
      def to_regex(pattern)
        pattern = Regexp.escape(pattern)
        pattern.gsub!(Regexp.escape('*'), '.*')
        Regexp.compile("^#{pattern}")
      end
    end

    # Download /robots.txt for +uri+'s host. Returns nil on any fetch
    # error, or after Robotex.timeout seconds (logging to STDERR).
    def self.get_robots_txt(uri, user_agent)
      Timeout.timeout(Robotex.timeout) do
        URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
      end
    rescue Timeout::Error
      STDERR.puts 'robots.txt request timed out'
    end

    class << self
      # Override the robots.txt fetch timeout (seconds).
      attr_writer :timeout
    end

    def self.timeout
      @timeout || DEFAULT_TIMEOUT
    end

    def initialize(user_agent = nil)
      user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
      @user_agent = user_agent
      @last_accessed = Time.at(1)
      @parsed = {}
    end

    # Parse (and memoize, one entry per host) the robots.txt for +uri+.
    def parse_host(uri)
      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
      @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
    end

    #
    # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
    #
    def allowed?(uri)
      parse_host(uri).allowed?(uri, @user_agent)
    end

    #
    # Return the value of the Crawl-Delay directive, or nil if none
    def delay(uri)
      parse_host(uri).delay(@user_agent)
    end

    #
    # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
    #
    def delay!(uri)
      wait = delay(uri)
      if wait
        pause = wait - (Time.now - @last_accessed)
        # BUGFIX: sleep raises ArgumentError on negative durations, which
        # the old code hit on the very first call (@last_accessed starts
        # at Time.at(1)); only sleep when a positive pause remains.
        sleep pause if pause > 0
      end
      @last_accessed = Time.now
    end
  end
end
|