parallel588_polipus 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Plugin
    # Plugin that clears the url tracker, the storage and the queue when the
    # crawler starts. It does nothing unless it was built with `reset: true`.
    class Cleaner
      # @param options [Hash] plugin options; only `:reset` is read
      def initialize(options = {})
        # Read the flag without writing a default back, so a shared or frozen
        # options hash supplied by the caller is left untouched.
        @reset = options[:reset] || false
      end

      # Start-up hook.
      #
      # @param crawler [#logger] object whose logger accepts `info { message }`
      # @return [Proc, nil] nil when the plugin is disabled, otherwise a proc
      #   performing the clean-up
      def on_initialize(crawler)
        crawler.logger.info { 'Cleaner plugin loaded' }
        unless @reset
          crawler.logger.info { 'Cleaner plugin is disabled, add :reset => true to the plugin if you really know what you are doing' }
          return nil
        end
        crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
        # NOTE(review): the proc refers to url_tracker, storage, queue_factory
        # and @options, none of which exist on this class; presumably the
        # crawler evaluates the proc in its own context - confirm at the caller.
        proc do
          url_tracker.clear
          storage.clear
          queue_factory.clear
          @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Plugin
    # Minimal example plugin: on start-up it returns a proc that logs every
    # configuration entry.
    class Sample
      # @param _options [Hash] accepted for interface compatibility, unused
      def initialize(_options = {})
      end

      # Start-up hook.
      #
      # @param _crawler [Object] unused
      # @return [Proc] proc that logs each key/value pair of @options
      def on_initialize(_crawler)
        # NOTE(review): @options and @logger are not set on this class;
        # presumably the crawler evaluates the proc in its own context -
        # confirm at the caller.
        proc do
          @options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
        end
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Plugin
    # Plugin that pauses after every processed message and forces the worker
    # count to 1.
    class Sleeper
      # @param options [Hash] plugin options; `:delay` is the pause handed to
      #   `sleep` (defaults to 1)
      def initialize(options = {})
        # Read the delay without writing the default back into the caller's
        # hash, which may be shared or frozen.
        @delay = options[:delay] || 1
      end

      # Start-up hook.
      #
      # @param crawler [#logger] object whose logger accepts `info { message }`
      # @return [Proc] proc that sets `@options[:workers]` to 1
      def on_initialize(crawler)
        crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
        # NOTE(review): @options is not set on this class; presumably the
        # crawler evaluates the proc in its own context - confirm at the caller.
        proc do
          # Set to 1 the number of threads
          @options[:workers] = 1
        end
      end

      # Hook called after a message has been processed; sleeps for the
      # configured delay.
      #
      # @param _crawler [Object] unused
      def on_message_processed(_crawler)
        sleep @delay
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'polipus/queue_overflow/manager'
|
3
|
+
require 'polipus/queue_overflow/worker'
|
4
|
+
module Polipus
  # Factory methods for the queue-overflow adapters. Each adapter file is
  # required lazily, inside the factory that needs it.
  module QueueOverflow
    # Builds a MongoQueue.
    #
    # @param mongo_db [Mongo::DB, nil] database handle; when nil a connection
    #   to localhost:27017, database 'polipus', is opened
    # @param queue_name [String] handed to the adapter
    # @param options [Hash] handed to the adapter
    # @raise [RuntimeError] if mongo_db is neither nil nor a Mongo::DB
    def self.mongo_queue(mongo_db, queue_name, options = {})
      require 'polipus/queue_overflow/mongo_queue'
      self::MongoQueue.new resolve_mongo_db(mongo_db), queue_name, options
    end

    # Builds a MongoQueueCapped; `:max` defaults to 1_000_000 when absent.
    #
    # @param mongo_db [Mongo::DB, nil] see {mongo_queue}
    # @param queue_name [String] handed to the adapter
    # @param options [Hash] handed to the adapter (as a copy)
    # @raise [RuntimeError] if mongo_db is neither nil nor a Mongo::DB
    def self.mongo_queue_capped(mongo_db, queue_name, options = {})
      require 'polipus/queue_overflow/mongo_queue_capped'
      db = resolve_mongo_db(mongo_db)
      # Work on a copy so the default is not written into the caller's hash.
      capped_options = options.dup
      capped_options[:max] = 1_000_000 if capped_options[:max].nil?
      self::MongoQueueCapped.new db, queue_name, capped_options
    end

    # Builds a DevNullQueue.
    #
    # @param _options [Hash] accepted for interface symmetry, unused
    def self.dev_null_queue(_options = {})
      require 'polipus/queue_overflow/dev_null_queue'
      self::DevNullQueue.new
    end

    # Returns the given handle, or opens the default connection when it is
    # nil, and checks that the result is a Mongo::DB.
    def self.resolve_mongo_db(mongo_db)
      mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
      raise 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
      mongo_db
    end
    private_class_method :resolve_mongo_db
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'thread'
|
3
|
+
module Polipus
  module QueueOverflow
    # Overflow adapter that stores nothing: pushes are discarded, pops return
    # nil, and the queue always reports itself as empty.
    class DevNullQueue
      # @return [Integer] always 0
      def length
        0
      end

      # @return [Boolean] always true
      def empty?
        true
      end

      # No-op.
      def clear
      end

      # Discards the given data.
      #
      # @param _data [Object] ignored
      def push(_data)
      end

      # @param _ [Boolean] ignored
      # @return [nil] always
      def pop(_ = false)
        nil
      end

      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push
    end
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module QueueOverflow
    # Moves messages between the main queue and the overflow adapter so that
    # the main queue stays around a configured item limit.
    class Manager
      attr_writer :url_filter
      attr_reader :polipus

      # @param polipus [Object] crawler; must respond to
      #   queue_overflow_adapter, redis, storage and logger
      # @param main_q [Object] main queue (size, pop, <<, empty?)
      # @param item_limit [Integer] target size of the main queue
      def initialize(polipus, main_q, item_limit)
        @polipus = polipus
        @main_q = main_q
        @adapter = @polipus.queue_overflow_adapter
        @item_limit = item_limit
        @redis = @polipus.redis
      end

      # Sets the filter when a block is given and returns the current filter.
      # Called without a block it only reads; it no longer resets the filter
      # to nil.
      #
      # @yieldparam page [Page] page rebuilt from the queued message
      # @yieldreturn [Boolean] whether the message may be moved
      # @return [Proc, nil] the current filter
      def url_filter(&block)
        @url_filter = block if block
        @url_filter
      end

      # Runs one balancing pass: offloads to the adapter when the main queue
      # is above the limit, restores from the adapter when it is below the
      # limit and the adapter is not empty.
      #
      # @return [Array(Integer, Integer)] [removed, restored]
      def perform
        removed = 0
        restored = 0
        main_q_size = @main_q.size
        if main_q_size > @item_limit
          @polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
          removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
        elsif main_q_size < @item_limit && !@adapter.empty?
          @polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
          restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
        end
        [removed, restored]
      end

      private

      # Pops messages from source and appends them to dest until source runs
      # dry or the block returns a falsy value. Messages whose page already
      # exists in storage, or that the url filter rejects, are dropped.
      #
      # @return [Integer] number of messages appended to dest
      def rotate(source, dest)
        performed = 0
        loop do
          message = source.pop(true)
          if message
            page = Page.from_json message
            unless @polipus.storage.exists?(page)
              allowed = @url_filter.nil? ? true : @url_filter.call(page)
              if allowed
                dest << message
                performed += 1
              end
            end
          end
          # Some sources expose a commit step; call it when present.
          source.commit if source.respond_to? :commit
          break if !message || source.empty?
          break unless yield source, dest
        end
        performed
      end
    end
  end
end
|
@@ -0,0 +1,60 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'thread'
|
3
|
+
require 'mongo'
|
4
|
+
module Polipus
  module QueueOverflow
    # Overflow adapter that keeps messages as `{ payload: ... }` documents in
    # the collection "polipus_q_overflow_<queue_name>".
    class MongoQueue
      # @param mongo_db [Object] database handle; indexed with the collection
      #   name to obtain the collection
      # @param queue_name [String] suffix of the collection name
      # @param options [Hash] `:ensure_uniq` (default false) turns pushes into
      #   upserts and creates a unique index on payload
      def initialize(mongo_db, queue_name, options = {})
        @mongo_db = mongo_db
        @collection_name = "polipus_q_overflow_#{queue_name}"
        @semaphore = Mutex.new
        # Copy before applying defaults so the caller's hash is not modified.
        @options = options.dup
        @options[:ensure_uniq] ||= false
        ensure_index if @options[:ensure_uniq]
      end

      # @return [Integer] result of the collection's count
      def length
        collection.count
      end

      # @return [Boolean] true when the collection holds no documents
      def empty?
        length.zero?
      end

      # Drops the collection and, when uniqueness is enabled, recreates the
      # index.
      def clear
        collection.drop
        ensure_index if @options[:ensure_uniq]
      end

      # Stores a message.
      #
      # @param data [Object] payload to store
      # @return [true]
      def push(data)
        if @options[:ensure_uniq]
          collection.update({ payload: data }, { payload: data }, { upsert: true, w: 1 })
        else
          collection.insert(payload: data)
        end
        true
      end

      # Removes and returns the payload of the document with the smallest _id.
      # The find-then-remove pair is guarded by a mutex owned by this instance.
      #
      # @param _ [Boolean] ignored
      # @return [Object, nil] payload, or nil when there is no document
      def pop(_ = false)
        @semaphore.synchronize do
          doc = collection.find({}, sort: { _id: 1 }).limit(1).first
          next nil if doc.nil?
          collection.remove(_id: doc['_id'])
          doc['payload'] || nil
        end
      end

      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push

      protected

      # Looks the collection up on every call.
      def collection
        @mongo_db[@collection_name]
      end

      # Requests a unique index on payload.
      # NOTE(review): option semantics (background, drop_dups) depend on the
      # mongo driver version in use - confirm.
      def ensure_index
        collection.ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'polipus/queue_overflow/mongo_queue'
|
3
|
+
module Polipus
  module QueueOverflow
    # MongoQueue variant that trims the collection back to `:max` documents
    # after every push, removing the documents with the smallest _id first.
    class MongoQueueCapped < MongoQueue
      # @param mongo_db [Object] see MongoQueue
      # @param queue_name [String] see MongoQueue
      # @param options [Hash] MongoQueue options plus `:max`, the document
      #   limit; falls back to 1_000_000 (the value the factory in this
      #   package uses) so a direct `new` without :max does not fail on push
      def initialize(mongo_db, queue_name, options = {})
        super
        @max = @options[:max] || 1_000_000
      end

      # Stores a message, then removes the oldest documents beyond the limit.
      #
      # @param data [Object] payload to store
      # @return [true] same as MongoQueue#push
      def push(data)
        super
        @semaphore.synchronize do
          overflow = size - @max
          if overflow > 0
            stale_ids = @mongo_db[@collection_name]
                        .find({}, { sort: { _id: 1 }, fields: [:_id] })
                        .limit(overflow)
                        .map { |e| e['_id'] }
            # NOTE(review): '$isolated' support depends on the MongoDB server
            # version - confirm it is still accepted by the target deployment.
            @mongo_db[@collection_name].remove(:_id => { '$in' => stale_ids }, '$isolated' => 1)
          end
        end
        true
      end

      # alias_method binds to the method body that exists when it runs, so
      # the push aliases must be repeated here to reach the overriding push.
      alias_method :size, :length
      alias_method :dec, :pop
      alias_method :shift, :pop
      alias_method :enc, :push
      alias_method :<<, :push
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module QueueOverflow
    # Runs the overflow manager in a loop, sleeping between passes, until
    # SignalHandler reports termination.
    class Worker
      # @param manager [Object] must respond to `perform` and `polipus`; the
      #   logger, the check interval
      #   (`options[:queue_overflow_manager_check_time]`) and the overflow
      #   adapter are read from `manager.polipus`
      def initialize(manager)
        @logger = manager.polipus.logger
        @delay = manager.polipus.options[:queue_overflow_manager_check_time]
        @adapter = manager.polipus.queue_overflow_adapter
        @manager = manager
      end

      # Performs at least one pass. Termination is checked after the sleep,
      # so the loop ends only once a full cycle has completed.
      def run
        @logger.info { 'Overflow::Worker::run' }
        loop do
          @logger.info { 'Overflow Manager: cycle started' }
          removed, restored = @manager.perform
          @logger.info { "Overflow Manager: items removed=#{removed}, items restored=#{restored}, items stored=#{@adapter.size}" }
          sleep @delay
          break if SignalHandler.terminated?
        end
      end
    end
  end
end
|
@@ -0,0 +1,145 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'open-uri'
|
3
|
+
require 'uri'
|
4
|
+
require 'timeout'
|
5
|
+
module Polipus
  # Original code taken from
  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb

  # robots.txt client: fetches and parses one robots.txt per host and answers
  # "may this user agent fetch this URL" and "what Crawl-Delay applies".
  class Robotex
    DEFAULT_TIMEOUT = 3
    VERSION = '1.0.0'

    attr_reader :user_agent

    # Parsed rules of a single host's robots.txt.
    class ParsedRobots
      # Fetches and parses robots.txt for the host of uri. When the fetch
      # fails, or the response is not a 200 text/plain document, an
      # allow-everything policy is used instead.
      #
      # @param uri [URI, String] any URL on the target host
      # @param user_agent [String] sent as User-Agent when fetching
      def initialize(uri, user_agent)
        io = Robotex.get_robots_txt(uri, user_agent)
        if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
          io = StringIO.new("User-agent: *\nAllow: /\n")
        end

        # Each maps an agent Regexp to its rules / delay. Rules are stored as
        # [Regexp, pattern length] pairs; the length drives precedence.
        @disallows = {}
        @allows = {}
        @delays = {}
        agent = /.*/
        io.each do |line|
          next if line =~ /^\s*(#.*|$)/
          # Split on the first colon only, so colons inside the value survive.
          key, value = line.split(':', 2)
          value = value.to_s.strip
          case key.strip.downcase
          when 'user-agent'
            agent = to_regex(value)
          when 'allow'
            add_rule(@allows, agent, value)
          when 'disallow'
            add_rule(@disallows, agent, value)
          when 'crawl-delay'
            @delays[agent] = value.to_i
          end
        end
        @parsed = true
      end

      # Decides whether user_agent may fetch uri. Among all matching rules of
      # all matching agents the longest pattern wins; on equal length Allow
      # wins. With no matching Disallow the URL is allowed.
      #
      # @param uri [URI, String]
      # @param user_agent [String]
      # @return [Boolean]
      def allowed?(uri, user_agent)
        return true unless @parsed
        uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
        path = uri.request_uri

        longest_disallow = longest_match(@disallows, path, user_agent)
        return true if longest_disallow.nil?

        longest_allow = longest_match(@allows, path, user_agent)
        !longest_allow.nil? && longest_allow >= longest_disallow
      end

      # @param user_agent [String]
      # @return [Integer, nil] Crawl-Delay of the first agent entry matching
      #   user_agent, or nil
      def delay(user_agent)
        @delays.each do |agent, delay|
          return delay if agent =~ user_agent
        end
        nil
      end

      protected

      # Records a non-empty rule value under the current agent.
      def add_rule(rules_by_agent, agent, value)
        return if value.empty?
        (rules_by_agent[agent] ||= []) << [to_regex(value), value.bytesize]
      end

      # Length of the longest rule matching path among the agents matching
      # user_agent, or nil when nothing matches.
      def longest_match(rules_by_agent, path, user_agent)
        lengths = []
        rules_by_agent.each do |agent, rules|
          next unless user_agent =~ agent
          rules.each { |regex, size| lengths << size if path =~ regex }
        end
        lengths.max
      end

      # Turns a robots.txt pattern into a start-anchored Regexp in which `*`
      # matches any run of characters and everything else is literal.
      def to_regex(pattern)
        pattern = Regexp.escape(pattern)
        pattern.gsub!(Regexp.escape('*'), '.*')
        Regexp.compile("^#{pattern}")
      end
    end

    # Fetches /robots.txt for the host of uri within Robotex.timeout seconds.
    #
    # @return [Object, nil] the opened response, or nil when the request
    #   failed or timed out (a timeout is also reported on STDERR)
    def self.get_robots_txt(uri, user_agent)
      Timeout.timeout(Robotex.timeout) do
        begin
          URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent)
        rescue StandardError
          # Best effort: any fetch error means "no robots.txt available".
          nil
        end
      end
    rescue Timeout::Error
      STDERR.puts 'robots.txt request timed out'
    end

    class << self
      attr_writer :timeout
    end

    # @return [Numeric] configured timeout, or DEFAULT_TIMEOUT
    def self.timeout
      @timeout || DEFAULT_TIMEOUT
    end

    # @param user_agent [String, nil] defaults to a Robotex identification
    #   string
    def initialize(user_agent = nil)
      user_agent = "Robotex/#{VERSION} (http://www.github.com/chriskite/robotex)" if user_agent.nil?
      @user_agent = user_agent
      @last_accessed = Time.at(1)
      @parsed = {}
    end

    # Returns the parsed robots.txt for the host of uri, fetching it on first
    # use and caching it per host.
    def parse_host(uri)
      uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
      @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
    end

    #
    # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
    #
    def allowed?(uri)
      parse_host(uri).allowed?(uri, @user_agent)
    end

    #
    # Return the value of the Crawl-Delay directive, or nil if none
    def delay(uri)
      parse_host(uri).delay(@user_agent)
    end

    #
    # Sleep for the amount of time necessary to obey the Crawl-Delay specified by the server
    #
    def delay!(uri)
      delay = delay(uri)
      if delay
        # Only the part of the delay that has not elapsed yet is slept; a
        # non-positive remainder must not reach sleep, which rejects negative
        # intervals.
        remaining = delay - (Time.now - @last_accessed)
        sleep remaining if remaining > 0
      end
      @last_accessed = Time.now
    end
  end
end
|