polipus 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'thread'
|
2
2
|
module Polipus
|
3
3
|
module QueueOverflow
|
4
4
|
class DevNullQueue
|
@@ -16,18 +16,18 @@ module Polipus
|
|
16
16
|
def clear
|
17
17
|
end
|
18
18
|
|
19
|
-
def push
|
19
|
+
def push(_data)
|
20
20
|
end
|
21
21
|
|
22
22
|
def pop(_ = false)
|
23
23
|
nil
|
24
24
|
end
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
25
|
+
|
26
|
+
alias_method :size, :length
|
27
|
+
alias_method :dec, :pop
|
28
|
+
alias_method :shift, :pop
|
29
|
+
alias_method :enc, :push
|
30
|
+
alias_method :<<, :push
|
31
31
|
end
|
32
32
|
end
|
33
|
-
end
|
33
|
+
end
|
@@ -10,43 +10,46 @@ module Polipus
|
|
10
10
|
@redis = @polipus.redis
|
11
11
|
end
|
12
12
|
|
13
|
-
def url_filter
|
13
|
+
def url_filter(&block)
|
14
14
|
@url_filter = block
|
15
15
|
end
|
16
16
|
|
17
17
|
def perform
|
18
|
-
removed
|
18
|
+
removed = 0
|
19
19
|
restored = 0
|
20
|
-
|
21
|
-
if
|
22
|
-
|
23
|
-
|
24
|
-
|
20
|
+
main_q_size = @main_q.size
|
21
|
+
if main_q_size > @item_limit
|
22
|
+
@polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
|
23
|
+
removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
|
24
|
+
elsif main_q_size < @item_limit && !@adapter.empty?
|
25
|
+
@polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
|
26
|
+
restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
|
25
27
|
end
|
26
28
|
[removed, restored]
|
27
29
|
end
|
28
30
|
|
29
31
|
private
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
32
|
+
|
33
|
+
def rotate(source, dest)
|
34
|
+
performed = 0
|
35
|
+
loop do
|
36
|
+
message = source.pop(true)
|
37
|
+
if message
|
38
|
+
page = Page.from_json message
|
39
|
+
unless @polipus.storage.exists?(page)
|
40
|
+
allowed = @url_filter.nil? ? true : @url_filter.call(page)
|
41
|
+
if allowed
|
42
|
+
dest << message
|
43
|
+
performed += 1
|
42
44
|
end
|
43
45
|
end
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
performed
|
46
|
+
end
|
47
|
+
source.commit if source.respond_to? :commit
|
48
|
+
break if !message || source.empty?
|
49
|
+
break unless yield source, dest
|
49
50
|
end
|
51
|
+
performed
|
52
|
+
end
|
50
53
|
end
|
51
54
|
end
|
52
|
-
end
|
55
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
|
-
require
|
1
|
+
require 'thread'
|
2
|
+
require 'mongo'
|
2
3
|
module Polipus
|
3
4
|
module QueueOverflow
|
4
5
|
class MongoQueue
|
@@ -8,9 +9,7 @@ module Polipus
|
|
8
9
|
@semaphore = Mutex.new
|
9
10
|
@options = options
|
10
11
|
@options[:ensure_uniq] ||= false
|
11
|
-
|
12
|
-
ensure_index
|
13
|
-
end
|
12
|
+
@options[:ensure_uniq] && ensure_index
|
14
13
|
end
|
15
14
|
|
16
15
|
def length
|
@@ -23,39 +22,38 @@ module Polipus
|
|
23
22
|
|
24
23
|
def clear
|
25
24
|
@mongo_db[@collection_name].drop
|
26
|
-
|
27
|
-
ensure_index
|
28
|
-
end
|
25
|
+
@options[:ensure_uniq] && ensure_index
|
29
26
|
end
|
30
27
|
|
31
|
-
def push
|
32
|
-
|
33
|
-
@mongo_db[@collection_name].
|
28
|
+
def push(data)
|
29
|
+
if @options[:ensure_uniq]
|
30
|
+
@mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: 1, w: 1 })
|
34
31
|
else
|
35
|
-
@mongo_db[@collection_name].
|
32
|
+
@mongo_db[@collection_name].insert(payload: data)
|
36
33
|
end
|
37
|
-
true
|
34
|
+
true
|
38
35
|
end
|
39
36
|
|
40
37
|
def pop(_ = false)
|
41
|
-
@semaphore.synchronize
|
42
|
-
doc = @mongo_db[@collection_name].find({}
|
38
|
+
@semaphore.synchronize do
|
39
|
+
doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
|
43
40
|
return nil if doc.nil?
|
44
|
-
@mongo_db[@collection_name].remove(:
|
41
|
+
@mongo_db[@collection_name].remove(_id: doc['_id'])
|
45
42
|
doc && doc['payload'] ? doc['payload'] : nil
|
46
|
-
|
43
|
+
end
|
47
44
|
end
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
45
|
+
|
46
|
+
alias_method :size, :length
|
47
|
+
alias_method :dec, :pop
|
48
|
+
alias_method :shift, :pop
|
49
|
+
alias_method :enc, :push
|
50
|
+
alias_method :<<, :push
|
54
51
|
|
55
52
|
protected
|
56
|
-
|
57
|
-
|
58
|
-
|
53
|
+
|
54
|
+
def ensure_index
|
55
|
+
@mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
|
56
|
+
end
|
59
57
|
end
|
60
58
|
end
|
61
|
-
end
|
59
|
+
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'polipus/queue_overflow/mongo_queue'
|
2
2
|
module Polipus
|
3
3
|
module QueueOverflow
|
4
4
|
class MongoQueueCapped < MongoQueue
|
@@ -7,22 +7,22 @@ module Polipus
|
|
7
7
|
@max = @options[:max]
|
8
8
|
end
|
9
9
|
|
10
|
-
def push
|
10
|
+
def push(data)
|
11
11
|
super
|
12
|
-
@semaphore.synchronize
|
12
|
+
@semaphore.synchronize do
|
13
13
|
s = size
|
14
14
|
if s > @max
|
15
|
-
docs = @mongo_db[@collection_name].find({},{:
|
16
|
-
@mongo_db[@collection_name].remove(
|
15
|
+
docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
|
16
|
+
@mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
|
17
17
|
end
|
18
|
-
|
18
|
+
end
|
19
19
|
end
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
alias_method :size, :length
|
22
|
+
alias_method :dec, :pop
|
23
|
+
alias_method :shift, :pop
|
24
|
+
alias_method :enc, :push
|
25
|
+
alias_method :<<, :push
|
26
26
|
end
|
27
27
|
end
|
28
|
-
end
|
28
|
+
end
|
data/lib/polipus/robotex.rb
CHANGED
@@ -2,22 +2,19 @@ require 'open-uri'
|
|
2
2
|
require 'uri'
|
3
3
|
require 'timeout'
|
4
4
|
module Polipus
|
5
|
-
|
6
5
|
# Original code taken from
|
7
6
|
# https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
|
8
7
|
|
9
8
|
class Robotex
|
10
|
-
|
11
9
|
DEFAULT_TIMEOUT = 3
|
12
10
|
VERSION = '1.0.0'
|
13
11
|
|
14
12
|
attr_reader :user_agent
|
15
|
-
|
13
|
+
|
16
14
|
class ParsedRobots
|
17
|
-
|
18
15
|
def initialize(uri, user_agent)
|
19
16
|
io = Robotex.get_robots_txt(uri, user_agent)
|
20
|
-
if !io || io.content_type !=
|
17
|
+
if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
|
21
18
|
io = StringIO.new("User-agent: *\nAllow: /\n")
|
22
19
|
end
|
23
20
|
|
@@ -27,59 +24,55 @@ module Polipus
|
|
27
24
|
agent = /.*/
|
28
25
|
io.each do |line|
|
29
26
|
next if line =~ /^\s*(#.*|$)/
|
30
|
-
arr = line.split(
|
27
|
+
arr = line.split(':')
|
31
28
|
key = arr.shift
|
32
|
-
value = arr.join(
|
29
|
+
value = arr.join(':').strip
|
33
30
|
value.strip!
|
34
31
|
case key.downcase
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
32
|
+
when 'user-agent'
|
33
|
+
agent = to_regex(value)
|
34
|
+
when 'allow'
|
35
|
+
unless value.empty?
|
36
|
+
@allows[agent] ||= []
|
37
|
+
@allows[agent] << to_regex(value)
|
38
|
+
end
|
39
|
+
when 'disallow'
|
40
|
+
unless value.empty?
|
41
|
+
@disallows[agent] ||= []
|
42
|
+
@disallows[agent] << to_regex(value)
|
43
|
+
end
|
44
|
+
when 'crawl-delay'
|
45
|
+
@delays[agent] = value.to_i
|
49
46
|
end
|
50
47
|
end
|
51
48
|
@parsed = true
|
52
49
|
end
|
53
|
-
|
50
|
+
|
54
51
|
def allowed?(uri, user_agent)
|
55
52
|
return true unless @parsed
|
56
53
|
allowed = true
|
57
54
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
58
55
|
path = uri.request_uri
|
59
|
-
|
56
|
+
|
60
57
|
@allows.each do |key, value|
|
61
|
-
unless allowed
|
58
|
+
unless allowed
|
62
59
|
if user_agent =~ key
|
63
60
|
value.each do |rule|
|
64
|
-
|
65
|
-
allowed = true
|
66
|
-
end
|
61
|
+
path =~ rule && allowed = true
|
67
62
|
end
|
68
63
|
end
|
69
64
|
end
|
70
65
|
end
|
71
|
-
|
66
|
+
|
72
67
|
@disallows.each do |key, value|
|
73
68
|
if user_agent =~ key
|
74
69
|
value.each do |rule|
|
75
|
-
|
76
|
-
allowed = false
|
77
|
-
end
|
70
|
+
path =~ rule && allowed = false
|
78
71
|
end
|
79
72
|
end
|
80
73
|
end
|
81
|
-
|
82
|
-
|
74
|
+
|
75
|
+
allowed
|
83
76
|
end
|
84
77
|
|
85
78
|
def delay(user_agent)
|
@@ -88,30 +81,28 @@ module Polipus
|
|
88
81
|
end
|
89
82
|
nil
|
90
83
|
end
|
91
|
-
|
84
|
+
|
92
85
|
protected
|
93
|
-
|
86
|
+
|
94
87
|
def to_regex(pattern)
|
95
88
|
pattern = Regexp.escape(pattern)
|
96
|
-
pattern.gsub!(Regexp.escape(
|
89
|
+
pattern.gsub!(Regexp.escape('*'), '.*')
|
97
90
|
Regexp.compile("^#{pattern}")
|
98
91
|
end
|
99
92
|
end
|
100
|
-
|
93
|
+
|
101
94
|
def self.get_robots_txt(uri, user_agent)
|
102
|
-
|
103
|
-
|
104
|
-
URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
|
105
|
-
end
|
106
|
-
rescue Timeout::Error
|
107
|
-
STDERR.puts "robots.txt request timed out"
|
95
|
+
Timeout.timeout(Robotex.timeout) do
|
96
|
+
URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
|
108
97
|
end
|
98
|
+
rescue Timeout::Error
|
99
|
+
STDERR.puts 'robots.txt request timed out'
|
109
100
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
101
|
+
|
102
|
+
class << self
|
103
|
+
attr_writer :timeout
|
113
104
|
end
|
114
|
-
|
105
|
+
|
115
106
|
def self.timeout
|
116
107
|
@timeout || DEFAULT_TIMEOUT
|
117
108
|
end
|
@@ -127,7 +118,7 @@ module Polipus
|
|
127
118
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
128
119
|
@parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
|
129
120
|
end
|
130
|
-
|
121
|
+
|
131
122
|
#
|
132
123
|
# Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
|
133
124
|
#
|
@@ -146,9 +137,8 @@ module Polipus
|
|
146
137
|
#
|
147
138
|
def delay!(uri)
|
148
139
|
delay = delay(uri)
|
149
|
-
sleep delay - (Time.now - @last_accessed) if
|
140
|
+
sleep delay - (Time.now - @last_accessed) if delay
|
150
141
|
@last_accessed = Time.now
|
151
142
|
end
|
152
|
-
|
153
143
|
end
|
154
|
-
end
|
144
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
module Polipus
|
3
|
+
class SignalHandler
|
4
|
+
include Singleton
|
5
|
+
attr_accessor :terminated
|
6
|
+
attr_accessor :enabled
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
self.terminated = false
|
10
|
+
self.enabled = false
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.enable
|
14
|
+
trap(:INT) do
|
15
|
+
exit unless self.enabled?
|
16
|
+
terminate
|
17
|
+
end
|
18
|
+
trap(:TERM) do
|
19
|
+
exit unless self.enabled?
|
20
|
+
terminate
|
21
|
+
end
|
22
|
+
instance.enabled = true
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.disable
|
26
|
+
instance.enabled = false
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.terminate
|
30
|
+
instance.terminated = true
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.terminated?
|
34
|
+
instance.terminated
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.enabled?
|
38
|
+
instance.enabled
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/polipus/storage.rb
CHANGED
@@ -1,25 +1,25 @@
|
|
1
|
-
require
|
1
|
+
require 'polipus/storage/base'
|
2
2
|
module Polipus
|
3
3
|
module Storage
|
4
4
|
def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
|
5
5
|
require 'polipus/storage/mongo_store'
|
6
|
-
mongo ||= Mongo::Connection.new(
|
7
|
-
|
8
|
-
self::MongoStore.new(:
|
6
|
+
mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
|
7
|
+
fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
|
8
|
+
self::MongoStore.new(mongo: mongo, collection: collection_name, except: except)
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
|
12
12
|
require 'polipus/storage/s3_store'
|
13
|
-
|
13
|
+
|
14
14
|
if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
|
15
|
-
|
15
|
+
fail 'You have to specify AWS crediantials: :access_key_id and :secret_access_key'
|
16
16
|
end
|
17
17
|
|
18
18
|
self::S3Store.new(
|
19
|
-
:
|
20
|
-
:
|
21
|
-
:
|
22
|
-
:
|
19
|
+
bucket: bucket_name,
|
20
|
+
access_key_id: aws_credential[:access_key_id],
|
21
|
+
secret_access_key: aws_credential[:secret_access_key],
|
22
|
+
except: except
|
23
23
|
)
|
24
24
|
end
|
25
25
|
|
@@ -33,4 +33,4 @@ module Polipus
|
|
33
33
|
self::MemoryStore.new
|
34
34
|
end
|
35
35
|
end
|
36
|
-
end
|
36
|
+
end
|