polipus 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'thread'
|
2
2
|
module Polipus
|
3
3
|
module QueueOverflow
|
4
4
|
class DevNullQueue
|
@@ -16,18 +16,18 @@ module Polipus
|
|
16
16
|
def clear
|
17
17
|
end
|
18
18
|
|
19
|
-
def push
|
19
|
+
def push(_data)
|
20
20
|
end
|
21
21
|
|
22
22
|
def pop(_ = false)
|
23
23
|
nil
|
24
24
|
end
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
25
|
+
|
26
|
+
alias_method :size, :length
|
27
|
+
alias_method :dec, :pop
|
28
|
+
alias_method :shift, :pop
|
29
|
+
alias_method :enc, :push
|
30
|
+
alias_method :<<, :push
|
31
31
|
end
|
32
32
|
end
|
33
|
-
end
|
33
|
+
end
|
@@ -10,43 +10,46 @@ module Polipus
|
|
10
10
|
@redis = @polipus.redis
|
11
11
|
end
|
12
12
|
|
13
|
-
def url_filter
|
13
|
+
def url_filter(&block)
|
14
14
|
@url_filter = block
|
15
15
|
end
|
16
16
|
|
17
17
|
def perform
|
18
|
-
removed
|
18
|
+
removed = 0
|
19
19
|
restored = 0
|
20
|
-
|
21
|
-
if
|
22
|
-
|
23
|
-
|
24
|
-
|
20
|
+
main_q_size = @main_q.size
|
21
|
+
if main_q_size > @item_limit
|
22
|
+
@polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
|
23
|
+
removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
|
24
|
+
elsif main_q_size < @item_limit && !@adapter.empty?
|
25
|
+
@polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
|
26
|
+
restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
|
25
27
|
end
|
26
28
|
[removed, restored]
|
27
29
|
end
|
28
30
|
|
29
31
|
private
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
32
|
+
|
33
|
+
def rotate(source, dest)
|
34
|
+
performed = 0
|
35
|
+
loop do
|
36
|
+
message = source.pop(true)
|
37
|
+
if message
|
38
|
+
page = Page.from_json message
|
39
|
+
unless @polipus.storage.exists?(page)
|
40
|
+
allowed = @url_filter.nil? ? true : @url_filter.call(page)
|
41
|
+
if allowed
|
42
|
+
dest << message
|
43
|
+
performed += 1
|
42
44
|
end
|
43
45
|
end
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
performed
|
46
|
+
end
|
47
|
+
source.commit if source.respond_to? :commit
|
48
|
+
break if !message || source.empty?
|
49
|
+
break unless yield source, dest
|
49
50
|
end
|
51
|
+
performed
|
52
|
+
end
|
50
53
|
end
|
51
54
|
end
|
52
|
-
end
|
55
|
+
end
|
@@ -1,4 +1,5 @@
|
|
1
|
-
require
|
1
|
+
require 'thread'
|
2
|
+
require 'mongo'
|
2
3
|
module Polipus
|
3
4
|
module QueueOverflow
|
4
5
|
class MongoQueue
|
@@ -8,9 +9,7 @@ module Polipus
|
|
8
9
|
@semaphore = Mutex.new
|
9
10
|
@options = options
|
10
11
|
@options[:ensure_uniq] ||= false
|
11
|
-
|
12
|
-
ensure_index
|
13
|
-
end
|
12
|
+
@options[:ensure_uniq] && ensure_index
|
14
13
|
end
|
15
14
|
|
16
15
|
def length
|
@@ -23,39 +22,38 @@ module Polipus
|
|
23
22
|
|
24
23
|
def clear
|
25
24
|
@mongo_db[@collection_name].drop
|
26
|
-
|
27
|
-
ensure_index
|
28
|
-
end
|
25
|
+
@options[:ensure_uniq] && ensure_index
|
29
26
|
end
|
30
27
|
|
31
|
-
def push
|
32
|
-
|
33
|
-
@mongo_db[@collection_name].
|
28
|
+
def push(data)
|
29
|
+
if @options[:ensure_uniq]
|
30
|
+
@mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: 1, w: 1 })
|
34
31
|
else
|
35
|
-
@mongo_db[@collection_name].
|
32
|
+
@mongo_db[@collection_name].insert(payload: data)
|
36
33
|
end
|
37
|
-
true
|
34
|
+
true
|
38
35
|
end
|
39
36
|
|
40
37
|
def pop(_ = false)
|
41
|
-
@semaphore.synchronize
|
42
|
-
doc = @mongo_db[@collection_name].find({}
|
38
|
+
@semaphore.synchronize do
|
39
|
+
doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
|
43
40
|
return nil if doc.nil?
|
44
|
-
@mongo_db[@collection_name].remove(:
|
41
|
+
@mongo_db[@collection_name].remove(_id: doc['_id'])
|
45
42
|
doc && doc['payload'] ? doc['payload'] : nil
|
46
|
-
|
43
|
+
end
|
47
44
|
end
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
45
|
+
|
46
|
+
alias_method :size, :length
|
47
|
+
alias_method :dec, :pop
|
48
|
+
alias_method :shift, :pop
|
49
|
+
alias_method :enc, :push
|
50
|
+
alias_method :<<, :push
|
54
51
|
|
55
52
|
protected
|
56
|
-
|
57
|
-
|
58
|
-
|
53
|
+
|
54
|
+
def ensure_index
|
55
|
+
@mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
|
56
|
+
end
|
59
57
|
end
|
60
58
|
end
|
61
|
-
end
|
59
|
+
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'polipus/queue_overflow/mongo_queue'
|
2
2
|
module Polipus
|
3
3
|
module QueueOverflow
|
4
4
|
class MongoQueueCapped < MongoQueue
|
@@ -7,22 +7,22 @@ module Polipus
|
|
7
7
|
@max = @options[:max]
|
8
8
|
end
|
9
9
|
|
10
|
-
def push
|
10
|
+
def push(data)
|
11
11
|
super
|
12
|
-
@semaphore.synchronize
|
12
|
+
@semaphore.synchronize do
|
13
13
|
s = size
|
14
14
|
if s > @max
|
15
|
-
docs = @mongo_db[@collection_name].find({},{:
|
16
|
-
@mongo_db[@collection_name].remove(
|
15
|
+
docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
|
16
|
+
@mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
|
17
17
|
end
|
18
|
-
|
18
|
+
end
|
19
19
|
end
|
20
20
|
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
alias_method :size, :length
|
22
|
+
alias_method :dec, :pop
|
23
|
+
alias_method :shift, :pop
|
24
|
+
alias_method :enc, :push
|
25
|
+
alias_method :<<, :push
|
26
26
|
end
|
27
27
|
end
|
28
|
-
end
|
28
|
+
end
|
data/lib/polipus/robotex.rb
CHANGED
@@ -2,22 +2,19 @@ require 'open-uri'
|
|
2
2
|
require 'uri'
|
3
3
|
require 'timeout'
|
4
4
|
module Polipus
|
5
|
-
|
6
5
|
# Original code taken from
|
7
6
|
# https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
|
8
7
|
|
9
8
|
class Robotex
|
10
|
-
|
11
9
|
DEFAULT_TIMEOUT = 3
|
12
10
|
VERSION = '1.0.0'
|
13
11
|
|
14
12
|
attr_reader :user_agent
|
15
|
-
|
13
|
+
|
16
14
|
class ParsedRobots
|
17
|
-
|
18
15
|
def initialize(uri, user_agent)
|
19
16
|
io = Robotex.get_robots_txt(uri, user_agent)
|
20
|
-
if !io || io.content_type !=
|
17
|
+
if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
|
21
18
|
io = StringIO.new("User-agent: *\nAllow: /\n")
|
22
19
|
end
|
23
20
|
|
@@ -27,59 +24,55 @@ module Polipus
|
|
27
24
|
agent = /.*/
|
28
25
|
io.each do |line|
|
29
26
|
next if line =~ /^\s*(#.*|$)/
|
30
|
-
arr = line.split(
|
27
|
+
arr = line.split(':')
|
31
28
|
key = arr.shift
|
32
|
-
value = arr.join(
|
29
|
+
value = arr.join(':').strip
|
33
30
|
value.strip!
|
34
31
|
case key.downcase
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
32
|
+
when 'user-agent'
|
33
|
+
agent = to_regex(value)
|
34
|
+
when 'allow'
|
35
|
+
unless value.empty?
|
36
|
+
@allows[agent] ||= []
|
37
|
+
@allows[agent] << to_regex(value)
|
38
|
+
end
|
39
|
+
when 'disallow'
|
40
|
+
unless value.empty?
|
41
|
+
@disallows[agent] ||= []
|
42
|
+
@disallows[agent] << to_regex(value)
|
43
|
+
end
|
44
|
+
when 'crawl-delay'
|
45
|
+
@delays[agent] = value.to_i
|
49
46
|
end
|
50
47
|
end
|
51
48
|
@parsed = true
|
52
49
|
end
|
53
|
-
|
50
|
+
|
54
51
|
def allowed?(uri, user_agent)
|
55
52
|
return true unless @parsed
|
56
53
|
allowed = true
|
57
54
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
58
55
|
path = uri.request_uri
|
59
|
-
|
56
|
+
|
60
57
|
@allows.each do |key, value|
|
61
|
-
unless allowed
|
58
|
+
unless allowed
|
62
59
|
if user_agent =~ key
|
63
60
|
value.each do |rule|
|
64
|
-
|
65
|
-
allowed = true
|
66
|
-
end
|
61
|
+
path =~ rule && allowed = true
|
67
62
|
end
|
68
63
|
end
|
69
64
|
end
|
70
65
|
end
|
71
|
-
|
66
|
+
|
72
67
|
@disallows.each do |key, value|
|
73
68
|
if user_agent =~ key
|
74
69
|
value.each do |rule|
|
75
|
-
|
76
|
-
allowed = false
|
77
|
-
end
|
70
|
+
path =~ rule && allowed = false
|
78
71
|
end
|
79
72
|
end
|
80
73
|
end
|
81
|
-
|
82
|
-
|
74
|
+
|
75
|
+
allowed
|
83
76
|
end
|
84
77
|
|
85
78
|
def delay(user_agent)
|
@@ -88,30 +81,28 @@ module Polipus
|
|
88
81
|
end
|
89
82
|
nil
|
90
83
|
end
|
91
|
-
|
84
|
+
|
92
85
|
protected
|
93
|
-
|
86
|
+
|
94
87
|
def to_regex(pattern)
|
95
88
|
pattern = Regexp.escape(pattern)
|
96
|
-
pattern.gsub!(Regexp.escape(
|
89
|
+
pattern.gsub!(Regexp.escape('*'), '.*')
|
97
90
|
Regexp.compile("^#{pattern}")
|
98
91
|
end
|
99
92
|
end
|
100
|
-
|
93
|
+
|
101
94
|
def self.get_robots_txt(uri, user_agent)
|
102
|
-
|
103
|
-
|
104
|
-
URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
|
105
|
-
end
|
106
|
-
rescue Timeout::Error
|
107
|
-
STDERR.puts "robots.txt request timed out"
|
95
|
+
Timeout.timeout(Robotex.timeout) do
|
96
|
+
URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
|
108
97
|
end
|
98
|
+
rescue Timeout::Error
|
99
|
+
STDERR.puts 'robots.txt request timed out'
|
109
100
|
end
|
110
|
-
|
111
|
-
|
112
|
-
|
101
|
+
|
102
|
+
class << self
|
103
|
+
attr_writer :timeout
|
113
104
|
end
|
114
|
-
|
105
|
+
|
115
106
|
def self.timeout
|
116
107
|
@timeout || DEFAULT_TIMEOUT
|
117
108
|
end
|
@@ -127,7 +118,7 @@ module Polipus
|
|
127
118
|
uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
|
128
119
|
@parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
|
129
120
|
end
|
130
|
-
|
121
|
+
|
131
122
|
#
|
132
123
|
# Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
|
133
124
|
#
|
@@ -146,9 +137,8 @@ module Polipus
|
|
146
137
|
#
|
147
138
|
def delay!(uri)
|
148
139
|
delay = delay(uri)
|
149
|
-
sleep delay - (Time.now - @last_accessed) if
|
140
|
+
sleep delay - (Time.now - @last_accessed) if delay
|
150
141
|
@last_accessed = Time.now
|
151
142
|
end
|
152
|
-
|
153
143
|
end
|
154
|
-
end
|
144
|
+
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
require 'singleton'
|
2
|
+
module Polipus
|
3
|
+
class SignalHandler
|
4
|
+
include Singleton
|
5
|
+
attr_accessor :terminated
|
6
|
+
attr_accessor :enabled
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
self.terminated = false
|
10
|
+
self.enabled = false
|
11
|
+
end
|
12
|
+
|
13
|
+
def self.enable
|
14
|
+
trap(:INT) do
|
15
|
+
exit unless self.enabled?
|
16
|
+
terminate
|
17
|
+
end
|
18
|
+
trap(:TERM) do
|
19
|
+
exit unless self.enabled?
|
20
|
+
terminate
|
21
|
+
end
|
22
|
+
instance.enabled = true
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.disable
|
26
|
+
instance.enabled = false
|
27
|
+
end
|
28
|
+
|
29
|
+
def self.terminate
|
30
|
+
instance.terminated = true
|
31
|
+
end
|
32
|
+
|
33
|
+
def self.terminated?
|
34
|
+
instance.terminated
|
35
|
+
end
|
36
|
+
|
37
|
+
def self.enabled?
|
38
|
+
instance.enabled
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
data/lib/polipus/storage.rb
CHANGED
@@ -1,25 +1,25 @@
|
|
1
|
-
require
|
1
|
+
require 'polipus/storage/base'
|
2
2
|
module Polipus
|
3
3
|
module Storage
|
4
4
|
def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
|
5
5
|
require 'polipus/storage/mongo_store'
|
6
|
-
mongo ||= Mongo::Connection.new(
|
7
|
-
|
8
|
-
self::MongoStore.new(:
|
6
|
+
mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
|
7
|
+
fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
|
8
|
+
self::MongoStore.new(mongo: mongo, collection: collection_name, except: except)
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
|
12
12
|
require 'polipus/storage/s3_store'
|
13
|
-
|
13
|
+
|
14
14
|
if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
|
15
|
-
|
15
|
+
fail 'You have to specify AWS crediantials: :access_key_id and :secret_access_key'
|
16
16
|
end
|
17
17
|
|
18
18
|
self::S3Store.new(
|
19
|
-
:
|
20
|
-
:
|
21
|
-
:
|
22
|
-
:
|
19
|
+
bucket: bucket_name,
|
20
|
+
access_key_id: aws_credential[:access_key_id],
|
21
|
+
secret_access_key: aws_credential[:secret_access_key],
|
22
|
+
except: except
|
23
23
|
)
|
24
24
|
end
|
25
25
|
|
@@ -33,4 +33,4 @@ module Polipus
|
|
33
33
|
self::MemoryStore.new
|
34
34
|
end
|
35
35
|
end
|
36
|
-
end
|
36
|
+
end
|