polipus 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
data/lib/polipus/storage/base.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
|
-
require
|
1
|
+
require 'uri'
|
2
2
|
|
3
3
|
module Polipus
|
4
4
|
module Storage
|
5
5
|
class Base
|
6
6
|
attr_accessor :include_query_string_in_uuid
|
7
|
+
|
7
8
|
protected
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
|
13
|
-
Digest::MD5.hexdigest(url_to_hash)
|
9
|
+
|
10
|
+
def uuid(page)
|
11
|
+
if @include_query_string_in_uuid.nil?
|
12
|
+
@include_query_string_in_uuid = true
|
14
13
|
end
|
14
|
+
url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/, '')
|
15
|
+
Digest::MD5.hexdigest(url_to_hash)
|
16
|
+
end
|
15
17
|
end
|
16
18
|
end
|
17
|
-
end
|
19
|
+
end
|
data/lib/polipus/storage/dev_null.rb
CHANGED
@@ -1,22 +1,21 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Storage
|
3
3
|
class DevNull < Base
|
4
|
-
|
5
|
-
def initialize(options = {})
|
4
|
+
def initialize(_options = {})
|
6
5
|
end
|
7
6
|
|
8
|
-
def add
|
7
|
+
def add(_page)
|
9
8
|
end
|
10
9
|
|
11
|
-
def exists?(
|
10
|
+
def exists?(_page)
|
12
11
|
false
|
13
12
|
end
|
14
13
|
|
15
|
-
def get
|
14
|
+
def get(_page)
|
16
15
|
nil
|
17
16
|
end
|
18
17
|
|
19
|
-
def remove
|
18
|
+
def remove(_page)
|
20
19
|
false
|
21
20
|
end
|
22
21
|
|
@@ -32,4 +31,4 @@ module Polipus
|
|
32
31
|
end
|
33
32
|
end
|
34
33
|
end
|
35
|
-
end
|
34
|
+
end
|
data/lib/polipus/storage/memory_store.rb
CHANGED
@@ -1,56 +1,55 @@
|
|
1
|
-
require
|
1
|
+
require 'thread'
|
2
2
|
module Polipus
|
3
3
|
module Storage
|
4
4
|
class MemoryStore < Base
|
5
|
-
|
6
|
-
|
7
|
-
@store = Hash.new
|
5
|
+
def initialize(_options = {})
|
6
|
+
@store = {}
|
8
7
|
@semaphore = Mutex.new
|
9
8
|
end
|
10
9
|
|
11
|
-
def add
|
12
|
-
@semaphore.synchronize
|
10
|
+
def add(page)
|
11
|
+
@semaphore.synchronize do
|
13
12
|
u = uuid(page)
|
14
13
|
@store[u] = page
|
15
14
|
u
|
16
|
-
|
15
|
+
end
|
17
16
|
end
|
18
17
|
|
19
18
|
def exists?(page)
|
20
|
-
@semaphore.synchronize
|
19
|
+
@semaphore.synchronize do
|
21
20
|
@store.key?(uuid(page))
|
22
|
-
|
21
|
+
end
|
23
22
|
end
|
24
23
|
|
25
|
-
def get
|
26
|
-
@semaphore.synchronize
|
24
|
+
def get(page)
|
25
|
+
@semaphore.synchronize do
|
27
26
|
@store[uuid(page)]
|
28
|
-
|
27
|
+
end
|
29
28
|
end
|
30
29
|
|
31
|
-
def remove
|
32
|
-
@semaphore.synchronize
|
30
|
+
def remove(page)
|
31
|
+
@semaphore.synchronize do
|
33
32
|
@store.delete(uuid(page))
|
34
|
-
|
33
|
+
end
|
35
34
|
end
|
36
35
|
|
37
36
|
def count
|
38
|
-
@semaphore.synchronize
|
37
|
+
@semaphore.synchronize do
|
39
38
|
@store.count
|
40
|
-
|
39
|
+
end
|
41
40
|
end
|
42
41
|
|
43
42
|
def each
|
44
|
-
@store.each do |k,v|
|
45
|
-
yield k,v
|
43
|
+
@store.each do |k, v|
|
44
|
+
yield k, v
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
49
48
|
def clear
|
50
|
-
@semaphore.synchronize
|
49
|
+
@semaphore.synchronize do
|
51
50
|
@store = Hash.new
|
52
|
-
|
51
|
+
end
|
53
52
|
end
|
54
53
|
end
|
55
54
|
end
|
56
|
-
end
|
55
|
+
end
|
data/lib/polipus/storage/mongo_store.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'mongo'
|
2
|
+
require 'zlib'
|
3
|
+
require 'thread'
|
4
4
|
module Polipus
|
5
5
|
module Storage
|
6
6
|
class MongoStore < Base
|
@@ -9,44 +9,44 @@ module Polipus
|
|
9
9
|
@mongo = options[:mongo]
|
10
10
|
@collection = options[:collection]
|
11
11
|
@mongo.create_collection(@collection)
|
12
|
-
@mongo[@collection].ensure_index(:uuid, :
|
12
|
+
@mongo[@collection].ensure_index(:uuid, unique: true, drop_dups: true, background: true)
|
13
13
|
@compress_body = options[:compress_body] ||= true
|
14
14
|
@except = options[:except] ||= []
|
15
15
|
@semaphore = Mutex.new
|
16
16
|
end
|
17
17
|
|
18
|
-
def add
|
19
|
-
@semaphore.synchronize
|
18
|
+
def add(page)
|
19
|
+
@semaphore.synchronize do
|
20
20
|
obj = page.to_hash
|
21
|
-
@except.each {|e| obj.delete e.to_s}
|
21
|
+
@except.each { |e| obj.delete e.to_s }
|
22
22
|
obj['uuid'] = uuid(page)
|
23
23
|
obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
|
24
24
|
BINARY_FIELDS.each do |field|
|
25
25
|
obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
|
26
26
|
end
|
27
|
-
@mongo[@collection].update({:
|
27
|
+
@mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
|
28
28
|
obj['uuid']
|
29
|
-
|
29
|
+
end
|
30
30
|
end
|
31
31
|
|
32
32
|
def exists?(page)
|
33
|
-
@semaphore.synchronize
|
34
|
-
doc = @mongo[@collection].find({:
|
33
|
+
@semaphore.synchronize do
|
34
|
+
doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
|
35
35
|
!doc.nil?
|
36
|
-
|
36
|
+
end
|
37
37
|
end
|
38
38
|
|
39
|
-
def get
|
40
|
-
@semaphore.synchronize
|
41
|
-
data = @mongo[@collection].find(
|
39
|
+
def get(page)
|
40
|
+
@semaphore.synchronize do
|
41
|
+
data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
|
42
42
|
return load_page(data) if data
|
43
|
-
|
43
|
+
end
|
44
44
|
end
|
45
45
|
|
46
|
-
def remove
|
47
|
-
@semaphore.synchronize
|
48
|
-
@mongo[@collection].remove(
|
49
|
-
|
46
|
+
def remove(page)
|
47
|
+
@semaphore.synchronize do
|
48
|
+
@mongo[@collection].remove(uuid: uuid(page))
|
49
|
+
end
|
50
50
|
end
|
51
51
|
|
52
52
|
def count
|
@@ -54,10 +54,10 @@ module Polipus
|
|
54
54
|
end
|
55
55
|
|
56
56
|
def each
|
57
|
-
@mongo[@collection].find({}
|
57
|
+
@mongo[@collection].find({}, timeout: false) do |cursor|
|
58
58
|
cursor.each do |doc|
|
59
59
|
page = load_page(doc)
|
60
|
-
yield doc['uuid'], page
|
60
|
+
yield doc['uuid'], page
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -67,22 +67,18 @@ module Polipus
|
|
67
67
|
end
|
68
68
|
|
69
69
|
private
|
70
|
-
def load_page(hash)
|
71
|
-
BINARY_FIELDS.each do |field|
|
72
|
-
hash[field] = hash[field].to_s
|
73
|
-
end
|
74
|
-
begin
|
75
|
-
hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
|
76
|
-
page = Page.from_hash(hash)
|
77
|
-
if page.fetched_at.nil?
|
78
|
-
page.fetched_at = hash['_id'].generation_time.to_i
|
79
|
-
end
|
80
|
-
return page
|
81
|
-
rescue
|
82
|
-
end
|
83
|
-
nil
|
84
|
-
end
|
85
70
|
|
71
|
+
def load_page(hash)
|
72
|
+
BINARY_FIELDS.each do |field|
|
73
|
+
hash[field] = hash[field].to_s
|
74
|
+
end
|
75
|
+
hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
|
76
|
+
page = Page.from_hash(hash)
|
77
|
+
if page.fetched_at.nil?
|
78
|
+
page.fetched_at = hash['_id'].generation_time.to_i
|
79
|
+
end
|
80
|
+
page
|
81
|
+
end
|
86
82
|
end
|
87
83
|
end
|
88
|
-
end
|
84
|
+
end
|
data/lib/polipus/storage/s3_store.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
1
|
+
require 'aws/s3'
|
2
|
+
require 'zlib'
|
3
|
+
require 'thread'
|
4
|
+
require 'json'
|
5
5
|
module Polipus
|
6
6
|
module Storage
|
7
7
|
class S3Store < Base
|
@@ -11,8 +11,8 @@ module Polipus
|
|
11
11
|
@semaphore = Mutex.new
|
12
12
|
|
13
13
|
AWS::S3::Base.establish_connection!(
|
14
|
-
:
|
15
|
-
:
|
14
|
+
access_key_id: @options[:access_key_id],
|
15
|
+
secret_access_key: @options[:secret_access_key]
|
16
16
|
)
|
17
17
|
@options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
|
18
18
|
begin
|
@@ -22,39 +22,37 @@ module Polipus
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def add
|
26
|
-
@semaphore.synchronize
|
25
|
+
def add(page)
|
26
|
+
@semaphore.synchronize do
|
27
27
|
obj = page.to_hash
|
28
|
-
@except.each {|e| obj.delete e.to_s}
|
28
|
+
@except.each { |e| obj.delete e.to_s }
|
29
29
|
puuid = uuid(page)
|
30
30
|
obj['uuid'] = puuid
|
31
31
|
data = Zlib::Deflate.deflate(obj.to_json)
|
32
32
|
AWS::S3::S3Object.store(puuid, data, @bucket.name)
|
33
33
|
puuid
|
34
|
-
|
34
|
+
end
|
35
35
|
end
|
36
36
|
|
37
37
|
def exists?(page)
|
38
38
|
AWS::S3::S3Object.exists? uuid(page), @bucket.name
|
39
39
|
end
|
40
40
|
|
41
|
-
def get
|
42
|
-
@semaphore.synchronize
|
41
|
+
def get(page)
|
42
|
+
@semaphore.synchronize do
|
43
43
|
if exists?(page)
|
44
44
|
data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
|
45
45
|
return load_page(data)
|
46
46
|
end
|
47
47
|
nil
|
48
|
-
|
48
|
+
end
|
49
49
|
end
|
50
50
|
|
51
|
-
def remove
|
52
|
-
@semaphore.synchronize
|
53
|
-
|
54
|
-
AWS::S3::S3Object.delete(uuid(page), @bucket.name)
|
55
|
-
end
|
51
|
+
def remove(page)
|
52
|
+
@semaphore.synchronize do
|
53
|
+
exists?(page) && AWS::S3::S3Object.delete(uuid(page), @bucket.name)
|
56
54
|
true
|
57
|
-
|
55
|
+
end
|
58
56
|
end
|
59
57
|
|
60
58
|
def count
|
@@ -62,39 +60,36 @@ module Polipus
|
|
62
60
|
end
|
63
61
|
|
64
62
|
def clear
|
65
|
-
AWS::S3::Bucket.delete(@bucket.name, :
|
63
|
+
AWS::S3::Bucket.delete(@bucket.name, force: true)
|
66
64
|
create_bucket
|
67
65
|
end
|
68
66
|
|
69
67
|
def each
|
70
68
|
objects = []
|
71
69
|
last_key = nil
|
72
|
-
|
73
|
-
objects = AWS::S3::Bucket.objects(@bucket.name, :
|
70
|
+
loop do
|
71
|
+
objects = AWS::S3::Bucket.objects(@bucket.name, marker: last_key)
|
74
72
|
break if objects.size == 0
|
75
73
|
objects.each do |o|
|
76
74
|
page = load_page(o.value)
|
77
|
-
yield o.key, page
|
75
|
+
yield o.key, page
|
78
76
|
end
|
79
77
|
last_key = objects.last.key
|
80
|
-
end
|
78
|
+
end
|
81
79
|
end
|
82
80
|
|
83
81
|
private
|
84
|
-
def load_page(data)
|
85
|
-
begin
|
86
|
-
payload = Zlib::Inflate.inflate(data)
|
87
|
-
hash = JSON.parse(payload)
|
88
|
-
return Page.from_hash(hash)
|
89
|
-
rescue
|
90
|
-
end
|
91
|
-
nil
|
92
|
-
end
|
93
82
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
83
|
+
def load_page(data)
|
84
|
+
payload = Zlib::Inflate.inflate(data)
|
85
|
+
hash = JSON.parse(payload)
|
86
|
+
Page.from_hash(hash)
|
87
|
+
end
|
88
|
+
|
89
|
+
def create_bucket
|
90
|
+
AWS::S3::Bucket.create(@options[:bucket])
|
91
|
+
@bucket = AWS::S3::Bucket.find(@options[:bucket])
|
92
|
+
end
|
98
93
|
end
|
99
94
|
end
|
100
|
-
end
|
95
|
+
end
|
data/lib/polipus/url_tracker.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Polipus
|
2
2
|
module UrlTracker
|
3
3
|
def self.bloomfilter(options = {})
|
4
|
-
require
|
4
|
+
require 'polipus/url_tracker/bloomfilter'
|
5
5
|
options[:size] ||= 1_000_000
|
6
6
|
options[:error_rate] ||= 0.01
|
7
7
|
options[:key_name] ||= 'polipus-bloomfilter'
|
@@ -11,10 +11,10 @@ module Polipus
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def self.redis_set(options = {})
|
14
|
-
require
|
14
|
+
require 'polipus/url_tracker/redis_set'
|
15
15
|
options[:redis] ||= Redis.current
|
16
16
|
options[:key_name] ||= 'polipus-set'
|
17
17
|
self::RedisSet.new options
|
18
18
|
end
|
19
19
|
end
|
20
|
-
end
|
20
|
+
end
|
data/lib/polipus/url_tracker/bloomfilter.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'redis-bloomfilter'
|
2
2
|
module Polipus
|
3
3
|
module UrlTracker
|
4
4
|
class Bloomfilter
|
@@ -10,18 +10,17 @@ module Polipus
|
|
10
10
|
@bf.include?(url)
|
11
11
|
end
|
12
12
|
|
13
|
-
def visit
|
13
|
+
def visit(url)
|
14
14
|
@bf.insert url
|
15
15
|
end
|
16
16
|
|
17
|
-
def remove
|
17
|
+
def remove(url)
|
18
18
|
@bf.remove url
|
19
19
|
end
|
20
20
|
|
21
21
|
def clear
|
22
22
|
@bf.clear
|
23
23
|
end
|
24
|
-
|
25
24
|
end
|
26
25
|
end
|
27
|
-
end
|
26
|
+
end
|
data/lib/polipus/url_tracker/redis_set.rb
CHANGED
@@ -1,21 +1,20 @@
|
|
1
1
|
module Polipus
|
2
2
|
module UrlTracker
|
3
3
|
class RedisSet
|
4
|
-
|
5
4
|
def initialize(options = {})
|
6
5
|
@redis = options[:redis] || Redis.current
|
7
6
|
@set_name = options[:key_name]
|
8
7
|
end
|
9
8
|
|
10
9
|
def visited?(url)
|
11
|
-
@redis.sismember(@set_name,url)
|
10
|
+
@redis.sismember(@set_name, url)
|
12
11
|
end
|
13
12
|
|
14
|
-
def visit
|
13
|
+
def visit(url)
|
15
14
|
@redis.sadd(@set_name, url)
|
16
15
|
end
|
17
16
|
|
18
|
-
def remove
|
17
|
+
def remove(url)
|
19
18
|
@redis.srem(@set_name, url, 0)
|
20
19
|
end
|
21
20
|
|