polipus 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
data/lib/polipus/storage/base.rb
CHANGED
@@ -1,17 +1,19 @@
|
|
1
|
-
require
|
1
|
+
require 'uri'
|
2
2
|
|
3
3
|
module Polipus
|
4
4
|
module Storage
|
5
5
|
class Base
|
6
6
|
attr_accessor :include_query_string_in_uuid
|
7
|
+
|
7
8
|
protected
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
|
13
|
-
Digest::MD5.hexdigest(url_to_hash)
|
9
|
+
|
10
|
+
def uuid(page)
|
11
|
+
if @include_query_string_in_uuid.nil?
|
12
|
+
@include_query_string_in_uuid = true
|
14
13
|
end
|
14
|
+
url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/, '')
|
15
|
+
Digest::MD5.hexdigest(url_to_hash)
|
16
|
+
end
|
15
17
|
end
|
16
18
|
end
|
17
|
-
end
|
19
|
+
end
|
@@ -1,22 +1,21 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Storage
|
3
3
|
class DevNull < Base
|
4
|
-
|
5
|
-
def initialize(options = {})
|
4
|
+
def initialize(_options = {})
|
6
5
|
end
|
7
6
|
|
8
|
-
def add
|
7
|
+
def add(_page)
|
9
8
|
end
|
10
9
|
|
11
|
-
def exists?(
|
10
|
+
def exists?(_page)
|
12
11
|
false
|
13
12
|
end
|
14
13
|
|
15
|
-
def get
|
14
|
+
def get(_page)
|
16
15
|
nil
|
17
16
|
end
|
18
17
|
|
19
|
-
def remove
|
18
|
+
def remove(_page)
|
20
19
|
false
|
21
20
|
end
|
22
21
|
|
@@ -32,4 +31,4 @@ module Polipus
|
|
32
31
|
end
|
33
32
|
end
|
34
33
|
end
|
35
|
-
end
|
34
|
+
end
|
@@ -1,56 +1,55 @@
|
|
1
|
-
require
|
1
|
+
require 'thread'
|
2
2
|
module Polipus
|
3
3
|
module Storage
|
4
4
|
class MemoryStore < Base
|
5
|
-
|
6
|
-
|
7
|
-
@store = Hash.new
|
5
|
+
def initialize(_options = {})
|
6
|
+
@store = {}
|
8
7
|
@semaphore = Mutex.new
|
9
8
|
end
|
10
9
|
|
11
|
-
def add
|
12
|
-
@semaphore.synchronize
|
10
|
+
def add(page)
|
11
|
+
@semaphore.synchronize do
|
13
12
|
u = uuid(page)
|
14
13
|
@store[u] = page
|
15
14
|
u
|
16
|
-
|
15
|
+
end
|
17
16
|
end
|
18
17
|
|
19
18
|
def exists?(page)
|
20
|
-
@semaphore.synchronize
|
19
|
+
@semaphore.synchronize do
|
21
20
|
@store.key?(uuid(page))
|
22
|
-
|
21
|
+
end
|
23
22
|
end
|
24
23
|
|
25
|
-
def get
|
26
|
-
@semaphore.synchronize
|
24
|
+
def get(page)
|
25
|
+
@semaphore.synchronize do
|
27
26
|
@store[uuid(page)]
|
28
|
-
|
27
|
+
end
|
29
28
|
end
|
30
29
|
|
31
|
-
def remove
|
32
|
-
@semaphore.synchronize
|
30
|
+
def remove(page)
|
31
|
+
@semaphore.synchronize do
|
33
32
|
@store.delete(uuid(page))
|
34
|
-
|
33
|
+
end
|
35
34
|
end
|
36
35
|
|
37
36
|
def count
|
38
|
-
@semaphore.synchronize
|
37
|
+
@semaphore.synchronize do
|
39
38
|
@store.count
|
40
|
-
|
39
|
+
end
|
41
40
|
end
|
42
41
|
|
43
42
|
def each
|
44
|
-
@store.each do |k,v|
|
45
|
-
yield k,v
|
43
|
+
@store.each do |k, v|
|
44
|
+
yield k, v
|
46
45
|
end
|
47
46
|
end
|
48
47
|
|
49
48
|
def clear
|
50
|
-
@semaphore.synchronize
|
49
|
+
@semaphore.synchronize do
|
51
50
|
@store = Hash.new
|
52
|
-
|
51
|
+
end
|
53
52
|
end
|
54
53
|
end
|
55
54
|
end
|
56
|
-
end
|
55
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'mongo'
|
2
|
+
require 'zlib'
|
3
|
+
require 'thread'
|
4
4
|
module Polipus
|
5
5
|
module Storage
|
6
6
|
class MongoStore < Base
|
@@ -9,44 +9,44 @@ module Polipus
|
|
9
9
|
@mongo = options[:mongo]
|
10
10
|
@collection = options[:collection]
|
11
11
|
@mongo.create_collection(@collection)
|
12
|
-
@mongo[@collection].ensure_index(:uuid, :
|
12
|
+
@mongo[@collection].ensure_index(:uuid, unique: true, drop_dups: true, background: true)
|
13
13
|
@compress_body = options[:compress_body] ||= true
|
14
14
|
@except = options[:except] ||= []
|
15
15
|
@semaphore = Mutex.new
|
16
16
|
end
|
17
17
|
|
18
|
-
def add
|
19
|
-
@semaphore.synchronize
|
18
|
+
def add(page)
|
19
|
+
@semaphore.synchronize do
|
20
20
|
obj = page.to_hash
|
21
|
-
@except.each {|e| obj.delete e.to_s}
|
21
|
+
@except.each { |e| obj.delete e.to_s }
|
22
22
|
obj['uuid'] = uuid(page)
|
23
23
|
obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
|
24
24
|
BINARY_FIELDS.each do |field|
|
25
25
|
obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
|
26
26
|
end
|
27
|
-
@mongo[@collection].update({:
|
27
|
+
@mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
|
28
28
|
obj['uuid']
|
29
|
-
|
29
|
+
end
|
30
30
|
end
|
31
31
|
|
32
32
|
def exists?(page)
|
33
|
-
@semaphore.synchronize
|
34
|
-
doc = @mongo[@collection].find({:
|
33
|
+
@semaphore.synchronize do
|
34
|
+
doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
|
35
35
|
!doc.nil?
|
36
|
-
|
36
|
+
end
|
37
37
|
end
|
38
38
|
|
39
|
-
def get
|
40
|
-
@semaphore.synchronize
|
41
|
-
data = @mongo[@collection].find(
|
39
|
+
def get(page)
|
40
|
+
@semaphore.synchronize do
|
41
|
+
data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
|
42
42
|
return load_page(data) if data
|
43
|
-
|
43
|
+
end
|
44
44
|
end
|
45
45
|
|
46
|
-
def remove
|
47
|
-
@semaphore.synchronize
|
48
|
-
@mongo[@collection].remove(
|
49
|
-
|
46
|
+
def remove(page)
|
47
|
+
@semaphore.synchronize do
|
48
|
+
@mongo[@collection].remove(uuid: uuid(page))
|
49
|
+
end
|
50
50
|
end
|
51
51
|
|
52
52
|
def count
|
@@ -54,10 +54,10 @@ module Polipus
|
|
54
54
|
end
|
55
55
|
|
56
56
|
def each
|
57
|
-
@mongo[@collection].find({}
|
57
|
+
@mongo[@collection].find({}, timeout: false) do |cursor|
|
58
58
|
cursor.each do |doc|
|
59
59
|
page = load_page(doc)
|
60
|
-
yield doc['uuid'], page
|
60
|
+
yield doc['uuid'], page
|
61
61
|
end
|
62
62
|
end
|
63
63
|
end
|
@@ -67,22 +67,18 @@ module Polipus
|
|
67
67
|
end
|
68
68
|
|
69
69
|
private
|
70
|
-
def load_page(hash)
|
71
|
-
BINARY_FIELDS.each do |field|
|
72
|
-
hash[field] = hash[field].to_s
|
73
|
-
end
|
74
|
-
begin
|
75
|
-
hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
|
76
|
-
page = Page.from_hash(hash)
|
77
|
-
if page.fetched_at.nil?
|
78
|
-
page.fetched_at = hash['_id'].generation_time.to_i
|
79
|
-
end
|
80
|
-
return page
|
81
|
-
rescue
|
82
|
-
end
|
83
|
-
nil
|
84
|
-
end
|
85
70
|
|
71
|
+
def load_page(hash)
|
72
|
+
BINARY_FIELDS.each do |field|
|
73
|
+
hash[field] = hash[field].to_s
|
74
|
+
end
|
75
|
+
hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
|
76
|
+
page = Page.from_hash(hash)
|
77
|
+
if page.fetched_at.nil?
|
78
|
+
page.fetched_at = hash['_id'].generation_time.to_i
|
79
|
+
end
|
80
|
+
page
|
81
|
+
end
|
86
82
|
end
|
87
83
|
end
|
88
|
-
end
|
84
|
+
end
|
@@ -1,7 +1,7 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
1
|
+
require 'aws/s3'
|
2
|
+
require 'zlib'
|
3
|
+
require 'thread'
|
4
|
+
require 'json'
|
5
5
|
module Polipus
|
6
6
|
module Storage
|
7
7
|
class S3Store < Base
|
@@ -11,8 +11,8 @@ module Polipus
|
|
11
11
|
@semaphore = Mutex.new
|
12
12
|
|
13
13
|
AWS::S3::Base.establish_connection!(
|
14
|
-
:
|
15
|
-
:
|
14
|
+
access_key_id: @options[:access_key_id],
|
15
|
+
secret_access_key: @options[:secret_access_key]
|
16
16
|
)
|
17
17
|
@options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
|
18
18
|
begin
|
@@ -22,39 +22,37 @@ module Polipus
|
|
22
22
|
end
|
23
23
|
end
|
24
24
|
|
25
|
-
def add
|
26
|
-
@semaphore.synchronize
|
25
|
+
def add(page)
|
26
|
+
@semaphore.synchronize do
|
27
27
|
obj = page.to_hash
|
28
|
-
@except.each {|e| obj.delete e.to_s}
|
28
|
+
@except.each { |e| obj.delete e.to_s }
|
29
29
|
puuid = uuid(page)
|
30
30
|
obj['uuid'] = puuid
|
31
31
|
data = Zlib::Deflate.deflate(obj.to_json)
|
32
32
|
AWS::S3::S3Object.store(puuid, data, @bucket.name)
|
33
33
|
puuid
|
34
|
-
|
34
|
+
end
|
35
35
|
end
|
36
36
|
|
37
37
|
def exists?(page)
|
38
38
|
AWS::S3::S3Object.exists? uuid(page), @bucket.name
|
39
39
|
end
|
40
40
|
|
41
|
-
def get
|
42
|
-
@semaphore.synchronize
|
41
|
+
def get(page)
|
42
|
+
@semaphore.synchronize do
|
43
43
|
if exists?(page)
|
44
44
|
data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
|
45
45
|
return load_page(data)
|
46
46
|
end
|
47
47
|
nil
|
48
|
-
|
48
|
+
end
|
49
49
|
end
|
50
50
|
|
51
|
-
def remove
|
52
|
-
@semaphore.synchronize
|
53
|
-
|
54
|
-
AWS::S3::S3Object.delete(uuid(page), @bucket.name)
|
55
|
-
end
|
51
|
+
def remove(page)
|
52
|
+
@semaphore.synchronize do
|
53
|
+
exists?(page) && AWS::S3::S3Object.delete(uuid(page), @bucket.name)
|
56
54
|
true
|
57
|
-
|
55
|
+
end
|
58
56
|
end
|
59
57
|
|
60
58
|
def count
|
@@ -62,39 +60,36 @@ module Polipus
|
|
62
60
|
end
|
63
61
|
|
64
62
|
def clear
|
65
|
-
AWS::S3::Bucket.delete(@bucket.name, :
|
63
|
+
AWS::S3::Bucket.delete(@bucket.name, force: true)
|
66
64
|
create_bucket
|
67
65
|
end
|
68
66
|
|
69
67
|
def each
|
70
68
|
objects = []
|
71
69
|
last_key = nil
|
72
|
-
|
73
|
-
objects = AWS::S3::Bucket.objects(@bucket.name, :
|
70
|
+
loop do
|
71
|
+
objects = AWS::S3::Bucket.objects(@bucket.name, marker: last_key)
|
74
72
|
break if objects.size == 0
|
75
73
|
objects.each do |o|
|
76
74
|
page = load_page(o.value)
|
77
|
-
yield o.key, page
|
75
|
+
yield o.key, page
|
78
76
|
end
|
79
77
|
last_key = objects.last.key
|
80
|
-
end
|
78
|
+
end
|
81
79
|
end
|
82
80
|
|
83
81
|
private
|
84
|
-
def load_page(data)
|
85
|
-
begin
|
86
|
-
payload = Zlib::Inflate.inflate(data)
|
87
|
-
hash = JSON.parse(payload)
|
88
|
-
return Page.from_hash(hash)
|
89
|
-
rescue
|
90
|
-
end
|
91
|
-
nil
|
92
|
-
end
|
93
82
|
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
83
|
+
def load_page(data)
|
84
|
+
payload = Zlib::Inflate.inflate(data)
|
85
|
+
hash = JSON.parse(payload)
|
86
|
+
Page.from_hash(hash)
|
87
|
+
end
|
88
|
+
|
89
|
+
def create_bucket
|
90
|
+
AWS::S3::Bucket.create(@options[:bucket])
|
91
|
+
@bucket = AWS::S3::Bucket.find(@options[:bucket])
|
92
|
+
end
|
98
93
|
end
|
99
94
|
end
|
100
|
-
end
|
95
|
+
end
|
data/lib/polipus/url_tracker.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Polipus
|
2
2
|
module UrlTracker
|
3
3
|
def self.bloomfilter(options = {})
|
4
|
-
require
|
4
|
+
require 'polipus/url_tracker/bloomfilter'
|
5
5
|
options[:size] ||= 1_000_000
|
6
6
|
options[:error_rate] ||= 0.01
|
7
7
|
options[:key_name] ||= 'polipus-bloomfilter'
|
@@ -11,10 +11,10 @@ module Polipus
|
|
11
11
|
end
|
12
12
|
|
13
13
|
def self.redis_set(options = {})
|
14
|
-
require
|
14
|
+
require 'polipus/url_tracker/redis_set'
|
15
15
|
options[:redis] ||= Redis.current
|
16
16
|
options[:key_name] ||= 'polipus-set'
|
17
17
|
self::RedisSet.new options
|
18
18
|
end
|
19
19
|
end
|
20
|
-
end
|
20
|
+
end
|
@@ -1,4 +1,4 @@
|
|
1
|
-
require
|
1
|
+
require 'redis-bloomfilter'
|
2
2
|
module Polipus
|
3
3
|
module UrlTracker
|
4
4
|
class Bloomfilter
|
@@ -10,18 +10,17 @@ module Polipus
|
|
10
10
|
@bf.include?(url)
|
11
11
|
end
|
12
12
|
|
13
|
-
def visit
|
13
|
+
def visit(url)
|
14
14
|
@bf.insert url
|
15
15
|
end
|
16
16
|
|
17
|
-
def remove
|
17
|
+
def remove(url)
|
18
18
|
@bf.remove url
|
19
19
|
end
|
20
20
|
|
21
21
|
def clear
|
22
22
|
@bf.clear
|
23
23
|
end
|
24
|
-
|
25
24
|
end
|
26
25
|
end
|
27
|
-
end
|
26
|
+
end
|
@@ -1,21 +1,20 @@
|
|
1
1
|
module Polipus
|
2
2
|
module UrlTracker
|
3
3
|
class RedisSet
|
4
|
-
|
5
4
|
def initialize(options = {})
|
6
5
|
@redis = options[:redis] || Redis.current
|
7
6
|
@set_name = options[:key_name]
|
8
7
|
end
|
9
8
|
|
10
9
|
def visited?(url)
|
11
|
-
@redis.sismember(@set_name,url)
|
10
|
+
@redis.sismember(@set_name, url)
|
12
11
|
end
|
13
12
|
|
14
|
-
def visit
|
13
|
+
def visit(url)
|
15
14
|
@redis.sadd(@set_name, url)
|
16
15
|
end
|
17
16
|
|
18
|
-
def remove
|
17
|
+
def remove(url)
|
19
18
|
@redis.srem(@set_name, url, 0)
|
20
19
|
end
|
21
20
|
|