parallel588_polipus 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
# encoding: UTF-8
require 'singleton'

module Polipus
  # Process-wide handler for INT/TERM signals. While enabled, the first
  # signal only flags termination so the crawler can finish its current
  # work and shut down gracefully; while disabled, a signal exits the
  # process immediately.
  class SignalHandler
    include Singleton

    attr_accessor :terminated, :enabled

    def initialize
      @terminated = false
      @enabled = false
    end

    class << self
      # Install the traps and switch graceful-shutdown mode on.
      def enable
        [:INT, :TERM].each do |signal|
          trap(signal) do
            exit unless enabled?
            terminate
          end
        end
        instance.enabled = true
      end

      # Switch graceful-shutdown mode off (a signal will exit at once).
      def disable
        instance.enabled = false
      end

      # Flag that the crawler should stop at the next safe point.
      def terminate
        instance.terminated = true
      end

      def terminated?
        instance.terminated
      end

      def enabled?
        instance.enabled
      end
    end
  end
end
require 'polipus/storage/base'

module Polipus
  # Factory helpers that build the concrete page-storage backends.
  # Backend files are required lazily so unused drivers stay unloaded.
  module Storage
    COLLECTION = 'pages'

    # MongoDB-backed store; connects to localhost:27017/polipus when no
    # Mongo::DB instance is supplied.
    def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
      require 'polipus/storage/mongo_store'
      mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
      raise 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
      self::MongoStore.new(mongo: mongo, collection: collection, except: except)
    end

    # RethinkDB-backed store; connects to localhost:28015/polipus when
    # no connection is supplied.
    def self.rethink_store(conn = nil, table = COLLECTION, except = [])
      require 'polipus/storage/rethink_store'
      conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus')
      raise "First argument must be a RethinkDB::Connection, got `#{conn.class}`" unless conn.is_a?(RethinkDB::Connection)
      self::RethinkStore.new(conn: conn, table: table, except: except)
    end

    # Store that accepts and discards every page.
    def self.dev_null
      require 'polipus/storage/dev_null'
      self::DevNull.new
    end

    # Volatile in-process store (contents lost at exit).
    def self.memory_store
      require 'polipus/storage/memory_store'
      self::MemoryStore.new
    end
  end
end
# encoding: UTF-8
require 'uri'
# FIX: Digest::MD5 was used below without ever being required; it only
# worked when another file happened to load 'digest' first.
require 'digest/md5'

module Polipus
  module Storage
    # Common behaviour shared by every storage backend: computing a
    # stable document id (uuid) for a page from its URL.
    class Base
      # When set to false, the query string is stripped before hashing,
      # so http://x/p?a=1 and http://x/p share a single uuid.
      attr_accessor :include_query_string_in_uuid

      protected

      # Returns the MD5 hex digest used as the page's storage key.
      # Lazily defaults include_query_string_in_uuid to true on first use.
      def uuid(page)
        @include_query_string_in_uuid = true if @include_query_string_in_uuid.nil?
        url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/, '')
        Digest::MD5.hexdigest(url_to_hash)
      end
    end
  end
end
# encoding: UTF-8
module Polipus
  module Storage
    # No-op storage backend: accepts every call and persists nothing.
    # Useful when crawled pages must be processed but never stored.
    class DevNull < Base
      def initialize(_options = {})
        # Nothing to set up.
      end

      # Discards the page; returns nil.
      def add(_page)
      end

      # No page is ever stored.
      def exists?(_page)
        false
      end

      # Nothing to fetch.
      def get(_page)
        nil
      end

      # Nothing to delete.
      def remove(_page)
        false
      end

      # The store is always empty.
      def count
        0
      end

      # Mirrors the iteration contract by yielding a single nil.
      def each
        yield nil
      end

      # Nothing to wipe.
      def clear
      end
    end
  end
end
# encoding: UTF-8
require 'thread'
module Polipus
  module Storage
    # Thread-safe, in-process page store backed by a plain Hash keyed by
    # page uuid. Contents are lost when the process exits.
    class MemoryStore < Base
      def initialize(_options = {})
        @pages = {}
        @lock = Mutex.new
      end

      # Stores the page under its uuid and returns that uuid.
      def add(page)
        @lock.synchronize do
          key = uuid(page)
          @pages[key] = page
          key
        end
      end

      # True when a page with the same uuid has been stored.
      def exists?(page)
        @lock.synchronize { @pages.key?(uuid(page)) }
      end

      # Returns the stored page, or nil when absent.
      def get(page)
        @lock.synchronize { @pages[uuid(page)] }
      end

      # Deletes the page; returns the removed entry, or nil when absent.
      def remove(page)
        @lock.synchronize { @pages.delete(uuid(page)) }
      end

      def count
        @lock.synchronize { @pages.count }
      end

      # Yields each (uuid, page) pair. NOTE: iteration is not guarded by
      # the mutex, matching the original behaviour.
      def each
        @pages.each { |key, page| yield key, page }
      end

      # Drops every stored page.
      def clear
        @lock.synchronize { @pages = {} }
      end
    end
  end
end
# encoding: UTF-8
require 'mongo'
require 'zlib'
require 'thread'

module Polipus
  module Storage
    # MongoDB-backed page store. Documents are upserted by the page
    # uuid (unique index); body/headers/data are wrapped in
    # BSON::Binary, the body optionally deflate-compressed.
    class MongoStore < Base
      # Fields that must be stored as raw bytes rather than strings.
      BINARY_FIELDS = %w(body headers data)

      # @param options [Hash]
      #   :mongo         [Mongo::DB] target database (required)
      #   :collection    [String]    collection name (required)
      #   :compress_body [Boolean]   deflate page bodies (default: true)
      #   :except        [Array]     page fields dropped before saving
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)
        begin
          @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
        rescue StandardError
          # Best-effort: the index may already exist or the driver may
          # reject these options. (Was a bare `rescue Exception`, which
          # also swallowed SignalException/SystemExit.)
        end

        # FIX: was `options[:compress_body] ||= true`, which silently
        # forced compression back on when the caller passed `false`.
        @compress_body = options.fetch(:compress_body, true)
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Upserts the page document; returns its uuid.
      def add(page)
        @semaphore.synchronize do
          obj = page.to_hash
          @except.each { |e| obj.delete e.to_s }
          obj['uuid'] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          BINARY_FIELDS.each do |field|
            obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
          end
          @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
          obj['uuid']
        end
      end

      # True when a document with the page's uuid exists.
      def exists?(page)
        @semaphore.synchronize do
          doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
          !doc.nil?
        end
      end

      # Loads and decodes the stored page, or nil when absent.
      def get(page)
        @semaphore.synchronize do
          data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
          return load_page(data) if data
        end
      end

      # Deletes the page's document.
      def remove(page)
        @semaphore.synchronize do
          @mongo[@collection].remove(uuid: uuid(page))
        end
      end

      def count
        @mongo[@collection].count
      end

      # Yields each (uuid, Page) pair using a no-timeout cursor so long
      # scans are not killed by the server.
      def each
        @mongo[@collection].find({}, timeout: false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      # Drops the whole collection.
      def clear
        @mongo[@collection].drop
      end

      private

      # Converts a Mongo document back into a Page: unwraps binaries,
      # inflates the body, and falls back to the ObjectId timestamp
      # when the page has no fetched_at.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at = hash['_id'].generation_time.to_i if page.fetched_at.nil?
        page
      end
    end
  end
end
# encoding: UTF-8
require 'rethinkdb'
require 'thread'
require 'zlib'

module Polipus
  module Storage
    # RethinkDB-backed page store. Rows are keyed by the page uuid
    # (primary key `id`); body/headers/data are stored as binaries,
    # the body optionally deflate-compressed.
    class RethinkStore < Base
      # Fields that must be stored as raw bytes rather than strings.
      BINARY_FIELDS = %w(body headers data)

      # @param options [Hash]
      #   :conn          [RethinkDB::Connection] open connection (required)
      #   :table         [String]  table name (required)
      #   :compress_body [Boolean] deflate page bodies (default: true)
      #   :except        [Array]   page fields dropped before saving
      def initialize(options = {})
        @r = RethinkDB::RQL.new
        @rethink = options[:conn]
        @table = options[:table]

        unless @r.table_list.run(@rethink).include?(@table)
          @r.table_create(@table).run(@rethink)
          # FIX: the index_create query was built but never executed
          # (missing `.run`), so the created_at index never existed.
          @r.table(@table).index_create('created_at').run(@rethink)
        end

        # FIX: was `options[:compress_body] ||= true`, which silently
        # forced compression back on when the caller passed `false`.
        @compress_body = options.fetch(:compress_body, true)
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Inserts the page row (soft durability for throughput); returns
      # its uuid.
      def add(page)
        @semaphore.synchronize do
          obj = page.to_hash
          @except.each { |e| obj.delete e.to_s }
          obj[:id] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          obj['created_at'] ||= Time.now.to_i
          BINARY_FIELDS.each do |field|
            # Use some marshalling?
            obj[field] = @r.binary(obj[field]) unless obj[field].nil?
          end

          @r.table(@table).insert(obj).run(@rethink, durability: 'soft')
          obj[:id]
        end
      end

      # True when a row with the page's uuid exists.
      def exists?(page)
        @semaphore.synchronize do
          doc = @r.table(@table).get(uuid(page)).run(@rethink)
          !doc.nil?
        end
      end

      # Loads and decodes the stored page, or nil when absent.
      def get(page)
        @semaphore.synchronize do
          data = @r.table(@table).get(uuid(page)).run(@rethink)
          return load_page(data) if data
        end
      end

      # Deletes the page's row.
      def remove(page)
        @semaphore.synchronize do
          @r.table(@table).get(uuid(page)).delete.run(@rethink)
        end
      end

      def count
        @r.table(@table).count.run(@rethink)
      end

      # Yields each (id, Page) pair by streaming the whole table.
      def each
        @r.table(@table).run(@rethink).each do |doc|
          page = load_page(doc)
          yield doc[:id], page
        end
      end

      # Deletes every row (the table itself is kept).
      def clear
        @r.table(@table).delete.run(@rethink)
      end

      private

      # Converts a RethinkDB row back into a Page, inflating the body
      # and defaulting fetched_at to the row's created_at.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at ||= hash['created_at']
        page
      end
    end
  end
end
# encoding: UTF-8
module Polipus
  # Factories for the visited-URL trackers. Both helpers fill missing
  # options into the caller's hash in place, then build the tracker.
  module UrlTracker
    # Redis-backed bloom filter tracker: probabilistic membership in
    # fixed memory (false positives possible, false negatives not).
    def self.bloomfilter(options = {})
      require 'polipus/url_tracker/bloomfilter'
      options[:size]       ||= 1_000_000
      options[:error_rate] ||= 0.01
      options[:key_name]   ||= 'polipus-bloomfilter'
      options[:redis]      ||= Redis.current
      options[:driver]     ||= 'lua'
      self::Bloomfilter.new(options)
    end

    # Redis SET tracker: exact membership, memory grows per URL.
    def self.redis_set(options = {})
      require 'polipus/url_tracker/redis_set'
      options[:redis]    ||= Redis.current
      options[:key_name] ||= 'polipus-set'
      self::RedisSet.new(options)
    end
  end
end
# encoding: UTF-8
require 'redis-bloomfilter'
module Polipus
  module UrlTracker
    # Visited-URL tracker backed by a Redis bloom filter. Thin adapter
    # translating the tracker interface onto Redis::Bloomfilter.
    class Bloomfilter
      def initialize(options = {})
        @filter = Redis::Bloomfilter.new(options)
      end

      # True when the URL has (probably) been seen before.
      def visited?(url)
        @filter.include?(url)
      end

      # Records the URL as seen.
      def visit(url)
        @filter.insert(url)
      end

      # Forgets the URL.
      def remove(url)
        @filter.remove(url)
      end

      # Empties the whole filter.
      def clear
        @filter.clear
      end
    end
  end
end
# encoding: UTF-8
module Polipus
  module UrlTracker
    # Exact visited-URL tracker backed by a Redis SET.
    class RedisSet
      # @param options [Hash] :redis (client, defaults to Redis.current),
      #   :key_name (the SET key holding visited URLs)
      def initialize(options = {})
        @redis = options[:redis] || Redis.current
        @set_name = options[:key_name]
      end

      # True when the URL is a member of the set.
      def visited?(url)
        @redis.sismember(@set_name, url)
      end

      # Adds the URL to the set.
      def visit(url)
        @redis.sadd(@set_name, url)
      end

      # Removes the URL from the set.
      # FIX: dropped a stray third argument (`, 0`) — redis-rb's
      # `srem(key, member)` takes two arguments, so the extra value
      # raised ArgumentError on every call.
      def remove(url)
        @redis.srem(@set_name, url)
      end

      # Deletes the whole set.
      def clear
        @redis.del @set_name
      end
    end
  end
end