parallel588_polipus 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +33 -0
- data/.travis.yml +22 -0
- data/AUTHORS.md +5 -0
- data/CHANGELOG.md +61 -0
- data/Gemfile +12 -0
- data/LICENSE.txt +20 -0
- data/README.md +70 -0
- data/Rakefile +8 -0
- data/examples/basic.rb +63 -0
- data/examples/error_handling.rb +23 -0
- data/examples/incremental.rb +63 -0
- data/examples/robots_txt_handling.rb +14 -0
- data/examples/survival.rb +10 -0
- data/lib/polipus.rb +488 -0
- data/lib/polipus/http.rb +282 -0
- data/lib/polipus/page.rb +256 -0
- data/lib/polipus/plugin.rb +14 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +15 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +26 -0
- data/lib/polipus/queue_overflow/base.rb +7 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
- data/lib/polipus/queue_overflow/manager.rb +57 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
- data/lib/polipus/queue_overflow/worker.rb +24 -0
- data/lib/polipus/robotex.rb +145 -0
- data/lib/polipus/signal_handler.rb +42 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +20 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/memory_store.rb +56 -0
- data/lib/polipus/storage/mongo_store.rb +90 -0
- data/lib/polipus/storage/rethink_store.rb +90 -0
- data/lib/polipus/url_tracker.rb +21 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +5 -0
- data/polipus.gemspec +44 -0
- data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
- data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
- data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
- data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
- data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
- data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
- data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
- data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
- data/spec/cassettes/gzipped_on.yml +147 -0
- data/spec/cassettes/http_cookies.yml +133 -0
- data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +12 -0
- data/spec/polipus/http_spec.rb +139 -0
- data/spec/polipus/page_spec.rb +68 -0
- data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
- data/spec/polipus/queue_overflow_spec.rb +66 -0
- data/spec/polipus/robotex_spec.rb +85 -0
- data/spec/polipus/signal_handler_spec.rb +15 -0
- data/spec/polipus/storage/memory_store_spec.rb +87 -0
- data/spec/polipus/storage/mongo_store_spec.rb +119 -0
- data/spec/polipus/storage/rethink_store_spec.rb +117 -0
- data/spec/polipus/url_tracker_spec.rb +29 -0
- data/spec/polipus_spec.rb +107 -0
- data/spec/spec_helper.rb +42 -0
- metadata +348 -0
@@ -0,0 +1,42 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'singleton'
|
3
|
+
module Polipus
  # Process-wide switchboard for cooperative shutdown on SIGINT / SIGTERM.
  #
  # The state lives in a Singleton instance and is driven through the
  # class-level helpers below. Once {.enable} has been called, an incoming
  # INT or TERM signal sets the +terminated+ flag instead of stopping the
  # process; after {.disable}, the same signals call +exit+ again.
  # Nothing in this class ever resets +terminated+ back to false.
  class SignalHandler
    include Singleton

    # Signals intercepted by {.enable}.
    HANDLED_SIGNALS = [:INT, :TERM].freeze

    # true once a handled signal arrived (or {.terminate} was called).
    attr_accessor :terminated
    # true while handled signals should request termination rather than exit.
    attr_accessor :enabled

    def initialize
      self.terminated = false
      self.enabled = false
    end

    # Installs the INT/TERM handlers and switches the handler on.
    #
    # Returns true.
    def self.enable
      # Create the singleton *before* any handler can fire: Singleton guards
      # first-time creation with a Mutex, and a Mutex cannot be locked from
      # inside a trap handler.
      handler = instance

      HANDLED_SIGNALS.each do |signal|
        trap(signal) do
          # When handling was switched off again, fall back to a plain exit.
          exit unless enabled?
          terminate
        end
      end

      handler.enabled = true
    end

    # Switches the handler off; the installed traps stay in place but will
    # call +exit+ from now on.
    def self.disable
      instance.enabled = false
    end

    # Marks the process as "termination requested".
    def self.terminate
      instance.terminated = true
    end

    # Returns whether termination has been requested.
    def self.terminated?
      instance.terminated
    end

    # Returns whether signal handling is currently switched on.
    def self.enabled?
      instance.enabled
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'polipus/storage/base'
|
2
|
+
|
3
|
+
module Polipus
  # Factory entry points for the page storage backends. Each backend file is
  # required lazily, on first use of its factory method.
  module Storage
    # Default collection / table name used by the database-backed stores.
    COLLECTION = 'pages'

    class << self
      # Builds a MongoStore.
      #
      # mongo      - a Mongo::DB handle; when omitted, a connection to
      #              localhost:27017 (database 'polipus') is opened.
      # collection - name of the collection holding the pages.
      # except     - page fields that must not be persisted.
      #
      # Raises RuntimeError when +mongo+ is not a Mongo::DB.
      def mongo_store(mongo = nil, collection = COLLECTION, except = [])
        require 'polipus/storage/mongo_store'

        unless mongo
          connection = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5)
          mongo = connection.db('polipus')
        end
        raise 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)

        self::MongoStore.new(mongo: mongo, collection: collection, except: except)
      end

      # Builds a RethinkStore.
      #
      # conn   - a RethinkDB::Connection; when omitted, a connection to
      #          localhost:28015 (database 'polipus') is opened.
      # table  - name of the table holding the pages.
      # except - page fields that must not be persisted.
      #
      # Raises RuntimeError when +conn+ is not a RethinkDB::Connection.
      def rethink_store(conn = nil, table = COLLECTION, except = [])
        require 'polipus/storage/rethink_store'

        conn = RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus') unless conn
        unless conn.is_a?(RethinkDB::Connection)
          raise "First argument must be a RethinkDB::Connection, got `#{conn.class}`"
        end

        self::RethinkStore.new(conn: conn, table: table, except: except)
      end

      # Builds a store that discards everything.
      def dev_null
        require 'polipus/storage/dev_null'
        self::DevNull.new
      end

      # Builds an in-process, hash-backed store.
      def memory_store
        require 'polipus/storage/memory_store'
        self::MemoryStore.new
      end
    end
  end
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'digest/md5'
require 'uri'
|
3
|
+
|
4
|
+
module Polipus
  module Storage
    # Shared superclass of the storage backends. It only provides the
    # identifier scheme: a page is keyed by the MD5 hex digest of its URL.
    class Base
      # When false, everything from the first '?' onwards is dropped from the
      # URL before hashing, so pages differing only by query string share one
      # id. Left unset (nil), it is treated as true.
      attr_accessor :include_query_string_in_uuid

      protected

      # Computes the storage id for +page+.
      #
      # page - any object whose #url responds to #to_s.
      #
      # Returns a 32-character hexadecimal String.
      # Side effect: an unset +include_query_string_in_uuid+ is fixed to true
      # on the first call.
      def uuid(page)
        @include_query_string_in_uuid = true if @include_query_string_in_uuid.nil?

        url = page.url.to_s
        url = url.gsub(/\?.*$/, '') unless @include_query_string_in_uuid
        Digest::MD5.hexdigest(url)
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module Storage
    # Null-object storage backend: accepts every call and keeps nothing.
    class DevNull < Base
      # Options are accepted for signature parity with the other stores and
      # ignored.
      def initialize(_options = {})
      end

      # Discards the page. Returns nil.
      def add(_page)
      end

      # Nothing is ever stored, so this is always false.
      def exists?(_page)
        false
      end

      # Nothing is ever stored, so this is always nil.
      def get(_page)
        nil
      end

      # There is nothing to remove; always false.
      def remove(_page)
        false
      end

      # Always 0.
      def count
        0
      end

      # Iterates over the stored pages - of which there are none, so the block
      # is never invoked. (Previously a single nil was yielded, which
      # contradicted #count and broke callers dereferencing the yielded page.)
      def each
      end

      # Nothing to clear. Returns nil.
      def clear
      end
    end
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'thread'
|
3
|
+
module Polipus
  module Storage
    # Storage backend that keeps pages in a Hash inside the current process.
    # Every access to the Hash is serialised through one Mutex.
    class MemoryStore < Base
      # Options are accepted for signature parity with the other stores and
      # ignored.
      def initialize(_options = {})
        @store = {}
        @semaphore = Mutex.new
      end

      # Stores +page+ under its uuid, replacing any previous entry.
      #
      # Returns the uuid String.
      def add(page)
        @semaphore.synchronize do
          id = uuid(page)
          @store[id] = page
          id
        end
      end

      # Returns true when a page with the same uuid is stored.
      def exists?(page)
        @semaphore.synchronize { @store.key?(uuid(page)) }
      end

      # Returns the stored page with the same uuid, or nil.
      def get(page)
        @semaphore.synchronize { @store[uuid(page)] }
      end

      # Deletes the entry with the same uuid.
      #
      # Returns the removed page, or nil when there was none.
      def remove(page)
        @semaphore.synchronize { @store.delete(uuid(page)) }
      end

      # Returns the number of stored pages.
      def count
        @semaphore.synchronize { @store.size }
      end

      # Yields +uuid, page+ for every stored entry.
      #
      # Iteration runs over a snapshot taken under the lock: a concurrent
      # #add / #remove can no longer blow up the iteration, and because the
      # lock is released before yielding, the block may call back into the
      # store without deadlocking.
      def each
        snapshot = @semaphore.synchronize { @store.dup }
        snapshot.each { |id, page| yield id, page }
      end

      # Drops every stored page.
      def clear
        @semaphore.synchronize { @store = {} }
      end
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'mongo'
|
3
|
+
require 'zlib'
|
4
|
+
require 'thread'
|
5
|
+
|
6
|
+
module Polipus
  module Storage
    # Storage backend persisting pages in a MongoDB collection, one document
    # per page, keyed by a unique +uuid+ field.
    class MongoStore < Base
      # Fields wrapped in BSON::Binary on write and turned back into Strings
      # on read.
      BINARY_FIELDS = %w(body headers data).freeze

      # options - Hash:
      #           :mongo         - database handle (must respond to
      #                            #create_collection and #[]).
      #           :collection    - collection name.
      #           :compress_body - deflate the body before storing; defaults
      #                            to true, pass false to disable.
      #           :except        - page fields to leave out of the document.
      #
      # The caller's options Hash is not modified.
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)

        begin
          @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
        rescue StandardError
          # Index creation is best-effort: the store remains usable without
          # it. Only StandardError is swallowed so that interrupts and exit
          # requests still propagate.
        end

        # `options[:compress_body] ||= true` could never be switched off,
        # because an explicit false was overwritten with true as well.
        compress = options[:compress_body]
        @compress_body = compress.nil? ? true : compress
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Upserts +page+ (matched on uuid).
      #
      # Returns the uuid String.
      def add(page)
        @semaphore.synchronize do
          doc = page.to_hash
          @except.each { |field| doc.delete(field.to_s) }
          doc['uuid'] = uuid(page)
          doc['body'] = Zlib::Deflate.deflate(doc['body']) if @compress_body && doc['body']
          BINARY_FIELDS.each do |field|
            doc[field] = BSON::Binary.new(doc[field]) unless doc[field].nil?
          end
          @mongo[@collection].update({ uuid: doc['uuid'] }, doc, upsert: true, w: 1)
          doc['uuid']
        end
      end

      # Returns true when a document with the page's uuid exists.
      def exists?(page)
        @semaphore.synchronize do
          # Only _id is fetched; the document content is irrelevant here.
          doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
          !doc.nil?
        end
      end

      # Returns the stored Page with the same uuid, or nil.
      def get(page)
        @semaphore.synchronize do
          data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
          load_page(data) if data
        end
      end

      # Deletes the document with the page's uuid.
      def remove(page)
        @semaphore.synchronize do
          @mongo[@collection].remove(uuid: uuid(page))
        end
      end

      # Returns the number of documents in the collection.
      def count
        @mongo[@collection].count
      end

      # Yields +uuid, page+ for every stored document.
      def each
        @mongo[@collection].find({}, timeout: false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      # Drops the whole collection.
      def clear
        @mongo[@collection].drop
      end

      private

      # Rebuilds a Page from a stored document (the document is modified in
      # place). Binary fields are stringified, the body is inflated when
      # compression is on, and a missing +fetched_at+ falls back to the
      # generation time of the document's _id.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at = hash['_id'].generation_time.to_i if page.fetched_at.nil?
        page
      end
    end
  end
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'rethinkdb'
|
3
|
+
require 'thread'
|
4
|
+
require 'zlib'
|
5
|
+
|
6
|
+
module Polipus
  module Storage
    # Storage backend persisting pages in a RethinkDB table, one document per
    # page, with the page uuid as primary key (+id+).
    class RethinkStore < Base
      # Fields wrapped with r.binary on write and turned back into Strings on
      # read.
      BINARY_FIELDS = %w(body headers data).freeze

      # options - Hash:
      #           :conn          - open RethinkDB connection.
      #           :table         - table name; created when missing.
      #           :compress_body - deflate the body before storing; defaults
      #                            to true, pass false to disable.
      #           :except        - page fields to leave out of the document.
      #
      # The caller's options Hash is not modified.
      def initialize(options = {})
        @r = RethinkDB::RQL.new
        @rethink = options[:conn]
        @table = options[:table]

        unless @r.table_list.run(@rethink).include?(@table)
          @r.table_create(@table).run(@rethink)
          # A ReQL term does nothing until it is run; without `.run` the
          # secondary index was never actually created.
          @r.table(@table).index_create('created_at').run(@rethink)
        end

        # `options[:compress_body] ||= true` could never be switched off,
        # because an explicit false was overwritten with true as well.
        compress = options[:compress_body]
        @compress_body = compress.nil? ? true : compress
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Stores +page+, replacing an existing document with the same id so that
      # re-adding a page updates it (matching the other stores' upsert
      # behaviour; a plain insert reports a primary-key conflict and keeps the
      # old document).
      #
      # Returns the uuid String.
      def add(page)
        @semaphore.synchronize do
          doc = page.to_hash
          @except.each { |field| doc.delete(field.to_s) }
          doc[:id] = uuid(page)
          doc['body'] = Zlib::Deflate.deflate(doc['body']) if @compress_body && doc['body']
          doc['created_at'] ||= Time.now.to_i
          BINARY_FIELDS.each do |field|
            # Use some marshalling?
            doc[field] = @r.binary(doc[field]) unless doc[field].nil?
          end

          @r.table(@table).insert(doc, conflict: 'replace').run(@rethink, durability: 'soft')
          doc[:id]
        end
      end

      # Returns true when a document with the page's uuid exists.
      def exists?(page)
        @semaphore.synchronize do
          doc = @r.table(@table).get(uuid(page)).run(@rethink)
          !doc.nil?
        end
      end

      # Returns the stored Page with the same uuid, or nil.
      def get(page)
        @semaphore.synchronize do
          data = @r.table(@table).get(uuid(page)).run(@rethink)
          load_page(data) if data
        end
      end

      # Deletes the document with the page's uuid.
      def remove(page)
        @semaphore.synchronize do
          @r.table(@table).get(uuid(page)).delete.run(@rethink)
        end
      end

      # Returns the number of documents in the table.
      def count
        @r.table(@table).count.run(@rethink)
      end

      # Yields +uuid, page+ for every stored document.
      def each
        @r.table(@table).run(@rethink).each do |doc|
          page = load_page(doc)
          # The rest of this class reads result documents with String keys
          # ('body', 'created_at'); the Symbol lookup is kept as a fallback.
          yield(doc['id'] || doc[:id], page)
        end
      end

      # Deletes every document; the table itself is kept.
      def clear
        @r.table(@table).delete.run(@rethink)
      end

      private

      # Rebuilds a Page from a stored document (the document is modified in
      # place). Binary fields are stringified, the body is inflated when
      # compression is on, and a missing +fetched_at+ falls back to the
      # document's +created_at+.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at ||= hash['created_at']
        page
      end
    end
  end
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  # Factory entry points for the visited-URL trackers. Each tracker file is
  # required lazily, on first use of its factory method.
  module UrlTracker
    # Builds a bloom-filter backed tracker.
    #
    # options - Hash passed on to the tracker; missing (nil/false) entries
    #           are filled with these defaults:
    #           :size       - 1_000_000
    #           :error_rate - 0.01
    #           :key_name   - 'polipus-bloomfilter'
    #           :redis      - Redis.current
    #           :driver     - 'lua'
    #
    # The defaults are applied to a copy, so the caller's Hash is left
    # untouched (and may be frozen).
    def self.bloomfilter(options = {})
      require 'polipus/url_tracker/bloomfilter'
      options = options.dup
      options[:size]       ||= 1_000_000
      options[:error_rate] ||= 0.01
      options[:key_name]   ||= 'polipus-bloomfilter'
      options[:redis]      ||= Redis.current
      options[:driver]     ||= 'lua'
      self::Bloomfilter.new(options)
    end

    # Builds a Redis-set backed tracker.
    #
    # options - Hash passed on to the tracker; missing (nil/false) entries
    #           are filled with these defaults:
    #           :redis    - Redis.current
    #           :key_name - 'polipus-set'
    #
    # The defaults are applied to a copy, so the caller's Hash is left
    # untouched (and may be frozen).
    def self.redis_set(options = {})
      require 'polipus/url_tracker/redis_set'
      options = options.dup
      options[:redis]    ||= Redis.current
      options[:key_name] ||= 'polipus-set'
      self::RedisSet.new(options)
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require 'redis-bloomfilter'
|
3
|
+
module Polipus
  module UrlTracker
    # Visited-URL tracker that delegates every operation to a
    # Redis::Bloomfilter built from the given options.
    class Bloomfilter
      # options - Hash handed to Redis::Bloomfilter.new unchanged.
      def initialize(options = {})
        @bf = Redis::Bloomfilter.new(options)
      end

      # Asks the filter whether +url+ has been recorded.
      def visited?(url)
        @bf.include?(url)
      end

      # Records +url+ in the filter.
      def visit(url)
        @bf.insert(url)
      end

      # Asks the filter to forget +url+.
      def remove(url)
        @bf.remove(url)
      end

      # Empties the filter.
      def clear
        @bf.clear
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
module Polipus
  module UrlTracker
    # Visited-URL tracker that stores every URL as a member of one Redis set.
    class RedisSet
      # Set key used when none is supplied; same value the
      # UrlTracker.redis_set factory applies.
      DEFAULT_KEY_NAME = 'polipus-set'

      # options - Hash:
      #           :redis    - Redis client (defaults to Redis.current).
      #           :key_name - key of the set (defaults to DEFAULT_KEY_NAME).
      def initialize(options = {})
        @redis = options[:redis] || Redis.current
        @set_name = options[:key_name] || DEFAULT_KEY_NAME
      end

      # Returns whether +url+ is a member of the set (SISMEMBER).
      def visited?(url)
        @redis.sismember(@set_name, url)
      end

      # Adds +url+ to the set (SADD).
      def visit(url)
        @redis.sadd(@set_name, url)
      end

      # Removes +url+ from the set (SREM).
      #
      # SREM takes the key followed by the member(s) only. The stray third
      # argument `0` that used to be passed here either raised ArgumentError
      # (two-argument clients) or was treated as a second member to remove.
      def remove(url)
        @redis.srem(@set_name, url)
      end

      # Deletes the whole set (DEL).
      def clear
        @redis.del(@set_name)
      end
    end
  end
end
|