polipus 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +3 -0
- data/Rakefile +9 -0
- data/examples/basic.rb +58 -0
- data/examples/survival.rb +9 -0
- data/lib/polipus.rb +451 -0
- data/lib/polipus/http.rb +195 -0
- data/lib/polipus/page.rb +219 -0
- data/lib/polipus/plugin.rb +13 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +17 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +24 -0
- data/lib/polipus/queue_overflow/base.rb +6 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
- data/lib/polipus/queue_overflow/manager.rb +50 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +17 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/mongo_store.rb +86 -0
- data/lib/polipus/storage/s3_store.rb +100 -0
- data/lib/polipus/url_tracker.rb +20 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +4 -0
- data/polipus.gemspec +39 -0
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +11 -0
- data/spec/http_spec.rb +31 -0
- data/spec/page_spec.rb +22 -0
- data/spec/queue_overflow_manager_spec.rb +89 -0
- data/spec/queue_overflow_spec.rb +71 -0
- data/spec/spec_helper.rb +34 -0
- data/spec/storage_mongo_spec.rb +102 -0
- data/spec/storage_s3_spec.rb +115 -0
- data/spec/url_tracker_spec.rb +28 -0
- metadata +313 -0
module Polipus
  module Plugin
    # Demonstration plugin: shows the minimal shape of a Polipus plugin.
    class Sample
      # Options are accepted for interface parity with other plugins but
      # are not used by this sample.
      def initialize(options = {})
      end

      # Hook invoked when the crawler is initialized. The returned Proc is
      # expected to be instance_eval'd in the crawler's context, so
      # @options and @logger below refer to the crawler's own state —
      # not to this plugin instance.
      def on_initialize(crawler)
        proc do
          @options.each { |k, v| @logger.info {"Polipus configuration: #{k.to_s} => #{v}"} }
        end
      end
    end
  end
end
module Polipus
  module Plugin
    # Plugin that pauses the crawler after every processed message.
    # It also forces the crawler down to a single worker so the delay is
    # actually honored between consecutive requests.
    class Sleeper
      # @param options [Hash] :delay — seconds to sleep after each
      #   request (defaults to 1).
      def initialize(options = {})
        # Use || instead of ||= so the caller's options hash is not
        # mutated as a side effect (previous code wrote the default back
        # into the caller's hash).
        @delay = options[:delay] || 1
      end

      # Hook run at crawler initialization. The returned Proc is
      # instance_eval'd in the crawler's context, where @options is the
      # crawler's configuration.
      def on_initialize(crawler)
        crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
        Proc.new {
          # Set to 1 the number of threads
          @options[:workers] = 1
        }
      end

      # Hook run after each processed message: enforce the configured
      # delay.
      def on_message_processed(crawler)
        sleep @delay
      end
    end
  end
end
@@ -0,0 +1,24 @@
|
|
1
|
+
require "polipus/queue_overflow/manager"
|
2
|
+
module Polipus
  # Factory helpers building the overflow queues that absorb the main
  # queue's surplus items.
  module QueueOverflow
    # Mongo-backed FIFO overflow queue; connects to a local MongoDB when
    # mongo_db is nil.
    def self.mongo_queue(mongo_db, queue_name, options = {})
      require "polipus/queue_overflow/mongo_queue"
      mongo_db ||= default_mongo_db
      raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
      MongoQueue.new(mongo_db, queue_name, options)
    end

    # Capped variant of the Mongo queue: keeps at most options[:max]
    # items (defaults to 1_000_000).
    def self.mongo_queue_capped(mongo_db, queue_name, options = {})
      require "polipus/queue_overflow/mongo_queue_capped"
      mongo_db ||= default_mongo_db
      raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
      options[:max] = 1_000_000 if options[:max].nil?
      MongoQueueCapped.new(mongo_db, queue_name, options)
    end

    # Queue that silently discards everything pushed into it.
    def self.dev_null_queue(options = {})
      require "polipus/queue_overflow/dev_null_queue"
      DevNullQueue.new
    end

    # Default local MongoDB connection shared by the factories above.
    def self.default_mongo_db
      Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
    end
    private_class_method :default_mongo_db
  end
end
require "thread"
module Polipus
  module QueueOverflow
    # Overflow queue that discards everything: it is permanently empty,
    # pushes are no-ops and pops return nil. Mirrors the Queue-like
    # interface (size/dec/shift/enc/<<) used by the overflow manager.
    class DevNullQueue
      def initialize
      end

      # Always zero elements.
      def length
        0
      end

      # A null queue is always empty.
      def empty?
        true
      end

      # Nothing to clear.
      def clear
      end

      # Swallow the pushed data.
      def push(data)
      end

      # Nothing ever comes out; the non-blocking flag is ignored.
      def pop(_ = false)
        nil
      end

      alias size length
      alias dec pop
      alias shift pop
      alias enc push
      alias << push
    end
  end
end
module Polipus
  module QueueOverflow
    # Keeps the crawler's main queue close to item_limit: spills the
    # surplus into the overflow adapter and refills the main queue from
    # the adapter when it has room.
    class Manager
      # Optional Proc(page) -> boolean deciding whether a page may be
      # moved to the destination queue during rotation.
      attr_accessor :url_filter
      def initialize(polipus, main_q, item_limit)
        @polipus = polipus
        @main_q = main_q
        @adapter = @polipus.queue_overflow_adapter
        @item_limit = item_limit
      end

      # DSL-style setter: stores the given block as the url filter.
      def url_filter &block
        @url_filter = block
      end

      # Rebalance once. Returns [removed, restored] — how many messages
      # were spilled to the adapter and how many were pulled back.
      def perform
        removed = 0
        restored = 0

        if @main_q.size > @item_limit
          # Main queue over the limit: move the excess to the adapter.
          removed = rotate @main_q, @adapter,(@main_q.size - @item_limit)
        elsif @main_q.size < @item_limit && !@adapter.empty?
          # Main queue has room and the adapter holds items: refill.
          restored = rotate @adapter, @main_q,(@item_limit - @main_q.size)
        end
        [removed, restored]
      end

      private
      # Move up to `items` messages from source to dest. A message is
      # dropped (and not counted) when its page is already in storage or
      # is rejected by url_filter. Returns how many messages were
      # actually transferred.
      def rotate source, dest, items
        performed = 0
        1.upto(items){|i|
          # Non-blocking pop; nil means the source ran dry.
          message = source.pop(true)
          if message
            page = Page.from_json message
            # Skip pages that were already stored.
            unless @polipus.storage.exists?(page)
              allowed = !@url_filter.nil? ? @url_filter.call(page) : true
              if allowed
                dest << message
                performed += 1
              end
            end
          end
          # Acknowledge consumption when the source supports it.
          source.commit if source.respond_to? :commit
          # Stop early on a nil pop or once the source is drained.
          break if !message || source.empty?
        }
        performed
      end
    end
  end
end
require "thread"
module Polipus
  module QueueOverflow
    # Overflow queue persisted in a MongoDB collection named
    # "polipus_q_overflow_<queue_name>", consumed FIFO in _id order.
    # When :ensure_uniq is set, a unique index on :payload de-duplicates
    # pushed entries.
    class MongoQueue
      def initialize(mongo_db, queue_name, options = {})
        @mongo_db = mongo_db
        @collection_name = "polipus_q_overflow_#{queue_name}"
        @semaphore = Mutex.new
        @options = options
        @options[:ensure_uniq] ||= false
        ensure_index if @options[:ensure_uniq]
      end

      # Number of queued documents.
      def length
        @mongo_db[@collection_name].count
      end

      # True when no documents are queued.
      def empty?
        length == 0
      end

      # Drop the backing collection; rebuild the unique index when
      # de-duplication is enabled.
      def clear
        @mongo_db[@collection_name].drop
        ensure_index if @options[:ensure_uniq]
      end

      # Append a payload. Upserts (instead of inserting) when uniqueness
      # is enforced, so duplicates collapse into one document. Always
      # returns true.
      def push(data)
        if @options[:ensure_uniq]
          @mongo_db[@collection_name].update({:payload => data}, {:payload => data}, {:upsert => 1, :w => 1})
        else
          @mongo_db[@collection_name].insert({:payload => data})
        end
        true
      end

      # Remove and return the oldest payload, or nil when the queue is
      # empty. The non-blocking flag is accepted for Queue compatibility
      # and ignored. Find-then-remove runs under the mutex to keep the
      # pop atomic within this process.
      def pop(_ = false)
        @semaphore.synchronize do
          head = @mongo_db[@collection_name].find({}, :sort => {:_id => 1}).limit(1).first
          return nil if head.nil?
          @mongo_db[@collection_name].remove(:_id => head['_id'])
          head['payload'] || nil
        end
      end

      alias size length
      alias dec pop
      alias shift pop
      alias enc push
      alias << push

      protected

      # Background-built unique index on :payload, dropping duplicates.
      def ensure_index
        @mongo_db[@collection_name].ensure_index({:payload => 1},{:background => 1, :unique => 1, :drop_dups => 1})
      end
    end
  end
end
require "polipus/queue_overflow/mongo_queue"
module Polipus
  module QueueOverflow
    # MongoQueue variant capped at @options[:max] elements: after every
    # push the oldest surplus documents (smallest _id) are deleted.
    class MongoQueueCapped < MongoQueue
      def initialize(mongo_db, queue_name, options = {})
        super
        @max = @options[:max]
      end

      # Push via the parent, then trim anything beyond the cap while
      # holding the lock.
      def push(data)
        super
        @semaphore.synchronize do
          overflow = size - @max
          if overflow > 0
            stale_ids = @mongo_db[@collection_name].find({}, {:sort => {:_id => 1}, :fields => [:_id]}).limit(overflow).map { |doc| doc['_id'] }
            @mongo_db[@collection_name].remove({:_id => {'$in' => stale_ids}, '$isolated' => 1})
          end
        end
      end

      alias size length
      alias dec pop
      alias shift pop
      alias enc push
      alias << push
    end
  end
end
@@ -0,0 +1,31 @@
|
|
1
|
+
require "polipus/storage/base"
|
2
|
+
module Polipus
  # Factory helpers for the available page storages.
  module Storage
    # Build a MongoDB-backed store. Falls back to a local connection
    # when mongo is nil.
    # @raise [RuntimeError] when mongo is not a Mongo::DB
    def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
      require 'polipus/storage/mongo_store'
      mongo ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
      raise "First argument must be an instance of Mongo::DB" unless mongo.is_a?(Mongo::DB)
      self::MongoStore.new(:mongo => mongo, :collection => collection_name, :except => except)
    end

    # Build an S3-backed store.
    # @raise [RuntimeError] when either AWS credential is missing
    def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
      # Validate arguments before loading the aws/s3 dependency so a
      # misconfiguration fails fast with a clear message.
      if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
        # Fixed typo in the error message (was "crediantials").
        raise "You have to specify AWS credentials: :access_key_id and :secret_access_key"
      end
      require 'polipus/storage/s3_store'

      self::S3Store.new(
        :bucket => bucket_name,
        :access_key_id => aws_credential[:access_key_id],
        :secret_access_key => aws_credential[:secret_access_key],
        :except => except
      )
    end

    # Storage that discards everything (useful for tests/benchmarks).
    def self.dev_null
      require 'polipus/storage/dev_null'
      self::DevNull.new
    end
  end
end
require "uri"
# Fixed: Digest::MD5 is used below but digest was never required here —
# the code only worked when another file happened to load it first.
require "digest/md5"

module Polipus
  module Storage
    # Common behaviour shared by the concrete storages: computing a
    # stable identifier (uuid) for a page from its URL.
    class Base
      # When set to false, the query string is stripped before hashing,
      # so /page?a=1 and /page?a=2 map to the same uuid.
      attr_accessor :include_query_string_in_uuid

      protected
      # MD5 hex digest of the page URL. The query-string flag lazily
      # defaults to true on first use.
      def uuid page
        if @include_query_string_in_uuid.nil?
          @include_query_string_in_uuid = true
        end
        url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
        Digest::MD5.hexdigest(url_to_hash)
      end
    end
  end
end
module Polipus
  module Storage
    # No-op storage: persists nothing and reports every page as unseen.
    class DevNull < Base
      def initialize(options = {})
      end

      # Discard the page.
      def add(page)
      end

      # Nothing is ever stored.
      def exists?(page)
        false
      end

      # Nothing can be retrieved.
      def get(page)
        nil
      end

      # Nothing to remove.
      def remove(page)
        false
      end

      # The store is always empty.
      def count
        0
      end

      # Yields a single nil, matching the original iteration contract.
      def each
        yield nil
      end

      # Nothing to clear.
      def clear
      end
    end
  end
end
require "mongo"
require "zlib"
require "thread"
module Polipus
  module Storage
    # Page storage in a MongoDB collection. Pages are keyed by uuid
    # (unique index); bodies can be deflate-compressed and binary fields
    # are wrapped in BSON::Binary.
    class MongoStore < Base
      # Fields that may hold arbitrary bytes and must be stored as
      # BSON::Binary.
      BINARY_FIELDS = %w(body headers data)

      # @param options [Hash]
      #   :mongo         — a Mongo::DB instance (required)
      #   :collection    — collection name (required)
      #   :compress_body — deflate page bodies (default true)
      #   :except        — page-hash keys to strip before saving
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)
        @mongo[@collection].ensure_index(:uuid, :unique => true, :drop_dups => true, :background => true)
        # Fixed: the previous `options[:compress_body] ||= true` forced
        # compression on even when the caller explicitly passed false
        # (false ||= true => true) and mutated the caller's hash.
        # fetch keeps an explicit false and leaves the hash alone.
        @compress_body = options.fetch(:compress_body, true)
        # Avoid writing the default back into the caller's hash.
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Upsert the page document keyed by uuid; returns the uuid.
      def add page
        @semaphore.synchronize {
          obj = page.to_hash
          @except.each {|e| obj.delete e.to_s}
          obj['uuid'] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          BINARY_FIELDS.each do |field|
            obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
          end
          @mongo[@collection].update({:uuid => obj['uuid']}, obj, {:upsert => true, :w => 1})
          obj['uuid']
        }
      end

      # True when a document with this page's uuid exists.
      def exists?(page)
        @semaphore.synchronize {
          doc = @mongo[@collection].find({:uuid => uuid(page)}, {:fields => [:_id]}).limit(1).first
          !doc.nil?
        }
      end

      # Fetch and decode the stored page; nil when absent or corrupt.
      def get page
        @semaphore.synchronize {
          data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
          if data
            return load_page(data)
          end
        }
      end

      # Delete the stored page by uuid.
      def remove page
        @semaphore.synchronize {
          @mongo[@collection].remove({:uuid => uuid(page)})
        }
      end

      # Number of stored pages.
      def count
        @mongo[@collection].count
      end

      # Iterate all stored pages, yielding [uuid, page]. The cursor is
      # opened without a server timeout so long scans survive.
      def each
        @mongo[@collection].find({},:timeout => false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      # Drop the whole collection.
      def clear
        @mongo[@collection].drop
      end

      private
      # Convert a stored document back into a Page: unwrap binary
      # fields, inflate the body when compression is on. Returns nil
      # when decoding fails.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        begin
          hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
          return Page.from_hash(hash)
        rescue
        end
        nil
      end
    end
  end
end
require "aws/s3"
require "zlib"
require "thread"
require "json"
module Polipus
  module Storage
    # Page storage on Amazon S3 (legacy aws-s3 gem). Each page becomes a
    # single S3 object keyed by its uuid, whose payload is the page hash
    # serialized to JSON and deflate-compressed.
    class S3Store < Base
      # @param options [Hash] :bucket, :access_key_id,
      #   :secret_access_key, :except (page-hash keys to strip before
      #   saving)
      def initialize(options = {})
        @options = options
        @except = @options[:except] ||= []
        @semaphore = Mutex.new

        # aws-s3 keeps a single, gem-global connection.
        AWS::S3::Base.establish_connection!(
          :access_key_id => @options[:access_key_id],
          :secret_access_key => @options[:secret_access_key]
        )
        # Namespace the bucket to avoid clashing with unrelated buckets.
        @options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
        begin
          @bucket = AWS::S3::Bucket.find(@options[:bucket])
        rescue AWS::S3::NoSuchBucket
          # First run: create the bucket on demand.
          create_bucket
        end
      end

      # Serialize, compress and upload the page; returns its uuid.
      def add page
        @semaphore.synchronize {
          obj = page.to_hash
          @except.each {|e| obj.delete e.to_s}
          puuid = uuid(page)
          obj['uuid'] = puuid
          data = Zlib::Deflate.deflate(obj.to_json)
          AWS::S3::S3Object.store(puuid, data, @bucket.name)
          puuid
        }
      end

      # True when an object with this page's uuid exists in the bucket.
      # Not synchronized: also called from inside locked sections below.
      def exists?(page)
        AWS::S3::S3Object.exists? uuid(page), @bucket.name
      end

      # Download and decode the stored page, or nil when absent.
      def get page
        @semaphore.synchronize {
          if exists?(page)
            data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
            return load_page(data)
          end
          nil
        }
      end

      # Delete the stored page if present; always returns true.
      def remove page
        @semaphore.synchronize {
          if exists?(page)
            AWS::S3::S3Object.delete(uuid(page), @bucket.name)
          end
          true
        }
      end

      # Number of objects in the bucket.
      def count
        @bucket.size
      end

      # Drop and recreate the (namespaced) bucket.
      def clear
        AWS::S3::Bucket.delete(@bucket.name, :force => true)
        create_bucket
      end

      # Iterate every stored page, yielding [key, page]. Objects are
      # listed in batches using the last seen key as the :marker
      # (begin/end-while acts as a do-while loop).
      def each
        objects = []
        last_key = nil
        begin
          objects = AWS::S3::Bucket.objects(@bucket.name, :marker => last_key)
          break if objects.size == 0
          objects.each do |o|
            page = load_page(o.value)
            yield o.key, page
          end
          last_key = objects.last.key
        end while true
      end

      private
      # Inflate + JSON-parse an S3 payload back into a Page; nil on any
      # decoding error.
      def load_page(data)
        begin
          payload = Zlib::Inflate.inflate(data)
          hash = JSON.parse(payload)
          return Page.from_hash(hash)
        rescue
        end
        nil
      end

      # Create the namespaced bucket and memoize the handle.
      def create_bucket
        AWS::S3::Bucket.create(@options[:bucket])
        @bucket = AWS::S3::Bucket.find(@options[:bucket])
      end
    end
  end
end