polipus 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/.document +5 -0
- data/.gitignore +53 -0
- data/.rspec +2 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +20 -0
- data/README.md +55 -0
- data/README.rdoc +3 -0
- data/Rakefile +9 -0
- data/examples/basic.rb +58 -0
- data/examples/survival.rb +9 -0
- data/lib/polipus.rb +451 -0
- data/lib/polipus/http.rb +195 -0
- data/lib/polipus/page.rb +219 -0
- data/lib/polipus/plugin.rb +13 -0
- data/lib/polipus/plugins/cleaner.rb +25 -0
- data/lib/polipus/plugins/sample.rb +17 -0
- data/lib/polipus/plugins/sleeper.rb +22 -0
- data/lib/polipus/queue_overflow.rb +24 -0
- data/lib/polipus/queue_overflow/base.rb +6 -0
- data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
- data/lib/polipus/queue_overflow/manager.rb +50 -0
- data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
- data/lib/polipus/storage.rb +31 -0
- data/lib/polipus/storage/base.rb +17 -0
- data/lib/polipus/storage/dev_null.rb +35 -0
- data/lib/polipus/storage/mongo_store.rb +86 -0
- data/lib/polipus/storage/s3_store.rb +100 -0
- data/lib/polipus/url_tracker.rb +20 -0
- data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
- data/lib/polipus/url_tracker/redis_set.rb +27 -0
- data/lib/polipus/version.rb +4 -0
- data/polipus.gemspec +39 -0
- data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
- data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
- data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
- data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
- data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
- data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
- data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
- data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
- data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
- data/spec/cassettes/http_test.yml +1418 -0
- data/spec/cassettes/http_test_redirect.yml +71 -0
- data/spec/clear.rb +11 -0
- data/spec/http_spec.rb +31 -0
- data/spec/page_spec.rb +22 -0
- data/spec/queue_overflow_manager_spec.rb +89 -0
- data/spec/queue_overflow_spec.rb +71 -0
- data/spec/spec_helper.rb +34 -0
- data/spec/storage_mongo_spec.rb +102 -0
- data/spec/storage_s3_spec.rb +115 -0
- data/spec/url_tracker_spec.rb +28 -0
- metadata +313 -0
module Polipus
  module Plugin
    # Sample: a minimal skeleton plugin demonstrating the Polipus plugin
    # contract. Hooks are looked up by name (e.g. #on_initialize); a hook
    # may return a Proc which the crawler later evaluates in its own
    # context.
    class Sample
      # Accepts (and ignores) an options hash, mirroring other plugins.
      def initialize(options = {})
      end

      # Returns a Proc intended to be instance-eval'd by the crawler:
      # inside it, @options and @logger refer to the *crawler's* state,
      # not this plugin's (the plugin itself never sets @options).
      def on_initialize crawler
        proc do
          @options.each do |key, value|
            @logger.info {"Polipus configuration: #{key.to_s} => #{value}"}
          end
        end
      end
    end
  end
end
module Polipus
  module Plugin
    # Sleeper: throttles the crawler by sleeping after every processed
    # message. Because a per-message sleep only makes sense with a single
    # consumer, its on_initialize hook forces the crawler to one worker.
    class Sleeper
      # options[:delay] - seconds to sleep after each request (default 1).
      def initialize(options = {})
        # BUGFIX: was `options[:delay] ||= 1`, which wrote the default back
        # into the caller's hash as a side effect; plain `||` keeps the
        # default local to the plugin.
        @delay = options[:delay] || 1
      end

      # Returns a Proc the crawler instance-evals: inside it, @options is
      # the crawler's option hash, which we clamp to a single worker.
      def on_initialize crawler
        crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
        Proc.new {
          # Set to 1 the number of threads
          @options[:workers] = 1
        }
      end

      # Hook invoked after each processed message: the actual throttle.
      def on_message_processed crawler
        sleep @delay
      end
    end
  end
end
require "polipus/queue_overflow/manager"
module Polipus
  # Factory helpers for overflow-queue backends. Each helper lazily
  # requires its implementation so unused backends add no load cost.
  module QueueOverflow
    # Returns a MongoQueue spilling into +queue_name+; connects to a
    # local MongoDB when +mongo_db+ is nil.
    def self.mongo_queue(mongo_db, queue_name, options = {})
      require "polipus/queue_overflow/mongo_queue"
      db = mongo_db || default_mongo_db
      raise "First argument must be an instance of Mongo::DB" unless db.is_a?(Mongo::DB)
      self::MongoQueue.new(db, queue_name, options)
    end

    # Same as .mongo_queue but bounded: at most options[:max] documents
    # (defaults to 1,000,000) are retained.
    def self.mongo_queue_capped(mongo_db, queue_name, options = {})
      require "polipus/queue_overflow/mongo_queue_capped"
      db = mongo_db || default_mongo_db
      raise "First argument must be an instance of Mongo::DB" unless db.is_a?(Mongo::DB)
      options[:max] = 1_000_000 if options[:max].nil?
      self::MongoQueueCapped.new(db, queue_name, options)
    end

    # A queue that discards everything; handy for disabling overflow.
    def self.dev_null_queue(options = {})
      require "polipus/queue_overflow/dev_null_queue"
      self::DevNullQueue.new
    end

    # Shared default connection: local MongoDB, 'polipus' database.
    def self.default_mongo_db
      Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
    end
    private_class_method :default_mongo_db
  end
end
require "thread"
module Polipus
  module QueueOverflow
    # A black-hole overflow queue: accepts any item and retains nothing.
    # It mirrors the MongoQueue interface (length/empty?/clear/push/pop
    # plus the size/dec/shift/enc/<< aliases) so callers can swap it in
    # without changes.
    class DevNullQueue
      def initialize
      end

      # Always empty, by definition.
      def length
        0
      end

      def empty?
        true
      end

      # Nothing is stored, so nothing to clear.
      def clear
      end

      # Swallows +data+; returns nil like the original no-op body.
      def push data
      end

      # Never yields an element. The optional flag matches Queue#pop's
      # non-blocking signature and is ignored.
      def pop(_ = false)
        nil
      end

      alias_method :size,  :length
      alias_method :dec,   :pop
      alias_method :shift, :pop
      alias_method :enc,   :push
      alias_method :<<,    :push
    end
  end
end
module Polipus
  module QueueOverflow
    # Keeps the crawler's main queue bounded: when it grows past
    # +item_limit+ the excess is rotated out to the overflow adapter, and
    # when it shrinks below the limit items are rotated back in.
    class Manager
      # NOTE(review): this accessor's reader is shadowed by the
      # block-taking #url_filter method defined below; only the writer
      # (url_filter=) remains usable from outside.
      attr_accessor :url_filter

      # polipus    - the crawler (supplies queue_overflow_adapter and storage)
      # main_q     - the primary queue being protected
      # item_limit - target maximum size for main_q
      def initialize(polipus, main_q, item_limit)
        @polipus = polipus
        @main_q = main_q
        @adapter = @polipus.queue_overflow_adapter
        @item_limit = item_limit
      end

      # Registers a predicate called with each candidate Page during
      # rotation; a falsy result drops the message instead of moving it.
      def url_filter &block
        @url_filter = block
      end

      # Rebalances the two queues once. Returns [removed, restored]:
      # how many messages were spilled out of, and pulled back into,
      # the main queue.
      def perform
        removed = 0
        restored = 0

        if @main_q.size > @item_limit
          removed = rotate @main_q, @adapter,(@main_q.size - @item_limit)
        elsif @main_q.size < @item_limit && !@adapter.empty?
          restored = rotate @adapter, @main_q,(@item_limit - @main_q.size)
        end
        [removed, restored]
      end

      private
      # Moves up to +items+ messages from +source+ to +dest+, skipping
      # pages already present in storage or rejected by @url_filter.
      # Returns the number actually transferred; skipped messages are
      # consumed from source but not counted.
      def rotate source, dest, items
        performed = 0
        1.upto(items){|i|
          # non-blocking pop: nil when source has nothing left
          message = source.pop(true)
          if message
            page = Page.from_json message
            unless @polipus.storage.exists?(page)
              allowed = !@url_filter.nil? ? @url_filter.call(page) : true
              if allowed
                dest << message
                performed += 1
              end
            end
          end
          # some queue backends batch reads; flush after each message
          source.commit if source.respond_to? :commit
          break if !message || source.empty?
        }
        performed
      end
    end
  end
end
require "thread"
module Polipus
  module QueueOverflow
    # FIFO overflow queue persisted in a MongoDB collection named
    # "polipus_q_overflow_<queue_name>". FIFO ordering comes from
    # sorting on the monotonically increasing _id.
    class MongoQueue
      # options:
      #   :ensure_uniq - when true, payloads are deduplicated via a
      #                  unique index plus upserts (default false)
      def initialize(mongo_db, queue_name, options = {})
        @mongo_db = mongo_db
        @collection_name = "polipus_q_overflow_#{queue_name}"
        @semaphore = Mutex.new
        @options = options
        @options[:ensure_uniq] ||= false
        if @options[:ensure_uniq]
          ensure_index
        end
      end

      # Number of queued documents.
      def length
        @mongo_db[@collection_name].count
      end

      def empty?
        !(length > 0)
      end

      # Drops the backing collection (and recreates the unique index
      # when deduplication is enabled).
      def clear
        @mongo_db[@collection_name].drop
        if @options[:ensure_uniq]
          ensure_index
        end
      end

      # Enqueues +data+; with :ensure_uniq an upsert keeps at most one
      # copy of each payload. Always returns true.
      def push data
        unless @options[:ensure_uniq]
          @mongo_db[@collection_name].insert({:payload => data})
        else
          @mongo_db[@collection_name].update({:payload => data}, {:payload => data}, {:upsert => 1, :w => 1})
        end
        true
      end

      # Dequeues the oldest payload, or nil when empty. The flag matches
      # Queue#pop's non-blocking signature and is ignored. NOTE(review):
      # the Mutex serializes poppers within this process only; the
      # find-then-remove pair is not atomic across processes.
      def pop(_ = false)
        @semaphore.synchronize {
          doc = @mongo_db[@collection_name].find({},:sort => {:_id => 1}).limit(1).first
          return nil if doc.nil?
          @mongo_db[@collection_name].remove(:_id => doc['_id'])
          doc && doc['payload'] ? doc['payload'] : nil
        }
      end

      alias :size :length
      alias :dec :pop
      alias :shift :pop
      alias :enc :push
      alias :<< :push

      protected
      # Unique index on :payload backs the dedup upsert path.
      def ensure_index
        @mongo_db[@collection_name].ensure_index({:payload => 1},{:background => 1, :unique => 1, :drop_dups => 1})
      end
    end
  end
end
require "polipus/queue_overflow/mongo_queue"
module Polipus
  module QueueOverflow
    # A MongoQueue bounded to options[:max] documents: after each push
    # the oldest surplus documents are deleted, so only the most recent
    # +max+ entries are retained.
    class MongoQueueCapped < MongoQueue
      def initialize(mongo_db, queue_name, options = {})
        super
        @max = @options[:max]
      end

      # Enqueues +data+, then trims the oldest documents down to @max.
      # The trim runs under the Mutex; MongoQueue#push itself does not
      # take the lock, so calling super here cannot deadlock.
      def push data
        super
        @semaphore.synchronize {
          s = size
          if s > @max
            docs = @mongo_db[@collection_name].find({},{:sort => {:_id => 1}, :fields => [:_id]}).limit(s-@max).map { |e| e['_id'] }
            @mongo_db[@collection_name].remove({:_id => {'$in' => docs}, '$isolated' => 1})
          end
        }
      end

      # Re-declared so the push-aliases bind to *this* class's push
      # (the superclass aliases were captured against MongoQueue#push).
      alias :size :length
      alias :dec :pop
      alias :shift :pop
      alias :enc :push
      alias :<< :push
    end
  end
end
@@ -0,0 +1,31 @@
|
|
1
|
+
require "polipus/storage/base"
|
2
|
+
module Polipus
  # Factory helpers for page-storage backends. Each helper requires its
  # implementation lazily so unused backends don't load their gems.
  module Storage
    # MongoDB-backed store. Connects to a local MongoDB when +mongo+ is
    # nil. +except+ lists page attributes to drop before persisting.
    def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
      require 'polipus/storage/mongo_store'
      mongo ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
      raise "First argument must be an instance of Mongo::DB" unless mongo.is_a?(Mongo::DB)
      self::MongoStore.new(:mongo => mongo, :collection => collection_name, :except => except)
    end

    # Amazon S3-backed store. Credentials are validated *before* loading
    # the aws/s3 implementation so a misconfiguration fails fast with a
    # clear message instead of a LoadError/connection error.
    def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
      if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
        # BUGFIX: corrected typo "crediantials" -> "credentials" in the
        # user-facing error message.
        raise "You have to specify AWS credentials: :access_key_id and :secret_access_key"
      end
      require 'polipus/storage/s3_store'

      self::S3Store.new(
        :bucket => bucket_name,
        :access_key_id => aws_credential[:access_key_id],
        :secret_access_key => aws_credential[:secret_access_key],
        :except => except
      )
    end

    # A storage that persists nothing; useful for dry runs.
    def self.dev_null
      require 'polipus/storage/dev_null'
      self::DevNull.new
    end
  end
end
require "uri"

module Polipus
  module Storage
    # Common base for storage backends. Provides the page-identity
    # digest used as the storage key.
    class Base
      # When false, the query string is stripped before hashing, so
      # http://a/p?x=1 and http://a/p?x=2 share one uuid. Defaults to
      # true (lazily, on first #uuid call).
      attr_accessor :include_query_string_in_uuid
      protected
      # MD5 hex digest of the page URL. NOTE(review): relies on Digest
      # having been required elsewhere; this file only requires "uri".
      def uuid page
        @include_query_string_in_uuid = true if @include_query_string_in_uuid.nil?
        key = page.url.to_s
        key = key.gsub(/\?.*$/,'') unless @include_query_string_in_uuid
        Digest::MD5.hexdigest(key)
      end
    end
  end
end
module Polipus
  module Storage
    # Storage backend that discards every page: nothing is written and
    # lookups never hit. Useful when only the crawl itself matters.
    class DevNull < Base

      def initialize(options = {})
      end

      # Accepts and discards +page+.
      def add page
      end

      # No page is ever stored.
      def exists?(page)
        false
      end

      # Lookups always miss.
      def get page
        nil
      end

      # Nothing to delete; reports false like the original.
      def remove page
        false
      end

      def count
        0
      end

      # Preserves the original quirk of yielding a single nil.
      def each
        yield nil
      end

      def clear
      end
    end
  end
end
require "mongo"
require "zlib"
require "thread"
module Polipus
  module Storage
    # MongoDB-backed page store. Pages are keyed by a uuid derived from
    # the URL (see Base#uuid); bodies are optionally deflate-compressed.
    class MongoStore < Base
      # Fields persisted as BSON::Binary (may contain arbitrary bytes).
      BINARY_FIELDS = %w(body headers data)

      # options:
      #   :mongo         - a Mongo::DB instance
      #   :collection    - collection name
      #   :compress_body - deflate page bodies (default true)
      #   :except        - page attributes to drop before persisting
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)
        @mongo[@collection].ensure_index(:uuid, :unique => true, :drop_dups => true, :background => true)
        # BUGFIX: was `options[:compress_body] ||= true`, which made it
        # impossible to disable compression (false || true == true) and
        # mutated the caller's hash. fetch() honors an explicit false.
        @compress_body = options.fetch(:compress_body, true)
        # Also avoid writing the default back into the caller's hash.
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Upserts +page+ and returns its uuid.
      def add page
        @semaphore.synchronize {
          obj = page.to_hash
          @except.each {|e| obj.delete e.to_s}
          obj['uuid'] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          BINARY_FIELDS.each do |field|
            obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
          end
          @mongo[@collection].update({:uuid => obj['uuid']}, obj, {:upsert => true, :w => 1})
          obj['uuid']
        }
      end

      # True when a document with this page's uuid exists.
      def exists?(page)
        @semaphore.synchronize {
          doc = @mongo[@collection].find({:uuid => uuid(page)}, {:fields => [:_id]}).limit(1).first
          !doc.nil?
        }
      end

      # Returns the stored Page, or nil when absent or undecodable.
      def get page
        @semaphore.synchronize {
          data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
          if data
            return load_page(data)
          end
        }
      end

      # Deletes the page's document, if any.
      def remove page
        @semaphore.synchronize {
          @mongo[@collection].remove({:uuid => uuid(page)})
        }
      end

      def count
        @mongo[@collection].count
      end

      # Yields [uuid, Page] for every stored document (page is nil for
      # documents that fail to decode).
      def each
        @mongo[@collection].find({},:timeout => false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      def clear
        @mongo[@collection].drop
      end

      private
      # Rebuilds a Page from a stored document; returns nil on any
      # decode failure rather than raising.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        begin
          hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
          return Page.from_hash(hash)
        rescue
        end
        nil
      end

    end
  end
end
require "aws/s3"
require "zlib"
require "thread"
require "json"
module Polipus
  module Storage
    # Amazon S3-backed page store. Each page is serialized to JSON,
    # deflate-compressed and stored as one S3 object keyed by the page
    # uuid (see Base#uuid), inside bucket "com.polipus.pages.<name>".
    class S3Store < Base
      # options: :bucket, :access_key_id, :secret_access_key, :except
      # (page attributes to drop before persisting).
      def initialize(options = {})
        @options = options
        @except = @options[:except] ||= []
        @semaphore = Mutex.new

        # NOTE(review): aws/s3 keeps the connection in class-level
        # state, so two S3Stores with different credentials would
        # clobber each other's connection.
        AWS::S3::Base.establish_connection!(
          :access_key_id => @options[:access_key_id],
          :secret_access_key => @options[:secret_access_key]
        )
        @options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
        begin
          @bucket = AWS::S3::Bucket.find(@options[:bucket])
        rescue AWS::S3::NoSuchBucket
          # first run: create the bucket, then re-find to populate @bucket
          create_bucket
        end
      end

      # Stores +page+ and returns its uuid.
      def add page
        @semaphore.synchronize {
          obj = page.to_hash
          @except.each {|e| obj.delete e.to_s}
          puuid = uuid(page)
          obj['uuid'] = puuid
          data = Zlib::Deflate.deflate(obj.to_json)
          AWS::S3::S3Object.store(puuid, data, @bucket.name)
          puuid
        }
      end

      # True when an object with this page's uuid exists in the bucket.
      # (Not synchronized: a single remote round-trip.)
      def exists?(page)
        AWS::S3::S3Object.exists? uuid(page), @bucket.name
      end

      # Returns the stored Page, or nil when absent or undecodable.
      def get page
        @semaphore.synchronize {
          if exists?(page)
            data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
            return load_page(data)
          end
          nil
        }
      end

      # Deletes the page's object if present. Always returns true.
      def remove page
        @semaphore.synchronize {
          if exists?(page)
            AWS::S3::S3Object.delete(uuid(page), @bucket.name)
          end
          true
        }
      end

      # Number of objects in the bucket (per aws/s3's Bucket#size).
      def count
        @bucket.size
      end

      # Deletes the whole bucket (forced) and recreates it empty.
      def clear
        AWS::S3::Bucket.delete(@bucket.name, :force => true)
        create_bucket
      end

      # Yields [key, page] for every stored object, paging through the
      # bucket with :marker; page is nil for undecodable objects.
      def each
        objects = []
        last_key = nil
        begin
          objects = AWS::S3::Bucket.objects(@bucket.name, :marker => last_key)
          break if objects.size == 0
          objects.each do |o|
            page = load_page(o.value)
            yield o.key, page
          end
          last_key = objects.last.key
        end while true
      end

      private
      # Inflate + JSON-parse + rebuild a Page; nil on any failure.
      def load_page(data)
        begin
          payload = Zlib::Inflate.inflate(data)
          hash = JSON.parse(payload)
          return Page.from_hash(hash)
        rescue
        end
        nil
      end

      # Creates the configured bucket and caches the Bucket object.
      def create_bucket
        AWS::S3::Bucket.create(@options[:bucket])
        @bucket = AWS::S3::Bucket.find(@options[:bucket])
      end
    end
  end
end