polipus 0.0.1

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (55)
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
data/lib/polipus/plugins/sample.rb
@@ -0,0 +1,17 @@
+ module Polipus
+   module Plugin
+     class Sample
+
+       def initialize(options = {})
+
+       end
+
+       def on_initialize crawler
+         Proc.new {
+           @options.each { |k,v| @logger.info {"Polipus configuration: #{k.to_s} => #{v}"} }
+         }
+       end
+
+     end
+   end
+ end
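
A note on the plugin contract shown above: on_initialize returns a Proc that the crawler evaluates in its own scope, which is why Sample can read @options and @logger even though it never sets them itself. A minimal wiring sketch, assuming the register helper in data/lib/polipus/plugin.rb (its hunk is not shown in this section) takes a plugin class plus an options hash:

    require "polipus"

    # Assumed API from plugin.rb (+13 above): register a plugin class.
    Polipus::Plugin.register Polipus::Plugin::Sample

    Polipus.crawler("sample", "http://example.com/") do |crawler|
      # When the crawler boots, the Proc returned by Sample#on_initialize runs
      # in the crawler's scope and logs every configuration key/value pair.
    end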
data/lib/polipus/plugins/sleeper.rb
@@ -0,0 +1,22 @@
+ module Polipus
+   module Plugin
+     class Sleeper
+
+       def initialize(options = {})
+         @delay = options[:delay] ||= 1
+       end
+
+       def on_initialize crawler
+         crawler.logger.info {"Sleeper plugin loaded, sleeping for #{@delay}s after each request"}
+         Proc.new {
+           # Force the number of worker threads to 1
+           @options[:workers] = 1
+         }
+       end
+
+       def on_message_processed crawler
+         sleep @delay
+       end
+     end
+   end
+ end
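
The Proc returned by on_initialize runs in the crawler's scope and forces @options[:workers] to 1, so the sleep in on_message_processed throttles the whole crawl rather than a single worker. A rough sketch under the same assumption about Plugin.register as above:

    # Crawl politely: one request roughly every 2 seconds.
    Polipus::Plugin.register Polipus::Plugin::Sleeper, :delay => 2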
data/lib/polipus/queue_overflow.rb
@@ -0,0 +1,24 @@
+ require "polipus/queue_overflow/manager"
+ module Polipus
+   module QueueOverflow
+     def self.mongo_queue(mongo_db, queue_name, options = {})
+       require "polipus/queue_overflow/mongo_queue"
+       mongo_db ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
+       raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
+       self::MongoQueue.new mongo_db, queue_name, options
+     end
+
+     def self.mongo_queue_capped(mongo_db, queue_name, options = {})
+       require "polipus/queue_overflow/mongo_queue_capped"
+       mongo_db ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
+       raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
+       options[:max] = 1_000_000 if options[:max].nil?
+       self::MongoQueueCapped.new mongo_db, queue_name, options
+     end
+
+     def self.dev_null_queue(options = {})
+       require "polipus/queue_overflow/dev_null_queue"
+       self::DevNullQueue.new
+     end
+   end
+ end
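
These three factory methods are the intended entry points for overflow queues. A sketch, assuming a MongoDB on localhost and the legacy pre-2.0 mongo driver these helpers are written against (Mongo::Connection, Mongo::DB); passing nil as the first argument makes the helper open its own pooled localhost connection:

    require "mongo"
    require "polipus/queue_overflow"

    db = Mongo::Connection.new("localhost", 27017).db("polipus")

    queue  = Polipus::QueueOverflow.mongo_queue(db, "my_crawl")
    capped = Polipus::QueueOverflow.mongo_queue_capped(db, "my_crawl", :max => 500_000)
    null   = Polipus::QueueOverflow.dev_null_queue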
data/lib/polipus/queue_overflow/base.rb
@@ -0,0 +1,6 @@
+ module Polipus
+   module QueueOverflow
+     class Base
+     end
+   end
+ end
data/lib/polipus/queue_overflow/dev_null_queue.rb
@@ -0,0 +1,33 @@
+ require "thread"
+ module Polipus
+   module QueueOverflow
+     class DevNullQueue
+       def initialize
+       end
+
+       def length
+         0
+       end
+
+       def empty?
+         true
+       end
+
+       def clear
+       end
+
+       def push data
+       end
+
+       def pop(_ = false)
+         nil
+       end
+
+       alias :size :length
+       alias :dec :pop
+       alias :shift :pop
+       alias :enc :push
+       alias :<< :push
+     end
+   end
+ end
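
DevNullQueue is an overflow backend that discards everything, useful when you want overflow rotation effectively disabled without changing the calling code:

    q = Polipus::QueueOverflow.dev_null_queue
    q << "payload"   # silently dropped
    q.size           # => 0
    q.pop            # => nil
    q.empty?         # => true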
data/lib/polipus/queue_overflow/manager.rb
@@ -0,0 +1,50 @@
+ module Polipus
+   module QueueOverflow
+     class Manager
+       attr_accessor :url_filter
+       def initialize(polipus, main_q, item_limit)
+         @polipus = polipus
+         @main_q = main_q
+         @adapter = @polipus.queue_overflow_adapter
+         @item_limit = item_limit
+       end
+
+       def url_filter &block
+         @url_filter = block
+       end
+
+       def perform
+         removed = 0
+         restored = 0
+
+         if @main_q.size > @item_limit
+           removed = rotate @main_q, @adapter, (@main_q.size - @item_limit)
+         elsif @main_q.size < @item_limit && !@adapter.empty?
+           restored = rotate @adapter, @main_q, (@item_limit - @main_q.size)
+         end
+         [removed, restored]
+       end
+
+       private
+       def rotate source, dest, items
+         performed = 0
+         1.upto(items){|i|
+           message = source.pop(true)
+           if message
+             page = Page.from_json message
+             unless @polipus.storage.exists?(page)
+               allowed = !@url_filter.nil? ? @url_filter.call(page) : true
+               if allowed
+                 dest << message
+                 performed += 1
+               end
+             end
+           end
+           source.commit if source.respond_to? :commit
+           break if !message || source.empty?
+         }
+         performed
+       end
+     end
+   end
+ end
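
perform rotates in both directions: when the main queue exceeds the limit, the surplus spills into the overflow adapter, and when it drops below the limit while the adapter has messages, they flow back. A hedged sketch of attaching a URL filter (crawler here stands in for a Polipus instance exposing queue_overflow_adapter; in practice Polipus builds the manager itself from its :queue_items_limit option):

    manager = Polipus::QueueOverflow::Manager.new(crawler, main_queue, 10_000)

    # Messages whose page URL matches are dropped during rotation;
    # returning false from the block discards the message.
    manager.url_filter do |page|
      page.url.to_s !~ /(login|logout)/
    end

    removed, restored = manager.perform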
data/lib/polipus/queue_overflow/mongo_queue.rb
@@ -0,0 +1,61 @@
+ require "thread"
+ module Polipus
+   module QueueOverflow
+     class MongoQueue
+       def initialize(mongo_db, queue_name, options = {})
+         @mongo_db = mongo_db
+         @collection_name = "polipus_q_overflow_#{queue_name}"
+         @semaphore = Mutex.new
+         @options = options
+         @options[:ensure_uniq] ||= false
+         if @options[:ensure_uniq]
+           ensure_index
+         end
+       end
+
+       def length
+         @mongo_db[@collection_name].count
+       end
+
+       def empty?
+         !(length > 0)
+       end
+
+       def clear
+         @mongo_db[@collection_name].drop
+         if @options[:ensure_uniq]
+           ensure_index
+         end
+       end
+
+       def push data
+         unless @options[:ensure_uniq]
+           @mongo_db[@collection_name].insert({:payload => data})
+         else
+           @mongo_db[@collection_name].update({:payload => data}, {:payload => data}, {:upsert => 1, :w => 1})
+         end
+         true
+       end
+
+       def pop(_ = false)
+         @semaphore.synchronize {
+           doc = @mongo_db[@collection_name].find({}, :sort => {:_id => 1}).limit(1).first
+           return nil if doc.nil?
+           @mongo_db[@collection_name].remove(:_id => doc['_id'])
+           doc && doc['payload'] ? doc['payload'] : nil
+         }
+       end
+
+       alias :size :length
+       alias :dec :pop
+       alias :shift :pop
+       alias :enc :push
+       alias :<< :push
+
+       protected
+       def ensure_index
+         @mongo_db[@collection_name].ensure_index({:payload => 1},{:background => 1, :unique => 1, :drop_dups => 1})
+       end
+     end
+   end
+ end
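
With :ensure_uniq the queue upserts on the payload, so duplicate messages collapse into a single document; pops are FIFO by _id and serialized through the Mutex, although the find-then-remove pair is not atomic across processes. A sketch, reusing the db handle from the earlier factory example:

    q = Polipus::QueueOverflow::MongoQueue.new(db, "my_crawl", :ensure_uniq => true)
    q << '{"url":"http://example.com/"}'
    q << '{"url":"http://example.com/"}'   # upserted: still one document
    q.size   # => 1
    q.pop    # => '{"url":"http://example.com/"}'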
data/lib/polipus/queue_overflow/mongo_queue_capped.rb
@@ -0,0 +1,28 @@
+ require "polipus/queue_overflow/mongo_queue"
+ module Polipus
+   module QueueOverflow
+     class MongoQueueCapped < MongoQueue
+       def initialize(mongo_db, queue_name, options = {})
+         super
+         @max = @options[:max]
+       end
+
+       def push data
+         super
+         @semaphore.synchronize {
+           s = size
+           if s > @max
+             docs = @mongo_db[@collection_name].find({},{:sort => {:_id => 1}, :fields => [:_id]}).limit(s-@max).map { |e| e['_id'] }
+             @mongo_db[@collection_name].remove({:_id => {'$in' => docs}, '$isolated' => 1})
+           end
+         }
+       end
+
+       alias :size :length
+       alias :dec :pop
+       alias :shift :pop
+       alias :enc :push
+       alias :<< :push
+     end
+   end
+ end
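
The cap is enforced after each push by trimming the oldest documents, so the collection can momentarily exceed :max between the insert and the trim. Continuing the sketch:

    q = Polipus::QueueOverflow.mongo_queue_capped(db, "my_crawl", :max => 3)
    5.times { |i| q << i.to_s }
    q.size   # => 3; the two oldest payloads were removed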
data/lib/polipus/storage.rb
@@ -0,0 +1,31 @@
+ require "polipus/storage/base"
+ module Polipus
+   module Storage
+     def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
+       require 'polipus/storage/mongo_store'
+       mongo ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
+       raise "First argument must be an instance of Mongo::DB" unless mongo.is_a?(Mongo::DB)
+       self::MongoStore.new(:mongo => mongo, :collection => collection_name, :except => except)
+     end
+
+     def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
+       require 'polipus/storage/s3_store'
+
+       if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
+         raise "You have to specify AWS credentials: :access_key_id and :secret_access_key"
+       end
+
+       self::S3Store.new(
+         :bucket => bucket_name,
+         :access_key_id => aws_credential[:access_key_id],
+         :secret_access_key => aws_credential[:secret_access_key],
+         :except => except
+       )
+     end
+
+     def self.dev_null
+       require 'polipus/storage/dev_null'
+       self::DevNull.new
+     end
+   end
+ end
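
A sketch of the three storage factories; except names page fields to drop before persisting, and the credential values below are placeholders pulled from the environment:

    store = Polipus::Storage.mongo_store(db, "pages", ["body"])
    s3    = Polipus::Storage.s3_store("my-crawl",
              :access_key_id     => ENV["AWS_ACCESS_KEY_ID"],
              :secret_access_key => ENV["AWS_SECRET_ACCESS_KEY"])
    null  = Polipus::Storage.dev_null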
data/lib/polipus/storage/base.rb
@@ -0,0 +1,17 @@
+ require "uri"
+
+ module Polipus
+   module Storage
+     class Base
+       attr_accessor :include_query_string_in_uuid
+       protected
+       def uuid page
+         if @include_query_string_in_uuid.nil?
+           @include_query_string_in_uuid = true
+         end
+         url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
+         Digest::MD5.hexdigest(url_to_hash)
+       end
+     end
+   end
+ end
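
The document key is just an MD5 of the page URL; include_query_string_in_uuid (default true) controls whether the query string participates, so turning it off collapses all query-string variants of a URL into one stored document. (Digest::MD5 is assumed to be loaded elsewhere; base.rb itself only requires "uri".) For illustration:

    require "digest/md5"

    Digest::MD5.hexdigest("http://example.com/a?b=1")  # key when the flag is true
    Digest::MD5.hexdigest("http://example.com/a")      # key when the flag is false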
data/lib/polipus/storage/dev_null.rb
@@ -0,0 +1,35 @@
+ module Polipus
+   module Storage
+     class DevNull < Base
+
+       def initialize(options = {})
+       end
+
+       def add page
+       end
+
+       def exists?(page)
+         false
+       end
+
+       def get page
+         nil
+       end
+
+       def remove page
+         false
+       end
+
+       def count
+         0
+       end
+
+       def each
+         yield nil
+       end
+
+       def clear
+       end
+     end
+   end
+ end
data/lib/polipus/storage/mongo_store.rb
@@ -0,0 +1,86 @@
+ require "mongo"
+ require "zlib"
+ require "thread"
+ module Polipus
+   module Storage
+     class MongoStore < Base
+       BINARY_FIELDS = %w(body headers data)
+       def initialize(options = {})
+         @mongo = options[:mongo]
+         @collection = options[:collection]
+         @mongo.create_collection(@collection)
+         @mongo[@collection].ensure_index(:uuid, :unique => true, :drop_dups => true, :background => true)
+         @compress_body = options[:compress_body] ||= true
+         @except = options[:except] ||= []
+         @semaphore = Mutex.new
+       end
+
+       def add page
+         @semaphore.synchronize {
+           obj = page.to_hash
+           @except.each {|e| obj.delete e.to_s}
+           obj['uuid'] = uuid(page)
+           obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
+           BINARY_FIELDS.each do |field|
+             obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
+           end
+           @mongo[@collection].update({:uuid => obj['uuid']}, obj, {:upsert => true, :w => 1})
+           obj['uuid']
+         }
+       end
+
+       def exists?(page)
+         @semaphore.synchronize {
+           doc = @mongo[@collection].find({:uuid => uuid(page)}, {:fields => [:_id]}).limit(1).first
+           !doc.nil?
+         }
+       end
+
+       def get page
+         @semaphore.synchronize {
+           data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
+           if data
+             return load_page(data)
+           end
+         }
+       end
+
+       def remove page
+         @semaphore.synchronize {
+           @mongo[@collection].remove({:uuid => uuid(page)})
+         }
+       end
+
+       def count
+         @mongo[@collection].count
+       end
+
+       def each
+         @mongo[@collection].find({}, :timeout => false) do |cursor|
+           cursor.each do |doc|
+             page = load_page(doc)
+             yield doc['uuid'], page
+           end
+         end
+       end
+
+       def clear
+         @mongo[@collection].drop
+       end
+
+       private
+       def load_page(hash)
+         BINARY_FIELDS.each do |field|
+           hash[field] = hash[field].to_s
+         end
+         begin
+           hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
+           return Page.from_hash(hash)
+         rescue
+         end
+         nil
+       end
+
+     end
+   end
+ end
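
Round-trip sketch: bodies are deflated and all BINARY_FIELDS are wrapped in BSON::Binary, keyed by the MD5 uuid of the URL. Note that :compress_body => false is silently overridden (||= treats false as unset), so compression is effectively always on. Page.new's signature here is assumed from data/lib/polipus/page.rb, whose hunk is not shown in this section:

    store = Polipus::Storage.mongo_store(db, "pages")
    page  = Polipus::Page.new("http://example.com/", :code => 200, :body => "<html></html>")

    store.add(page)         # => MD5 uuid string
    store.exists?(page)     # => true
    store.get(page).body    # => "<html></html>"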
data/lib/polipus/storage/s3_store.rb
@@ -0,0 +1,100 @@
+ require "aws/s3"
+ require "zlib"
+ require "thread"
+ require "json"
+ module Polipus
+   module Storage
+     class S3Store < Base
+       def initialize(options = {})
+         @options = options
+         @except = @options[:except] ||= []
+         @semaphore = Mutex.new
+
+         AWS::S3::Base.establish_connection!(
+           :access_key_id => @options[:access_key_id],
+           :secret_access_key => @options[:secret_access_key]
+         )
+         @options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
+         begin
+           @bucket = AWS::S3::Bucket.find(@options[:bucket])
+         rescue AWS::S3::NoSuchBucket
+           create_bucket
+         end
+       end
+
+       def add page
+         @semaphore.synchronize {
+           obj = page.to_hash
+           @except.each {|e| obj.delete e.to_s}
+           puuid = uuid(page)
+           obj['uuid'] = puuid
+           data = Zlib::Deflate.deflate(obj.to_json)
+           AWS::S3::S3Object.store(puuid, data, @bucket.name)
+           puuid
+         }
+       end
+
+       def exists?(page)
+         AWS::S3::S3Object.exists? uuid(page), @bucket.name
+       end
+
+       def get page
+         @semaphore.synchronize {
+           if exists?(page)
+             data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
+             return load_page(data)
+           end
+           nil
+         }
+       end
+
+       def remove page
+         @semaphore.synchronize {
+           if exists?(page)
+             AWS::S3::S3Object.delete(uuid(page), @bucket.name)
+           end
+           true
+         }
+       end
+
+       def count
+         @bucket.size
+       end
+
+       def clear
+         AWS::S3::Bucket.delete(@bucket.name, :force => true)
+         create_bucket
+       end
+
+       def each
+         objects = []
+         last_key = nil
+         begin
+           objects = AWS::S3::Bucket.objects(@bucket.name, :marker => last_key)
+           break if objects.size == 0
+           objects.each do |o|
+             page = load_page(o.value)
+             yield o.key, page
+           end
+           last_key = objects.last.key
+         end while true
+       end
+
+       private
+       def load_page(data)
+         begin
+           payload = Zlib::Inflate.inflate(data)
+           hash = JSON.parse(payload)
+           return Page.from_hash(hash)
+         rescue
+         end
+         nil
+       end
+
+       def create_bucket
+         AWS::S3::Bucket.create(@options[:bucket])
+         @bucket = AWS::S3::Bucket.find(@options[:bucket])
+       end
+     end
+   end
+ end
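
Usage sketch against the old aws-s3 gem this store is built on; the bucket name is prefixed with "com.polipus.pages." and each walks the bucket in batches via the :marker listing option. The credentials are placeholders and page reuses the object from the MongoStore sketch above:

    store = Polipus::Storage.s3_store("my-crawl",
              :access_key_id     => ENV["AWS_ACCESS_KEY_ID"],
              :secret_access_key => ENV["AWS_SECRET_ACCESS_KEY"])

    store.add(page)                        # => uuid, stored in com.polipus.pages.my-crawl
    store.each { |key, page| puts key }    # lists the whole bucket, batch by batch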