polipus 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. checksums.yaml +15 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +55 -0
  8. data/README.rdoc +3 -0
  9. data/Rakefile +9 -0
  10. data/examples/basic.rb +58 -0
  11. data/examples/survival.rb +9 -0
  12. data/lib/polipus.rb +451 -0
  13. data/lib/polipus/http.rb +195 -0
  14. data/lib/polipus/page.rb +219 -0
  15. data/lib/polipus/plugin.rb +13 -0
  16. data/lib/polipus/plugins/cleaner.rb +25 -0
  17. data/lib/polipus/plugins/sample.rb +17 -0
  18. data/lib/polipus/plugins/sleeper.rb +22 -0
  19. data/lib/polipus/queue_overflow.rb +24 -0
  20. data/lib/polipus/queue_overflow/base.rb +6 -0
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +33 -0
  22. data/lib/polipus/queue_overflow/manager.rb +50 -0
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +61 -0
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +28 -0
  25. data/lib/polipus/storage.rb +31 -0
  26. data/lib/polipus/storage/base.rb +17 -0
  27. data/lib/polipus/storage/dev_null.rb +35 -0
  28. data/lib/polipus/storage/mongo_store.rb +86 -0
  29. data/lib/polipus/storage/s3_store.rb +100 -0
  30. data/lib/polipus/url_tracker.rb +20 -0
  31. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  32. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  33. data/lib/polipus/version.rb +4 -0
  34. data/polipus.gemspec +39 -0
  35. data/spec/cassettes/08b228db424a926e1ed6ab63b38d847e.yml +166 -0
  36. data/spec/cassettes/20aa41f181b49f00078c3ca30bad5afe.yml +166 -0
  37. data/spec/cassettes/4640919145753505af2d0f8423de37f3.yml +270 -0
  38. data/spec/cassettes/66aae15a03f4aab8efd15e40d2d7882a.yml +194 -0
  39. data/spec/cassettes/76b7c197c95a5bf9b1e882c567192d72.yml +183 -0
  40. data/spec/cassettes/9b1d523b7f5db7214f8a8bd9272cccba.yml +221 -0
  41. data/spec/cassettes/ab333f89535a2efb284913fede6aa7c7.yml +221 -0
  42. data/spec/cassettes/ae5d7cffde3f53122cdf79f3d1367e8e.yml +221 -0
  43. data/spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml +695 -0
  44. data/spec/cassettes/http_test.yml +1418 -0
  45. data/spec/cassettes/http_test_redirect.yml +71 -0
  46. data/spec/clear.rb +11 -0
  47. data/spec/http_spec.rb +31 -0
  48. data/spec/page_spec.rb +22 -0
  49. data/spec/queue_overflow_manager_spec.rb +89 -0
  50. data/spec/queue_overflow_spec.rb +71 -0
  51. data/spec/spec_helper.rb +34 -0
  52. data/spec/storage_mongo_spec.rb +102 -0
  53. data/spec/storage_s3_spec.rb +115 -0
  54. data/spec/url_tracker_spec.rb +28 -0
  55. metadata +313 -0
module Polipus
  module Plugin
    # Sample plugin demonstrating the plugin contract: the object returned by
    # #on_initialize is a Proc handed back to the crawler, which evaluates it
    # in its own context (presumably via instance_eval, so @options and
    # @logger below belong to the CRAWLER — TODO confirm against polipus.rb).
    class Sample
      # Options are accepted for interface symmetry but unused by this sample.
      def initialize(options = {})
      end

      # Hook fired when the crawler is initialized; the returned Proc logs
      # every configuration option of the crawler.
      def on_initialize(crawler)
        Proc.new {
          @options.each { |k,v| @logger.info {"Polipus configuration: #{k.to_s} => #{v}"} }
        }
      end
    end
  end
end
module Polipus
  module Plugin
    # Throttling plugin: sleeps for a configurable delay after every processed
    # message and forces the crawler down to a single worker thread.
    class Sleeper

      # @param options [Hash] :delay — seconds to sleep after each request
      #   (defaults to 1). NOTE: `||=` writes the default back into the
      #   caller's hash (intentional or not — it mutates +options+).
      def initialize(options = {})
        @delay = options[:delay] ||= 1
      end

      # Hook fired at crawler initialization. Returns a Proc that the crawler
      # evaluates in its own context (presumably via instance_eval, so
      # @options below is the CRAWLER's options hash, not this plugin's —
      # TODO confirm against polipus.rb).
      def on_initialize crawler
        crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
        Proc.new {
          # Set to 1 the number of threads
          @options[:workers] = 1
        }
      end

      # Hook fired after each processed message; enforces the configured delay.
      def on_message_processed crawler
        sleep @delay
      end
    end
  end
end
require "polipus/queue_overflow/manager"
module Polipus
  # Factory helpers for the overflow-queue back-ends. Each concrete adapter
  # is required lazily, only when the corresponding factory is invoked.
  module QueueOverflow
    # MongoDB-backed overflow queue. Falls back to a local MongoDB
    # connection when +mongo_db+ is nil.
    def self.mongo_queue(mongo_db, queue_name, options = {})
      require "polipus/queue_overflow/mongo_queue"
      mongo_db ||= default_mongo_db
      raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
      self::MongoQueue.new mongo_db, queue_name, options
    end

    # Size-capped variant of the MongoDB queue; the cap defaults to
    # 1,000,000 documents unless the caller sets options[:max].
    def self.mongo_queue_capped(mongo_db, queue_name, options = {})
      require "polipus/queue_overflow/mongo_queue_capped"
      mongo_db ||= default_mongo_db
      raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
      options[:max] = 1_000_000 if options[:max].nil?
      self::MongoQueueCapped.new mongo_db, queue_name, options
    end

    # Queue that silently discards everything pushed into it.
    def self.dev_null_queue(options = {})
      require "polipus/queue_overflow/dev_null_queue"
      self::DevNullQueue.new
    end

    # Shared default connection: localhost MongoDB, 'polipus' database.
    def self.default_mongo_db
      Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
    end
    private_class_method :default_mongo_db
  end
end
module Polipus
  module QueueOverflow
    # Marker base class for overflow-queue back-ends. The concrete queues
    # visible in this gem (MongoQueue, DevNullQueue) do not inherit from it;
    # they share only a duck-typed interface (push/pop/length/empty?/clear).
    class Base
    end
  end
end
require "thread"
module Polipus
  module QueueOverflow
    # A no-op overflow queue: every push is discarded, every pop yields nil,
    # and the queue reports itself as permanently empty.
    class DevNullQueue
      def initialize
      end

      # Always zero — nothing is ever retained.
      def length
        0
      end

      # Always true, by construction.
      def empty?
        true
      end

      # Nothing to clear.
      def clear
      end

      # Discards +data+.
      def push(data)
      end

      # Nothing stored, nothing returned; the blocking flag is ignored.
      def pop(_blocking = false)
        nil
      end

      alias_method :size,  :length
      alias_method :dec,   :pop
      alias_method :shift, :pop
      alias_method :enc,   :push
      alias_method :<<,    :push
    end
  end
end
module Polipus
  module QueueOverflow
    # Moves messages between the main (bounded) queue and the overflow
    # adapter so the main queue stays close to the configured item limit.
    class Manager
      attr_accessor :url_filter

      # @param polipus    crawler owning this manager; must expose
      #                   #queue_overflow_adapter and #storage
      # @param main_q     the primary queue being kept within bounds
      # @param item_limit [Integer] target size for the main queue
      def initialize(polipus, main_q, item_limit)
        @polipus = polipus
        @main_q = main_q
        @adapter = @polipus.queue_overflow_adapter
        @item_limit = item_limit
      end

      # Installs (with a block) or returns (without) the page filter used by
      # #rotate. Bug fix: the previous implementation assigned the block
      # unconditionally, so calling this method WITHOUT a block — i.e. as the
      # plain reader declared by attr_accessor — silently overwrote an
      # installed filter with nil. The block is now stored only when given.
      def url_filter &block
        @url_filter = block if block
        @url_filter
      end

      # Rebalances the two queues:
      # - main queue over the limit  -> spill the excess into the adapter;
      # - main queue under the limit -> restore items from the adapter.
      # @return [Array(Integer, Integer)] [removed, restored] counts
      def perform
        removed = 0
        restored = 0

        if @main_q.size > @item_limit
          removed = rotate @main_q, @adapter,(@main_q.size - @item_limit)
        elsif @main_q.size < @item_limit && !@adapter.empty?
          restored = rotate @adapter, @main_q,(@item_limit - @main_q.size)
        end
        [removed, restored]
      end

      private
      # Moves up to +items+ messages from +source+ to +dest+, skipping pages
      # already present in storage and pages rejected by @url_filter.
      # Stops early when the source runs dry. Returns the number moved.
      def rotate source, dest, items
        performed = 0
        1.upto(items){|i|
          message = source.pop(true)
          if message
            page = Page.from_json message
            unless @polipus.storage.exists?(page)
              # No filter installed means every page is allowed through.
              allowed = !@url_filter.nil? ? @url_filter.call(page) : true
              if allowed
                dest << message
                performed += 1
              end
            end
          end
          # Some adapters batch their writes; flush them each iteration.
          source.commit if source.respond_to? :commit
          break if !message || source.empty?
        }
        performed
      end
    end
  end
end
require "thread"
module Polipus
  module QueueOverflow
    # Overflow queue persisted in a MongoDB collection named
    # "polipus_q_overflow_<queue_name>". FIFO ordering is approximated by
    # sorting on the auto-generated _id.
    class MongoQueue
      # @param mongo_db   [Mongo::DB] database holding the queue
      # @param queue_name [String]    logical queue name
      # @param options    [Hash]      :ensure_uniq — de-duplicate payloads
      #                               through a unique index (default false)
      def initialize(mongo_db, queue_name, options = {})
        @mongo_db = mongo_db
        @collection_name = "polipus_q_overflow_#{queue_name}"
        @semaphore = Mutex.new
        @options = options
        @options[:ensure_uniq] ||= false
        if @options[:ensure_uniq]
          ensure_index
        end
      end

      # Number of queued documents.
      def length
        @mongo_db[@collection_name].count
      end

      def empty?
        length.zero?
      end

      # Drops the backing collection; the unique index is recreated when
      # de-duplication is active.
      def clear
        @mongo_db[@collection_name].drop
        if @options[:ensure_uniq]
          ensure_index
        end
      end

      # Stores +data+. With :ensure_uniq an upsert keeps a single copy per
      # payload; otherwise duplicates are allowed. Always returns true.
      # (Rewritten from `unless ... else` to a positive `if` for clarity.)
      def push data
        if @options[:ensure_uniq]
          @mongo_db[@collection_name].update({:payload => data}, {:payload => data}, {:upsert => 1, :w => 1})
        else
          @mongo_db[@collection_name].insert({:payload => data})
        end
        true
      end

      # Pops the oldest payload, or nil when the queue is empty. The
      # find+remove pair is guarded by a Mutex, so it is atomic only within
      # this process — not across multiple crawlers sharing the collection.
      def pop(_ = false)
        @semaphore.synchronize {
          doc = @mongo_db[@collection_name].find({},:sort => {:_id => 1}).limit(1).first
          return nil if doc.nil?
          @mongo_db[@collection_name].remove(:_id => doc['_id'])
          doc && doc['payload'] ? doc['payload'] : nil
        }
      end

      alias :size :length
      alias :dec :pop
      alias :shift :pop
      alias :enc :push
      alias :<< :push

      protected
      # Unique index on :payload backs the :ensure_uniq mode.
      # NOTE(review): :drop_dups was removed in modern MongoDB servers —
      # confirm the deployed server/driver combination still honours it.
      def ensure_index
        @mongo_db[@collection_name].ensure_index({:payload => 1},{:background => 1, :unique => 1, :drop_dups => 1})
      end
    end
  end
end
require "polipus/queue_overflow/mongo_queue"
module Polipus
  module QueueOverflow
    # MongoQueue variant that trims itself down to :max documents after every
    # push, discarding the OLDEST entries (lowest _id) first.
    class MongoQueueCapped < MongoQueue
      # @param options [Hash] :max — hard size cap; expected to be set by the
      #   caller (QueueOverflow.mongo_queue_capped defaults it to 1,000,000).
      def initialize(mongo_db, queue_name, options = {})
        super
        @max = @options[:max]
      end

      # Pushes via the parent implementation, then deletes the oldest
      # documents so that at most @max remain. The trim runs under the
      # parent's mutex, so it is atomic per-process only.
      def push data
        super
        @semaphore.synchronize {
          s = size
          if s > @max
            # Collect the _ids of the surplus (oldest-first) and remove them
            # in one isolated batch delete.
            docs = @mongo_db[@collection_name].find({},{:sort => {:_id => 1}, :fields => [:_id]}).limit(s-@max).map { |e| e['_id'] }
            @mongo_db[@collection_name].remove({:_id => {'$in' => docs}, '$isolated' => 1})
          end
        }
      end

      # Re-declared so the push-related aliases bind to the overridden #push
      # above — `alias` captures the method current at alias time.
      alias :size :length
      alias :dec :pop
      alias :shift :pop
      alias :enc :push
      alias :<< :push
    end
  end
end
require "polipus/storage/base"
module Polipus
  # Factory helpers for the available page-storage back-ends; each adapter
  # is required lazily, only when its factory is invoked.
  module Storage
    # MongoDB-backed page store; connects to a local MongoDB when +mongo+
    # is nil.
    def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
      require 'polipus/storage/mongo_store'
      mongo ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
      raise "First argument must be an instance of Mongo::DB" unless mongo.is_a?(Mongo::DB)
      self::MongoStore.new(:mongo => mongo, :collection => collection_name, :except => except)
    end

    # Amazon S3-backed page store. Both credential keys are mandatory.
    def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
      require 'polipus/storage/s3_store'

      if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
        # Bug fix: corrected the misspelling "crediantials" in the message.
        raise "You have to specify AWS credentials: :access_key_id and :secret_access_key"
      end

      self::S3Store.new(
        :bucket => bucket_name,
        :access_key_id => aws_credential[:access_key_id],
        :secret_access_key => aws_credential[:secret_access_key],
        :except => except
      )
    end

    # Store that discards everything it is given.
    def self.dev_null
      require 'polipus/storage/dev_null'
      self::DevNull.new
    end
  end
end
require "uri"
# Bug fix: #uuid calls Digest::MD5, but this file never required it and
# silently relied on some other file loading 'digest' first.
require "digest/md5"

module Polipus
  module Storage
    # Common behaviour shared by the storage back-ends: computing a
    # deterministic document id (uuid) from a page's URL.
    class Base
      # When set to false, the URL's query string is stripped before hashing,
      # so /page?a=1 and /page share one uuid. Defaults (lazily) to true.
      attr_accessor :include_query_string_in_uuid
      protected
      # Returns the MD5 hex digest of +page+'s URL (query string included or
      # excluded per the flag above). +page+ only needs to respond to #url.
      def uuid page
        if @include_query_string_in_uuid.nil?
          @include_query_string_in_uuid = true
        end
        url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
        Digest::MD5.hexdigest(url_to_hash)
      end
    end
  end
end
module Polipus
  module Storage
    # Storage back-end that persists nothing: every write is discarded and
    # every lookup reports a miss. Useful when pages should be crawled but
    # never stored.
    class DevNull < Base
      # Options are accepted for interface symmetry and ignored.
      def initialize(options = {})
      end

      # Discards the page.
      def add(page)
      end

      # Nothing is ever stored, so nothing ever exists.
      def exists?(page)
        false
      end

      # Always a miss.
      def get(page)
        nil
      end

      # Nothing to remove.
      def remove(page)
        false
      end

      # The store is permanently empty.
      def count
        0
      end

      # Mirrors the real stores' iteration hook with a single nil yield.
      def each
        yield nil
      end

      # Nothing to clear.
      def clear
      end
    end
  end
end
require "mongo"
require "zlib"
require "thread"
module Polipus
  module Storage
    # Page storage backed by a MongoDB collection. Documents are keyed by a
    # uuid derived from the page URL (see Base#uuid); binary fields are
    # wrapped in BSON::Binary and the body is optionally deflate-compressed.
    class MongoStore < Base
      # Fields stored as BSON::Binary because they may hold non-UTF-8 bytes.
      BINARY_FIELDS = %w(body headers data)

      # @param options [Hash]
      #   :mongo         [Mongo::DB] database handle (required)
      #   :collection    [String]    collection name (required)
      #   :compress_body [Boolean]   deflate page bodies (default true)
      #   :except        [Array]     page-hash keys to omit when persisting
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)
        @mongo[@collection].ensure_index(:uuid, :unique => true, :drop_dups => true, :background => true)
        # Bug fix: the previous `options[:compress_body] ||= true` turned an
        # explicit `false` into `true`, making compression impossible to
        # disable (and mutated the caller's hash). `fetch` honours false.
        @compress_body = options.fetch(:compress_body, true)
        @except = options[:except] ||= []
        @semaphore = Mutex.new
      end

      # Upserts +page+ keyed by its uuid; returns the uuid.
      def add page
        @semaphore.synchronize {
          obj = page.to_hash
          @except.each {|e| obj.delete e.to_s}
          obj['uuid'] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          BINARY_FIELDS.each do |field|
            obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
          end
          @mongo[@collection].update({:uuid => obj['uuid']}, obj, {:upsert => true, :w => 1})
          obj['uuid']
        }
      end

      # True when a document with the page's uuid exists.
      def exists?(page)
        @semaphore.synchronize {
          doc = @mongo[@collection].find({:uuid => uuid(page)}, {:fields => [:_id]}).limit(1).first
          !doc.nil?
        }
      end

      # Loads the stored page, or nil when absent / undecodable.
      def get page
        @semaphore.synchronize {
          data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
          if data
            return load_page(data)
          end
        }
      end

      # Deletes the document for +page+.
      def remove page
        @semaphore.synchronize {
          @mongo[@collection].remove({:uuid => uuid(page)})
        }
      end

      def count
        @mongo[@collection].count
      end

      # Yields [uuid, page] for every stored document. The cursor timeout is
      # disabled so long scans are not cut off by the server.
      def each
        @mongo[@collection].find({},:timeout => false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      # Drops the whole collection (the unique index is NOT recreated here).
      def clear
        @mongo[@collection].drop
      end

      private
      # Rebuilds a Page from a stored document; returns nil when the stored
      # data cannot be decoded. The bare rescue is a deliberate best-effort:
      # Zlib and parsing failures yield nil instead of aborting a scan.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        begin
          hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
          return Page.from_hash(hash)
        rescue
        end
        nil
      end

    end
  end
end
require "aws/s3"
require "zlib"
require "thread"
require "json"
module Polipus
  module Storage
    # Page storage backed by Amazon S3 (via the aws-s3 gem's class-level
    # API). Each page is stored as one deflate-compressed JSON object whose
    # key is the page uuid; the bucket name is namespaced under
    # "com.polipus.pages.".
    class S3Store < Base
      # @param options [Hash] :bucket, :access_key_id, :secret_access_key,
      #   :except (page-hash keys to omit when persisting)
      def initialize(options = {})
        @options = options
        # NOTE: ||= writes the default back into the options hash.
        @except = @options[:except] ||= []
        @semaphore = Mutex.new

        # aws-s3 keeps the connection in class-level (global) state.
        AWS::S3::Base.establish_connection!(
          :access_key_id => @options[:access_key_id],
          :secret_access_key => @options[:secret_access_key]
        )
        @options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
        begin
          @bucket = AWS::S3::Bucket.find(@options[:bucket])
        rescue AWS::S3::NoSuchBucket
          # First run: create the bucket on demand.
          create_bucket
        end
      end

      # Serializes +page+ (minus the :except fields) to deflated JSON and
      # stores it under its uuid. Returns the uuid.
      def add page
        @semaphore.synchronize {
          obj = page.to_hash
          @except.each {|e| obj.delete e.to_s}
          puuid = uuid(page)
          obj['uuid'] = puuid
          data = Zlib::Deflate.deflate(obj.to_json)
          AWS::S3::S3Object.store(puuid, data, @bucket.name)
          puuid
        }
      end

      # True when an object with the page's uuid exists in the bucket.
      # (Not mutex-guarded — read-only HEAD request.)
      def exists?(page)
        AWS::S3::S3Object.exists? uuid(page), @bucket.name
      end

      # Fetches and decodes the stored page, or nil when absent.
      def get page
        @semaphore.synchronize {
          if exists?(page)
            data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
            return load_page(data)
          end
          nil
        }
      end

      # Deletes the page's object if present; always returns true.
      def remove page
        @semaphore.synchronize {
          if exists?(page)
            AWS::S3::S3Object.delete(uuid(page), @bucket.name)
          end
          true
        }
      end

      # Number of objects in the bucket.
      def count
        @bucket.size
      end

      # Force-deletes the whole bucket and recreates it empty.
      def clear
        AWS::S3::Bucket.delete(@bucket.name, :force => true)
        create_bucket
      end

      # Yields [key, page] for every stored object, paginating through the
      # bucket with the :marker parameter (post-condition loop: the body runs
      # at least once, and #break exits when a page of results comes back
      # empty).
      def each
        objects = []
        last_key = nil
        begin
          objects = AWS::S3::Bucket.objects(@bucket.name, :marker => last_key)
          break if objects.size == 0
          objects.each do |o|
            page = load_page(o.value)
            yield o.key, page
          end
          last_key = objects.last.key
        end while true
      end

      private
      # Inflates and JSON-parses stored data back into a Page; returns nil
      # when decoding fails (deliberate best-effort: the bare rescue swallows
      # Zlib / JSON errors).
      def load_page(data)
        begin
          payload = Zlib::Inflate.inflate(data)
          hash = JSON.parse(payload)
          return Page.from_hash(hash)
        rescue
        end
        nil
      end

      # Creates the namespaced bucket and refreshes the cached handle.
      def create_bucket
        AWS::S3::Bucket.create(@options[:bucket])
        @bucket = AWS::S3::Bucket.find(@options[:bucket])
      end
    end
  end
end