parallel588_polipus 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,42 @@
1
+ # encoding: UTF-8
2
+ require 'singleton'
3
module Polipus
  # Singleton holding two process-wide flags: whether signal handling is
  # enabled and whether termination has been requested. Callers interact
  # through the class-level helpers, which delegate to the single instance.
  class SignalHandler
    include Singleton

    attr_accessor :terminated, :enabled

    # Both flags start out false.
    def initialize
      @terminated = false
      @enabled = false
    end

    class << self
      # Installs the same trap for INT and TERM, then marks the handler as
      # enabled. When a signal arrives the trap exits the process if the
      # handler has been disabled in the meantime; otherwise it only raises
      # the terminated flag.
      def enable
        [:INT, :TERM].each do |signal_name|
          trap(signal_name) do
            exit unless enabled?
            terminate
          end
        end
        instance.enabled = true
      end

      # Clears the enabled flag (installed traps stay in place).
      def disable
        instance.enabled = false
      end

      # Raises the terminated flag.
      def terminate
        instance.terminated = true
      end

      # @return [Boolean] current value of the terminated flag
      def terminated?
        instance.terminated
      end

      # @return [Boolean] current value of the enabled flag
      def enabled?
        instance.enabled
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'polipus/storage/base'
2
+
3
module Polipus
  # Factory helpers for the storage backends. Each helper requires its
  # backend file on first use and returns a new store instance.
  module Storage
    COLLECTION = 'pages'

    # Builds a MongoStore.
    #
    # @param mongo      database handle; when nil/false a connection to
    #                   localhost:27017, database 'polipus', is opened
    # @param collection collection name handed to the store
    # @param except     field list handed to the store as :except
    # @raise [RuntimeError] when mongo is not a Mongo::DB
    def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
      require 'polipus/storage/mongo_store'
      unless mongo
        connection = Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5)
        mongo = connection.db('polipus')
      end
      raise 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
      store_options = { mongo: mongo, collection: collection, except: except }
      self::MongoStore.new(store_options)
    end

    # Builds a RethinkStore.
    #
    # @param conn   connection; when nil/false one is opened to
    #               localhost:28015, database 'polipus'
    # @param table  table name handed to the store
    # @param except field list handed to the store as :except
    # @raise [RuntimeError] when conn is not a RethinkDB::Connection
    def self.rethink_store(conn = nil, table = COLLECTION, except = [])
      require 'polipus/storage/rethink_store'
      conn = RethinkDB::RQL.new.connect(host: 'localhost', port: 28_015, db: 'polipus') unless conn
      unless conn.is_a?(RethinkDB::Connection)
        raise "First argument must be a RethinkDB::Connection, got `#{conn.class}`"
      end
      store_options = { conn: conn, table: table, except: except }
      self::RethinkStore.new(store_options)
    end

    # Builds a DevNull store.
    def self.dev_null
      require 'polipus/storage/dev_null'
      self::DevNull.new
    end

    # Builds a MemoryStore.
    def self.memory_store
      require 'polipus/storage/memory_store'
      self::MemoryStore.new
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # encoding: UTF-8
2
+ require 'uri'
3
+
4
# Digest::MD5 is used by Base#uuid; require it here so this file does not
# depend on some other file having loaded it first.
require 'digest/md5'

module Polipus
  module Storage
    # Parent class of the storage backends. Provides the page identifier
    # computation shared by all of them.
    class Base
      # Controls whether the query string takes part in the page identifier.
      # Stays nil until assigned or until #uuid runs for the first time, at
      # which point nil is replaced by true.
      attr_accessor :include_query_string_in_uuid

      protected

      # Computes the identifier under which a page is stored.
      #
      # @param page [#url] object whose url responds to #to_s
      # @return [String] hex MD5 digest of the URL; when
      #   include_query_string_in_uuid is false, everything from the first
      #   '?' onward is dropped before hashing
      def uuid(page)
        @include_query_string_in_uuid = true if @include_query_string_in_uuid.nil?
        url_to_hash = page.url.to_s
        url_to_hash = url_to_hash.gsub(/\?.*$/, '') unless @include_query_string_in_uuid
        Digest::MD5.hexdigest(url_to_hash)
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # encoding: UTF-8
2
module Polipus
  module Storage
    # Storage backend that keeps nothing: writes are discarded and every
    # lookup reports the page as absent.
    class DevNull < Base
      # Options are accepted for interface parity and ignored.
      def initialize(_options = {})
      end

      # Discards the page.
      # @return [nil]
      def add(_page)
      end

      # @return [false] nothing is ever stored
      def exists?(_page)
        false
      end

      # @return [nil] nothing is ever stored
      def get(_page)
        nil
      end

      # @return [false] there is never anything to remove
      def remove(_page)
        false
      end

      # @return [Integer] always 0
      def count
        0
      end

      # Iterates over the stored pages. The store is always empty (count is
      # 0), so the block is never invoked. Previously a single nil was
      # yielded, handing consumers a phantom entry.
      # @return [nil]
      def each
      end

      # Nothing to clear.
      # @return [nil]
      def clear
      end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ # encoding: UTF-8
2
+ require 'thread'
3
module Polipus
  module Storage
    # In-process page store backed by a Hash keyed by page uuid. Access to
    # the Hash is serialized through a Mutex.
    class MemoryStore < Base
      # Options are accepted for interface parity and ignored.
      def initialize(_options = {})
        @store = {}
        @semaphore = Mutex.new
      end

      # Stores the page under its uuid, replacing any previous entry.
      # @return [String] the uuid used as key
      def add(page)
        @semaphore.synchronize do
          u = uuid(page)
          @store[u] = page
          u
        end
      end

      # @return [Boolean] whether an entry exists for the page's uuid
      def exists?(page)
        @semaphore.synchronize do
          @store.key?(uuid(page))
        end
      end

      # @return the stored page for this uuid, or nil when absent
      def get(page)
        @semaphore.synchronize do
          @store[uuid(page)]
        end
      end

      # Deletes the entry for the page's uuid.
      # @return the removed page, or nil when absent
      def remove(page)
        @semaphore.synchronize do
          @store.delete(uuid(page))
        end
      end

      # @return [Integer] number of stored pages
      def count
        @semaphore.synchronize do
          @store.count
        end
      end

      # Yields every (uuid, page) pair.
      #
      # Iteration runs over a copy taken under the lock: the live Hash is
      # never traversed while another thread may add to it, and the block
      # is free to call back into this store because the (non-reentrant)
      # Mutex is not held while yielding.
      def each
        snapshot = @semaphore.synchronize { @store.dup }
        snapshot.each do |k, v|
          yield k, v
        end
      end

      # Drops every stored page.
      def clear
        @semaphore.synchronize do
          @store = {}
        end
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # encoding: UTF-8
2
+ require 'mongo'
3
+ require 'zlib'
4
+ require 'thread'
5
+
6
module Polipus
  module Storage
    # Page store backed by a MongoDB collection. Pages are keyed by the
    # 'uuid' field; the body can optionally be deflated before storage.
    class MongoStore < Base
      # Fields wrapped in BSON::Binary on write and stringified on read.
      BINARY_FIELDS = %w(body headers data).freeze

      # @param options [Hash]
      #   :mongo          database handle (required)
      #   :collection     collection name (required)
      #   :compress_body  deflate the body before storing; defaults to true,
      #                   an explicit false is honoured
      #   :except         page hash keys to drop before storing (default [])
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)
        begin
          @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
        rescue StandardError
          # Index creation stays best-effort, but only ordinary errors are
          # swallowed; signals and SystemExit now propagate.
        end

        # `||= true` would turn an explicit false back into true, so only
        # fall back to the default when the option is absent (nil).
        requested = options[:compress_body]
        @compress_body = requested.nil? ? true : requested
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Upserts the page, keyed by its uuid.
      # @return [String] the uuid of the stored page
      def add(page)
        @semaphore.synchronize do
          obj = page.to_hash
          @except.each { |e| obj.delete e.to_s }
          obj['uuid'] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          BINARY_FIELDS.each do |field|
            obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
          end
          @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
          obj['uuid']
        end
      end

      # @return [Boolean] whether a document with the page's uuid exists
      def exists?(page)
        @semaphore.synchronize do
          doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
          !doc.nil?
        end
      end

      # @return the page rebuilt from the stored document, or nil when no
      #   document matches
      def get(page)
        @semaphore.synchronize do
          data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
          load_page(data) if data
        end
      end

      # Removes the document matching the page's uuid.
      def remove(page)
        @semaphore.synchronize do
          @mongo[@collection].remove(uuid: uuid(page))
        end
      end

      # @return number of documents in the collection
      def count
        @mongo[@collection].count
      end

      # Yields (uuid, page) for every stored document.
      def each
        @mongo[@collection].find({}, timeout: false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      # Drops the whole collection.
      def clear
        @mongo[@collection].drop
      end

      private

      # Rebuilds a page from a stored document: binary fields are turned
      # back into strings, the body is inflated when compression is on and
      # the body is non-empty, and a missing fetched_at is taken from the
      # generation time of the document's _id.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at = hash['_id'].generation_time.to_i if page.fetched_at.nil?
        page
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # encoding: UTF-8
2
+ require 'rethinkdb'
3
+ require 'thread'
4
+ require 'zlib'
5
+
6
module Polipus
  module Storage
    # Page store backed by a RethinkDB table. The page uuid is used as the
    # document id; the body can optionally be deflated before storage.
    class RethinkStore < Base
      # Fields wrapped with r.binary on write and stringified on read.
      BINARY_FIELDS = %w(body headers data).freeze

      # @param options [Hash]
      #   :conn           RethinkDB connection (required)
      #   :table          table name (required)
      #   :compress_body  deflate the body before storing; defaults to true,
      #                   an explicit false is honoured
      #   :except         page hash keys to drop before storing (default [])
      def initialize(options = {})
        @r = RethinkDB::RQL.new
        @rethink = options[:conn]
        @table = options[:table]

        unless @r.table_list.run(@rethink).include?(@table)
          @r.table_create(@table).run(@rethink)
          # A query only executes once #run is called; without it the
          # index was never actually created.
          @r.table(@table).index_create('created_at').run(@rethink)
        end

        # `||= true` would turn an explicit false back into true, so only
        # fall back to the default when the option is absent (nil).
        requested = options[:compress_body]
        @compress_body = requested.nil? ? true : requested
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Stores the page under its uuid, replacing any existing document
      # with the same id so that re-fetched pages are updated.
      # @return [String] the uuid used as document id
      def add(page)
        @semaphore.synchronize do
          obj = page.to_hash
          @except.each { |e| obj.delete e.to_s }
          id = uuid(page)
          obj['id'] = id
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          obj['created_at'] ||= Time.now.to_i
          BINARY_FIELDS.each do |field|
            # Use some marshalling?
            obj[field] = @r.binary(obj[field]) unless obj[field].nil?
          end

          # The default conflict mode ('error') leaves an existing document
          # untouched and only reports the clash in the result hash.
          @r.table(@table).insert(obj, conflict: 'replace').run(@rethink, durability: 'soft')
          id
        end
      end

      # @return [Boolean] whether a document with the page's uuid exists
      def exists?(page)
        @semaphore.synchronize do
          doc = @r.table(@table).get(uuid(page)).run(@rethink)
          !doc.nil?
        end
      end

      # @return the page rebuilt from the stored document, or nil when no
      #   document matches
      def get(page)
        @semaphore.synchronize do
          data = @r.table(@table).get(uuid(page)).run(@rethink)
          load_page(data) if data
        end
      end

      # Deletes the document matching the page's uuid.
      def remove(page)
        @semaphore.synchronize do
          @r.table(@table).get(uuid(page)).delete.run(@rethink)
        end
      end

      # @return number of documents in the table
      def count
        @r.table(@table).count.run(@rethink)
      end

      # Yields (id, page) for every stored document.
      def each
        @r.table(@table).run(@rethink).each do |doc|
          # NOTE(review): the driver is expected to return string keys, so
          # 'id' is read first; the symbol lookup is kept as a fallback.
          id = doc['id'] || doc[:id]
          page = load_page(doc)
          yield id, page
        end
      end

      # Deletes every document in the table (the table itself is kept).
      def clear
        @r.table(@table).delete.run(@rethink)
      end

      private

      # Rebuilds a page from a stored document: binary fields are turned
      # back into strings, the body is inflated when compression is on and
      # the body is non-empty, and a missing fetched_at falls back to the
      # document's created_at.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at ||= hash['created_at']
        page
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # encoding: UTF-8
2
module Polipus
  # Factory helpers for the URL tracker backends. Each helper requires its
  # backend file on first use, fills in missing options and returns a new
  # tracker.
  module UrlTracker
    # Builds a Bloomfilter tracker.
    #
    # @param options [Hash] missing (nil/false) entries default to
    #   size: 1_000_000, error_rate: 0.01, key_name: 'polipus-bloomfilter',
    #   redis: Redis.current, driver: 'lua'
    # @return [Polipus::UrlTracker::Bloomfilter]
    def self.bloomfilter(options = {})
      require 'polipus/url_tracker/bloomfilter'
      # Work on a copy: the caller's hash is left untouched and a frozen
      # hash is accepted.
      options = options.dup
      options[:size]       ||= 1_000_000
      options[:error_rate] ||= 0.01
      options[:key_name]   ||= 'polipus-bloomfilter'
      options[:redis]      ||= Redis.current
      options[:driver]     ||= 'lua'
      self::Bloomfilter.new options
    end

    # Builds a RedisSet tracker.
    #
    # @param options [Hash] missing (nil/false) entries default to
    #   redis: Redis.current, key_name: 'polipus-set'
    # @return [Polipus::UrlTracker::RedisSet]
    def self.redis_set(options = {})
      require 'polipus/url_tracker/redis_set'
      # Work on a copy: the caller's hash is left untouched and a frozen
      # hash is accepted.
      options = options.dup
      options[:redis]    ||= Redis.current
      options[:key_name] ||= 'polipus-set'
      self::RedisSet.new options
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # encoding: UTF-8
2
+ require 'redis-bloomfilter'
3
module Polipus
  module UrlTracker
    # URL tracker that delegates every operation to a Redis::Bloomfilter
    # built from the given options.
    class Bloomfilter
      # @param options [Hash] passed unchanged to Redis::Bloomfilter.new
      def initialize(options = {})
        @bf = Redis::Bloomfilter.new(options)
      end

      # @return the filter's answer to include?(url)
      def visited?(url)
        @bf.include?(url)
      end

      # Records the URL by inserting it into the filter.
      def visit(url)
        @bf.insert(url)
      end

      # Asks the filter to remove the URL.
      def remove(url)
        @bf.remove(url)
      end

      # Asks the filter to clear itself.
      def clear
        @bf.clear
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # encoding: UTF-8
2
module Polipus
  module UrlTracker
    # URL tracker that keeps visited URLs as members of a Redis set.
    class RedisSet
      # @param options [Hash]
      #   :redis     Redis client; falls back to Redis.current
      #   :key_name  name of the set key; falls back to 'polipus-set', the
      #              same default the UrlTracker.redis_set factory applies
      def initialize(options = {})
        @redis = options[:redis] || Redis.current
        @set_name = options[:key_name] || 'polipus-set'
      end

      # @return the client's SISMEMBER answer for the URL
      def visited?(url)
        @redis.sismember(@set_name, url)
      end

      # Adds the URL to the set.
      def visit(url)
        @redis.sadd(@set_name, url)
      end

      # Removes the URL from the set.
      #
      # SREM takes only the key and the member(s). The extra `0` that used
      # to be passed here (an LREM-style count) either raised ArgumentError
      # or was treated as a second member to remove, depending on the
      # client's srem signature.
      def remove(url)
        @redis.srem(@set_name, url)
      end

      # Deletes the whole set key.
      def clear
        @redis.del @set_name
      end
    end
  end
end