parallel588_polipus 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. checksums.yaml +7 -0
  2. data/.document +5 -0
  3. data/.gitignore +53 -0
  4. data/.rspec +2 -0
  5. data/.rubocop.yml +17 -0
  6. data/.rubocop_todo.yml +33 -0
  7. data/.travis.yml +22 -0
  8. data/AUTHORS.md +5 -0
  9. data/CHANGELOG.md +61 -0
  10. data/Gemfile +12 -0
  11. data/LICENSE.txt +20 -0
  12. data/README.md +70 -0
  13. data/Rakefile +8 -0
  14. data/examples/basic.rb +63 -0
  15. data/examples/error_handling.rb +23 -0
  16. data/examples/incremental.rb +63 -0
  17. data/examples/robots_txt_handling.rb +14 -0
  18. data/examples/survival.rb +10 -0
  19. data/lib/polipus.rb +488 -0
  20. data/lib/polipus/http.rb +282 -0
  21. data/lib/polipus/page.rb +256 -0
  22. data/lib/polipus/plugin.rb +14 -0
  23. data/lib/polipus/plugins/cleaner.rb +25 -0
  24. data/lib/polipus/plugins/sample.rb +15 -0
  25. data/lib/polipus/plugins/sleeper.rb +22 -0
  26. data/lib/polipus/queue_overflow.rb +26 -0
  27. data/lib/polipus/queue_overflow/base.rb +7 -0
  28. data/lib/polipus/queue_overflow/dev_null_queue.rb +34 -0
  29. data/lib/polipus/queue_overflow/manager.rb +57 -0
  30. data/lib/polipus/queue_overflow/mongo_queue.rb +60 -0
  31. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +29 -0
  32. data/lib/polipus/queue_overflow/worker.rb +24 -0
  33. data/lib/polipus/robotex.rb +145 -0
  34. data/lib/polipus/signal_handler.rb +42 -0
  35. data/lib/polipus/storage.rb +31 -0
  36. data/lib/polipus/storage/base.rb +20 -0
  37. data/lib/polipus/storage/dev_null.rb +35 -0
  38. data/lib/polipus/storage/memory_store.rb +56 -0
  39. data/lib/polipus/storage/mongo_store.rb +90 -0
  40. data/lib/polipus/storage/rethink_store.rb +90 -0
  41. data/lib/polipus/url_tracker.rb +21 -0
  42. data/lib/polipus/url_tracker/bloomfilter.rb +27 -0
  43. data/lib/polipus/url_tracker/redis_set.rb +27 -0
  44. data/lib/polipus/version.rb +5 -0
  45. data/polipus.gemspec +44 -0
  46. data/spec/cassettes/11c3eb8bf35dfc179dc5ce44f6f5f458.yml +6144 -0
  47. data/spec/cassettes/1f6e1d7743ecaa86594b4e68a6462689.yml +11320 -0
  48. data/spec/cassettes/6adfecdb274dd26ffd3713169583ca91.yml +18236 -0
  49. data/spec/cassettes/978ac0eeb5df63a019b754cc8a965b06.yml +18296 -0
  50. data/spec/cassettes/b389efd1dcb8f09393b5aae1627c2a83.yml +36569 -0
  51. data/spec/cassettes/bc6fb220895689be7eeb05b09969a18d.yml +61 -0
  52. data/spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml +18165 -0
  53. data/spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml +11758 -0
  54. data/spec/cassettes/gzipped_on.yml +147 -0
  55. data/spec/cassettes/http_cookies.yml +133 -0
  56. data/spec/cassettes/http_tconnection_max_hits.yml +4138 -0
  57. data/spec/cassettes/http_test.yml +1418 -0
  58. data/spec/cassettes/http_test_redirect.yml +71 -0
  59. data/spec/clear.rb +12 -0
  60. data/spec/polipus/http_spec.rb +139 -0
  61. data/spec/polipus/page_spec.rb +68 -0
  62. data/spec/polipus/queue_overflow/manager_spec.rb +88 -0
  63. data/spec/polipus/queue_overflow_spec.rb +66 -0
  64. data/spec/polipus/robotex_spec.rb +85 -0
  65. data/spec/polipus/signal_handler_spec.rb +15 -0
  66. data/spec/polipus/storage/memory_store_spec.rb +87 -0
  67. data/spec/polipus/storage/mongo_store_spec.rb +119 -0
  68. data/spec/polipus/storage/rethink_store_spec.rb +117 -0
  69. data/spec/polipus/url_tracker_spec.rb +29 -0
  70. data/spec/polipus_spec.rb +107 -0
  71. data/spec/spec_helper.rb +42 -0
  72. metadata +348 -0
@@ -0,0 +1,42 @@
1
+ # encoding: UTF-8
2
+ require 'singleton'
3
module Polipus
  # Singleton that remembers whether a termination was requested and whether
  # the custom INT/TERM handling is currently switched on.
  #
  # All interaction goes through the class-level helpers, which delegate to
  # the single shared instance.
  class SignalHandler
    include Singleton

    attr_accessor :terminated, :enabled

    # Starts out disabled and with no termination requested.
    def initialize
      self.enabled = false
      self.terminated = false
    end

    class << self
      # Installs handlers for INT and TERM, then marks the handler as enabled.
      #
      # While enabled, a signal only sets the terminated flag; once the
      # handler has been disabled again, the same signal exits the process.
      def enable
        [:INT, :TERM].each do |signal|
          trap(signal) do
            exit unless enabled?
            terminate
          end
        end
        instance.enabled = true
      end

      # Marks the handler as disabled; installed traps will exit the process.
      def disable
        instance.enabled = false
      end

      # Records that termination has been requested.
      def terminate
        instance.terminated = true
      end

      # @return [Boolean] whether termination has been requested
      def terminated?
        instance.terminated
      end

      # @return [Boolean] whether the handler is currently enabled
      def enabled?
        instance.enabled
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
+ require 'polipus/storage/base'
2
+
3
module Polipus
  # Factory helpers for the page storage backends. Each helper requires its
  # backend file only when called, so unused drivers are never loaded.
  module Storage
    # Default collection / table name used by the database-backed stores.
    COLLECTION = 'pages'

    # Builds a MongoStore.
    #
    # @param mongo [Mongo::DB, nil] database handle; when omitted a connection
    #   to localhost:27017, database 'polipus', is opened
    # @param collection [String] collection name
    # @param except [Array] page fields to leave out when storing
    # @raise [RuntimeError] if +mongo+ is not a Mongo::DB
    def self.mongo_store(mongo = nil, collection = COLLECTION, except = [])
      require 'polipus/storage/mongo_store'
      mongo ||= begin
        connection = Mongo::Connection.new('localhost', 27017, pool_size: 15, pool_timeout: 5)
        connection.db('polipus')
      end
      unless mongo.is_a?(Mongo::DB)
        raise 'First argument must be an instance of Mongo::DB'
      end
      Storage::MongoStore.new(mongo: mongo, collection: collection, except: except)
    end

    # Builds a RethinkStore.
    #
    # @param conn [RethinkDB::Connection, nil] connection; when omitted one is
    #   opened to localhost:28015, database 'polipus'
    # @param table [String] table name
    # @param except [Array] page fields to leave out when storing
    # @raise [RuntimeError] if +conn+ is not a RethinkDB::Connection
    def self.rethink_store(conn = nil, table = COLLECTION, except = [])
      require 'polipus/storage/rethink_store'
      conn ||= RethinkDB::RQL.new.connect(host: 'localhost', port: 28015, db: 'polipus')
      unless conn.is_a?(RethinkDB::Connection)
        raise "First argument must be a RethinkDB::Connection, got `#{conn.class}`"
      end
      Storage::RethinkStore.new(conn: conn, table: table, except: except)
    end

    # Builds the no-op store.
    def self.dev_null
      require 'polipus/storage/dev_null'
      Storage::DevNull.new
    end

    # Builds the in-process, hash-backed store.
    def self.memory_store
      require 'polipus/storage/memory_store'
      Storage::MemoryStore.new
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # encoding: UTF-8
2
+ require 'uri'
3
+
4
# Digest::MD5 is used by #uuid below; load it here so this file does not rely
# on some other file having required it first.
require 'digest/md5'

module Polipus
  module Storage
    # Common parent of the storage backends. It only contributes the logic
    # that turns a page into its storage identifier.
    class Base
      # When false, everything from the first '?' onwards is dropped from the
      # URL before hashing. A nil value means "not configured yet" and is
      # replaced by true the first time #uuid runs.
      attr_accessor :include_query_string_in_uuid

      protected

      # Computes the identifier under which +page+ is stored.
      #
      # @param page [#url] object whose +url+ responds to +to_s+
      # @return [String] MD5 hex digest of the (optionally query-less) URL
      def uuid(page)
        @include_query_string_in_uuid = true if @include_query_string_in_uuid.nil?

        url = page.url.to_s
        url = url.gsub(/\?.*$/, '') unless @include_query_string_in_uuid
        Digest::MD5.hexdigest(url)
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
+ # encoding: UTF-8
2
module Polipus
  module Storage
    # Storage backend that keeps nothing: writes are discarded and every
    # lookup reports that the page is absent.
    class DevNull < Base
      # Options are accepted for interface compatibility and ignored.
      def initialize(_options = {}); end

      # Discards the page; returns nil.
      def add(_page); end

      # @return [false] nothing is ever stored
      def exists?(_page)
        false
      end

      # @return [nil] nothing is ever stored
      def get(_page)
        nil
      end

      # @return [false] there is never anything to remove
      def remove(_page)
        false
      end

      # @return [Integer] always zero
      def count
        0
      end

      # Yields exactly once, passing nil.
      # NOTE(review): this runs the block once although #count is 0 - confirm
      # that callers expect the single nil iteration.
      def each
        yield nil
      end

      # Nothing to clear; returns nil.
      def clear; end
    end
  end
end
@@ -0,0 +1,56 @@
1
+ # encoding: UTF-8
2
+ require 'thread'
3
module Polipus
  module Storage
    # Storage backend that keeps pages in a Hash inside the current process,
    # keyed by the uuid computed by Base#uuid. Every access to the Hash is
    # guarded by a single Mutex.
    class MemoryStore < Base
      # Options are accepted for interface compatibility and ignored.
      def initialize(_options = {})
        @store = {}
        @semaphore = Mutex.new
      end

      # Stores +page+, replacing any entry with the same uuid.
      #
      # @return [String] the uuid the page was stored under
      def add(page)
        @semaphore.synchronize do
          key = uuid(page)
          @store[key] = page
          key
        end
      end

      # @return [Boolean] whether an entry exists for the page's uuid
      def exists?(page)
        @semaphore.synchronize { @store.key?(uuid(page)) }
      end

      # @return [Object, nil] the stored page, or nil when absent
      def get(page)
        @semaphore.synchronize { @store[uuid(page)] }
      end

      # Deletes the entry for the page's uuid.
      #
      # @return [Object, nil] the removed page, or nil when absent
      def remove(page)
        @semaphore.synchronize { @store.delete(uuid(page)) }
      end

      # @return [Integer] number of stored pages
      def count
        @semaphore.synchronize { @store.count }
      end

      # Yields every (uuid, page) pair.
      #
      # The pairs are copied while holding the lock and iterated afterwards:
      # a concurrent #add can no longer modify the Hash in the middle of the
      # iteration, and the block itself may call back into the store without
      # deadlocking on the non-reentrant Mutex.
      #
      # @yieldparam key [String] uuid of the page
      # @yieldparam page [Object] the stored page
      # @return [Hash] the snapshot that was iterated
      def each
        snapshot = @semaphore.synchronize { @store.dup }
        snapshot.each { |key, page| yield key, page }
      end

      # Drops every stored page.
      def clear
        @semaphore.synchronize { @store = {} }
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # encoding: UTF-8
2
+ require 'mongo'
3
+ require 'zlib'
4
+ require 'thread'
5
+
6
module Polipus
  module Storage
    # Storage backend that persists pages in a MongoDB collection, one
    # document per page, identified by the 'uuid' field.
    class MongoStore < Base
      # Fields wrapped in BSON::Binary on write and converted back with
      # +to_s+ on read.
      BINARY_FIELDS = %w(body headers data).freeze

      # @param options [Hash]
      # @option options [Object] :mongo database handle (required)
      # @option options [String] :collection collection name (required)
      # @option options [Boolean] :compress_body deflate the body before
      #   storing it; defaults to true when the key is missing or nil
      # @option options [Array] :except page fields to leave out when storing
      def initialize(options = {})
        @mongo = options[:mongo]
        @collection = options[:collection]
        @mongo.create_collection(@collection)
        begin
          @mongo[@collection].ensure_index(:uuid, unique: true, dropDups: true, background: true)
        rescue StandardError
          # Index creation stays best effort, but only ordinary errors are
          # ignored: signals, SystemExit and the like must keep propagating.
        end

        # `options[:compress_body] ||= true` could never yield false, so
        # compression could not be turned off; only nil selects the default.
        compress = options[:compress_body]
        @compress_body = compress.nil? ? true : compress
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Inserts or replaces the document for +page+.
      #
      # @return [String] the uuid of the stored document
      def add(page)
        @semaphore.synchronize do
          obj = page.to_hash
          @except.each { |field| obj.delete(field.to_s) }
          obj['uuid'] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          BINARY_FIELDS.each do |field|
            obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
          end
          @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
          obj['uuid']
        end
      end

      # @return [Boolean] whether a document with the page's uuid exists
      def exists?(page)
        @semaphore.synchronize do
          doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
          !doc.nil?
        end
      end

      # @return [Page, nil] the stored page, or nil when absent
      def get(page)
        @semaphore.synchronize do
          data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
          load_page(data) if data
        end
      end

      # Removes the document with the page's uuid.
      def remove(page)
        @semaphore.synchronize do
          @mongo[@collection].remove(uuid: uuid(page))
        end
      end

      # @return [Integer] number of documents in the collection
      def count
        @mongo[@collection].count
      end

      # Yields the uuid and the rebuilt page for every stored document.
      def each
        @mongo[@collection].find({}, timeout: false) do |cursor|
          cursor.each do |doc|
            page = load_page(doc)
            yield doc['uuid'], page
          end
        end
      end

      # Drops the whole collection.
      def clear
        @mongo[@collection].drop
      end

      private

      # Rebuilds a Page from a stored document.
      #
      # The binary fields are turned back into strings (an absent field
      # becomes ''), the body is inflated when compression is on and the body
      # is non-empty, and a missing fetched_at is derived from the generation
      # time of the document's _id.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at = hash['_id'].generation_time.to_i if page.fetched_at.nil?
        page
      end
    end
  end
end
@@ -0,0 +1,90 @@
1
+ # encoding: UTF-8
2
+ require 'rethinkdb'
3
+ require 'thread'
4
+ require 'zlib'
5
+
6
module Polipus
  module Storage
    # Storage backend that persists pages in a RethinkDB table, one document
    # per page, using the page uuid as the document id.
    class RethinkStore < Base
      # Fields wrapped with +r.binary+ on write and converted back with
      # +to_s+ on read.
      BINARY_FIELDS = %w(body headers data).freeze

      # Creates the table (and its 'created_at' index) when it does not exist.
      #
      # @param options [Hash]
      # @option options [Object] :conn open connection (required)
      # @option options [String] :table table name (required)
      # @option options [Boolean] :compress_body deflate the body before
      #   storing it; defaults to true when the key is missing or nil
      # @option options [Array] :except page fields to leave out when storing
      def initialize(options = {})
        @r = RethinkDB::RQL.new
        @rethink = options[:conn]
        @table = options[:table]

        unless @r.table_list.run(@rethink).include?(@table)
          @r.table_create(@table).run(@rethink)
          # The query has to be run like every other query in this class;
          # without it the index was only described, never created.
          @r.table(@table).index_create('created_at').run(@rethink)
        end

        # `options[:compress_body] ||= true` could never yield false, so
        # compression could not be turned off; only nil selects the default.
        compress = options[:compress_body]
        @compress_body = compress.nil? ? true : compress
        @except = options[:except] || []
        @semaphore = Mutex.new
      end

      # Inserts the document for +page+.
      #
      # @return [String] the id (uuid) of the document
      def add(page)
        @semaphore.synchronize do
          obj = page.to_hash
          @except.each { |field| obj.delete(field.to_s) }
          obj[:id] = uuid(page)
          obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
          obj['created_at'] ||= Time.now.to_i
          BINARY_FIELDS.each do |field|
            # Use some marshalling?
            obj[field] = @r.binary(obj[field]) unless obj[field].nil?
          end

          @r.table(@table).insert(obj).run(@rethink, durability: 'soft')
          obj[:id]
        end
      end

      # @return [Boolean] whether a document with the page's uuid exists
      def exists?(page)
        @semaphore.synchronize do
          doc = @r.table(@table).get(uuid(page)).run(@rethink)
          !doc.nil?
        end
      end

      # @return [Page, nil] the stored page, or nil when absent
      def get(page)
        @semaphore.synchronize do
          data = @r.table(@table).get(uuid(page)).run(@rethink)
          load_page(data) if data
        end
      end

      # Deletes the document with the page's uuid.
      def remove(page)
        @semaphore.synchronize do
          @r.table(@table).get(uuid(page)).delete.run(@rethink)
        end
      end

      # @return [Integer] number of documents in the table
      def count
        @r.table(@table).count.run(@rethink)
      end

      # Yields the id and the rebuilt page for every stored document.
      def each
        @r.table(@table).run(@rethink).each do |doc|
          page = load_page(doc)
          # load_page reads documents through string keys, so the id is
          # looked up under 'id' first; the symbol key stays as a fallback.
          yield(doc['id'] || doc[:id], page)
        end
      end

      # Deletes every document in the table.
      def clear
        @r.table(@table).delete.run(@rethink)
      end

      private

      # Rebuilds a Page from a stored document.
      #
      # The binary fields are turned back into strings (an absent field
      # becomes ''), the body is inflated when compression is on and the body
      # is non-empty, and a missing fetched_at falls back to 'created_at'.
      def load_page(hash)
        BINARY_FIELDS.each do |field|
          hash[field] = hash[field].to_s
        end
        hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
        page = Page.from_hash(hash)
        page.fetched_at ||= hash['created_at']
        page
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # encoding: UTF-8
2
module Polipus
  # Factory helpers for the visited-URL trackers. Each helper requires its
  # tracker file only when called.
  module UrlTracker
    # Builds a Bloomfilter tracker.
    #
    # Options that are missing (or nil/false) are filled in on the hash that
    # was passed, before it is handed to the tracker.
    #
    # @param options [Hash] :size, :error_rate, :key_name, :redis, :driver
    def self.bloomfilter(options = {})
      require 'polipus/url_tracker/bloomfilter'
      { size: 1_000_000, error_rate: 0.01, key_name: 'polipus-bloomfilter' }.each do |name, fallback|
        options[name] ||= fallback
      end
      # Redis.current is only consulted when no client was supplied.
      options[:redis] ||= Redis.current
      options[:driver] ||= 'lua'
      UrlTracker::Bloomfilter.new(options)
    end

    # Builds a RedisSet tracker.
    #
    # Options that are missing (or nil/false) are filled in on the hash that
    # was passed, before it is handed to the tracker.
    #
    # @param options [Hash] :redis, :key_name
    def self.redis_set(options = {})
      require 'polipus/url_tracker/redis_set'
      # Redis.current is only consulted when no client was supplied.
      options[:redis] ||= Redis.current
      options[:key_name] ||= 'polipus-set'
      UrlTracker::RedisSet.new(options)
    end
  end
end
+ end
@@ -0,0 +1,27 @@
1
+ # encoding: UTF-8
2
+ require 'redis-bloomfilter'
3
module Polipus
  module UrlTracker
    # URL tracker that forwards every operation to a Redis::Bloomfilter
    # built from the given options.
    class Bloomfilter
      # @param options [Hash] passed unchanged to Redis::Bloomfilter.new
      def initialize(options = {})
        @filter = Redis::Bloomfilter.new(options)
      end

      # @return the filter's answer to +include?+ for +url+
      def visited?(url)
        @filter.include?(url)
      end

      # Inserts +url+ into the filter.
      def visit(url)
        @filter.insert(url)
      end

      # Asks the filter to remove +url+.
      def remove(url)
        @filter.remove(url)
      end

      # Asks the filter to clear itself.
      def clear
        @filter.clear
      end
    end
  end
end
@@ -0,0 +1,27 @@
1
+ # encoding: UTF-8
2
module Polipus
  module UrlTracker
    # URL tracker backed by a Redis set: each visited URL is one member.
    class RedisSet
      # @param options [Hash]
      # @option options [Object] :redis client to use; Redis.current is
      #   consulted only when none is given
      # @option options [String] :key_name name of the set; defaults to
      #   'polipus-set', the same default UrlTracker.redis_set applies
      def initialize(options = {})
        @redis = options[:redis] || Redis.current
        @set_name = options[:key_name] || 'polipus-set'
      end

      # @return the client's +sismember+ answer for +url+
      def visited?(url)
        @redis.sismember(@set_name, url)
      end

      # Adds +url+ to the set.
      def visit(url)
        @redis.sadd(@set_name, url)
      end

      # Removes +url+ from the set.
      #
      # Only the key and the member are passed: the former extra `0` argument
      # is rejected by two-argument +srem+ implementations and would be
      # treated as one more member to remove by variadic ones.
      def remove(url)
        @redis.srem(@set_name, url)
      end

      # Deletes the whole set.
      def clear
        @redis.del(@set_name)
      end
    end
  end
end
+ end