polipus 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
@@ -1,17 +1,19 @@
1
- require "uri"
1
+ require 'uri'
2
2
 
3
3
  module Polipus
4
4
  module Storage
5
5
  class Base
6
6
  attr_accessor :include_query_string_in_uuid
7
+
7
8
  protected
8
- def uuid page
9
- if @include_query_string_in_uuid.nil?
10
- @include_query_string_in_uuid = true
11
- end
12
- url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
13
- Digest::MD5.hexdigest(url_to_hash)
9
+
10
+ def uuid(page)
11
+ if @include_query_string_in_uuid.nil?
12
+ @include_query_string_in_uuid = true
14
13
  end
14
+ url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/, '')
15
+ Digest::MD5.hexdigest(url_to_hash)
16
+ end
15
17
  end
16
18
  end
17
- end
19
+ end
@@ -1,22 +1,21 @@
1
1
  module Polipus
2
2
  module Storage
3
3
  class DevNull < Base
4
-
5
- def initialize(options = {})
4
+ def initialize(_options = {})
6
5
  end
7
6
 
8
- def add page
7
+ def add(_page)
9
8
  end
10
9
 
11
- def exists?(page)
10
+ def exists?(_page)
12
11
  false
13
12
  end
14
13
 
15
- def get page
14
+ def get(_page)
16
15
  nil
17
16
  end
18
17
 
19
- def remove page
18
+ def remove(_page)
20
19
  false
21
20
  end
22
21
 
@@ -32,4 +31,4 @@ module Polipus
32
31
  end
33
32
  end
34
33
  end
35
- end
34
+ end
@@ -1,56 +1,55 @@
1
- require "thread"
1
+ require 'thread'
2
2
  module Polipus
3
3
  module Storage
4
4
  class MemoryStore < Base
5
-
6
- def initialize(options = {})
7
- @store = Hash.new
5
+ def initialize(_options = {})
6
+ @store = {}
8
7
  @semaphore = Mutex.new
9
8
  end
10
9
 
11
- def add page
12
- @semaphore.synchronize {
10
+ def add(page)
11
+ @semaphore.synchronize do
13
12
  u = uuid(page)
14
13
  @store[u] = page
15
14
  u
16
- }
15
+ end
17
16
  end
18
17
 
19
18
  def exists?(page)
20
- @semaphore.synchronize {
19
+ @semaphore.synchronize do
21
20
  @store.key?(uuid(page))
22
- }
21
+ end
23
22
  end
24
23
 
25
- def get page
26
- @semaphore.synchronize {
24
+ def get(page)
25
+ @semaphore.synchronize do
27
26
  @store[uuid(page)]
28
- }
27
+ end
29
28
  end
30
29
 
31
- def remove page
32
- @semaphore.synchronize {
30
+ def remove(page)
31
+ @semaphore.synchronize do
33
32
  @store.delete(uuid(page))
34
- }
33
+ end
35
34
  end
36
35
 
37
36
  def count
38
- @semaphore.synchronize {
37
+ @semaphore.synchronize do
39
38
  @store.count
40
- }
39
+ end
41
40
  end
42
41
 
43
42
  def each
44
- @store.each do |k,v|
45
- yield k,v
43
+ @store.each do |k, v|
44
+ yield k, v
46
45
  end
47
46
  end
48
47
 
49
48
  def clear
50
- @semaphore.synchronize {
49
+ @semaphore.synchronize do
51
50
  @store = Hash.new
52
- }
51
+ end
53
52
  end
54
53
  end
55
54
  end
56
- end
55
+ end
@@ -1,6 +1,6 @@
1
- require "mongo"
2
- require "zlib"
3
- require "thread"
1
+ require 'mongo'
2
+ require 'zlib'
3
+ require 'thread'
4
4
  module Polipus
5
5
  module Storage
6
6
  class MongoStore < Base
@@ -9,44 +9,44 @@ module Polipus
9
9
  @mongo = options[:mongo]
10
10
  @collection = options[:collection]
11
11
  @mongo.create_collection(@collection)
12
- @mongo[@collection].ensure_index(:uuid, :unique => true, :drop_dups => true, :background => true)
12
+ @mongo[@collection].ensure_index(:uuid, unique: true, drop_dups: true, background: true)
13
13
  @compress_body = options[:compress_body] ||= true
14
14
  @except = options[:except] ||= []
15
15
  @semaphore = Mutex.new
16
16
  end
17
17
 
18
- def add page
19
- @semaphore.synchronize {
18
+ def add(page)
19
+ @semaphore.synchronize do
20
20
  obj = page.to_hash
21
- @except.each {|e| obj.delete e.to_s}
21
+ @except.each { |e| obj.delete e.to_s }
22
22
  obj['uuid'] = uuid(page)
23
23
  obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
24
24
  BINARY_FIELDS.each do |field|
25
25
  obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
26
26
  end
27
- @mongo[@collection].update({:uuid => obj['uuid']}, obj, {:upsert => true, :w => 1})
27
+ @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
28
28
  obj['uuid']
29
- }
29
+ end
30
30
  end
31
31
 
32
32
  def exists?(page)
33
- @semaphore.synchronize {
34
- doc = @mongo[@collection].find({:uuid => uuid(page)}, {:fields => [:_id]}).limit(1).first
33
+ @semaphore.synchronize do
34
+ doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
35
35
  !doc.nil?
36
- }
36
+ end
37
37
  end
38
38
 
39
- def get page
40
- @semaphore.synchronize {
41
- data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
39
+ def get(page)
40
+ @semaphore.synchronize do
41
+ data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
42
42
  return load_page(data) if data
43
- }
43
+ end
44
44
  end
45
45
 
46
- def remove page
47
- @semaphore.synchronize {
48
- @mongo[@collection].remove({:uuid => uuid(page)})
49
- }
46
+ def remove(page)
47
+ @semaphore.synchronize do
48
+ @mongo[@collection].remove(uuid: uuid(page))
49
+ end
50
50
  end
51
51
 
52
52
  def count
@@ -54,10 +54,10 @@ module Polipus
54
54
  end
55
55
 
56
56
  def each
57
- @mongo[@collection].find({},:timeout => false) do |cursor|
57
+ @mongo[@collection].find({}, timeout: false) do |cursor|
58
58
  cursor.each do |doc|
59
59
  page = load_page(doc)
60
- yield doc['uuid'], page
60
+ yield doc['uuid'], page
61
61
  end
62
62
  end
63
63
  end
@@ -67,22 +67,18 @@ module Polipus
67
67
  end
68
68
 
69
69
  private
70
- def load_page(hash)
71
- BINARY_FIELDS.each do |field|
72
- hash[field] = hash[field].to_s
73
- end
74
- begin
75
- hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
76
- page = Page.from_hash(hash)
77
- if page.fetched_at.nil?
78
- page.fetched_at = hash['_id'].generation_time.to_i
79
- end
80
- return page
81
- rescue
82
- end
83
- nil
84
- end
85
70
 
71
+ def load_page(hash)
72
+ BINARY_FIELDS.each do |field|
73
+ hash[field] = hash[field].to_s
74
+ end
75
+ hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
76
+ page = Page.from_hash(hash)
77
+ if page.fetched_at.nil?
78
+ page.fetched_at = hash['_id'].generation_time.to_i
79
+ end
80
+ page
81
+ end
86
82
  end
87
83
  end
88
- end
84
+ end
@@ -1,7 +1,7 @@
1
- require "aws/s3"
2
- require "zlib"
3
- require "thread"
4
- require "json"
1
+ require 'aws/s3'
2
+ require 'zlib'
3
+ require 'thread'
4
+ require 'json'
5
5
  module Polipus
6
6
  module Storage
7
7
  class S3Store < Base
@@ -11,8 +11,8 @@ module Polipus
11
11
  @semaphore = Mutex.new
12
12
 
13
13
  AWS::S3::Base.establish_connection!(
14
- :access_key_id => @options[:access_key_id],
15
- :secret_access_key => @options[:secret_access_key]
14
+ access_key_id: @options[:access_key_id],
15
+ secret_access_key: @options[:secret_access_key]
16
16
  )
17
17
  @options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
18
18
  begin
@@ -22,39 +22,37 @@ module Polipus
22
22
  end
23
23
  end
24
24
 
25
- def add page
26
- @semaphore.synchronize {
25
+ def add(page)
26
+ @semaphore.synchronize do
27
27
  obj = page.to_hash
28
- @except.each {|e| obj.delete e.to_s}
28
+ @except.each { |e| obj.delete e.to_s }
29
29
  puuid = uuid(page)
30
30
  obj['uuid'] = puuid
31
31
  data = Zlib::Deflate.deflate(obj.to_json)
32
32
  AWS::S3::S3Object.store(puuid, data, @bucket.name)
33
33
  puuid
34
- }
34
+ end
35
35
  end
36
36
 
37
37
  def exists?(page)
38
38
  AWS::S3::S3Object.exists? uuid(page), @bucket.name
39
39
  end
40
40
 
41
- def get page
42
- @semaphore.synchronize {
41
+ def get(page)
42
+ @semaphore.synchronize do
43
43
  if exists?(page)
44
44
  data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
45
45
  return load_page(data)
46
46
  end
47
47
  nil
48
- }
48
+ end
49
49
  end
50
50
 
51
- def remove page
52
- @semaphore.synchronize {
53
- if exists?(page)
54
- AWS::S3::S3Object.delete(uuid(page), @bucket.name)
55
- end
51
+ def remove(page)
52
+ @semaphore.synchronize do
53
+ exists?(page) && AWS::S3::S3Object.delete(uuid(page), @bucket.name)
56
54
  true
57
- }
55
+ end
58
56
  end
59
57
 
60
58
  def count
@@ -62,39 +60,36 @@ module Polipus
62
60
  end
63
61
 
64
62
  def clear
65
- AWS::S3::Bucket.delete(@bucket.name, :force => true)
63
+ AWS::S3::Bucket.delete(@bucket.name, force: true)
66
64
  create_bucket
67
65
  end
68
66
 
69
67
  def each
70
68
  objects = []
71
69
  last_key = nil
72
- begin
73
- objects = AWS::S3::Bucket.objects(@bucket.name, :marker => last_key)
70
+ loop do
71
+ objects = AWS::S3::Bucket.objects(@bucket.name, marker: last_key)
74
72
  break if objects.size == 0
75
73
  objects.each do |o|
76
74
  page = load_page(o.value)
77
- yield o.key, page
75
+ yield o.key, page
78
76
  end
79
77
  last_key = objects.last.key
80
- end while true
78
+ end
81
79
  end
82
80
 
83
81
  private
84
- def load_page(data)
85
- begin
86
- payload = Zlib::Inflate.inflate(data)
87
- hash = JSON.parse(payload)
88
- return Page.from_hash(hash)
89
- rescue
90
- end
91
- nil
92
- end
93
82
 
94
- def create_bucket
95
- AWS::S3::Bucket.create(@options[:bucket])
96
- @bucket = AWS::S3::Bucket.find(@options[:bucket])
97
- end
83
+ def load_page(data)
84
+ payload = Zlib::Inflate.inflate(data)
85
+ hash = JSON.parse(payload)
86
+ Page.from_hash(hash)
87
+ end
88
+
89
+ def create_bucket
90
+ AWS::S3::Bucket.create(@options[:bucket])
91
+ @bucket = AWS::S3::Bucket.find(@options[:bucket])
92
+ end
98
93
  end
99
94
  end
100
- end
95
+ end
@@ -1,7 +1,7 @@
1
1
  module Polipus
2
2
  module UrlTracker
3
3
  def self.bloomfilter(options = {})
4
- require "polipus/url_tracker/bloomfilter"
4
+ require 'polipus/url_tracker/bloomfilter'
5
5
  options[:size] ||= 1_000_000
6
6
  options[:error_rate] ||= 0.01
7
7
  options[:key_name] ||= 'polipus-bloomfilter'
@@ -11,10 +11,10 @@ module Polipus
11
11
  end
12
12
 
13
13
  def self.redis_set(options = {})
14
- require "polipus/url_tracker/redis_set"
14
+ require 'polipus/url_tracker/redis_set'
15
15
  options[:redis] ||= Redis.current
16
16
  options[:key_name] ||= 'polipus-set'
17
17
  self::RedisSet.new options
18
18
  end
19
19
  end
20
- end
20
+ end
@@ -1,4 +1,4 @@
1
- require "redis-bloomfilter"
1
+ require 'redis-bloomfilter'
2
2
  module Polipus
3
3
  module UrlTracker
4
4
  class Bloomfilter
@@ -10,18 +10,17 @@ module Polipus
10
10
  @bf.include?(url)
11
11
  end
12
12
 
13
- def visit url
13
+ def visit(url)
14
14
  @bf.insert url
15
15
  end
16
16
 
17
- def remove url
17
+ def remove(url)
18
18
  @bf.remove url
19
19
  end
20
20
 
21
21
  def clear
22
22
  @bf.clear
23
23
  end
24
-
25
24
  end
26
25
  end
27
- end
26
+ end
@@ -1,21 +1,20 @@
1
1
  module Polipus
2
2
  module UrlTracker
3
3
  class RedisSet
4
-
5
4
  def initialize(options = {})
6
5
  @redis = options[:redis] || Redis.current
7
6
  @set_name = options[:key_name]
8
7
  end
9
8
 
10
9
  def visited?(url)
11
- @redis.sismember(@set_name,url)
10
+ @redis.sismember(@set_name, url)
12
11
  end
13
12
 
14
- def visit url
13
+ def visit(url)
15
14
  @redis.sadd(@set_name, url)
16
15
  end
17
16
 
18
- def remove url
17
+ def remove(url)
19
18
  @redis.srem(@set_name, url, 0)
20
19
  end
21
20