polipus 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
@@ -1,17 +1,19 @@
1
- require "uri"
1
+ require 'uri'
2
2
 
3
3
  module Polipus
4
4
  module Storage
5
5
  class Base
6
6
  attr_accessor :include_query_string_in_uuid
7
+
7
8
  protected
8
- def uuid page
9
- if @include_query_string_in_uuid.nil?
10
- @include_query_string_in_uuid = true
11
- end
12
- url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/,'')
13
- Digest::MD5.hexdigest(url_to_hash)
9
+
10
+ def uuid(page)
11
+ if @include_query_string_in_uuid.nil?
12
+ @include_query_string_in_uuid = true
14
13
  end
14
+ url_to_hash = @include_query_string_in_uuid ? page.url.to_s : page.url.to_s.gsub(/\?.*$/, '')
15
+ Digest::MD5.hexdigest(url_to_hash)
16
+ end
15
17
  end
16
18
  end
17
- end
19
+ end
@@ -1,22 +1,21 @@
1
1
  module Polipus
2
2
  module Storage
3
3
  class DevNull < Base
4
-
5
- def initialize(options = {})
4
+ def initialize(_options = {})
6
5
  end
7
6
 
8
- def add page
7
+ def add(_page)
9
8
  end
10
9
 
11
- def exists?(page)
10
+ def exists?(_page)
12
11
  false
13
12
  end
14
13
 
15
- def get page
14
+ def get(_page)
16
15
  nil
17
16
  end
18
17
 
19
- def remove page
18
+ def remove(_page)
20
19
  false
21
20
  end
22
21
 
@@ -32,4 +31,4 @@ module Polipus
32
31
  end
33
32
  end
34
33
  end
35
- end
34
+ end
@@ -1,56 +1,55 @@
1
- require "thread"
1
+ require 'thread'
2
2
  module Polipus
3
3
  module Storage
4
4
  class MemoryStore < Base
5
-
6
- def initialize(options = {})
7
- @store = Hash.new
5
+ def initialize(_options = {})
6
+ @store = {}
8
7
  @semaphore = Mutex.new
9
8
  end
10
9
 
11
- def add page
12
- @semaphore.synchronize {
10
+ def add(page)
11
+ @semaphore.synchronize do
13
12
  u = uuid(page)
14
13
  @store[u] = page
15
14
  u
16
- }
15
+ end
17
16
  end
18
17
 
19
18
  def exists?(page)
20
- @semaphore.synchronize {
19
+ @semaphore.synchronize do
21
20
  @store.key?(uuid(page))
22
- }
21
+ end
23
22
  end
24
23
 
25
- def get page
26
- @semaphore.synchronize {
24
+ def get(page)
25
+ @semaphore.synchronize do
27
26
  @store[uuid(page)]
28
- }
27
+ end
29
28
  end
30
29
 
31
- def remove page
32
- @semaphore.synchronize {
30
+ def remove(page)
31
+ @semaphore.synchronize do
33
32
  @store.delete(uuid(page))
34
- }
33
+ end
35
34
  end
36
35
 
37
36
  def count
38
- @semaphore.synchronize {
37
+ @semaphore.synchronize do
39
38
  @store.count
40
- }
39
+ end
41
40
  end
42
41
 
43
42
  def each
44
- @store.each do |k,v|
45
- yield k,v
43
+ @store.each do |k, v|
44
+ yield k, v
46
45
  end
47
46
  end
48
47
 
49
48
  def clear
50
- @semaphore.synchronize {
49
+ @semaphore.synchronize do
51
50
  @store = Hash.new
52
- }
51
+ end
53
52
  end
54
53
  end
55
54
  end
56
- end
55
+ end
@@ -1,6 +1,6 @@
1
- require "mongo"
2
- require "zlib"
3
- require "thread"
1
+ require 'mongo'
2
+ require 'zlib'
3
+ require 'thread'
4
4
  module Polipus
5
5
  module Storage
6
6
  class MongoStore < Base
@@ -9,44 +9,44 @@ module Polipus
9
9
  @mongo = options[:mongo]
10
10
  @collection = options[:collection]
11
11
  @mongo.create_collection(@collection)
12
- @mongo[@collection].ensure_index(:uuid, :unique => true, :drop_dups => true, :background => true)
12
+ @mongo[@collection].ensure_index(:uuid, unique: true, drop_dups: true, background: true)
13
13
  @compress_body = options[:compress_body] ||= true
14
14
  @except = options[:except] ||= []
15
15
  @semaphore = Mutex.new
16
16
  end
17
17
 
18
- def add page
19
- @semaphore.synchronize {
18
+ def add(page)
19
+ @semaphore.synchronize do
20
20
  obj = page.to_hash
21
- @except.each {|e| obj.delete e.to_s}
21
+ @except.each { |e| obj.delete e.to_s }
22
22
  obj['uuid'] = uuid(page)
23
23
  obj['body'] = Zlib::Deflate.deflate(obj['body']) if @compress_body && obj['body']
24
24
  BINARY_FIELDS.each do |field|
25
25
  obj[field] = BSON::Binary.new(obj[field]) unless obj[field].nil?
26
26
  end
27
- @mongo[@collection].update({:uuid => obj['uuid']}, obj, {:upsert => true, :w => 1})
27
+ @mongo[@collection].update({ uuid: obj['uuid'] }, obj, upsert: true, w: 1)
28
28
  obj['uuid']
29
- }
29
+ end
30
30
  end
31
31
 
32
32
  def exists?(page)
33
- @semaphore.synchronize {
34
- doc = @mongo[@collection].find({:uuid => uuid(page)}, {:fields => [:_id]}).limit(1).first
33
+ @semaphore.synchronize do
34
+ doc = @mongo[@collection].find({ uuid: uuid(page) }, { fields: [:_id] }).limit(1).first
35
35
  !doc.nil?
36
- }
36
+ end
37
37
  end
38
38
 
39
- def get page
40
- @semaphore.synchronize {
41
- data = @mongo[@collection].find({:uuid => uuid(page)}).limit(1).first
39
+ def get(page)
40
+ @semaphore.synchronize do
41
+ data = @mongo[@collection].find(uuid: uuid(page)).limit(1).first
42
42
  return load_page(data) if data
43
- }
43
+ end
44
44
  end
45
45
 
46
- def remove page
47
- @semaphore.synchronize {
48
- @mongo[@collection].remove({:uuid => uuid(page)})
49
- }
46
+ def remove(page)
47
+ @semaphore.synchronize do
48
+ @mongo[@collection].remove(uuid: uuid(page))
49
+ end
50
50
  end
51
51
 
52
52
  def count
@@ -54,10 +54,10 @@ module Polipus
54
54
  end
55
55
 
56
56
  def each
57
- @mongo[@collection].find({},:timeout => false) do |cursor|
57
+ @mongo[@collection].find({}, timeout: false) do |cursor|
58
58
  cursor.each do |doc|
59
59
  page = load_page(doc)
60
- yield doc['uuid'], page
60
+ yield doc['uuid'], page
61
61
  end
62
62
  end
63
63
  end
@@ -67,22 +67,18 @@ module Polipus
67
67
  end
68
68
 
69
69
  private
70
- def load_page(hash)
71
- BINARY_FIELDS.each do |field|
72
- hash[field] = hash[field].to_s
73
- end
74
- begin
75
- hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
76
- page = Page.from_hash(hash)
77
- if page.fetched_at.nil?
78
- page.fetched_at = hash['_id'].generation_time.to_i
79
- end
80
- return page
81
- rescue
82
- end
83
- nil
84
- end
85
70
 
71
+ def load_page(hash)
72
+ BINARY_FIELDS.each do |field|
73
+ hash[field] = hash[field].to_s
74
+ end
75
+ hash['body'] = Zlib::Inflate.inflate(hash['body']) if @compress_body && hash['body'] && !hash['body'].empty?
76
+ page = Page.from_hash(hash)
77
+ if page.fetched_at.nil?
78
+ page.fetched_at = hash['_id'].generation_time.to_i
79
+ end
80
+ page
81
+ end
86
82
  end
87
83
  end
88
- end
84
+ end
@@ -1,7 +1,7 @@
1
- require "aws/s3"
2
- require "zlib"
3
- require "thread"
4
- require "json"
1
+ require 'aws/s3'
2
+ require 'zlib'
3
+ require 'thread'
4
+ require 'json'
5
5
  module Polipus
6
6
  module Storage
7
7
  class S3Store < Base
@@ -11,8 +11,8 @@ module Polipus
11
11
  @semaphore = Mutex.new
12
12
 
13
13
  AWS::S3::Base.establish_connection!(
14
- :access_key_id => @options[:access_key_id],
15
- :secret_access_key => @options[:secret_access_key]
14
+ access_key_id: @options[:access_key_id],
15
+ secret_access_key: @options[:secret_access_key]
16
16
  )
17
17
  @options[:bucket] = "com.polipus.pages.#{@options[:bucket]}"
18
18
  begin
@@ -22,39 +22,37 @@ module Polipus
22
22
  end
23
23
  end
24
24
 
25
- def add page
26
- @semaphore.synchronize {
25
+ def add(page)
26
+ @semaphore.synchronize do
27
27
  obj = page.to_hash
28
- @except.each {|e| obj.delete e.to_s}
28
+ @except.each { |e| obj.delete e.to_s }
29
29
  puuid = uuid(page)
30
30
  obj['uuid'] = puuid
31
31
  data = Zlib::Deflate.deflate(obj.to_json)
32
32
  AWS::S3::S3Object.store(puuid, data, @bucket.name)
33
33
  puuid
34
- }
34
+ end
35
35
  end
36
36
 
37
37
  def exists?(page)
38
38
  AWS::S3::S3Object.exists? uuid(page), @bucket.name
39
39
  end
40
40
 
41
- def get page
42
- @semaphore.synchronize {
41
+ def get(page)
42
+ @semaphore.synchronize do
43
43
  if exists?(page)
44
44
  data = AWS::S3::S3Object.find(uuid(page), @bucket.name).value
45
45
  return load_page(data)
46
46
  end
47
47
  nil
48
- }
48
+ end
49
49
  end
50
50
 
51
- def remove page
52
- @semaphore.synchronize {
53
- if exists?(page)
54
- AWS::S3::S3Object.delete(uuid(page), @bucket.name)
55
- end
51
+ def remove(page)
52
+ @semaphore.synchronize do
53
+ exists?(page) && AWS::S3::S3Object.delete(uuid(page), @bucket.name)
56
54
  true
57
- }
55
+ end
58
56
  end
59
57
 
60
58
  def count
@@ -62,39 +60,36 @@ module Polipus
62
60
  end
63
61
 
64
62
  def clear
65
- AWS::S3::Bucket.delete(@bucket.name, :force => true)
63
+ AWS::S3::Bucket.delete(@bucket.name, force: true)
66
64
  create_bucket
67
65
  end
68
66
 
69
67
  def each
70
68
  objects = []
71
69
  last_key = nil
72
- begin
73
- objects = AWS::S3::Bucket.objects(@bucket.name, :marker => last_key)
70
+ loop do
71
+ objects = AWS::S3::Bucket.objects(@bucket.name, marker: last_key)
74
72
  break if objects.size == 0
75
73
  objects.each do |o|
76
74
  page = load_page(o.value)
77
- yield o.key, page
75
+ yield o.key, page
78
76
  end
79
77
  last_key = objects.last.key
80
- end while true
78
+ end
81
79
  end
82
80
 
83
81
  private
84
- def load_page(data)
85
- begin
86
- payload = Zlib::Inflate.inflate(data)
87
- hash = JSON.parse(payload)
88
- return Page.from_hash(hash)
89
- rescue
90
- end
91
- nil
92
- end
93
82
 
94
- def create_bucket
95
- AWS::S3::Bucket.create(@options[:bucket])
96
- @bucket = AWS::S3::Bucket.find(@options[:bucket])
97
- end
83
+ def load_page(data)
84
+ payload = Zlib::Inflate.inflate(data)
85
+ hash = JSON.parse(payload)
86
+ Page.from_hash(hash)
87
+ end
88
+
89
+ def create_bucket
90
+ AWS::S3::Bucket.create(@options[:bucket])
91
+ @bucket = AWS::S3::Bucket.find(@options[:bucket])
92
+ end
98
93
  end
99
94
  end
100
- end
95
+ end
@@ -1,7 +1,7 @@
1
1
  module Polipus
2
2
  module UrlTracker
3
3
  def self.bloomfilter(options = {})
4
- require "polipus/url_tracker/bloomfilter"
4
+ require 'polipus/url_tracker/bloomfilter'
5
5
  options[:size] ||= 1_000_000
6
6
  options[:error_rate] ||= 0.01
7
7
  options[:key_name] ||= 'polipus-bloomfilter'
@@ -11,10 +11,10 @@ module Polipus
11
11
  end
12
12
 
13
13
  def self.redis_set(options = {})
14
- require "polipus/url_tracker/redis_set"
14
+ require 'polipus/url_tracker/redis_set'
15
15
  options[:redis] ||= Redis.current
16
16
  options[:key_name] ||= 'polipus-set'
17
17
  self::RedisSet.new options
18
18
  end
19
19
  end
20
- end
20
+ end
@@ -1,4 +1,4 @@
1
- require "redis-bloomfilter"
1
+ require 'redis-bloomfilter'
2
2
  module Polipus
3
3
  module UrlTracker
4
4
  class Bloomfilter
@@ -10,18 +10,17 @@ module Polipus
10
10
  @bf.include?(url)
11
11
  end
12
12
 
13
- def visit url
13
+ def visit(url)
14
14
  @bf.insert url
15
15
  end
16
16
 
17
- def remove url
17
+ def remove(url)
18
18
  @bf.remove url
19
19
  end
20
20
 
21
21
  def clear
22
22
  @bf.clear
23
23
  end
24
-
25
24
  end
26
25
  end
27
- end
26
+ end
@@ -1,21 +1,20 @@
1
1
  module Polipus
2
2
  module UrlTracker
3
3
  class RedisSet
4
-
5
4
  def initialize(options = {})
6
5
  @redis = options[:redis] || Redis.current
7
6
  @set_name = options[:key_name]
8
7
  end
9
8
 
10
9
  def visited?(url)
11
- @redis.sismember(@set_name,url)
10
+ @redis.sismember(@set_name, url)
12
11
  end
13
12
 
14
- def visit url
13
+ def visit(url)
15
14
  @redis.sadd(@set_name, url)
16
15
  end
17
16
 
18
- def remove url
17
+ def remove(url)
19
18
  @redis.srem(@set_name, url, 0)
20
19
  end
21
20