polipus 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
@@ -3,4 +3,4 @@ module Polipus
3
3
  class Base
4
4
  end
5
5
  end
6
- end
6
+ end
@@ -1,4 +1,4 @@
1
- require "thread"
1
+ require 'thread'
2
2
  module Polipus
3
3
  module QueueOverflow
4
4
  class DevNullQueue
@@ -16,18 +16,18 @@ module Polipus
16
16
  def clear
17
17
  end
18
18
 
19
- def push data
19
+ def push(_data)
20
20
  end
21
21
 
22
22
  def pop(_ = false)
23
23
  nil
24
24
  end
25
-
26
- alias :size :length
27
- alias :dec :pop
28
- alias :shift :pop
29
- alias :enc :push
30
- alias :<< :push
25
+
26
+ alias_method :size, :length
27
+ alias_method :dec, :pop
28
+ alias_method :shift, :pop
29
+ alias_method :enc, :push
30
+ alias_method :<<, :push
31
31
  end
32
32
  end
33
- end
33
+ end
@@ -10,43 +10,46 @@ module Polipus
10
10
  @redis = @polipus.redis
11
11
  end
12
12
 
13
- def url_filter &block
13
+ def url_filter(&block)
14
14
  @url_filter = block
15
15
  end
16
16
 
17
17
  def perform
18
- removed = 0
18
+ removed = 0
19
19
  restored = 0
20
-
21
- if @main_q.size > @item_limit
22
- removed = rotate @main_q, @adapter,(@main_q.size - @item_limit)
23
- elsif @main_q.size < @item_limit && !@adapter.empty?
24
- restored = rotate @adapter, @main_q,(@item_limit - @main_q.size)
20
+ main_q_size = @main_q.size
21
+ if main_q_size > @item_limit
22
+ @polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
23
+ removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
24
+ elsif main_q_size < @item_limit && !@adapter.empty?
25
+ @polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
26
+ restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
25
27
  end
26
28
  [removed, restored]
27
29
  end
28
30
 
29
31
  private
30
- def rotate source, dest, items
31
- performed = 0
32
- 1.upto(items){|i|
33
- message = source.pop(true)
34
- if message
35
- page = Page.from_json message
36
- unless @polipus.storage.exists?(page)
37
- allowed = !@url_filter.nil? ? @url_filter.call(page) : true
38
- if allowed
39
- dest << message
40
- performed += 1
41
- end
32
+
33
+ def rotate(source, dest)
34
+ performed = 0
35
+ loop do
36
+ message = source.pop(true)
37
+ if message
38
+ page = Page.from_json message
39
+ unless @polipus.storage.exists?(page)
40
+ allowed = @url_filter.nil? ? true : @url_filter.call(page)
41
+ if allowed
42
+ dest << message
43
+ performed += 1
42
44
  end
43
45
  end
44
- source.commit if source.respond_to? :commit
45
- @redis.expire "polipus_queue_overflow-#{@polipus.job_name}.lock", 180
46
- break if !message || source.empty?
47
- }
48
- performed
46
+ end
47
+ source.commit if source.respond_to? :commit
48
+ break if !message || source.empty?
49
+ break unless yield source, dest
49
50
  end
51
+ performed
52
+ end
50
53
  end
51
54
  end
52
- end
55
+ end
@@ -1,4 +1,5 @@
1
- require "thread"
1
+ require 'thread'
2
+ require 'mongo'
2
3
  module Polipus
3
4
  module QueueOverflow
4
5
  class MongoQueue
@@ -8,9 +9,7 @@ module Polipus
8
9
  @semaphore = Mutex.new
9
10
  @options = options
10
11
  @options[:ensure_uniq] ||= false
11
- if @options[:ensure_uniq]
12
- ensure_index
13
- end
12
+ @options[:ensure_uniq] && ensure_index
14
13
  end
15
14
 
16
15
  def length
@@ -23,39 +22,38 @@ module Polipus
23
22
 
24
23
  def clear
25
24
  @mongo_db[@collection_name].drop
26
- if @options[:ensure_uniq]
27
- ensure_index
28
- end
25
+ @options[:ensure_uniq] && ensure_index
29
26
  end
30
27
 
31
- def push data
32
- unless @options[:ensure_uniq]
33
- @mongo_db[@collection_name].insert({:payload => data})
28
+ def push(data)
29
+ if @options[:ensure_uniq]
30
+ @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: 1, w: 1 })
34
31
  else
35
- @mongo_db[@collection_name].update({:payload => data}, {:payload => data}, {:upsert => 1, :w => 1})
32
+ @mongo_db[@collection_name].insert(payload: data)
36
33
  end
37
- true
34
+ true
38
35
  end
39
36
 
40
37
  def pop(_ = false)
41
- @semaphore.synchronize {
42
- doc = @mongo_db[@collection_name].find({},:sort => {:_id => 1}).limit(1).first
38
+ @semaphore.synchronize do
39
+ doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
43
40
  return nil if doc.nil?
44
- @mongo_db[@collection_name].remove(:_id => doc['_id'])
41
+ @mongo_db[@collection_name].remove(_id: doc['_id'])
45
42
  doc && doc['payload'] ? doc['payload'] : nil
46
- }
43
+ end
47
44
  end
48
-
49
- alias :size :length
50
- alias :dec :pop
51
- alias :shift :pop
52
- alias :enc :push
53
- alias :<< :push
45
+
46
+ alias_method :size, :length
47
+ alias_method :dec, :pop
48
+ alias_method :shift, :pop
49
+ alias_method :enc, :push
50
+ alias_method :<<, :push
54
51
 
55
52
  protected
56
- def ensure_index
57
- @mongo_db[@collection_name].ensure_index({:payload => 1},{:background => 1, :unique => 1, :drop_dups => 1})
58
- end
53
+
54
+ def ensure_index
55
+ @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
56
+ end
59
57
  end
60
58
  end
61
- end
59
+ end
@@ -1,4 +1,4 @@
1
- require "polipus/queue_overflow/mongo_queue"
1
+ require 'polipus/queue_overflow/mongo_queue'
2
2
  module Polipus
3
3
  module QueueOverflow
4
4
  class MongoQueueCapped < MongoQueue
@@ -7,22 +7,22 @@ module Polipus
7
7
  @max = @options[:max]
8
8
  end
9
9
 
10
- def push data
10
+ def push(data)
11
11
  super
12
- @semaphore.synchronize {
12
+ @semaphore.synchronize do
13
13
  s = size
14
14
  if s > @max
15
- docs = @mongo_db[@collection_name].find({},{:sort => {:_id => 1}, :fields => [:_id]}).limit(s-@max).map { |e| e['_id'] }
16
- @mongo_db[@collection_name].remove({:_id => {'$in' => docs}, '$isolated' => 1})
15
+ docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
16
+ @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
17
17
  end
18
- }
18
+ end
19
19
  end
20
20
 
21
- alias :size :length
22
- alias :dec :pop
23
- alias :shift :pop
24
- alias :enc :push
25
- alias :<< :push
21
+ alias_method :size, :length
22
+ alias_method :dec, :pop
23
+ alias_method :shift, :pop
24
+ alias_method :enc, :push
25
+ alias_method :<<, :push
26
26
  end
27
27
  end
28
- end
28
+ end
@@ -2,22 +2,19 @@ require 'open-uri'
2
2
  require 'uri'
3
3
  require 'timeout'
4
4
  module Polipus
5
-
6
5
  # Original code taken from
7
6
  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
8
7
 
9
8
  class Robotex
10
-
11
9
  DEFAULT_TIMEOUT = 3
12
10
  VERSION = '1.0.0'
13
11
 
14
12
  attr_reader :user_agent
15
-
13
+
16
14
  class ParsedRobots
17
-
18
15
  def initialize(uri, user_agent)
19
16
  io = Robotex.get_robots_txt(uri, user_agent)
20
- if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
17
+ if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
21
18
  io = StringIO.new("User-agent: *\nAllow: /\n")
22
19
  end
23
20
 
@@ -27,59 +24,55 @@ module Polipus
27
24
  agent = /.*/
28
25
  io.each do |line|
29
26
  next if line =~ /^\s*(#.*|$)/
30
- arr = line.split(":")
27
+ arr = line.split(':')
31
28
  key = arr.shift
32
- value = arr.join(":").strip
29
+ value = arr.join(':').strip
33
30
  value.strip!
34
31
  case key.downcase
35
- when "user-agent"
36
- agent = to_regex(value)
37
- when "allow"
38
- unless value.empty?
39
- @allows[agent] ||= []
40
- @allows[agent] << to_regex(value)
41
- end
42
- when "disallow"
43
- unless value.empty?
44
- @disallows[agent] ||= []
45
- @disallows[agent] << to_regex(value)
46
- end
47
- when "crawl-delay"
48
- @delays[agent] = value.to_i
32
+ when 'user-agent'
33
+ agent = to_regex(value)
34
+ when 'allow'
35
+ unless value.empty?
36
+ @allows[agent] ||= []
37
+ @allows[agent] << to_regex(value)
38
+ end
39
+ when 'disallow'
40
+ unless value.empty?
41
+ @disallows[agent] ||= []
42
+ @disallows[agent] << to_regex(value)
43
+ end
44
+ when 'crawl-delay'
45
+ @delays[agent] = value.to_i
49
46
  end
50
47
  end
51
48
  @parsed = true
52
49
  end
53
-
50
+
54
51
  def allowed?(uri, user_agent)
55
52
  return true unless @parsed
56
53
  allowed = true
57
54
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
58
55
  path = uri.request_uri
59
-
56
+
60
57
  @allows.each do |key, value|
61
- unless allowed
58
+ unless allowed
62
59
  if user_agent =~ key
63
60
  value.each do |rule|
64
- if path =~ rule
65
- allowed = true
66
- end
61
+ path =~ rule && allowed = true
67
62
  end
68
63
  end
69
64
  end
70
65
  end
71
-
66
+
72
67
  @disallows.each do |key, value|
73
68
  if user_agent =~ key
74
69
  value.each do |rule|
75
- if path =~ rule
76
- allowed = false
77
- end
70
+ path =~ rule && allowed = false
78
71
  end
79
72
  end
80
73
  end
81
-
82
- return allowed
74
+
75
+ allowed
83
76
  end
84
77
 
85
78
  def delay(user_agent)
@@ -88,30 +81,28 @@ module Polipus
88
81
  end
89
82
  nil
90
83
  end
91
-
84
+
92
85
  protected
93
-
86
+
94
87
  def to_regex(pattern)
95
88
  pattern = Regexp.escape(pattern)
96
- pattern.gsub!(Regexp.escape("*"), ".*")
89
+ pattern.gsub!(Regexp.escape('*'), '.*')
97
90
  Regexp.compile("^#{pattern}")
98
91
  end
99
92
  end
100
-
93
+
101
94
  def self.get_robots_txt(uri, user_agent)
102
- begin
103
- Timeout::timeout(Robotex.timeout) do
104
- URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
105
- end
106
- rescue Timeout::Error
107
- STDERR.puts "robots.txt request timed out"
95
+ Timeout.timeout(Robotex.timeout) do
96
+ URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
108
97
  end
98
+ rescue Timeout::Error
99
+ STDERR.puts 'robots.txt request timed out'
109
100
  end
110
-
111
- def self.timeout=(t)
112
- @timeout = t
101
+
102
+ class << self
103
+ attr_writer :timeout
113
104
  end
114
-
105
+
115
106
  def self.timeout
116
107
  @timeout || DEFAULT_TIMEOUT
117
108
  end
@@ -127,7 +118,7 @@ module Polipus
127
118
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
128
119
  @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
129
120
  end
130
-
121
+
131
122
  #
132
123
  # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
133
124
  #
@@ -146,9 +137,8 @@ module Polipus
146
137
  #
147
138
  def delay!(uri)
148
139
  delay = delay(uri)
149
- sleep delay - (Time.now - @last_accessed) if !!delay
140
+ sleep delay - (Time.now - @last_accessed) if delay
150
141
  @last_accessed = Time.now
151
142
  end
152
-
153
143
  end
154
- end
144
+ end
@@ -0,0 +1,41 @@
1
+ require 'singleton'
2
+ module Polipus
3
+ class SignalHandler
4
+ include Singleton
5
+ attr_accessor :terminated
6
+ attr_accessor :enabled
7
+
8
+ def initialize
9
+ self.terminated = false
10
+ self.enabled = false
11
+ end
12
+
13
+ def self.enable
14
+ trap(:INT) do
15
+ exit unless self.enabled?
16
+ terminate
17
+ end
18
+ trap(:TERM) do
19
+ exit unless self.enabled?
20
+ terminate
21
+ end
22
+ instance.enabled = true
23
+ end
24
+
25
+ def self.disable
26
+ instance.enabled = false
27
+ end
28
+
29
+ def self.terminate
30
+ instance.terminated = true
31
+ end
32
+
33
+ def self.terminated?
34
+ instance.terminated
35
+ end
36
+
37
+ def self.enabled?
38
+ instance.enabled
39
+ end
40
+ end
41
+ end
@@ -1,25 +1,25 @@
1
- require "polipus/storage/base"
1
+ require 'polipus/storage/base'
2
2
  module Polipus
3
3
  module Storage
4
4
  def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
5
5
  require 'polipus/storage/mongo_store'
6
- mongo ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
7
- raise "First argument must be an instance of Mongo::DB" unless mongo.is_a?(Mongo::DB)
8
- self::MongoStore.new(:mongo => mongo, :collection => collection_name, :except => except)
6
+ mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
7
+ fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
8
+ self::MongoStore.new(mongo: mongo, collection: collection_name, except: except)
9
9
  end
10
10
 
11
11
  def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
12
12
  require 'polipus/storage/s3_store'
13
-
13
+
14
14
  if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
15
- raise "You have to specify AWS crediantials: :access_key_id and :secret_access_key"
15
+ fail 'You have to specify AWS crediantials: :access_key_id and :secret_access_key'
16
16
  end
17
17
 
18
18
  self::S3Store.new(
19
- :bucket => bucket_name,
20
- :access_key_id => aws_credential[:access_key_id],
21
- :secret_access_key => aws_credential[:secret_access_key],
22
- :except => except
19
+ bucket: bucket_name,
20
+ access_key_id: aws_credential[:access_key_id],
21
+ secret_access_key: aws_credential[:secret_access_key],
22
+ except: except
23
23
  )
24
24
  end
25
25
 
@@ -33,4 +33,4 @@ module Polipus
33
33
  self::MemoryStore.new
34
34
  end
35
35
  end
36
- end
36
+ end