polipus 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
@@ -3,4 +3,4 @@ module Polipus
3
3
  class Base
4
4
  end
5
5
  end
6
- end
6
+ end
@@ -1,4 +1,4 @@
1
- require "thread"
1
+ require 'thread'
2
2
  module Polipus
3
3
  module QueueOverflow
4
4
  class DevNullQueue
@@ -16,18 +16,18 @@ module Polipus
16
16
  def clear
17
17
  end
18
18
 
19
- def push data
19
+ def push(_data)
20
20
  end
21
21
 
22
22
  def pop(_ = false)
23
23
  nil
24
24
  end
25
-
26
- alias :size :length
27
- alias :dec :pop
28
- alias :shift :pop
29
- alias :enc :push
30
- alias :<< :push
25
+
26
+ alias_method :size, :length
27
+ alias_method :dec, :pop
28
+ alias_method :shift, :pop
29
+ alias_method :enc, :push
30
+ alias_method :<<, :push
31
31
  end
32
32
  end
33
- end
33
+ end
@@ -10,43 +10,46 @@ module Polipus
10
10
  @redis = @polipus.redis
11
11
  end
12
12
 
13
- def url_filter &block
13
+ def url_filter(&block)
14
14
  @url_filter = block
15
15
  end
16
16
 
17
17
  def perform
18
- removed = 0
18
+ removed = 0
19
19
  restored = 0
20
-
21
- if @main_q.size > @item_limit
22
- removed = rotate @main_q, @adapter,(@main_q.size - @item_limit)
23
- elsif @main_q.size < @item_limit && !@adapter.empty?
24
- restored = rotate @adapter, @main_q,(@item_limit - @main_q.size)
20
+ main_q_size = @main_q.size
21
+ if main_q_size > @item_limit
22
+ @polipus.logger.info { "Overflow Manager: Going to offload items from redis: ~#{main_q_size - @item_limit}" }
23
+ removed = rotate(@main_q, @adapter) { @main_q.size > @item_limit }
24
+ elsif main_q_size < @item_limit && !@adapter.empty?
25
+ @polipus.logger.info { "Overflow Manager: Going to restore items into redis: ~#{@item_limit - main_q_size}" }
26
+ restored = rotate(@adapter, @main_q) { @main_q.size <= @item_limit }
25
27
  end
26
28
  [removed, restored]
27
29
  end
28
30
 
29
31
  private
30
- def rotate source, dest, items
31
- performed = 0
32
- 1.upto(items){|i|
33
- message = source.pop(true)
34
- if message
35
- page = Page.from_json message
36
- unless @polipus.storage.exists?(page)
37
- allowed = !@url_filter.nil? ? @url_filter.call(page) : true
38
- if allowed
39
- dest << message
40
- performed += 1
41
- end
32
+
33
+ def rotate(source, dest)
34
+ performed = 0
35
+ loop do
36
+ message = source.pop(true)
37
+ if message
38
+ page = Page.from_json message
39
+ unless @polipus.storage.exists?(page)
40
+ allowed = @url_filter.nil? ? true : @url_filter.call(page)
41
+ if allowed
42
+ dest << message
43
+ performed += 1
42
44
  end
43
45
  end
44
- source.commit if source.respond_to? :commit
45
- @redis.expire "polipus_queue_overflow-#{@polipus.job_name}.lock", 180
46
- break if !message || source.empty?
47
- }
48
- performed
46
+ end
47
+ source.commit if source.respond_to? :commit
48
+ break if !message || source.empty?
49
+ break unless yield source, dest
49
50
  end
51
+ performed
52
+ end
50
53
  end
51
54
  end
52
- end
55
+ end
@@ -1,4 +1,5 @@
1
- require "thread"
1
+ require 'thread'
2
+ require 'mongo'
2
3
  module Polipus
3
4
  module QueueOverflow
4
5
  class MongoQueue
@@ -8,9 +9,7 @@ module Polipus
8
9
  @semaphore = Mutex.new
9
10
  @options = options
10
11
  @options[:ensure_uniq] ||= false
11
- if @options[:ensure_uniq]
12
- ensure_index
13
- end
12
+ @options[:ensure_uniq] && ensure_index
14
13
  end
15
14
 
16
15
  def length
@@ -23,39 +22,38 @@ module Polipus
23
22
 
24
23
  def clear
25
24
  @mongo_db[@collection_name].drop
26
- if @options[:ensure_uniq]
27
- ensure_index
28
- end
25
+ @options[:ensure_uniq] && ensure_index
29
26
  end
30
27
 
31
- def push data
32
- unless @options[:ensure_uniq]
33
- @mongo_db[@collection_name].insert({:payload => data})
28
+ def push(data)
29
+ if @options[:ensure_uniq]
30
+ @mongo_db[@collection_name].update({ payload: data }, { payload: data }, { upsert: 1, w: 1 })
34
31
  else
35
- @mongo_db[@collection_name].update({:payload => data}, {:payload => data}, {:upsert => 1, :w => 1})
32
+ @mongo_db[@collection_name].insert(payload: data)
36
33
  end
37
- true
34
+ true
38
35
  end
39
36
 
40
37
  def pop(_ = false)
41
- @semaphore.synchronize {
42
- doc = @mongo_db[@collection_name].find({},:sort => {:_id => 1}).limit(1).first
38
+ @semaphore.synchronize do
39
+ doc = @mongo_db[@collection_name].find({}, sort: { _id: 1 }).limit(1).first
43
40
  return nil if doc.nil?
44
- @mongo_db[@collection_name].remove(:_id => doc['_id'])
41
+ @mongo_db[@collection_name].remove(_id: doc['_id'])
45
42
  doc && doc['payload'] ? doc['payload'] : nil
46
- }
43
+ end
47
44
  end
48
-
49
- alias :size :length
50
- alias :dec :pop
51
- alias :shift :pop
52
- alias :enc :push
53
- alias :<< :push
45
+
46
+ alias_method :size, :length
47
+ alias_method :dec, :pop
48
+ alias_method :shift, :pop
49
+ alias_method :enc, :push
50
+ alias_method :<<, :push
54
51
 
55
52
  protected
56
- def ensure_index
57
- @mongo_db[@collection_name].ensure_index({:payload => 1},{:background => 1, :unique => 1, :drop_dups => 1})
58
- end
53
+
54
+ def ensure_index
55
+ @mongo_db[@collection_name].ensure_index({ payload: 1 }, { background: 1, unique: 1, drop_dups: 1 })
56
+ end
59
57
  end
60
58
  end
61
- end
59
+ end
@@ -1,4 +1,4 @@
1
- require "polipus/queue_overflow/mongo_queue"
1
+ require 'polipus/queue_overflow/mongo_queue'
2
2
  module Polipus
3
3
  module QueueOverflow
4
4
  class MongoQueueCapped < MongoQueue
@@ -7,22 +7,22 @@ module Polipus
7
7
  @max = @options[:max]
8
8
  end
9
9
 
10
- def push data
10
+ def push(data)
11
11
  super
12
- @semaphore.synchronize {
12
+ @semaphore.synchronize do
13
13
  s = size
14
14
  if s > @max
15
- docs = @mongo_db[@collection_name].find({},{:sort => {:_id => 1}, :fields => [:_id]}).limit(s-@max).map { |e| e['_id'] }
16
- @mongo_db[@collection_name].remove({:_id => {'$in' => docs}, '$isolated' => 1})
15
+ docs = @mongo_db[@collection_name].find({}, { sort: { _id: 1 }, fields: [:_id] }).limit(s - @max).map { |e| e['_id'] }
16
+ @mongo_db[@collection_name].remove(:_id => { '$in' => docs }, '$isolated' => 1)
17
17
  end
18
- }
18
+ end
19
19
  end
20
20
 
21
- alias :size :length
22
- alias :dec :pop
23
- alias :shift :pop
24
- alias :enc :push
25
- alias :<< :push
21
+ alias_method :size, :length
22
+ alias_method :dec, :pop
23
+ alias_method :shift, :pop
24
+ alias_method :enc, :push
25
+ alias_method :<<, :push
26
26
  end
27
27
  end
28
- end
28
+ end
@@ -2,22 +2,19 @@ require 'open-uri'
2
2
  require 'uri'
3
3
  require 'timeout'
4
4
  module Polipus
5
-
6
5
  # Original code taken from
7
6
  # https://github.com/chriskite/robotex/blob/master/lib/robotex.rb
8
7
 
9
8
  class Robotex
10
-
11
9
  DEFAULT_TIMEOUT = 3
12
10
  VERSION = '1.0.0'
13
11
 
14
12
  attr_reader :user_agent
15
-
13
+
16
14
  class ParsedRobots
17
-
18
15
  def initialize(uri, user_agent)
19
16
  io = Robotex.get_robots_txt(uri, user_agent)
20
- if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
17
+ if !io || io.content_type != 'text/plain' || io.status != %w(200 OK)
21
18
  io = StringIO.new("User-agent: *\nAllow: /\n")
22
19
  end
23
20
 
@@ -27,59 +24,55 @@ module Polipus
27
24
  agent = /.*/
28
25
  io.each do |line|
29
26
  next if line =~ /^\s*(#.*|$)/
30
- arr = line.split(":")
27
+ arr = line.split(':')
31
28
  key = arr.shift
32
- value = arr.join(":").strip
29
+ value = arr.join(':').strip
33
30
  value.strip!
34
31
  case key.downcase
35
- when "user-agent"
36
- agent = to_regex(value)
37
- when "allow"
38
- unless value.empty?
39
- @allows[agent] ||= []
40
- @allows[agent] << to_regex(value)
41
- end
42
- when "disallow"
43
- unless value.empty?
44
- @disallows[agent] ||= []
45
- @disallows[agent] << to_regex(value)
46
- end
47
- when "crawl-delay"
48
- @delays[agent] = value.to_i
32
+ when 'user-agent'
33
+ agent = to_regex(value)
34
+ when 'allow'
35
+ unless value.empty?
36
+ @allows[agent] ||= []
37
+ @allows[agent] << to_regex(value)
38
+ end
39
+ when 'disallow'
40
+ unless value.empty?
41
+ @disallows[agent] ||= []
42
+ @disallows[agent] << to_regex(value)
43
+ end
44
+ when 'crawl-delay'
45
+ @delays[agent] = value.to_i
49
46
  end
50
47
  end
51
48
  @parsed = true
52
49
  end
53
-
50
+
54
51
  def allowed?(uri, user_agent)
55
52
  return true unless @parsed
56
53
  allowed = true
57
54
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
58
55
  path = uri.request_uri
59
-
56
+
60
57
  @allows.each do |key, value|
61
- unless allowed
58
+ unless allowed
62
59
  if user_agent =~ key
63
60
  value.each do |rule|
64
- if path =~ rule
65
- allowed = true
66
- end
61
+ path =~ rule && allowed = true
67
62
  end
68
63
  end
69
64
  end
70
65
  end
71
-
66
+
72
67
  @disallows.each do |key, value|
73
68
  if user_agent =~ key
74
69
  value.each do |rule|
75
- if path =~ rule
76
- allowed = false
77
- end
70
+ path =~ rule && allowed = false
78
71
  end
79
72
  end
80
73
  end
81
-
82
- return allowed
74
+
75
+ allowed
83
76
  end
84
77
 
85
78
  def delay(user_agent)
@@ -88,30 +81,28 @@ module Polipus
88
81
  end
89
82
  nil
90
83
  end
91
-
84
+
92
85
  protected
93
-
86
+
94
87
  def to_regex(pattern)
95
88
  pattern = Regexp.escape(pattern)
96
- pattern.gsub!(Regexp.escape("*"), ".*")
89
+ pattern.gsub!(Regexp.escape('*'), '.*')
97
90
  Regexp.compile("^#{pattern}")
98
91
  end
99
92
  end
100
-
93
+
101
94
  def self.get_robots_txt(uri, user_agent)
102
- begin
103
- Timeout::timeout(Robotex.timeout) do
104
- URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil
105
- end
106
- rescue Timeout::Error
107
- STDERR.puts "robots.txt request timed out"
95
+ Timeout.timeout(Robotex.timeout) do
96
+ URI.join(uri.to_s, '/robots.txt').open('User-Agent' => user_agent) rescue nil
108
97
  end
98
+ rescue Timeout::Error
99
+ STDERR.puts 'robots.txt request timed out'
109
100
  end
110
-
111
- def self.timeout=(t)
112
- @timeout = t
101
+
102
+ class << self
103
+ attr_writer :timeout
113
104
  end
114
-
105
+
115
106
  def self.timeout
116
107
  @timeout || DEFAULT_TIMEOUT
117
108
  end
@@ -127,7 +118,7 @@ module Polipus
127
118
  uri = URI.parse(uri.to_s) unless uri.is_a?(URI)
128
119
  @parsed[uri.host] ||= ParsedRobots.new(uri, @user_agent)
129
120
  end
130
-
121
+
131
122
  #
132
123
  # Download the server's robots.txt, and return true if we are allowed to access the url, false otherwise
133
124
  #
@@ -146,9 +137,8 @@ module Polipus
146
137
  #
147
138
  def delay!(uri)
148
139
  delay = delay(uri)
149
- sleep delay - (Time.now - @last_accessed) if !!delay
140
+ sleep delay - (Time.now - @last_accessed) if delay
150
141
  @last_accessed = Time.now
151
142
  end
152
-
153
143
  end
154
- end
144
+ end
@@ -0,0 +1,41 @@
1
+ require 'singleton'
2
+ module Polipus
3
+ class SignalHandler
4
+ include Singleton
5
+ attr_accessor :terminated
6
+ attr_accessor :enabled
7
+
8
+ def initialize
9
+ self.terminated = false
10
+ self.enabled = false
11
+ end
12
+
13
+ def self.enable
14
+ trap(:INT) do
15
+ exit unless self.enabled?
16
+ terminate
17
+ end
18
+ trap(:TERM) do
19
+ exit unless self.enabled?
20
+ terminate
21
+ end
22
+ instance.enabled = true
23
+ end
24
+
25
+ def self.disable
26
+ instance.enabled = false
27
+ end
28
+
29
+ def self.terminate
30
+ instance.terminated = true
31
+ end
32
+
33
+ def self.terminated?
34
+ instance.terminated
35
+ end
36
+
37
+ def self.enabled?
38
+ instance.enabled
39
+ end
40
+ end
41
+ end
@@ -1,25 +1,25 @@
1
- require "polipus/storage/base"
1
+ require 'polipus/storage/base'
2
2
  module Polipus
3
3
  module Storage
4
4
  def self.mongo_store(mongo = nil, collection_name = 'pages', except = [])
5
5
  require 'polipus/storage/mongo_store'
6
- mongo ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
7
- raise "First argument must be an instance of Mongo::DB" unless mongo.is_a?(Mongo::DB)
8
- self::MongoStore.new(:mongo => mongo, :collection => collection_name, :except => except)
6
+ mongo ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
7
+ fail 'First argument must be an instance of Mongo::DB' unless mongo.is_a?(Mongo::DB)
8
+ self::MongoStore.new(mongo: mongo, collection: collection_name, except: except)
9
9
  end
10
10
 
11
11
  def self.s3_store(bucket_name = 'pages', aws_credential = {}, except = [])
12
12
  require 'polipus/storage/s3_store'
13
-
13
+
14
14
  if aws_credential[:access_key_id].nil? || aws_credential[:secret_access_key].nil?
15
- raise "You have to specify AWS crediantials: :access_key_id and :secret_access_key"
15
+ fail 'You have to specify AWS crediantials: :access_key_id and :secret_access_key'
16
16
  end
17
17
 
18
18
  self::S3Store.new(
19
- :bucket => bucket_name,
20
- :access_key_id => aws_credential[:access_key_id],
21
- :secret_access_key => aws_credential[:secret_access_key],
22
- :except => except
19
+ bucket: bucket_name,
20
+ access_key_id: aws_credential[:access_key_id],
21
+ secret_access_key: aws_credential[:secret_access_key],
22
+ except: except
23
23
  )
24
24
  end
25
25
 
@@ -33,4 +33,4 @@ module Polipus
33
33
  self::MemoryStore.new
34
34
  end
35
35
  end
36
- end
36
+ end