polipus 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
data/lib/polipus/http.rb CHANGED
@@ -1,6 +1,6 @@
- require "net/https"
- require "polipus/page"
- require "zlib"
+ require 'net/https'
+ require 'polipus/page'
+ require 'zlib'
  require 'http/cookie'

  module Polipus
@@ -20,7 +20,8 @@ module Polipus
  Net::ProtocolError,
  SocketError,
  Timeout::Error,
- Zlib::DataError
+ Zlib::DataError,
+ Zlib::GzipFile::Error
  ]

  def initialize(opts = {})
@@ -46,14 +47,14 @@ module Polipus
  pages = []
  get(url, referer) do |response, code, location, redirect_to, response_time|
  handle_compression response
- pages << Page.new(location, :body => response.body,
- :code => code,
- :headers => response.to_hash,
- :referer => referer,
- :depth => depth,
- :redirect_to => redirect_to,
- :response_time => response_time,
- :fetched_at => Time.now.to_i)
+ pages << Page.new(location, body: response.body,
+ code: code,
+ headers: response.to_hash,
+ referer: referer,
+ depth: depth,
+ redirect_to: redirect_to,
+ response_time: response_time,
+ fetched_at: Time.now.to_i)
  end

  pages
@@ -81,7 +82,6 @@ module Polipus
  @opts[:user_agent]
  end

-
  #
  # The proxy address string
  #
@@ -141,17 +141,18 @@ module Polipus
  def get(url, referer = nil)
  limit = redirect_limit
  loc = url
- begin
- # if redirected to a relative url, merge it with the host of the original
- # request url
- loc = url.merge(loc) if loc.relative?
-
- response, response_time = get_response(loc, referer)
- code = Integer(response.code)
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
- yield response, code, loc, redirect_to, response_time
- limit -= 1
- end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+ loop do
+ # if redirected to a relative url, merge it with the host of the original
+ # request url
+ loc = url.merge(loc) if loc.relative?
+
+ response, response_time = get_response(loc, referer)
+ code = Integer(response.code)
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
+ yield response, code, loc, redirect_to, response_time
+ limit -= 1
+ break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
+ end
  end

  #
@@ -166,25 +167,24 @@ module Polipus
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
  opts['Accept-Encoding'] = 'gzip,deflate'

-
  retries = 0
  begin
- start = Time.now()
+ start = Time.now
  # format request
  req = Net::HTTP::Get.new(full_path, opts)
  # HTTP Basic authentication
  req.basic_auth url.user, url.password if url.user
  response = connection(url).request(req)
- finish = Time.now()
+ finish = Time.now
  response_time = ((finish - start) * 1000).round
- cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
+ cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies?
  return response, response_time
  rescue *RESCUABLE_ERRORS => e
  puts e.inspect if verbose?
  refresh_connection(url)
  retries += 1
- unless retries > 3
- retry
+ if retries < 3
+ retry
  else
  raise e
  end
@@ -195,13 +195,13 @@ module Polipus
  @connections[url.host] ||= {}
  @connections_hits[url.host] ||= {}

- if conn = @connections[url.host][url.port]
+ if @connections[url.host][url.port]
  if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
- @opts[:logger].debug {"Connection #{url.host}:#{url.port} is staled, refreshing"} if @opts[:logger]
+ @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
  return refresh_connection url
  end
  @connections_hits[url.host][url.port] += 1
- return conn
+ return @connections[url.host][url.port]
  end

  refresh_connection url
@@ -211,20 +211,20 @@ module Polipus
  proxy_host, proxy_port = proxy_host_port unless @opts[:proxy_host_port].nil?

  if @opts[:logger] && proxy_host && proxy_port
- @opts[:logger].debug {"Request #{url} using proxy: #{proxy_host}:#{proxy_port}"}
+ @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
  end

  http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)

- http.read_timeout = read_timeout if !!read_timeout
- http.open_timeout = open_timeout if !!open_timeout
+ http.read_timeout = read_timeout if read_timeout
+ http.open_timeout = open_timeout if open_timeout

  if url.scheme == 'https'
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
  end
  @connections_hits[url.host][url.port] = 1
- @connections[url.host][url.port] = http.start
+ @connections[url.host][url.port] = http.start
  end

  def verbose?
@@ -238,15 +238,14 @@ module Polipus
  to_url.host.nil? || (to_url.host == from_url.host)
  end

- def handle_compression response
- case response["content-encoding"]
- when "gzip", "x-gzip"
+ def handle_compression(response)
+ case response['content-encoding']
+ when 'gzip', 'x-gzip'
  body_io = StringIO.new(response.body)
  response.body.replace Zlib::GzipReader.new(body_io).read
- when "deflate"
+ when 'deflate'
  response.body.replace Zlib::Inflate.inflate(response.body)
  end
  end
-
  end
  end
data/lib/polipus/page.rb CHANGED
@@ -4,7 +4,6 @@ require 'ostruct'
  require 'set'
  module Polipus
  class Page
-
  # The URL of the page
  attr_reader :url
  # The raw HTTP response body of the page
@@ -31,7 +30,7 @@
  attr_accessor :domain_aliases

  # Whether the current page should be stored
- # Default: true
+ # Default: true
  attr_accessor :storable

  attr_accessor :fetched_at
@@ -64,11 +63,11 @@
  def links
  return @links.to_a unless @links.nil?
  @links = Set.new
- return [] if !doc
-
- doc.search("//a[@href]").each do |a|
+ return [] unless doc
+
+ doc.search('//a[@href]').each do |a|
  u = a['href']
- next if u.nil? or u.empty?
+ next if u.nil? || u.empty?
  abs = to_absolute(u) rescue next
  @links << abs if in_domain?(abs)
  end
@@ -118,7 +117,7 @@
  # otherwise.
  #
  def html?
- !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
+ content_type =~ %r{^(text/html|application/xhtml+xml)\b}
  end

  #
@@ -151,11 +150,11 @@
  #
  def base
  @base = if doc
- href = doc.search('//head/base/@href')
- URI(href.to_s) unless href.nil? rescue nil
- end unless @base
-
- return nil if @base && @base.to_s().empty?
+ href = doc.search('//head/base/@href')
+ URI(href.to_s) unless href.nil? rescue nil
+ end unless @base
+
+ return nil if @base && @base.to_s.empty?
  @base
  end

@@ -167,14 +166,14 @@
  return nil if link.nil?

  # remove anchor
- link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
+ link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))

  relative = URI(link)
  absolute = base ? base.merge(relative) : @url.merge(relative)

  absolute.path = '/' if absolute.path.empty?

- return absolute
+ absolute
  end

  #
@@ -188,25 +187,25 @@

  def to_hash
  {
- 'url' => @url.to_s,
- 'headers' => Marshal.dump(@headers),
- 'body' => @body,
- 'links' => links.map(&:to_s),
- 'code' => @code,
- 'depth' => @depth,
- 'referer' => @referer.to_s,
- 'redirect_to' => @redirect_to.to_s,
- 'response_time' => @response_time,
- 'fetched' => @fetched,
- 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
- 'fetched_at' => @fetched_at,
- 'error' => @error
- }
+ 'url' => @url.to_s,
+ 'headers' => Marshal.dump(@headers),
+ 'body' => @body,
+ 'links' => links.map(&:to_s),
+ 'code' => @code,
+ 'depth' => @depth,
+ 'referer' => @referer.to_s,
+ 'redirect_to' => @redirect_to.to_s,
+ 'response_time' => @response_time,
+ 'fetched' => @fetched,
+ 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
+ 'fetched_at' => @fetched_at,
+ 'error' => @error
+ }
  end

  def to_json
  th = to_hash.dup
- th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
+ th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
  th.delete('headers') if content_type.empty?
  th.to_json
  end
@@ -220,21 +219,21 @@
  @storable
  end

- def expired? ttl
+ def expired?(ttl)
  return false if fetched_at.nil?
  (Time.now.to_i - ttl) > fetched_at
  end

  def self.from_hash(hash)
- page = self.new(URI(hash['url']))
+ page = new(URI(hash['url']))
  {
- '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
+ '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
  '@body' => hash['body'],
  '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
  '@code' => hash['code'].to_i,
  '@depth' => hash['depth'].to_i,
  '@referer' => hash['referer'],
- '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
+ '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
  '@response_time' => hash['response_time'].to_i,
  '@fetched' => hash['fetched'],
  '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
@@ -248,7 +247,7 @@ module Polipus

  def self.from_json(json)
  hash = JSON.parse json
- self.from_hash hash
+ from_hash hash
  end
  end
  end
data/lib/polipus/plugin.rb CHANGED
@@ -1,7 +1,7 @@
  module Polipus
  module Plugin
  @@plugins = {}
- def self.register plugin, options = {}
+ def self.register(plugin, options = {})
  o = plugin.new(options)
  @@plugins[o.class.name] = o
  end
@@ -10,4 +10,4 @@ module Polipus
  @@plugins
  end
  end
- end
+ end
data/lib/polipus/plugins/cleaner.rb CHANGED
@@ -1,25 +1,24 @@
  module Polipus
  module Plugin
  class Cleaner
-
  def initialize(options = {})
  @reset = options[:reset] ||= false
  end

- def on_initialize crawler
- crawler.logger.info {"Cleaner plugin loaded"}
+ def on_initialize(crawler)
+ crawler.logger.info { 'Cleaner plugin loaded' }
  unless @reset
- crawler.logger.info {"Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing"}
+ crawler.logger.info { 'Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing' }
  return nil
  end
- crawler.logger.info {"Cleaning all: url_tracker, storage, queue"}
- Proc.new {
+ crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
+ proc do
  url_tracker.clear
  storage.clear
  queue_factory.clear
  @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
- }
+ end
  end
  end
  end
- end
+ end
data/lib/polipus/plugins/sample.rb CHANGED
@@ -1,17 +1,14 @@
  module Polipus
  module Plugin
  class Sample
-
- def initialize(options = {})
-
+ def initialize(_options = {})
  end

- def on_initialize crawler
- Proc.new {
- @options.each { |k,v| @logger.info {"Polipus configuration: #{k.to_s} => #{v}"} }
- }
+ def on_initialize(_crawler)
+ proc do
+ @options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
+ end
  end
-
  end
  end
- end
+ end
data/lib/polipus/plugins/sleeper.rb CHANGED
@@ -1,22 +1,21 @@
  module Polipus
  module Plugin
  class Sleeper
-
  def initialize(options = {})
  @delay = options[:delay] ||= 1
  end

- def on_initialize crawler
- crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
- Proc.new {
+ def on_initialize(crawler)
+ crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
+ proc do
  # Set to 1 the number of threads
  @options[:workers] = 1
- }
+ end
  end
-
- def on_message_processed crawler
+
+ def on_message_processed(_crawler)
  sleep @delay
  end
  end
  end
- end
+ end
data/lib/polipus/queue_overflow.rb CHANGED
@@ -1,24 +1,24 @@
- require "polipus/queue_overflow/manager"
+ require 'polipus/queue_overflow/manager'
  module Polipus
  module QueueOverflow
  def self.mongo_queue(mongo_db, queue_name, options = {})
- require "polipus/queue_overflow/mongo_queue"
- mongo_db ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
- raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
+ require 'polipus/queue_overflow/mongo_queue'
+ mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
+ fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
  self::MongoQueue.new mongo_db, queue_name, options
  end

  def self.mongo_queue_capped(mongo_db, queue_name, options = {})
- require "polipus/queue_overflow/mongo_queue_capped"
- mongo_db ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
- raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
+ require 'polipus/queue_overflow/mongo_queue_capped'
+ mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
+ fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
  options[:max] = 1_000_000 if options[:max].nil?
  self::MongoQueueCapped.new mongo_db, queue_name, options
  end
-
- def self.dev_null_queue(options = {})
- require "polipus/queue_overflow/dev_null_queue"
+
+ def self.dev_null_queue(_options = {})
+ require 'polipus/queue_overflow/dev_null_queue'
  self::DevNullQueue.new
  end
  end
- end
+ end