polipus 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (51) hide show
  1. checksums.yaml +8 -8
  2. data/.rubocop.yml +17 -0
  3. data/.rubocop_todo.yml +37 -0
  4. data/.travis.yml +2 -1
  5. data/CHANGELOG.md +20 -0
  6. data/README.md +10 -0
  7. data/Rakefile +4 -4
  8. data/examples/basic.rb +16 -19
  9. data/examples/incremental.rb +17 -17
  10. data/examples/robots_txt_handling.rb +1 -1
  11. data/examples/survival.rb +3 -3
  12. data/lib/polipus.rb +186 -229
  13. data/lib/polipus/http.rb +41 -42
  14. data/lib/polipus/page.rb +33 -34
  15. data/lib/polipus/plugin.rb +2 -2
  16. data/lib/polipus/plugins/cleaner.rb +7 -8
  17. data/lib/polipus/plugins/sample.rb +6 -9
  18. data/lib/polipus/plugins/sleeper.rb +7 -8
  19. data/lib/polipus/queue_overflow.rb +11 -11
  20. data/lib/polipus/queue_overflow/base.rb +1 -1
  21. data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
  22. data/lib/polipus/queue_overflow/manager.rb +28 -25
  23. data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
  24. data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
  25. data/lib/polipus/robotex.rb +41 -51
  26. data/lib/polipus/signal_handler.rb +41 -0
  27. data/lib/polipus/storage.rb +11 -11
  28. data/lib/polipus/storage/base.rb +10 -8
  29. data/lib/polipus/storage/dev_null.rb +6 -7
  30. data/lib/polipus/storage/memory_store.rb +21 -22
  31. data/lib/polipus/storage/mongo_store.rb +34 -38
  32. data/lib/polipus/storage/s3_store.rb +33 -38
  33. data/lib/polipus/url_tracker.rb +3 -3
  34. data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
  35. data/lib/polipus/url_tracker/redis_set.rb +3 -4
  36. data/lib/polipus/version.rb +3 -3
  37. data/polipus.gemspec +12 -13
  38. data/spec/clear.rb +3 -3
  39. data/spec/http_spec.rb +27 -28
  40. data/spec/page_spec.rb +16 -16
  41. data/spec/polipus_spec.rb +34 -31
  42. data/spec/queue_overflow_manager_spec.rb +30 -28
  43. data/spec/queue_overflow_spec.rb +15 -15
  44. data/spec/robotex_spec.rb +9 -10
  45. data/spec/signal_handler_spec.rb +18 -0
  46. data/spec/spec_helper.rb +7 -6
  47. data/spec/storage_memory_spec.rb +18 -18
  48. data/spec/storage_mongo_spec.rb +19 -19
  49. data/spec/storage_s3_spec.rb +30 -31
  50. data/spec/url_tracker_spec.rb +7 -7
  51. metadata +7 -2
data/lib/polipus/http.rb CHANGED
@@ -1,6 +1,6 @@
1
- require "net/https"
2
- require "polipus/page"
3
- require "zlib"
1
+ require 'net/https'
2
+ require 'polipus/page'
3
+ require 'zlib'
4
4
  require 'http/cookie'
5
5
 
6
6
  module Polipus
@@ -20,7 +20,8 @@ module Polipus
20
20
  Net::ProtocolError,
21
21
  SocketError,
22
22
  Timeout::Error,
23
- Zlib::DataError
23
+ Zlib::DataError,
24
+ Zlib::GzipFile::Error
24
25
  ]
25
26
 
26
27
  def initialize(opts = {})
@@ -46,14 +47,14 @@ module Polipus
46
47
  pages = []
47
48
  get(url, referer) do |response, code, location, redirect_to, response_time|
48
49
  handle_compression response
49
- pages << Page.new(location, :body => response.body,
50
- :code => code,
51
- :headers => response.to_hash,
52
- :referer => referer,
53
- :depth => depth,
54
- :redirect_to => redirect_to,
55
- :response_time => response_time,
56
- :fetched_at => Time.now.to_i)
50
+ pages << Page.new(location, body: response.body,
51
+ code: code,
52
+ headers: response.to_hash,
53
+ referer: referer,
54
+ depth: depth,
55
+ redirect_to: redirect_to,
56
+ response_time: response_time,
57
+ fetched_at: Time.now.to_i)
57
58
  end
58
59
 
59
60
  pages
@@ -81,7 +82,6 @@ module Polipus
81
82
  @opts[:user_agent]
82
83
  end
83
84
 
84
-
85
85
  #
86
86
  # The proxy address string
87
87
  #
@@ -141,17 +141,18 @@ module Polipus
141
141
  def get(url, referer = nil)
142
142
  limit = redirect_limit
143
143
  loc = url
144
- begin
145
- # if redirected to a relative url, merge it with the host of the original
146
- # request url
147
- loc = url.merge(loc) if loc.relative?
148
-
149
- response, response_time = get_response(loc, referer)
150
- code = Integer(response.code)
151
- redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
152
- yield response, code, loc, redirect_to, response_time
153
- limit -= 1
154
- end while (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
144
+ loop do
145
+ # if redirected to a relative url, merge it with the host of the original
146
+ # request url
147
+ loc = url.merge(loc) if loc.relative?
148
+
149
+ response, response_time = get_response(loc, referer)
150
+ code = Integer(response.code)
151
+ redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
152
+ yield response, code, loc, redirect_to, response_time
153
+ limit -= 1
154
+ break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
155
+ end
155
156
  end
156
157
 
157
158
  #
@@ -166,25 +167,24 @@ module Polipus
166
167
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
167
168
  opts['Accept-Encoding'] = 'gzip,deflate'
168
169
 
169
-
170
170
  retries = 0
171
171
  begin
172
- start = Time.now()
172
+ start = Time.now
173
173
  # format request
174
174
  req = Net::HTTP::Get.new(full_path, opts)
175
175
  # HTTP Basic authentication
176
176
  req.basic_auth url.user, url.password if url.user
177
177
  response = connection(url).request(req)
178
- finish = Time.now()
178
+ finish = Time.now
179
179
  response_time = ((finish - start) * 1000).round
180
- cookie_jar.parse(response["Set-Cookie"], url) if accept_cookies?
180
+ cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies?
181
181
  return response, response_time
182
182
  rescue *RESCUABLE_ERRORS => e
183
183
  puts e.inspect if verbose?
184
184
  refresh_connection(url)
185
185
  retries += 1
186
- unless retries > 3
187
- retry
186
+ if retries < 3
187
+ retry
188
188
  else
189
189
  raise e
190
190
  end
@@ -195,13 +195,13 @@ module Polipus
195
195
  @connections[url.host] ||= {}
196
196
  @connections_hits[url.host] ||= {}
197
197
 
198
- if conn = @connections[url.host][url.port]
198
+ if @connections[url.host][url.port]
199
199
  if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
200
- @opts[:logger].debug {"Connection #{url.host}:#{url.port} is staled, refreshing"} if @opts[:logger]
200
+ @opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
201
201
  return refresh_connection url
202
202
  end
203
203
  @connections_hits[url.host][url.port] += 1
204
- return conn
204
+ return @connections[url.host][url.port]
205
205
  end
206
206
 
207
207
  refresh_connection url
@@ -211,20 +211,20 @@ module Polipus
211
211
  proxy_host, proxy_port = proxy_host_port unless @opts[:proxy_host_port].nil?
212
212
 
213
213
  if @opts[:logger] && proxy_host && proxy_port
214
- @opts[:logger].debug {"Request #{url} using proxy: #{proxy_host}:#{proxy_port}"}
214
+ @opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
215
215
  end
216
216
 
217
217
  http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
218
218
 
219
- http.read_timeout = read_timeout if !!read_timeout
220
- http.open_timeout = open_timeout if !!open_timeout
219
+ http.read_timeout = read_timeout if read_timeout
220
+ http.open_timeout = open_timeout if open_timeout
221
221
 
222
222
  if url.scheme == 'https'
223
223
  http.use_ssl = true
224
224
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
225
225
  end
226
226
  @connections_hits[url.host][url.port] = 1
227
- @connections[url.host][url.port] = http.start
227
+ @connections[url.host][url.port] = http.start
228
228
  end
229
229
 
230
230
  def verbose?
@@ -238,15 +238,14 @@ module Polipus
238
238
  to_url.host.nil? || (to_url.host == from_url.host)
239
239
  end
240
240
 
241
- def handle_compression response
242
- case response["content-encoding"]
243
- when "gzip", "x-gzip"
241
+ def handle_compression(response)
242
+ case response['content-encoding']
243
+ when 'gzip', 'x-gzip'
244
244
  body_io = StringIO.new(response.body)
245
245
  response.body.replace Zlib::GzipReader.new(body_io).read
246
- when "deflate"
246
+ when 'deflate'
247
247
  response.body.replace Zlib::Inflate.inflate(response.body)
248
248
  end
249
249
  end
250
-
251
250
  end
252
251
  end
data/lib/polipus/page.rb CHANGED
@@ -4,7 +4,6 @@ require 'ostruct'
4
4
  require 'set'
5
5
  module Polipus
6
6
  class Page
7
-
8
7
  # The URL of the page
9
8
  attr_reader :url
10
9
  # The raw HTTP response body of the page
@@ -31,7 +30,7 @@ module Polipus
31
30
  attr_accessor :domain_aliases
32
31
 
33
32
  # Whether the current page should be stored
34
- # Default: true
33
+ # Default: true
35
34
  attr_accessor :storable
36
35
 
37
36
  attr_accessor :fetched_at
@@ -64,11 +63,11 @@ module Polipus
64
63
  def links
65
64
  return @links.to_a unless @links.nil?
66
65
  @links = Set.new
67
- return [] if !doc
68
-
69
- doc.search("//a[@href]").each do |a|
66
+ return [] unless doc
67
+
68
+ doc.search('//a[@href]').each do |a|
70
69
  u = a['href']
71
- next if u.nil? or u.empty?
70
+ next if u.nil? || u.empty?
72
71
  abs = to_absolute(u) rescue next
73
72
  @links << abs if in_domain?(abs)
74
73
  end
@@ -118,7 +117,7 @@ module Polipus
118
117
  # otherwise.
119
118
  #
120
119
  def html?
121
- !!(content_type =~ %r{^(text/html|application/xhtml+xml)\b})
120
+ content_type =~ %r{^(text/html|application/xhtml+xml)\b}
122
121
  end
123
122
 
124
123
  #
@@ -151,11 +150,11 @@ module Polipus
151
150
  #
152
151
  def base
153
152
  @base = if doc
154
- href = doc.search('//head/base/@href')
155
- URI(href.to_s) unless href.nil? rescue nil
156
- end unless @base
157
-
158
- return nil if @base && @base.to_s().empty?
153
+ href = doc.search('//head/base/@href')
154
+ URI(href.to_s) unless href.nil? rescue nil
155
+ end unless @base
156
+
157
+ return nil if @base && @base.to_s.empty?
159
158
  @base
160
159
  end
161
160
 
@@ -167,14 +166,14 @@ module Polipus
167
166
  return nil if link.nil?
168
167
 
169
168
  # remove anchor
170
- link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
169
+ link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
171
170
 
172
171
  relative = URI(link)
173
172
  absolute = base ? base.merge(relative) : @url.merge(relative)
174
173
 
175
174
  absolute.path = '/' if absolute.path.empty?
176
175
 
177
- return absolute
176
+ absolute
178
177
  end
179
178
 
180
179
  #
@@ -188,25 +187,25 @@ module Polipus
188
187
 
189
188
  def to_hash
190
189
  {
191
- 'url' => @url.to_s,
192
- 'headers' => Marshal.dump(@headers),
193
- 'body' => @body,
194
- 'links' => links.map(&:to_s),
195
- 'code' => @code,
196
- 'depth' => @depth,
197
- 'referer' => @referer.to_s,
198
- 'redirect_to' => @redirect_to.to_s,
199
- 'response_time' => @response_time,
200
- 'fetched' => @fetched,
201
- 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
202
- 'fetched_at' => @fetched_at,
203
- 'error' => @error
204
- }
190
+ 'url' => @url.to_s,
191
+ 'headers' => Marshal.dump(@headers),
192
+ 'body' => @body,
193
+ 'links' => links.map(&:to_s),
194
+ 'code' => @code,
195
+ 'depth' => @depth,
196
+ 'referer' => @referer.to_s,
197
+ 'redirect_to' => @redirect_to.to_s,
198
+ 'response_time' => @response_time,
199
+ 'fetched' => @fetched,
200
+ 'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
201
+ 'fetched_at' => @fetched_at,
202
+ 'error' => @error
203
+ }
205
204
  end
206
205
 
207
206
  def to_json
208
207
  th = to_hash.dup
209
- th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
208
+ th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
210
209
  th.delete('headers') if content_type.empty?
211
210
  th.to_json
212
211
  end
@@ -220,21 +219,21 @@ module Polipus
220
219
  @storable
221
220
  end
222
221
 
223
- def expired? ttl
222
+ def expired?(ttl)
224
223
  return false if fetched_at.nil?
225
224
  (Time.now.to_i - ttl) > fetched_at
226
225
  end
227
226
 
228
227
  def self.from_hash(hash)
229
- page = self.new(URI(hash['url']))
228
+ page = new(URI(hash['url']))
230
229
  {
231
- '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
230
+ '@headers' => hash['headers'] ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
232
231
  '@body' => hash['body'],
233
232
  '@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
234
233
  '@code' => hash['code'].to_i,
235
234
  '@depth' => hash['depth'].to_i,
236
235
  '@referer' => hash['referer'],
237
- '@redirect_to' => (!!hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
236
+ '@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
238
237
  '@response_time' => hash['response_time'].to_i,
239
238
  '@fetched' => hash['fetched'],
240
239
  '@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
@@ -248,7 +247,7 @@ module Polipus
248
247
 
249
248
  def self.from_json(json)
250
249
  hash = JSON.parse json
251
- self.from_hash hash
250
+ from_hash hash
252
251
  end
253
252
  end
254
253
  end
@@ -1,7 +1,7 @@
1
1
  module Polipus
2
2
  module Plugin
3
3
  @@plugins = {}
4
- def self.register plugin, options = {}
4
+ def self.register(plugin, options = {})
5
5
  o = plugin.new(options)
6
6
  @@plugins[o.class.name] = o
7
7
  end
@@ -10,4 +10,4 @@ module Polipus
10
10
  @@plugins
11
11
  end
12
12
  end
13
- end
13
+ end
@@ -1,25 +1,24 @@
1
1
  module Polipus
2
2
  module Plugin
3
3
  class Cleaner
4
-
5
4
  def initialize(options = {})
6
5
  @reset = options[:reset] ||= false
7
6
  end
8
7
 
9
- def on_initialize crawler
10
- crawler.logger.info {"Cleaner plugin loaded"}
8
+ def on_initialize(crawler)
9
+ crawler.logger.info { 'Cleaner plugin loaded' }
11
10
  unless @reset
12
- crawler.logger.info {"Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing"}
11
+ crawler.logger.info { 'Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing' }
13
12
  return nil
14
13
  end
15
- crawler.logger.info {"Cleaning all: url_tracker, storage, queue"}
16
- Proc.new {
14
+ crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
15
+ proc do
17
16
  url_tracker.clear
18
17
  storage.clear
19
18
  queue_factory.clear
20
19
  @options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
21
- }
20
+ end
22
21
  end
23
22
  end
24
23
  end
25
- end
24
+ end
@@ -1,17 +1,14 @@
1
1
  module Polipus
2
2
  module Plugin
3
3
  class Sample
4
-
5
- def initialize(options = {})
6
-
4
+ def initialize(_options = {})
7
5
  end
8
6
 
9
- def on_initialize crawler
10
- Proc.new {
11
- @options.each { |k,v| @logger.info {"Polipus configuration: #{k.to_s} => #{v}"} }
12
- }
7
+ def on_initialize(_crawler)
8
+ proc do
9
+ @options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
10
+ end
13
11
  end
14
-
15
12
  end
16
13
  end
17
- end
14
+ end
@@ -1,22 +1,21 @@
1
1
  module Polipus
2
2
  module Plugin
3
3
  class Sleeper
4
-
5
4
  def initialize(options = {})
6
5
  @delay = options[:delay] ||= 1
7
6
  end
8
7
 
9
- def on_initialize crawler
10
- crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
11
- Proc.new {
8
+ def on_initialize(crawler)
9
+ crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
10
+ proc do
12
11
  # Set to 1 the number of threads
13
12
  @options[:workers] = 1
14
- }
13
+ end
15
14
  end
16
-
17
- def on_message_processed crawler
15
+
16
+ def on_message_processed(_crawler)
18
17
  sleep @delay
19
18
  end
20
19
  end
21
20
  end
22
- end
21
+ end
@@ -1,24 +1,24 @@
1
- require "polipus/queue_overflow/manager"
1
+ require 'polipus/queue_overflow/manager'
2
2
  module Polipus
3
3
  module QueueOverflow
4
4
  def self.mongo_queue(mongo_db, queue_name, options = {})
5
- require "polipus/queue_overflow/mongo_queue"
6
- mongo_db ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
7
- raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
5
+ require 'polipus/queue_overflow/mongo_queue'
6
+ mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
7
+ fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
8
8
  self::MongoQueue.new mongo_db, queue_name, options
9
9
  end
10
10
 
11
11
  def self.mongo_queue_capped(mongo_db, queue_name, options = {})
12
- require "polipus/queue_overflow/mongo_queue_capped"
13
- mongo_db ||= Mongo::Connection.new("localhost", 27017, :pool_size => 15, :pool_timeout => 5).db('polipus')
14
- raise "First argument must be an instance of Mongo::DB" unless mongo_db.is_a?(Mongo::DB)
12
+ require 'polipus/queue_overflow/mongo_queue_capped'
13
+ mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
14
+ fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
15
15
  options[:max] = 1_000_000 if options[:max].nil?
16
16
  self::MongoQueueCapped.new mongo_db, queue_name, options
17
17
  end
18
-
19
- def self.dev_null_queue(options = {})
20
- require "polipus/queue_overflow/dev_null_queue"
18
+
19
+ def self.dev_null_queue(_options = {})
20
+ require 'polipus/queue_overflow/dev_null_queue'
21
21
  self::DevNullQueue.new
22
22
  end
23
23
  end
24
- end
24
+ end