polipus 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
data/lib/polipus/http.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'net/https'
|
2
|
+
require 'polipus/page'
|
3
|
+
require 'zlib'
|
4
4
|
require 'http/cookie'
|
5
5
|
|
6
6
|
module Polipus
|
@@ -20,7 +20,8 @@ module Polipus
|
|
20
20
|
Net::ProtocolError,
|
21
21
|
SocketError,
|
22
22
|
Timeout::Error,
|
23
|
-
Zlib::DataError
|
23
|
+
Zlib::DataError,
|
24
|
+
Zlib::GzipFile::Error
|
24
25
|
]
|
25
26
|
|
26
27
|
def initialize(opts = {})
|
@@ -46,14 +47,14 @@ module Polipus
|
|
46
47
|
pages = []
|
47
48
|
get(url, referer) do |response, code, location, redirect_to, response_time|
|
48
49
|
handle_compression response
|
49
|
-
pages << Page.new(location, :
|
50
|
-
:
|
51
|
-
:
|
52
|
-
:
|
53
|
-
:
|
54
|
-
:
|
55
|
-
:
|
56
|
-
:
|
50
|
+
pages << Page.new(location, body: response.body,
|
51
|
+
code: code,
|
52
|
+
headers: response.to_hash,
|
53
|
+
referer: referer,
|
54
|
+
depth: depth,
|
55
|
+
redirect_to: redirect_to,
|
56
|
+
response_time: response_time,
|
57
|
+
fetched_at: Time.now.to_i)
|
57
58
|
end
|
58
59
|
|
59
60
|
pages
|
@@ -81,7 +82,6 @@ module Polipus
|
|
81
82
|
@opts[:user_agent]
|
82
83
|
end
|
83
84
|
|
84
|
-
|
85
85
|
#
|
86
86
|
# The proxy address string
|
87
87
|
#
|
@@ -141,17 +141,18 @@ module Polipus
|
|
141
141
|
def get(url, referer = nil)
|
142
142
|
limit = redirect_limit
|
143
143
|
loc = url
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
144
|
+
loop do
|
145
|
+
# if redirected to a relative url, merge it with the host of the original
|
146
|
+
# request url
|
147
|
+
loc = url.merge(loc) if loc.relative?
|
148
|
+
|
149
|
+
response, response_time = get_response(loc, referer)
|
150
|
+
code = Integer(response.code)
|
151
|
+
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
152
|
+
yield response, code, loc, redirect_to, response_time
|
153
|
+
limit -= 1
|
154
|
+
break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
155
|
+
end
|
155
156
|
end
|
156
157
|
|
157
158
|
#
|
@@ -166,25 +167,24 @@ module Polipus
|
|
166
167
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
167
168
|
opts['Accept-Encoding'] = 'gzip,deflate'
|
168
169
|
|
169
|
-
|
170
170
|
retries = 0
|
171
171
|
begin
|
172
|
-
start = Time.now
|
172
|
+
start = Time.now
|
173
173
|
# format request
|
174
174
|
req = Net::HTTP::Get.new(full_path, opts)
|
175
175
|
# HTTP Basic authentication
|
176
176
|
req.basic_auth url.user, url.password if url.user
|
177
177
|
response = connection(url).request(req)
|
178
|
-
finish = Time.now
|
178
|
+
finish = Time.now
|
179
179
|
response_time = ((finish - start) * 1000).round
|
180
|
-
cookie_jar.parse(response[
|
180
|
+
cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies?
|
181
181
|
return response, response_time
|
182
182
|
rescue *RESCUABLE_ERRORS => e
|
183
183
|
puts e.inspect if verbose?
|
184
184
|
refresh_connection(url)
|
185
185
|
retries += 1
|
186
|
-
|
187
|
-
retry
|
186
|
+
if retries < 3
|
187
|
+
retry
|
188
188
|
else
|
189
189
|
raise e
|
190
190
|
end
|
@@ -195,13 +195,13 @@ module Polipus
|
|
195
195
|
@connections[url.host] ||= {}
|
196
196
|
@connections_hits[url.host] ||= {}
|
197
197
|
|
198
|
-
if
|
198
|
+
if @connections[url.host][url.port]
|
199
199
|
if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
|
200
|
-
@opts[:logger].debug {"Connection #{url.host}:#{url.port} is staled, refreshing"} if @opts[:logger]
|
200
|
+
@opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
|
201
201
|
return refresh_connection url
|
202
202
|
end
|
203
203
|
@connections_hits[url.host][url.port] += 1
|
204
|
-
return
|
204
|
+
return @connections[url.host][url.port]
|
205
205
|
end
|
206
206
|
|
207
207
|
refresh_connection url
|
@@ -211,20 +211,20 @@ module Polipus
|
|
211
211
|
proxy_host, proxy_port = proxy_host_port unless @opts[:proxy_host_port].nil?
|
212
212
|
|
213
213
|
if @opts[:logger] && proxy_host && proxy_port
|
214
|
-
@opts[:logger].debug {"Request #{url} using proxy: #{proxy_host}:#{proxy_port}"}
|
214
|
+
@opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
|
215
215
|
end
|
216
216
|
|
217
217
|
http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
|
218
218
|
|
219
|
-
http.read_timeout = read_timeout if
|
220
|
-
http.open_timeout = open_timeout if
|
219
|
+
http.read_timeout = read_timeout if read_timeout
|
220
|
+
http.open_timeout = open_timeout if open_timeout
|
221
221
|
|
222
222
|
if url.scheme == 'https'
|
223
223
|
http.use_ssl = true
|
224
224
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
225
225
|
end
|
226
226
|
@connections_hits[url.host][url.port] = 1
|
227
|
-
@connections[url.host][url.port] = http.start
|
227
|
+
@connections[url.host][url.port] = http.start
|
228
228
|
end
|
229
229
|
|
230
230
|
def verbose?
|
@@ -238,15 +238,14 @@ module Polipus
|
|
238
238
|
to_url.host.nil? || (to_url.host == from_url.host)
|
239
239
|
end
|
240
240
|
|
241
|
-
def handle_compression
|
242
|
-
case response[
|
243
|
-
when
|
241
|
+
def handle_compression(response)
|
242
|
+
case response['content-encoding']
|
243
|
+
when 'gzip', 'x-gzip'
|
244
244
|
body_io = StringIO.new(response.body)
|
245
245
|
response.body.replace Zlib::GzipReader.new(body_io).read
|
246
|
-
when
|
246
|
+
when 'deflate'
|
247
247
|
response.body.replace Zlib::Inflate.inflate(response.body)
|
248
248
|
end
|
249
249
|
end
|
250
|
-
|
251
250
|
end
|
252
251
|
end
|
data/lib/polipus/page.rb
CHANGED
@@ -4,7 +4,6 @@ require 'ostruct'
|
|
4
4
|
require 'set'
|
5
5
|
module Polipus
|
6
6
|
class Page
|
7
|
-
|
8
7
|
# The URL of the page
|
9
8
|
attr_reader :url
|
10
9
|
# The raw HTTP response body of the page
|
@@ -31,7 +30,7 @@ module Polipus
|
|
31
30
|
attr_accessor :domain_aliases
|
32
31
|
|
33
32
|
# Whether the current page should be stored
|
34
|
-
# Default: true
|
33
|
+
# Default: true
|
35
34
|
attr_accessor :storable
|
36
35
|
|
37
36
|
attr_accessor :fetched_at
|
@@ -64,11 +63,11 @@ module Polipus
|
|
64
63
|
def links
|
65
64
|
return @links.to_a unless @links.nil?
|
66
65
|
@links = Set.new
|
67
|
-
return []
|
68
|
-
|
69
|
-
doc.search(
|
66
|
+
return [] unless doc
|
67
|
+
|
68
|
+
doc.search('//a[@href]').each do |a|
|
70
69
|
u = a['href']
|
71
|
-
next if u.nil?
|
70
|
+
next if u.nil? || u.empty?
|
72
71
|
abs = to_absolute(u) rescue next
|
73
72
|
@links << abs if in_domain?(abs)
|
74
73
|
end
|
@@ -118,7 +117,7 @@ module Polipus
|
|
118
117
|
# otherwise.
|
119
118
|
#
|
120
119
|
def html?
|
121
|
-
|
120
|
+
content_type =~ %r{^(text/html|application/xhtml+xml)\b}
|
122
121
|
end
|
123
122
|
|
124
123
|
#
|
@@ -151,11 +150,11 @@ module Polipus
|
|
151
150
|
#
|
152
151
|
def base
|
153
152
|
@base = if doc
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
return nil if @base && @base.to_s
|
153
|
+
href = doc.search('//head/base/@href')
|
154
|
+
URI(href.to_s) unless href.nil? rescue nil
|
155
|
+
end unless @base
|
156
|
+
|
157
|
+
return nil if @base && @base.to_s.empty?
|
159
158
|
@base
|
160
159
|
end
|
161
160
|
|
@@ -167,14 +166,14 @@ module Polipus
|
|
167
166
|
return nil if link.nil?
|
168
167
|
|
169
168
|
# remove anchor
|
170
|
-
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
|
169
|
+
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
171
170
|
|
172
171
|
relative = URI(link)
|
173
172
|
absolute = base ? base.merge(relative) : @url.merge(relative)
|
174
173
|
|
175
174
|
absolute.path = '/' if absolute.path.empty?
|
176
175
|
|
177
|
-
|
176
|
+
absolute
|
178
177
|
end
|
179
178
|
|
180
179
|
#
|
@@ -188,25 +187,25 @@ module Polipus
|
|
188
187
|
|
189
188
|
def to_hash
|
190
189
|
{
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
190
|
+
'url' => @url.to_s,
|
191
|
+
'headers' => Marshal.dump(@headers),
|
192
|
+
'body' => @body,
|
193
|
+
'links' => links.map(&:to_s),
|
194
|
+
'code' => @code,
|
195
|
+
'depth' => @depth,
|
196
|
+
'referer' => @referer.to_s,
|
197
|
+
'redirect_to' => @redirect_to.to_s,
|
198
|
+
'response_time' => @response_time,
|
199
|
+
'fetched' => @fetched,
|
200
|
+
'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
|
201
|
+
'fetched_at' => @fetched_at,
|
202
|
+
'error' => @error
|
203
|
+
}
|
205
204
|
end
|
206
205
|
|
207
206
|
def to_json
|
208
207
|
th = to_hash.dup
|
209
|
-
th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
|
208
|
+
th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
|
210
209
|
th.delete('headers') if content_type.empty?
|
211
210
|
th.to_json
|
212
211
|
end
|
@@ -220,21 +219,21 @@ module Polipus
|
|
220
219
|
@storable
|
221
220
|
end
|
222
221
|
|
223
|
-
def expired?
|
222
|
+
def expired?(ttl)
|
224
223
|
return false if fetched_at.nil?
|
225
224
|
(Time.now.to_i - ttl) > fetched_at
|
226
225
|
end
|
227
226
|
|
228
227
|
def self.from_hash(hash)
|
229
|
-
page =
|
228
|
+
page = new(URI(hash['url']))
|
230
229
|
{
|
231
|
-
'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
|
230
|
+
'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
|
232
231
|
'@body' => hash['body'],
|
233
232
|
'@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
|
234
233
|
'@code' => hash['code'].to_i,
|
235
234
|
'@depth' => hash['depth'].to_i,
|
236
235
|
'@referer' => hash['referer'],
|
237
|
-
'@redirect_to' => (
|
236
|
+
'@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
|
238
237
|
'@response_time' => hash['response_time'].to_i,
|
239
238
|
'@fetched' => hash['fetched'],
|
240
239
|
'@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
|
@@ -248,7 +247,7 @@ module Polipus
|
|
248
247
|
|
249
248
|
def self.from_json(json)
|
250
249
|
hash = JSON.parse json
|
251
|
-
|
250
|
+
from_hash hash
|
252
251
|
end
|
253
252
|
end
|
254
253
|
end
|
data/lib/polipus/plugin.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
@@plugins = {}
|
4
|
-
def self.register
|
4
|
+
def self.register(plugin, options = {})
|
5
5
|
o = plugin.new(options)
|
6
6
|
@@plugins[o.class.name] = o
|
7
7
|
end
|
@@ -10,4 +10,4 @@ module Polipus
|
|
10
10
|
@@plugins
|
11
11
|
end
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
data/lib/polipus/plugins/cleaner.rb
CHANGED
@@ -1,25 +1,24 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
class Cleaner
|
4
|
-
|
5
4
|
def initialize(options = {})
|
6
5
|
@reset = options[:reset] ||= false
|
7
6
|
end
|
8
7
|
|
9
|
-
def on_initialize
|
10
|
-
crawler.logger.info {
|
8
|
+
def on_initialize(crawler)
|
9
|
+
crawler.logger.info { 'Cleaner plugin loaded' }
|
11
10
|
unless @reset
|
12
|
-
crawler.logger.info {
|
11
|
+
crawler.logger.info { 'Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing' }
|
13
12
|
return nil
|
14
13
|
end
|
15
|
-
crawler.logger.info {
|
16
|
-
|
14
|
+
crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
|
15
|
+
proc do
|
17
16
|
url_tracker.clear
|
18
17
|
storage.clear
|
19
18
|
queue_factory.clear
|
20
19
|
@options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
|
21
|
-
|
20
|
+
end
|
22
21
|
end
|
23
22
|
end
|
24
23
|
end
|
25
|
-
end
|
24
|
+
end
|
data/lib/polipus/plugins/sample.rb
CHANGED
@@ -1,17 +1,14 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
class Sample
|
4
|
-
|
5
|
-
def initialize(options = {})
|
6
|
-
|
4
|
+
def initialize(_options = {})
|
7
5
|
end
|
8
6
|
|
9
|
-
def on_initialize
|
10
|
-
|
11
|
-
@options.each { |k,v| @logger.info {"Polipus configuration: #{k
|
12
|
-
|
7
|
+
def on_initialize(_crawler)
|
8
|
+
proc do
|
9
|
+
@options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
|
10
|
+
end
|
13
11
|
end
|
14
|
-
|
15
12
|
end
|
16
13
|
end
|
17
|
-
end
|
14
|
+
end
|
data/lib/polipus/plugins/sleeper.rb
CHANGED
@@ -1,22 +1,21 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
class Sleeper
|
4
|
-
|
5
4
|
def initialize(options = {})
|
6
5
|
@delay = options[:delay] ||= 1
|
7
6
|
end
|
8
7
|
|
9
|
-
def on_initialize
|
10
|
-
crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
|
11
|
-
|
8
|
+
def on_initialize(crawler)
|
9
|
+
crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
|
10
|
+
proc do
|
12
11
|
# Set to 1 the number of threads
|
13
12
|
@options[:workers] = 1
|
14
|
-
|
13
|
+
end
|
15
14
|
end
|
16
|
-
|
17
|
-
def on_message_processed
|
15
|
+
|
16
|
+
def on_message_processed(_crawler)
|
18
17
|
sleep @delay
|
19
18
|
end
|
20
19
|
end
|
21
20
|
end
|
22
|
-
end
|
21
|
+
end
|
data/lib/polipus/queue_overflow.rb
CHANGED
@@ -1,24 +1,24 @@
|
|
1
|
-
require
|
1
|
+
require 'polipus/queue_overflow/manager'
|
2
2
|
module Polipus
|
3
3
|
module QueueOverflow
|
4
4
|
def self.mongo_queue(mongo_db, queue_name, options = {})
|
5
|
-
require
|
6
|
-
mongo_db ||= Mongo::Connection.new(
|
7
|
-
|
5
|
+
require 'polipus/queue_overflow/mongo_queue'
|
6
|
+
mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
|
7
|
+
fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
|
8
8
|
self::MongoQueue.new mongo_db, queue_name, options
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.mongo_queue_capped(mongo_db, queue_name, options = {})
|
12
|
-
require
|
13
|
-
mongo_db ||= Mongo::Connection.new(
|
14
|
-
|
12
|
+
require 'polipus/queue_overflow/mongo_queue_capped'
|
13
|
+
mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
|
14
|
+
fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
|
15
15
|
options[:max] = 1_000_000 if options[:max].nil?
|
16
16
|
self::MongoQueueCapped.new mongo_db, queue_name, options
|
17
17
|
end
|
18
|
-
|
19
|
-
def self.dev_null_queue(
|
20
|
-
require
|
18
|
+
|
19
|
+
def self.dev_null_queue(_options = {})
|
20
|
+
require 'polipus/queue_overflow/dev_null_queue'
|
21
21
|
self::DevNullQueue.new
|
22
22
|
end
|
23
23
|
end
|
24
|
-
end
|
24
|
+
end
|