polipus 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.rubocop.yml +17 -0
- data/.rubocop_todo.yml +37 -0
- data/.travis.yml +2 -1
- data/CHANGELOG.md +20 -0
- data/README.md +10 -0
- data/Rakefile +4 -4
- data/examples/basic.rb +16 -19
- data/examples/incremental.rb +17 -17
- data/examples/robots_txt_handling.rb +1 -1
- data/examples/survival.rb +3 -3
- data/lib/polipus.rb +186 -229
- data/lib/polipus/http.rb +41 -42
- data/lib/polipus/page.rb +33 -34
- data/lib/polipus/plugin.rb +2 -2
- data/lib/polipus/plugins/cleaner.rb +7 -8
- data/lib/polipus/plugins/sample.rb +6 -9
- data/lib/polipus/plugins/sleeper.rb +7 -8
- data/lib/polipus/queue_overflow.rb +11 -11
- data/lib/polipus/queue_overflow/base.rb +1 -1
- data/lib/polipus/queue_overflow/dev_null_queue.rb +9 -9
- data/lib/polipus/queue_overflow/manager.rb +28 -25
- data/lib/polipus/queue_overflow/mongo_queue.rb +24 -26
- data/lib/polipus/queue_overflow/mongo_queue_capped.rb +12 -12
- data/lib/polipus/robotex.rb +41 -51
- data/lib/polipus/signal_handler.rb +41 -0
- data/lib/polipus/storage.rb +11 -11
- data/lib/polipus/storage/base.rb +10 -8
- data/lib/polipus/storage/dev_null.rb +6 -7
- data/lib/polipus/storage/memory_store.rb +21 -22
- data/lib/polipus/storage/mongo_store.rb +34 -38
- data/lib/polipus/storage/s3_store.rb +33 -38
- data/lib/polipus/url_tracker.rb +3 -3
- data/lib/polipus/url_tracker/bloomfilter.rb +4 -5
- data/lib/polipus/url_tracker/redis_set.rb +3 -4
- data/lib/polipus/version.rb +3 -3
- data/polipus.gemspec +12 -13
- data/spec/clear.rb +3 -3
- data/spec/http_spec.rb +27 -28
- data/spec/page_spec.rb +16 -16
- data/spec/polipus_spec.rb +34 -31
- data/spec/queue_overflow_manager_spec.rb +30 -28
- data/spec/queue_overflow_spec.rb +15 -15
- data/spec/robotex_spec.rb +9 -10
- data/spec/signal_handler_spec.rb +18 -0
- data/spec/spec_helper.rb +7 -6
- data/spec/storage_memory_spec.rb +18 -18
- data/spec/storage_mongo_spec.rb +19 -19
- data/spec/storage_s3_spec.rb +30 -31
- data/spec/url_tracker_spec.rb +7 -7
- metadata +7 -2
data/lib/polipus/http.rb
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
1
|
+
require 'net/https'
|
2
|
+
require 'polipus/page'
|
3
|
+
require 'zlib'
|
4
4
|
require 'http/cookie'
|
5
5
|
|
6
6
|
module Polipus
|
@@ -20,7 +20,8 @@ module Polipus
|
|
20
20
|
Net::ProtocolError,
|
21
21
|
SocketError,
|
22
22
|
Timeout::Error,
|
23
|
-
Zlib::DataError
|
23
|
+
Zlib::DataError,
|
24
|
+
Zlib::GzipFile::Error
|
24
25
|
]
|
25
26
|
|
26
27
|
def initialize(opts = {})
|
@@ -46,14 +47,14 @@ module Polipus
|
|
46
47
|
pages = []
|
47
48
|
get(url, referer) do |response, code, location, redirect_to, response_time|
|
48
49
|
handle_compression response
|
49
|
-
pages << Page.new(location, :
|
50
|
-
:
|
51
|
-
:
|
52
|
-
:
|
53
|
-
:
|
54
|
-
:
|
55
|
-
:
|
56
|
-
:
|
50
|
+
pages << Page.new(location, body: response.body,
|
51
|
+
code: code,
|
52
|
+
headers: response.to_hash,
|
53
|
+
referer: referer,
|
54
|
+
depth: depth,
|
55
|
+
redirect_to: redirect_to,
|
56
|
+
response_time: response_time,
|
57
|
+
fetched_at: Time.now.to_i)
|
57
58
|
end
|
58
59
|
|
59
60
|
pages
|
@@ -81,7 +82,6 @@ module Polipus
|
|
81
82
|
@opts[:user_agent]
|
82
83
|
end
|
83
84
|
|
84
|
-
|
85
85
|
#
|
86
86
|
# The proxy address string
|
87
87
|
#
|
@@ -141,17 +141,18 @@ module Polipus
|
|
141
141
|
def get(url, referer = nil)
|
142
142
|
limit = redirect_limit
|
143
143
|
loc = url
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
144
|
+
loop do
|
145
|
+
# if redirected to a relative url, merge it with the host of the original
|
146
|
+
# request url
|
147
|
+
loc = url.merge(loc) if loc.relative?
|
148
|
+
|
149
|
+
response, response_time = get_response(loc, referer)
|
150
|
+
code = Integer(response.code)
|
151
|
+
redirect_to = response.is_a?(Net::HTTPRedirection) ? URI(response['location']).normalize : nil
|
152
|
+
yield response, code, loc, redirect_to, response_time
|
153
|
+
limit -= 1
|
154
|
+
break unless (loc = redirect_to) && allowed?(redirect_to, url) && limit > 0
|
155
|
+
end
|
155
156
|
end
|
156
157
|
|
157
158
|
#
|
@@ -166,25 +167,24 @@ module Polipus
|
|
166
167
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
167
168
|
opts['Accept-Encoding'] = 'gzip,deflate'
|
168
169
|
|
169
|
-
|
170
170
|
retries = 0
|
171
171
|
begin
|
172
|
-
start = Time.now
|
172
|
+
start = Time.now
|
173
173
|
# format request
|
174
174
|
req = Net::HTTP::Get.new(full_path, opts)
|
175
175
|
# HTTP Basic authentication
|
176
176
|
req.basic_auth url.user, url.password if url.user
|
177
177
|
response = connection(url).request(req)
|
178
|
-
finish = Time.now
|
178
|
+
finish = Time.now
|
179
179
|
response_time = ((finish - start) * 1000).round
|
180
|
-
cookie_jar.parse(response[
|
180
|
+
cookie_jar.parse(response['Set-Cookie'], url) if accept_cookies?
|
181
181
|
return response, response_time
|
182
182
|
rescue *RESCUABLE_ERRORS => e
|
183
183
|
puts e.inspect if verbose?
|
184
184
|
refresh_connection(url)
|
185
185
|
retries += 1
|
186
|
-
|
187
|
-
retry
|
186
|
+
if retries < 3
|
187
|
+
retry
|
188
188
|
else
|
189
189
|
raise e
|
190
190
|
end
|
@@ -195,13 +195,13 @@ module Polipus
|
|
195
195
|
@connections[url.host] ||= {}
|
196
196
|
@connections_hits[url.host] ||= {}
|
197
197
|
|
198
|
-
if
|
198
|
+
if @connections[url.host][url.port]
|
199
199
|
if @opts[:connection_max_hits] && @connections_hits[url.host][url.port] >= @opts[:connection_max_hits]
|
200
|
-
@opts[:logger].debug {"Connection #{url.host}:#{url.port} is staled, refreshing"} if @opts[:logger]
|
200
|
+
@opts[:logger].debug { "Connection #{url.host}:#{url.port} is staled, refreshing" } if @opts[:logger]
|
201
201
|
return refresh_connection url
|
202
202
|
end
|
203
203
|
@connections_hits[url.host][url.port] += 1
|
204
|
-
return
|
204
|
+
return @connections[url.host][url.port]
|
205
205
|
end
|
206
206
|
|
207
207
|
refresh_connection url
|
@@ -211,20 +211,20 @@ module Polipus
|
|
211
211
|
proxy_host, proxy_port = proxy_host_port unless @opts[:proxy_host_port].nil?
|
212
212
|
|
213
213
|
if @opts[:logger] && proxy_host && proxy_port
|
214
|
-
@opts[:logger].debug {"Request #{url} using proxy: #{proxy_host}:#{proxy_port}"}
|
214
|
+
@opts[:logger].debug { "Request #{url} using proxy: #{proxy_host}:#{proxy_port}" }
|
215
215
|
end
|
216
216
|
|
217
217
|
http = Net::HTTP.new(url.host, url.port, proxy_host, proxy_port)
|
218
218
|
|
219
|
-
http.read_timeout = read_timeout if
|
220
|
-
http.open_timeout = open_timeout if
|
219
|
+
http.read_timeout = read_timeout if read_timeout
|
220
|
+
http.open_timeout = open_timeout if open_timeout
|
221
221
|
|
222
222
|
if url.scheme == 'https'
|
223
223
|
http.use_ssl = true
|
224
224
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
225
225
|
end
|
226
226
|
@connections_hits[url.host][url.port] = 1
|
227
|
-
@connections[url.host][url.port] = http.start
|
227
|
+
@connections[url.host][url.port] = http.start
|
228
228
|
end
|
229
229
|
|
230
230
|
def verbose?
|
@@ -238,15 +238,14 @@ module Polipus
|
|
238
238
|
to_url.host.nil? || (to_url.host == from_url.host)
|
239
239
|
end
|
240
240
|
|
241
|
-
def handle_compression
|
242
|
-
case response[
|
243
|
-
when
|
241
|
+
def handle_compression(response)
|
242
|
+
case response['content-encoding']
|
243
|
+
when 'gzip', 'x-gzip'
|
244
244
|
body_io = StringIO.new(response.body)
|
245
245
|
response.body.replace Zlib::GzipReader.new(body_io).read
|
246
|
-
when
|
246
|
+
when 'deflate'
|
247
247
|
response.body.replace Zlib::Inflate.inflate(response.body)
|
248
248
|
end
|
249
249
|
end
|
250
|
-
|
251
250
|
end
|
252
251
|
end
|
data/lib/polipus/page.rb
CHANGED
@@ -4,7 +4,6 @@ require 'ostruct'
|
|
4
4
|
require 'set'
|
5
5
|
module Polipus
|
6
6
|
class Page
|
7
|
-
|
8
7
|
# The URL of the page
|
9
8
|
attr_reader :url
|
10
9
|
# The raw HTTP response body of the page
|
@@ -31,7 +30,7 @@ module Polipus
|
|
31
30
|
attr_accessor :domain_aliases
|
32
31
|
|
33
32
|
# Whether the current page should be stored
|
34
|
-
# Default: true
|
33
|
+
# Default: true
|
35
34
|
attr_accessor :storable
|
36
35
|
|
37
36
|
attr_accessor :fetched_at
|
@@ -64,11 +63,11 @@ module Polipus
|
|
64
63
|
def links
|
65
64
|
return @links.to_a unless @links.nil?
|
66
65
|
@links = Set.new
|
67
|
-
return []
|
68
|
-
|
69
|
-
doc.search(
|
66
|
+
return [] unless doc
|
67
|
+
|
68
|
+
doc.search('//a[@href]').each do |a|
|
70
69
|
u = a['href']
|
71
|
-
next if u.nil?
|
70
|
+
next if u.nil? || u.empty?
|
72
71
|
abs = to_absolute(u) rescue next
|
73
72
|
@links << abs if in_domain?(abs)
|
74
73
|
end
|
@@ -118,7 +117,7 @@ module Polipus
|
|
118
117
|
# otherwise.
|
119
118
|
#
|
120
119
|
def html?
|
121
|
-
|
120
|
+
content_type =~ %r{^(text/html|application/xhtml+xml)\b}
|
122
121
|
end
|
123
122
|
|
124
123
|
#
|
@@ -151,11 +150,11 @@ module Polipus
|
|
151
150
|
#
|
152
151
|
def base
|
153
152
|
@base = if doc
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
return nil if @base && @base.to_s
|
153
|
+
href = doc.search('//head/base/@href')
|
154
|
+
URI(href.to_s) unless href.nil? rescue nil
|
155
|
+
end unless @base
|
156
|
+
|
157
|
+
return nil if @base && @base.to_s.empty?
|
159
158
|
@base
|
160
159
|
end
|
161
160
|
|
@@ -167,14 +166,14 @@ module Polipus
|
|
167
166
|
return nil if link.nil?
|
168
167
|
|
169
168
|
# remove anchor
|
170
|
-
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,'')))
|
169
|
+
link = URI.encode(URI.decode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/, '')))
|
171
170
|
|
172
171
|
relative = URI(link)
|
173
172
|
absolute = base ? base.merge(relative) : @url.merge(relative)
|
174
173
|
|
175
174
|
absolute.path = '/' if absolute.path.empty?
|
176
175
|
|
177
|
-
|
176
|
+
absolute
|
178
177
|
end
|
179
178
|
|
180
179
|
#
|
@@ -188,25 +187,25 @@ module Polipus
|
|
188
187
|
|
189
188
|
def to_hash
|
190
189
|
{
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
190
|
+
'url' => @url.to_s,
|
191
|
+
'headers' => Marshal.dump(@headers),
|
192
|
+
'body' => @body,
|
193
|
+
'links' => links.map(&:to_s),
|
194
|
+
'code' => @code,
|
195
|
+
'depth' => @depth,
|
196
|
+
'referer' => @referer.to_s,
|
197
|
+
'redirect_to' => @redirect_to.to_s,
|
198
|
+
'response_time' => @response_time,
|
199
|
+
'fetched' => @fetched,
|
200
|
+
'user_data' => @user_data.nil? ? {} : @user_data.marshal_dump,
|
201
|
+
'fetched_at' => @fetched_at,
|
202
|
+
'error' => @error
|
203
|
+
}
|
205
204
|
end
|
206
205
|
|
207
206
|
def to_json
|
208
207
|
th = to_hash.dup
|
209
|
-
th.each {|k,v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?)}
|
208
|
+
th.each { |k, v| th.delete(k) if v.nil? || (v.respond_to?(:empty?) && v.empty?) }
|
210
209
|
th.delete('headers') if content_type.empty?
|
211
210
|
th.to_json
|
212
211
|
end
|
@@ -220,21 +219,21 @@ module Polipus
|
|
220
219
|
@storable
|
221
220
|
end
|
222
221
|
|
223
|
-
def expired?
|
222
|
+
def expired?(ttl)
|
224
223
|
return false if fetched_at.nil?
|
225
224
|
(Time.now.to_i - ttl) > fetched_at
|
226
225
|
end
|
227
226
|
|
228
227
|
def self.from_hash(hash)
|
229
|
-
page =
|
228
|
+
page = new(URI(hash['url']))
|
230
229
|
{
|
231
|
-
'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : {'content-type' => ['']},
|
230
|
+
'@headers' => hash['headers'] ? Marshal.load(hash['headers']) : { 'content-type' => [''] },
|
232
231
|
'@body' => hash['body'],
|
233
232
|
'@links' => hash['links'] ? hash['links'].map { |link| URI(link) } : [],
|
234
233
|
'@code' => hash['code'].to_i,
|
235
234
|
'@depth' => hash['depth'].to_i,
|
236
235
|
'@referer' => hash['referer'],
|
237
|
-
'@redirect_to' => (
|
236
|
+
'@redirect_to' => (hash['redirect_to'] && !hash['redirect_to'].empty?) ? URI(hash['redirect_to']) : nil,
|
238
237
|
'@response_time' => hash['response_time'].to_i,
|
239
238
|
'@fetched' => hash['fetched'],
|
240
239
|
'@user_data' => hash['user_data'] ? OpenStruct.new(hash['user_data']) : nil,
|
@@ -248,7 +247,7 @@ module Polipus
|
|
248
247
|
|
249
248
|
def self.from_json(json)
|
250
249
|
hash = JSON.parse json
|
251
|
-
|
250
|
+
from_hash hash
|
252
251
|
end
|
253
252
|
end
|
254
253
|
end
|
data/lib/polipus/plugin.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
@@plugins = {}
|
4
|
-
def self.register
|
4
|
+
def self.register(plugin, options = {})
|
5
5
|
o = plugin.new(options)
|
6
6
|
@@plugins[o.class.name] = o
|
7
7
|
end
|
@@ -10,4 +10,4 @@ module Polipus
|
|
10
10
|
@@plugins
|
11
11
|
end
|
12
12
|
end
|
13
|
-
end
|
13
|
+
end
|
@@ -1,25 +1,24 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
class Cleaner
|
4
|
-
|
5
4
|
def initialize(options = {})
|
6
5
|
@reset = options[:reset] ||= false
|
7
6
|
end
|
8
7
|
|
9
|
-
def on_initialize
|
10
|
-
crawler.logger.info {
|
8
|
+
def on_initialize(crawler)
|
9
|
+
crawler.logger.info { 'Cleaner plugin loaded' }
|
11
10
|
unless @reset
|
12
|
-
crawler.logger.info {
|
11
|
+
crawler.logger.info { 'Cleaner plugin is disable, add :reset => true to the plugin if you really know what you are doing' }
|
13
12
|
return nil
|
14
13
|
end
|
15
|
-
crawler.logger.info {
|
16
|
-
|
14
|
+
crawler.logger.info { 'Cleaning all: url_tracker, storage, queue' }
|
15
|
+
proc do
|
17
16
|
url_tracker.clear
|
18
17
|
storage.clear
|
19
18
|
queue_factory.clear
|
20
19
|
@options[:queue_overflow_adapter].clear if @options[:queue_overflow_adapter]
|
21
|
-
|
20
|
+
end
|
22
21
|
end
|
23
22
|
end
|
24
23
|
end
|
25
|
-
end
|
24
|
+
end
|
@@ -1,17 +1,14 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
class Sample
|
4
|
-
|
5
|
-
def initialize(options = {})
|
6
|
-
|
4
|
+
def initialize(_options = {})
|
7
5
|
end
|
8
6
|
|
9
|
-
def on_initialize
|
10
|
-
|
11
|
-
@options.each { |k,v| @logger.info {"Polipus configuration: #{k
|
12
|
-
|
7
|
+
def on_initialize(_crawler)
|
8
|
+
proc do
|
9
|
+
@options.each { |k, v| @logger.info { "Polipus configuration: #{k} => #{v}" } }
|
10
|
+
end
|
13
11
|
end
|
14
|
-
|
15
12
|
end
|
16
13
|
end
|
17
|
-
end
|
14
|
+
end
|
@@ -1,22 +1,21 @@
|
|
1
1
|
module Polipus
|
2
2
|
module Plugin
|
3
3
|
class Sleeper
|
4
|
-
|
5
4
|
def initialize(options = {})
|
6
5
|
@delay = options[:delay] ||= 1
|
7
6
|
end
|
8
7
|
|
9
|
-
def on_initialize
|
10
|
-
crawler.logger.info {"Sleeper plugin loaded, sleep for #{@delay} after each request"}
|
11
|
-
|
8
|
+
def on_initialize(crawler)
|
9
|
+
crawler.logger.info { "Sleeper plugin loaded, sleep for #{@delay} after each request" }
|
10
|
+
proc do
|
12
11
|
# Set to 1 the number of threads
|
13
12
|
@options[:workers] = 1
|
14
|
-
|
13
|
+
end
|
15
14
|
end
|
16
|
-
|
17
|
-
def on_message_processed
|
15
|
+
|
16
|
+
def on_message_processed(_crawler)
|
18
17
|
sleep @delay
|
19
18
|
end
|
20
19
|
end
|
21
20
|
end
|
22
|
-
end
|
21
|
+
end
|
@@ -1,24 +1,24 @@
|
|
1
|
-
require
|
1
|
+
require 'polipus/queue_overflow/manager'
|
2
2
|
module Polipus
|
3
3
|
module QueueOverflow
|
4
4
|
def self.mongo_queue(mongo_db, queue_name, options = {})
|
5
|
-
require
|
6
|
-
mongo_db ||= Mongo::Connection.new(
|
7
|
-
|
5
|
+
require 'polipus/queue_overflow/mongo_queue'
|
6
|
+
mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
|
7
|
+
fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
|
8
8
|
self::MongoQueue.new mongo_db, queue_name, options
|
9
9
|
end
|
10
10
|
|
11
11
|
def self.mongo_queue_capped(mongo_db, queue_name, options = {})
|
12
|
-
require
|
13
|
-
mongo_db ||= Mongo::Connection.new(
|
14
|
-
|
12
|
+
require 'polipus/queue_overflow/mongo_queue_capped'
|
13
|
+
mongo_db ||= Mongo::Connection.new('localhost', 27_017, pool_size: 15, pool_timeout: 5).db('polipus')
|
14
|
+
fail 'First argument must be an instance of Mongo::DB' unless mongo_db.is_a?(Mongo::DB)
|
15
15
|
options[:max] = 1_000_000 if options[:max].nil?
|
16
16
|
self::MongoQueueCapped.new mongo_db, queue_name, options
|
17
17
|
end
|
18
|
-
|
19
|
-
def self.dev_null_queue(
|
20
|
-
require
|
18
|
+
|
19
|
+
def self.dev_null_queue(_options = {})
|
20
|
+
require 'polipus/queue_overflow/dev_null_queue'
|
21
21
|
self::DevNullQueue.new
|
22
22
|
end
|
23
23
|
end
|
24
|
-
end
|
24
|
+
end
|