polipus 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/.travis.yml +11 -0
- data/AUTHORS +2 -0
- data/README.md +6 -0
- data/lib/polipus.rb +41 -53
- data/lib/polipus/http.rb +30 -24
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +2 -0
- data/spec/cassettes/gzipped_on.yml +137 -0
- data/spec/http_spec.rb +17 -5
- data/spec/spec_helper.rb +4 -0
- metadata +40 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
|
10
|
+
MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
|
11
|
+
MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
|
14
|
+
MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
|
15
|
+
NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
|
data/.travis.yml
ADDED
data/AUTHORS
ADDED
data/README.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
[![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
|
2
|
+
[![Coverage Status](https://coveralls.io/repos/taganaka/polipus/badge.png?branch=master)](https://coveralls.io/r/taganaka/polipus?branch=master)
|
3
|
+
[![Code Climate](https://codeclimate.com/github/taganaka/polipus.png)](https://codeclimate.com/github/taganaka/polipus)
|
4
|
+
[![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
|
5
|
+
|
6
|
+
|
1
7
|
# Polipus #
|
2
8
|
|
3
9
|
A distributed web crawler written in ruby, backed by Redis
|
data/lib/polipus.rb
CHANGED
@@ -15,7 +15,7 @@ require "singleton"
|
|
15
15
|
|
16
16
|
module Polipus
|
17
17
|
|
18
|
-
def
|
18
|
+
def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
|
19
19
|
PolipusCrawler.crawl(job_name, urls, options, &block)
|
20
20
|
end
|
21
21
|
|
@@ -75,7 +75,6 @@ module Polipus
|
|
75
75
|
attr_reader :storage
|
76
76
|
attr_reader :job_name
|
77
77
|
attr_reader :logger
|
78
|
-
attr_reader :overflow_adapter
|
79
78
|
attr_reader :options
|
80
79
|
attr_reader :crawler_name
|
81
80
|
|
@@ -89,7 +88,7 @@ module Polipus
|
|
89
88
|
end
|
90
89
|
end
|
91
90
|
|
92
|
-
def initialize(job_name = 'polipus',urls = [], options = {})
|
91
|
+
def initialize(job_name = 'polipus', urls = [], options = {})
|
93
92
|
|
94
93
|
@job_name = job_name
|
95
94
|
@options = OPTS.merge(options)
|
@@ -121,22 +120,19 @@ module Polipus
|
|
121
120
|
|
122
121
|
@storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
|
123
122
|
|
124
|
-
@urls = [urls].flatten.map{ |url|
|
123
|
+
@urls = [urls].flatten.map{ |url| URI(url) }
|
125
124
|
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
125
|
+
|
126
|
+
@internal_queue = queue_factory
|
127
|
+
|
126
128
|
execute_plugin 'on_initialize'
|
127
129
|
|
128
130
|
yield self if block_given?
|
129
131
|
|
130
132
|
end
|
131
133
|
|
132
|
-
def self.crawl(
|
133
|
-
|
134
|
-
self.new(job_name, urls, opts) do |polipus|
|
135
|
-
yield polipus if block_given?
|
136
|
-
|
137
|
-
polipus.takeover
|
138
|
-
end
|
139
|
-
|
134
|
+
def self.crawl(*args, &block)
|
135
|
+
new(*args, &block).takeover
|
140
136
|
end
|
141
137
|
|
142
138
|
def takeover
|
@@ -167,13 +163,13 @@ module Polipus
|
|
167
163
|
page = Page.from_json message
|
168
164
|
|
169
165
|
unless should_be_visited?(page.url, false)
|
170
|
-
@logger.info {"[worker ##{worker_number}] Page
|
166
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) is no more welcome."}
|
171
167
|
queue.commit
|
172
168
|
next
|
173
169
|
end
|
174
170
|
|
175
171
|
if page_exists? page
|
176
|
-
@logger.info {"[worker ##{worker_number}] Page
|
172
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
|
177
173
|
queue.commit
|
178
174
|
next
|
179
175
|
end
|
@@ -190,7 +186,7 @@ module Polipus
|
|
190
186
|
page = pages.pop
|
191
187
|
page.aliases = pages.collect { |e| e.url }
|
192
188
|
if page_exists? page
|
193
|
-
@logger.info {"[worker ##{worker_number}] Page
|
189
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
|
194
190
|
queue.commit
|
195
191
|
next
|
196
192
|
end
|
@@ -212,7 +208,7 @@ module Polipus
|
|
212
208
|
|
213
209
|
if page
|
214
210
|
@logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
|
215
|
-
@logger.info {"[worker ##{worker_number}] Page
|
211
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
|
216
212
|
end
|
217
213
|
|
218
214
|
incr_pages
|
@@ -261,7 +257,7 @@ module Polipus
|
|
261
257
|
self
|
262
258
|
end
|
263
259
|
|
264
|
-
# A block of code will be executed on every page
|
260
|
+
# A block of code will be executed on every page downloaded
|
265
261
|
# The block takes the page as argument
|
266
262
|
def on_page_downloaded(&block)
|
267
263
|
@on_page_downloaded << block
|
@@ -292,12 +288,7 @@ module Polipus
|
|
292
288
|
@options[:redis_options]
|
293
289
|
end
|
294
290
|
|
295
|
-
def overflow_adapter
|
296
|
-
@options[:overflow_adapter]
|
297
|
-
end
|
298
|
-
|
299
291
|
def queue_size
|
300
|
-
@internal_queue ||= queue_factory
|
301
292
|
@internal_queue.size
|
302
293
|
end
|
303
294
|
|
@@ -311,63 +302,58 @@ module Polipus
|
|
311
302
|
end
|
312
303
|
|
313
304
|
def url_tracker
|
314
|
-
|
315
|
-
@
|
316
|
-
|
317
|
-
|
305
|
+
@url_tracker ||=
|
306
|
+
@options[:url_tracker] ||=
|
307
|
+
UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}",
|
308
|
+
:redis => redis_factory_adapter,
|
309
|
+
:driver => 'lua')
|
318
310
|
end
|
319
311
|
|
320
312
|
def redis
|
321
|
-
|
322
|
-
@redis = redis_factory_adapter
|
323
|
-
end
|
324
|
-
@redis
|
313
|
+
@redis ||= redis_factory_adapter
|
325
314
|
end
|
326
315
|
|
316
|
+
# Enqueue an url, no matter what
|
327
317
|
def add_url url
|
328
|
-
@url_tracker.remove url.to_s
|
329
318
|
page = Page.new(url)
|
330
|
-
|
319
|
+
@internal_queue << page.to_json
|
331
320
|
end
|
332
321
|
|
333
322
|
# Request to Polipus to stop its work (gracefully)
|
334
323
|
# cler_queue = true if you want to delete all of the pending urls to visit
|
335
324
|
def stop!(cler_queue = false)
|
336
325
|
PolipusSignalHandler.terminate
|
337
|
-
|
326
|
+
@internal_queue.clear(true) if cler_queue
|
338
327
|
end
|
339
328
|
|
340
329
|
private
|
341
330
|
# URLs enqueue policy
|
342
331
|
def should_be_visited?(url, with_tracker = true)
|
343
|
-
|
332
|
+
case
|
344
333
|
# Check against whitelist pattern matching
|
345
|
-
|
346
|
-
|
347
|
-
end
|
348
|
-
|
334
|
+
when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
|
335
|
+
false
|
349
336
|
# Check against blacklist pattern matching
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
return true if page_expired?(Page.new(url))
|
356
|
-
|
337
|
+
when @skip_links_like.any?{ |p| url.path =~ p }
|
338
|
+
false
|
339
|
+
# Page is marked as expired
|
340
|
+
when page_expired?(Page.new(url))
|
341
|
+
true
|
357
342
|
# Check against url tracker
|
358
|
-
|
359
|
-
|
343
|
+
when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
|
344
|
+
false
|
345
|
+
else
|
346
|
+
true
|
360
347
|
end
|
361
|
-
true
|
362
348
|
end
|
363
349
|
|
364
350
|
# It extracts URLs from the page
|
365
351
|
def links_for page
|
366
352
|
page.domain_aliases = domain_aliases
|
367
|
-
|
368
|
-
links
|
353
|
+
@focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
|
369
354
|
end
|
370
355
|
|
356
|
+
# whether a page is expired or not
|
371
357
|
def page_expired? page
|
372
358
|
return false if @options[:ttl_page].nil?
|
373
359
|
stored_page = @storage.get(page)
|
@@ -376,6 +362,7 @@ module Polipus
|
|
376
362
|
r
|
377
363
|
end
|
378
364
|
|
365
|
+
# whether a page exists or not
|
379
366
|
def page_exists? page
|
380
367
|
return false if page.user_data && page.user_data.p_seeded
|
381
368
|
@storage.exists?(page) && !page_expired?(page)
|
@@ -392,10 +379,11 @@ module Polipus
|
|
392
379
|
|
393
380
|
# It creates a redis client
|
394
381
|
def redis_factory_adapter
|
395
|
-
|
396
|
-
|
382
|
+
if @redis_factory
|
383
|
+
@redis_factory.call(redis_options)
|
384
|
+
else
|
385
|
+
Redis.new(redis_options)
|
397
386
|
end
|
398
|
-
Redis.new(redis_options)
|
399
387
|
end
|
400
388
|
|
401
389
|
# It creates a new distributed queue
|
data/lib/polipus/http.rb
CHANGED
@@ -27,33 +27,33 @@ module Polipus
|
|
27
27
|
# including redirects
|
28
28
|
#
|
29
29
|
def fetch_pages(url, referer = nil, depth = nil)
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
body = gzip.read
|
38
|
-
end
|
39
|
-
pages << Page.new(location, :body => response.body.dup,
|
40
|
-
:code => code,
|
41
|
-
:headers => response.to_hash,
|
42
|
-
:referer => referer,
|
43
|
-
:depth => depth,
|
44
|
-
:redirect_to => redirect_to,
|
45
|
-
:response_time => response_time,
|
46
|
-
:fetched_at => Time.now.to_i)
|
30
|
+
url = URI(url)
|
31
|
+
pages = []
|
32
|
+
get(url, referer) do |response, code, location, redirect_to, response_time|
|
33
|
+
body = response.body.dup
|
34
|
+
if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
|
35
|
+
gzip = Zlib::GzipReader.new(StringIO.new(body))
|
36
|
+
body = gzip.read
|
47
37
|
end
|
38
|
+
|
39
|
+
pages << Page.new(location, :body => body,
|
40
|
+
:code => code,
|
41
|
+
:headers => response.to_hash,
|
42
|
+
:referer => referer,
|
43
|
+
:depth => depth,
|
44
|
+
:redirect_to => redirect_to,
|
45
|
+
:response_time => response_time,
|
46
|
+
:fetched_at => Time.now.to_i)
|
47
|
+
end
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
end
|
55
|
-
return [Page.new(url, :error => e)]
|
49
|
+
pages
|
50
|
+
rescue StandardError => e
|
51
|
+
if verbose?
|
52
|
+
puts e.inspect
|
53
|
+
puts e.backtrace
|
56
54
|
end
|
55
|
+
|
56
|
+
[Page.new(url, :error => e)]
|
57
57
|
end
|
58
58
|
|
59
59
|
#
|
@@ -154,6 +154,8 @@ module Polipus
|
|
154
154
|
opts['User-Agent'] = user_agent if user_agent
|
155
155
|
opts['Referer'] = referer.to_s if referer
|
156
156
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
157
|
+
opts['Accept-Encoding'] = 'gzip'
|
158
|
+
|
157
159
|
|
158
160
|
retries = 0
|
159
161
|
begin
|
@@ -227,5 +229,9 @@ module Polipus
|
|
227
229
|
to_url.host.nil? || (to_url.host == from_url.host)
|
228
230
|
end
|
229
231
|
|
232
|
+
def gzip_enabled?
|
233
|
+
@opts[:gzip_enabled]
|
234
|
+
end
|
235
|
+
|
230
236
|
end
|
231
237
|
end
|
data/lib/polipus/version.rb
CHANGED
data/polipus.gemspec
CHANGED
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
s.add_development_dependency 'vcr', '~> 2.5', '>= 2.5.0'
|
41
41
|
s.add_development_dependency 'webmock', '>= 1.8.0', '< 1.12'
|
42
42
|
s.add_development_dependency 'flexmock', '~> 1.3', '>= 1.3.2'
|
43
|
+
s.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
|
44
|
+
s.add_development_dependency 'coveralls'
|
43
45
|
|
44
46
|
|
45
47
|
end
|
data/spec/cassettes/gzipped_on.yml
ADDED
@@ -0,0 +1,137 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://www.whatsmyip.org/http-compression-test/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip
|
12
|
+
Accept:
|
13
|
+
- "*/*"
|
14
|
+
User-Agent:
|
15
|
+
- Ruby
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 200
|
19
|
+
message: OK
|
20
|
+
headers:
|
21
|
+
Date:
|
22
|
+
- Mon, 19 May 2014 22:57:51 GMT
|
23
|
+
Server:
|
24
|
+
- Apache/2.2.24 (Unix) DAV/2 mod_fastcgi/2.4.2 mod_ssl/2.2.24 OpenSSL/0.9.8y
|
25
|
+
Cache-Control:
|
26
|
+
- max-age=1
|
27
|
+
Expires:
|
28
|
+
- Mon, 19 May 2014 22:57:52 GMT
|
29
|
+
X-Powered-By:
|
30
|
+
- Web Server Built Entirely Out of Legos
|
31
|
+
Vary:
|
32
|
+
- Accept-Encoding
|
33
|
+
Content-Encoding:
|
34
|
+
- gzip
|
35
|
+
Content-Length:
|
36
|
+
- '4153'
|
37
|
+
Content-Type:
|
38
|
+
- text/html
|
39
|
+
body:
|
40
|
+
encoding: ASCII-8BIT
|
41
|
+
string: !binary |-
|
42
|
+
H4sIAAAAAAAAA91ae3PbNrb/W/kUWDYbyRNTsr3pI7GlTuKmTXbz8MTO7e10
|
43
|
+
OhqIhETEJMElQMtK6+++vwOAL1luk/bOnTs3M45I4OC8z8HBAU/+9t3b04uf
|
44
|
+
zp6zxGTp7N69E/87OEkEj2f3BoMTI00qZj8m3OjXm5dnY1Wu2G/sxcXFGTtV
|
45
|
+
WVEKraXK2YXQ5mTigGlZJgxnUcJLLcw0qMwy/CZoJ3KeiWlwKTZrVcY6YJHK
|
46
|
+
jcgBuPooi32WqXjePsVimXIj9sGkKQDaEN1na7FgWpRXotxnvOBRAii1wNqE
|
47
|
+
53FKowZ80e8CA2sZm+QWF7HQUSkLA4QdRnYKKEqWCqPZRlUMNOVywwwUY9nw
|
48
|
+
K5nUbCFkvmoYFTGrNA2QRGzCvDh4qsX0j35ibEkRjdLJJwHMl0Rc5HyRdnGD
|
49
|
+
sTFzEmH8kpUinQYy4ysx12UUsKQUy2kwsSN6omEOMS7y1fYKXhSpCI2qoiSU
|
50
|
+
EenBLhzWC9sZWj3cpteuaEgt+VUNHjCzKYRna3LtCLQo3CRxICNORpiUWj+8
|
51
|
+
ztLA85ZC8hx6AR7yrmnQ98V35+fseyHimgNykicTQjJeE2C2kQUBTtq3Iik6
|
52
|
+
DHjGI60n6/V6nHGZj/FSs23EtZnYd8uONptU6EQIczcKo1SqPwOH878u7Ad+
|
53
|
+
xd1owGBI4P7Q4e6DDmYnEzf/OQg+/LsS5eZPL9eFzHNRfuZ6ghhc8ZLdh8NW
|
54
|
+
KaJnyoLguBldCTOvytSO0iD93R8tqzwiZxjt0fvgV/v//VHwxepjEeyNeRyf
|
55
|
+
plzrUWASqecFPCvYszgHHUhZEGaAL1VU6RGLUsHzuShLVbKd0JQsAK6rRSZN
|
56
|
+
y8N94bjwbPSw73/hoLGsKFUxGsZSI0hFPETuKSvh6QyyzbwURcojcW7gzKPa
|
57
|
+
T8mofT+libAT4CFxNanFa4k7dRK7qYzFe5Becm1Adkt3DduDjvyjGt1ALtmI
|
58
|
+
benriqejPTadkk3Y3sBDNngGJinV2iMK3ij2/t0r9hzpr0QYNogHN/5BpFrc
|
59
|
+
QtFQ1BHPc+S0WpDv1DqvRbklSQfB4P64UNp0ePfW48aAK24XBnv77FdWS/Zk
|
60
|
+
t5w3HZXdj7nhLbUOuUHHfy3UWCNlmVHwW0fkwWAy4dh3jEfUmbhT4NZyO8Tt
|
61
|
+
cTDQa2mihI1qVn4++KUL2oMdRFwLFlgjBU+6EwO44WbO51pVeTwKdB7Pl1ym
|
62
|
+
PTEGuxyNioNRcCKzlU0Kzfaw+uh8YayvVkOGhD0d2vfh7EQXPEfcIVSnQ4sM
|
63
|
+
E8NZ8LCR4PCXhwGSCcBmuzygFHO76Ym5zIvK6C0eF6Xgl+3QbfHbTfgPdRCr
|
64
|
+
XPwlHWyE7mhgUxW75AeQlX+HIz4MqHg4bTj2ajkpZu/zTjFxBnrsXH4UT9iW
|
65
|
+
Htm/np0sytnpH8EetbDn2Kfzle5N/wPTf/8TxvjLKY49ZFDHnOu53agQmp9r
|
66
|
+
7ir/3zR4rjr2zlUhdhk8V79v7zdvL3ba/H/UzGeKalPJU/b/xuColjm4/z+W
|
67
|
+
2Z7m7H1+mUOH7LmtM15wzd5GUVWWjXE/X9M3nZcbTN3rvDXPYxJ1u3Cy/zrb
|
68
|
+
Qm/nfsreCIMT2KXnteazp6+bzsvN7c29GfLVCc4XpbiCs33n7NMUGjfdiouK
|
69
|
+
jiVHXXB3YeWjpK4Pb5U//VqtoXKvQ4v+an2wruT3tSkRBd3a8vc9Z4uxtvy0
|
70
|
+
1nJYO7BjOzDP9Kr2rZriLcv7Rdv2rwW6uSVHr4i7VRw3LJYiU1fi07nslyJd
|
71
|
+
6n3ytxm9g4ffK4yd6WsaPfzbOeV+LtbW/h0y5D102FBLtpZ5rNZjHAKMwtGm
|
72
|
+
qHRi19nidVhjHdZutg3fI8V+RSU4BHs1TWQjdnOHGfp5a4s55rgbeWoLo7gv
|
73
|
+
p+v1KKv7nu+OQJ0Fo11bRu3mpTBVmbPgW8xNKY3S+n4IkFo7p7STie/o4HGh
|
74
|
+
4o09uNGQKO0J7YTXZ37ktlt5MFWr7l73Cq9DJuPpkCawYMJnhCOWVxYZMB/O
|
75
|
+
7mgRYcbSm3hgxxhxcc9hqJNqpNIqy/XQgee8maAzMI2eJEczn8CoJ3JBp25g
|
76
|
+
O7JYuWUvQ6qu5cKzCGW+VCFfqMqEG1VB1tcYZS8xyp7SKPtJVSRNi6IoTYOi
|
77
|
+
UKUJbf0uSqw9wys7d6+6v8qUUbPKlHCxEsgF1lw0L/0FONc2C3bvlsNbGt1i
|
78
|
+
NF+1jEIhxCB++kDrRDVAeJY6jHMdpkpdVgUW/Pji7ctz9oBnxTH77s15f22Z
|
79
|
+
X7ZrxYL6UmFRhiXPLx25H90ge+dGtlSyEi1pWYR4BV3f8hnOXp6xV/6tvyxB
|
80
|
+
xugpxrmLrvXxwr26ReQSF+LasHeCWmnxTqfQSWtRnZBJETxhxqNE5mSicxqz
|
81
|
+
Z9rXbqzPUJSUHYayNKQuJ46bNU+vX7HTZmTLKzLRUrYbQmhkBuNC4TB4foWT
|
82
|
+
o/WsczvJjGIX9fyWVnTSMsF1AnXCCTmSGvGAAfZDPbDFvW5ZoFZNSIUziZCv
|
83
|
+
LGWrvVMqpk/d2JYPrK6b5aVYVSkvQ3Hdc1SL5p1YYdg3TfsoRB5tKwFDCkYM
|
84
|
+
Y2F/WwU8p4lbTlysW5eA98UqCwtkBuol9/Rw5gfv0kUVt45QFRobYIjsS1JA
|
85
|
+
L8tUFoXl5b2bok2bkXq2bBqVfYUaFZIUiAs4c61RWPIUgwgNDLa++tYkotzp
|
86
|
+
pKlcNGjxDDyv5KLk5WY7Jm+pog7N1qHf2Qnmw3PLJ9eiTTxLqTTWF+Gl2MAx
|
87
|
+
0gj2dar8HjPsx+dn7F9is4WgyjtazKXpOfJ7DEByP9BfqE0nDrEDY0+WIBpy
|
88
|
+
KkN5tLFu0Ayzp354yxeu0xqJP30U8lqk283fYXuDwc4I4GnsxDiZYGdxW0xv
|
89
|
+
77HN/KHfzTozPF6oa7v7+K4n15s88k3SCTUieXw0Xim1SgVmYt/UHiOd+1lq
|
90
|
+
pfJYLzYOaLudeiJz7YkFHTDqjTLbOJ4GqKSoXH0i8xQGDhdIo5fH9nLjyddH
|
91
|
+
3xTXx4mQq8Q8eXxQXNt11IsKeRxGqbSXGxEPi2oRHj36+vDx198cPPry4KvH
|
92
|
+
h0ePerA6VYD8+sujg8OvDh89/oaYBGut5HgadRhsq5fu4G+/sZ9/2bN12ehX
|
93
|
+
W5a3ojYlwNb2LnPpFd/TvFYRDrDhojIGmSZRKTJFDfcpgCD9tzBk32P/fYb9
|
94
|
+
jr2Sl4KFIQlEi+FMwXIRlkoZEtWxVovaHqvifab3AbzHULlRA/sDXpcfqCsY
|
95
|
+
j3FWeZ6KDDrWzzYXfPWGZ2Kk934++AWCU2HYBXm2eRmPgAiYXDl3jMrNI4pw
|
96
|
+
7jXCAwLDMciMZYw5Gdtn53VT22I+pmVjeCD10CcT+G4uIoPjYCQWEHScCzMR
|
97
|
+
+fz9+YSnKdzti+vlIkunhw94UbyMp0dfPTo6eAQrPz768uDwEbXmIc+44CVo
|
98
|
+
v0HSGsPsCN9nYolaaeTlJVPejGIVVcQi6manKVTPw5pw+EHr+HKIorXr3h1d
|
99
|
+
OwsFtbdjJIVNAueCvducT+8o2LVa5ESBThl+BOGCmmsaOJLzCMc8UwMnah0S
|
100
|
+
z2CB1OmHl4oiBXkr5jlvPcJ70cVaGrqRu1gLYbwX8c4NlAbTxsHY0LdXb42k
|
101
|
+
fia0o40aLFlb0P85qa1QWKxK+RHM8zSYWfb8dvMptzwT5BUDM2fjLvNILXBb
|
102
|
+
vZ2pvCp+cJH+8LATTF7OVViklaY2m1e1/AjamYhllf0lM9fm+COxKEe1HREK
|
103
|
+
WRuzhaIo8767FWy1IyPmCjUmtIAdbiEe2smtIBxgyEXhsHYBXkjtNwOrSaR+
|
104
|
+
rxEoc9hcgukuOzuTSMMV5RJ2d3QWCvnJxubeqJ9tfbp12bLNvK3bQqsvVGaL
|
105
|
+
XTZhzfCYxncfPAiR3T3v3domixInx3JT52dPSgtriSZpo+KhMnvHIfGogSlm
|
106
|
+
FzitM7pTZVEiokvtrsbdtT8VVloIhuQq7d07hT5VjnSW6zTN7YUNe2lqFFBW
|
107
|
+
9xKdXUl+11cHgOT5xtMLqRRkKSrjinqu9vY/VkI31/+9m3l7JwYgwXgc0yhT
|
108
|
+
S8aZLkQklzJiVBAQ/qVEAEESy9y4brHVGqB4tCmz7Xsxd701DTrk7Iy922aZ
|
109
|
+
MImiBQKgubriqYQCRK1TqjEK6KfBSR0GH0H20X0Y0U65uP3qAJj5dSrylUmm
|
110
|
+
wdGXj5qE5hpM9iVgtqXitt1p4FTA7XGKjjPE/Yx1GPBdIk++fnMc1G8QoMKr
|
111
|
+
W9wI0Uk0bSdr1vVt+Ccpr37r1gqN7M11XNv2aD+Y8Dfe+mrViOrHAtsJCR4k
|
112
|
+
IsVB4RgynZLxgGg8Hvd56JHzLecemwjSOjDuDpRnpVrDB3vBQs8oLxcylWaz
|
113
|
+
FTVv83TDFm6Rdo5ain9X9GlJJy5qt13LNGVwF0TRGAcFBqVAQO1dshNlNvY8
|
114
|
+
Woo3rm3zpRtPLdpxy87tOi3wWGzMzS2djnH/2gXbTwJVyk/E6rOW1XdOfGK3
|
115
|
+
c3dy6lj1vfnabXYa5w9SWY3UEW7w9qzSWA9MWFXWBtD8Clmk+UoJKSdmErCK
|
116
|
+
hpHrY0okMkM+QurHgYhO3gwuEIsridIFBsQi1OxrJiniUPMxXwiCSz1mzyjc
|
117
|
+
jLNpKeKK1qSKx4Sjk09h/ISSkfRpzXAUyRrbAuMZlReUwMjOFUV1odYuA9eW
|
118
|
+
t4lMM9ttF2yZbvbtp1IkG8uqKGHUg/PLFhuW2LuoXjqPlfUmYREn1ERicFyV
|
119
|
+
R8Jmb2IHI+DcsJQIkkqIA4PDr16K0nNg3R1bJNMZal4S66cOEdpOcO6mHXOf
|
120
|
+
0FN6X8krkXsNU4PR4M+yAjU7dlQBgJ5OLwgJtAkSG0KaK2b7B3aPAFPN51pW
|
121
|
+
gTaNQ312M+1uE92dTrAFReiabxoEtMtI3f/ujT4y2/jPyjqbFe0l9S5mRbZx
|
122
|
+
XGlCITLCyCujMm4P1OC5sRveGHWu9tnp+fk++ycKnXN/vIUf/vfrV01Es6cY
|
123
|
+
oAsedv5fPzAXm/tsnUhYlxTOGzmvsxQ2W/bMa5VkPXMpVxX8sOc9nsa+JUqO
|
124
|
+
QwgrrwNKRdSQsrbddwa0CYgSlftEDCa0XqIo9dkvysgVlJcMZbZFalNbTI4P
|
125
|
+
90CIATkdo4nlsxdn1pkSolfQiGW+Ls0gKGZThBYB2i/3CNS2/Fqf79jznNBv
|
126
|
+
LDVrQ0tnBwFC1xCpbzTp3nYrvc2psTScPUjN8bf0naM2yAWjoPvFY4C69dsH
|
127
|
+
K3PsE1oHGTSTcKuQBTRnq0YYYGELRxsDnU8YUT4hxjFPCh06q2GtWKJsoR4C
|
128
|
+
vfSqpIXLL0ivcCVqySMlYWu0pnS5jYNyTiL3kxxSHHKRVj6djHdm3abcd/WQ
|
129
|
+
MvbW4qRK/Td4sweRKjbHRwcHh+ERDrEWDsO9M9nOAwbJjGIKRe6pf7L3AHWT
|
130
|
+
aQcOWVAXzS4+fLSkyoWXK/rIdb5AXYg97FyucvYWZn35nJ0JI7s97Tt4as7q
|
131
|
+
9rhVMxjsuIlpINuNcPT9nruCaYr2uhXWPcQ5tndd7tRQHYwXHiMdIEjJVEd5
|
132
|
+
pTf/TngVS+VquPprClhT0K4yDSjRdLZzDTePhK+u7CWrntCKsVo1H4hafBMa
|
133
|
+
+IR1WfGP/joaaMs+O9a89jm1F7ufxan7BuBPsOoWfg6vjbPbSqPpQ03cnd2J
|
134
|
+
vXGY/QfAuyd8uC0AAA==
|
135
|
+
http_version:
|
136
|
+
recorded_at: Mon, 19 May 2014 22:57:52 GMT
|
137
|
+
recorded_with: VCR 2.5.0
|
data/spec/http_spec.rb
CHANGED
@@ -6,7 +6,6 @@ require "polipus/page"
|
|
6
6
|
describe Polipus::HTTP do
|
7
7
|
|
8
8
|
it 'should download a page' do
|
9
|
-
|
10
9
|
VCR.use_cassette('http_test') do
|
11
10
|
http = Polipus::HTTP.new
|
12
11
|
page = http.fetch_page("http://sfbay.craigslist.org/apa/")
|
@@ -30,6 +29,7 @@ describe Polipus::HTTP do
|
|
30
29
|
end
|
31
30
|
|
32
31
|
describe 'proxy settings' do
|
32
|
+
|
33
33
|
it 'should set proxy correctly using a procedure' do
|
34
34
|
http = Polipus::HTTP.new({proxy_host: -> con { "127.0.0.0" }, proxy_port: -> con { 8080 }})
|
35
35
|
http.proxy_host.should eq "127.0.0.0"
|
@@ -49,10 +49,25 @@ describe Polipus::HTTP do
|
|
49
49
|
http.proxy_host.should eq "127.0.0.0"
|
50
50
|
end
|
51
51
|
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
describe 'gzipped content handling' do
|
56
|
+
|
57
|
+
it 'should decode gzip content' do
|
58
|
+
VCR.use_cassette('gzipped_on') do
|
59
|
+
http = Polipus::HTTP.new(gzip_enabled: true, logger: Logger.new(STDOUT))
|
60
|
+
page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
|
61
|
+
page.doc.css('.gzip_yes').should_not be_empty
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
end
|
67
|
+
|
52
68
|
describe 'staled connections' do
|
53
69
|
|
54
70
|
it 'should refresh a staled connection' do
|
55
|
-
|
56
71
|
VCR.use_cassette('http_tconnection_max_hits') do
|
57
72
|
http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
|
58
73
|
http.class.__send__(:attr_reader, :connections)
|
@@ -65,11 +80,8 @@ describe Polipus::HTTP do
|
|
65
80
|
http.fetch_page("https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html")
|
66
81
|
http.connections_hits['www.yahoo.com'][443].should be 1
|
67
82
|
http.connections['www.yahoo.com'][443].should_not be old_conn
|
68
|
-
|
69
83
|
end
|
70
84
|
end
|
71
85
|
end
|
72
86
|
|
73
|
-
end
|
74
|
-
|
75
87
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -5,6 +5,10 @@
|
|
5
5
|
#
|
6
6
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
7
|
require "digest/md5"
|
8
|
+
require "coveralls"
|
9
|
+
|
10
|
+
Coveralls.wear!
|
11
|
+
|
8
12
|
RSpec.configure do |config|
|
9
13
|
config.treat_symbols_as_metadata_keys_with_true_values = true
|
10
14
|
config.run_all_when_everything_filtered = true
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|
@@ -270,6 +270,40 @@ dependencies:
|
|
270
270
|
- - ! '>='
|
271
271
|
- !ruby/object:Gem::Version
|
272
272
|
version: 1.3.2
|
273
|
+
- !ruby/object:Gem::Dependency
|
274
|
+
name: rake
|
275
|
+
requirement: !ruby/object:Gem::Requirement
|
276
|
+
requirements:
|
277
|
+
- - ~>
|
278
|
+
- !ruby/object:Gem::Version
|
279
|
+
version: '10.3'
|
280
|
+
- - ! '>='
|
281
|
+
- !ruby/object:Gem::Version
|
282
|
+
version: 10.3.2
|
283
|
+
type: :development
|
284
|
+
prerelease: false
|
285
|
+
version_requirements: !ruby/object:Gem::Requirement
|
286
|
+
requirements:
|
287
|
+
- - ~>
|
288
|
+
- !ruby/object:Gem::Version
|
289
|
+
version: '10.3'
|
290
|
+
- - ! '>='
|
291
|
+
- !ruby/object:Gem::Version
|
292
|
+
version: 10.3.2
|
293
|
+
- !ruby/object:Gem::Dependency
|
294
|
+
name: coveralls
|
295
|
+
requirement: !ruby/object:Gem::Requirement
|
296
|
+
requirements:
|
297
|
+
- - ! '>='
|
298
|
+
- !ruby/object:Gem::Version
|
299
|
+
version: '0'
|
300
|
+
type: :development
|
301
|
+
prerelease: false
|
302
|
+
version_requirements: !ruby/object:Gem::Requirement
|
303
|
+
requirements:
|
304
|
+
- - ! '>='
|
305
|
+
- !ruby/object:Gem::Version
|
306
|
+
version: '0'
|
273
307
|
description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
|
274
308
|
\ "
|
275
309
|
email:
|
@@ -281,6 +315,8 @@ files:
|
|
281
315
|
- .document
|
282
316
|
- .gitignore
|
283
317
|
- .rspec
|
318
|
+
- .travis.yml
|
319
|
+
- AUTHORS
|
284
320
|
- Gemfile
|
285
321
|
- LICENSE.txt
|
286
322
|
- README.md
|
@@ -328,6 +364,7 @@ files:
|
|
328
364
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
329
365
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
330
366
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
367
|
+
- spec/cassettes/gzipped_on.yml
|
331
368
|
- spec/cassettes/http_tconnection_max_hits.yml
|
332
369
|
- spec/cassettes/http_test.yml
|
333
370
|
- spec/cassettes/http_test_redirect.yml
|
@@ -382,6 +419,7 @@ test_files:
|
|
382
419
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
383
420
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
384
421
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
422
|
+
- spec/cassettes/gzipped_on.yml
|
385
423
|
- spec/cassettes/http_tconnection_max_hits.yml
|
386
424
|
- spec/cassettes/http_test.yml
|
387
425
|
- spec/cassettes/http_test_redirect.yml
|