polipus 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/.travis.yml +11 -0
- data/AUTHORS +2 -0
- data/README.md +6 -0
- data/lib/polipus.rb +41 -53
- data/lib/polipus/http.rb +30 -24
- data/lib/polipus/version.rb +1 -1
- data/polipus.gemspec +2 -0
- data/spec/cassettes/gzipped_on.yml +137 -0
- data/spec/http_spec.rb +17 -5
- data/spec/spec_helper.rb +4 -0
- metadata +40 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
|
7
7
|
SHA512:
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
|
10
|
+
MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
|
11
|
+
MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
|
14
|
+
MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
|
15
|
+
NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
|
data/.travis.yml
ADDED
data/AUTHORS
ADDED
data/README.md
CHANGED
@@ -1,3 +1,9 @@
|
|
1
|
+
[Build Status](https://travis-ci.org/taganaka/polipus)
|
2
|
+
[Coverage Status](https://coveralls.io/r/taganaka/polipus?branch=master)
|
3
|
+
[Code Climate](https://codeclimate.com/github/taganaka/polipus)
|
4
|
+
[Gem Version](https://rubygems.org/gems/polipus)
|
5
|
+
|
6
|
+
|
1
7
|
# Polipus #
|
2
8
|
|
3
9
|
A distributed web crawler written in ruby, backed by Redis
|
data/lib/polipus.rb
CHANGED
@@ -15,7 +15,7 @@ require "singleton"
|
|
15
15
|
|
16
16
|
module Polipus
|
17
17
|
|
18
|
-
def
|
18
|
+
def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
|
19
19
|
PolipusCrawler.crawl(job_name, urls, options, &block)
|
20
20
|
end
|
21
21
|
|
@@ -75,7 +75,6 @@ module Polipus
|
|
75
75
|
attr_reader :storage
|
76
76
|
attr_reader :job_name
|
77
77
|
attr_reader :logger
|
78
|
-
attr_reader :overflow_adapter
|
79
78
|
attr_reader :options
|
80
79
|
attr_reader :crawler_name
|
81
80
|
|
@@ -89,7 +88,7 @@ module Polipus
|
|
89
88
|
end
|
90
89
|
end
|
91
90
|
|
92
|
-
def initialize(job_name = 'polipus',urls = [], options = {})
|
91
|
+
def initialize(job_name = 'polipus', urls = [], options = {})
|
93
92
|
|
94
93
|
@job_name = job_name
|
95
94
|
@options = OPTS.merge(options)
|
@@ -121,22 +120,19 @@ module Polipus
|
|
121
120
|
|
122
121
|
@storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
|
123
122
|
|
124
|
-
@urls = [urls].flatten.map{ |url|
|
123
|
+
@urls = [urls].flatten.map{ |url| URI(url) }
|
125
124
|
@urls.each{ |url| url.path = '/' if url.path.empty? }
|
125
|
+
|
126
|
+
@internal_queue = queue_factory
|
127
|
+
|
126
128
|
execute_plugin 'on_initialize'
|
127
129
|
|
128
130
|
yield self if block_given?
|
129
131
|
|
130
132
|
end
|
131
133
|
|
132
|
-
def self.crawl(
|
133
|
-
|
134
|
-
self.new(job_name, urls, opts) do |polipus|
|
135
|
-
yield polipus if block_given?
|
136
|
-
|
137
|
-
polipus.takeover
|
138
|
-
end
|
139
|
-
|
134
|
+
def self.crawl(*args, &block)
|
135
|
+
new(*args, &block).takeover
|
140
136
|
end
|
141
137
|
|
142
138
|
def takeover
|
@@ -167,13 +163,13 @@ module Polipus
|
|
167
163
|
page = Page.from_json message
|
168
164
|
|
169
165
|
unless should_be_visited?(page.url, false)
|
170
|
-
@logger.info {"[worker ##{worker_number}] Page
|
166
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) is no more welcome."}
|
171
167
|
queue.commit
|
172
168
|
next
|
173
169
|
end
|
174
170
|
|
175
171
|
if page_exists? page
|
176
|
-
@logger.info {"[worker ##{worker_number}] Page
|
172
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
|
177
173
|
queue.commit
|
178
174
|
next
|
179
175
|
end
|
@@ -190,7 +186,7 @@ module Polipus
|
|
190
186
|
page = pages.pop
|
191
187
|
page.aliases = pages.collect { |e| e.url }
|
192
188
|
if page_exists? page
|
193
|
-
@logger.info {"[worker ##{worker_number}] Page
|
189
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
|
194
190
|
queue.commit
|
195
191
|
next
|
196
192
|
end
|
@@ -212,7 +208,7 @@ module Polipus
|
|
212
208
|
|
213
209
|
if page
|
214
210
|
@logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
|
215
|
-
@logger.info {"[worker ##{worker_number}] Page
|
211
|
+
@logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
|
216
212
|
end
|
217
213
|
|
218
214
|
incr_pages
|
@@ -261,7 +257,7 @@ module Polipus
|
|
261
257
|
self
|
262
258
|
end
|
263
259
|
|
264
|
-
# A block of code will be executed on every page
|
260
|
+
# A block of code will be executed on every page downloaded
|
265
261
|
# The block takes the page as argument
|
266
262
|
def on_page_downloaded(&block)
|
267
263
|
@on_page_downloaded << block
|
@@ -292,12 +288,7 @@ module Polipus
|
|
292
288
|
@options[:redis_options]
|
293
289
|
end
|
294
290
|
|
295
|
-
def overflow_adapter
|
296
|
-
@options[:overflow_adapter]
|
297
|
-
end
|
298
|
-
|
299
291
|
def queue_size
|
300
|
-
@internal_queue ||= queue_factory
|
301
292
|
@internal_queue.size
|
302
293
|
end
|
303
294
|
|
@@ -311,63 +302,58 @@ module Polipus
|
|
311
302
|
end
|
312
303
|
|
313
304
|
def url_tracker
|
314
|
-
|
315
|
-
@
|
316
|
-
|
317
|
-
|
305
|
+
@url_tracker ||=
|
306
|
+
@options[:url_tracker] ||=
|
307
|
+
UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}",
|
308
|
+
:redis => redis_factory_adapter,
|
309
|
+
:driver => 'lua')
|
318
310
|
end
|
319
311
|
|
320
312
|
def redis
|
321
|
-
|
322
|
-
@redis = redis_factory_adapter
|
323
|
-
end
|
324
|
-
@redis
|
313
|
+
@redis ||= redis_factory_adapter
|
325
314
|
end
|
326
315
|
|
316
|
+
# Enqueue an url, no matter what
|
327
317
|
def add_url url
|
328
|
-
@url_tracker.remove url.to_s
|
329
318
|
page = Page.new(url)
|
330
|
-
|
319
|
+
@internal_queue << page.to_json
|
331
320
|
end
|
332
321
|
|
333
322
|
# Request to Polipus to stop its work (gracefully)
|
334
323
|
# cler_queue = true if you want to delete all of the pending urls to visit
|
335
324
|
def stop!(cler_queue = false)
|
336
325
|
PolipusSignalHandler.terminate
|
337
|
-
|
326
|
+
@internal_queue.clear(true) if cler_queue
|
338
327
|
end
|
339
328
|
|
340
329
|
private
|
341
330
|
# URLs enqueue policy
|
342
331
|
def should_be_visited?(url, with_tracker = true)
|
343
|
-
|
332
|
+
case
|
344
333
|
# Check against whitelist pattern matching
|
345
|
-
|
346
|
-
|
347
|
-
end
|
348
|
-
|
334
|
+
when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
|
335
|
+
false
|
349
336
|
# Check against blacklist pattern matching
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
return true if page_expired?(Page.new(url))
|
356
|
-
|
337
|
+
when @skip_links_like.any?{ |p| url.path =~ p }
|
338
|
+
false
|
339
|
+
# Page is marked as expired
|
340
|
+
when page_expired?(Page.new(url))
|
341
|
+
true
|
357
342
|
# Check against url tracker
|
358
|
-
|
359
|
-
|
343
|
+
when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
|
344
|
+
false
|
345
|
+
else
|
346
|
+
true
|
360
347
|
end
|
361
|
-
true
|
362
348
|
end
|
363
349
|
|
364
350
|
# It extracts URLs from the page
|
365
351
|
def links_for page
|
366
352
|
page.domain_aliases = domain_aliases
|
367
|
-
|
368
|
-
links
|
353
|
+
@focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
|
369
354
|
end
|
370
355
|
|
356
|
+
# whether a page is expired or not
|
371
357
|
def page_expired? page
|
372
358
|
return false if @options[:ttl_page].nil?
|
373
359
|
stored_page = @storage.get(page)
|
@@ -376,6 +362,7 @@ module Polipus
|
|
376
362
|
r
|
377
363
|
end
|
378
364
|
|
365
|
+
# whether a page exists or not
|
379
366
|
def page_exists? page
|
380
367
|
return false if page.user_data && page.user_data.p_seeded
|
381
368
|
@storage.exists?(page) && !page_expired?(page)
|
@@ -392,10 +379,11 @@ module Polipus
|
|
392
379
|
|
393
380
|
# It creates a redis client
|
394
381
|
def redis_factory_adapter
|
395
|
-
|
396
|
-
|
382
|
+
if @redis_factory
|
383
|
+
@redis_factory.call(redis_options)
|
384
|
+
else
|
385
|
+
Redis.new(redis_options)
|
397
386
|
end
|
398
|
-
Redis.new(redis_options)
|
399
387
|
end
|
400
388
|
|
401
389
|
# It creates a new distributed queue
|
data/lib/polipus/http.rb
CHANGED
@@ -27,33 +27,33 @@ module Polipus
|
|
27
27
|
# including redirects
|
28
28
|
#
|
29
29
|
def fetch_pages(url, referer = nil, depth = nil)
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
body = gzip.read
|
38
|
-
end
|
39
|
-
pages << Page.new(location, :body => response.body.dup,
|
40
|
-
:code => code,
|
41
|
-
:headers => response.to_hash,
|
42
|
-
:referer => referer,
|
43
|
-
:depth => depth,
|
44
|
-
:redirect_to => redirect_to,
|
45
|
-
:response_time => response_time,
|
46
|
-
:fetched_at => Time.now.to_i)
|
30
|
+
url = URI(url)
|
31
|
+
pages = []
|
32
|
+
get(url, referer) do |response, code, location, redirect_to, response_time|
|
33
|
+
body = response.body.dup
|
34
|
+
if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
|
35
|
+
gzip = Zlib::GzipReader.new(StringIO.new(body))
|
36
|
+
body = gzip.read
|
47
37
|
end
|
38
|
+
|
39
|
+
pages << Page.new(location, :body => body,
|
40
|
+
:code => code,
|
41
|
+
:headers => response.to_hash,
|
42
|
+
:referer => referer,
|
43
|
+
:depth => depth,
|
44
|
+
:redirect_to => redirect_to,
|
45
|
+
:response_time => response_time,
|
46
|
+
:fetched_at => Time.now.to_i)
|
47
|
+
end
|
48
48
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
end
|
55
|
-
return [Page.new(url, :error => e)]
|
49
|
+
pages
|
50
|
+
rescue StandardError => e
|
51
|
+
if verbose?
|
52
|
+
puts e.inspect
|
53
|
+
puts e.backtrace
|
56
54
|
end
|
55
|
+
|
56
|
+
[Page.new(url, :error => e)]
|
57
57
|
end
|
58
58
|
|
59
59
|
#
|
@@ -154,6 +154,8 @@ module Polipus
|
|
154
154
|
opts['User-Agent'] = user_agent if user_agent
|
155
155
|
opts['Referer'] = referer.to_s if referer
|
156
156
|
opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
|
157
|
+
opts['Accept-Encoding'] = 'gzip'
|
158
|
+
|
157
159
|
|
158
160
|
retries = 0
|
159
161
|
begin
|
@@ -227,5 +229,9 @@ module Polipus
|
|
227
229
|
to_url.host.nil? || (to_url.host == from_url.host)
|
228
230
|
end
|
229
231
|
|
232
|
+
def gzip_enabled?
|
233
|
+
@opts[:gzip_enabled]
|
234
|
+
end
|
235
|
+
|
230
236
|
end
|
231
237
|
end
|
data/lib/polipus/version.rb
CHANGED
data/polipus.gemspec
CHANGED
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
|
|
40
40
|
s.add_development_dependency 'vcr', '~> 2.5', '>= 2.5.0'
|
41
41
|
s.add_development_dependency 'webmock', '>= 1.8.0', '< 1.12'
|
42
42
|
s.add_development_dependency 'flexmock', '~> 1.3', '>= 1.3.2'
|
43
|
+
s.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
|
44
|
+
s.add_development_dependency 'coveralls'
|
43
45
|
|
44
46
|
|
45
47
|
end
|
@@ -0,0 +1,137 @@
|
|
1
|
+
---
|
2
|
+
http_interactions:
|
3
|
+
- request:
|
4
|
+
method: get
|
5
|
+
uri: http://www.whatsmyip.org/http-compression-test/
|
6
|
+
body:
|
7
|
+
encoding: US-ASCII
|
8
|
+
string: ''
|
9
|
+
headers:
|
10
|
+
Accept-Encoding:
|
11
|
+
- gzip
|
12
|
+
Accept:
|
13
|
+
- "*/*"
|
14
|
+
User-Agent:
|
15
|
+
- Ruby
|
16
|
+
response:
|
17
|
+
status:
|
18
|
+
code: 200
|
19
|
+
message: OK
|
20
|
+
headers:
|
21
|
+
Date:
|
22
|
+
- Mon, 19 May 2014 22:57:51 GMT
|
23
|
+
Server:
|
24
|
+
- Apache/2.2.24 (Unix) DAV/2 mod_fastcgi/2.4.2 mod_ssl/2.2.24 OpenSSL/0.9.8y
|
25
|
+
Cache-Control:
|
26
|
+
- max-age=1
|
27
|
+
Expires:
|
28
|
+
- Mon, 19 May 2014 22:57:52 GMT
|
29
|
+
X-Powered-By:
|
30
|
+
- Web Server Built Entirely Out of Legos
|
31
|
+
Vary:
|
32
|
+
- Accept-Encoding
|
33
|
+
Content-Encoding:
|
34
|
+
- gzip
|
35
|
+
Content-Length:
|
36
|
+
- '4153'
|
37
|
+
Content-Type:
|
38
|
+
- text/html
|
39
|
+
body:
|
40
|
+
encoding: ASCII-8BIT
|
41
|
+
string: !binary |-
|
42
|
+
H4sIAAAAAAAAA91ae3PbNrb/W/kUWDYbyRNTsr3pI7GlTuKmTXbz8MTO7e10
|
43
|
+
OhqIhETEJMElQMtK6+++vwOAL1luk/bOnTs3M45I4OC8z8HBAU/+9t3b04uf
|
44
|
+
zp6zxGTp7N69E/87OEkEj2f3BoMTI00qZj8m3OjXm5dnY1Wu2G/sxcXFGTtV
|
45
|
+
WVEKraXK2YXQ5mTigGlZJgxnUcJLLcw0qMwy/CZoJ3KeiWlwKTZrVcY6YJHK
|
46
|
+
jcgBuPooi32WqXjePsVimXIj9sGkKQDaEN1na7FgWpRXotxnvOBRAii1wNqE
|
47
|
+
53FKowZ80e8CA2sZm+QWF7HQUSkLA4QdRnYKKEqWCqPZRlUMNOVywwwUY9nw
|
48
|
+
K5nUbCFkvmoYFTGrNA2QRGzCvDh4qsX0j35ibEkRjdLJJwHMl0Rc5HyRdnGD
|
49
|
+
sTFzEmH8kpUinQYy4ysx12UUsKQUy2kwsSN6omEOMS7y1fYKXhSpCI2qoiSU
|
50
|
+
EenBLhzWC9sZWj3cpteuaEgt+VUNHjCzKYRna3LtCLQo3CRxICNORpiUWj+8
|
51
|
+
ztLA85ZC8hx6AR7yrmnQ98V35+fseyHimgNykicTQjJeE2C2kQUBTtq3Iik6
|
52
|
+
DHjGI60n6/V6nHGZj/FSs23EtZnYd8uONptU6EQIczcKo1SqPwOH878u7Ad+
|
53
|
+
xd1owGBI4P7Q4e6DDmYnEzf/OQg+/LsS5eZPL9eFzHNRfuZ6ghhc8ZLdh8NW
|
54
|
+
KaJnyoLguBldCTOvytSO0iD93R8tqzwiZxjt0fvgV/v//VHwxepjEeyNeRyf
|
55
|
+
plzrUWASqecFPCvYszgHHUhZEGaAL1VU6RGLUsHzuShLVbKd0JQsAK6rRSZN
|
56
|
+
y8N94bjwbPSw73/hoLGsKFUxGsZSI0hFPETuKSvh6QyyzbwURcojcW7gzKPa
|
57
|
+
T8mofT+libAT4CFxNanFa4k7dRK7qYzFe5Becm1Adkt3DduDjvyjGt1ALtmI
|
58
|
+
benriqejPTadkk3Y3sBDNngGJinV2iMK3ij2/t0r9hzpr0QYNogHN/5BpFrc
|
59
|
+
QtFQ1BHPc+S0WpDv1DqvRbklSQfB4P64UNp0ePfW48aAK24XBnv77FdWS/Zk
|
60
|
+
t5w3HZXdj7nhLbUOuUHHfy3UWCNlmVHwW0fkwWAy4dh3jEfUmbhT4NZyO8Tt
|
61
|
+
cTDQa2mihI1qVn4++KUL2oMdRFwLFlgjBU+6EwO44WbO51pVeTwKdB7Pl1ym
|
62
|
+
PTEGuxyNioNRcCKzlU0Kzfaw+uh8YayvVkOGhD0d2vfh7EQXPEfcIVSnQ4sM
|
63
|
+
E8NZ8LCR4PCXhwGSCcBmuzygFHO76Ym5zIvK6C0eF6Xgl+3QbfHbTfgPdRCr
|
64
|
+
XPwlHWyE7mhgUxW75AeQlX+HIz4MqHg4bTj2ajkpZu/zTjFxBnrsXH4UT9iW
|
65
|
+
Htm/np0sytnpH8EetbDn2Kfzle5N/wPTf/8TxvjLKY49ZFDHnOu53agQmp9r
|
66
|
+
7ir/3zR4rjr2zlUhdhk8V79v7zdvL3ba/H/UzGeKalPJU/b/xuColjm4/z+W
|
67
|
+
2Z7m7H1+mUOH7LmtM15wzd5GUVWWjXE/X9M3nZcbTN3rvDXPYxJ1u3Cy/zrb
|
68
|
+
Qm/nfsreCIMT2KXnteazp6+bzsvN7c29GfLVCc4XpbiCs33n7NMUGjfdiouK
|
69
|
+
jiVHXXB3YeWjpK4Pb5U//VqtoXKvQ4v+an2wruT3tSkRBd3a8vc9Z4uxtvy0
|
70
|
+
1nJYO7BjOzDP9Kr2rZriLcv7Rdv2rwW6uSVHr4i7VRw3LJYiU1fi07nslyJd
|
71
|
+
6n3ytxm9g4ffK4yd6WsaPfzbOeV+LtbW/h0y5D102FBLtpZ5rNZjHAKMwtGm
|
72
|
+
qHRi19nidVhjHdZutg3fI8V+RSU4BHs1TWQjdnOHGfp5a4s55rgbeWoLo7gv
|
73
|
+
p+v1KKv7nu+OQJ0Fo11bRu3mpTBVmbPgW8xNKY3S+n4IkFo7p7STie/o4HGh
|
74
|
+
4o09uNGQKO0J7YTXZ37ktlt5MFWr7l73Cq9DJuPpkCawYMJnhCOWVxYZMB/O
|
75
|
+
7mgRYcbSm3hgxxhxcc9hqJNqpNIqy/XQgee8maAzMI2eJEczn8CoJ3JBp25g
|
76
|
+
O7JYuWUvQ6qu5cKzCGW+VCFfqMqEG1VB1tcYZS8xyp7SKPtJVSRNi6IoTYOi
|
77
|
+
UKUJbf0uSqw9wys7d6+6v8qUUbPKlHCxEsgF1lw0L/0FONc2C3bvlsNbGt1i
|
78
|
+
NF+1jEIhxCB++kDrRDVAeJY6jHMdpkpdVgUW/Pji7ctz9oBnxTH77s15f22Z
|
79
|
+
X7ZrxYL6UmFRhiXPLx25H90ge+dGtlSyEi1pWYR4BV3f8hnOXp6xV/6tvyxB
|
80
|
+
xugpxrmLrvXxwr26ReQSF+LasHeCWmnxTqfQSWtRnZBJETxhxqNE5mSicxqz
|
81
|
+
Z9rXbqzPUJSUHYayNKQuJ46bNU+vX7HTZmTLKzLRUrYbQmhkBuNC4TB4foWT
|
82
|
+
o/WsczvJjGIX9fyWVnTSMsF1AnXCCTmSGvGAAfZDPbDFvW5ZoFZNSIUziZCv
|
83
|
+
LGWrvVMqpk/d2JYPrK6b5aVYVSkvQ3Hdc1SL5p1YYdg3TfsoRB5tKwFDCkYM
|
84
|
+
Y2F/WwU8p4lbTlysW5eA98UqCwtkBuol9/Rw5gfv0kUVt45QFRobYIjsS1JA
|
85
|
+
L8tUFoXl5b2bok2bkXq2bBqVfYUaFZIUiAs4c61RWPIUgwgNDLa++tYkotzp
|
86
|
+
pKlcNGjxDDyv5KLk5WY7Jm+pog7N1qHf2Qnmw3PLJ9eiTTxLqTTWF+Gl2MAx
|
87
|
+
0gj2dar8HjPsx+dn7F9is4WgyjtazKXpOfJ7DEByP9BfqE0nDrEDY0+WIBpy
|
88
|
+
KkN5tLFu0Ayzp354yxeu0xqJP30U8lqk283fYXuDwc4I4GnsxDiZYGdxW0xv
|
89
|
+
77HN/KHfzTozPF6oa7v7+K4n15s88k3SCTUieXw0Xim1SgVmYt/UHiOd+1lq
|
90
|
+
pfJYLzYOaLudeiJz7YkFHTDqjTLbOJ4GqKSoXH0i8xQGDhdIo5fH9nLjyddH
|
91
|
+
3xTXx4mQq8Q8eXxQXNt11IsKeRxGqbSXGxEPi2oRHj36+vDx198cPPry4KvH
|
92
|
+
h0ePerA6VYD8+sujg8OvDh89/oaYBGut5HgadRhsq5fu4G+/sZ9/2bN12ehX
|
93
|
+
W5a3ojYlwNb2LnPpFd/TvFYRDrDhojIGmSZRKTJFDfcpgCD9tzBk32P/fYb9
|
94
|
+
jr2Sl4KFIQlEi+FMwXIRlkoZEtWxVovaHqvifab3AbzHULlRA/sDXpcfqCsY
|
95
|
+
j3FWeZ6KDDrWzzYXfPWGZ2Kk934++AWCU2HYBXm2eRmPgAiYXDl3jMrNI4pw
|
96
|
+
7jXCAwLDMciMZYw5Gdtn53VT22I+pmVjeCD10CcT+G4uIoPjYCQWEHScCzMR
|
97
|
+
+fz9+YSnKdzti+vlIkunhw94UbyMp0dfPTo6eAQrPz768uDwEbXmIc+44CVo
|
98
|
+
v0HSGsPsCN9nYolaaeTlJVPejGIVVcQi6manKVTPw5pw+EHr+HKIorXr3h1d
|
99
|
+
OwsFtbdjJIVNAueCvducT+8o2LVa5ESBThl+BOGCmmsaOJLzCMc8UwMnah0S
|
100
|
+
z2CB1OmHl4oiBXkr5jlvPcJ70cVaGrqRu1gLYbwX8c4NlAbTxsHY0LdXb42k
|
101
|
+
fia0o40aLFlb0P85qa1QWKxK+RHM8zSYWfb8dvMptzwT5BUDM2fjLvNILXBb
|
102
|
+
vZ2pvCp+cJH+8LATTF7OVViklaY2m1e1/AjamYhllf0lM9fm+COxKEe1HREK
|
103
|
+
WRuzhaIo8767FWy1IyPmCjUmtIAdbiEe2smtIBxgyEXhsHYBXkjtNwOrSaR+
|
104
|
+
rxEoc9hcgukuOzuTSMMV5RJ2d3QWCvnJxubeqJ9tfbp12bLNvK3bQqsvVGaL
|
105
|
+
XTZhzfCYxncfPAiR3T3v3domixInx3JT52dPSgtriSZpo+KhMnvHIfGogSlm
|
106
|
+
FzitM7pTZVEiokvtrsbdtT8VVloIhuQq7d07hT5VjnSW6zTN7YUNe2lqFFBW
|
107
|
+
9xKdXUl+11cHgOT5xtMLqRRkKSrjinqu9vY/VkI31/+9m3l7JwYgwXgc0yhT
|
108
|
+
S8aZLkQklzJiVBAQ/qVEAEESy9y4brHVGqB4tCmz7Xsxd701DTrk7Iy922aZ
|
109
|
+
MImiBQKgubriqYQCRK1TqjEK6KfBSR0GH0H20X0Y0U65uP3qAJj5dSrylUmm
|
110
|
+
wdGXj5qE5hpM9iVgtqXitt1p4FTA7XGKjjPE/Yx1GPBdIk++fnMc1G8QoMKr
|
111
|
+
W9wI0Uk0bSdr1vVt+Ccpr37r1gqN7M11XNv2aD+Y8Dfe+mrViOrHAtsJCR4k
|
112
|
+
IsVB4RgynZLxgGg8Hvd56JHzLecemwjSOjDuDpRnpVrDB3vBQs8oLxcylWaz
|
113
|
+
FTVv83TDFm6Rdo5ain9X9GlJJy5qt13LNGVwF0TRGAcFBqVAQO1dshNlNvY8
|
114
|
+
Woo3rm3zpRtPLdpxy87tOi3wWGzMzS2djnH/2gXbTwJVyk/E6rOW1XdOfGK3
|
115
|
+
c3dy6lj1vfnabXYa5w9SWY3UEW7w9qzSWA9MWFXWBtD8Clmk+UoJKSdmErCK
|
116
|
+
hpHrY0okMkM+QurHgYhO3gwuEIsridIFBsQi1OxrJiniUPMxXwiCSz1mzyjc
|
117
|
+
jLNpKeKK1qSKx4Sjk09h/ISSkfRpzXAUyRrbAuMZlReUwMjOFUV1odYuA9eW
|
118
|
+
t4lMM9ttF2yZbvbtp1IkG8uqKGHUg/PLFhuW2LuoXjqPlfUmYREn1ERicFyV
|
119
|
+
R8Jmb2IHI+DcsJQIkkqIA4PDr16K0nNg3R1bJNMZal4S66cOEdpOcO6mHXOf
|
120
|
+
0FN6X8krkXsNU4PR4M+yAjU7dlQBgJ5OLwgJtAkSG0KaK2b7B3aPAFPN51pW
|
121
|
+
gTaNQ312M+1uE92dTrAFReiabxoEtMtI3f/ujT4y2/jPyjqbFe0l9S5mRbZx
|
122
|
+
XGlCITLCyCujMm4P1OC5sRveGHWu9tnp+fk++ycKnXN/vIUf/vfrV01Es6cY
|
123
|
+
oAsedv5fPzAXm/tsnUhYlxTOGzmvsxQ2W/bMa5VkPXMpVxX8sOc9nsa+JUqO
|
124
|
+
QwgrrwNKRdSQsrbddwa0CYgSlftEDCa0XqIo9dkvysgVlJcMZbZFalNbTI4P
|
125
|
+
90CIATkdo4nlsxdn1pkSolfQiGW+Ls0gKGZThBYB2i/3CNS2/Fqf79jznNBv
|
126
|
+
LDVrQ0tnBwFC1xCpbzTp3nYrvc2psTScPUjN8bf0naM2yAWjoPvFY4C69dsH
|
127
|
+
K3PsE1oHGTSTcKuQBTRnq0YYYGELRxsDnU8YUT4hxjFPCh06q2GtWKJsoR4C
|
128
|
+
vfSqpIXLL0ivcCVqySMlYWu0pnS5jYNyTiL3kxxSHHKRVj6djHdm3abcd/WQ
|
129
|
+
MvbW4qRK/Td4sweRKjbHRwcHh+ERDrEWDsO9M9nOAwbJjGIKRe6pf7L3AHWT
|
130
|
+
aQcOWVAXzS4+fLSkyoWXK/rIdb5AXYg97FyucvYWZn35nJ0JI7s97Tt4as7q
|
131
|
+
9rhVMxjsuIlpINuNcPT9nruCaYr2uhXWPcQ5tndd7tRQHYwXHiMdIEjJVEd5
|
132
|
+
pTf/TngVS+VquPprClhT0K4yDSjRdLZzDTePhK+u7CWrntCKsVo1H4hafBMa
|
133
|
+
+IR1WfGP/joaaMs+O9a89jm1F7ufxan7BuBPsOoWfg6vjbPbSqPpQ03cnd2J
|
134
|
+
vXGY/QfAuyd8uC0AAA==
|
135
|
+
http_version:
|
136
|
+
recorded_at: Mon, 19 May 2014 22:57:52 GMT
|
137
|
+
recorded_with: VCR 2.5.0
|
data/spec/http_spec.rb
CHANGED
@@ -6,7 +6,6 @@ require "polipus/page"
|
|
6
6
|
describe Polipus::HTTP do
|
7
7
|
|
8
8
|
it 'should download a page' do
|
9
|
-
|
10
9
|
VCR.use_cassette('http_test') do
|
11
10
|
http = Polipus::HTTP.new
|
12
11
|
page = http.fetch_page("http://sfbay.craigslist.org/apa/")
|
@@ -30,6 +29,7 @@ describe Polipus::HTTP do
|
|
30
29
|
end
|
31
30
|
|
32
31
|
describe 'proxy settings' do
|
32
|
+
|
33
33
|
it 'should set proxy correctly using a procedure' do
|
34
34
|
http = Polipus::HTTP.new({proxy_host: -> con { "127.0.0.0" }, proxy_port: -> con { 8080 }})
|
35
35
|
http.proxy_host.should eq "127.0.0.0"
|
@@ -49,10 +49,25 @@ describe Polipus::HTTP do
|
|
49
49
|
http.proxy_host.should eq "127.0.0.0"
|
50
50
|
end
|
51
51
|
|
52
|
+
end
|
53
|
+
|
54
|
+
|
55
|
+
describe 'gzipped content handling' do
|
56
|
+
|
57
|
+
it 'should decode gzip content' do
|
58
|
+
VCR.use_cassette('gzipped_on') do
|
59
|
+
http = Polipus::HTTP.new(gzip_enabled: true, logger: Logger.new(STDOUT))
|
60
|
+
page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
|
61
|
+
page.doc.css('.gzip_yes').should_not be_empty
|
62
|
+
end
|
63
|
+
end
|
64
|
+
|
65
|
+
|
66
|
+
end
|
67
|
+
|
52
68
|
describe 'staled connections' do
|
53
69
|
|
54
70
|
it 'should refresh a staled connection' do
|
55
|
-
|
56
71
|
VCR.use_cassette('http_tconnection_max_hits') do
|
57
72
|
http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
|
58
73
|
http.class.__send__(:attr_reader, :connections)
|
@@ -65,11 +80,8 @@ describe Polipus::HTTP do
|
|
65
80
|
http.fetch_page("https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html")
|
66
81
|
http.connections_hits['www.yahoo.com'][443].should be 1
|
67
82
|
http.connections['www.yahoo.com'][443].should_not be old_conn
|
68
|
-
|
69
83
|
end
|
70
84
|
end
|
71
85
|
end
|
72
86
|
|
73
|
-
end
|
74
|
-
|
75
87
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -5,6 +5,10 @@
|
|
5
5
|
#
|
6
6
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
7
7
|
require "digest/md5"
|
8
|
+
require "coveralls"
|
9
|
+
|
10
|
+
Coveralls.wear!
|
11
|
+
|
8
12
|
RSpec.configure do |config|
|
9
13
|
config.treat_symbols_as_metadata_keys_with_true_values = true
|
10
14
|
config.run_all_when_everything_filtered = true
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: polipus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Francesco Laurita
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: redis-bloomfilter
|
@@ -270,6 +270,40 @@ dependencies:
|
|
270
270
|
- - ! '>='
|
271
271
|
- !ruby/object:Gem::Version
|
272
272
|
version: 1.3.2
|
273
|
+
- !ruby/object:Gem::Dependency
|
274
|
+
name: rake
|
275
|
+
requirement: !ruby/object:Gem::Requirement
|
276
|
+
requirements:
|
277
|
+
- - ~>
|
278
|
+
- !ruby/object:Gem::Version
|
279
|
+
version: '10.3'
|
280
|
+
- - ! '>='
|
281
|
+
- !ruby/object:Gem::Version
|
282
|
+
version: 10.3.2
|
283
|
+
type: :development
|
284
|
+
prerelease: false
|
285
|
+
version_requirements: !ruby/object:Gem::Requirement
|
286
|
+
requirements:
|
287
|
+
- - ~>
|
288
|
+
- !ruby/object:Gem::Version
|
289
|
+
version: '10.3'
|
290
|
+
- - ! '>='
|
291
|
+
- !ruby/object:Gem::Version
|
292
|
+
version: 10.3.2
|
293
|
+
- !ruby/object:Gem::Dependency
|
294
|
+
name: coveralls
|
295
|
+
requirement: !ruby/object:Gem::Requirement
|
296
|
+
requirements:
|
297
|
+
- - ! '>='
|
298
|
+
- !ruby/object:Gem::Version
|
299
|
+
version: '0'
|
300
|
+
type: :development
|
301
|
+
prerelease: false
|
302
|
+
version_requirements: !ruby/object:Gem::Requirement
|
303
|
+
requirements:
|
304
|
+
- - ! '>='
|
305
|
+
- !ruby/object:Gem::Version
|
306
|
+
version: '0'
|
273
307
|
description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
|
274
308
|
\ "
|
275
309
|
email:
|
@@ -281,6 +315,8 @@ files:
|
|
281
315
|
- .document
|
282
316
|
- .gitignore
|
283
317
|
- .rspec
|
318
|
+
- .travis.yml
|
319
|
+
- AUTHORS
|
284
320
|
- Gemfile
|
285
321
|
- LICENSE.txt
|
286
322
|
- README.md
|
@@ -328,6 +364,7 @@ files:
|
|
328
364
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
329
365
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
330
366
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
367
|
+
- spec/cassettes/gzipped_on.yml
|
331
368
|
- spec/cassettes/http_tconnection_max_hits.yml
|
332
369
|
- spec/cassettes/http_test.yml
|
333
370
|
- spec/cassettes/http_test_redirect.yml
|
@@ -382,6 +419,7 @@ test_files:
|
|
382
419
|
- spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
|
383
420
|
- spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
|
384
421
|
- spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
|
422
|
+
- spec/cassettes/gzipped_on.yml
|
385
423
|
- spec/cassettes/http_tconnection_max_hits.yml
|
386
424
|
- spec/cassettes/http_test.yml
|
387
425
|
- spec/cassettes/http_test_redirect.yml
|