polipus 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MDlmZGJiYmY1MWQ1NWFiMDMxZGRmYzBmNGQ4OGU4ZGFmOTgxN2M5Mg==
4
+ Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
5
5
  data.tar.gz: !binary |-
6
- Y2NjYmI5N2Y1NDA4NjVlNGQ2YWNhMWUxYjkyODA5NjBkOWVmOTljZQ==
6
+ NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MmQ1ZDc2M2RiYmVmZjg0NDVlYjYyNTYzOWY4M2NjMGQ1MDZjMGQwZWYwNjJj
10
- OWMzNjY4ODI3ZTA5ZmEzM2U0YTZiNWQxOWI2YzkyNDAzM2Q5YzIyZDFiNmE5
11
- MzQ0YzM3ZDgwNTk5N2ZjNzMxOTY3OTllNWFjNDI3YjQ0NGVhODg=
9
+ YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
10
+ MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
11
+ MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
12
12
  data.tar.gz: !binary |-
13
- ZTI1NWUzNzQwNTA2NTU4Mjk5OTVhNmI3NDlkNDFmOTRmN2JmZDhhZDRhYWZl
14
- Y2VkM2ZkNDZjZWYyOTk0NTgzZTIxOTA0ZGEzMjc1MGNmOGFkYTgxZjc4ZWJj
15
- ZGVhY2FiNDk3MDYwNzllMGNiYTI2NDMwYTAyMzY4NTczYmJjNGQ=
13
+ ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
14
+ MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
15
+ NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+ rvm:
3
+ - jruby
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - 2.1.1
7
+
8
+ services:
9
+ - mongodb
10
+ - redis
11
+
data/AUTHORS ADDED
@@ -0,0 +1,2 @@
1
+ Francesco Laurita <francesco.laurita@gmail.com>
2
+ Tobias L. Maier <http://tobiasmaier.info/>
data/README.md CHANGED
@@ -1,3 +1,9 @@
1
+ [![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
2
+ [![Coverage Status](https://coveralls.io/repos/taganaka/polipus/badge.png?branch=master)](https://coveralls.io/r/taganaka/polipus?branch=master)
3
+ [![Code Climate](https://codeclimate.com/github/taganaka/polipus.png)](https://codeclimate.com/github/taganaka/polipus)
4
+ [![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
5
+
6
+
1
7
  # Polipus #
2
8
 
3
9
  A distributed web crawler written in ruby, backed by Redis
data/lib/polipus.rb CHANGED
@@ -15,7 +15,7 @@ require "singleton"
15
15
 
16
16
  module Polipus
17
17
 
18
- def Polipus.crawler(job_name = 'polipus', urls = [], options = {}, &block)
18
+ def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
19
19
  PolipusCrawler.crawl(job_name, urls, options, &block)
20
20
  end
21
21
 
@@ -75,7 +75,6 @@ module Polipus
75
75
  attr_reader :storage
76
76
  attr_reader :job_name
77
77
  attr_reader :logger
78
- attr_reader :overflow_adapter
79
78
  attr_reader :options
80
79
  attr_reader :crawler_name
81
80
 
@@ -89,7 +88,7 @@ module Polipus
89
88
  end
90
89
  end
91
90
 
92
- def initialize(job_name = 'polipus',urls = [], options = {})
91
+ def initialize(job_name = 'polipus', urls = [], options = {})
93
92
 
94
93
  @job_name = job_name
95
94
  @options = OPTS.merge(options)
@@ -121,22 +120,19 @@ module Polipus
121
120
 
122
121
  @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
123
122
 
124
- @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
123
+ @urls = [urls].flatten.map{ |url| URI(url) }
125
124
  @urls.each{ |url| url.path = '/' if url.path.empty? }
125
+
126
+ @internal_queue = queue_factory
127
+
126
128
  execute_plugin 'on_initialize'
127
129
 
128
130
  yield self if block_given?
129
131
 
130
132
  end
131
133
 
132
- def self.crawl(job_name, urls, opts = {})
133
-
134
- self.new(job_name, urls, opts) do |polipus|
135
- yield polipus if block_given?
136
-
137
- polipus.takeover
138
- end
139
-
134
+ def self.crawl(*args, &block)
135
+ new(*args, &block).takeover
140
136
  end
141
137
 
142
138
  def takeover
@@ -167,13 +163,13 @@ module Polipus
167
163
  page = Page.from_json message
168
164
 
169
165
  unless should_be_visited?(page.url, false)
170
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] is no more welcome."}
166
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) is no more welcome."}
171
167
  queue.commit
172
168
  next
173
169
  end
174
170
 
175
171
  if page_exists? page
176
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
172
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
177
173
  queue.commit
178
174
  next
179
175
  end
@@ -190,7 +186,7 @@ module Polipus
190
186
  page = pages.pop
191
187
  page.aliases = pages.collect { |e| e.url }
192
188
  if page_exists? page
193
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
189
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
194
190
  queue.commit
195
191
  next
196
192
  end
@@ -212,7 +208,7 @@ module Polipus
212
208
 
213
209
  if page
214
210
  @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
215
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
211
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
216
212
  end
217
213
 
218
214
  incr_pages
@@ -261,7 +257,7 @@ module Polipus
261
257
  self
262
258
  end
263
259
 
264
- # A block of code will be executed on every page dowloaded
260
+ # A block of code will be executed on every page downloaded
265
261
  # The block takes the page as argument
266
262
  def on_page_downloaded(&block)
267
263
  @on_page_downloaded << block
@@ -292,12 +288,7 @@ module Polipus
292
288
  @options[:redis_options]
293
289
  end
294
290
 
295
- def overflow_adapter
296
- @options[:overflow_adapter]
297
- end
298
-
299
291
  def queue_size
300
- @internal_queue ||= queue_factory
301
292
  @internal_queue.size
302
293
  end
303
294
 
@@ -311,63 +302,58 @@ module Polipus
311
302
  end
312
303
 
313
304
  def url_tracker
314
- if @url_tracker.nil?
315
- @url_tracker = @options[:url_tracker] ||= UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}", :redis => redis_factory_adapter, :driver => 'lua')
316
- end
317
- @url_tracker
305
+ @url_tracker ||=
306
+ @options[:url_tracker] ||=
307
+ UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}",
308
+ :redis => redis_factory_adapter,
309
+ :driver => 'lua')
318
310
  end
319
311
 
320
312
  def redis
321
- if @redis.nil?
322
- @redis = redis_factory_adapter
323
- end
324
- @redis
313
+ @redis ||= redis_factory_adapter
325
314
  end
326
315
 
316
+ # Enqueue an url, no matter what
327
317
  def add_url url
328
- @url_tracker.remove url.to_s
329
318
  page = Page.new(url)
330
- queue_factory << page.to_json
319
+ @internal_queue << page.to_json
331
320
  end
332
321
 
333
322
  # Request to Polipus to stop its work (gracefully)
334
323
  # cler_queue = true if you want to delete all of the pending urls to visit
335
324
  def stop!(cler_queue = false)
336
325
  PolipusSignalHandler.terminate
337
- queue_factory.clear(true) if cler_queue
326
+ @internal_queue.clear(true) if cler_queue
338
327
  end
339
328
 
340
329
  private
341
330
  # URLs enqueue policy
342
331
  def should_be_visited?(url, with_tracker = true)
343
-
332
+ case
344
333
  # Check against whitelist pattern matching
345
- unless @follow_links_like.empty?
346
- return false unless @follow_links_like.any?{|p| url.path =~ p}
347
- end
348
-
334
+ when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
335
+ false
349
336
  # Check against blacklist pattern matching
350
- unless @skip_links_like.empty?
351
- return false if @skip_links_like.any?{|p| url.path =~ p}
352
- end
353
-
354
- #Page is marked as expired
355
- return true if page_expired?(Page.new(url))
356
-
337
+ when @skip_links_like.any?{ |p| url.path =~ p }
338
+ false
339
+ # Page is marked as expired
340
+ when page_expired?(Page.new(url))
341
+ true
357
342
  # Check against url tracker
358
- if with_tracker
359
- return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
343
+ when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
344
+ false
345
+ else
346
+ true
360
347
  end
361
- true
362
348
  end
363
349
 
364
350
  # It extracts URLs from the page
365
351
  def links_for page
366
352
  page.domain_aliases = domain_aliases
367
- links = @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
368
- links
353
+ @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
369
354
  end
370
355
 
356
+ # whether a page is expired or not
371
357
  def page_expired? page
372
358
  return false if @options[:ttl_page].nil?
373
359
  stored_page = @storage.get(page)
@@ -376,6 +362,7 @@ module Polipus
376
362
  r
377
363
  end
378
364
 
365
+ # whether a page exists or not
379
366
  def page_exists? page
380
367
  return false if page.user_data && page.user_data.p_seeded
381
368
  @storage.exists?(page) && !page_expired?(page)
@@ -392,10 +379,11 @@ module Polipus
392
379
 
393
380
  # It creates a redis client
394
381
  def redis_factory_adapter
395
- unless @redis_factory.nil?
396
- return @redis_factory.call(redis_options)
382
+ if @redis_factory
383
+ @redis_factory.call(redis_options)
384
+ else
385
+ Redis.new(redis_options)
397
386
  end
398
- Redis.new(redis_options)
399
387
  end
400
388
 
401
389
  # It creates a new distributed queue
data/lib/polipus/http.rb CHANGED
@@ -27,33 +27,33 @@ module Polipus
27
27
  # including redirects
28
28
  #
29
29
  def fetch_pages(url, referer = nil, depth = nil)
30
- begin
31
- url = URI(url) unless url.is_a?(URI)
32
- pages = []
33
- get(url, referer) do |response, code, location, redirect_to, response_time|
34
- body = response.body.dup
35
- if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
36
- gzip = Zlib::GzipReader.new(StringIO.new(body))
37
- body = gzip.read
38
- end
39
- pages << Page.new(location, :body => response.body.dup,
40
- :code => code,
41
- :headers => response.to_hash,
42
- :referer => referer,
43
- :depth => depth,
44
- :redirect_to => redirect_to,
45
- :response_time => response_time,
46
- :fetched_at => Time.now.to_i)
30
+ url = URI(url)
31
+ pages = []
32
+ get(url, referer) do |response, code, location, redirect_to, response_time|
33
+ body = response.body.dup
34
+ if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
35
+ gzip = Zlib::GzipReader.new(StringIO.new(body))
36
+ body = gzip.read
47
37
  end
38
+
39
+ pages << Page.new(location, :body => body,
40
+ :code => code,
41
+ :headers => response.to_hash,
42
+ :referer => referer,
43
+ :depth => depth,
44
+ :redirect_to => redirect_to,
45
+ :response_time => response_time,
46
+ :fetched_at => Time.now.to_i)
47
+ end
48
48
 
49
- return pages
50
- rescue StandardError => e
51
- if verbose?
52
- puts e.inspect
53
- puts e.backtrace
54
- end
55
- return [Page.new(url, :error => e)]
49
+ pages
50
+ rescue StandardError => e
51
+ if verbose?
52
+ puts e.inspect
53
+ puts e.backtrace
56
54
  end
55
+
56
+ [Page.new(url, :error => e)]
57
57
  end
58
58
 
59
59
  #
@@ -154,6 +154,8 @@ module Polipus
154
154
  opts['User-Agent'] = user_agent if user_agent
155
155
  opts['Referer'] = referer.to_s if referer
156
156
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
157
+ opts['Accept-Encoding'] = 'gzip'
158
+
157
159
 
158
160
  retries = 0
159
161
  begin
@@ -227,5 +229,9 @@ module Polipus
227
229
  to_url.host.nil? || (to_url.host == from_url.host)
228
230
  end
229
231
 
232
+ def gzip_enabled?
233
+ @opts[:gzip_enabled]
234
+ end
235
+
230
236
  end
231
237
  end
@@ -1,4 +1,4 @@
1
1
  module Polipus
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  HOMEPAGE = "https://github.com/taganaka/polipus"
4
4
  end
data/polipus.gemspec CHANGED
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
40
40
  s.add_development_dependency 'vcr', '~> 2.5', '>= 2.5.0'
41
41
  s.add_development_dependency 'webmock', '>= 1.8.0', '< 1.12'
42
42
  s.add_development_dependency 'flexmock', '~> 1.3', '>= 1.3.2'
43
+ s.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
44
+ s.add_development_dependency 'coveralls'
43
45
 
44
46
 
45
47
  end
@@ -0,0 +1,137 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://www.whatsmyip.org/http-compression-test/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip
12
+ Accept:
13
+ - "*/*"
14
+ User-Agent:
15
+ - Ruby
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Date:
22
+ - Mon, 19 May 2014 22:57:51 GMT
23
+ Server:
24
+ - Apache/2.2.24 (Unix) DAV/2 mod_fastcgi/2.4.2 mod_ssl/2.2.24 OpenSSL/0.9.8y
25
+ Cache-Control:
26
+ - max-age=1
27
+ Expires:
28
+ - Mon, 19 May 2014 22:57:52 GMT
29
+ X-Powered-By:
30
+ - Web Server Built Entirely Out of Legos
31
+ Vary:
32
+ - Accept-Encoding
33
+ Content-Encoding:
34
+ - gzip
35
+ Content-Length:
36
+ - '4153'
37
+ Content-Type:
38
+ - text/html
39
+ body:
40
+ encoding: ASCII-8BIT
41
+ string: !binary |-
42
+ H4sIAAAAAAAAA91ae3PbNrb/W/kUWDYbyRNTsr3pI7GlTuKmTXbz8MTO7e10
43
+ OhqIhETEJMElQMtK6+++vwOAL1luk/bOnTs3M45I4OC8z8HBAU/+9t3b04uf
44
+ zp6zxGTp7N69E/87OEkEj2f3BoMTI00qZj8m3OjXm5dnY1Wu2G/sxcXFGTtV
45
+ WVEKraXK2YXQ5mTigGlZJgxnUcJLLcw0qMwy/CZoJ3KeiWlwKTZrVcY6YJHK
46
+ jcgBuPooi32WqXjePsVimXIj9sGkKQDaEN1na7FgWpRXotxnvOBRAii1wNqE
47
+ 53FKowZ80e8CA2sZm+QWF7HQUSkLA4QdRnYKKEqWCqPZRlUMNOVywwwUY9nw
48
+ K5nUbCFkvmoYFTGrNA2QRGzCvDh4qsX0j35ibEkRjdLJJwHMl0Rc5HyRdnGD
49
+ sTFzEmH8kpUinQYy4ysx12UUsKQUy2kwsSN6omEOMS7y1fYKXhSpCI2qoiSU
50
+ EenBLhzWC9sZWj3cpteuaEgt+VUNHjCzKYRna3LtCLQo3CRxICNORpiUWj+8
51
+ ztLA85ZC8hx6AR7yrmnQ98V35+fseyHimgNykicTQjJeE2C2kQUBTtq3Iik6
52
+ DHjGI60n6/V6nHGZj/FSs23EtZnYd8uONptU6EQIczcKo1SqPwOH878u7Ad+
53
+ xd1owGBI4P7Q4e6DDmYnEzf/OQg+/LsS5eZPL9eFzHNRfuZ6ghhc8ZLdh8NW
54
+ KaJnyoLguBldCTOvytSO0iD93R8tqzwiZxjt0fvgV/v//VHwxepjEeyNeRyf
55
+ plzrUWASqecFPCvYszgHHUhZEGaAL1VU6RGLUsHzuShLVbKd0JQsAK6rRSZN
56
+ y8N94bjwbPSw73/hoLGsKFUxGsZSI0hFPETuKSvh6QyyzbwURcojcW7gzKPa
57
+ T8mofT+libAT4CFxNanFa4k7dRK7qYzFe5Becm1Adkt3DduDjvyjGt1ALtmI
58
+ benriqejPTadkk3Y3sBDNngGJinV2iMK3ij2/t0r9hzpr0QYNogHN/5BpFrc
59
+ QtFQ1BHPc+S0WpDv1DqvRbklSQfB4P64UNp0ePfW48aAK24XBnv77FdWS/Zk
60
+ t5w3HZXdj7nhLbUOuUHHfy3UWCNlmVHwW0fkwWAy4dh3jEfUmbhT4NZyO8Tt
61
+ cTDQa2mihI1qVn4++KUL2oMdRFwLFlgjBU+6EwO44WbO51pVeTwKdB7Pl1ym
62
+ PTEGuxyNioNRcCKzlU0Kzfaw+uh8YayvVkOGhD0d2vfh7EQXPEfcIVSnQ4sM
63
+ E8NZ8LCR4PCXhwGSCcBmuzygFHO76Ym5zIvK6C0eF6Xgl+3QbfHbTfgPdRCr
64
+ XPwlHWyE7mhgUxW75AeQlX+HIz4MqHg4bTj2ajkpZu/zTjFxBnrsXH4UT9iW
65
+ Htm/np0sytnpH8EetbDn2Kfzle5N/wPTf/8TxvjLKY49ZFDHnOu53agQmp9r
66
+ 7ir/3zR4rjr2zlUhdhk8V79v7zdvL3ba/H/UzGeKalPJU/b/xuColjm4/z+W
67
+ 2Z7m7H1+mUOH7LmtM15wzd5GUVWWjXE/X9M3nZcbTN3rvDXPYxJ1u3Cy/zrb
68
+ Qm/nfsreCIMT2KXnteazp6+bzsvN7c29GfLVCc4XpbiCs33n7NMUGjfdiouK
69
+ jiVHXXB3YeWjpK4Pb5U//VqtoXKvQ4v+an2wruT3tSkRBd3a8vc9Z4uxtvy0
70
+ 1nJYO7BjOzDP9Kr2rZriLcv7Rdv2rwW6uSVHr4i7VRw3LJYiU1fi07nslyJd
71
+ 6n3ytxm9g4ffK4yd6WsaPfzbOeV+LtbW/h0y5D102FBLtpZ5rNZjHAKMwtGm
72
+ qHRi19nidVhjHdZutg3fI8V+RSU4BHs1TWQjdnOHGfp5a4s55rgbeWoLo7gv
73
+ p+v1KKv7nu+OQJ0Fo11bRu3mpTBVmbPgW8xNKY3S+n4IkFo7p7STie/o4HGh
74
+ 4o09uNGQKO0J7YTXZ37ktlt5MFWr7l73Cq9DJuPpkCawYMJnhCOWVxYZMB/O
75
+ 7mgRYcbSm3hgxxhxcc9hqJNqpNIqy/XQgee8maAzMI2eJEczn8CoJ3JBp25g
76
+ O7JYuWUvQ6qu5cKzCGW+VCFfqMqEG1VB1tcYZS8xyp7SKPtJVSRNi6IoTYOi
77
+ UKUJbf0uSqw9wys7d6+6v8qUUbPKlHCxEsgF1lw0L/0FONc2C3bvlsNbGt1i
78
+ NF+1jEIhxCB++kDrRDVAeJY6jHMdpkpdVgUW/Pji7ctz9oBnxTH77s15f22Z
79
+ X7ZrxYL6UmFRhiXPLx25H90ge+dGtlSyEi1pWYR4BV3f8hnOXp6xV/6tvyxB
80
+ xugpxrmLrvXxwr26ReQSF+LasHeCWmnxTqfQSWtRnZBJETxhxqNE5mSicxqz
81
+ Z9rXbqzPUJSUHYayNKQuJ46bNU+vX7HTZmTLKzLRUrYbQmhkBuNC4TB4foWT
82
+ o/WsczvJjGIX9fyWVnTSMsF1AnXCCTmSGvGAAfZDPbDFvW5ZoFZNSIUziZCv
83
+ LGWrvVMqpk/d2JYPrK6b5aVYVSkvQ3Hdc1SL5p1YYdg3TfsoRB5tKwFDCkYM
84
+ Y2F/WwU8p4lbTlysW5eA98UqCwtkBuol9/Rw5gfv0kUVt45QFRobYIjsS1JA
85
+ L8tUFoXl5b2bok2bkXq2bBqVfYUaFZIUiAs4c61RWPIUgwgNDLa++tYkotzp
86
+ pKlcNGjxDDyv5KLk5WY7Jm+pog7N1qHf2Qnmw3PLJ9eiTTxLqTTWF+Gl2MAx
87
+ 0gj2dar8HjPsx+dn7F9is4WgyjtazKXpOfJ7DEByP9BfqE0nDrEDY0+WIBpy
88
+ KkN5tLFu0Ayzp354yxeu0xqJP30U8lqk283fYXuDwc4I4GnsxDiZYGdxW0xv
89
+ 77HN/KHfzTozPF6oa7v7+K4n15s88k3SCTUieXw0Xim1SgVmYt/UHiOd+1lq
90
+ pfJYLzYOaLudeiJz7YkFHTDqjTLbOJ4GqKSoXH0i8xQGDhdIo5fH9nLjyddH
91
+ 3xTXx4mQq8Q8eXxQXNt11IsKeRxGqbSXGxEPi2oRHj36+vDx198cPPry4KvH
92
+ h0ePerA6VYD8+sujg8OvDh89/oaYBGut5HgadRhsq5fu4G+/sZ9/2bN12ehX
93
+ W5a3ojYlwNb2LnPpFd/TvFYRDrDhojIGmSZRKTJFDfcpgCD9tzBk32P/fYb9
94
+ jr2Sl4KFIQlEi+FMwXIRlkoZEtWxVovaHqvifab3AbzHULlRA/sDXpcfqCsY
95
+ j3FWeZ6KDDrWzzYXfPWGZ2Kk934++AWCU2HYBXm2eRmPgAiYXDl3jMrNI4pw
96
+ 7jXCAwLDMciMZYw5Gdtn53VT22I+pmVjeCD10CcT+G4uIoPjYCQWEHScCzMR
97
+ +fz9+YSnKdzti+vlIkunhw94UbyMp0dfPTo6eAQrPz768uDwEbXmIc+44CVo
98
+ v0HSGsPsCN9nYolaaeTlJVPejGIVVcQi6manKVTPw5pw+EHr+HKIorXr3h1d
99
+ OwsFtbdjJIVNAueCvducT+8o2LVa5ESBThl+BOGCmmsaOJLzCMc8UwMnah0S
100
+ z2CB1OmHl4oiBXkr5jlvPcJ70cVaGrqRu1gLYbwX8c4NlAbTxsHY0LdXb42k
101
+ fia0o40aLFlb0P85qa1QWKxK+RHM8zSYWfb8dvMptzwT5BUDM2fjLvNILXBb
102
+ vZ2pvCp+cJH+8LATTF7OVViklaY2m1e1/AjamYhllf0lM9fm+COxKEe1HREK
103
+ WRuzhaIo8767FWy1IyPmCjUmtIAdbiEe2smtIBxgyEXhsHYBXkjtNwOrSaR+
104
+ rxEoc9hcgukuOzuTSMMV5RJ2d3QWCvnJxubeqJ9tfbp12bLNvK3bQqsvVGaL
105
+ XTZhzfCYxncfPAiR3T3v3domixInx3JT52dPSgtriSZpo+KhMnvHIfGogSlm
106
+ FzitM7pTZVEiokvtrsbdtT8VVloIhuQq7d07hT5VjnSW6zTN7YUNe2lqFFBW
107
+ 9xKdXUl+11cHgOT5xtMLqRRkKSrjinqu9vY/VkI31/+9m3l7JwYgwXgc0yhT
108
+ S8aZLkQklzJiVBAQ/qVEAEESy9y4brHVGqB4tCmz7Xsxd701DTrk7Iy922aZ
109
+ MImiBQKgubriqYQCRK1TqjEK6KfBSR0GH0H20X0Y0U65uP3qAJj5dSrylUmm
110
+ wdGXj5qE5hpM9iVgtqXitt1p4FTA7XGKjjPE/Yx1GPBdIk++fnMc1G8QoMKr
111
+ W9wI0Uk0bSdr1vVt+Ccpr37r1gqN7M11XNv2aD+Y8Dfe+mrViOrHAtsJCR4k
112
+ IsVB4RgynZLxgGg8Hvd56JHzLecemwjSOjDuDpRnpVrDB3vBQs8oLxcylWaz
113
+ FTVv83TDFm6Rdo5ain9X9GlJJy5qt13LNGVwF0TRGAcFBqVAQO1dshNlNvY8
114
+ Woo3rm3zpRtPLdpxy87tOi3wWGzMzS2djnH/2gXbTwJVyk/E6rOW1XdOfGK3
115
+ c3dy6lj1vfnabXYa5w9SWY3UEW7w9qzSWA9MWFXWBtD8Clmk+UoJKSdmErCK
116
+ hpHrY0okMkM+QurHgYhO3gwuEIsridIFBsQi1OxrJiniUPMxXwiCSz1mzyjc
117
+ jLNpKeKK1qSKx4Sjk09h/ISSkfRpzXAUyRrbAuMZlReUwMjOFUV1odYuA9eW
118
+ t4lMM9ttF2yZbvbtp1IkG8uqKGHUg/PLFhuW2LuoXjqPlfUmYREn1ERicFyV
119
+ R8Jmb2IHI+DcsJQIkkqIA4PDr16K0nNg3R1bJNMZal4S66cOEdpOcO6mHXOf
120
+ 0FN6X8krkXsNU4PR4M+yAjU7dlQBgJ5OLwgJtAkSG0KaK2b7B3aPAFPN51pW
121
+ gTaNQ312M+1uE92dTrAFReiabxoEtMtI3f/ujT4y2/jPyjqbFe0l9S5mRbZx
122
+ XGlCITLCyCujMm4P1OC5sRveGHWu9tnp+fk++ycKnXN/vIUf/vfrV01Es6cY
123
+ oAsedv5fPzAXm/tsnUhYlxTOGzmvsxQ2W/bMa5VkPXMpVxX8sOc9nsa+JUqO
124
+ QwgrrwNKRdSQsrbddwa0CYgSlftEDCa0XqIo9dkvysgVlJcMZbZFalNbTI4P
125
+ 90CIATkdo4nlsxdn1pkSolfQiGW+Ls0gKGZThBYB2i/3CNS2/Fqf79jznNBv
126
+ LDVrQ0tnBwFC1xCpbzTp3nYrvc2psTScPUjN8bf0naM2yAWjoPvFY4C69dsH
127
+ K3PsE1oHGTSTcKuQBTRnq0YYYGELRxsDnU8YUT4hxjFPCh06q2GtWKJsoR4C
128
+ vfSqpIXLL0ivcCVqySMlYWu0pnS5jYNyTiL3kxxSHHKRVj6djHdm3abcd/WQ
129
+ MvbW4qRK/Td4sweRKjbHRwcHh+ERDrEWDsO9M9nOAwbJjGIKRe6pf7L3AHWT
130
+ aQcOWVAXzS4+fLSkyoWXK/rIdb5AXYg97FyucvYWZn35nJ0JI7s97Tt4as7q
131
+ 9rhVMxjsuIlpINuNcPT9nruCaYr2uhXWPcQ5tndd7tRQHYwXHiMdIEjJVEd5
132
+ pTf/TngVS+VquPprClhT0K4yDSjRdLZzDTePhK+u7CWrntCKsVo1H4hafBMa
133
+ +IR1WfGP/joaaMs+O9a89jm1F7ufxan7BuBPsOoWfg6vjbPbSqPpQ03cnd2J
134
+ vXGY/QfAuyd8uC0AAA==
135
+ http_version:
136
+ recorded_at: Mon, 19 May 2014 22:57:52 GMT
137
+ recorded_with: VCR 2.5.0
data/spec/http_spec.rb CHANGED
@@ -6,7 +6,6 @@ require "polipus/page"
6
6
  describe Polipus::HTTP do
7
7
 
8
8
  it 'should download a page' do
9
-
10
9
  VCR.use_cassette('http_test') do
11
10
  http = Polipus::HTTP.new
12
11
  page = http.fetch_page("http://sfbay.craigslist.org/apa/")
@@ -30,6 +29,7 @@ describe Polipus::HTTP do
30
29
  end
31
30
 
32
31
  describe 'proxy settings' do
32
+
33
33
  it 'should set proxy correctly using a procedure' do
34
34
  http = Polipus::HTTP.new({proxy_host: -> con { "127.0.0.0" }, proxy_port: -> con { 8080 }})
35
35
  http.proxy_host.should eq "127.0.0.0"
@@ -49,10 +49,25 @@ describe Polipus::HTTP do
49
49
  http.proxy_host.should eq "127.0.0.0"
50
50
  end
51
51
 
52
+ end
53
+
54
+
55
+ describe 'gzipped content handling' do
56
+
57
+ it 'should decode gzip content' do
58
+ VCR.use_cassette('gzipped_on') do
59
+ http = Polipus::HTTP.new(gzip_enabled: true, logger: Logger.new(STDOUT))
60
+ page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
61
+ page.doc.css('.gzip_yes').should_not be_empty
62
+ end
63
+ end
64
+
65
+
66
+ end
67
+
52
68
  describe 'staled connections' do
53
69
 
54
70
  it 'should refresh a staled connection' do
55
-
56
71
  VCR.use_cassette('http_tconnection_max_hits') do
57
72
  http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
58
73
  http.class.__send__(:attr_reader, :connections)
@@ -65,11 +80,8 @@ describe Polipus::HTTP do
65
80
  http.fetch_page("https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html")
66
81
  http.connections_hits['www.yahoo.com'][443].should be 1
67
82
  http.connections['www.yahoo.com'][443].should_not be old_conn
68
-
69
83
  end
70
84
  end
71
85
  end
72
86
 
73
- end
74
-
75
87
  end
data/spec/spec_helper.rb CHANGED
@@ -5,6 +5,10 @@
5
5
  #
6
6
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
7
  require "digest/md5"
8
+ require "coveralls"
9
+
10
+ Coveralls.wear!
11
+
8
12
  RSpec.configure do |config|
9
13
  config.treat_symbols_as_metadata_keys_with_true_values = true
10
14
  config.run_all_when_everything_filtered = true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-13 00:00:00.000000000 Z
11
+ date: 2014-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis-bloomfilter
@@ -270,6 +270,40 @@ dependencies:
270
270
  - - ! '>='
271
271
  - !ruby/object:Gem::Version
272
272
  version: 1.3.2
273
+ - !ruby/object:Gem::Dependency
274
+ name: rake
275
+ requirement: !ruby/object:Gem::Requirement
276
+ requirements:
277
+ - - ~>
278
+ - !ruby/object:Gem::Version
279
+ version: '10.3'
280
+ - - ! '>='
281
+ - !ruby/object:Gem::Version
282
+ version: 10.3.2
283
+ type: :development
284
+ prerelease: false
285
+ version_requirements: !ruby/object:Gem::Requirement
286
+ requirements:
287
+ - - ~>
288
+ - !ruby/object:Gem::Version
289
+ version: '10.3'
290
+ - - ! '>='
291
+ - !ruby/object:Gem::Version
292
+ version: 10.3.2
293
+ - !ruby/object:Gem::Dependency
294
+ name: coveralls
295
+ requirement: !ruby/object:Gem::Requirement
296
+ requirements:
297
+ - - ! '>='
298
+ - !ruby/object:Gem::Version
299
+ version: '0'
300
+ type: :development
301
+ prerelease: false
302
+ version_requirements: !ruby/object:Gem::Requirement
303
+ requirements:
304
+ - - ! '>='
305
+ - !ruby/object:Gem::Version
306
+ version: '0'
273
307
  description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
274
308
  \ "
275
309
  email:
@@ -281,6 +315,8 @@ files:
281
315
  - .document
282
316
  - .gitignore
283
317
  - .rspec
318
+ - .travis.yml
319
+ - AUTHORS
284
320
  - Gemfile
285
321
  - LICENSE.txt
286
322
  - README.md
@@ -328,6 +364,7 @@ files:
328
364
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
329
365
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
330
366
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
367
+ - spec/cassettes/gzipped_on.yml
331
368
  - spec/cassettes/http_tconnection_max_hits.yml
332
369
  - spec/cassettes/http_test.yml
333
370
  - spec/cassettes/http_test_redirect.yml
@@ -382,6 +419,7 @@ test_files:
382
419
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
383
420
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
384
421
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
422
+ - spec/cassettes/gzipped_on.yml
385
423
  - spec/cassettes/http_tconnection_max_hits.yml
386
424
  - spec/cassettes/http_test.yml
387
425
  - spec/cassettes/http_test_redirect.yml