polipus 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,15 +1,15 @@
1
1
  ---
2
2
  !binary "U0hBMQ==":
3
3
  metadata.gz: !binary |-
4
- MDlmZGJiYmY1MWQ1NWFiMDMxZGRmYzBmNGQ4OGU4ZGFmOTgxN2M5Mg==
4
+ Mzc5NDczMjAxNmUzY2JlYzA4YzUwZDUzMGY0Y2ZjZGQxOTBmMTk5YQ==
5
5
  data.tar.gz: !binary |-
6
- Y2NjYmI5N2Y1NDA4NjVlNGQ2YWNhMWUxYjkyODA5NjBkOWVmOTljZQ==
6
+ NDgxNzhlYTAyNTMxYWE1MTBkMzIxMWMyMDRlMmQ0NjA2ZTc1MzY2Mw==
7
7
  SHA512:
8
8
  metadata.gz: !binary |-
9
- MmQ1ZDc2M2RiYmVmZjg0NDVlYjYyNTYzOWY4M2NjMGQ1MDZjMGQwZWYwNjJj
10
- OWMzNjY4ODI3ZTA5ZmEzM2U0YTZiNWQxOWI2YzkyNDAzM2Q5YzIyZDFiNmE5
11
- MzQ0YzM3ZDgwNTk5N2ZjNzMxOTY3OTllNWFjNDI3YjQ0NGVhODg=
9
+ YTNmMGY4YWRkNmU5YThhMTY0ODBiMzY1MmI3MzkzMzEzZjhiZGI4NDI1Mjdh
10
+ MGI0YWNjZDEyM2NhZDlkZTY2MjY3ZTA5YzVjNWE5YjFjZWEwZjRhMGI3ZWE2
11
+ MTc0NDczOWIyYTE1YjY1MzA2ZjcwOTRjNWZiOGVlMjIyOTJkNGU=
12
12
  data.tar.gz: !binary |-
13
- ZTI1NWUzNzQwNTA2NTU4Mjk5OTVhNmI3NDlkNDFmOTRmN2JmZDhhZDRhYWZl
14
- Y2VkM2ZkNDZjZWYyOTk0NTgzZTIxOTA0ZGEzMjc1MGNmOGFkYTgxZjc4ZWJj
15
- ZGVhY2FiNDk3MDYwNzllMGNiYTI2NDMwYTAyMzY4NTczYmJjNGQ=
13
+ ZTVmZGQ0NTgyNTA2NGYwZDVhMTg3NmUzMWM0NDExMGIwOTU5NjM2ZGZkNWM3
14
+ MWYyYWQwM2NkMmNiMDgxYWY3NTdmYjE5NDZhOGE1YTQxNjRjNjUxNTIzYTc2
15
+ NTU2ZDZiYzllNWYxZjdiNDIwMzYyNGYyN2YyM2ZmY2VmNWU1NzA=
data/.travis.yml ADDED
@@ -0,0 +1,11 @@
1
+ language: ruby
2
+ rvm:
3
+ - jruby
4
+ - 1.9.3
5
+ - 2.0.0
6
+ - 2.1.1
7
+
8
+ services:
9
+ - mongodb
10
+ - redis
11
+
data/AUTHORS ADDED
@@ -0,0 +1,2 @@
1
+ Francesco Laurita <francesco.laurita@gmail.com>
2
+ Tobias L. Maier <http://tobiasmaier.info/>
data/README.md CHANGED
@@ -1,3 +1,9 @@
1
+ [![Build Status](https://travis-ci.org/taganaka/polipus.svg?branch=master)](https://travis-ci.org/taganaka/polipus)
2
+ [![Coverage Status](https://coveralls.io/repos/taganaka/polipus/badge.png?branch=master)](https://coveralls.io/r/taganaka/polipus?branch=master)
3
+ [![Code Climate](https://codeclimate.com/github/taganaka/polipus.png)](https://codeclimate.com/github/taganaka/polipus)
4
+ [![RubyGems](http://img.shields.io/gem/v/polipus.svg)](https://rubygems.org/gems/polipus)
5
+
6
+
1
7
  # Polipus #
2
8
 
3
9
  A distributed web crawler written in ruby, backed by Redis
data/lib/polipus.rb CHANGED
@@ -15,7 +15,7 @@ require "singleton"
15
15
 
16
16
  module Polipus
17
17
 
18
- def Polipus.crawler(job_name = 'polipus', urls = [], options = {}, &block)
18
+ def self.crawler(job_name = 'polipus', urls = [], options = {}, &block)
19
19
  PolipusCrawler.crawl(job_name, urls, options, &block)
20
20
  end
21
21
 
@@ -75,7 +75,6 @@ module Polipus
75
75
  attr_reader :storage
76
76
  attr_reader :job_name
77
77
  attr_reader :logger
78
- attr_reader :overflow_adapter
79
78
  attr_reader :options
80
79
  attr_reader :crawler_name
81
80
 
@@ -89,7 +88,7 @@ module Polipus
89
88
  end
90
89
  end
91
90
 
92
- def initialize(job_name = 'polipus',urls = [], options = {})
91
+ def initialize(job_name = 'polipus', urls = [], options = {})
93
92
 
94
93
  @job_name = job_name
95
94
  @options = OPTS.merge(options)
@@ -121,22 +120,19 @@ module Polipus
121
120
 
122
121
  @storage.include_query_string_in_uuid = @options[:include_query_string_in_saved_page]
123
122
 
124
- @urls = [urls].flatten.map{ |url| url.is_a?(URI) ? url : URI(url) }
123
+ @urls = [urls].flatten.map{ |url| URI(url) }
125
124
  @urls.each{ |url| url.path = '/' if url.path.empty? }
125
+
126
+ @internal_queue = queue_factory
127
+
126
128
  execute_plugin 'on_initialize'
127
129
 
128
130
  yield self if block_given?
129
131
 
130
132
  end
131
133
 
132
- def self.crawl(job_name, urls, opts = {})
133
-
134
- self.new(job_name, urls, opts) do |polipus|
135
- yield polipus if block_given?
136
-
137
- polipus.takeover
138
- end
139
-
134
+ def self.crawl(*args, &block)
135
+ new(*args, &block).takeover
140
136
  end
141
137
 
142
138
  def takeover
@@ -167,13 +163,13 @@ module Polipus
167
163
  page = Page.from_json message
168
164
 
169
165
  unless should_be_visited?(page.url, false)
170
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] is no more welcome."}
166
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) is no more welcome."}
171
167
  queue.commit
172
168
  next
173
169
  end
174
170
 
175
171
  if page_exists? page
176
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
172
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
177
173
  queue.commit
178
174
  next
179
175
  end
@@ -190,7 +186,7 @@ module Polipus
190
186
  page = pages.pop
191
187
  page.aliases = pages.collect { |e| e.url }
192
188
  if page_exists? page
193
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] already stored."}
189
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) already stored."}
194
190
  queue.commit
195
191
  next
196
192
  end
@@ -212,7 +208,7 @@ module Polipus
212
208
 
213
209
  if page
214
210
  @logger.debug {"[worker ##{worker_number}] Fetched page: [#{page.url.to_s}] Referrer: [#{page.referer}] Depth: [#{page.depth}] Code: [#{page.code}] Response Time: [#{page.response_time}]"}
215
- @logger.info {"[worker ##{worker_number}] Page [#{page.url.to_s}] downloaded"}
211
+ @logger.info {"[worker ##{worker_number}] Page (#{page.url.to_s}) downloaded"}
216
212
  end
217
213
 
218
214
  incr_pages
@@ -261,7 +257,7 @@ module Polipus
261
257
  self
262
258
  end
263
259
 
264
- # A block of code will be executed on every page dowloaded
260
+ # A block of code will be executed on every page downloaded
265
261
  # The block takes the page as argument
266
262
  def on_page_downloaded(&block)
267
263
  @on_page_downloaded << block
@@ -292,12 +288,7 @@ module Polipus
292
288
  @options[:redis_options]
293
289
  end
294
290
 
295
- def overflow_adapter
296
- @options[:overflow_adapter]
297
- end
298
-
299
291
  def queue_size
300
- @internal_queue ||= queue_factory
301
292
  @internal_queue.size
302
293
  end
303
294
 
@@ -311,63 +302,58 @@ module Polipus
311
302
  end
312
303
 
313
304
  def url_tracker
314
- if @url_tracker.nil?
315
- @url_tracker = @options[:url_tracker] ||= UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}", :redis => redis_factory_adapter, :driver => 'lua')
316
- end
317
- @url_tracker
305
+ @url_tracker ||=
306
+ @options[:url_tracker] ||=
307
+ UrlTracker.bloomfilter(:key_name => "polipus_bf_#{job_name}",
308
+ :redis => redis_factory_adapter,
309
+ :driver => 'lua')
318
310
  end
319
311
 
320
312
  def redis
321
- if @redis.nil?
322
- @redis = redis_factory_adapter
323
- end
324
- @redis
313
+ @redis ||= redis_factory_adapter
325
314
  end
326
315
 
316
+ # Enqueue an url, no matter what
327
317
  def add_url url
328
- @url_tracker.remove url.to_s
329
318
  page = Page.new(url)
330
- queue_factory << page.to_json
319
+ @internal_queue << page.to_json
331
320
  end
332
321
 
333
322
  # Request to Polipus to stop its work (gracefully)
334
323
  # cler_queue = true if you want to delete all of the pending urls to visit
335
324
  def stop!(cler_queue = false)
336
325
  PolipusSignalHandler.terminate
337
- queue_factory.clear(true) if cler_queue
326
+ @internal_queue.clear(true) if cler_queue
338
327
  end
339
328
 
340
329
  private
341
330
  # URLs enqueue policy
342
331
  def should_be_visited?(url, with_tracker = true)
343
-
332
+ case
344
333
  # Check against whitelist pattern matching
345
- unless @follow_links_like.empty?
346
- return false unless @follow_links_like.any?{|p| url.path =~ p}
347
- end
348
-
334
+ when !@follow_links_like.empty? && @follow_links_like.none?{ |p| url.path =~ p }
335
+ false
349
336
  # Check against blacklist pattern matching
350
- unless @skip_links_like.empty?
351
- return false if @skip_links_like.any?{|p| url.path =~ p}
352
- end
353
-
354
- #Page is marked as expired
355
- return true if page_expired?(Page.new(url))
356
-
337
+ when @skip_links_like.any?{ |p| url.path =~ p }
338
+ false
339
+ # Page is marked as expired
340
+ when page_expired?(Page.new(url))
341
+ true
357
342
  # Check against url tracker
358
- if with_tracker
359
- return false if url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
343
+ when with_tracker && url_tracker.visited?(@options[:include_query_string_in_saved_page] ? url.to_s : url.to_s.gsub(/\?.*$/,''))
344
+ false
345
+ else
346
+ true
360
347
  end
361
- true
362
348
  end
363
349
 
364
350
  # It extracts URLs from the page
365
351
  def links_for page
366
352
  page.domain_aliases = domain_aliases
367
- links = @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
368
- links
353
+ @focus_crawl_block.nil? ? page.links : @focus_crawl_block.call(page)
369
354
  end
370
355
 
356
+ # whether a page is expired or not
371
357
  def page_expired? page
372
358
  return false if @options[:ttl_page].nil?
373
359
  stored_page = @storage.get(page)
@@ -376,6 +362,7 @@ module Polipus
376
362
  r
377
363
  end
378
364
 
365
+ # whether a page exists or not
379
366
  def page_exists? page
380
367
  return false if page.user_data && page.user_data.p_seeded
381
368
  @storage.exists?(page) && !page_expired?(page)
@@ -392,10 +379,11 @@ module Polipus
392
379
 
393
380
  # It creates a redis client
394
381
  def redis_factory_adapter
395
- unless @redis_factory.nil?
396
- return @redis_factory.call(redis_options)
382
+ if @redis_factory
383
+ @redis_factory.call(redis_options)
384
+ else
385
+ Redis.new(redis_options)
397
386
  end
398
- Redis.new(redis_options)
399
387
  end
400
388
 
401
389
  # It creates a new distributed queue
data/lib/polipus/http.rb CHANGED
@@ -27,33 +27,33 @@ module Polipus
27
27
  # including redirects
28
28
  #
29
29
  def fetch_pages(url, referer = nil, depth = nil)
30
- begin
31
- url = URI(url) unless url.is_a?(URI)
32
- pages = []
33
- get(url, referer) do |response, code, location, redirect_to, response_time|
34
- body = response.body.dup
35
- if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
36
- gzip = Zlib::GzipReader.new(StringIO.new(body))
37
- body = gzip.read
38
- end
39
- pages << Page.new(location, :body => response.body.dup,
40
- :code => code,
41
- :headers => response.to_hash,
42
- :referer => referer,
43
- :depth => depth,
44
- :redirect_to => redirect_to,
45
- :response_time => response_time,
46
- :fetched_at => Time.now.to_i)
30
+ url = URI(url)
31
+ pages = []
32
+ get(url, referer) do |response, code, location, redirect_to, response_time|
33
+ body = response.body.dup
34
+ if response.to_hash.fetch('content-encoding', [])[0] == 'gzip'
35
+ gzip = Zlib::GzipReader.new(StringIO.new(body))
36
+ body = gzip.read
47
37
  end
38
+
39
+ pages << Page.new(location, :body => body,
40
+ :code => code,
41
+ :headers => response.to_hash,
42
+ :referer => referer,
43
+ :depth => depth,
44
+ :redirect_to => redirect_to,
45
+ :response_time => response_time,
46
+ :fetched_at => Time.now.to_i)
47
+ end
48
48
 
49
- return pages
50
- rescue StandardError => e
51
- if verbose?
52
- puts e.inspect
53
- puts e.backtrace
54
- end
55
- return [Page.new(url, :error => e)]
49
+ pages
50
+ rescue StandardError => e
51
+ if verbose?
52
+ puts e.inspect
53
+ puts e.backtrace
56
54
  end
55
+
56
+ [Page.new(url, :error => e)]
57
57
  end
58
58
 
59
59
  #
@@ -154,6 +154,8 @@ module Polipus
154
154
  opts['User-Agent'] = user_agent if user_agent
155
155
  opts['Referer'] = referer.to_s if referer
156
156
  opts['Cookie'] = ::HTTP::Cookie.cookie_value(cookie_jar.cookies(url)) if accept_cookies?
157
+ opts['Accept-Encoding'] = 'gzip'
158
+
157
159
 
158
160
  retries = 0
159
161
  begin
@@ -227,5 +229,9 @@ module Polipus
227
229
  to_url.host.nil? || (to_url.host == from_url.host)
228
230
  end
229
231
 
232
+ def gzip_enabled?
233
+ @opts[:gzip_enabled]
234
+ end
235
+
230
236
  end
231
237
  end
@@ -1,4 +1,4 @@
1
1
  module Polipus
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  HOMEPAGE = "https://github.com/taganaka/polipus"
4
4
  end
data/polipus.gemspec CHANGED
@@ -40,6 +40,8 @@ Gem::Specification.new do |s|
40
40
  s.add_development_dependency 'vcr', '~> 2.5', '>= 2.5.0'
41
41
  s.add_development_dependency 'webmock', '>= 1.8.0', '< 1.12'
42
42
  s.add_development_dependency 'flexmock', '~> 1.3', '>= 1.3.2'
43
+ s.add_development_dependency 'rake', '~> 10.3', '>= 10.3.2'
44
+ s.add_development_dependency 'coveralls'
43
45
 
44
46
 
45
47
  end
@@ -0,0 +1,137 @@
1
+ ---
2
+ http_interactions:
3
+ - request:
4
+ method: get
5
+ uri: http://www.whatsmyip.org/http-compression-test/
6
+ body:
7
+ encoding: US-ASCII
8
+ string: ''
9
+ headers:
10
+ Accept-Encoding:
11
+ - gzip
12
+ Accept:
13
+ - "*/*"
14
+ User-Agent:
15
+ - Ruby
16
+ response:
17
+ status:
18
+ code: 200
19
+ message: OK
20
+ headers:
21
+ Date:
22
+ - Mon, 19 May 2014 22:57:51 GMT
23
+ Server:
24
+ - Apache/2.2.24 (Unix) DAV/2 mod_fastcgi/2.4.2 mod_ssl/2.2.24 OpenSSL/0.9.8y
25
+ Cache-Control:
26
+ - max-age=1
27
+ Expires:
28
+ - Mon, 19 May 2014 22:57:52 GMT
29
+ X-Powered-By:
30
+ - Web Server Built Entirely Out of Legos
31
+ Vary:
32
+ - Accept-Encoding
33
+ Content-Encoding:
34
+ - gzip
35
+ Content-Length:
36
+ - '4153'
37
+ Content-Type:
38
+ - text/html
39
+ body:
40
+ encoding: ASCII-8BIT
41
+ string: !binary |-
42
+ H4sIAAAAAAAAA91ae3PbNrb/W/kUWDYbyRNTsr3pI7GlTuKmTXbz8MTO7e10
43
+ OhqIhETEJMElQMtK6+++vwOAL1luk/bOnTs3M45I4OC8z8HBAU/+9t3b04uf
44
+ zp6zxGTp7N69E/87OEkEj2f3BoMTI00qZj8m3OjXm5dnY1Wu2G/sxcXFGTtV
45
+ WVEKraXK2YXQ5mTigGlZJgxnUcJLLcw0qMwy/CZoJ3KeiWlwKTZrVcY6YJHK
46
+ jcgBuPooi32WqXjePsVimXIj9sGkKQDaEN1na7FgWpRXotxnvOBRAii1wNqE
47
+ 53FKowZ80e8CA2sZm+QWF7HQUSkLA4QdRnYKKEqWCqPZRlUMNOVywwwUY9nw
48
+ K5nUbCFkvmoYFTGrNA2QRGzCvDh4qsX0j35ibEkRjdLJJwHMl0Rc5HyRdnGD
49
+ sTFzEmH8kpUinQYy4ysx12UUsKQUy2kwsSN6omEOMS7y1fYKXhSpCI2qoiSU
50
+ EenBLhzWC9sZWj3cpteuaEgt+VUNHjCzKYRna3LtCLQo3CRxICNORpiUWj+8
51
+ ztLA85ZC8hx6AR7yrmnQ98V35+fseyHimgNykicTQjJeE2C2kQUBTtq3Iik6
52
+ DHjGI60n6/V6nHGZj/FSs23EtZnYd8uONptU6EQIczcKo1SqPwOH878u7Ad+
53
+ xd1owGBI4P7Q4e6DDmYnEzf/OQg+/LsS5eZPL9eFzHNRfuZ6ghhc8ZLdh8NW
54
+ KaJnyoLguBldCTOvytSO0iD93R8tqzwiZxjt0fvgV/v//VHwxepjEeyNeRyf
55
+ plzrUWASqecFPCvYszgHHUhZEGaAL1VU6RGLUsHzuShLVbKd0JQsAK6rRSZN
56
+ y8N94bjwbPSw73/hoLGsKFUxGsZSI0hFPETuKSvh6QyyzbwURcojcW7gzKPa
57
+ T8mofT+libAT4CFxNanFa4k7dRK7qYzFe5Becm1Adkt3DduDjvyjGt1ALtmI
58
+ benriqejPTadkk3Y3sBDNngGJinV2iMK3ij2/t0r9hzpr0QYNogHN/5BpFrc
59
+ QtFQ1BHPc+S0WpDv1DqvRbklSQfB4P64UNp0ePfW48aAK24XBnv77FdWS/Zk
60
+ t5w3HZXdj7nhLbUOuUHHfy3UWCNlmVHwW0fkwWAy4dh3jEfUmbhT4NZyO8Tt
61
+ cTDQa2mihI1qVn4++KUL2oMdRFwLFlgjBU+6EwO44WbO51pVeTwKdB7Pl1ym
62
+ PTEGuxyNioNRcCKzlU0Kzfaw+uh8YayvVkOGhD0d2vfh7EQXPEfcIVSnQ4sM
63
+ E8NZ8LCR4PCXhwGSCcBmuzygFHO76Ym5zIvK6C0eF6Xgl+3QbfHbTfgPdRCr
64
+ XPwlHWyE7mhgUxW75AeQlX+HIz4MqHg4bTj2ajkpZu/zTjFxBnrsXH4UT9iW
65
+ Htm/np0sytnpH8EetbDn2Kfzle5N/wPTf/8TxvjLKY49ZFDHnOu53agQmp9r
66
+ 7ir/3zR4rjr2zlUhdhk8V79v7zdvL3ba/H/UzGeKalPJU/b/xuColjm4/z+W
67
+ 2Z7m7H1+mUOH7LmtM15wzd5GUVWWjXE/X9M3nZcbTN3rvDXPYxJ1u3Cy/zrb
68
+ Qm/nfsreCIMT2KXnteazp6+bzsvN7c29GfLVCc4XpbiCs33n7NMUGjfdiouK
69
+ jiVHXXB3YeWjpK4Pb5U//VqtoXKvQ4v+an2wruT3tSkRBd3a8vc9Z4uxtvy0
70
+ 1nJYO7BjOzDP9Kr2rZriLcv7Rdv2rwW6uSVHr4i7VRw3LJYiU1fi07nslyJd
71
+ 6n3ytxm9g4ffK4yd6WsaPfzbOeV+LtbW/h0y5D102FBLtpZ5rNZjHAKMwtGm
72
+ qHRi19nidVhjHdZutg3fI8V+RSU4BHs1TWQjdnOHGfp5a4s55rgbeWoLo7gv
73
+ p+v1KKv7nu+OQJ0Fo11bRu3mpTBVmbPgW8xNKY3S+n4IkFo7p7STie/o4HGh
74
+ 4o09uNGQKO0J7YTXZ37ktlt5MFWr7l73Cq9DJuPpkCawYMJnhCOWVxYZMB/O
75
+ 7mgRYcbSm3hgxxhxcc9hqJNqpNIqy/XQgee8maAzMI2eJEczn8CoJ3JBp25g
76
+ O7JYuWUvQ6qu5cKzCGW+VCFfqMqEG1VB1tcYZS8xyp7SKPtJVSRNi6IoTYOi
77
+ UKUJbf0uSqw9wys7d6+6v8qUUbPKlHCxEsgF1lw0L/0FONc2C3bvlsNbGt1i
78
+ NF+1jEIhxCB++kDrRDVAeJY6jHMdpkpdVgUW/Pji7ctz9oBnxTH77s15f22Z
79
+ X7ZrxYL6UmFRhiXPLx25H90ge+dGtlSyEi1pWYR4BV3f8hnOXp6xV/6tvyxB
80
+ xugpxrmLrvXxwr26ReQSF+LasHeCWmnxTqfQSWtRnZBJETxhxqNE5mSicxqz
81
+ Z9rXbqzPUJSUHYayNKQuJ46bNU+vX7HTZmTLKzLRUrYbQmhkBuNC4TB4foWT
82
+ o/WsczvJjGIX9fyWVnTSMsF1AnXCCTmSGvGAAfZDPbDFvW5ZoFZNSIUziZCv
83
+ LGWrvVMqpk/d2JYPrK6b5aVYVSkvQ3Hdc1SL5p1YYdg3TfsoRB5tKwFDCkYM
84
+ Y2F/WwU8p4lbTlysW5eA98UqCwtkBuol9/Rw5gfv0kUVt45QFRobYIjsS1JA
85
+ L8tUFoXl5b2bok2bkXq2bBqVfYUaFZIUiAs4c61RWPIUgwgNDLa++tYkotzp
86
+ pKlcNGjxDDyv5KLk5WY7Jm+pog7N1qHf2Qnmw3PLJ9eiTTxLqTTWF+Gl2MAx
87
+ 0gj2dar8HjPsx+dn7F9is4WgyjtazKXpOfJ7DEByP9BfqE0nDrEDY0+WIBpy
88
+ KkN5tLFu0Ayzp354yxeu0xqJP30U8lqk283fYXuDwc4I4GnsxDiZYGdxW0xv
89
+ 77HN/KHfzTozPF6oa7v7+K4n15s88k3SCTUieXw0Xim1SgVmYt/UHiOd+1lq
90
+ pfJYLzYOaLudeiJz7YkFHTDqjTLbOJ4GqKSoXH0i8xQGDhdIo5fH9nLjyddH
91
+ 3xTXx4mQq8Q8eXxQXNt11IsKeRxGqbSXGxEPi2oRHj36+vDx198cPPry4KvH
92
+ h0ePerA6VYD8+sujg8OvDh89/oaYBGut5HgadRhsq5fu4G+/sZ9/2bN12ehX
93
+ W5a3ojYlwNb2LnPpFd/TvFYRDrDhojIGmSZRKTJFDfcpgCD9tzBk32P/fYb9
94
+ jr2Sl4KFIQlEi+FMwXIRlkoZEtWxVovaHqvifab3AbzHULlRA/sDXpcfqCsY
95
+ j3FWeZ6KDDrWzzYXfPWGZ2Kk934++AWCU2HYBXm2eRmPgAiYXDl3jMrNI4pw
96
+ 7jXCAwLDMciMZYw5Gdtn53VT22I+pmVjeCD10CcT+G4uIoPjYCQWEHScCzMR
97
+ +fz9+YSnKdzti+vlIkunhw94UbyMp0dfPTo6eAQrPz768uDwEbXmIc+44CVo
98
+ v0HSGsPsCN9nYolaaeTlJVPejGIVVcQi6manKVTPw5pw+EHr+HKIorXr3h1d
99
+ OwsFtbdjJIVNAueCvducT+8o2LVa5ESBThl+BOGCmmsaOJLzCMc8UwMnah0S
100
+ z2CB1OmHl4oiBXkr5jlvPcJ70cVaGrqRu1gLYbwX8c4NlAbTxsHY0LdXb42k
101
+ fia0o40aLFlb0P85qa1QWKxK+RHM8zSYWfb8dvMptzwT5BUDM2fjLvNILXBb
102
+ vZ2pvCp+cJH+8LATTF7OVViklaY2m1e1/AjamYhllf0lM9fm+COxKEe1HREK
103
+ WRuzhaIo8767FWy1IyPmCjUmtIAdbiEe2smtIBxgyEXhsHYBXkjtNwOrSaR+
104
+ rxEoc9hcgukuOzuTSMMV5RJ2d3QWCvnJxubeqJ9tfbp12bLNvK3bQqsvVGaL
105
+ XTZhzfCYxncfPAiR3T3v3domixInx3JT52dPSgtriSZpo+KhMnvHIfGogSlm
106
+ FzitM7pTZVEiokvtrsbdtT8VVloIhuQq7d07hT5VjnSW6zTN7YUNe2lqFFBW
107
+ 9xKdXUl+11cHgOT5xtMLqRRkKSrjinqu9vY/VkI31/+9m3l7JwYgwXgc0yhT
108
+ S8aZLkQklzJiVBAQ/qVEAEESy9y4brHVGqB4tCmz7Xsxd701DTrk7Iy922aZ
109
+ MImiBQKgubriqYQCRK1TqjEK6KfBSR0GH0H20X0Y0U65uP3qAJj5dSrylUmm
110
+ wdGXj5qE5hpM9iVgtqXitt1p4FTA7XGKjjPE/Yx1GPBdIk++fnMc1G8QoMKr
111
+ W9wI0Uk0bSdr1vVt+Ccpr37r1gqN7M11XNv2aD+Y8Dfe+mrViOrHAtsJCR4k
112
+ IsVB4RgynZLxgGg8Hvd56JHzLecemwjSOjDuDpRnpVrDB3vBQs8oLxcylWaz
113
+ FTVv83TDFm6Rdo5ain9X9GlJJy5qt13LNGVwF0TRGAcFBqVAQO1dshNlNvY8
114
+ Woo3rm3zpRtPLdpxy87tOi3wWGzMzS2djnH/2gXbTwJVyk/E6rOW1XdOfGK3
115
+ c3dy6lj1vfnabXYa5w9SWY3UEW7w9qzSWA9MWFXWBtD8Clmk+UoJKSdmErCK
116
+ hpHrY0okMkM+QurHgYhO3gwuEIsridIFBsQi1OxrJiniUPMxXwiCSz1mzyjc
117
+ jLNpKeKK1qSKx4Sjk09h/ISSkfRpzXAUyRrbAuMZlReUwMjOFUV1odYuA9eW
118
+ t4lMM9ttF2yZbvbtp1IkG8uqKGHUg/PLFhuW2LuoXjqPlfUmYREn1ERicFyV
119
+ R8Jmb2IHI+DcsJQIkkqIA4PDr16K0nNg3R1bJNMZal4S66cOEdpOcO6mHXOf
120
+ 0FN6X8krkXsNU4PR4M+yAjU7dlQBgJ5OLwgJtAkSG0KaK2b7B3aPAFPN51pW
121
+ gTaNQ312M+1uE92dTrAFReiabxoEtMtI3f/ujT4y2/jPyjqbFe0l9S5mRbZx
122
+ XGlCITLCyCujMm4P1OC5sRveGHWu9tnp+fk++ycKnXN/vIUf/vfrV01Es6cY
123
+ oAsedv5fPzAXm/tsnUhYlxTOGzmvsxQ2W/bMa5VkPXMpVxX8sOc9nsa+JUqO
124
+ QwgrrwNKRdSQsrbddwa0CYgSlftEDCa0XqIo9dkvysgVlJcMZbZFalNbTI4P
125
+ 90CIATkdo4nlsxdn1pkSolfQiGW+Ls0gKGZThBYB2i/3CNS2/Fqf79jznNBv
126
+ LDVrQ0tnBwFC1xCpbzTp3nYrvc2psTScPUjN8bf0naM2yAWjoPvFY4C69dsH
127
+ K3PsE1oHGTSTcKuQBTRnq0YYYGELRxsDnU8YUT4hxjFPCh06q2GtWKJsoR4C
128
+ vfSqpIXLL0ivcCVqySMlYWu0pnS5jYNyTiL3kxxSHHKRVj6djHdm3abcd/WQ
129
+ MvbW4qRK/Td4sweRKjbHRwcHh+ERDrEWDsO9M9nOAwbJjGIKRe6pf7L3AHWT
130
+ aQcOWVAXzS4+fLSkyoWXK/rIdb5AXYg97FyucvYWZn35nJ0JI7s97Tt4as7q
131
+ 9rhVMxjsuIlpINuNcPT9nruCaYr2uhXWPcQ5tndd7tRQHYwXHiMdIEjJVEd5
132
+ pTf/TngVS+VquPprClhT0K4yDSjRdLZzDTePhK+u7CWrntCKsVo1H4hafBMa
133
+ +IR1WfGP/joaaMs+O9a89jm1F7ufxan7BuBPsOoWfg6vjbPbSqPpQ03cnd2J
134
+ vXGY/QfAuyd8uC0AAA==
135
+ http_version:
136
+ recorded_at: Mon, 19 May 2014 22:57:52 GMT
137
+ recorded_with: VCR 2.5.0
data/spec/http_spec.rb CHANGED
@@ -6,7 +6,6 @@ require "polipus/page"
6
6
  describe Polipus::HTTP do
7
7
 
8
8
  it 'should download a page' do
9
-
10
9
  VCR.use_cassette('http_test') do
11
10
  http = Polipus::HTTP.new
12
11
  page = http.fetch_page("http://sfbay.craigslist.org/apa/")
@@ -30,6 +29,7 @@ describe Polipus::HTTP do
30
29
  end
31
30
 
32
31
  describe 'proxy settings' do
32
+
33
33
  it 'should set proxy correctly using a procedure' do
34
34
  http = Polipus::HTTP.new({proxy_host: -> con { "127.0.0.0" }, proxy_port: -> con { 8080 }})
35
35
  http.proxy_host.should eq "127.0.0.0"
@@ -49,10 +49,25 @@ describe Polipus::HTTP do
49
49
  http.proxy_host.should eq "127.0.0.0"
50
50
  end
51
51
 
52
+ end
53
+
54
+
55
+ describe 'gzipped content handling' do
56
+
57
+ it 'should decode gzip content' do
58
+ VCR.use_cassette('gzipped_on') do
59
+ http = Polipus::HTTP.new(gzip_enabled: true, logger: Logger.new(STDOUT))
60
+ page = http.fetch_page("http://www.whatsmyip.org/http-compression-test/")
61
+ page.doc.css('.gzip_yes').should_not be_empty
62
+ end
63
+ end
64
+
65
+
66
+ end
67
+
52
68
  describe 'staled connections' do
53
69
 
54
70
  it 'should refresh a staled connection' do
55
-
56
71
  VCR.use_cassette('http_tconnection_max_hits') do
57
72
  http = Polipus::HTTP.new(connection_max_hits: 1, logger: Logger.new(STDOUT))
58
73
  http.class.__send__(:attr_reader, :connections)
@@ -65,11 +80,8 @@ describe Polipus::HTTP do
65
80
  http.fetch_page("https://www.yahoo.com/tech/expectant-parents-asked-the-internet-to-name-their-83416450388.html")
66
81
  http.connections_hits['www.yahoo.com'][443].should be 1
67
82
  http.connections['www.yahoo.com'][443].should_not be old_conn
68
-
69
83
  end
70
84
  end
71
85
  end
72
86
 
73
- end
74
-
75
87
  end
data/spec/spec_helper.rb CHANGED
@@ -5,6 +5,10 @@
5
5
  #
6
6
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
7
7
  require "digest/md5"
8
+ require "coveralls"
9
+
10
+ Coveralls.wear!
11
+
8
12
  RSpec.configure do |config|
9
13
  config.treat_symbols_as_metadata_keys_with_true_values = true
10
14
  config.run_all_when_everything_filtered = true
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: polipus
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Francesco Laurita
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-05-13 00:00:00.000000000 Z
11
+ date: 2014-05-23 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: redis-bloomfilter
@@ -270,6 +270,40 @@ dependencies:
270
270
  - - ! '>='
271
271
  - !ruby/object:Gem::Version
272
272
  version: 1.3.2
273
+ - !ruby/object:Gem::Dependency
274
+ name: rake
275
+ requirement: !ruby/object:Gem::Requirement
276
+ requirements:
277
+ - - ~>
278
+ - !ruby/object:Gem::Version
279
+ version: '10.3'
280
+ - - ! '>='
281
+ - !ruby/object:Gem::Version
282
+ version: 10.3.2
283
+ type: :development
284
+ prerelease: false
285
+ version_requirements: !ruby/object:Gem::Requirement
286
+ requirements:
287
+ - - ~>
288
+ - !ruby/object:Gem::Version
289
+ version: '10.3'
290
+ - - ! '>='
291
+ - !ruby/object:Gem::Version
292
+ version: 10.3.2
293
+ - !ruby/object:Gem::Dependency
294
+ name: coveralls
295
+ requirement: !ruby/object:Gem::Requirement
296
+ requirements:
297
+ - - ! '>='
298
+ - !ruby/object:Gem::Version
299
+ version: '0'
300
+ type: :development
301
+ prerelease: false
302
+ version_requirements: !ruby/object:Gem::Requirement
303
+ requirements:
304
+ - - ! '>='
305
+ - !ruby/object:Gem::Version
306
+ version: '0'
273
307
  description: ! "\n An easy to use distributed web-crawler framework based on Redis\n
274
308
  \ "
275
309
  email:
@@ -281,6 +315,8 @@ files:
281
315
  - .document
282
316
  - .gitignore
283
317
  - .rspec
318
+ - .travis.yml
319
+ - AUTHORS
284
320
  - Gemfile
285
321
  - LICENSE.txt
286
322
  - README.md
@@ -328,6 +364,7 @@ files:
328
364
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
329
365
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
330
366
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
367
+ - spec/cassettes/gzipped_on.yml
331
368
  - spec/cassettes/http_tconnection_max_hits.yml
332
369
  - spec/cassettes/http_test.yml
333
370
  - spec/cassettes/http_test_redirect.yml
@@ -382,6 +419,7 @@ test_files:
382
419
  - spec/cassettes/c5ce68499027d490adfbb6e5541881e4.yml
383
420
  - spec/cassettes/ce16b11a7df0b70fe90c7f90063fdb8c.yml
384
421
  - spec/cassettes/ffe3d588b6df4b9de35e5a7ccaf5a81b.yml
422
+ - spec/cassettes/gzipped_on.yml
385
423
  - spec/cassettes/http_tconnection_max_hits.yml
386
424
  - spec/cassettes/http_test.yml
387
425
  - spec/cassettes/http_test_redirect.yml