pupa 0.0.12 → 0.0.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: c894758e0375a999ec78a825feb05ff7ec9da66a
4
- data.tar.gz: 4f2633cc09888ef1f236cb2c35afdff9681e7924
3
+ metadata.gz: 37fa9e87e20d4fef24b1f028f046b695a3323b88
4
+ data.tar.gz: 0b2ae08ac3597955e7175812f05a700c76d91247
5
5
  SHA512:
6
- metadata.gz: 7e83f57b09cf99dab424032634193aa17055c22721b5ee558362f5fc07face6d6129b71d431e01f0b2b663aa771f752014ce7129e963d44d47beaa154f770c44
7
- data.tar.gz: a51e3cee53727013f60623576af75b7c0ce7c047085be577a5f745fed04babc62ffd64a3c59abf234b909ffca47bc73250645f39eaae0364a89ad131f2593426
6
+ metadata.gz: de98e5ed8f0b145e0ba77401c3e2e983cd47dc2b7a327476a03b6e04eecaeb31f976d76e98a7d51114afd87a95e36c9da491e6a4f4d788f71b02060121cbb94e
7
+ data.tar.gz: 69fe22ea3079034b34aea5b244d233f66eebefd5010ea923ca9865d475bc81ccaebd6cab94e721ff2944cef3b4842dc297c932e8cbc69d949a593a74b2fc46a3
@@ -1,6 +1,7 @@
1
1
  language: ruby
2
2
  rvm:
3
3
  - 2.0.0
4
+ - 2.1.0
4
5
  env:
5
6
  - MODE=default
6
7
  - MODE=compat
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  [![Coverage Status](https://coveralls.io/repos/opennorth/pupa-ruby/badge.png?branch=master)](https://coveralls.io/r/opennorth/pupa-ruby)
6
6
  [![Code Climate](https://codeclimate.com/github/opennorth/pupa-ruby.png)](https://codeclimate.com/github/opennorth/pupa-ruby)
7
7
 
8
- Pupa.rb is a Ruby 2.0 fork of Sunlight Labs' [Pupa](https://github.com/opencivicdata/pupa). It implements an Extract, Transform and Load (ETL) process to scrape data from online sources, transform it, and write it to a database.
8
+ Pupa.rb is a Ruby 2.x fork of Sunlight Labs' [Pupa](https://github.com/opencivicdata/pupa). It implements an Extract, Transform and Load (ETL) process to scrape data from online sources, transform it, and write it to a database.
9
9
 
10
10
  ## What it tries to solve
11
11
 
@@ -187,6 +187,10 @@ The `json-schema` gem is slow compared to, for example, [JSV](https://github.com
187
187
 
188
188
  The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In one example, using JSV instead of the `json-schema` gem halved the time needed to validate 10,000 documents.
189
189
 
190
+ ### Ruby version
191
+
192
+ Pupa.rb requires Ruby 2.x. If you have already made all of the above optimizations, you may still see a significant improvement by switching to Ruby 2.1, which has better garbage collection than Ruby 2.0.
193
+
190
194
  ### Profiling
191
195
 
192
196
  You can profile your code using [perftools.rb](https://github.com/tmm1/perftools.rb). First, install the gem:
@@ -46,18 +46,6 @@ module Pupa
46
46
  # @param [String,Hash] params query string parameters
47
47
  # @return a parsed document
48
48
  def get(url, params = {})
49
- # Faraday requires `params` to be a hash.
50
- if String === params
51
- params = CGI.parse(params)
52
-
53
- # Flatten the parameters for Faraday.
54
- params.each do |key,value|
55
- if Array === value && value.size == 1
56
- params[key] = value.first
57
- end
58
- end
59
- end
60
-
61
49
  client.get(url, params).body # String params are no longer converted here; presumably the Faraday::Connection#get override in pupa/refinements/faraday handles that now - confirm `client` is a Faraday connection
62
50
  end
63
51
 
@@ -2,10 +2,12 @@ require 'active_support/cache'
2
2
  require 'faraday_middleware'
3
3
  require 'faraday_middleware/response_middleware'
4
4
 
5
+ require 'pupa/processor/middleware/gzip'
5
6
  require 'pupa/processor/middleware/logger'
6
7
  require 'pupa/processor/middleware/parse_html'
7
8
  require 'pupa/processor/middleware/parse_json'
8
9
  require 'pupa/processor/middleware/raise_error'
10
+ require 'pupa/refinements/faraday'
9
11
  require 'pupa/refinements/faraday_middleware'
10
12
 
11
13
  begin
@@ -49,6 +51,9 @@ module Pupa
49
51
  connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
50
52
  end
51
53
 
54
+ # Must come after the parser middlewares.
55
+ connection.use Middleware::Gzip
56
+
52
57
  if cache_dir
53
58
  connection.response :caching do
54
59
  address = cache_dir[%r{\Amemcached://(.+)\z}, 1]
@@ -0,0 +1,24 @@
1
+ module Pupa
2
+ class Processor
3
+ module Middleware
4
+ # A Faraday response middleware that decompresses gzip- and deflate-encoded response bodies.
5
+ # The `content-encoding` header is matched case-insensitively and removed once the body is decoded; bodies with any other (or no) encoding are left untouched.
6
+ # @see https://gist.github.com/romanbsd/3892387
7
+ class Gzip < Faraday::Response::Middleware
8
+ dependency 'zlib' # presumably Faraday's hook for loading the Zlib library used below - confirm
9
+
10
+ def on_complete(env) # @param env response environment; reads env[:response_headers] and rewrites env[:body] in place
11
+ encoding = env[:response_headers]['content-encoding'].to_s.downcase # to_s turns a nil header value into '' so neither branch matches
12
+ case encoding
13
+ when 'gzip'
14
+ env[:body] = Zlib::GzipReader.new(StringIO.new(env[:body]), encoding: 'ASCII-8BIT').read # NOTE(review): StringIO is not required in this file - confirm it is loaded elsewhere
15
+ env[:response_headers].delete('content-encoding') # the body is no longer compressed, so drop the header
16
+ when 'deflate'
17
+ env[:body] = Zlib::Inflate.inflate(env[:body]) # NOTE(review): expects a zlib-wrapped stream; raw deflate data would presumably raise - confirm
18
+ env[:response_headers].delete('content-encoding') # the body is no longer compressed, so drop the header
19
+ end
20
+ end
21
+ end
22
+ end
23
+ end
24
+ end
@@ -0,0 +1,26 @@
1
+ module Pupa
2
+ module Refinements
3
+ # Override for Faraday::Connection#get. Faraday requires `params` to be a hash, so this `get` also accepts a query string and converts it to a hash first.
4
+ module Connection
5
+ # @see https://github.com/lostisland/faraday/blob/b8d90a59bafb8dd6e19488fae07945a7700f5664/lib/faraday/connection.rb#L137
6
+ def get(url=nil, params=nil, headers=nil, &block)
7
+ if String === params
8
+ params = CGI.parse(params) # NOTE(review): CGI is not required in this file - confirm it is loaded elsewhere
9
+
10
+ # Flatten the parameters for Faraday: CGI.parse wraps every value in an array, so unwrap single-element arrays and leave repeated keys as arrays.
11
+ params.each do |key,value|
12
+ if Array === value && value.size == 1
13
+ params[key] = value.first
14
+ end
15
+ end
16
+ end
17
+
18
+ super(url, params, headers, &block) # non-String params reach the original implementation unchanged
19
+ end
20
+ end
21
+ end
22
+ end
23
+
24
+ class Faraday::Connection
25
+ prepend Pupa::Refinements::Connection # prepend places the module ahead of the class in method lookup, so its `get` runs first and `super` reaches this class's own `get`
26
+ end
@@ -1,3 +1,3 @@
1
1
  module Pupa
2
- VERSION = "0.0.12"
2
+ VERSION = "0.0.13"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: pupa
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Open North
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-29 00:00:00.000000000 Z
11
+ date: 2014-02-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -302,12 +302,14 @@ files:
302
302
  - lib/pupa/processor/document_store/file_store.rb
303
303
  - lib/pupa/processor/document_store/redis_store.rb
304
304
  - lib/pupa/processor/helper.rb
305
+ - lib/pupa/processor/middleware/gzip.rb
305
306
  - lib/pupa/processor/middleware/logger.rb
306
307
  - lib/pupa/processor/middleware/parse_html.rb
307
308
  - lib/pupa/processor/middleware/parse_json.rb
308
309
  - lib/pupa/processor/middleware/raise_error.rb
309
310
  - lib/pupa/processor/persistence.rb
310
311
  - lib/pupa/processor/yielder.rb
312
+ - lib/pupa/refinements/faraday.rb
311
313
  - lib/pupa/refinements/faraday_middleware.rb
312
314
  - lib/pupa/refinements/json-schema.rb
313
315
  - lib/pupa/refinements/opencivicdata.rb