pupa 0.0.12 → 0.0.13
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/README.md +5 -1
- data/lib/pupa/processor.rb +0 -12
- data/lib/pupa/processor/client.rb +5 -0
- data/lib/pupa/processor/middleware/gzip.rb +24 -0
- data/lib/pupa/refinements/faraday.rb +26 -0
- data/lib/pupa/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37fa9e87e20d4fef24b1f028f046b695a3323b88
|
4
|
+
data.tar.gz: 0b2ae08ac3597955e7175812f05a700c76d91247
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de98e5ed8f0b145e0ba77401c3e2e983cd47dc2b7a327476a03b6e04eecaeb31f976d76e98a7d51114afd87a95e36c9da491e6a4f4d788f71b02060121cbb94e
|
7
|
+
data.tar.gz: 69fe22ea3079034b34aea5b244d233f66eebefd5010ea923ca9865d475bc81ccaebd6cab94e721ff2944cef3b4842dc297c932e8cbc69d949a593a74b2fc46a3
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
[![Coverage Status](https://coveralls.io/repos/opennorth/pupa-ruby/badge.png?branch=master)](https://coveralls.io/r/opennorth/pupa-ruby)
|
6
6
|
[![Code Climate](https://codeclimate.com/github/opennorth/pupa-ruby.png)](https://codeclimate.com/github/opennorth/pupa-ruby)
|
7
7
|
|
8
|
-
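Pupa.rb is a Ruby 2.0 fork of Sunlight Labs' [Pupa](https://github.com/opencivicdata/pupa). It implements an Extract, Transform and Load (ETL) process to scrape data from online sources, transform it, and write it to a database.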
|
8
|
+
Pupa.rb is a Ruby 2.x fork of Sunlight Labs' [Pupa](https://github.com/opencivicdata/pupa). It implements an Extract, Transform and Load (ETL) process to scrape data from online sources, transform it, and write it to a database.
|
9
9
|
|
10
10
|
## What it tries to solve
|
11
11
|
|
@@ -187,6 +187,10 @@ The `json-schema` gem is slow compared to, for example, [JSV](https://github.com
|
|
187
187
|
|
188
188
|
The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In an example case, using JSV instead of the `json-schema` gem reduced by half the time to validate 10,000 documents.
|
189
189
|
|
190
|
+
### Ruby version
|
191
|
+
|
192
|
+
Pupa.rb requires Ruby 2.x. If you have already made all the above optimizations, you may notice a significant improvement by using Ruby 2.1, which has better garbage collection than Ruby 2.0.
|
193
|
+
|
190
194
|
### Profiling
|
191
195
|
|
192
196
|
You can profile your code using [perftools.rb](https://github.com/tmm1/perftools.rb). First, install the gem:
|
data/lib/pupa/processor.rb
CHANGED
@@ -46,18 +46,6 @@ module Pupa
|
|
46
46
|
# @param [String,Hash] params query string parameters
|
47
47
|
# @return a parsed document
|
48
48
|
def get(url, params = {})
|
49
|
-
# Faraday requires `params` to be a hash.
|
50
|
-
if String === params
|
51
|
-
params = CGI.parse(params)
|
52
|
-
|
53
|
-
# Flatten the parameters for Faraday.
|
54
|
-
params.each do |key,value|
|
55
|
-
if Array === value && value.size == 1
|
56
|
-
params[key] = value.first
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
49
|
client.get(url, params).body
|
62
50
|
end
|
63
51
|
|
@@ -2,10 +2,12 @@ require 'active_support/cache'
|
|
2
2
|
require 'faraday_middleware'
|
3
3
|
require 'faraday_middleware/response_middleware'
|
4
4
|
|
5
|
+
require 'pupa/processor/middleware/gzip'
|
5
6
|
require 'pupa/processor/middleware/logger'
|
6
7
|
require 'pupa/processor/middleware/parse_html'
|
7
8
|
require 'pupa/processor/middleware/parse_json'
|
8
9
|
require 'pupa/processor/middleware/raise_error'
|
10
|
+
require 'pupa/refinements/faraday'
|
9
11
|
require 'pupa/refinements/faraday_middleware'
|
10
12
|
|
11
13
|
begin
|
@@ -49,6 +51,9 @@ module Pupa
|
|
49
51
|
connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
|
50
52
|
end
|
51
53
|
|
54
|
+
# Must come after the parser middlewares.
|
55
|
+
connection.use Middleware::Gzip
|
56
|
+
|
52
57
|
if cache_dir
|
53
58
|
connection.response :caching do
|
54
59
|
address = cache_dir[%r{\Amemcached://(.+)\z}, 1]
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Pupa
|
2
|
+
class Processor
|
3
|
+
module Middleware
|
4
|
+
# A Faraday response middleware for parsing gzip responses.
|
5
|
+
#
|
6
|
+
# @see https://gist.github.com/romanbsd/3892387
|
7
|
+
class Gzip < Faraday::Response::Middleware
|
8
|
+
dependency 'zlib'
|
9
|
+
|
10
|
+
def on_complete(env)
|
11
|
+
encoding = env[:response_headers]['content-encoding'].to_s.downcase
|
12
|
+
case encoding
|
13
|
+
when 'gzip'
|
14
|
+
env[:body] = Zlib::GzipReader.new(StringIO.new(env[:body]), encoding: 'ASCII-8BIT').read
|
15
|
+
env[:response_headers].delete('content-encoding')
|
16
|
+
when 'deflate'
|
17
|
+
env[:body] = Zlib::Inflate.inflate(env[:body])
|
18
|
+
env[:response_headers].delete('content-encoding')
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Pupa
|
2
|
+
module Refinements
|
3
|
+
# Faraday requires `params` to be a hash.
|
4
|
+
module Connection
|
5
|
+
# @see https://github.com/lostisland/faraday/blob/b8d90a59bafb8dd6e19488fae07945a7700f5664/lib/faraday/connection.rb#L137
|
6
|
+
def get(url=nil, params=nil, headers=nil, &block)
|
7
|
+
if String === params
|
8
|
+
params = CGI.parse(params)
|
9
|
+
|
10
|
+
# Flatten the parameters for Faraday.
|
11
|
+
params.each do |key,value|
|
12
|
+
if Array === value && value.size == 1
|
13
|
+
params[key] = value.first
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
super(url, params, headers, &block)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
class Faraday::Connection
|
25
|
+
prepend Pupa::Refinements::Connection
|
26
|
+
end
|
data/lib/pupa/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -302,12 +302,14 @@ files:
|
|
302
302
|
- lib/pupa/processor/document_store/file_store.rb
|
303
303
|
- lib/pupa/processor/document_store/redis_store.rb
|
304
304
|
- lib/pupa/processor/helper.rb
|
305
|
+
- lib/pupa/processor/middleware/gzip.rb
|
305
306
|
- lib/pupa/processor/middleware/logger.rb
|
306
307
|
- lib/pupa/processor/middleware/parse_html.rb
|
307
308
|
- lib/pupa/processor/middleware/parse_json.rb
|
308
309
|
- lib/pupa/processor/middleware/raise_error.rb
|
309
310
|
- lib/pupa/processor/persistence.rb
|
310
311
|
- lib/pupa/processor/yielder.rb
|
312
|
+
- lib/pupa/refinements/faraday.rb
|
311
313
|
- lib/pupa/refinements/faraday_middleware.rb
|
312
314
|
- lib/pupa/refinements/json-schema.rb
|
313
315
|
- lib/pupa/refinements/opencivicdata.rb
|