pupa 0.0.12 → 0.0.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/README.md +5 -1
- data/lib/pupa/processor.rb +0 -12
- data/lib/pupa/processor/client.rb +5 -0
- data/lib/pupa/processor/middleware/gzip.rb +24 -0
- data/lib/pupa/refinements/faraday.rb +26 -0
- data/lib/pupa/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 37fa9e87e20d4fef24b1f028f046b695a3323b88
|
4
|
+
data.tar.gz: 0b2ae08ac3597955e7175812f05a700c76d91247
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de98e5ed8f0b145e0ba77401c3e2e983cd47dc2b7a327476a03b6e04eecaeb31f976d76e98a7d51114afd87a95e36c9da491e6a4f4d788f71b02060121cbb94e
|
7
|
+
data.tar.gz: 69fe22ea3079034b34aea5b244d233f66eebefd5010ea923ca9865d475bc81ccaebd6cab94e721ff2944cef3b4842dc297c932e8cbc69d949a593a74b2fc46a3
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
[Coverage Status](https://coveralls.io/r/opennorth/pupa-ruby)
|
6
6
|
[Code Climate](https://codeclimate.com/github/opennorth/pupa-ruby)
|
7
7
|
|
8
|
-
Pupa.rb is a Ruby 2.
|
8
|
+
Pupa.rb is a Ruby 2.x fork of Sunlight Labs' [Pupa](https://github.com/opencivicdata/pupa). It implements an Extract, Transform and Load (ETL) process to scrape data from online sources, transform it, and write it to a database.
|
9
9
|
|
10
10
|
## What it tries to solve
|
11
11
|
|
@@ -187,6 +187,10 @@ The `json-schema` gem is slow compared to, for example, [JSV](https://github.com
|
|
187
187
|
|
188
188
|
The [pupa-validate](https://npmjs.org/package/pupa-validate) npm package can be used to validate JSON documents using the faster JSV. In an example case, using JSV instead of the `json-schema` gem reduced by half the time to validate 10,000 documents.
|
189
189
|
|
190
|
+
### Ruby version
|
191
|
+
|
192
|
+
Pupa.rb requires Ruby 2.x. If you have already made all the above optimizations, you may notice a significant improvement by using Ruby 2.1, which has better garbage collection than Ruby 2.0.
|
193
|
+
|
190
194
|
### Profiling
|
191
195
|
|
192
196
|
You can profile your code using [perftools.rb](https://github.com/tmm1/perftools.rb). First, install the gem:
|
data/lib/pupa/processor.rb
CHANGED
@@ -46,18 +46,6 @@ module Pupa
|
|
46
46
|
# @param [String,Hash] params query string parameters
|
47
47
|
# @return a parsed document
|
48
48
|
def get(url, params = {})
|
49
|
-
# Faraday requires `params` to be a hash.
|
50
|
-
if String === params
|
51
|
-
params = CGI.parse(params)
|
52
|
-
|
53
|
-
# Flatten the parameters for Faraday.
|
54
|
-
params.each do |key,value|
|
55
|
-
if Array === value && value.size == 1
|
56
|
-
params[key] = value.first
|
57
|
-
end
|
58
|
-
end
|
59
|
-
end
|
60
|
-
|
61
49
|
client.get(url, params).body
|
62
50
|
end
|
63
51
|
|
data/lib/pupa/processor/client.rb
CHANGED
@@ -2,10 +2,12 @@ require 'active_support/cache'
|
|
2
2
|
require 'faraday_middleware'
|
3
3
|
require 'faraday_middleware/response_middleware'
|
4
4
|
|
5
|
+
require 'pupa/processor/middleware/gzip'
|
5
6
|
require 'pupa/processor/middleware/logger'
|
6
7
|
require 'pupa/processor/middleware/parse_html'
|
7
8
|
require 'pupa/processor/middleware/parse_json'
|
8
9
|
require 'pupa/processor/middleware/raise_error'
|
10
|
+
require 'pupa/refinements/faraday'
|
9
11
|
require 'pupa/refinements/faraday_middleware'
|
10
12
|
|
11
13
|
begin
|
@@ -49,6 +51,9 @@ module Pupa
|
|
49
51
|
connection.use FaradayMiddleware::ParseXml, preserve_raw: true, content_type: /\bxml$/
|
50
52
|
end
|
51
53
|
|
54
|
+
# Must come after the parser middlewares.
|
55
|
+
connection.use Middleware::Gzip
|
56
|
+
|
52
57
|
if cache_dir
|
53
58
|
connection.response :caching do
|
54
59
|
address = cache_dir[%r{\Amemcached://(.+)\z}, 1]
|
data/lib/pupa/processor/middleware/gzip.rb
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
module Pupa
|
2
|
+
class Processor
|
3
|
+
module Middleware
|
4
|
+
# A Faraday response middleware for parsing gzip responses.
|
5
|
+
#
|
6
|
+
# @see https://gist.github.com/romanbsd/3892387
|
7
|
+
class Gzip < Faraday::Response::Middleware
|
8
|
+
dependency 'zlib'
|
9
|
+
|
10
|
+
def on_complete(env)
|
11
|
+
encoding = env[:response_headers]['content-encoding'].to_s.downcase
|
12
|
+
case encoding
|
13
|
+
when 'gzip'
|
14
|
+
env[:body] = Zlib::GzipReader.new(StringIO.new(env[:body]), encoding: 'ASCII-8BIT').read
|
15
|
+
env[:response_headers].delete('content-encoding')
|
16
|
+
when 'deflate'
|
17
|
+
env[:body] = Zlib::Inflate.inflate(env[:body])
|
18
|
+
env[:response_headers].delete('content-encoding')
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
data/lib/pupa/refinements/faraday.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
module Pupa
|
2
|
+
module Refinements
|
3
|
+
# Faraday requires `params` to be a hash.
|
4
|
+
module Connection
|
5
|
+
# @see https://github.com/lostisland/faraday/blob/b8d90a59bafb8dd6e19488fae07945a7700f5664/lib/faraday/connection.rb#L137
|
6
|
+
def get(url=nil, params=nil, headers=nil, &block)
|
7
|
+
if String === params
|
8
|
+
params = CGI.parse(params)
|
9
|
+
|
10
|
+
# Flatten the parameters for Faraday.
|
11
|
+
params.each do |key,value|
|
12
|
+
if Array === value && value.size == 1
|
13
|
+
params[key] = value.first
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
super(url, params, headers, &block)
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
class Faraday::Connection
|
25
|
+
prepend Pupa::Refinements::Connection
|
26
|
+
end
|
data/lib/pupa/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.12
|
4
|
+
version: 0.0.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2014-02-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -302,12 +302,14 @@ files:
|
|
302
302
|
- lib/pupa/processor/document_store/file_store.rb
|
303
303
|
- lib/pupa/processor/document_store/redis_store.rb
|
304
304
|
- lib/pupa/processor/helper.rb
|
305
|
+
- lib/pupa/processor/middleware/gzip.rb
|
305
306
|
- lib/pupa/processor/middleware/logger.rb
|
306
307
|
- lib/pupa/processor/middleware/parse_html.rb
|
307
308
|
- lib/pupa/processor/middleware/parse_json.rb
|
308
309
|
- lib/pupa/processor/middleware/raise_error.rb
|
309
310
|
- lib/pupa/processor/persistence.rb
|
310
311
|
- lib/pupa/processor/yielder.rb
|
312
|
+
- lib/pupa/refinements/faraday.rb
|
311
313
|
- lib/pupa/refinements/faraday_middleware.rb
|
312
314
|
- lib/pupa/refinements/json-schema.rb
|
313
315
|
- lib/pupa/refinements/opencivicdata.rb
|