pupa 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/lib/pupa/processor/client.rb +3 -2
- data/lib/pupa/processor.rb +3 -2
- data/lib/pupa/refinements/opencivicdata.rb +1 -1
- data/lib/pupa/runner.rb +16 -11
- data/lib/pupa/version.rb +1 -1
- data/spec/processor/connection_adapters/postgresql_adapter_spec.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 940e9e4a78d4b1940728bd15b6ee770f07ac213a
|
4
|
+
data.tar.gz: 2cbd50c8ef323d747a37e2a8970253ee3b028bd4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f5698681774b52440270b76fad66d1916120b2768a588c1ac38bdf55979ffb8f76a3cb1b70900896cfdaa858668633e26b3dab868f9bc875109b1e202deca93e
|
7
|
+
data.tar.gz: 9f4e8a37eea18a0e18584081d25c050057da631e70b66ff1d1afa6f2a73394caf9cc5dfdac86cc4d53138601ee75921d4ee31f00a23fb1dbea5b38fcc77153cc
|
data/.travis.yml
CHANGED
[diff content not included in this extract]
data/lib/pupa/processor/client.rb
CHANGED
@@ -29,9 +29,10 @@ module Pupa
|
|
29
29
|
# @param [String] cache_dir a directory or a Memcached address
|
30
30
|
# (e.g. `memcached://localhost:11211`) in which to cache requests
|
31
31
|
# @param [Integer] expires_in the cache's expiration time in seconds
|
32
|
+
# @param [Integer,String] value_max_bytes the maximum Memcached item size
|
32
33
|
# @param [String] level the log level
|
33
34
|
# @return [Faraday::Connection] a configured Faraday HTTP client
|
34
|
-
def self.new(cache_dir: nil, expires_in: 86400, level: 'INFO') # 1 day
|
35
|
+
def self.new(cache_dir: nil, expires_in: 86400, value_max_bytes: 1048576, level: 'INFO') # 1 day
|
35
36
|
Faraday.new do |connection|
|
36
37
|
connection.request :url_encoded
|
37
38
|
connection.use Middleware::Logger, Logger.new('faraday', level: level)
|
@@ -58,7 +59,7 @@ module Pupa
|
|
58
59
|
connection.response :caching do
|
59
60
|
address = cache_dir[%r{\Amemcached://(.+)\z}, 1]
|
60
61
|
if address
|
61
|
-
ActiveSupport::Cache::MemCacheStore.new(address, expires_in: expires_in)
|
62
|
+
ActiveSupport::Cache::MemCacheStore.new(address, expires_in: expires_in, value_max_bytes: 1048576)
|
62
63
|
else
|
63
64
|
ActiveSupport::Cache::FileStore.new(cache_dir, expires_in: expires_in)
|
64
65
|
end
|
data/lib/pupa/processor.rb
CHANGED
@@ -24,14 +24,15 @@ module Pupa
|
|
24
24
|
# @param [String] cache_dir the directory or Memcached address
|
25
25
|
# (e.g. `memcached://localhost:11211`) in which to cache HTTP responses
|
26
26
|
# @param [Integer] expires_in the cache's expiration time in seconds
|
27
|
+
# @param [Integer,String] value_max_bytes the maximum Memcached item size
|
27
28
|
# @param [String] database_url the database URL
|
28
29
|
# @param [Boolean] validate whether to validate JSON documents
|
29
30
|
# @param [String] level the log level
|
30
31
|
# @param [String,IO] logdev the log device
|
31
32
|
# @param [Hash] options criteria for selecting the methods to run
|
32
|
-
def initialize(output_dir, pipelined: false, cache_dir: nil, expires_in: 86400, database_url: 'mongodb://localhost:27017/pupa', validate: true, level: 'INFO', logdev: STDOUT, options: {})
|
33
|
+
def initialize(output_dir, pipelined: false, cache_dir: nil, expires_in: 86400, value_max_bytes: 1048576, database_url: 'mongodb://localhost:27017/pupa', validate: true, level: 'INFO', logdev: STDOUT, options: {})
|
33
34
|
@store = DocumentStore.new(output_dir, pipelined: pipelined)
|
34
|
-
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
|
35
|
+
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, value_max_bytes: value_max_bytes, level: level)
|
35
36
|
@connection = Connection.new(database_url)
|
36
37
|
@logger = Logger.new('pupa', level: level, logdev: logdev)
|
37
38
|
@validate = validate
|
data/lib/pupa/refinements/opencivicdata.rb
CHANGED
@@ -32,7 +32,7 @@ end
|
|
32
32
|
#
|
33
33
|
# Instead of adding a callback, we can override `to_h` when `persist` is `true`.
|
34
34
|
ObjectSpace.each_object(Class) do |base|
|
35
|
-
if base.include?(Pupa::Model)
|
35
|
+
if base != Sequel::Model && base.include?(Pupa::Model) # Sequel::Model will error on #include?
|
36
36
|
base.class_eval do
|
37
37
|
set_callback(:save, :before) do |object|
|
38
38
|
object._type = object._type.camelize.demodulize.underscore
|
data/lib/pupa/runner.rb
CHANGED
@@ -11,16 +11,17 @@ module Pupa
|
|
11
11
|
@processor_class = processor_class
|
12
12
|
|
13
13
|
@options = OpenStruct.new({
|
14
|
-
actions:
|
15
|
-
tasks:
|
16
|
-
output_dir:
|
17
|
-
pipelined:
|
18
|
-
cache_dir:
|
19
|
-
expires_in:
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
14
|
+
actions: [],
|
15
|
+
tasks: [],
|
16
|
+
output_dir: File.expand_path('scraped_data', Dir.pwd),
|
17
|
+
pipelined: false,
|
18
|
+
cache_dir: File.expand_path('web_cache', Dir.pwd),
|
19
|
+
expires_in: 86400, # 1 day
|
20
|
+
value_max_bytes: 1048576, # 1 MB
|
21
|
+
database_url: 'mongodb://localhost:27017/pupa',
|
22
|
+
validate: true,
|
23
|
+
level: 'INFO',
|
24
|
+
dry_run: false,
|
24
25
|
}.merge(defaults))
|
25
26
|
|
26
27
|
@actions = {
|
@@ -82,6 +83,9 @@ module Pupa
|
|
82
83
|
opts.on('-e', '--expires_in SECONDS', "The cache's expiration time in seconds") do |v|
|
83
84
|
options.expires_in = v
|
84
85
|
end
|
86
|
+
opts.on('-value_max_bytes BYTES', "The maximum Memcached item size") do |v|
|
87
|
+
options.value_max_bytes = v
|
88
|
+
end
|
85
89
|
opts.on('-d', '--database_url SCHEME://USERNAME:PASSWORD@HOST:PORT/DATABASE', 'The database URL') do |v|
|
86
90
|
options.database_url = v
|
87
91
|
end
|
@@ -142,6 +146,7 @@ module Pupa
|
|
142
146
|
pipelined: options.pipelined,
|
143
147
|
cache_dir: options.cache_dir,
|
144
148
|
expires_in: options.expires_in,
|
149
|
+
value_max_bytes: options.value_max_bytes,
|
145
150
|
database_url: options.database_url,
|
146
151
|
validate: options.validate,
|
147
152
|
level: options.level,
|
@@ -160,7 +165,7 @@ module Pupa
|
|
160
165
|
end
|
161
166
|
|
162
167
|
if options.level == 'DEBUG'
|
163
|
-
%w(output_dir pipelined cache_dir expires_in database_url validate level).each do |option|
|
168
|
+
%w(output_dir pipelined cache_dir expires_in value_max_bytes database_url validate level).each do |option|
|
164
169
|
puts "#{option}: #{options[option]}"
|
165
170
|
end
|
166
171
|
unless rest.empty?
|
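data/lib/pupa/version.rb
CHANGED
[diff content not included in this extract]
data/spec/processor/connection_adapters/postgresql_adapter_spec.rb
CHANGED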
@@ -14,7 +14,7 @@ describe Pupa::Processor::Connection::PostgreSQLAdapter do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
before :all do
|
17
|
-
connection.raw_connection.drop_table(:people)
|
17
|
+
connection.raw_connection.drop_table?(:people)
|
18
18
|
connection.raw_connection.create_table(:people) do
|
19
19
|
primary_key :id
|
20
20
|
String :_id
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|