pupa 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -0
- data/lib/pupa/processor/client.rb +3 -2
- data/lib/pupa/processor.rb +3 -2
- data/lib/pupa/refinements/opencivicdata.rb +1 -1
- data/lib/pupa/runner.rb +16 -11
- data/lib/pupa/version.rb +1 -1
- data/spec/processor/connection_adapters/postgresql_adapter_spec.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 940e9e4a78d4b1940728bd15b6ee770f07ac213a
|
4
|
+
data.tar.gz: 2cbd50c8ef323d747a37e2a8970253ee3b028bd4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f5698681774b52440270b76fad66d1916120b2768a588c1ac38bdf55979ffb8f76a3cb1b70900896cfdaa858668633e26b3dab868f9bc875109b1e202deca93e
|
7
|
+
data.tar.gz: 9f4e8a37eea18a0e18584081d25c050057da631e70b66ff1d1afa6f2a73394caf9cc5dfdac86cc4d53138601ee75921d4ee31f00a23fb1dbea5b38fcc77153cc
|
data/.travis.yml
CHANGED
[diff body not captured in this extract]
data/lib/pupa/processor/client.rb
CHANGED
@@ -29,9 +29,10 @@ module Pupa
|
|
29
29
|
# @param [String] cache_dir a directory or a Memcached address
|
30
30
|
# (e.g. `memcached://localhost:11211`) in which to cache requests
|
31
31
|
# @param [Integer] expires_in the cache's expiration time in seconds
|
32
|
+
# @param [Integer,String] value_max_bytes the maximum Memcached item size
|
32
33
|
# @param [String] level the log level
|
33
34
|
# @return [Faraday::Connection] a configured Faraday HTTP client
|
34
|
-
def self.new(cache_dir: nil, expires_in: 86400, level: 'INFO') # 1 day
|
35
|
+
def self.new(cache_dir: nil, expires_in: 86400, value_max_bytes: 1048576, level: 'INFO') # 1 day
|
35
36
|
Faraday.new do |connection|
|
36
37
|
connection.request :url_encoded
|
37
38
|
connection.use Middleware::Logger, Logger.new('faraday', level: level)
|
@@ -58,7 +59,7 @@ module Pupa
|
|
58
59
|
connection.response :caching do
|
59
60
|
address = cache_dir[%r{\Amemcached://(.+)\z}, 1]
|
60
61
|
if address
|
61
|
-
ActiveSupport::Cache::MemCacheStore.new(address, expires_in: expires_in)
|
62
|
+
ActiveSupport::Cache::MemCacheStore.new(address, expires_in: expires_in, value_max_bytes: 1048576)
|
62
63
|
else
|
63
64
|
ActiveSupport::Cache::FileStore.new(cache_dir, expires_in: expires_in)
|
64
65
|
end
|
data/lib/pupa/processor.rb
CHANGED
@@ -24,14 +24,15 @@ module Pupa
|
|
24
24
|
# @param [String] cache_dir the directory or Memcached address
|
25
25
|
# (e.g. `memcached://localhost:11211`) in which to cache HTTP responses
|
26
26
|
# @param [Integer] expires_in the cache's expiration time in seconds
|
27
|
+
# @param [Integer,String] value_max_bytes the maximum Memcached item size
|
27
28
|
# @param [String] database_url the database URL
|
28
29
|
# @param [Boolean] validate whether to validate JSON documents
|
29
30
|
# @param [String] level the log level
|
30
31
|
# @param [String,IO] logdev the log device
|
31
32
|
# @param [Hash] options criteria for selecting the methods to run
|
32
|
-
def initialize(output_dir, pipelined: false, cache_dir: nil, expires_in: 86400, database_url: 'mongodb://localhost:27017/pupa', validate: true, level: 'INFO', logdev: STDOUT, options: {})
|
33
|
+
def initialize(output_dir, pipelined: false, cache_dir: nil, expires_in: 86400, value_max_bytes: 1048576, database_url: 'mongodb://localhost:27017/pupa', validate: true, level: 'INFO', logdev: STDOUT, options: {})
|
33
34
|
@store = DocumentStore.new(output_dir, pipelined: pipelined)
|
34
|
-
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, level: level)
|
35
|
+
@client = Client.new(cache_dir: cache_dir, expires_in: expires_in, value_max_bytes: value_max_bytes, level: level)
|
35
36
|
@connection = Connection.new(database_url)
|
36
37
|
@logger = Logger.new('pupa', level: level, logdev: logdev)
|
37
38
|
@validate = validate
|
data/lib/pupa/refinements/opencivicdata.rb
CHANGED
@@ -32,7 +32,7 @@ end
|
|
32
32
|
#
|
33
33
|
# Instead of adding a callback, we can override `to_h` when `persist` is `true`.
|
34
34
|
ObjectSpace.each_object(Class) do |base|
|
35
|
-
if base.include?(Pupa::Model)
|
35
|
+
if base != Sequel::Model && base.include?(Pupa::Model) # Sequel::Model will error on #include?
|
36
36
|
base.class_eval do
|
37
37
|
set_callback(:save, :before) do |object|
|
38
38
|
object._type = object._type.camelize.demodulize.underscore
|
data/lib/pupa/runner.rb
CHANGED
@@ -11,16 +11,17 @@ module Pupa
|
|
11
11
|
@processor_class = processor_class
|
12
12
|
|
13
13
|
@options = OpenStruct.new({
|
14
|
-
actions:
|
15
|
-
tasks:
|
16
|
-
output_dir:
|
17
|
-
pipelined:
|
18
|
-
cache_dir:
|
19
|
-
expires_in:
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
14
|
+
actions: [],
|
15
|
+
tasks: [],
|
16
|
+
output_dir: File.expand_path('scraped_data', Dir.pwd),
|
17
|
+
pipelined: false,
|
18
|
+
cache_dir: File.expand_path('web_cache', Dir.pwd),
|
19
|
+
expires_in: 86400, # 1 day
|
20
|
+
value_max_bytes: 1048576, # 1 MB
|
21
|
+
database_url: 'mongodb://localhost:27017/pupa',
|
22
|
+
validate: true,
|
23
|
+
level: 'INFO',
|
24
|
+
dry_run: false,
|
24
25
|
}.merge(defaults))
|
25
26
|
|
26
27
|
@actions = {
|
@@ -82,6 +83,9 @@ module Pupa
|
|
82
83
|
opts.on('-e', '--expires_in SECONDS', "The cache's expiration time in seconds") do |v|
|
83
84
|
options.expires_in = v
|
84
85
|
end
|
86
|
+
opts.on('-value_max_bytes BYTES', "The maximum Memcached item size") do |v|
|
87
|
+
options.value_max_bytes = v
|
88
|
+
end
|
85
89
|
opts.on('-d', '--database_url SCHEME://USERNAME:PASSWORD@HOST:PORT/DATABASE', 'The database URL') do |v|
|
86
90
|
options.database_url = v
|
87
91
|
end
|
@@ -142,6 +146,7 @@ module Pupa
|
|
142
146
|
pipelined: options.pipelined,
|
143
147
|
cache_dir: options.cache_dir,
|
144
148
|
expires_in: options.expires_in,
|
149
|
+
value_max_bytes: options.value_max_bytes,
|
145
150
|
database_url: options.database_url,
|
146
151
|
validate: options.validate,
|
147
152
|
level: options.level,
|
@@ -160,7 +165,7 @@ module Pupa
|
|
160
165
|
end
|
161
166
|
|
162
167
|
if options.level == 'DEBUG'
|
163
|
-
%w(output_dir pipelined cache_dir expires_in database_url validate level).each do |option|
|
168
|
+
%w(output_dir pipelined cache_dir expires_in value_max_bytes database_url validate level).each do |option|
|
164
169
|
puts "#{option}: #{options[option]}"
|
165
170
|
end
|
166
171
|
unless rest.empty?
|
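data/lib/pupa/version.rb
CHANGED
[diff body not captured in this extract]
data/spec/processor/connection_adapters/postgresql_adapter_spec.rb
CHANGED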
@@ -14,7 +14,7 @@ describe Pupa::Processor::Connection::PostgreSQLAdapter do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
before :all do
|
17
|
-
connection.raw_connection.drop_table(:people)
|
17
|
+
connection.raw_connection.drop_table?(:people)
|
18
18
|
connection.raw_connection.create_table(:people) do
|
19
19
|
primary_key :id
|
20
20
|
String :_id
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: pupa
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Open North
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|