dataflow-rb 0.15.0 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/dataflow-rb.gemspec +3 -3
- data/lib/dataflow/adapters/mongo_db_adapter.rb +14 -15
- data/lib/dataflow/adapters/mysql_adapter.rb +1 -1
- data/lib/dataflow/adapters/psql_adapter.rb +1 -1
- data/lib/dataflow/adapters/sql_adapter.rb +12 -18
- data/lib/dataflow/nodes/data_node.rb +15 -5
- data/lib/dataflow/nodes/export/to_csv_node.rb +15 -1
- data/lib/dataflow/version.rb +1 -1
- metadata +12 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b185b31cd30d2380019e022c2e4810ac709e4d9d
|
4
|
+
data.tar.gz: 73f2b6dd47c2ece6792182479faae17321ffab2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29df923f46791f74dd9f5c25f79f293631600acf643b61f6c0000dbf63c7c604bf68530e1e693927dc8be9675e9880fcf4ef501dae713c11c339638f816fadfd
|
7
|
+
data.tar.gz: 9ab45da37c5b2dbbb266e817bf2a51dbfb24765697394ee82bcfc004de958528c1386319eace100e7aeb8bb28f51cf9c7d7fec77e81b0d2cedd4b2873db07f65
|
data/dataflow-rb.gemspec
CHANGED
@@ -35,9 +35,9 @@ Gem::Specification.new do |spec|
|
|
35
35
|
spec.add_dependency 'parallel', '~>1.10'
|
36
36
|
spec.add_dependency 'mongoid', '~>6.0'
|
37
37
|
spec.add_dependency 'sequel', '~>4.0'
|
38
|
-
spec.add_dependency 'mysql2', '
|
39
|
-
spec.add_dependency 'pg', '0.
|
40
|
-
spec.add_dependency 'sequel_pg', '~>1.
|
38
|
+
spec.add_dependency 'mysql2', '>=0.3'
|
39
|
+
spec.add_dependency 'pg', '~>0.21'
|
40
|
+
spec.add_dependency 'sequel_pg', '~>1.7'
|
41
41
|
spec.add_dependency 'msgpack', '~>1.0'
|
42
42
|
spec.add_dependency 'smarter_csv', '1.1.0'
|
43
43
|
spec.add_dependency 'timeliness', '~>0.3'
|
@@ -210,19 +210,16 @@ module Dataflow
|
|
210
210
|
end
|
211
211
|
|
212
212
|
def usage(dataset:)
|
213
|
-
indexes = retrieve_collection_indexes(dataset)
|
214
213
|
command = { collstats: dataset }
|
215
214
|
result = client.database.command(command).documents[0]
|
216
215
|
{
|
217
216
|
memory: result['size'],
|
218
217
|
storage: result['storageSize'],
|
219
|
-
effective_indexes: indexes
|
220
218
|
}
|
221
219
|
rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
|
222
220
|
{
|
223
221
|
memory: 0,
|
224
222
|
storage: 0,
|
225
|
-
effective_indexes: indexes
|
226
223
|
}
|
227
224
|
end
|
228
225
|
|
@@ -289,6 +286,20 @@ module Dataflow
|
|
289
286
|
sanitized_opts
|
290
287
|
end
|
291
288
|
|
289
|
+
def retrieve_dataset_indexes(collection)
|
290
|
+
mongo_indexes = client[collection].indexes
|
291
|
+
mongo_indexes.map do |idx|
|
292
|
+
# skip the default index
|
293
|
+
next if idx['key'].keys == ['_id']
|
294
|
+
|
295
|
+
index = { 'key' => idx['key'].keys }
|
296
|
+
index['unique'] = true if idx['unique']
|
297
|
+
index
|
298
|
+
end.compact
|
299
|
+
rescue Mongo::Error::OperationFailure
|
300
|
+
[]
|
301
|
+
end
|
302
|
+
|
292
303
|
private
|
293
304
|
|
294
305
|
def write_dataset_name
|
@@ -322,18 +333,6 @@ module Dataflow
|
|
322
333
|
index[:unique] = true if dataset_index[:unique]
|
323
334
|
index
|
324
335
|
end
|
325
|
-
|
326
|
-
def retrieve_collection_indexes(collection)
|
327
|
-
mongo_indexes = client[collection].indexes
|
328
|
-
mongo_indexes.map do |idx|
|
329
|
-
# skip the default index
|
330
|
-
next if idx['key'].keys == ['_id']
|
331
|
-
|
332
|
-
index = { 'key' => idx['key'].keys }
|
333
|
-
index['unique'] = true if idx['unique']
|
334
|
-
index
|
335
|
-
end.compact
|
336
|
-
end
|
337
336
|
end
|
338
337
|
end
|
339
338
|
end
|
@@ -4,7 +4,7 @@ module Dataflow
|
|
4
4
|
# Interface between a data node and mongodb.
|
5
5
|
# We use mongodb to perform all the store/retrieve operations.
|
6
6
|
class MysqlAdapter < SqlAdapter
|
7
|
-
def
|
7
|
+
def usage(dataset:)
|
8
8
|
size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
|
9
9
|
{
|
10
10
|
memory: size,
|
@@ -4,7 +4,7 @@ module Dataflow
|
|
4
4
|
# Interface between a data node and mongodb.
|
5
5
|
# We use mongodb to perform all the store/retrieve operations.
|
6
6
|
class PsqlAdapter < SqlAdapter
|
7
|
-
def
|
7
|
+
def usage(dataset:)
|
8
8
|
size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
|
9
9
|
{
|
10
10
|
memory: size,
|
@@ -227,12 +227,6 @@ module Dataflow
|
|
227
227
|
end
|
228
228
|
end
|
229
229
|
|
230
|
-
def usage(dataset:)
|
231
|
-
indexes = retrieve_collection_indexes(dataset)
|
232
|
-
table_usage = fetch_table_usage(dataset: dataset)
|
233
|
-
table_usage.merge(effective_indexes: indexes)
|
234
|
-
end
|
235
|
-
|
236
230
|
def transform_to_query(opts)
|
237
231
|
# map to a serie of AND clauses queries
|
238
232
|
opts.flat_map do |k, v|
|
@@ -263,6 +257,18 @@ module Dataflow
|
|
263
257
|
end
|
264
258
|
end
|
265
259
|
|
260
|
+
def retrieve_dataset_indexes(dataset_name)
|
261
|
+
psql_indexes = client.indexes(dataset_name)
|
262
|
+
psql_indexes.values.map do |idx|
|
263
|
+
cols = idx[:columns].map(&:to_s)
|
264
|
+
index = { 'key' => cols }
|
265
|
+
index['unique'] = true if idx[:unique]
|
266
|
+
index
|
267
|
+
end.compact
|
268
|
+
rescue Sequel::DatabaseError
|
269
|
+
[]
|
270
|
+
end
|
271
|
+
|
266
272
|
private
|
267
273
|
|
268
274
|
MAX_INT = 2_147_483_647
|
@@ -330,18 +336,6 @@ module Dataflow
|
|
330
336
|
params
|
331
337
|
end
|
332
338
|
|
333
|
-
def retrieve_collection_indexes(dataset_name)
|
334
|
-
psql_indexes = client.indexes(dataset_name)
|
335
|
-
psql_indexes.values.map do |idx|
|
336
|
-
cols = idx[:columns].map(&:to_s)
|
337
|
-
index = { 'key' => cols }
|
338
|
-
index['unique'] = true if idx[:unique]
|
339
|
-
index
|
340
|
-
end.compact
|
341
|
-
rescue Sequel::DatabaseError
|
342
|
-
[]
|
343
|
-
end
|
344
|
-
|
345
339
|
def logger
|
346
340
|
@logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
|
347
341
|
end
|
@@ -69,6 +69,7 @@ module Dataflow
|
|
69
69
|
# internal use: where to read/write from. Use 1 and 2 for legacy reasons.
|
70
70
|
field :read_dataset_idx, type: Integer, editable: false, default: 1
|
71
71
|
field :write_dataset_idx, type: Integer, editable: false, default: 2
|
72
|
+
field :double_buffer_prefix, type: String, editable: false, default: 'buffer'
|
72
73
|
|
73
74
|
# Necessary fields:
|
74
75
|
validates_presence_of :db_name
|
@@ -234,7 +235,7 @@ module Dataflow
|
|
234
235
|
return @temporary_read_dataset if @temporary_read_dataset
|
235
236
|
|
236
237
|
if use_double_buffering
|
237
|
-
"#{name}
|
238
|
+
"#{name}_#{double_buffer_prefix}#{read_dataset_idx}"
|
238
239
|
else
|
239
240
|
name
|
240
241
|
end
|
@@ -242,7 +243,7 @@ module Dataflow
|
|
242
243
|
|
243
244
|
def write_dataset_name
|
244
245
|
if use_double_buffering
|
245
|
-
"#{name}
|
246
|
+
"#{name}_#{double_buffer_prefix}#{write_dataset_idx}"
|
246
247
|
else
|
247
248
|
name
|
248
249
|
end
|
@@ -297,12 +298,17 @@ module Dataflow
|
|
297
298
|
updated_at: updated_at,
|
298
299
|
record_count: count,
|
299
300
|
indexes: indexes,
|
300
|
-
|
301
|
+
db_indexes: db_indexes(write_dataset: write_dataset),
|
301
302
|
mem_usage: usage[:memory],
|
302
303
|
storage_usage: usage[:storage]
|
303
304
|
}
|
304
305
|
end
|
305
306
|
|
307
|
+
def db_indexes(write_dataset: false)
|
308
|
+
dataset = write_dataset ? write_dataset_name : read_dataset_name
|
309
|
+
db_adapter.retrieve_dataset_indexes(dataset)
|
310
|
+
end
|
311
|
+
|
306
312
|
def use_symbols?
|
307
313
|
(db_backend.to_s =~ /sql/).present?
|
308
314
|
end
|
@@ -367,7 +373,7 @@ module Dataflow
|
|
367
373
|
raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a double buffer dataset but node '#{name}' is expecting a single buffered one." if read_idx > 0 && !use_double_buffering
|
368
374
|
|
369
375
|
if use_double_buffering
|
370
|
-
dataset_name =
|
376
|
+
dataset_name = dataset_name_for_buffer(read_idx)
|
371
377
|
else
|
372
378
|
dataset_name = name
|
373
379
|
end
|
@@ -416,12 +422,16 @@ module Dataflow
|
|
416
422
|
|
417
423
|
def valid_dataset_names
|
418
424
|
if use_double_buffering
|
419
|
-
[
|
425
|
+
[dataset_name_for_buffer(read_dataset_idx), dataset_name_for_buffer(write_dataset_idx)]
|
420
426
|
else
|
421
427
|
[name]
|
422
428
|
end
|
423
429
|
end
|
424
430
|
|
431
|
+
def dataset_name_for_buffer(idx)
|
432
|
+
"#{name}_#{double_buffer_prefix}#{idx}"
|
433
|
+
end
|
434
|
+
|
425
435
|
def logger
|
426
436
|
@logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
|
427
437
|
end
|
@@ -43,13 +43,27 @@ module Dataflow
|
|
43
43
|
# TODO: re-enabled event on_export_progressed
|
44
44
|
# progress = (idx / queries.count.to_f * 100).ceil
|
45
45
|
# on_export_progressed(pct_complete: progress)
|
46
|
-
|
46
|
+
|
47
|
+
fields = transform_fields(node.db_backend, sch.keys)
|
48
|
+
|
49
|
+
batch = node.all(where: query.merge(where), fields: fields, sort: { system_id => 1 })
|
47
50
|
csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
|
48
51
|
end
|
49
52
|
|
50
53
|
# needed by the csv exporter to finalize in a single file
|
51
54
|
csv_adapter.on_save_finished
|
52
55
|
end
|
56
|
+
|
57
|
+
# Transform the keys to the field that need to be selected on the backend.
|
58
|
+
# That's a fix meant especially for selecting nested values on mongo
|
59
|
+
def transform_fields(db_backend, keys)
|
60
|
+
return keys unless db_backend == :mongodb
|
61
|
+
|
62
|
+
# replace the separator with a dot and make sure we don't select individual
|
63
|
+
# array keys... it seems to breakdown mongodb
|
64
|
+
keys.map { |k| k.gsub(Dataflow::SchemaMixin::SEPARATOR, '.') }
|
65
|
+
.map { |k| k.gsub(/\.[0-9]+/, '') }.uniq
|
66
|
+
end
|
53
67
|
end
|
54
68
|
end
|
55
69
|
end
|
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataflow-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-08-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -196,44 +196,44 @@ dependencies:
|
|
196
196
|
name: mysql2
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
198
198
|
requirements:
|
199
|
-
- - "
|
199
|
+
- - ">="
|
200
200
|
- !ruby/object:Gem::Version
|
201
|
-
version: '0.
|
201
|
+
version: '0.3'
|
202
202
|
type: :runtime
|
203
203
|
prerelease: false
|
204
204
|
version_requirements: !ruby/object:Gem::Requirement
|
205
205
|
requirements:
|
206
|
-
- - "
|
206
|
+
- - ">="
|
207
207
|
- !ruby/object:Gem::Version
|
208
|
-
version: '0.
|
208
|
+
version: '0.3'
|
209
209
|
- !ruby/object:Gem::Dependency
|
210
210
|
name: pg
|
211
211
|
requirement: !ruby/object:Gem::Requirement
|
212
212
|
requirements:
|
213
|
-
- -
|
213
|
+
- - "~>"
|
214
214
|
- !ruby/object:Gem::Version
|
215
|
-
version: '0.
|
215
|
+
version: '0.21'
|
216
216
|
type: :runtime
|
217
217
|
prerelease: false
|
218
218
|
version_requirements: !ruby/object:Gem::Requirement
|
219
219
|
requirements:
|
220
|
-
- -
|
220
|
+
- - "~>"
|
221
221
|
- !ruby/object:Gem::Version
|
222
|
-
version: '0.
|
222
|
+
version: '0.21'
|
223
223
|
- !ruby/object:Gem::Dependency
|
224
224
|
name: sequel_pg
|
225
225
|
requirement: !ruby/object:Gem::Requirement
|
226
226
|
requirements:
|
227
227
|
- - "~>"
|
228
228
|
- !ruby/object:Gem::Version
|
229
|
-
version: '1.
|
229
|
+
version: '1.7'
|
230
230
|
type: :runtime
|
231
231
|
prerelease: false
|
232
232
|
version_requirements: !ruby/object:Gem::Requirement
|
233
233
|
requirements:
|
234
234
|
- - "~>"
|
235
235
|
- !ruby/object:Gem::Version
|
236
|
-
version: '1.
|
236
|
+
version: '1.7'
|
237
237
|
- !ruby/object:Gem::Dependency
|
238
238
|
name: msgpack
|
239
239
|
requirement: !ruby/object:Gem::Requirement
|