dataflow-rb 0.15.0 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/dataflow-rb.gemspec +3 -3
- data/lib/dataflow/adapters/mongo_db_adapter.rb +14 -15
- data/lib/dataflow/adapters/mysql_adapter.rb +1 -1
- data/lib/dataflow/adapters/psql_adapter.rb +1 -1
- data/lib/dataflow/adapters/sql_adapter.rb +12 -18
- data/lib/dataflow/nodes/data_node.rb +15 -5
- data/lib/dataflow/nodes/export/to_csv_node.rb +15 -1
- data/lib/dataflow/version.rb +1 -1
- metadata +12 -12
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b185b31cd30d2380019e022c2e4810ac709e4d9d
|
4
|
+
data.tar.gz: 73f2b6dd47c2ece6792182479faae17321ffab2a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 29df923f46791f74dd9f5c25f79f293631600acf643b61f6c0000dbf63c7c604bf68530e1e693927dc8be9675e9880fcf4ef501dae713c11c339638f816fadfd
|
7
|
+
data.tar.gz: 9ab45da37c5b2dbbb266e817bf2a51dbfb24765697394ee82bcfc004de958528c1386319eace100e7aeb8bb28f51cf9c7d7fec77e81b0d2cedd4b2873db07f65
|
data/dataflow-rb.gemspec
CHANGED
@@ -35,9 +35,9 @@ Gem::Specification.new do |spec|
|
|
35
35
|
spec.add_dependency 'parallel', '~>1.10'
|
36
36
|
spec.add_dependency 'mongoid', '~>6.0'
|
37
37
|
spec.add_dependency 'sequel', '~>4.0'
|
38
|
-
spec.add_dependency 'mysql2', '
|
39
|
-
spec.add_dependency 'pg', '0.
|
40
|
-
spec.add_dependency 'sequel_pg', '~>1.
|
38
|
+
spec.add_dependency 'mysql2', '>=0.3'
|
39
|
+
spec.add_dependency 'pg', '~>0.21'
|
40
|
+
spec.add_dependency 'sequel_pg', '~>1.7'
|
41
41
|
spec.add_dependency 'msgpack', '~>1.0'
|
42
42
|
spec.add_dependency 'smarter_csv', '1.1.0'
|
43
43
|
spec.add_dependency 'timeliness', '~>0.3'
|
@@ -210,19 +210,16 @@ module Dataflow
|
|
210
210
|
end
|
211
211
|
|
212
212
|
def usage(dataset:)
|
213
|
-
indexes = retrieve_collection_indexes(dataset)
|
214
213
|
command = { collstats: dataset }
|
215
214
|
result = client.database.command(command).documents[0]
|
216
215
|
{
|
217
216
|
memory: result['size'],
|
218
217
|
storage: result['storageSize'],
|
219
|
-
effective_indexes: indexes
|
220
218
|
}
|
221
219
|
rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
|
222
220
|
{
|
223
221
|
memory: 0,
|
224
222
|
storage: 0,
|
225
|
-
effective_indexes: indexes
|
226
223
|
}
|
227
224
|
end
|
228
225
|
|
@@ -289,6 +286,20 @@ module Dataflow
|
|
289
286
|
sanitized_opts
|
290
287
|
end
|
291
288
|
|
289
|
+
def retrieve_dataset_indexes(collection)
|
290
|
+
mongo_indexes = client[collection].indexes
|
291
|
+
mongo_indexes.map do |idx|
|
292
|
+
# skip the default index
|
293
|
+
next if idx['key'].keys == ['_id']
|
294
|
+
|
295
|
+
index = { 'key' => idx['key'].keys }
|
296
|
+
index['unique'] = true if idx['unique']
|
297
|
+
index
|
298
|
+
end.compact
|
299
|
+
rescue Mongo::Error::OperationFailure
|
300
|
+
[]
|
301
|
+
end
|
302
|
+
|
292
303
|
private
|
293
304
|
|
294
305
|
def write_dataset_name
|
@@ -322,18 +333,6 @@ module Dataflow
|
|
322
333
|
index[:unique] = true if dataset_index[:unique]
|
323
334
|
index
|
324
335
|
end
|
325
|
-
|
326
|
-
def retrieve_collection_indexes(collection)
|
327
|
-
mongo_indexes = client[collection].indexes
|
328
|
-
mongo_indexes.map do |idx|
|
329
|
-
# skip the default index
|
330
|
-
next if idx['key'].keys == ['_id']
|
331
|
-
|
332
|
-
index = { 'key' => idx['key'].keys }
|
333
|
-
index['unique'] = true if idx['unique']
|
334
|
-
index
|
335
|
-
end.compact
|
336
|
-
end
|
337
336
|
end
|
338
337
|
end
|
339
338
|
end
|
@@ -4,7 +4,7 @@ module Dataflow
|
|
4
4
|
# Interface between a data node and mongodb.
|
5
5
|
# We use mongodb to perform all the store/retrieve operations.
|
6
6
|
class MysqlAdapter < SqlAdapter
|
7
|
-
def
|
7
|
+
def usage(dataset:)
|
8
8
|
size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
|
9
9
|
{
|
10
10
|
memory: size,
|
@@ -4,7 +4,7 @@ module Dataflow
|
|
4
4
|
# Interface between a data node and mongodb.
|
5
5
|
# We use mongodb to perform all the store/retrieve operations.
|
6
6
|
class PsqlAdapter < SqlAdapter
|
7
|
-
def
|
7
|
+
def usage(dataset:)
|
8
8
|
size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
|
9
9
|
{
|
10
10
|
memory: size,
|
@@ -227,12 +227,6 @@ module Dataflow
|
|
227
227
|
end
|
228
228
|
end
|
229
229
|
|
230
|
-
def usage(dataset:)
|
231
|
-
indexes = retrieve_collection_indexes(dataset)
|
232
|
-
table_usage = fetch_table_usage(dataset: dataset)
|
233
|
-
table_usage.merge(effective_indexes: indexes)
|
234
|
-
end
|
235
|
-
|
236
230
|
def transform_to_query(opts)
|
237
231
|
# map to a serie of AND clauses queries
|
238
232
|
opts.flat_map do |k, v|
|
@@ -263,6 +257,18 @@ module Dataflow
|
|
263
257
|
end
|
264
258
|
end
|
265
259
|
|
260
|
+
def retrieve_dataset_indexes(dataset_name)
|
261
|
+
psql_indexes = client.indexes(dataset_name)
|
262
|
+
psql_indexes.values.map do |idx|
|
263
|
+
cols = idx[:columns].map(&:to_s)
|
264
|
+
index = { 'key' => cols }
|
265
|
+
index['unique'] = true if idx[:unique]
|
266
|
+
index
|
267
|
+
end.compact
|
268
|
+
rescue Sequel::DatabaseError
|
269
|
+
[]
|
270
|
+
end
|
271
|
+
|
266
272
|
private
|
267
273
|
|
268
274
|
MAX_INT = 2_147_483_647
|
@@ -330,18 +336,6 @@ module Dataflow
|
|
330
336
|
params
|
331
337
|
end
|
332
338
|
|
333
|
-
def retrieve_collection_indexes(dataset_name)
|
334
|
-
psql_indexes = client.indexes(dataset_name)
|
335
|
-
psql_indexes.values.map do |idx|
|
336
|
-
cols = idx[:columns].map(&:to_s)
|
337
|
-
index = { 'key' => cols }
|
338
|
-
index['unique'] = true if idx[:unique]
|
339
|
-
index
|
340
|
-
end.compact
|
341
|
-
rescue Sequel::DatabaseError
|
342
|
-
[]
|
343
|
-
end
|
344
|
-
|
345
339
|
def logger
|
346
340
|
@logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
|
347
341
|
end
|
@@ -69,6 +69,7 @@ module Dataflow
|
|
69
69
|
# internal use: where to read/write from. Use 1 and 2 for legacy reasons.
|
70
70
|
field :read_dataset_idx, type: Integer, editable: false, default: 1
|
71
71
|
field :write_dataset_idx, type: Integer, editable: false, default: 2
|
72
|
+
field :double_buffer_prefix, type: String, editable: false, default: 'buffer'
|
72
73
|
|
73
74
|
# Necessary fields:
|
74
75
|
validates_presence_of :db_name
|
@@ -234,7 +235,7 @@ module Dataflow
|
|
234
235
|
return @temporary_read_dataset if @temporary_read_dataset
|
235
236
|
|
236
237
|
if use_double_buffering
|
237
|
-
"#{name}
|
238
|
+
"#{name}_#{double_buffer_prefix}#{read_dataset_idx}"
|
238
239
|
else
|
239
240
|
name
|
240
241
|
end
|
@@ -242,7 +243,7 @@ module Dataflow
|
|
242
243
|
|
243
244
|
def write_dataset_name
|
244
245
|
if use_double_buffering
|
245
|
-
"#{name}
|
246
|
+
"#{name}_#{double_buffer_prefix}#{write_dataset_idx}"
|
246
247
|
else
|
247
248
|
name
|
248
249
|
end
|
@@ -297,12 +298,17 @@ module Dataflow
|
|
297
298
|
updated_at: updated_at,
|
298
299
|
record_count: count,
|
299
300
|
indexes: indexes,
|
300
|
-
|
301
|
+
db_indexes: db_indexes(write_dataset: write_dataset),
|
301
302
|
mem_usage: usage[:memory],
|
302
303
|
storage_usage: usage[:storage]
|
303
304
|
}
|
304
305
|
end
|
305
306
|
|
307
|
+
def db_indexes(write_dataset: false)
|
308
|
+
dataset = write_dataset ? write_dataset_name : read_dataset_name
|
309
|
+
db_adapter.retrieve_dataset_indexes(dataset)
|
310
|
+
end
|
311
|
+
|
306
312
|
def use_symbols?
|
307
313
|
(db_backend.to_s =~ /sql/).present?
|
308
314
|
end
|
@@ -367,7 +373,7 @@ module Dataflow
|
|
367
373
|
raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a double buffer dataset but node '#{name}' is expecting a single buffered one." if read_idx > 0 && !use_double_buffering
|
368
374
|
|
369
375
|
if use_double_buffering
|
370
|
-
dataset_name =
|
376
|
+
dataset_name = dataset_name_for_buffer(read_idx)
|
371
377
|
else
|
372
378
|
dataset_name = name
|
373
379
|
end
|
@@ -416,12 +422,16 @@ module Dataflow
|
|
416
422
|
|
417
423
|
def valid_dataset_names
|
418
424
|
if use_double_buffering
|
419
|
-
[
|
425
|
+
[dataset_name_for_buffer(read_dataset_idx), dataset_name_for_buffer(write_dataset_idx)]
|
420
426
|
else
|
421
427
|
[name]
|
422
428
|
end
|
423
429
|
end
|
424
430
|
|
431
|
+
def dataset_name_for_buffer(idx)
|
432
|
+
"#{name}_#{double_buffer_prefix}#{idx}"
|
433
|
+
end
|
434
|
+
|
425
435
|
def logger
|
426
436
|
@logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
|
427
437
|
end
|
@@ -43,13 +43,27 @@ module Dataflow
|
|
43
43
|
# TODO: re-enabled event on_export_progressed
|
44
44
|
# progress = (idx / queries.count.to_f * 100).ceil
|
45
45
|
# on_export_progressed(pct_complete: progress)
|
46
|
-
|
46
|
+
|
47
|
+
fields = transform_fields(node.db_backend, sch.keys)
|
48
|
+
|
49
|
+
batch = node.all(where: query.merge(where), fields: fields, sort: { system_id => 1 })
|
47
50
|
csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
|
48
51
|
end
|
49
52
|
|
50
53
|
# needed by the csv exporter to finalize in a single file
|
51
54
|
csv_adapter.on_save_finished
|
52
55
|
end
|
56
|
+
|
57
|
+
# Transform the keys to the field that need to be selected on the backend.
|
58
|
+
# That's a fix meant especially for selecting nested values on mongo
|
59
|
+
def transform_fields(db_backend, keys)
|
60
|
+
return keys unless db_backend == :mongodb
|
61
|
+
|
62
|
+
# replace the separator with a dot and make sure we don't select individual
|
63
|
+
# array keys... it seems to breakdown mongodb
|
64
|
+
keys.map { |k| k.gsub(Dataflow::SchemaMixin::SEPARATOR, '.') }
|
65
|
+
.map { |k| k.gsub(/\.[0-9]+/, '') }.uniq
|
66
|
+
end
|
53
67
|
end
|
54
68
|
end
|
55
69
|
end
|
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataflow-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-08-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -196,44 +196,44 @@ dependencies:
|
|
196
196
|
name: mysql2
|
197
197
|
requirement: !ruby/object:Gem::Requirement
|
198
198
|
requirements:
|
199
|
-
- - "
|
199
|
+
- - ">="
|
200
200
|
- !ruby/object:Gem::Version
|
201
|
-
version: '0.
|
201
|
+
version: '0.3'
|
202
202
|
type: :runtime
|
203
203
|
prerelease: false
|
204
204
|
version_requirements: !ruby/object:Gem::Requirement
|
205
205
|
requirements:
|
206
|
-
- - "
|
206
|
+
- - ">="
|
207
207
|
- !ruby/object:Gem::Version
|
208
|
-
version: '0.
|
208
|
+
version: '0.3'
|
209
209
|
- !ruby/object:Gem::Dependency
|
210
210
|
name: pg
|
211
211
|
requirement: !ruby/object:Gem::Requirement
|
212
212
|
requirements:
|
213
|
-
- -
|
213
|
+
- - "~>"
|
214
214
|
- !ruby/object:Gem::Version
|
215
|
-
version: '0.
|
215
|
+
version: '0.21'
|
216
216
|
type: :runtime
|
217
217
|
prerelease: false
|
218
218
|
version_requirements: !ruby/object:Gem::Requirement
|
219
219
|
requirements:
|
220
|
-
- -
|
220
|
+
- - "~>"
|
221
221
|
- !ruby/object:Gem::Version
|
222
|
-
version: '0.
|
222
|
+
version: '0.21'
|
223
223
|
- !ruby/object:Gem::Dependency
|
224
224
|
name: sequel_pg
|
225
225
|
requirement: !ruby/object:Gem::Requirement
|
226
226
|
requirements:
|
227
227
|
- - "~>"
|
228
228
|
- !ruby/object:Gem::Version
|
229
|
-
version: '1.
|
229
|
+
version: '1.7'
|
230
230
|
type: :runtime
|
231
231
|
prerelease: false
|
232
232
|
version_requirements: !ruby/object:Gem::Requirement
|
233
233
|
requirements:
|
234
234
|
- - "~>"
|
235
235
|
- !ruby/object:Gem::Version
|
236
|
-
version: '1.
|
236
|
+
version: '1.7'
|
237
237
|
- !ruby/object:Gem::Dependency
|
238
238
|
name: msgpack
|
239
239
|
requirement: !ruby/object:Gem::Requirement
|