dataflow-rb 0.15.0 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: '081f28d6c668f92bfe5da20f2301136af28949ae'
-  data.tar.gz: 39dd214829a164c21b0c8c6b0d3406f423c84e82
+  metadata.gz: b185b31cd30d2380019e022c2e4810ac709e4d9d
+  data.tar.gz: 73f2b6dd47c2ece6792182479faae17321ffab2a
 SHA512:
-  metadata.gz: 56db86c9444331cfa4d7ab41d6210066aff33ed4aabe0e139131ce42659f94be09f3a241a14580bec68871dc18bc5c371d075f3c5831689c5b08e982ff12e639
-  data.tar.gz: 05a3a5e3eab0b89aa89046f9c70c68c6cbdb4cc59c672dd327d4dc069299a964f7f1f34e3dca299e9d097c8da4c486054a2447b6f700e9958bd32a90fd6b3794
+  metadata.gz: 29df923f46791f74dd9f5c25f79f293631600acf643b61f6c0000dbf63c7c604bf68530e1e693927dc8be9675e9880fcf4ef501dae713c11c339638f816fadfd
+  data.tar.gz: 9ab45da37c5b2dbbb266e817bf2a51dbfb24765697394ee82bcfc004de958528c1386319eace100e7aeb8bb28f51cf9c7d7fec77e81b0d2cedd4b2873db07f65
data/dataflow-rb.gemspec CHANGED
@@ -35,9 +35,9 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'parallel', '~>1.10'
   spec.add_dependency 'mongoid', '~>6.0'
   spec.add_dependency 'sequel', '~>4.0'
-  spec.add_dependency 'mysql2', '~>0.4'
-  spec.add_dependency 'pg', '0.20'
-  spec.add_dependency 'sequel_pg', '~>1.6'
+  spec.add_dependency 'mysql2', '>=0.3'
+  spec.add_dependency 'pg', '~>0.21'
+  spec.add_dependency 'sequel_pg', '~>1.7'
   spec.add_dependency 'msgpack', '~>1.0'
   spec.add_dependency 'smarter_csv', '1.1.0'
   spec.add_dependency 'timeliness', '~>0.3'
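
A note on the constraint changes above: '~>' is RubyGems' pessimistic operator, so '~>0.21' accepts any pg release from 0.21 up to (but excluding) 1.0, where the previous bare '0.20' pinned pg to exactly 0.20; mysql2 moved the other way, from the pessimistic '~>0.4' to the looser floor '>=0.3'. A minimal sketch of how these requirements behave, using the Gem::Requirement API that ships with RubyGems (version numbers below are illustrative):

    require 'rubygems'

    # The old pg constraint was an exact pin; the new one is pessimistic.
    pg_old = Gem::Requirement.new('0.20')    # only 0.20 satisfies it
    pg_new = Gem::Requirement.new('~>0.21')  # >= 0.21 and < 1.0

    pg_old.satisfied_by?(Gem::Version.new('0.21'))   # => false
    pg_new.satisfied_by?(Gem::Version.new('0.21.5')) # => true
    pg_new.satisfied_by?(Gem::Version.new('1.0'))    # => false

    # mysql2 is now only bounded from below.
    mysql2_new = Gem::Requirement.new('>=0.3')
    mysql2_new.satisfied_by?(Gem::Version.new('0.5.0')) # => true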
@@ -210,19 +210,16 @@ module Dataflow
      end

      def usage(dataset:)
-        indexes = retrieve_collection_indexes(dataset)
        command = { collstats: dataset }
        result = client.database.command(command).documents[0]
        {
          memory: result['size'],
          storage: result['storageSize'],
-          effective_indexes: indexes
        }
      rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
        {
          memory: 0,
          storage: 0,
-          effective_indexes: indexes
        }
      end

@@ -289,6 +286,20 @@ module Dataflow
        sanitized_opts
      end

+      def retrieve_dataset_indexes(collection)
+        mongo_indexes = client[collection].indexes
+        mongo_indexes.map do |idx|
+          # skip the default index
+          next if idx['key'].keys == ['_id']
+
+          index = { 'key' => idx['key'].keys }
+          index['unique'] = true if idx['unique']
+          index
+        end.compact
+      rescue Mongo::Error::OperationFailure
+        []
+      end
+
      private

      def write_dataset_name
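
The retrieve_dataset_indexes method added above is now part of the adapter's public interface (it replaces the private retrieve_collection_indexes removed in the next hunk) and normalizes the mongo driver's index documents into plain hashes. A minimal sketch of that mapping, run against hand-built index documents shaped like those the driver yields (collection contents are illustrative):

    mongo_indexes = [
      { 'key' => { '_id' => 1 } },                          # default index: skipped
      { 'key' => { 'email' => 1 }, 'unique' => true },
      { 'key' => { 'first_name' => 1, 'last_name' => 1 } }
    ]

    indexes = mongo_indexes.map do |idx|
      next if idx['key'].keys == ['_id']

      index = { 'key' => idx['key'].keys }
      index['unique'] = true if idx['unique']
      index
    end.compact

    indexes
    # => [{"key"=>["email"], "unique"=>true}, {"key"=>["first_name", "last_name"]}]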
@@ -322,18 +333,6 @@ module Dataflow
        index[:unique] = true if dataset_index[:unique]
        index
      end
-
-      def retrieve_collection_indexes(collection)
-        mongo_indexes = client[collection].indexes
-        mongo_indexes.map do |idx|
-          # skip the default index
-          next if idx['key'].keys == ['_id']
-
-          index = { 'key' => idx['key'].keys }
-          index['unique'] = true if idx['unique']
-          index
-        end.compact
-      end
    end
  end
end
@@ -4,7 +4,7 @@ module Dataflow
    # Interface between a data node and mongodb.
    # We use mongodb to perform all the store/retrieve operations.
    class MysqlAdapter < SqlAdapter
-      def fetch_table_usage(dataset:)
+      def usage(dataset:)
        size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
        {
          memory: size,
@@ -4,7 +4,7 @@ module Dataflow
    # Interface between a data node and mongodb.
    # We use mongodb to perform all the store/retrieve operations.
    class PsqlAdapter < SqlAdapter
-      def fetch_table_usage(dataset:)
+      def usage(dataset:)
        size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
        {
          memory: size,
@@ -227,12 +227,6 @@ module Dataflow
        end
      end

-      def usage(dataset:)
-        indexes = retrieve_collection_indexes(dataset)
-        table_usage = fetch_table_usage(dataset: dataset)
-        table_usage.merge(effective_indexes: indexes)
-      end
-
      def transform_to_query(opts)
        # map to a serie of AND clauses queries
        opts.flat_map do |k, v|
@@ -263,6 +257,18 @@ module Dataflow
        end
      end

+      def retrieve_dataset_indexes(dataset_name)
+        psql_indexes = client.indexes(dataset_name)
+        psql_indexes.values.map do |idx|
+          cols = idx[:columns].map(&:to_s)
+          index = { 'key' => cols }
+          index['unique'] = true if idx[:unique]
+          index
+        end.compact
+      rescue Sequel::DatabaseError
+        []
+      end
+
      private

      MAX_INT = 2_147_483_647
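
This is the SQL counterpart of the mongo change above: Sequel's Database#indexes returns a hash of index name to metadata (columns as symbols, plus a :unique flag), which the new method flattens into the same shape the mongo adapter produces. A sketch against a hand-built hash mimicking Sequel's return value (index and column names are illustrative):

    psql_indexes = {
      users_email_idx: { columns: [:email], unique: true },
      users_name_idx:  { columns: [:first_name, :last_name], unique: false }
    }

    psql_indexes.values.map do |idx|
      cols = idx[:columns].map(&:to_s)
      index = { 'key' => cols }
      index['unique'] = true if idx[:unique]
      index
    end.compact
    # => [{"key"=>["email"], "unique"=>true}, {"key"=>["first_name", "last_name"]}]

Both backends now report indexes in one common format, which is what lets DataNode#db_indexes (further down) call retrieve_dataset_indexes without caring about the backend.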
@@ -330,18 +336,6 @@ module Dataflow
        params
      end

-      def retrieve_collection_indexes(dataset_name)
-        psql_indexes = client.indexes(dataset_name)
-        psql_indexes.values.map do |idx|
-          cols = idx[:columns].map(&:to_s)
-          index = { 'key' => cols }
-          index['unique'] = true if idx[:unique]
-          index
-        end.compact
-      rescue Sequel::DatabaseError
-        []
-      end
-
      def logger
        @logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
      end
@@ -69,6 +69,7 @@ module Dataflow
      # internal use: where to read/write from. Use 1 and 2 for legacy reasons.
      field :read_dataset_idx,  type: Integer, editable: false, default: 1
      field :write_dataset_idx, type: Integer, editable: false, default: 2
+      field :double_buffer_prefix, type: String, editable: false, default: 'buffer'

      # Necessary fields:
      validates_presence_of :db_name
@@ -234,7 +235,7 @@ module Dataflow
        return @temporary_read_dataset if @temporary_read_dataset

        if use_double_buffering
-          "#{name}_buffer#{read_dataset_idx}"
+          "#{name}_#{double_buffer_prefix}#{read_dataset_idx}"
        else
          name
        end
@@ -242,7 +243,7 @@ module Dataflow

      def write_dataset_name
        if use_double_buffering
-          "#{name}_buffer#{write_dataset_idx}"
+          "#{name}_#{double_buffer_prefix}#{write_dataset_idx}"
        else
          name
        end
@@ -297,12 +298,17 @@ module Dataflow
          updated_at: updated_at,
          record_count: count,
          indexes: indexes,
-          effective_indexes: usage[:effective_indexes],
+          db_indexes: db_indexes(write_dataset: write_dataset),
          mem_usage: usage[:memory],
          storage_usage: usage[:storage]
        }
      end

+      def db_indexes(write_dataset: false)
+        dataset = write_dataset ? write_dataset_name : read_dataset_name
+        db_adapter.retrieve_dataset_indexes(dataset)
+      end
+
      def use_symbols?
        (db_backend.to_s =~ /sql/).present?
      end
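
The new db_indexes helper replaces the effective_indexes entry that usage used to carry: instead of each adapter bundling index data into its usage stats, the node now asks the adapter directly, for either buffer. A hypothetical call on a double-buffered node named 'users' (names illustrative):

    node.db_indexes                       # indexes on the read dataset, e.g. users_buffer1
    node.db_indexes(write_dataset: true)  # indexes on the write dataset, e.g. users_buffer2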
@@ -367,7 +373,7 @@ module Dataflow
        raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a double buffer dataset but node '#{name}' is expecting a single buffered one." if read_idx > 0 && !use_double_buffering

        if use_double_buffering
-          dataset_name = valid_dataset_names[read_idx - 1]
+          dataset_name = dataset_name_for_buffer(read_idx)
        else
          dataset_name = name
        end
@@ -416,12 +422,16 @@ module Dataflow

      def valid_dataset_names
        if use_double_buffering
-          ["#{name}_buffer1", "#{name}_buffer2"]
+          [dataset_name_for_buffer(read_dataset_idx), dataset_name_for_buffer(write_dataset_idx)]
        else
          [name]
        end
      end

+      def dataset_name_for_buffer(idx)
+        "#{name}_#{double_buffer_prefix}#{idx}"
+      end
+
      def logger
        @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
      end
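
With the double_buffer_prefix field introduced above, dataset_name_for_buffer becomes the single place that derives buffer names. Under the default prefix it reproduces exactly the names the old hard-coded strings generated, so existing double-buffered datasets keep resolving. A quick illustration (node name is hypothetical):

    name = 'users'
    double_buffer_prefix = 'buffer'  # the field's default

    [1, 2].map { |idx| "#{name}_#{double_buffer_prefix}#{idx}" }
    # => ["users_buffer1", "users_buffer2"]
    # identical to the removed ["#{name}_buffer1", "#{name}_buffer2"]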
@@ -43,13 +43,27 @@ module Dataflow
          # TODO: re-enabled event on_export_progressed
          # progress = (idx / queries.count.to_f * 100).ceil
          # on_export_progressed(pct_complete: progress)
-          batch = node.all(where: query.merge(where), fields: sch.keys, sort: { system_id => 1 })
+
+          fields = transform_fields(node.db_backend, sch.keys)
+
+          batch = node.all(where: query.merge(where), fields: fields, sort: { system_id => 1 })
          csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
        end

        # needed by the csv exporter to finalize in a single file
        csv_adapter.on_save_finished
      end
+
+      # Transform the keys to the field that need to be selected on the backend.
+      # That's a fix meant especially for selecting nested values on mongo
+      def transform_fields(db_backend, keys)
+        return keys unless db_backend == :mongodb
+
+        # replace the separator with a dot and make sure we don't select individual
+        # array keys... it seems to breakdown mongodb
+        keys.map { |k| k.gsub(Dataflow::SchemaMixin::SEPARATOR, '.') }
+            .map { |k| k.gsub(/\.[0-9]+/, '') }.uniq
+      end
    end
  end
end
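
The effect of transform_fields on mongo exports, sketched with a stand-in separator (the real Dataflow::SchemaMixin::SEPARATOR is defined elsewhere in the gem; '|' below is only illustrative):

    separator = '|'  # hypothetical stand-in for Dataflow::SchemaMixin::SEPARATOR

    keys = ['address|city', 'tags|0', 'tags|1', 'name']
    keys.map { |k| k.gsub(separator, '.') }
        .map { |k| k.gsub(/\.[0-9]+/, '') }.uniq
    # => ["address.city", "tags", "name"]

Nested keys become dotted mongo field paths, and flattened array positions such as 'tags|0' and 'tags|1' collapse into the single parent field 'tags', since projecting individual array elements was what broke the mongo queries.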
@@ -1,4 +1,4 @@
 # frozen_string_literal: true
 module Dataflow
-  VERSION = '0.15.0'
+  VERSION = '0.16.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.15.0
+  version: 0.16.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-07-06 00:00:00.000000000 Z
+date: 2017-08-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -196,44 +196,44 @@ dependencies:
   name: mysql2
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0.4'
+        version: '0.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0.4'
+        version: '0.3'
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.20'
+        version: '0.21'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.20'
+        version: '0.21'
 - !ruby/object:Gem::Dependency
   name: sequel_pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.6'
+        version: '1.7'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.6'
+        version: '1.7'
 - !ruby/object:Gem::Dependency
   name: msgpack
   requirement: !ruby/object:Gem::Requirement