dataflow-rb 0.15.0 → 0.16.0
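This release relaxes the mysql2 and pg dependency pins, unifies per-table usage reporting behind a single usage(dataset:) method on every adapter, promotes index introspection to the public retrieve_dataset_indexes, makes the double-buffering dataset prefix a configurable field, and teaches the CSV export to project nested MongoDB fields correctly.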

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: '081f28d6c668f92bfe5da20f2301136af28949ae'
-  data.tar.gz: 39dd214829a164c21b0c8c6b0d3406f423c84e82
+  metadata.gz: b185b31cd30d2380019e022c2e4810ac709e4d9d
+  data.tar.gz: 73f2b6dd47c2ece6792182479faae17321ffab2a
 SHA512:
-  metadata.gz: 56db86c9444331cfa4d7ab41d6210066aff33ed4aabe0e139131ce42659f94be09f3a241a14580bec68871dc18bc5c371d075f3c5831689c5b08e982ff12e639
-  data.tar.gz: 05a3a5e3eab0b89aa89046f9c70c68c6cbdb4cc59c672dd327d4dc069299a964f7f1f34e3dca299e9d097c8da4c486054a2447b6f700e9958bd32a90fd6b3794
+  metadata.gz: 29df923f46791f74dd9f5c25f79f293631600acf643b61f6c0000dbf63c7c604bf68530e1e693927dc8be9675e9880fcf4ef501dae713c11c339638f816fadfd
+  data.tar.gz: 9ab45da37c5b2dbbb266e817bf2a51dbfb24765697394ee82bcfc004de958528c1386319eace100e7aeb8bb28f51cf9c7d7fec77e81b0d2cedd4b2873db07f65
data/dataflow-rb.gemspec CHANGED
@@ -35,9 +35,9 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'parallel', '~>1.10'
   spec.add_dependency 'mongoid', '~>6.0'
   spec.add_dependency 'sequel', '~>4.0'
-  spec.add_dependency 'mysql2', '~>0.4'
-  spec.add_dependency 'pg', '0.20'
-  spec.add_dependency 'sequel_pg', '~>1.6'
+  spec.add_dependency 'mysql2', '>=0.3'
+  spec.add_dependency 'pg', '~>0.21'
+  spec.add_dependency 'sequel_pg', '~>1.7'
   spec.add_dependency 'msgpack', '~>1.0'
   spec.add_dependency 'smarter_csv', '1.1.0'
   spec.add_dependency 'timeliness', '~>0.3'
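The constraint changes loosen mysql2 from a pessimistic pin to an open-ended minimum, trade pg's exact pin for a pessimistic one, and bump sequel_pg. For reference, what each operator admits can be checked with the stock Gem::Requirement API; the version numbers below are illustrative, not taken from this gem:

    require 'rubygems' # stdlib; provides Gem::Requirement and Gem::Version

    # '>= 0.3' accepts any later release, with no upper bound.
    Gem::Requirement.new('>= 0.3').satisfied_by?(Gem::Version.new('0.5.1'))   # => true

    # '~> 0.21' means '>= 0.21, < 1.0': minor bumps pass, the next major does not.
    Gem::Requirement.new('~> 0.21').satisfied_by?(Gem::Version.new('0.22.0')) # => true
    Gem::Requirement.new('~> 0.21').satisfied_by?(Gem::Version.new('1.0.0'))  # => false

    # The old exact pin on pg rejected everything but 0.20.
    Gem::Requirement.new('= 0.20').satisfied_by?(Gem::Version.new('0.21.0'))  # => false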
data/lib/dataflow/adapters/mongo_db_adapter.rb CHANGED
@@ -210,19 +210,16 @@ module Dataflow
       end
 
       def usage(dataset:)
-        indexes = retrieve_collection_indexes(dataset)
         command = { collstats: dataset }
         result = client.database.command(command).documents[0]
         {
           memory: result['size'],
           storage: result['storageSize'],
-          effective_indexes: indexes
         }
       rescue Mongo::Error::OperationFailure, Mongo::Error::InvalidCollectionName
         {
           memory: 0,
           storage: 0,
-          effective_indexes: indexes
         }
       end
 
@@ -289,6 +286,20 @@ module Dataflow
         sanitized_opts
       end
 
+      def retrieve_dataset_indexes(collection)
+        mongo_indexes = client[collection].indexes
+        mongo_indexes.map do |idx|
+          # skip the default index
+          next if idx['key'].keys == ['_id']
+
+          index = { 'key' => idx['key'].keys }
+          index['unique'] = true if idx['unique']
+          index
+        end.compact
+      rescue Mongo::Error::OperationFailure
+        []
+      end
+
       private
 
       def write_dataset_name
@@ -322,18 +333,6 @@ module Dataflow
         index[:unique] = true if dataset_index[:unique]
         index
       end
-
-      def retrieve_collection_indexes(collection)
-        mongo_indexes = client[collection].indexes
-        mongo_indexes.map do |idx|
-          # skip the default index
-          next if idx['key'].keys == ['_id']
-
-          index = { 'key' => idx['key'].keys }
-          index['unique'] = true if idx['unique']
-          index
-        end.compact
-      end
     end
   end
 end
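The index helper is now part of the adapter's public surface (it previously sat below private as retrieve_collection_indexes) and returns [] on Mongo::Error::OperationFailure instead of raising. A runnable sketch of the mapping it applies, fed with invented index documents in the shape the Mongo driver's collection.indexes yields:

    # Invented sample data mimicking what client[collection].indexes returns.
    mongo_indexes = [
      { 'key' => { '_id' => 1 } },
      { 'key' => { 'email' => 1 }, 'unique' => true },
      { 'key' => { 'updated_at' => 1 } }
    ]

    # Same mapping as retrieve_dataset_indexes: drop the default _id index,
    # keep the key columns, and flag unique indexes.
    indexes = mongo_indexes.map do |idx|
      next if idx['key'].keys == ['_id']

      index = { 'key' => idx['key'].keys }
      index['unique'] = true if idx['unique']
      index
    end.compact

    p indexes # => [{"key"=>["email"], "unique"=>true}, {"key"=>["updated_at"]}]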
data/lib/dataflow/adapters/mysql_adapter.rb CHANGED
@@ -4,7 +4,7 @@ module Dataflow
     # Interface between a data node and mongodb.
     # We use mongodb to perform all the store/retrieve operations.
     class MysqlAdapter < SqlAdapter
-      def fetch_table_usage(dataset:)
+      def usage(dataset:)
         size = client["SELECT data_length + index_length as size from information_schema.TABLES WHERE table_schema = '#{settings.db_name}' and table_name = '#{dataset}'"].first[:size]
         {
           memory: size,
data/lib/dataflow/adapters/psql_adapter.rb CHANGED
@@ -4,7 +4,7 @@ module Dataflow
     # Interface between a data node and mongodb.
     # We use mongodb to perform all the store/retrieve operations.
     class PsqlAdapter < SqlAdapter
-      def fetch_table_usage(dataset:)
+      def usage(dataset:)
         size = client["SELECT pg_relation_size('#{dataset}') as size"].first[:size]
         {
           memory: size,
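Renaming fetch_table_usage to usage on both SQL adapters lines them up with the Mongo adapter above, so a caller can send the same message to any backend; the index list that usage used to bundle in now comes from the separate retrieve_dataset_indexes. A sketch of the resulting duck type, using stand-in classes (not the gem's) with invented byte counts:

    # Stand-ins showing the interface every adapter now satisfies.
    class StubMysqlAdapter
      def usage(dataset:)
        { memory: 1_048_576, storage: 1_048_576 } # invented sizes, bytes
      end
    end

    class StubPsqlAdapter
      def usage(dataset:)
        { memory: 524_288, storage: 524_288 }
      end
    end

    # Callers can now treat any backend uniformly; 'events' is an invented name.
    [StubMysqlAdapter.new, StubPsqlAdapter.new].each do |adapter|
      p adapter.usage(dataset: 'events')
    end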
data/lib/dataflow/adapters/sql_adapter.rb CHANGED
@@ -227,12 +227,6 @@ module Dataflow
         end
       end
 
-      def usage(dataset:)
-        indexes = retrieve_collection_indexes(dataset)
-        table_usage = fetch_table_usage(dataset: dataset)
-        table_usage.merge(effective_indexes: indexes)
-      end
-
       def transform_to_query(opts)
         # map to a serie of AND clauses queries
         opts.flat_map do |k, v|
@@ -263,6 +257,18 @@ module Dataflow
         end
       end
 
+      def retrieve_dataset_indexes(dataset_name)
+        psql_indexes = client.indexes(dataset_name)
+        psql_indexes.values.map do |idx|
+          cols = idx[:columns].map(&:to_s)
+          index = { 'key' => cols }
+          index['unique'] = true if idx[:unique]
+          index
+        end.compact
+      rescue Sequel::DatabaseError
+        []
+      end
+
       private
 
       MAX_INT = 2_147_483_647
@@ -330,18 +336,6 @@ module Dataflow
         params
       end
 
-      def retrieve_collection_indexes(dataset_name)
-        psql_indexes = client.indexes(dataset_name)
-        psql_indexes.values.map do |idx|
-          cols = idx[:columns].map(&:to_s)
-          index = { 'key' => cols }
-          index['unique'] = true if idx[:unique]
-          index
-        end.compact
-      rescue Sequel::DatabaseError
-        []
-      end
-
       def logger
         @logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
       end
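The SQL flavour of retrieve_dataset_indexes wraps Sequel's Database#indexes, which yields a hash keyed by index name with :columns and :unique entries, and flattens it into the same plain-hash format the Mongo adapter emits. A self-contained sketch of that mapping against an in-memory SQLite database (table and column names are invented; requires the sqlite3 gem):

    require 'sequel'

    db = Sequel.sqlite # in-memory database, illustration only

    db.create_table(:events) do
      primary_key :id
      String :external_id
      index :external_id, unique: true
    end

    # Database#indexes returns e.g.
    # { events_external_id_index: { columns: [:external_id], unique: true } };
    # the adapter flattens that into the gem's plain-hash index format.
    indexes = db.indexes(:events).values.map do |idx|
      index = { 'key' => idx[:columns].map(&:to_s) }
      index['unique'] = true if idx[:unique]
      index
    end

    p indexes # => [{"key"=>["external_id"], "unique"=>true}]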
data/lib/dataflow/nodes/data_node.rb CHANGED
@@ -69,6 +69,7 @@ module Dataflow
       # internal use: where to read/write from. Use 1 and 2 for legacy reasons.
       field :read_dataset_idx, type: Integer, editable: false, default: 1
       field :write_dataset_idx, type: Integer, editable: false, default: 2
+      field :double_buffer_prefix, type: String, editable: false, default: 'buffer'
 
       # Necessary fields:
       validates_presence_of :db_name
@@ -234,7 +235,7 @@ module Dataflow
        return @temporary_read_dataset if @temporary_read_dataset
 
        if use_double_buffering
-         "#{name}_buffer#{read_dataset_idx}"
+         "#{name}_#{double_buffer_prefix}#{read_dataset_idx}"
        else
          name
        end
@@ -242,7 +243,7 @@ module Dataflow
 
      def write_dataset_name
        if use_double_buffering
-         "#{name}_buffer#{write_dataset_idx}"
+         "#{name}_#{double_buffer_prefix}#{write_dataset_idx}"
        else
          name
        end
@@ -297,12 +298,17 @@ module Dataflow
          updated_at: updated_at,
          record_count: count,
          indexes: indexes,
-         effective_indexes: usage[:effective_indexes],
+         db_indexes: db_indexes(write_dataset: write_dataset),
          mem_usage: usage[:memory],
          storage_usage: usage[:storage]
        }
      end
 
+     def db_indexes(write_dataset: false)
+       dataset = write_dataset ? write_dataset_name : read_dataset_name
+       db_adapter.retrieve_dataset_indexes(dataset)
+     end
+
      def use_symbols?
        (db_backend.to_s =~ /sql/).present?
      end
@@ -367,7 +373,7 @@ module Dataflow
        raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a double buffer dataset but node '#{name}' is expecting a single buffered one." if read_idx > 0 && !use_double_buffering
 
        if use_double_buffering
-         dataset_name = valid_dataset_names[read_idx - 1]
+         dataset_name = dataset_name_for_buffer(read_idx)
        else
          dataset_name = name
        end
@@ -416,12 +422,16 @@ module Dataflow
 
      def valid_dataset_names
        if use_double_buffering
-         ["#{name}_buffer1", "#{name}_buffer2"]
+         [dataset_name_for_buffer(read_dataset_idx), dataset_name_for_buffer(write_dataset_idx)]
        else
          [name]
        end
      end
 
+     def dataset_name_for_buffer(idx)
+       "#{name}_#{double_buffer_prefix}#{idx}"
+     end
+
      def logger
        @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
      end
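The 'buffer' fragment of double-buffered dataset names used to be hard-coded in three places; it now comes from the new double_buffer_prefix field (defaulting to 'buffer', so existing datasets keep resolving) via the dataset_name_for_buffer helper. A minimal sketch of the resulting naming, with an invented node name:

    # With the defaults (prefix 'buffer', read idx 1, write idx 2),
    # a double-buffered node named 'users' resolves its datasets like so:
    name = 'users'                  # invented node name
    double_buffer_prefix = 'buffer' # the new field's default
    read_dataset_idx = 1
    write_dataset_idx = 2

    puts "#{name}_#{double_buffer_prefix}#{read_dataset_idx}"  # => users_buffer1
    puts "#{name}_#{double_buffer_prefix}#{write_dataset_idx}" # => users_buffer2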
data/lib/dataflow/nodes/export/to_csv_node.rb CHANGED
@@ -43,13 +43,27 @@ module Dataflow
           # TODO: re-enabled event on_export_progressed
           # progress = (idx / queries.count.to_f * 100).ceil
           # on_export_progressed(pct_complete: progress)
-          batch = node.all(where: query.merge(where), fields: sch.keys, sort: { system_id => 1 })
+
+          fields = transform_fields(node.db_backend, sch.keys)
+
+          batch = node.all(where: query.merge(where), fields: fields, sort: { system_id => 1 })
           csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
         end
 
         # needed by the csv exporter to finalize in a single file
         csv_adapter.on_save_finished
       end
+
+      # Transform the keys to the field that need to be selected on the backend.
+      # That's a fix meant especially for selecting nested values on mongo
+      def transform_fields(db_backend, keys)
+        return keys unless db_backend == :mongodb
+
+        # replace the separator with a dot and make sure we don't select individual
+        # array keys... it seems to breakdown mongodb
+        keys.map { |k| k.gsub(Dataflow::SchemaMixin::SEPARATOR, '.') }
+            .map { |k| k.gsub(/\.[0-9]+/, '') }.uniq
+      end
     end
   end
 end
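transform_fields converts the schema's flattened keys into MongoDB dot-notation projections and strips numeric array positions, since projecting individual array elements breaks the query. A standalone sketch of the transformation; the '|' separator is an assumption standing in for Dataflow::SchemaMixin::SEPARATOR, whose value this diff does not show:

    SEPARATOR = '|' # assumption: the flattening separator used by the schema mixin

    keys = ['address|city', 'tags|0', 'tags|1'] # invented flattened schema keys

    # Same two-step rewrite as transform_fields: separator to dot,
    # then drop numeric array segments and de-duplicate.
    fields = keys.map { |k| k.gsub(SEPARATOR, '.') }
                 .map { |k| k.gsub(/\.[0-9]+/, '') }
                 .uniq

    p fields # => ["address.city", "tags"]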
data/lib/dataflow/version.rb CHANGED
@@ -1,4 +1,4 @@
 # frozen_string_literal: true
 module Dataflow
-  VERSION = '0.15.0'
+  VERSION = '0.16.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.15.0
+  version: 0.16.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-07-06 00:00:00.000000000 Z
+date: 2017-08-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -196,44 +196,44 @@ dependencies:
   name: mysql2
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0.4'
+        version: '0.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - "~>"
+    - - ">="
       - !ruby/object:Gem::Version
-        version: '0.4'
+        version: '0.3'
 - !ruby/object:Gem::Dependency
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.20'
+        version: '0.21'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - - '='
+    - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.20'
+        version: '0.21'
 - !ruby/object:Gem::Dependency
   name: sequel_pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.6'
+        version: '1.7'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '1.6'
+        version: '1.7'
 - !ruby/object:Gem::Dependency
   name: msgpack
   requirement: !ruby/object:Gem::Requirement