dataflow-rb 0.13.0 → 0.14.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
- metadata.gz: 72840e2477fe869fb06b0299c96d5ae2a57c7713
- data.tar.gz: f9f03314f23473585a9e742740c0e809f2d99bc7
+ metadata.gz: 3d45b64a7e367df85841ae86e6fde6550c33319a
+ data.tar.gz: c4ac87dcaf77cd8a7b842a87523271a7be70a850
 SHA512:
- metadata.gz: 43f7cef4b2150017871cb7b3c0f21602a01f385e40d07ddf7000f455a4adc007669974fd4e7170e4acc3807feae907f6114e3b3cbfbdbbf36f96348c3a06f60c
- data.tar.gz: d16411f178fa8ccc00cc9dbaefd0905040a5f8354874f7b00ad5080681d912fa8051dd16e3975d25aed8c737fdf70594f3bb77a948e95665d92a198f0e65206c
+ metadata.gz: 13abdbc494c020670183e3630261d684df43ea5c4fef110f567b97ca95883f7f8515014ce1e8b9eacca452e5180727a493cf8b7ea10595b1ea41aa632e074074
+ data.tar.gz: a0f6a2aff4b1ecce23b74610574cadf83c598d523b49ac7e05a4d68768f441398767a58a77959f3f4b037962d3b0f3b94804688c051b9576b8c6fe9abce6e159
@@ -1,5 +1,22 @@
 # Changelog
 
+ #### 0.14.0
+ - [ef8ddcd] Do not assume a minimum of 1 dependency per compute node.
+ - [b131bb1] Add type check on the data node #add methods. Filter nil values.
+ - [effc5a4] Set the rabbitmq as coming from the env
+ - [577ea2e] Add support for computing a node remotely.
+ - [4a450c3] Remove the custom not implemented error and use the default one.
+ - [f9c48c5] Added some new lines
+ - [336b9f8] Fix the backup options
+ - [2b2fbee] Make the runtime query node a subclass of the read only data node
+ - [fe237c4] Change the backup structure to isolate the db name by folder
+ - [654927f] Experiment with querying arrays
+ - [506f105] Order by system id when exporting
+ - [fa8fdc3] Keep the data ordered when exporting to csv
+ - [5e1718d] Add support for postgresql when inferring partial schemas (needed for export)
+
+ #### 0.13.1
+ - [aa3ed2e] Fix a bug when storing a db connection
 
 #### 0.13.0
 - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
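The headline change in 0.14.0 is remote execution of compute nodes over RabbitMQ (see the new lib/dataflow/executor.rb and lib/dataflow/remote_worker.rb further down, plus the new bunny dependency). A minimal usage sketch, assuming a broker reachable via the MOJACO_RABBITMQ_URI environment variable and using the field and class names introduced in this release; the node name and lookup are illustrative:

```ruby
require 'dataflow-rb'

# Scheduler side: flag an existing compute node for remote, batched execution.
node = Dataflow::Nodes::ComputeNode.find_by(name: 'my_compute_node') # illustrative lookup
node.execution_model = :remote_batch     # :local (default), :remote or :remote_batch
node.execution_queue = 'dataflow.ruby'   # default work queue name added in 0.14.0
node.save
node.recompute # publishes one message per batch, then waits on a completion queue

# Worker side (separate process or machine): consume and execute published work.
Dataflow::RemoteWorker.work('dataflow.ruby')
```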
@@ -42,4 +42,5 @@ Gem::Specification.new do |spec|
 spec.add_dependency 'smarter_csv', '1.1.0'
 spec.add_dependency 'timeliness', '~>0.3'
 spec.add_dependency 'chronic', '~>0.10'
+ spec.add_dependency 'bunny', '~>2.7'
 end
@@ -17,6 +17,8 @@ require 'dataflow/logger'
17
17
  require 'dataflow/properties_mixin'
18
18
  require 'dataflow/schema_mixin'
19
19
  require 'dataflow/node'
20
+ require 'dataflow/executor'
21
+ require 'dataflow/remote_worker'
20
22
 
21
23
  require 'dataflow/adapters/csv_adapter'
22
24
  require 'dataflow/adapters/mongo_db_adapter'
@@ -26,7 +28,7 @@ require 'dataflow/adapters/psql_adapter'
26
28
  require 'dataflow/adapters/settings'
27
29
 
28
30
  require 'dataflow/errors/invalid_configuration_error'
29
- require 'dataflow/errors/not_implemented_error'
31
+ require 'dataflow/errors/remote_execution_error'
30
32
 
31
33
  require 'dataflow/nodes/mixin/add_internal_timestamp'
32
34
  require 'dataflow/nodes/mixin/rename_dotted_fields'
@@ -27,7 +27,7 @@ module Dataflow
27
27
 
28
28
  # retrieve a single element from a data node
29
29
  def find(where: opts = {})
30
- raise Errors::NotImplementedError, '#find is not yet support on CSV.'
30
+ raise NotImplementedError, '#find is not yet supported on CSV.'
31
31
  end
32
32
 
33
33
  # retrieve all elements from a data node
@@ -43,8 +43,8 @@ module Dataflow
43
43
  end
44
44
 
45
45
  # save the given records
46
- def save(records:)
47
- write_csv_part(records, keys: @schema.keys)
46
+ def save(records:, part: nil)
47
+ write_csv_part(records, keys: @schema.keys, part: part)
48
48
  end
49
49
 
50
50
  def on_save_finished
@@ -52,7 +52,7 @@ module Dataflow
52
52
  end
53
53
 
54
54
  def remove(_opts = {})
55
- raise Errors::NotImplementedError, '#find is not yet support on CSV.'
55
+ raise NotImplementedError, '#remove is not yet supported on CSV.'
56
56
  end
57
57
 
58
58
  def recreate_dataset(dataset: nil)
@@ -79,10 +79,10 @@ module Dataflow
79
79
 
80
80
  def file_parts
81
81
  part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
82
- Dir["#{file_path}.part_*"]
82
+ Dir["#{file_path}.part_*"].sort
83
83
  end
84
84
 
85
- def write_csv_part(data, keys:)
85
+ def write_csv_part(data, keys:, part:)
86
86
  # prepare the data
87
87
  key_tokens = keys.map { |key| record_dig_tokens(key: key) }
88
88
  rows = data.map do |datum|
@@ -90,8 +90,8 @@ module Dataflow
90
90
  end
91
91
 
92
92
  # dump in a part file
93
- uuid = SecureRandom.hex
94
- CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
93
+ part ||= SecureRandom.hex
94
+ CSV.open("#{file_path}.part_#{part}", 'w') do |csv|
95
95
  rows.each { |row| csv << row }
96
96
  end
97
97
  end
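The new part: parameter ties in with the export changes further down: the exporter now passes a zero-padded batch index instead of a random hex, so the Dir[...].sort added to file_parts returns the parts in batch order. A quick illustration of why the padding matters:

```ruby
# Zero-padded suffixes sort lexicographically in the same order as numerically.
parts = (0..11).map { |i| "export.csv.part_#{i.to_s.rjust(2, '0')}" }
parts.shuffle.sort == parts # => true ("part_10" would otherwise sort before "part_2")
```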
@@ -226,24 +226,26 @@ module Dataflow
226
226
  end
227
227
 
228
228
  def dump(base_folder:)
229
- archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.gz"
230
- options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
231
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
232
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
233
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
234
- options += "--password=#{@settings.db_password}" if @settings.db_password.present?
235
- `mkdir -p #{base_folder}`
229
+ archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.gz"
230
+ options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
231
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
232
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
233
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
234
+ options += "--password=#{@settings.db_password} " if @settings.db_password.present?
235
+
236
+ `mkdir -p #{base_folder}/#{@settings.db_name}`
236
237
  `mongodump #{options} --gzip`
237
238
  archive_path
238
239
  end
239
240
 
240
241
  def restore(filepath:)
241
- options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
242
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
243
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
244
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
245
- options += "--password=#{@settings.db_password}" if @settings.db_password.present?
246
- `mongorestore #{options} --gzip`
242
+ options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
243
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
244
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
245
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
246
+ options += "--password=#{@settings.db_password} " if @settings.db_password.present?
247
+
248
+ `mongorestore #{options} --drop --gzip`
247
249
  end
248
250
 
249
251
  def transform_to_query(opts)
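Two behavioral notes on the MongoDB backup changes above: archives are now namespaced per database directory, and restore passes --drop so the target collection is replaced rather than merged into. Roughly, assuming a mongo-backed data node and calling through its adapter (node name and lookup are illustrative):

```ruby
node = Dataflow::Nodes::DataNode.find_by(name: 'users') # illustrative node
node.send(:db_adapter).dump(base_folder: './backups')
# 0.13.x => "./backups/<db_name>.users.gz"
# 0.14.0 => "./backups/<db_name>/users.gz" (the per-db folder is created with mkdir -p)
```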
@@ -26,24 +26,26 @@ module Dataflow
26
26
  end
27
27
 
28
28
  def dump(base_folder:)
29
- archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.dump"
30
- options = "--table=public.#{@settings.read_dataset_name}"
31
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
32
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
33
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
29
+ archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.dump"
30
+ options = "--table=public.#{@settings.read_dataset_name} "
31
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
32
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
33
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
34
34
  password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
35
- `mkdir -p #{base_folder}`
35
+
36
+ `mkdir -p #{base_folder}/#{@settings.db_name}`
36
37
  `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
37
38
  archive_path
38
39
  end
39
40
 
40
41
  def restore(filepath:)
41
- options = "--table=#{@settings.read_dataset_name}"
42
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
43
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
44
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
42
+ options = "--table=#{@settings.read_dataset_name} "
43
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
44
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
45
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
45
46
  password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
46
- p "#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}"
47
+
48
+ drop_dataset(@settings.read_dataset_name)
47
49
  `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
48
50
  end
49
51
  end
@@ -12,16 +12,17 @@ module Dataflow
12
12
  def client(settings)
13
13
  @clients ||= {}
14
14
  connection_uri = settings.connection_uri_or_default
15
- return @clients[connection_uri] if @clients[connection_uri].present?
15
+ full_uri = "#{connection_uri}/#{settings.db_name}?encoding=utf8"
16
+ return @clients[full_uri] if @clients[full_uri].present?
16
17
 
17
18
  # first, make sure the DB is created (if it is not an external db)
18
19
  is_external_db = settings.connection_uri.present?
19
20
  try_create_db(connection_uri, settings.db_name) unless is_external_db
20
21
 
21
22
  # then, create the connection object
22
- db = Sequel.connect("#{connection_uri}/#{settings.db_name}?encoding=utf8")
23
+ db = Sequel.connect(full_uri)
23
24
  add_extensions(settings, db)
24
- @clients[connection_uri] = db
25
+ @clients[full_uri] = db
25
26
  end
26
27
 
27
28
  # Used internally to try to create the DB automatically.
@@ -242,6 +243,8 @@ module Dataflow
242
243
  end
243
244
  when '<', '<=', '>', '>='
244
245
  Sequel.lit("#{k} #{operator} ?", value)
246
+ when '@>', '<@'
247
+ Sequel.lit("#{k} #{operator} ?", Sequel.pg_array(Array(value)))
245
248
  when '~'
246
249
  Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
247
250
  when '~*'
@@ -291,19 +294,15 @@ module Dataflow
291
294
  end
292
295
  when 'numeric'
293
296
  col_type = 'real'
294
- when 'array', 'hash'
295
- logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
296
- col_type = 'json'
297
297
  when 'date', 'time'
298
298
  # keep as-is
299
299
  col_type = type
300
300
  else
301
- logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
302
301
  col_type = type
303
302
  end
304
303
 
305
304
  # create a column with the given type
306
- p "#{column} #{type} -> #{col_type}"
305
+ logger.log("#{column} #{type} -> #{col_type}")
307
306
  column(column.to_sym, col_type)
308
307
  end
309
308
  end
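The new '@>' / '<@' case corresponds to the "Experiment with querying arrays" changelog entry: PostgreSQL array containment operators are turned into Sequel literal fragments with a properly typed array bind. A sketch of the Sequel-level behavior, assuming the pg_array extension is loaded on the connection (which add_extensions appears to handle) and, presumably, a node-level where clause of the form { 'tags' => { '@>' => ['ruby'] } }:

```ruby
require 'sequel'

DB = Sequel.connect(ENV.fetch('DATABASE_URL')) # illustrative PostgreSQL connection
DB.extension :pg_array

# What the new branch builds for k = 'tags', operator = '@>', value = ['ruby']:
condition = Sequel.lit('tags @> ?', Sequel.pg_array(['ruby']))
DB[:users].where(condition).sql
# => SELECT * FROM "users" WHERE (tags @> ARRAY['ruby'])
```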
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+ module Dataflow
3
+ module Errors
4
+ class RemoteExecutionError < StandardError
5
+
6
+ def initialize(msg, backtrace)
7
+ super(msg)
8
+ set_backtrace(backtrace)
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+ require 'bunny'
3
+ require 'json'
4
+ require 'thread'
5
+
6
+ module Dataflow
7
+ class Executor
8
+ class << self
9
+ def execute(node)
10
+ case node.execution_model
11
+ when :remote
12
+ execute_remote_computation(node: node, is_batch_execution: false)
13
+ when :remote_batch
14
+ execute_remote_computation(node: node, is_batch_execution: true)
15
+ when :local
16
+ node.execute_local_computation
17
+ else
18
+ raise ArgumentError, "Unknown execution model #{execution_model}"
19
+ end
20
+ end
21
+
22
+ def execute_remote_computation(node:, is_batch_execution:)
23
+ execution_uuid = node.execution_uuid
24
+ raise ArgumentError, "Expected execution uuid to be set on '#{node.name}' (##{node._id})" unless execution_uuid.present?
25
+
26
+ logger.log("Started processing '#{node.name}'")
27
+ conn, channel, completion_queue = open_communication_channel
28
+ logger.log("Opened a completion queue for '#{node.name}': #{completion_queue.name}")
29
+
30
+ messages = send_execution_messages(channel, node, is_batch_execution, completion_queue.name)
31
+ error_data = await_execution_completion(completion_queue, messages.count)
32
+ logger.log("Finished processing '#{node.name}'")
33
+
34
+ raise Errors::RemoteExecutionError.new(error_data['message'], error_data['backtrace']) if error_data
35
+ ensure
36
+ conn&.close
37
+ end
38
+
39
+ def open_communication_channel
40
+ conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
41
+ conn.start
42
+
43
+ ch = conn.create_channel
44
+ completion_queue = ch.queue('', exclusive: true)
45
+
46
+ return conn, ch, completion_queue
47
+ end
48
+
49
+ def send_execution_messages(channel, node, is_batch_execution, completion_queue_name)
50
+ execution_params = make_execution_params(node, is_batch_execution, completion_queue_name)
51
+
52
+ execution_queue = channel.queue(node.execution_queue)
53
+ execution_params.each do |exec_params|
54
+ execution_queue.publish(exec_params.to_json)
55
+ end
56
+
57
+ execution_params
58
+ end
59
+
60
+ def make_execution_params(node, is_batch_execution, completion_queue_name)
61
+ execution_params = if is_batch_execution
62
+ node.make_batch_params
63
+ else
64
+ [{}]
65
+ end
66
+
67
+ execution_params.each_with_index.map do |params, idx|
68
+ {
69
+ msg_id: idx,
70
+ node_id: node._id.to_s,
71
+ is_batch: is_batch_execution,
72
+ params: params,
73
+ execution_uuid: node.execution_uuid.to_s,
74
+ completion_queue_name: completion_queue_name
75
+ }
76
+ end
77
+ end
78
+
79
+ def await_execution_completion(completion_queue, expected_completion_count)
80
+ completed_message_indexes = []
81
+ unblock = Queue.new
82
+
83
+ consumer = completion_queue.subscribe do |_delivery_info, _properties, payload|
84
+ data = JSON.parse(payload)
85
+ unblock.enq(data['error']) if data['error'].present?
86
+
87
+ completed_message_indexes << data['msg_id']
88
+ if completed_message_indexes.count == expected_completion_count
89
+ unblock.enq(false)
90
+ end
91
+ end
92
+
93
+ error_data = unblock.deq
94
+ consumer.cancel
95
+
96
+ error_data
97
+ end
98
+
99
+ def logger
100
+ @logger ||= Dataflow::Logger.new(prefix: 'Executor')
101
+ end
102
+ end
103
+ end
104
+ end
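For reference, one message published to the execution queue by make_execution_params above looks roughly like this; all values are illustrative and the params shape depends entirely on the node's make_batch_params:

```ruby
{
  msg_id: 0,
  node_id: '59361f6f54e1c42a97000001',              # compute node BSON id, as a string
  is_batch: true,
  params: { '_id' => { '>=' => 1, '<=' => 5000 } }, # one batch query (shape varies per node)
  execution_uuid: '59361f6f54e1c42a97000002',
  completion_queue_name: 'amq.gen-Jz7x1'
}.to_json
```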
@@ -57,6 +57,16 @@ module Dataflow
57
57
  # The node name
58
58
  field :name, type: String
59
59
 
60
+ # The execution model:
61
+ field :execution_model, type: Symbol, default: :local
62
+
63
+ # For remote computation only:
64
+ # Controls on which queue this execution will be routed
65
+ field :execution_queue, type: String, default: 'dataflow.ruby'
66
+
67
+ # Unique ID of the current execution
68
+ field :execution_uuid, type: BSON::ObjectId
69
+
60
70
  # The data node to which we will write the computation output
61
71
  field :data_node_id, type: BSON::ObjectId
62
72
 
@@ -261,7 +271,7 @@ module Dataflow
261
271
  end
262
272
 
263
273
  send_heartbeat
264
- compute_impl
274
+ Executor.execute(self)
265
275
 
266
276
  if clear_data_on_compute
267
277
  # Post-compute, delay creating other indexes for insert speed
@@ -281,6 +291,9 @@ module Dataflow
281
291
  logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
282
292
  end
283
293
 
294
+ rescue Errors::RemoteExecutionError => e
295
+ on_computing_finished(state: 'error', error: e) if has_compute_lock
296
+ logger.error(error: e, custom_message: "#{name} failed computing remotely.")
284
297
  rescue StandardError => e
285
298
  on_computing_finished(state: 'error', error: e) if has_compute_lock
286
299
  logger.error(error: e, custom_message: "#{name} failed computing.")
@@ -296,13 +309,9 @@ module Dataflow
296
309
  def valid_for_computation?
297
310
  # Perform additional checks: also add errors to "self.errors"
298
311
  opts = self.class.dependency_opts
299
- if opts.key?(:exactly)
300
- ensure_exact_dependencies(count: opts[:exactly])
301
- elsif opts.key?(:max)
302
- ensure_at_most_dependencies(count: opts[:max])
303
- else # even if the min is not specified, we need at least 1 dependency
304
- ensure_at_least_dependencies(count: opts[:min] || 1)
305
- end
312
+ ensure_exact_dependencies(count: opts[:exactly]) if opts.key?(:exactly)
313
+ ensure_at_most_dependencies(count: opts[:max]) if opts.key?(:max)
314
+ ensure_at_least_dependencies(count: opts[:min]) if opts.key?(:min)
306
315
  ensure_no_cyclic_dependencies
307
316
  ensure_keys_are_set
308
317
  ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]
@@ -322,37 +331,67 @@ module Dataflow
322
331
  release_computing_lock!
323
332
  end
324
333
 
334
+ def execution_valid?(uuid)
335
+ execution_uuid.to_s == uuid.to_s
336
+ end
337
+
325
338
  # Keep a compatible interface with the data node
326
339
  def schema
327
340
  required_schema
328
341
  end
329
342
 
343
+ # Interface to execute this node locally
344
+ def execute_local_computation
345
+ compute_impl
346
+ end
347
+
348
+ # Interface to execute a part (batch) of this node locally.
349
+ # This method is called when the framework needs to execute a batch on a worker.
350
+ # Override when needed, to execute a batch depending on the params.
351
+ # If you override, you may want to override the make_batch_params as well.
352
+ def execute_local_batch_computation(batch_params)
353
+ records = dependencies.first.all(where: batch_params)
354
+ new_records = compute_batch(records: records)
355
+ data_node&.add(records: new_records)
356
+ end
357
+
358
+ # Interface used to retrieve the params for scheduled batches. Override when needed.
359
+ # The default implementation is to make queries that would
360
+ # ensure the full processing of the first dependency's records.
361
+ # @return [Array] of params that are passed to scheduled batches.
362
+ def make_batch_params
363
+ make_batch_queries(node: dependencies.first)
364
+ end
365
+
330
366
  private
331
367
 
332
- # Compute implementation:
368
+ # Default compute implementation:
333
369
  # - recreate the table
334
370
  # - compute the records
335
371
  # - save them to the DB
336
372
  # (the process may be overwritten on a per-node basis if needed)
373
+ # Override if you need to have a completely custom compute implementation
337
374
  def compute_impl
338
375
  process_parallel(node: dependencies.first)
339
376
  end
340
377
 
341
- def process_parallel(node:)
342
- return if node.blank?
343
- record_count = node.count
344
- return if record_count == 0
378
+ # This is an interface only.
379
+ # Override when you can implement a computation in terms of
380
+ # the records of the first dependent node.
381
+ # @param records [Array] a batch of records from the first dependency
382
+ # @return [Array] an array of results that are to be pushed to the data node (if set).
383
+ def compute_batch(records:)
384
+ []
385
+ end
345
386
 
346
- equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
347
- count_per_process = equal_split_per_process
348
- limit = limit_per_process.to_i
349
- count_per_process = [limit, equal_split_per_process].min if limit > 0
387
+ def process_parallel(node:)
388
+ queries = make_batch_queries(node: node)
389
+ return if queries.blank?
350
390
 
351
- queries = node.ordered_system_id_queries(batch_size: count_per_process)
352
391
  queries_count = queries.count
353
-
354
392
  parallel_each(queries.each_with_index) do |query, idx|
355
393
  send_heartbeat
394
+
356
395
  progress = (idx / queries_count.to_f * 100).ceil
357
396
  on_computing_progressed(pct_complete: progress)
358
397
  logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
@@ -365,25 +404,42 @@ module Dataflow
365
404
  compute_batch(records: records)
366
405
  end
367
406
 
368
- data_node.add(records: new_records)
407
+ data_node&.add(records: new_records)
369
408
  end
370
409
  end
371
410
 
372
- # This is an interface only.
373
- # Override with record computation logic.
374
- def compute_batch(records:)
375
- records
411
+ # Makes queries that support traversing the node's records in parallel without overlap.
412
+ def make_batch_queries(node:)
413
+ return [] if node.blank?
414
+ record_count = node.count
415
+ return [] if record_count == 0
416
+
417
+ equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
418
+ count_per_process = equal_split_per_process
419
+ limit = limit_per_process.to_i
420
+ count_per_process = [limit, equal_split_per_process].min if limit > 0
421
+
422
+ queries = node.ordered_system_id_queries(batch_size: count_per_process)
376
423
  end
377
424
 
378
425
  def acquire_computing_lock!
379
426
  # make sure that any pending changes are saved.
380
427
  save
428
+
429
+ compute_state = {
430
+ computing_state: 'computing',
431
+ computing_started_at: Time.now,
432
+ execution_uuid: BSON::ObjectId.new
433
+ }
381
434
  find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
382
- update_query = { '$set' => { computing_state: 'computing', computing_started_at: Time.now } }
435
+ update_query = { '$set' => compute_state }
436
+
383
437
  # send a query directly to avoid mongoid's caching layers
384
438
  res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
439
+
385
440
  # reload the model data after the query above
386
441
  reload
442
+
387
443
  # the query is atomic so if res != nil, we acquired the lock
388
444
  !res.nil?
389
445
  end
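The comments above spell out the reworked extension points: compute_batch(records:) now defaults to returning an empty array and is meant to be overridden with per-batch logic, while compute_impl, execute_local_batch_computation and make_batch_params cover more custom setups. A minimal custom node in that style (class, field and key names are illustrative):

```ruby
class NormalizeEmailsNode < Dataflow::Nodes::ComputeNode
  # Receives a batch of records from the first dependency and returns the
  # records to be added to the output data node.
  def compute_batch(records:)
    records.map do |r|
      { 'user_id' => r['user_id'], 'email' => r['email'].to_s.strip.downcase }
    end
  end
end
```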
@@ -391,20 +447,21 @@ module Dataflow
391
447
  def release_computing_lock!
392
448
  # make sure that any pending changes are saved.
393
449
  save
450
+
394
451
  find_query = { _id: _id }
395
- update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
452
+ update_query = { '$set' => { computing_state: nil, computing_started_at: nil, execution_uuid: nil } }
453
+
396
454
  # send a query directly to avoid mongoid's caching layers
397
455
  Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
456
+
398
457
  # reload the model data after the query above
399
458
  reload
400
459
  end
401
460
 
402
461
  def await_computing!
403
- start_waiting_at = Time.now
404
- # TODO: should the max wait time be dependent on e.g. the recompute interval?
405
462
  max_wait_time = 15.minutes
406
- while Time.now < start_waiting_at + max_wait_time
407
- sleep 2
463
+ while Time.now < last_heartbeat_time + max_wait_time
464
+ sleep 5
408
465
  # reloads with the data stored on mongodb:
409
466
  # something maybe have been changed by another process.
410
467
  reload
@@ -436,7 +493,6 @@ module Dataflow
436
493
  update_query = { '$set' => { last_compute_starting_time: time } }
437
494
  Dataflow::Nodes::ComputeNode.where(_id: _id)
438
495
  .find_one_and_update(update_query)
439
-
440
496
  end
441
497
 
442
498
  ##############################
@@ -184,6 +184,8 @@ module Dataflow
184
184
  # Adds the given records to the dataset and updates the updated_at time.
185
185
  # @param records [Array] an array of the records to be added.
186
186
  def add(records:)
187
+ raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
188
+ records = records.compact
187
189
  return if records.blank?
188
190
  db_adapter.save(records: records)
189
191
  self.updated_at = Time.now
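In practice the new guard on #add means non-array inputs fail fast and nil entries are silently dropped; a hedged example (the node lookup is illustrative):

```ruby
node = Dataflow::Nodes::DataNode.find_by(name: 'users') # illustrative
node.add(records: [{ 'id' => 1 }, nil, { 'id' => 2 }])  # nils are compacted away before saving
node.add(records: { 'id' => 1 })
# => ArgumentError: records must be an array of documents. Received: 'Hash'.
```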
@@ -380,7 +382,7 @@ module Dataflow
380
382
  return @postgresql_adapter
381
383
  end
382
384
 
383
- raise Errors::NotImplementedError, "'#{db_backend}' backend is not implemented."
385
+ raise NotImplementedError, "'#{db_backend}' backend is not implemented."
384
386
  end
385
387
 
386
388
  def valid_dataset_names
@@ -37,13 +37,14 @@ module Dataflow
37
37
  count_per_process = [max_per_process, equal_split_per_process].min
38
38
 
39
39
  queries = node.ordered_system_id_queries(batch_size: count_per_process)
40
+ system_id = node.send(:db_adapter).class::SYSTEM_ID
40
41
 
41
- parallel_each(queries.each_with_index) do |query, _idx|
42
+ parallel_each(queries.each_with_index) do |query, idx|
42
43
  # TODO: re-enabled event on_export_progressed
43
44
  # progress = (idx / queries.count.to_f * 100).ceil
44
45
  # on_export_progressed(pct_complete: progress)
45
- batch = node.all(where: query.merge(where), fields: sch.keys)
46
- csv_adapter.save(records: batch)
46
+ batch = node.all(where: query.merge(where), fields: sch.keys, sort: { system_id => 1 })
47
+ csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
47
48
  end
48
49
 
49
50
  # needed by the csv exporter to finalize in a single file
@@ -3,38 +3,36 @@ module Dataflow
3
3
  module Nodes
4
4
  # Only supports read operations
5
5
  class ReadOnlyDataNode < DataNode
6
-
7
6
  def set_defaults
8
7
  super
9
8
  self.use_double_buffering = false
10
9
  end
11
10
 
12
-
13
11
  def handle_dataset_settings_changed
14
12
  # ignore - do not do anything
15
13
  end
16
14
 
17
- def add(*args)
15
+ def add(*_args)
18
16
  raise_read_only_error!
19
17
  end
20
18
 
21
- def clear(*args)
19
+ def clear(*_args)
22
20
  raise_read_only_error!
23
21
  end
24
22
 
25
- def recreate_dataset(*args)
23
+ def recreate_dataset(*_args)
26
24
  raise_read_only_error!
27
25
  end
28
26
 
29
- def create_unique_indexes(*args)
27
+ def create_unique_indexes(*_args)
30
28
  raise_read_only_error!
31
29
  end
32
30
 
33
- def create_non_unique_indexes(*args)
31
+ def create_non_unique_indexes(*_args)
34
32
  raise_read_only_error!
35
33
  end
36
34
 
37
- def read_dataset_name=(*args)
35
+ def read_dataset_name=(*_args)
38
36
  raise_read_only_error!
39
37
  end
40
38
 
@@ -42,21 +40,27 @@ module Dataflow
42
40
  raise_read_only_error!
43
41
  end
44
42
 
45
- def import(*args)
43
+ def import(*_args)
46
44
  raise_read_only_error!
47
45
  end
48
46
 
49
-
50
47
  def drop_dataset!
51
48
  raise_read_only_error!
52
49
  end
53
50
 
51
+ def dump_dataset(*_args)
52
+ raise_read_only_error!
53
+ end
54
+
55
+ def restore_dataset(*_args)
56
+ raise_read_only_error!
57
+ end
58
+
54
59
  private
55
60
 
56
61
  def raise_read_only_error!
57
- raise NotImplementedError, 'External data nodes are read only'
62
+ raise NotImplementedError, 'This node is read only'
58
63
  end
59
-
60
64
  end # class ExternalDataNode
61
65
  end # module Nodes
62
66
  end # module Dataflow
@@ -1,19 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
  module Dataflow
3
3
  # Interface for a node that behaves as a dataset.
4
- # Does not support any operation.
4
+ # Does not support any write operation.
5
5
  # Inherit and override to implement custom behavior.
6
6
  module Nodes
7
- class RuntimeQueryNode < DataNode
7
+ class RuntimeQueryNode < ReadOnlyDataNode
8
8
 
9
9
  after_initialize do
10
10
  self.db_backend = :none
11
11
  end
12
12
 
13
- def handle_dataset_settings_changed
14
- # dot not do anything, there is no real dataset
15
- end
16
-
17
13
  def all(*_args)
18
14
  raise NotImplementedError, 'this node does not support #all'
19
15
  end
@@ -30,13 +26,6 @@ module Dataflow
30
26
  raise NotImplementedError, 'this node does not support #all_paginated'
31
27
  end
32
28
 
33
- def add(*_args)
34
- raise NotImplementedError, 'this node does not support #add'
35
- end
36
-
37
- def clear(*_args)
38
- raise NotImplementedError, 'this node does not support #clear'
39
- end
40
29
  end
41
30
  end
42
31
  end
@@ -30,6 +30,10 @@ module Dataflow
30
30
  end
31
31
 
32
32
  def add(records:)
33
+ raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
34
+ records = records.compact
35
+ return if records.blank?
36
+
33
37
  # TODO: create a chain of behavior "before add"
34
38
  rename_dotted_fields(records: records)
35
39
  add_internal_timestamp(records: records)
@@ -42,6 +42,8 @@ module Dataflow
42
42
  end
43
43
 
44
44
  def add(records:)
45
+ raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
46
+ records = records.compact
45
47
  return if records.blank?
46
48
 
47
49
  # TODO: create a chain of behavior "before add"
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+ require 'bunny'
3
+ require 'json'
4
+
5
+ module Dataflow
6
+ class RemoteWorker
7
+ class << self
8
+ def work(work_queue_name = 'dataflow.ruby')
9
+ conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
10
+ conn.start
11
+
12
+ ch = conn.create_channel
13
+ queue = ch.queue(work_queue_name)
14
+ ch.prefetch(1)
15
+
16
+ logger.log("Accepting work on #{work_queue_name}...")
17
+
18
+ queue.subscribe(block: true, manual_ack: true) do |delivery_info, _properties, payload|
19
+ data = JSON.parse(payload)
20
+ response = process(data)
21
+ if response.present?
22
+ ch.default_exchange.publish(response.to_json, routing_key: data['completion_queue_name'])
23
+ end
24
+ ch.ack(delivery_info.delivery_tag)
25
+ end
26
+ ensure
27
+ conn.close
28
+ logger.log('Connection closed, stopped accepting work.')
29
+ end
30
+
31
+ def process(data)
32
+ node = Dataflow::Nodes::ComputeNode.find(data['node_id'])
33
+
34
+ unless node.execution_valid?(data['execution_uuid'])
35
+ logger.log("[#{data['msg_id']}] work on '#{node.name}' has expired. Skipping.")
36
+ return
37
+ end
38
+
39
+ errors = execute(node, data)
40
+ response = { msg_id: data['msg_id'] }
41
+ response.merge(errors[0])
42
+ rescue Mongoid::Errors::DocumentNotFound => e
43
+ { error: { message: e.message, backtrace: e.backtrace } }
44
+ end
45
+
46
+ def execute(node, payload_data)
47
+ # execute in a different process, so that once it's finished
48
+ # we can purge the memory
49
+ Parallel.map([payload_data]) do |data|
50
+ error = {}
51
+ logger.log("[#{data['msg_id']}] working on '#{node.name}'...")
52
+
53
+ begin
54
+ if data['is_batch']
55
+ node.execute_local_batch_computation(data['params'])
56
+ else
57
+ node.execute_local_computation
58
+ end
59
+ rescue StandardError => e
60
+ error = { error: { message: e.message, backtrace: e.backtrace } }
61
+ end
62
+
63
+ logger.log("[#{data['msg_id']}] done working on '#{node.name}'.")
64
+ error
65
+ end
66
+ end
67
+
68
+ def logger
69
+ @logger ||= Dataflow::Logger.new(prefix: 'Worker')
70
+ end
71
+ end
72
+ end
73
+ end
@@ -51,6 +51,15 @@ module Dataflow
51
51
  end
52
52
 
53
53
  def infer_partial_schema(where:, extended: false)
54
+ if db_backend == :postgresql
55
+ # Experimental
56
+ sch = db_adapter.client.schema(read_dataset_name).to_h
57
+ sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
58
+ self.inferred_schema = sch
59
+ save
60
+ return sch
61
+ end
62
+
54
63
  data_count = count(where: where)
55
64
  return {} if data_count == 0
56
65
 
@@ -1,4 +1,4 @@
 # frozen_string_literal: true
 module Dataflow
-  VERSION = '0.13.0'
+  VERSION = '0.14.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
- version: 0.13.0
+ version: 0.14.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
- date: 2017-05-23 00:00:00.000000000 Z
+ date: 2017-06-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
 name: bundler
@@ -290,6 +290,20 @@ dependencies:
 - - "~>"
 - !ruby/object:Gem::Version
 version: '0.10'
+ - !ruby/object:Gem::Dependency
+ name: bunny
+ requirement: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '2.7'
+ type: :runtime
+ prerelease: false
+ version_requirements: !ruby/object:Gem::Requirement
+ requirements:
+ - - "~>"
+ - !ruby/object:Gem::Version
+ version: '2.7'
 description: Helps building data pipelines. It handles recomputing dependencies and
 parallel execution.
 email:
@@ -319,8 +333,9 @@ files:
 - lib/dataflow/adapters/settings.rb
 - lib/dataflow/adapters/sql_adapter.rb
 - lib/dataflow/errors/invalid_configuration_error.rb
- - lib/dataflow/errors/not_implemented_error.rb
+ - lib/dataflow/errors/remote_execution_error.rb
 - lib/dataflow/event_mixin.rb
+ - lib/dataflow/executor.rb
 - lib/dataflow/extensions/mongo_driver.rb
 - lib/dataflow/extensions/msgpack.rb
 - lib/dataflow/logger.rb
@@ -344,6 +359,7 @@ files:
 - lib/dataflow/nodes/transformation/to_time_node.rb
 - lib/dataflow/nodes/upsert_node.rb
 - lib/dataflow/properties_mixin.rb
+ - lib/dataflow/remote_worker.rb
 - lib/dataflow/schema_mixin.rb
 - lib/dataflow/version.rb
 homepage: https://phybbit.com
@@ -1,7 +0,0 @@
- # frozen_string_literal: true
- module Dataflow
- module Errors
- class NotImplementedError < StandardError
- end
- end
- end