dataflow-rb 0.13.0 → 0.14.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 72840e2477fe869fb06b0299c96d5ae2a57c7713
4
- data.tar.gz: f9f03314f23473585a9e742740c0e809f2d99bc7
3
+ metadata.gz: 3d45b64a7e367df85841ae86e6fde6550c33319a
4
+ data.tar.gz: c4ac87dcaf77cd8a7b842a87523271a7be70a850
5
5
  SHA512:
6
- metadata.gz: 43f7cef4b2150017871cb7b3c0f21602a01f385e40d07ddf7000f455a4adc007669974fd4e7170e4acc3807feae907f6114e3b3cbfbdbbf36f96348c3a06f60c
7
- data.tar.gz: d16411f178fa8ccc00cc9dbaefd0905040a5f8354874f7b00ad5080681d912fa8051dd16e3975d25aed8c737fdf70594f3bb77a948e95665d92a198f0e65206c
6
+ metadata.gz: 13abdbc494c020670183e3630261d684df43ea5c4fef110f567b97ca95883f7f8515014ce1e8b9eacca452e5180727a493cf8b7ea10595b1ea41aa632e074074
7
+ data.tar.gz: a0f6a2aff4b1ecce23b74610574cadf83c598d523b49ac7e05a4d68768f441398767a58a77959f3f4b037962d3b0f3b94804688c051b9576b8c6fe9abce6e159
@@ -1,5 +1,22 @@
1
1
  # Changelog
2
2
 
3
+ #### 0.14.0
4
+ - [ef8ddcd] Do not assume a minimum of 1 dependency per compute node.
5
+ - [b131bb1] Add type check on the data node #add methods. Filter nil values.
6
+ - [effc5a4] Set the rabbitmq as coming from the env
7
+ - [577ea2e] Add support for computing a node remotely.
8
+ - [4a450c3] Remove the custom not implemented error and use the default one.
9
+ - [f9c48c5] Added some new lines
10
+ - [336b9f8] Fix the backup options
11
+ - [2b2fbee] Make the runtime query node a subclass of the read only data node
12
+ - [fe237c4] Change the backup structure to isolate the db name by folder
13
+ - [654927f] Experiment with querying arrays
14
+ - [506f105] Order by system id when exporting
15
+ - [fa8fdc3] Keep the data ordered when exporting to csv
16
+ - [5e1718d] Add support for postgresql when inferring partial schemas (needed for export)
17
+
18
+ #### 0.13.1
19
+ - [aa3ed2e] Fix a bug when storing a db connection
3
20
 
4
21
  #### 0.13.0
5
22
  - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
@@ -42,4 +42,5 @@ Gem::Specification.new do |spec|
42
42
  spec.add_dependency 'smarter_csv', '1.1.0'
43
43
  spec.add_dependency 'timeliness', '~>0.3'
44
44
  spec.add_dependency 'chronic', '~>0.10'
45
+ spec.add_dependency 'bunny', '~>2.7'
45
46
  end
@@ -17,6 +17,8 @@ require 'dataflow/logger'
17
17
  require 'dataflow/properties_mixin'
18
18
  require 'dataflow/schema_mixin'
19
19
  require 'dataflow/node'
20
+ require 'dataflow/executor'
21
+ require 'dataflow/remote_worker'
20
22
 
21
23
  require 'dataflow/adapters/csv_adapter'
22
24
  require 'dataflow/adapters/mongo_db_adapter'
@@ -26,7 +28,7 @@ require 'dataflow/adapters/psql_adapter'
26
28
  require 'dataflow/adapters/settings'
27
29
 
28
30
  require 'dataflow/errors/invalid_configuration_error'
29
- require 'dataflow/errors/not_implemented_error'
31
+ require 'dataflow/errors/remote_execution_error'
30
32
 
31
33
  require 'dataflow/nodes/mixin/add_internal_timestamp'
32
34
  require 'dataflow/nodes/mixin/rename_dotted_fields'
@@ -27,7 +27,7 @@ module Dataflow
27
27
 
28
28
  # retrieve a single element from a data node
29
29
  def find(where: opts = {})
30
- raise Errors::NotImplementedError, '#find is not yet support on CSV.'
30
+ raise NotImplementedError, '#find is not yet supported on CSV.'
31
31
  end
32
32
 
33
33
  # retrieve all elements from a data node
@@ -43,8 +43,8 @@ module Dataflow
43
43
  end
44
44
 
45
45
  # save the given records
46
- def save(records:)
47
- write_csv_part(records, keys: @schema.keys)
46
+ def save(records:, part: nil)
47
+ write_csv_part(records, keys: @schema.keys, part: part)
48
48
  end
49
49
 
50
50
  def on_save_finished
@@ -52,7 +52,7 @@ module Dataflow
52
52
  end
53
53
 
54
54
  def remove(_opts = {})
55
- raise Errors::NotImplementedError, '#find is not yet support on CSV.'
55
+ raise NotImplementedError, '#remove is not yet supported on CSV.'
56
56
  end
57
57
 
58
58
  def recreate_dataset(dataset: nil)
@@ -79,10 +79,10 @@ module Dataflow
79
79
 
80
80
  def file_parts
81
81
  part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
82
- Dir["#{file_path}.part_*"]
82
+ Dir["#{file_path}.part_*"].sort
83
83
  end
84
84
 
85
- def write_csv_part(data, keys:)
85
+ def write_csv_part(data, keys:, part:)
86
86
  # prepare the data
87
87
  key_tokens = keys.map { |key| record_dig_tokens(key: key) }
88
88
  rows = data.map do |datum|
@@ -90,8 +90,8 @@ module Dataflow
90
90
  end
91
91
 
92
92
  # dump in a part file
93
- uuid = SecureRandom.hex
94
- CSV.open("#{file_path}.part_#{uuid}", 'w') do |csv|
93
+ part ||= SecureRandom.hex
94
+ CSV.open("#{file_path}.part_#{part}", 'w') do |csv|
95
95
  rows.each { |row| csv << row }
96
96
  end
97
97
  end
@@ -226,24 +226,26 @@ module Dataflow
226
226
  end
227
227
 
228
228
  def dump(base_folder:)
229
- archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.gz"
230
- options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
231
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
232
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
233
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
234
- options += "--password=#{@settings.db_password}" if @settings.db_password.present?
235
- `mkdir -p #{base_folder}`
229
+ archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.gz"
230
+ options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
231
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
232
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
233
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
234
+ options += "--password=#{@settings.db_password} " if @settings.db_password.present?
235
+
236
+ `mkdir -p #{base_folder}/#{@settings.db_name}`
236
237
  `mongodump #{options} --gzip`
237
238
  archive_path
238
239
  end
239
240
 
240
241
  def restore(filepath:)
241
- options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
242
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
243
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
244
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
245
- options += "--password=#{@settings.db_password}" if @settings.db_password.present?
246
- `mongorestore #{options} --gzip`
242
+ options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
243
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
244
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
245
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
246
+ options += "--password=#{@settings.db_password} " if @settings.db_password.present?
247
+
248
+ `mongorestore #{options} --drop --gzip`
247
249
  end
248
250
 
249
251
  def transform_to_query(opts)
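Illustratively, with db_name 'mojaco' and dataset 'my_dataset', the Mongo adapter's backup now lands in a per-database folder and the restore replaces the collection instead of merging into it (the adapter variable and names below are made up):

  adapter.dump(base_folder: './backups')
  # => "./backups/mojaco/my_dataset.gz"   (written by mongodump --archive=... --gzip)
  adapter.restore(filepath: './backups/mojaco/my_dataset.gz')
  # runs mongorestore with --drop --gzip, so the existing collection is dropped first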
@@ -26,24 +26,26 @@ module Dataflow
26
26
  end
27
27
 
28
28
  def dump(base_folder:)
29
- archive_path = "#{base_folder}/#{@settings.db_name}.#{@settings.dataset_name}.dump"
30
- options = "--table=public.#{@settings.read_dataset_name}"
31
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
32
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
33
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
29
+ archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.dump"
30
+ options = "--table=public.#{@settings.read_dataset_name} "
31
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
32
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
33
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
34
34
  password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
35
- `mkdir -p #{base_folder}`
35
+
36
+ `mkdir -p #{base_folder}/#{@settings.db_name}`
36
37
  `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
37
38
  archive_path
38
39
  end
39
40
 
40
41
  def restore(filepath:)
41
- options = "--table=#{@settings.read_dataset_name}"
42
- options += "--host=#{@settings.db_host}" if @settings.db_host.present?
43
- options += "--port=#{@settings.db_port}" if @settings.db_port.present?
44
- options += "--username=#{@settings.db_user}" if @settings.db_user.present?
42
+ options = "--table=#{@settings.read_dataset_name} "
43
+ options += "--host=#{@settings.db_host} " if @settings.db_host.present?
44
+ options += "--port=#{@settings.db_port} " if @settings.db_port.present?
45
+ options += "--username=#{@settings.db_user} " if @settings.db_user.present?
45
46
  password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
46
- p "#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}"
47
+
48
+ drop_dataset(@settings.read_dataset_name)
47
49
  `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
48
50
  end
49
51
  end
@@ -12,16 +12,17 @@ module Dataflow
12
12
  def client(settings)
13
13
  @clients ||= {}
14
14
  connection_uri = settings.connection_uri_or_default
15
- return @clients[connection_uri] if @clients[connection_uri].present?
15
+ full_uri = "#{connection_uri}/#{settings.db_name}?encoding=utf8"
16
+ return @clients[full_uri] if @clients[full_uri].present?
16
17
 
17
18
  # first, make sure the DB is created (if it is not an external db)
18
19
  is_external_db = settings.connection_uri.present?
19
20
  try_create_db(connection_uri, settings.db_name) unless is_external_db
20
21
 
21
22
  # then, create the connection object
22
- db = Sequel.connect("#{connection_uri}/#{settings.db_name}?encoding=utf8")
23
+ db = Sequel.connect(full_uri)
23
24
  add_extensions(settings, db)
24
- @clients[connection_uri] = db
25
+ @clients[full_uri] = db
25
26
  end
26
27
 
27
28
  # Used internally to try to create the DB automatically.
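With the connection cache now keyed by the full URI, two settings that share a server but point at different databases get distinct Sequel connections; roughly (URIs illustrative):

  # client(settings_a)  # cached under postgresql://localhost/db_a?encoding=utf8
  # client(settings_b)  # cached under postgresql://localhost/db_b?encoding=utf8
  #                     # (previously both were keyed by the bare connection URI and collided)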
@@ -242,6 +243,8 @@ module Dataflow
242
243
  end
243
244
  when '<', '<=', '>', '>='
244
245
  Sequel.lit("#{k} #{operator} ?", value)
246
+ when '@>', '<@'
247
+ Sequel.lit("#{k} #{operator} ?", Sequel.pg_array(Array(value)))
245
248
  when '~'
246
249
  Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
247
250
  when '~*'
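For the new '@>' / '<@' operators the generated condition binds the value as a PostgreSQL array, so the query fragment is roughly equivalent to (column name illustrative):

  Sequel.lit('tags @> ?', Sequel.pg_array(['ruby', 'rails']))
  # matches rows whose tags array contains both 'ruby' and 'rails'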
@@ -291,19 +294,15 @@ module Dataflow
291
294
  end
292
295
  when 'numeric'
293
296
  col_type = 'real'
294
- when 'array', 'hash'
295
- logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
296
- col_type = 'json'
297
297
  when 'date', 'time'
298
298
  # keep as-is
299
299
  col_type = type
300
300
  else
301
- logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
302
301
  col_type = type
303
302
  end
304
303
 
305
304
  # create a column with the given type
306
- p "#{column} #{type} -> #{col_type}"
305
+ logger.log("#{column} #{type} -> #{col_type}")
307
306
  column(column.to_sym, col_type)
308
307
  end
309
308
  end
@@ -0,0 +1,13 @@
1
+ # frozen_string_literal: true
2
+ module Dataflow
3
+ module Errors
4
+ class RemoteExecutionError < StandardError
5
+
6
+ def initialize(msg, backtrace)
7
+ super(msg)
8
+ set_backtrace(backtrace)
9
+ end
10
+
11
+ end
12
+ end
13
+ end
@@ -0,0 +1,104 @@
1
+ # frozen_string_literal: true
2
+ require 'bunny'
3
+ require 'json'
4
+ require 'thread'
5
+
6
+ module Dataflow
7
+ class Executor
8
+ class << self
9
+ def execute(node)
10
+ case node.execution_model
11
+ when :remote
12
+ execute_remote_computation(node: node, is_batch_execution: false)
13
+ when :remote_batch
14
+ execute_remote_computation(node: node, is_batch_execution: true)
15
+ when :local
16
+ node.execute_local_computation
17
+ else
18
+ raise ArgumentError, "Unknown execution model #{node.execution_model}"
19
+ end
20
+ end
21
+
22
+ def execute_remote_computation(node:, is_batch_execution:)
23
+ execution_uuid = node.execution_uuid
24
+ raise ArgumentError, "Expected execution uuid to be set on '#{node.name}' (##{node._id})" unless execution_uuid.present?
25
+
26
+ logger.log("Started processing '#{node.name}'")
27
+ conn, channel, completion_queue = open_communication_channel
28
+ logger.log("Opened a completion queue for '#{node.name}': #{completion_queue.name}")
29
+
30
+ messages = send_execution_messages(channel, node, is_batch_execution, completion_queue.name)
31
+ error_data = await_execution_completion(completion_queue, messages.count)
32
+ logger.log("Finished processing '#{node.name}'")
33
+
34
+ raise Errors::RemoteExecutionError.new(error_data['message'], error_data['backtrace']) if error_data
35
+ ensure
36
+ conn&.close
37
+ end
38
+
39
+ def open_communication_channel
40
+ conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
41
+ conn.start
42
+
43
+ ch = conn.create_channel
44
+ completion_queue = ch.queue('', exclusive: true)
45
+
46
+ return conn, ch, completion_queue
47
+ end
48
+
49
+ def send_execution_messages(channel, node, is_batch_execution, completion_queue_name)
50
+ execution_params = make_execution_params(node, is_batch_execution, completion_queue_name)
51
+
52
+ execution_queue = channel.queue(node.execution_queue)
53
+ execution_params.each do |exec_params|
54
+ execution_queue.publish(exec_params.to_json)
55
+ end
56
+
57
+ execution_params
58
+ end
59
+
60
+ def make_execution_params(node, is_batch_execution, completion_queue_name)
61
+ execution_params = if is_batch_execution
62
+ node.make_batch_params
63
+ else
64
+ [{}]
65
+ end
66
+
67
+ execution_params.each_with_index.map do |params, idx|
68
+ {
69
+ msg_id: idx,
70
+ node_id: node._id.to_s,
71
+ is_batch: is_batch_execution,
72
+ params: params,
73
+ execution_uuid: node.execution_uuid.to_s,
74
+ completion_queue_name: completion_queue_name
75
+ }
76
+ end
77
+ end
78
+
79
+ def await_execution_completion(completion_queue, expected_completion_count)
80
+ completed_message_indexes = []
81
+ unblock = Queue.new
82
+
83
+ consumer = completion_queue.subscribe do |_delivery_info, _properties, payload|
84
+ data = JSON.parse(payload)
85
+ unblock.enq(data['error']) if data['error'].present?
86
+
87
+ completed_message_indexes << data['msg_id']
88
+ if completed_message_indexes.count == expected_completion_count
89
+ unblock.enq(false)
90
+ end
91
+ end
92
+
93
+ error_data = unblock.deq
94
+ consumer.cancel
95
+
96
+ error_data
97
+ end
98
+
99
+ def logger
100
+ @logger ||= Dataflow::Logger.new(prefix: 'Executor')
101
+ end
102
+ end
103
+ end
104
+ end
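When a remote batch fails, the worker ships the exception back on the completion queue and the executor re-raises it locally; a minimal sketch of handling that on the caller's side:

  begin
    Dataflow::Executor.execute(node)
  rescue Dataflow::Errors::RemoteExecutionError => e
    # message and backtrace come from the remote worker (see RemoteExecutionError above)
    puts e.message
    puts e.backtrace
  end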
@@ -57,6 +57,16 @@ module Dataflow
57
57
  # The node name
58
58
  field :name, type: String
59
59
 
60
+ # The execution model:
61
+ field :execution_model, type: Symbol, default: :local
62
+
63
+ # For remote computation only:
64
+ # Controls on which queue this execution will be routed
65
+ field :execution_queue, type: String, default: 'dataflow.ruby'
66
+
67
+ # Unique ID of the current execution
68
+ field :execution_uuid, type: BSON::ObjectId
69
+
60
70
  # The data node to which we will write the computation output
61
71
  field :data_node_id, type: BSON::ObjectId
62
72
 
@@ -261,7 +271,7 @@ module Dataflow
261
271
  end
262
272
 
263
273
  send_heartbeat
264
- compute_impl
274
+ Executor.execute(self)
265
275
 
266
276
  if clear_data_on_compute
267
277
  # Post-compute, delay creating other indexes for insert speed
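Putting the new execution_* fields and the dispatch through Executor.execute together, opting a node into remote execution might look like the sketch below; the node lookup and the #compute entry point are assumed from the existing API:

  node = Dataflow::Nodes::ComputeNode.find_by(name: 'my_heavy_node')
  node.execution_model = :remote_batch    # :local (default), :remote or :remote_batch
  node.execution_queue = 'dataflow.ruby'  # must match the queue a RemoteWorker listens on
  node.save
  node.compute                            # batches are now published to RabbitMQ instead of running in-process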
@@ -281,6 +291,9 @@ module Dataflow
281
291
  logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
282
292
  end
283
293
 
294
+ rescue Errors::RemoteExecutionError => e
295
+ on_computing_finished(state: 'error', error: e) if has_compute_lock
296
+ logger.error(error: e, custom_message: "#{name} failed computing remotely.")
284
297
  rescue StandardError => e
285
298
  on_computing_finished(state: 'error', error: e) if has_compute_lock
286
299
  logger.error(error: e, custom_message: "#{name} failed computing.")
@@ -296,13 +309,9 @@ module Dataflow
296
309
  def valid_for_computation?
297
310
  # Perform additional checks: also add errors to "self.errors"
298
311
  opts = self.class.dependency_opts
299
- if opts.key?(:exactly)
300
- ensure_exact_dependencies(count: opts[:exactly])
301
- elsif opts.key?(:max)
302
- ensure_at_most_dependencies(count: opts[:max])
303
- else # even if the min is not specified, we need at least 1 dependency
304
- ensure_at_least_dependencies(count: opts[:min] || 1)
305
- end
312
+ ensure_exact_dependencies(count: opts[:exactly]) if opts.key?(:exactly)
313
+ ensure_at_most_dependencies(count: opts[:max]) if opts.key?(:max)
314
+ ensure_at_least_dependencies(count: opts[:min]) if opts.key?(:min)
306
315
  ensure_no_cyclic_dependencies
307
316
  ensure_keys_are_set
308
317
  ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]
@@ -322,37 +331,67 @@ module Dataflow
322
331
  release_computing_lock!
323
332
  end
324
333
 
334
+ def execution_valid?(uuid)
335
+ execution_uuid.to_s == uuid.to_s
336
+ end
337
+
325
338
  # Keep a compatible interface with the data node
326
339
  def schema
327
340
  required_schema
328
341
  end
329
342
 
343
+ # Interface to execute this node locally
344
+ def execute_local_computation
345
+ compute_impl
346
+ end
347
+
348
+ # Interface to execute a part (batch) of this node locally.
349
+ # This method is called when the framework needs to execute a batch on a worker.
350
+ # Override when needed, to execute a batch depending on the params.
351
+ # If you override, you may want to override the make_batch_params as well.
352
+ def execute_local_batch_computation(batch_params)
353
+ records = dependencies.first.all(where: batch_params)
354
+ new_records = compute_batch(records: records)
355
+ data_node&.add(records: new_records)
356
+ end
357
+
358
+ # Interface used to retrieve the params for scheduled batches. Override when needed.
359
+ # The default implementation is to make queries that would
360
+ # ensure the full processing of the first dependency's records.
361
+ # @return [Array] of params that are passed to scheduled batches.
362
+ def make_batch_params
363
+ make_batch_queries(node: dependencies.first)
364
+ end
365
+
330
366
  private
331
367
 
332
- # Compute implementation:
368
+ # Default compute implementation:
333
369
  # - recreate the table
334
370
  # - compute the records
335
371
  # - save them to the DB
336
372
  # (the process may be overwritten on a per-node basis if needed)
373
+ # Override if you need to have a completely custom compute implementation
337
374
  def compute_impl
338
375
  process_parallel(node: dependencies.first)
339
376
  end
340
377
 
341
- def process_parallel(node:)
342
- return if node.blank?
343
- record_count = node.count
344
- return if record_count == 0
378
+ # This is an interface only.
379
+ # Override when you can implement a computation in terms of
380
+ # the records of the first dependent node.
381
+ # @param records [Array] a batch of records from the first dependency
382
+ # @return [Array] an array of results that are to be pushed to the data node (if set).
383
+ def compute_batch(records:)
384
+ []
385
+ end
345
386
 
346
- equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
347
- count_per_process = equal_split_per_process
348
- limit = limit_per_process.to_i
349
- count_per_process = [limit, equal_split_per_process].min if limit > 0
387
+ def process_parallel(node:)
388
+ queries = make_batch_queries(node: node)
389
+ return if queries.blank?
350
390
 
351
- queries = node.ordered_system_id_queries(batch_size: count_per_process)
352
391
  queries_count = queries.count
353
-
354
392
  parallel_each(queries.each_with_index) do |query, idx|
355
393
  send_heartbeat
394
+
356
395
  progress = (idx / queries_count.to_f * 100).ceil
357
396
  on_computing_progressed(pct_complete: progress)
358
397
  logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
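A sketch of a node built on the reworked hooks: compute_batch receives a batch of the first dependency's records and returns the records to push to the data node (class and field names below are made up):

  class SquareNode < Dataflow::Nodes::ComputeNode
    def compute_batch(records:)
      records.map { |r| { 'id' => r['id'], 'square' => r['value'].to_i**2 } }
    end
  end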
@@ -365,25 +404,42 @@ module Dataflow
365
404
  compute_batch(records: records)
366
405
  end
367
406
 
368
- data_node.add(records: new_records)
407
+ data_node&.add(records: new_records)
369
408
  end
370
409
  end
371
410
 
372
- # This is an interface only.
373
- # Override with record computation logic.
374
- def compute_batch(records:)
375
- records
411
+ # Makes queries that support traversing the node's records in parallel without overlap.
412
+ def make_batch_queries(node:)
413
+ return [] if node.blank?
414
+ record_count = node.count
415
+ return [] if record_count == 0
416
+
417
+ equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
418
+ count_per_process = equal_split_per_process
419
+ limit = limit_per_process.to_i
420
+ count_per_process = [limit, equal_split_per_process].min if limit > 0
421
+
422
+ queries = node.ordered_system_id_queries(batch_size: count_per_process)
376
423
  end
377
424
 
378
425
  def acquire_computing_lock!
379
426
  # make sure that any pending changes are saved.
380
427
  save
428
+
429
+ compute_state = {
430
+ computing_state: 'computing',
431
+ computing_started_at: Time.now,
432
+ execution_uuid: BSON::ObjectId.new
433
+ }
381
434
  find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
382
- update_query = { '$set' => { computing_state: 'computing', computing_started_at: Time.now } }
435
+ update_query = { '$set' => compute_state }
436
+
383
437
  # send a query directly to avoid mongoid's caching layers
384
438
  res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
439
+
385
440
  # reload the model data after the query above
386
441
  reload
442
+
387
443
  # the query is atomic so if res != nil, we acquired the lock
388
444
  !res.nil?
389
445
  end
@@ -391,20 +447,21 @@ module Dataflow
391
447
  def release_computing_lock!
392
448
  # make sure that any pending changes are saved.
393
449
  save
450
+
394
451
  find_query = { _id: _id }
395
- update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
452
+ update_query = { '$set' => { computing_state: nil, computing_started_at: nil, execution_uuid: nil } }
453
+
396
454
  # send a query directly to avoid mongoid's caching layers
397
455
  Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
456
+
398
457
  # reload the model data after the query above
399
458
  reload
400
459
  end
401
460
 
402
461
  def await_computing!
403
- start_waiting_at = Time.now
404
- # TODO: should the max wait time be dependent on e.g. the recompute interval?
405
462
  max_wait_time = 15.minutes
406
- while Time.now < start_waiting_at + max_wait_time
407
- sleep 2
463
+ while Time.now < last_heartbeat_time + max_wait_time
464
+ sleep 5
408
465
  # reloads with the data stored on mongodb:
409
466
  # something maybe have been changed by another process.
410
467
  reload
@@ -436,7 +493,6 @@ module Dataflow
436
493
  update_query = { '$set' => { last_compute_starting_time: time } }
437
494
  Dataflow::Nodes::ComputeNode.where(_id: _id)
438
495
  .find_one_and_update(update_query)
439
-
440
496
  end
441
497
 
442
498
  ##############################
@@ -184,6 +184,8 @@ module Dataflow
184
184
  # Adds the given records to the dataset and updates the updated_at time.
185
185
  # @param records [Array] an array of the records to be added.
186
186
  def add(records:)
187
+ raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
188
+ records = records.compact
187
189
  return if records.blank?
188
190
  db_adapter.save(records: records)
189
191
  self.updated_at = Time.now
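The effect of the new guard on #add, sketched (data_node is any writable data node):

  data_node.add(records: [{ 'id' => 1 }, nil])  # nil entries are filtered out before saving
  data_node.add(records: { 'id' => 1 })         # raises ArgumentError: records must be an array of documents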
@@ -380,7 +382,7 @@ module Dataflow
380
382
  return @postgresql_adapter
381
383
  end
382
384
 
383
- raise Errors::NotImplementedError, "'#{db_backend}' backend is not implemented."
385
+ raise NotImplementedError, "'#{db_backend}' backend is not implemented."
384
386
  end
385
387
 
386
388
  def valid_dataset_names
@@ -37,13 +37,14 @@ module Dataflow
37
37
  count_per_process = [max_per_process, equal_split_per_process].min
38
38
 
39
39
  queries = node.ordered_system_id_queries(batch_size: count_per_process)
40
+ system_id = node.send(:db_adapter).class::SYSTEM_ID
40
41
 
41
- parallel_each(queries.each_with_index) do |query, _idx|
42
+ parallel_each(queries.each_with_index) do |query, idx|
42
43
  # TODO: re-enabled event on_export_progressed
43
44
  # progress = (idx / queries.count.to_f * 100).ceil
44
45
  # on_export_progressed(pct_complete: progress)
45
- batch = node.all(where: query.merge(where), fields: sch.keys)
46
- csv_adapter.save(records: batch)
46
+ batch = node.all(where: query.merge(where), fields: sch.keys, sort: { system_id => 1 })
47
+ csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
47
48
  end
48
49
 
49
50
  # needed by the csv exporter to finalize in a single file
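The part index is zero-padded to the width of the total batch count, so the lexicographic sort used by the CSV adapter (Dir[...].sort) preserves record order; for example with 12 batches:

  12.times.map { |idx| idx.to_s.rjust(12.to_s.length, '0') }
  # => ["00", "01", ..., "11"]
  # giving my_db.my_dataset.csv.part_00 ... part_11 (file names illustrative)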
@@ -3,38 +3,36 @@ module Dataflow
3
3
  module Nodes
4
4
  # Only supports read operations
5
5
  class ReadOnlyDataNode < DataNode
6
-
7
6
  def set_defaults
8
7
  super
9
8
  self.use_double_buffering = false
10
9
  end
11
10
 
12
-
13
11
  def handle_dataset_settings_changed
14
12
  # ignore - do not do anything
15
13
  end
16
14
 
17
- def add(*args)
15
+ def add(*_args)
18
16
  raise_read_only_error!
19
17
  end
20
18
 
21
- def clear(*args)
19
+ def clear(*_args)
22
20
  raise_read_only_error!
23
21
  end
24
22
 
25
- def recreate_dataset(*args)
23
+ def recreate_dataset(*_args)
26
24
  raise_read_only_error!
27
25
  end
28
26
 
29
- def create_unique_indexes(*args)
27
+ def create_unique_indexes(*_args)
30
28
  raise_read_only_error!
31
29
  end
32
30
 
33
- def create_non_unique_indexes(*args)
31
+ def create_non_unique_indexes(*_args)
34
32
  raise_read_only_error!
35
33
  end
36
34
 
37
- def read_dataset_name=(*args)
35
+ def read_dataset_name=(*_args)
38
36
  raise_read_only_error!
39
37
  end
40
38
 
@@ -42,21 +40,27 @@ module Dataflow
42
40
  raise_read_only_error!
43
41
  end
44
42
 
45
- def import(*args)
43
+ def import(*_args)
46
44
  raise_read_only_error!
47
45
  end
48
46
 
49
-
50
47
  def drop_dataset!
51
48
  raise_read_only_error!
52
49
  end
53
50
 
51
+ def dump_dataset(*_args)
52
+ raise_read_only_error!
53
+ end
54
+
55
+ def restore_dataset(*_args)
56
+ raise_read_only_error!
57
+ end
58
+
54
59
  private
55
60
 
56
61
  def raise_read_only_error!
57
- raise NotImplementedError, 'External data nodes are read only'
62
+ raise NotImplementedError, 'This node is read only'
58
63
  end
59
-
60
64
  end # class ExternalDataNode
61
65
  end # module Nodes
62
66
  end # module Dataflow
@@ -1,19 +1,15 @@
1
1
  # frozen_string_literal: true
2
2
  module Dataflow
3
3
  # Interface for a node that behaves as a dataset.
4
- # Does not support any operation.
4
+ # Does not support any write operation.
5
5
  # Inherit and override to implement custom behavior.
6
6
  module Nodes
7
- class RuntimeQueryNode < DataNode
7
+ class RuntimeQueryNode < ReadOnlyDataNode
8
8
 
9
9
  after_initialize do
10
10
  self.db_backend = :none
11
11
  end
12
12
 
13
- def handle_dataset_settings_changed
14
- # dot not do anything, there is no real dataset
15
- end
16
-
17
13
  def all(*_args)
18
14
  raise NotImplementedError, 'this node does not support #all'
19
15
  end
@@ -30,13 +26,6 @@ module Dataflow
30
26
  raise NotImplementedError, 'this node does not support #all_paginated'
31
27
  end
32
28
 
33
- def add(*_args)
34
- raise NotImplementedError, 'this node does not support #add'
35
- end
36
-
37
- def clear(*_args)
38
- raise NotImplementedError, 'this node does not support #clear'
39
- end
40
29
  end
41
30
  end
42
31
  end
@@ -30,6 +30,10 @@ module Dataflow
30
30
  end
31
31
 
32
32
  def add(records:)
33
+ raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
34
+ records = records.compact
35
+ return if records.blank?
36
+
33
37
  # TODO: create a chain of behavior "before add"
34
38
  rename_dotted_fields(records: records)
35
39
  add_internal_timestamp(records: records)
@@ -42,6 +42,8 @@ module Dataflow
42
42
  end
43
43
 
44
44
  def add(records:)
45
+ raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
46
+ records = records.compact
45
47
  return if records.blank?
46
48
 
47
49
  # TODO: create a chain of behavior "before add"
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+ require 'bunny'
3
+ require 'json'
4
+
5
+ module Dataflow
6
+ class RemoteWorker
7
+ class << self
8
+ def work(work_queue_name = 'dataflow.ruby')
9
+ conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
10
+ conn.start
11
+
12
+ ch = conn.create_channel
13
+ queue = ch.queue(work_queue_name)
14
+ ch.prefetch(1)
15
+
16
+ logger.log("Accepting work on #{work_queue_name}...")
17
+
18
+ queue.subscribe(block: true, manual_ack: true) do |delivery_info, _properties, payload|
19
+ data = JSON.parse(payload)
20
+ response = process(data)
21
+ if response.present?
22
+ ch.default_exchange.publish(response.to_json, routing_key: data['completion_queue_name'])
23
+ end
24
+ ch.ack(delivery_info.delivery_tag)
25
+ end
26
+ ensure
27
+ conn.close
28
+ logger.log('Connection closed, stopped accepting work.')
29
+ end
30
+
31
+ def process(data)
32
+ node = Dataflow::Nodes::ComputeNode.find(data['node_id'])
33
+
34
+ unless node.execution_valid?(data['execution_uuid'])
35
+ logger.log("[#{data['msg_id']}] work on '#{node.name}' has expired. Skipping.")
36
+ return
37
+ end
38
+
39
+ errors = execute(node, data)
40
+ response = { msg_id: data['msg_id'] }
41
+ response.merge(errors[0])
42
+ rescue Mongoid::Errors::DocumentNotFound => e
43
+ { error: { message: e.message, backtrace: e.backtrace } }
44
+ end
45
+
46
+ def execute(node, payload_data)
47
+ # execute in a different process, so that once it's finished
48
+ # we can purge the memory
49
+ Parallel.map([payload_data]) do |data|
50
+ error = {}
51
+ logger.log("[#{data['msg_id']}] working on '#{node.name}'...")
52
+
53
+ begin
54
+ if data['is_batch']
55
+ node.execute_local_batch_computation(data['params'])
56
+ else
57
+ node.execute_local_computation
58
+ end
59
+ rescue StandardError => e
60
+ error = { error: { message: e.message, backtrace: e.backtrace } }
61
+ end
62
+
63
+ logger.log("[#{data['msg_id']}] done working on '#{node.name}'.")
64
+ error
65
+ end
66
+ end
67
+
68
+ def logger
69
+ @logger ||= Dataflow::Logger.new(prefix: 'Worker')
70
+ end
71
+ end
72
+ end
73
+ end
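A worker process then only needs the same RabbitMQ URI in its environment and a blocking call to RemoteWorker.work; a minimal sketch (the require name and URI below are assumptions):

  require 'dataflow-rb'
  ENV['MOJACO_RABBITMQ_URI'] ||= 'amqp://guest:guest@localhost:5672'
  Dataflow::RemoteWorker.work('dataflow.ruby')  # blocks, processing one message at a time (prefetch 1)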
@@ -51,6 +51,15 @@ module Dataflow
51
51
  end
52
52
 
53
53
  def infer_partial_schema(where:, extended: false)
54
+ if db_backend == :postgresql
55
+ # Experimental
56
+ sch = db_adapter.client.schema(read_dataset_name).to_h
57
+ sch = sch.reject { |k, _v| k == :_id }.map { |k, v| [k, { type: v[:type].to_s }] }.to_h
58
+ self.inferred_schema = sch
59
+ save
60
+ return sch
61
+ end
62
+
54
63
  data_count = count(where: where)
55
64
  return {} if data_count == 0
56
65
 
@@ -1,4 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
  module Dataflow
3
- VERSION = '0.13.0'
3
+ VERSION = '0.14.0'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataflow-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.13.0
4
+ version: 0.14.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eurico Doirado
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-05-23 00:00:00.000000000 Z
11
+ date: 2017-06-06 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -290,6 +290,20 @@ dependencies:
290
290
  - - "~>"
291
291
  - !ruby/object:Gem::Version
292
292
  version: '0.10'
293
+ - !ruby/object:Gem::Dependency
294
+ name: bunny
295
+ requirement: !ruby/object:Gem::Requirement
296
+ requirements:
297
+ - - "~>"
298
+ - !ruby/object:Gem::Version
299
+ version: '2.7'
300
+ type: :runtime
301
+ prerelease: false
302
+ version_requirements: !ruby/object:Gem::Requirement
303
+ requirements:
304
+ - - "~>"
305
+ - !ruby/object:Gem::Version
306
+ version: '2.7'
293
307
  description: Helps building data pipelines. It handles recomputing dependencies and
294
308
  parallel execution.
295
309
  email:
@@ -319,8 +333,9 @@ files:
319
333
  - lib/dataflow/adapters/settings.rb
320
334
  - lib/dataflow/adapters/sql_adapter.rb
321
335
  - lib/dataflow/errors/invalid_configuration_error.rb
322
- - lib/dataflow/errors/not_implemented_error.rb
336
+ - lib/dataflow/errors/remote_execution_error.rb
323
337
  - lib/dataflow/event_mixin.rb
338
+ - lib/dataflow/executor.rb
324
339
  - lib/dataflow/extensions/mongo_driver.rb
325
340
  - lib/dataflow/extensions/msgpack.rb
326
341
  - lib/dataflow/logger.rb
@@ -344,6 +359,7 @@ files:
344
359
  - lib/dataflow/nodes/transformation/to_time_node.rb
345
360
  - lib/dataflow/nodes/upsert_node.rb
346
361
  - lib/dataflow/properties_mixin.rb
362
+ - lib/dataflow/remote_worker.rb
347
363
  - lib/dataflow/schema_mixin.rb
348
364
  - lib/dataflow/version.rb
349
365
  homepage: https://phybbit.com
@@ -1,7 +0,0 @@
1
- # frozen_string_literal: true
2
- module Dataflow
3
- module Errors
4
- class NotImplementedError < StandardError
5
- end
6
- end
7
- end