dataflow-rb 0.13.0 → 0.14.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/dataflow-rb.gemspec +1 -0
- data/lib/dataflow-rb.rb +3 -1
- data/lib/dataflow/adapters/csv_adapter.rb +8 -8
- data/lib/dataflow/adapters/mongo_db_adapter.rb +15 -13
- data/lib/dataflow/adapters/psql_adapter.rb +13 -11
- data/lib/dataflow/adapters/sql_adapter.rb +7 -8
- data/lib/dataflow/errors/remote_execution_error.rb +13 -0
- data/lib/dataflow/executor.rb +104 -0
- data/lib/dataflow/nodes/compute_node.rb +87 -31
- data/lib/dataflow/nodes/data_node.rb +3 -1
- data/lib/dataflow/nodes/export/to_csv_node.rb +4 -3
- data/lib/dataflow/nodes/read_only_data_node.rb +16 -12
- data/lib/dataflow/nodes/runtime_query_node.rb +2 -13
- data/lib/dataflow/nodes/snapshot_node.rb +4 -0
- data/lib/dataflow/nodes/upsert_node.rb +2 -0
- data/lib/dataflow/remote_worker.rb +73 -0
- data/lib/dataflow/schema_mixin.rb +9 -0
- data/lib/dataflow/version.rb +1 -1
- metadata +19 -3
- data/lib/dataflow/errors/not_implemented_error.rb +0 -7
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d45b64a7e367df85841ae86e6fde6550c33319a
+  data.tar.gz: c4ac87dcaf77cd8a7b842a87523271a7be70a850
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13abdbc494c020670183e3630261d684df43ea5c4fef110f567b97ca95883f7f8515014ce1e8b9eacca452e5180727a493cf8b7ea10595b1ea41aa632e074074
+  data.tar.gz: a0f6a2aff4b1ecce23b74610574cadf83c598d523b49ac7e05a4d68768f441398767a58a77959f3f4b037962d3b0f3b94804688c051b9576b8c6fe9abce6e159
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,22 @@
 # Changelog
 
+#### 0.14.0
+- [ef8ddcd] Do not assume a minimum of 1 dependency per compute node.
+- [b131bb1] Add type check on the data node #add methods. Filter nil values.
+- [effc5a4] Set the rabbitmq as coming from the env
+- [577ea2e] Add support for computing a node remotely.
+- [4a450c3] Remove the custom not implemented error and use the default one.
+- [f9c48c5] Added some new lines
+- [336b9f8] Fix the backup options
+- [2b2fbee] Make the runtime query node a subclass of the read only data node
+- [fe237c4] Change the backup structure to isolate the db name by folder
+- [654927f] Experiment with querying arrays
+- [506f105] Order by system id when exporting
+- [fa8fdc3] Keep the data ordered when exporting to csv
+- [5e1718d] Add support for postgresql when inferring partial schemas (needed for export)
+
+#### 0.13.1
+- [aa3ed2e] Fix a bug when storing a db connection
 
 #### 0.13.0
 - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
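The headline change in 0.14.0 is remote computation over RabbitMQ (the new `Executor`, `RemoteWorker`, and the `execution_model` / `execution_queue` fields shown in the diffs below). A rough, hypothetical sketch of how the pieces are meant to fit together; only the field names, the default queue name and the environment variable come from this release, the node name and broker URI are invented:

```ruby
# Illustrative only: wiring a compute node for remote execution in 0.14.0.
require 'dataflow-rb'

# Both the executor and the workers read the broker URI from the environment.
ENV['MOJACO_RABBITMQ_URI'] ||= 'amqp://guest:guest@localhost:5672'

node = Dataflow::Nodes::ComputeNode.find_by(name: 'my_node') # hypothetical node
node.execution_model = :remote_batch    # :local (default), :remote or :remote_batch
node.execution_queue = 'dataflow.ruby'  # RabbitMQ queue the batches are published to
node.save

node.recompute # batches are published and awaited on a completion queue

# Meanwhile, on one or more worker machines:
Dataflow::RemoteWorker.work('dataflow.ruby')
```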
data/dataflow-rb.gemspec
CHANGED
data/lib/dataflow-rb.rb
CHANGED
@@ -17,6 +17,8 @@ require 'dataflow/logger'
 require 'dataflow/properties_mixin'
 require 'dataflow/schema_mixin'
 require 'dataflow/node'
+require 'dataflow/executor'
+require 'dataflow/remote_worker'
 
 require 'dataflow/adapters/csv_adapter'
 require 'dataflow/adapters/mongo_db_adapter'
@@ -26,7 +28,7 @@ require 'dataflow/adapters/psql_adapter'
 require 'dataflow/adapters/settings'
 
 require 'dataflow/errors/invalid_configuration_error'
-require 'dataflow/errors/
+require 'dataflow/errors/remote_execution_error'
 
 require 'dataflow/nodes/mixin/add_internal_timestamp'
 require 'dataflow/nodes/mixin/rename_dotted_fields'
data/lib/dataflow/adapters/csv_adapter.rb
CHANGED
@@ -27,7 +27,7 @@ module Dataflow
 
      # retrieve a single element from a data node
      def find(where: opts = {})
-       raise
+       raise NotImplementedError, '#find is not yet support on CSV.'
      end
 
      # retrieve all elements from a data node
@@ -43,8 +43,8 @@ module Dataflow
      end
 
      # save the given records
-     def save(records:)
-       write_csv_part(records, keys: @schema.keys)
+     def save(records:, part: nil)
+       write_csv_part(records, keys: @schema.keys, part: part)
      end
 
      def on_save_finished
@@ -52,7 +52,7 @@ module Dataflow
      end
 
      def remove(_opts = {})
-       raise
+       raise NotImplementedError, '#find is not yet support on CSV.'
      end
 
      def recreate_dataset(dataset: nil)
@@ -79,10 +79,10 @@ module Dataflow
 
      def file_parts
        part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
-       Dir["#{file_path}.part_*"]
+       Dir["#{file_path}.part_*"].sort
      end
 
-     def write_csv_part(data, keys:)
+     def write_csv_part(data, keys:, part:)
        # prepare the data
        key_tokens = keys.map { |key| record_dig_tokens(key: key) }
        rows = data.map do |datum|
@@ -90,8 +90,8 @@ module Dataflow
        end
 
        # dump in a part file
-
-       CSV.open("#{file_path}.part_#{
+       part ||= SecureRandom.hex
+       CSV.open("#{file_path}.part_#{part}", 'w') do |csv|
          rows.each { |row| csv << row }
        end
      end
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED
@@ -226,24 +226,26 @@ module Dataflow
      end
 
      def dump(base_folder:)
-       archive_path = "#{base_folder}/#{@settings.db_name}
-       options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
-       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
-       options += "--password=#{@settings.db_password}" if @settings.db_password.present?
-
+       archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.gz"
+       options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
+       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+       options += "--username=#{@settings.db_user} " if @settings.db_user.present?
+       options += "--password=#{@settings.db_password} " if @settings.db_password.present?
+
+       `mkdir -p #{base_folder}/#{@settings.db_name}`
        `mongodump #{options} --gzip`
        archive_path
      end
 
      def restore(filepath:)
-       options = "--archive=#{filepath}
-       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
-       options += "--password=#{@settings.db_password}" if @settings.db_password.present?
-
+       options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
+       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+       options += "--username=#{@settings.db_user} " if @settings.db_user.present?
+       options += "--password=#{@settings.db_password} " if @settings.db_password.present?
+
+       `mongorestore #{options} --drop --gzip`
      end
 
      def transform_to_query(opts)
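Two behavioral notes on the backup changes above: dumps are now grouped per database (`<base_folder>/<db_name>/<dataset>.gz`), and every option fragment gained a trailing space, since previously the conditional appends ran the flags together. A small sketch with made-up settings of the command string this now produces:

```ruby
# Hypothetical settings, mirroring the fixed option building above:
# each fragment ends with a space so the flags no longer concatenate.
settings = Struct.new(:db_name, :dataset_name, :db_host).new('app', 'users', 'db.local')

archive_path = "backups/#{settings.db_name}/#{settings.dataset_name}.gz"
options  = "--archive=#{archive_path} --db=#{settings.db_name} --collection=#{settings.dataset_name} "
options += "--host=#{settings.db_host} "

puts "mongodump #{options}--gzip"
# => mongodump --archive=backups/app/users.gz --db=app --collection=users --host=db.local --gzip
```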
data/lib/dataflow/adapters/psql_adapter.rb
CHANGED
@@ -26,24 +26,26 @@ module Dataflow
      end
 
      def dump(base_folder:)
-       archive_path = "#{base_folder}/#{@settings.db_name}
-       options = "--table=public.#{@settings.read_dataset_name}"
-       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+       archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.dump"
+       options = "--table=public.#{@settings.read_dataset_name} "
+       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+       options += "--username=#{@settings.db_user} " if @settings.db_user.present?
        password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
-
+
+       `mkdir -p #{base_folder}/#{@settings.db_name}`
        `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
        archive_path
      end
 
      def restore(filepath:)
-       options = "--table=#{@settings.read_dataset_name}"
-       options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-       options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-       options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+       options = "--table=#{@settings.read_dataset_name} "
+       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+       options += "--username=#{@settings.db_user} " if @settings.db_user.present?
        password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
-
+
+       drop_dataset(@settings.read_dataset_name)
        `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
      end
    end
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -12,16 +12,17 @@ module Dataflow
      def client(settings)
        @clients ||= {}
        connection_uri = settings.connection_uri_or_default
-
+       full_uri = "#{connection_uri}/#{settings.db_name}?encoding=utf8"
+       return @clients[full_uri] if @clients[full_uri].present?
 
        # first, make sure the DB is created (if it is not an external db)
        is_external_db = settings.connection_uri.present?
        try_create_db(connection_uri, settings.db_name) unless is_external_db
 
        # then, create the connection object
-       db = Sequel.connect(
+       db = Sequel.connect(full_uri)
        add_extensions(settings, db)
-       @clients[
+       @clients[full_uri] = db
      end
 
      # Used internally to try to create the DB automatically.
@@ -242,6 +243,8 @@ module Dataflow
        end
      when '<', '<=', '>', '>='
        Sequel.lit("#{k} #{operator} ?", value)
+     when '@>', '<@'
+       Sequel.lit("#{k} #{operator} ?", Sequel.pg_array(Array(value)))
      when '~'
        Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
      when '~*'
@@ -291,19 +294,15 @@ module Dataflow
        end
      when 'numeric'
        col_type = 'real'
-     when 'array', 'hash'
-       logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
-       col_type = 'json'
      when 'date', 'time'
        # keep as-is
        col_type = type
      else
-       logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
        col_type = type
      end
 
      # create a column with the given type
-
+     logger.log("#{column} #{type} -> #{col_type}")
      column(column.to_sym, col_type)
    end
  end
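The new `@>` / `<@` branch above is the "experiment with querying arrays" entry from the changelog: on PostgreSQL it builds an array-containment comparison and wraps the queried value in a `pg_array`. A minimal sketch of the fragment it produces, using Sequel directly; the column name and values are made up:

```ruby
require 'sequel'
Sequel.extension :pg_array # loads pg_array support so Sequel.pg_array is available

k        = 'tags'             # hypothetical column
operator = '@>'               # 'contains'; '<@' is 'is contained by'
value    = %w[ruby dataflow]

fragment = Sequel.lit("#{k} #{operator} ?", Sequel.pg_array(Array(value)))
# In a WHERE clause this renders roughly as: tags @> ARRAY['ruby','dataflow']
```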
data/lib/dataflow/executor.rb
ADDED
@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+require 'bunny'
+require 'json'
+require 'thread'
+
+module Dataflow
+  class Executor
+    class << self
+      def execute(node)
+        case node.execution_model
+        when :remote
+          execute_remote_computation(node: node, is_batch_execution: false)
+        when :remote_batch
+          execute_remote_computation(node: node, is_batch_execution: true)
+        when :local
+          node.execute_local_computation
+        else
+          raise ArgumentError, "Unknown execution model #{execution_model}"
+        end
+      end
+
+      def execute_remote_computation(node:, is_batch_execution:)
+        execution_uuid = node.execution_uuid
+        raise ArgumentError, "Expected execution uuid to be set on '#{node.name}' (##{node._id})" unless execution_uuid.present?
+
+        logger.log("Started processing '#{node.name}'")
+        conn, channel, completion_queue = open_communication_channel
+        logger.log("Opened a completion queue for '#{node.name}': #{completion_queue.name}")
+
+        messages = send_execution_messages(channel, node, is_batch_execution, completion_queue.name)
+        error_data = await_execution_completion(completion_queue, messages.count)
+        logger.log("Finished processing '#{node.name}'")
+
+        raise Errors::RemoteExecutionError.new(error_data['message'], error_data['backtrace']) if error_data
+      ensure
+        conn&.close
+      end
+
+      def open_communication_channel
+        conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
+        conn.start
+
+        ch = conn.create_channel
+        completion_queue = ch.queue('', exclusive: true)
+
+        return conn, ch, completion_queue
+      end
+
+      def send_execution_messages(channel, node, is_batch_execution, completion_queue_name)
+        execution_params = make_execution_params(node, is_batch_execution, completion_queue_name)
+
+        execution_queue = channel.queue(node.execution_queue)
+        execution_params.each do |exec_params|
+          execution_queue.publish(exec_params.to_json)
+        end
+
+        execution_params
+      end
+
+      def make_execution_params(node, is_batch_execution, completion_queue_name)
+        execution_params = if is_batch_execution
+                             node.make_batch_params
+                           else
+                             [{}]
+                           end
+
+        execution_params.each_with_index.map do |params, idx|
+          {
+            msg_id: idx,
+            node_id: node._id.to_s,
+            is_batch: is_batch_execution,
+            params: params,
+            execution_uuid: node.execution_uuid.to_s,
+            completion_queue_name: completion_queue_name
+          }
+        end
+      end
+
+      def await_execution_completion(completion_queue, expected_completion_count)
+        completed_message_indexes = []
+        unblock = Queue.new
+
+        consumer = completion_queue.subscribe do |_delivery_info, _properties, payload|
+          data = JSON.parse(payload)
+          unblock.enq(data['error']) if data['error'].present?
+
+          completed_message_indexes << data['msg_id']
+          if completed_message_indexes.count == expected_completion_count
+            unblock.enq(false)
+          end
+        end
+
+        error_data = unblock.deq
+        consumer.cancel
+
+        error_data
+      end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: 'Executor')
+      end
+    end
+  end
+end
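In short, the executor opens an exclusive, server-named completion queue, publishes one JSON message per batch to the node's `execution_queue`, and blocks until it has seen as many completions as it sent messages, or an error payload. The shape of a single published message, as assembled by `make_execution_params` above; all values here are made up for illustration:

```ruby
require 'json'

# One message as published to the node's execution queue; field names come
# from make_execution_params, the values are purely illustrative.
message = {
  msg_id: 0,                                  # index of this batch
  node_id: '58d5a7e9b2d7a41c7a0a1b2c',        # ComputeNode the worker should load
  is_batch: true,                             # run execute_local_batch_computation vs execute_local_computation
  params: { '_id' => { '$gte' => 1, '$lt' => 5000 } }, # one entry of make_batch_params (illustrative)
  execution_uuid: '590d2bce4e382c5bd4a0f1d2', # workers skip the message if this no longer matches
  completion_queue_name: 'amq.gen-abc123'     # where the worker reports success or an error
}.to_json
```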
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -57,6 +57,16 @@ module Dataflow
      # The node name
      field :name, type: String
 
+     # The execution model:
+     field :execution_model, type: Symbol, default: :local
+
+     # For remote computation only:
+     # Controls on which queue this execution wi;l be routed
+     field :execution_queue, type: String, default: 'dataflow.ruby'
+
+     # Unique ID of the current execution
+     field :execution_uuid, type: BSON::ObjectId
+
      # The data node to which we will write the computation output
      field :data_node_id, type: BSON::ObjectId
 
@@ -261,7 +271,7 @@ module Dataflow
        end
 
        send_heartbeat
-
+       Executor.execute(self)
 
        if clear_data_on_compute
          # Post-compute, delay creating other indexes for insert speed
@@ -281,6 +291,9 @@ module Dataflow
          logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
        end
 
+    rescue Errors::RemoteExecutionError => e
+      on_computing_finished(state: 'error', error: e) if has_compute_lock
+      logger.error(error: e, custom_message: "#{name} failed computing remotely.")
    rescue StandardError => e
      on_computing_finished(state: 'error', error: e) if has_compute_lock
      logger.error(error: e, custom_message: "#{name} failed computing.")
@@ -296,13 +309,9 @@ module Dataflow
      def valid_for_computation?
        # Perform additional checks: also add errors to "self.errors"
        opts = self.class.dependency_opts
-       if opts.key?(:exactly)
-
-
-         ensure_at_most_dependencies(count: opts[:max])
-       else # even if the min is not specified, we need at least 1 dependency
-         ensure_at_least_dependencies(count: opts[:min] || 1)
-       end
+       ensure_exact_dependencies(count: opts[:exactly]) if opts.key?(:exactly)
+       ensure_at_most_dependencies(count: opts[:max]) if opts.key?(:max)
+       ensure_at_least_dependencies(count: opts[:min]) if opts.key?(:min)
        ensure_no_cyclic_dependencies
        ensure_keys_are_set
        ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]
@@ -322,37 +331,67 @@ module Dataflow
        release_computing_lock!
      end
 
+     def execution_valid?(uuid)
+       execution_uuid.to_s == uuid.to_s
+     end
+
      # Keep a compatible interface with the data node
      def schema
        required_schema
      end
 
+     # Interface to execute this node locally
+     def execute_local_computation
+       compute_impl
+     end
+
+     # Interface to execute a part (batch) of this node locally.
+     # This method is called when the framework needs to execute a batch on a worker.
+     # Override when needed, to execute a batch depending on the params.
+     # If you override, you may want to override the make_batch_params as well.
+     def execute_local_batch_computation(batch_params)
+       records = dependencies.first.all(where: batch_params)
+       new_records = compute_batch(records: records)
+       data_node&.add(records: new_records)
+     end
+
+     # Interface used to retrieve the params for scheduled batchs. Override when needed.
+     # The default implemention is to make queries that would
+     # ensure the full processing of the first dependency's records.
+     # @return [Array] of params that are passed to scheduled batches.
+     def make_batch_params
+       make_batch_queries(node: dependencies.first)
+     end
+
      private
 
-     #
+     # Default compute implementation:
      # - recreate the table
      # - compute the records
      # - save them to the DB
      # (the process may be overwritten on a per-node basis if needed)
+     # Override if you need to have a completely custom compute implementation
      def compute_impl
        process_parallel(node: dependencies.first)
      end
 
-
-
-
-
+     # This is an interface only.
+     # Override when you can implement a computation in terms of
+     # the records of the first dependent node.
+     # @param records [Array] a batch of records from the first dependency
+     # @return [Array] an array of results that are to be pushed to the data node (if set).
+     def compute_batch(records:)
+       []
+     end
 
-
-
-
-     count_per_process = [limit, equal_split_per_process].min if limit > 0
+     def process_parallel(node:)
+       queries = make_batch_queries(node: node)
+       return if queries.blank?
 
-       queries = node.ordered_system_id_queries(batch_size: count_per_process)
       queries_count = queries.count
-
       parallel_each(queries.each_with_index) do |query, idx|
         send_heartbeat
+
         progress = (idx / queries_count.to_f * 100).ceil
         on_computing_progressed(pct_complete: progress)
         logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
@@ -365,25 +404,42 @@ module Dataflow
          compute_batch(records: records)
        end
 
-       data_node
+       data_node&.add(records: new_records)
       end
     end
 
-     #
-
-
-
+     # Makes queries that support traversing the node's records in parallel without overlap.
+     def make_batch_queries(node:)
+       return [] if node.blank?
+       record_count = node.count
+       return [] if record_count == 0
+
+       equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
+       count_per_process = equal_split_per_process
+       limit = limit_per_process.to_i
+       count_per_process = [limit, equal_split_per_process].min if limit > 0
+
+       queries = node.ordered_system_id_queries(batch_size: count_per_process)
     end
 
     def acquire_computing_lock!
       # make sure that any pending changes are saved.
       save
+
+       compute_state = {
+         computing_state: 'computing',
+         computing_started_at: Time.now,
+         execution_uuid: BSON::ObjectId.new
+       }
       find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
-       update_query = { '$set' =>
+       update_query = { '$set' => compute_state }
+
       # send a query directly to avoid mongoid's caching layers
       res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
+
       # reload the model data after the query above
       reload
+
       # the query is atomic so if res != nil, we acquired the lock
       !res.nil?
     end
@@ -391,20 +447,21 @@ module Dataflow
     def release_computing_lock!
       # make sure that any pending changes are saved.
       save
+
       find_query = { _id: _id }
-       update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
+       update_query = { '$set' => { computing_state: nil, computing_started_at: nil, execution_uuid: nil } }
+
       # send a query directly to avoid mongoid's caching layers
       Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
+
       # reload the model data after the query above
       reload
     end
 
     def await_computing!
-       start_waiting_at = Time.now
-       # TODO: should the max wait time be dependent on e.g. the recompute interval?
       max_wait_time = 15.minutes
-       while Time.now <
-       sleep
+       while Time.now < last_heartbeat_time + max_wait_time
+         sleep 5
         # reloads with the data stored on mongodb:
        # something maybe have been changed by another process.
        reload
@@ -436,7 +493,6 @@ module Dataflow
       update_query = { '$set' => { last_compute_starting_time: time } }
       Dataflow::Nodes::ComputeNode.where(_id: _id)
                                   .find_one_and_update(update_query)
-
     end
 
     ##############################
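The compute node now exposes three overridable extension points used by both the local and the remote path: `compute_batch(records:)` (now an explicit interface returning the records to push to the data node), `execute_local_batch_computation(batch_params)`, and `make_batch_params`. A hypothetical subclass relying on the defaults might look like this; the class name, node name and record fields are invented:

```ruby
# Hypothetical node: transforms the first dependency's records batch by batch.
class CleanupNode < Dataflow::Nodes::ComputeNode
  # Receives a batch of records from the first dependency and returns the
  # records to be added to the data node (locally or on a remote worker).
  def compute_batch(records:)
    records.map { |r| r.merge('cleaned' => true) }
  end
end

node = CleanupNode.find_by(name: 'cleanup') # assumes such a node exists
node.execution_model = :remote_batch        # fan the batches out to workers
node.recompute
```

With `:remote_batch`, `make_batch_params` (by default the ordered system-id queries over the first dependency) determines how the work is split across workers.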
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -184,6 +184,8 @@ module Dataflow
      # Adds the given records to the dataset and updates the updated_at time.
      # @param records [Array] an array of the records to be added.
      def add(records:)
+       raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
+       records = records.compact
        return if records.blank?
        db_adapter.save(records: records)
        self.updated_at = Time.now
@@ -380,7 +382,7 @@ module Dataflow
        return @postgresql_adapter
      end
 
-     raise
+     raise NotImplementedError, "'#{db_backend}' backend is not implemented."
    end
 
    def valid_dataset_names
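The `#add` guard above (mirrored in the snapshot and upsert nodes further down) makes the write API stricter: `records` must be an Array, and `nil` entries are dropped before the blank-check. Illustrative behavior, assuming a data node named 'users' exists:

```ruby
node = Dataflow::Nodes::DataNode.find_by(name: 'users') # hypothetical node

node.add(records: [{ 'id' => 1 }, nil, { 'id' => 2 }]) # nil is filtered; two documents saved
node.add(records: [nil])                               # compacts to [], so it is a no-op
node.add(records: { 'id' => 3 })
# => ArgumentError: records must be an array of documents. Received: 'Hash'.
```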
data/lib/dataflow/nodes/export/to_csv_node.rb
CHANGED
@@ -37,13 +37,14 @@ module Dataflow
        count_per_process = [max_per_process, equal_split_per_process].min
 
        queries = node.ordered_system_id_queries(batch_size: count_per_process)
+       system_id = node.send(:db_adapter).class::SYSTEM_ID
 
-       parallel_each(queries.each_with_index) do |query,
+       parallel_each(queries.each_with_index) do |query, idx|
          # TODO: re-enabled event on_export_progressed
          # progress = (idx / queries.count.to_f * 100).ceil
          # on_export_progressed(pct_complete: progress)
-         batch = node.all(where: query.merge(where), fields: sch.keys)
-         csv_adapter.save(records: batch)
+         batch = node.all(where: query.merge(where), fields: sch.keys, sort: { system_id => 1 })
+         csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
        end
 
        # needed by the csv exporter to finalize in a single file
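Together with the `Dir[...].sort` change in the CSV adapter, the zero-padded part index is what keeps the final export ordered: each batch is fetched sorted by the adapter's system id and written to a part file whose name sorts in batch order. The padding on its own:

```ruby
# Why the part name is padded with rjust: string sort then matches batch order.
queries_count = 12
width = queries_count.to_s.length                       # => 2
parts = (0...queries_count).map { |idx| idx.to_s.rjust(width, '0') }
parts.first(3)       # => ["00", "01", "02"]
parts.sort == parts  # => true, so Dir["#{path}.part_*"].sort reassembles them in order
```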
data/lib/dataflow/nodes/read_only_data_node.rb
CHANGED
@@ -3,38 +3,36 @@ module Dataflow
  module Nodes
    # Only supports read operations
    class ReadOnlyDataNode < DataNode
-
      def set_defaults
        super
        self.use_double_buffering = false
      end
 
-
      def handle_dataset_settings_changed
        # ignore - do not do anyhing
      end
 
-     def add(*
+     def add(*_args)
        raise_read_only_error!
      end
 
-     def clear(*
+     def clear(*_args)
        raise_read_only_error!
      end
 
-     def recreate_dataset(*
+     def recreate_dataset(*_args)
        raise_read_only_error!
      end
 
-     def create_unique_indexes(*
+     def create_unique_indexes(*_args)
        raise_read_only_error!
      end
 
-     def create_non_unique_indexes(*
+     def create_non_unique_indexes(*_args)
        raise_read_only_error!
      end
 
-     def read_dataset_name=(*
+     def read_dataset_name=(*_args)
        raise_read_only_error!
      end
 
@@ -42,21 +40,27 @@ module Dataflow
        raise_read_only_error!
      end
 
-     def import(*
+     def import(*_args)
        raise_read_only_error!
      end
 
-
      def drop_dataset!
        raise_read_only_error!
      end
 
+     def dump_dataset(*_args)
+       raise_read_only_error!
+     end
+
+     def restore_dataset(*_args)
+       raise_read_only_error!
+     end
+
      private
 
      def raise_read_only_error!
-       raise NotImplementedError, '
+       raise NotImplementedError, 'This node is read only'
      end
-
    end # class ExternalDataNode
  end # module Nodes
end # module Dataflow
data/lib/dataflow/nodes/runtime_query_node.rb
CHANGED
@@ -1,19 +1,15 @@
 # frozen_string_literal: true
 module Dataflow
   # Interface for a node that behaves as a dataset.
-  # Does not support any operation.
+  # Does not support any write operation.
   # Inherit and override to implement custom behavior.
   module Nodes
-    class RuntimeQueryNode <
+    class RuntimeQueryNode < ReadOnlyDataNode
 
       after_initialize do
        self.db_backend = :none
      end
 
-     def handle_dataset_settings_changed
-       # dot not do anything, there is no real dataset
-     end
-
      def all(*_args)
        raise NotImplementedError, 'this node does not support #all'
      end
@@ -30,13 +26,6 @@ module Dataflow
        raise NotImplementedError, 'this node does not support #all_paginated'
      end
 
-     def add(*_args)
-       raise NotImplementedError, 'this node does not support #add'
-     end
-
-     def clear(*_args)
-       raise NotImplementedError, 'this node does not support #clear'
-     end
    end
  end
end
data/lib/dataflow/nodes/snapshot_node.rb
CHANGED
@@ -30,6 +30,10 @@ module Dataflow
      end
 
      def add(records:)
+       raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
+       records = records.compact
+       return if records.blank?
+
        # TODO: create a chain of behavior "before add"
        rename_dotted_fields(records: records)
        add_internal_timestamp(records: records)
data/lib/dataflow/nodes/upsert_node.rb
CHANGED
@@ -42,6 +42,8 @@ module Dataflow
      end
 
      def add(records:)
+       raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
+       records = records.compact
        return if records.blank?
 
        # TODO: create a chain of behavior "before add"
data/lib/dataflow/remote_worker.rb
ADDED
@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+require 'bunny'
+require 'json'
+
+module Dataflow
+  class RemoteWorker
+    class << self
+      def work(work_queue_name = 'dataflow.ruby')
+        conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
+        conn.start
+
+        ch = conn.create_channel
+        queue = ch.queue(work_queue_name)
+        ch.prefetch(1)
+
+        logger.log("Accepting work on #{work_queue_name}...")
+
+        queue.subscribe(block: true, manual_ack: true) do |delivery_info, _properties, payload|
+          data = JSON.parse(payload)
+          response = process(data)
+          if response.present?
+            ch.default_exchange.publish(response.to_json, routing_key: data['completion_queue_name'])
+          end
+          ch.ack(delivery_info.delivery_tag)
+        end
+      ensure
+        conn.close
+        logger.log('Connection closed, stopped accepting work.')
+      end
+
+      def process(data)
+        node = Dataflow::Nodes::ComputeNode.find(data['node_id'])
+
+        unless node.execution_valid?(data['execution_uuid'])
+          logger.log("[#{data['msg_id']}] work on '#{node.name}' has expired. Skipping.")
+          return
+        end
+
+        errors = execute(node, data)
+        response = { msg_id: data['msg_id'] }
+        response.merge(errors[0])
+      rescue Mongoid::Errors::DocumentNotFound => e
+        { error: { message: e.message, backtrace: e.backtrace } }
+      end
+
+      def execute(node, payload_data)
+        # execute in a different process, so that once it's finished
+        # we can purge the memory
+        Parallel.map([payload_data]) do |data|
+          error = {}
+          logger.log("[#{data['msg_id']}] working on '#{node.name}'...")
+
+          begin
+            if data['is_batch']
+              node.execute_local_batch_computation(data['params'])
+            else
+              node.execute_local_computation
+            end
+          rescue StandardError => e
+            error = { error: { message: e.message, backtrace: e.backtrace } }
+          end
+
+          logger.log("[#{data['msg_id']}] done working on '#{node.name}'.")
+          error
+        end
+      end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: 'Worker')
+      end
+    end
+  end
+end
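A worker is started by calling `Dataflow::RemoteWorker.work`: it prefetches one message at a time, runs each payload in a separate process via `Parallel.map` so memory is released between batches, and publishes the result (or the error with its backtrace) to the executor's completion queue. A minimal, hypothetical boot script; it assumes the usual Mongoid/database configuration is already loaded so the worker can find the nodes:

```ruby
# hypothetical worker boot script
require 'dataflow-rb'

# Same broker as the process that calls Dataflow::Executor.execute.
ENV['MOJACO_RABBITMQ_URI'] ||= 'amqp://guest:guest@localhost:5672'

# Blocks until the connection closes; the queue name must match the
# execution_queue of the nodes to be processed ('dataflow.ruby' by default).
Dataflow::RemoteWorker.work('dataflow.ruby')
```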
data/lib/dataflow/schema_mixin.rb
CHANGED
@@ -51,6 +51,15 @@ module Dataflow
    end
 
    def infer_partial_schema(where:, extended: false)
+     if db_backend == :postgresql
+       # Experimental
+       sch = db_adapter.client.schema(read_dataset_name).to_h
+       sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
+       self.inferred_schema = sch
+       save
+       return sch
+     end
+
      data_count = count(where: where)
      return {} if data_count == 0
 
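For PostgreSQL-backed nodes the partial schema is now taken straight from Sequel's table metadata instead of being sampled from records, which is what the CSV export needs. A sketch of the result shape; the columns and types are invented:

```ruby
# Assuming `node` is a postgresql-backed Dataflow data node:
sch = node.infer_partial_schema(where: {})
# => { name: { type: 'string' }, age: { type: 'integer' } }   (illustrative)
# i.e. column => { type: <Sequel column type as a string> }, also persisted
# to the node's inferred_schema.
```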
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.14.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-06-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -290,6 +290,20 @@ dependencies:
     - - "~>"
       - !ruby/object:Gem::Version
         version: '0.10'
+- !ruby/object:Gem::Dependency
+  name: bunny
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '2.7'
 description: Helps building data pipelines. It handles recomputing dependencies and
   parallel execution.
 email:
@@ -319,8 +333,9 @@ files:
 - lib/dataflow/adapters/settings.rb
 - lib/dataflow/adapters/sql_adapter.rb
 - lib/dataflow/errors/invalid_configuration_error.rb
-- lib/dataflow/errors/
+- lib/dataflow/errors/remote_execution_error.rb
 - lib/dataflow/event_mixin.rb
+- lib/dataflow/executor.rb
 - lib/dataflow/extensions/mongo_driver.rb
 - lib/dataflow/extensions/msgpack.rb
 - lib/dataflow/logger.rb
@@ -344,6 +359,7 @@ files:
 - lib/dataflow/nodes/transformation/to_time_node.rb
 - lib/dataflow/nodes/upsert_node.rb
 - lib/dataflow/properties_mixin.rb
+- lib/dataflow/remote_worker.rb
 - lib/dataflow/schema_mixin.rb
 - lib/dataflow/version.rb
 homepage: https://phybbit.com
|