dataflow-rb 0.13.0 → 0.14.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/dataflow-rb.gemspec +1 -0
- data/lib/dataflow-rb.rb +3 -1
- data/lib/dataflow/adapters/csv_adapter.rb +8 -8
- data/lib/dataflow/adapters/mongo_db_adapter.rb +15 -13
- data/lib/dataflow/adapters/psql_adapter.rb +13 -11
- data/lib/dataflow/adapters/sql_adapter.rb +7 -8
- data/lib/dataflow/errors/remote_execution_error.rb +13 -0
- data/lib/dataflow/executor.rb +104 -0
- data/lib/dataflow/nodes/compute_node.rb +87 -31
- data/lib/dataflow/nodes/data_node.rb +3 -1
- data/lib/dataflow/nodes/export/to_csv_node.rb +4 -3
- data/lib/dataflow/nodes/read_only_data_node.rb +16 -12
- data/lib/dataflow/nodes/runtime_query_node.rb +2 -13
- data/lib/dataflow/nodes/snapshot_node.rb +4 -0
- data/lib/dataflow/nodes/upsert_node.rb +2 -0
- data/lib/dataflow/remote_worker.rb +73 -0
- data/lib/dataflow/schema_mixin.rb +9 -0
- data/lib/dataflow/version.rb +1 -1
- metadata +19 -3
- data/lib/dataflow/errors/not_implemented_error.rb +0 -7
checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d45b64a7e367df85841ae86e6fde6550c33319a
+  data.tar.gz: c4ac87dcaf77cd8a7b842a87523271a7be70a850
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 13abdbc494c020670183e3630261d684df43ea5c4fef110f567b97ca95883f7f8515014ce1e8b9eacca452e5180727a493cf8b7ea10595b1ea41aa632e074074
+  data.tar.gz: a0f6a2aff4b1ecce23b74610574cadf83c598d523b49ac7e05a4d68768f441398767a58a77959f3f4b037962d3b0f3b94804688c051b9576b8c6fe9abce6e159
data/CHANGELOG.md CHANGED

@@ -1,5 +1,22 @@
 # Changelog

+#### 0.14.0
+- [ef8ddcd] Do not assume a minimum of 1 dependency per compute node.
+- [b131bb1] Add type check on the data node #add methods. Filter nil values.
+- [effc5a4] Set the rabbitmq as coming from the env
+- [577ea2e] Add support for computing a node remotely.
+- [4a450c3] Remove the custom not implemented error and use the default one.
+- [f9c48c5] Added some new lines
+- [336b9f8] Fix the backup options
+- [2b2fbee] Make the runtime query node a subclass of the read only data node
+- [fe237c4] Change the backup structure to isolate the db name by folder
+- [654927f] Experiment with querying arrays
+- [506f105] Order by system id when exporting
+- [fa8fdc3] Keep the data ordered when exporting to csv
+- [5e1718d] Add support for postgresql when inferring partial schemas (needed for export)
+
+#### 0.13.1
+- [aa3ed2e] Fix a bug when storing a db connection

 #### 0.13.0
 - [b79c96f] Fix a bug in the sql adapter: support multiple ORDER BY clauses
data/dataflow-rb.gemspec CHANGED

data/lib/dataflow-rb.rb CHANGED
@@ -17,6 +17,8 @@ require 'dataflow/logger'
 require 'dataflow/properties_mixin'
 require 'dataflow/schema_mixin'
 require 'dataflow/node'
+require 'dataflow/executor'
+require 'dataflow/remote_worker'

 require 'dataflow/adapters/csv_adapter'
 require 'dataflow/adapters/mongo_db_adapter'
@@ -26,7 +28,7 @@ require 'dataflow/adapters/psql_adapter'
 require 'dataflow/adapters/settings'

 require 'dataflow/errors/invalid_configuration_error'
-require 'dataflow/errors/not_implemented_error'
+require 'dataflow/errors/remote_execution_error'

 require 'dataflow/nodes/mixin/add_internal_timestamp'
 require 'dataflow/nodes/mixin/rename_dotted_fields'
data/lib/dataflow/adapters/csv_adapter.rb CHANGED

@@ -27,7 +27,7 @@ module Dataflow

     # retrieve a single element from a data node
     def find(where: opts = {})
-      raise
+      raise NotImplementedError, '#find is not yet support on CSV.'
     end

     # retrieve all elements from a data node
@@ -43,8 +43,8 @@ module Dataflow
     end

     # save the given records
-    def save(records:)
-      write_csv_part(records, keys: @schema.keys)
+    def save(records:, part: nil)
+      write_csv_part(records, keys: @schema.keys, part: part)
     end

     def on_save_finished
@@ -52,7 +52,7 @@ module Dataflow
     end

     def remove(_opts = {})
-      raise
+      raise NotImplementedError, '#find is not yet support on CSV.'
     end

     def recreate_dataset(dataset: nil)
@@ -79,10 +79,10 @@ module Dataflow

     def file_parts
       part = "#{settings.db_name}.#{settings.dataset_name}.csv.part_"
-      Dir["#{file_path}.part_*"]
+      Dir["#{file_path}.part_*"].sort
     end

-    def write_csv_part(data, keys:)
+    def write_csv_part(data, keys:, part:)
       # prepare the data
       key_tokens = keys.map { |key| record_dig_tokens(key: key) }
       rows = data.map do |datum|
@@ -90,8 +90,8 @@ module Dataflow
       end

       # dump in a part file
-
-      CSV.open("#{file_path}.part_#{
+      part ||= SecureRandom.hex
+      CSV.open("#{file_path}.part_#{part}", 'w') do |csv|
         rows.each { |row| csv << row }
       end
     end
data/lib/dataflow/adapters/mongo_db_adapter.rb CHANGED

@@ -226,24 +226,26 @@ module Dataflow
     end

     def dump(base_folder:)
-      archive_path = "#{base_folder}/#{@settings.db_name}
-      options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name}"
-      options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-      options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-      options += "--username=#{@settings.db_user}" if @settings.db_user.present?
-      options += "--password=#{@settings.db_password}" if @settings.db_password.present?
-
+      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.gz"
+      options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
+      options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+      options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+      options += "--username=#{@settings.db_user} " if @settings.db_user.present?
+      options += "--password=#{@settings.db_password} " if @settings.db_password.present?
+
+      `mkdir -p #{base_folder}/#{@settings.db_name}`
       `mongodump #{options} --gzip`
       archive_path
     end

     def restore(filepath:)
-      options = "--archive=#{filepath}
-      options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-      options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-      options += "--username=#{@settings.db_user}" if @settings.db_user.present?
-      options += "--password=#{@settings.db_password}" if @settings.db_password.present?
-
+      options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
+      options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+      options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+      options += "--username=#{@settings.db_user} " if @settings.db_user.present?
+      options += "--password=#{@settings.db_password} " if @settings.db_password.present?
+
+      `mongorestore #{options} --drop --gzip`
     end

     def transform_to_query(opts)
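The backup helpers now isolate each database in its own folder, per the changelog entry above. A sketch of the resulting behavior, with illustrative names for the db (my_db) and dataset (users):

    adapter.dump(base_folder: '/backups')
    # runs: mkdir -p /backups/my_db
    #       mongodump --archive=/backups/my_db/users.gz --db=my_db --collection=users ... --gzip
    # => "/backups/my_db/users.gz"

    adapter.restore(filepath: '/backups/my_db/users.gz')
    # runs: mongorestore --archive=/backups/my_db/users.gz --db=my_db --collection=users ... --drop --gzip

Note the trailing space now appended to each option: previously the options were concatenated without separators, producing a malformed command line (the "[336b9f8] Fix the backup options" entry).
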
data/lib/dataflow/adapters/psql_adapter.rb CHANGED

@@ -26,24 +26,26 @@ module Dataflow
     end

     def dump(base_folder:)
-      archive_path = "#{base_folder}/#{@settings.db_name}
-      options = "--table=public.#{@settings.read_dataset_name}"
-      options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-      options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-      options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.dump"
+      options = "--table=public.#{@settings.read_dataset_name} "
+      options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+      options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+      options += "--username=#{@settings.db_user} " if @settings.db_user.present?
       password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
-
+
+      `mkdir -p #{base_folder}/#{@settings.db_name}`
       `#{password}pg_dump #{options} -Fc #{@settings.db_name} > #{archive_path}`
       archive_path
     end

     def restore(filepath:)
-      options = "--table=#{@settings.read_dataset_name}"
-      options += "--host=#{@settings.db_host}" if @settings.db_host.present?
-      options += "--port=#{@settings.db_port}" if @settings.db_port.present?
-      options += "--username=#{@settings.db_user}" if @settings.db_user.present?
+      options = "--table=#{@settings.read_dataset_name} "
+      options += "--host=#{@settings.db_host} " if @settings.db_host.present?
+      options += "--port=#{@settings.db_port} " if @settings.db_port.present?
+      options += "--username=#{@settings.db_user} " if @settings.db_user.present?
       password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
-
+
+      drop_dataset(@settings.read_dataset_name)
       `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
     end
   end
data/lib/dataflow/adapters/sql_adapter.rb CHANGED

@@ -12,16 +12,17 @@ module Dataflow
     def client(settings)
       @clients ||= {}
       connection_uri = settings.connection_uri_or_default
-
+      full_uri = "#{connection_uri}/#{settings.db_name}?encoding=utf8"
+      return @clients[full_uri] if @clients[full_uri].present?

       # first, make sure the DB is created (if it is not an external db)
       is_external_db = settings.connection_uri.present?
       try_create_db(connection_uri, settings.db_name) unless is_external_db

       # then, create the connection object
-      db = Sequel.connect(
+      db = Sequel.connect(full_uri)
       add_extensions(settings, db)
-      @clients[
+      @clients[full_uri] = db
     end
@@ -242,6 +243,8 @@ module Dataflow
       end
     when '<', '<=', '>', '>='
       Sequel.lit("#{k} #{operator} ?", value)
+    when '@>', '<@'
+      Sequel.lit("#{k} #{operator} ?", Sequel.pg_array(Array(value)))
     when '~'
       Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
     when '~*'
@@ -291,19 +294,15 @@ module Dataflow
       end
     when 'numeric'
       col_type = 'real'
-    when 'array', 'hash'
-      logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
-      col_type = 'json'
     when 'date', 'time'
       # keep as-is
       col_type = type
     else
-      logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
       col_type = type
     end

     # create a column with the given type
-
+    logger.log("#{column} #{type} -> #{col_type}")
     column(column.to_sym, col_type)
   end
 end
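The new '@>'/'<@' branch backs the "experiment with querying arrays" changelog entry: the value is coerced into a Postgres array literal via Sequel.pg_array. A hedged sketch of a query that would exercise it, assuming dataflow's usual { field => { operator => value } } where-clause format and an illustrative 'tags' array column:

    # records whose tags array contains 'ruby'
    node.all(where: { 'tags' => { '@>' => 'ruby' } })
    # builds: Sequel.lit("tags @> ?", Sequel.pg_array(['ruby']))
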
data/lib/dataflow/executor.rb ADDED

@@ -0,0 +1,104 @@
+# frozen_string_literal: true
+require 'bunny'
+require 'json'
+require 'thread'
+
+module Dataflow
+  class Executor
+    class << self
+      def execute(node)
+        case node.execution_model
+        when :remote
+          execute_remote_computation(node: node, is_batch_execution: false)
+        when :remote_batch
+          execute_remote_computation(node: node, is_batch_execution: true)
+        when :local
+          node.execute_local_computation
+        else
+          raise ArgumentError, "Unknown execution model #{execution_model}"
+        end
+      end
+
+      def execute_remote_computation(node:, is_batch_execution:)
+        execution_uuid = node.execution_uuid
+        raise ArgumentError, "Expected execution uuid to be set on '#{node.name}' (##{node._id})" unless execution_uuid.present?
+
+        logger.log("Started processing '#{node.name}'")
+        conn, channel, completion_queue = open_communication_channel
+        logger.log("Opened a completion queue for '#{node.name}': #{completion_queue.name}")
+
+        messages = send_execution_messages(channel, node, is_batch_execution, completion_queue.name)
+        error_data = await_execution_completion(completion_queue, messages.count)
+        logger.log("Finished processing '#{node.name}'")
+
+        raise Errors::RemoteExecutionError.new(error_data['message'], error_data['backtrace']) if error_data
+      ensure
+        conn&.close
+      end
+
+      def open_communication_channel
+        conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
+        conn.start
+
+        ch = conn.create_channel
+        completion_queue = ch.queue('', exclusive: true)
+
+        return conn, ch, completion_queue
+      end
+
+      def send_execution_messages(channel, node, is_batch_execution, completion_queue_name)
+        execution_params = make_execution_params(node, is_batch_execution, completion_queue_name)
+
+        execution_queue = channel.queue(node.execution_queue)
+        execution_params.each do |exec_params|
+          execution_queue.publish(exec_params.to_json)
+        end
+
+        execution_params
+      end
+
+      def make_execution_params(node, is_batch_execution, completion_queue_name)
+        execution_params = if is_batch_execution
+                             node.make_batch_params
+                           else
+                             [{}]
+                           end
+
+        execution_params.each_with_index.map do |params, idx|
+          {
+            msg_id: idx,
+            node_id: node._id.to_s,
+            is_batch: is_batch_execution,
+            params: params,
+            execution_uuid: node.execution_uuid.to_s,
+            completion_queue_name: completion_queue_name
+          }
+        end
+      end
+
+      def await_execution_completion(completion_queue, expected_completion_count)
+        completed_message_indexes = []
+        unblock = Queue.new
+
+        consumer = completion_queue.subscribe do |_delivery_info, _properties, payload|
+          data = JSON.parse(payload)
+          unblock.enq(data['error']) if data['error'].present?
+
+          completed_message_indexes << data['msg_id']
+          if completed_message_indexes.count == expected_completion_count
+            unblock.enq(false)
+          end
+        end
+
+        error_data = unblock.deq
+        consumer.cancel
+
+        error_data
+      end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: 'Executor')
+      end
+    end
+  end
+end
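For reference, each message published to the execution queue by send_execution_messages is a JSON document of the following shape (values illustrative; params is empty for a plain :remote execution and holds one batch query for :remote_batch):

    {
      msg_id: 0,
      node_id: '593625c7...',            # the compute node's BSON id (illustrative)
      is_batch: true,
      params: { ... },                   # one entry from make_batch_params (illustrative)
      execution_uuid: '593625c8...',     # set when the computing lock is acquired
      completion_queue_name: 'amq.gen-x' # exclusive, server-named reply queue (illustrative)
    }.to_json
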
data/lib/dataflow/nodes/compute_node.rb CHANGED

@@ -57,6 +57,16 @@ module Dataflow
     # The node name
     field :name, type: String

+    # The execution model:
+    field :execution_model, type: Symbol, default: :local
+
+    # For remote computation only:
+    # Controls on which queue this execution will be routed
+    field :execution_queue, type: String, default: 'dataflow.ruby'
+
+    # Unique ID of the current execution
+    field :execution_uuid, type: BSON::ObjectId
+
     # The data node to which we will write the computation output
     field :data_node_id, type: BSON::ObjectId
@@ -261,7 +271,7 @@ module Dataflow
       end

       send_heartbeat
-
+      Executor.execute(self)

       if clear_data_on_compute
         # Post-compute, delay creating other indexes for insert speed
@@ -281,6 +291,9 @@ module Dataflow
         logger.log("#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}.")
       end

+    rescue Errors::RemoteExecutionError => e
+      on_computing_finished(state: 'error', error: e) if has_compute_lock
+      logger.error(error: e, custom_message: "#{name} failed computing remotely.")
     rescue StandardError => e
       on_computing_finished(state: 'error', error: e) if has_compute_lock
       logger.error(error: e, custom_message: "#{name} failed computing.")
@@ -296,13 +309,9 @@ module Dataflow
     def valid_for_computation?
       # Perform additional checks: also add errors to "self.errors"
       opts = self.class.dependency_opts
-      if opts.key?(:exactly)
-
-
-        ensure_at_most_dependencies(count: opts[:max])
-      else # even if the min is not specified, we need at least 1 dependency
-        ensure_at_least_dependencies(count: opts[:min] || 1)
-      end
+      ensure_exact_dependencies(count: opts[:exactly]) if opts.key?(:exactly)
+      ensure_at_most_dependencies(count: opts[:max]) if opts.key?(:max)
+      ensure_at_least_dependencies(count: opts[:min]) if opts.key?(:min)
       ensure_no_cyclic_dependencies
       ensure_keys_are_set
       ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]
@@ -322,37 +331,67 @@ module Dataflow
       release_computing_lock!
     end

+    def execution_valid?(uuid)
+      execution_uuid.to_s == uuid.to_s
+    end
+
     # Keep a compatible interface with the data node
     def schema
       required_schema
     end

+    # Interface to execute this node locally
+    def execute_local_computation
+      compute_impl
+    end
+
+    # Interface to execute a part (batch) of this node locally.
+    # This method is called when the framework needs to execute a batch on a worker.
+    # Override when needed, to execute a batch depending on the params.
+    # If you override, you may want to override the make_batch_params as well.
+    def execute_local_batch_computation(batch_params)
+      records = dependencies.first.all(where: batch_params)
+      new_records = compute_batch(records: records)
+      data_node&.add(records: new_records)
+    end
+
+    # Interface used to retrieve the params for scheduled batches. Override when needed.
+    # The default implementation is to make queries that would
+    # ensure the full processing of the first dependency's records.
+    # @return [Array] of params that are passed to scheduled batches.
+    def make_batch_params
+      make_batch_queries(node: dependencies.first)
+    end
+
     private

-    #
+    # Default compute implementation:
     # - recreate the table
     # - compute the records
     # - save them to the DB
     # (the process may be overwritten on a per-node basis if needed)
+    # Override if you need to have a completely custom compute implementation
     def compute_impl
       process_parallel(node: dependencies.first)
     end

-
-
-
-
+    # This is an interface only.
+    # Override when you can implement a computation in terms of
+    # the records of the first dependent node.
+    # @param records [Array] a batch of records from the first dependency
+    # @return [Array] an array of results that are to be pushed to the data node (if set).
+    def compute_batch(records:)
+      []
+    end

-
-
-
-      count_per_process = [limit, equal_split_per_process].min if limit > 0
+    def process_parallel(node:)
+      queries = make_batch_queries(node: node)
+      return if queries.blank?

-      queries = node.ordered_system_id_queries(batch_size: count_per_process)
       queries_count = queries.count
-
       parallel_each(queries.each_with_index) do |query, idx|
         send_heartbeat
+
         progress = (idx / queries_count.to_f * 100).ceil
         on_computing_progressed(pct_complete: progress)
         logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
@@ -365,25 +404,42 @@ module Dataflow
         compute_batch(records: records)
       end

-      data_node
+      data_node&.add(records: new_records)
     end
   end

-    #
-
-
-
+    # Makes queries that support traversing the node's records in parallel without overlap.
+    def make_batch_queries(node:)
+      return [] if node.blank?
+      record_count = node.count
+      return [] if record_count == 0
+
+      equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
+      count_per_process = equal_split_per_process
+      limit = limit_per_process.to_i
+      count_per_process = [limit, equal_split_per_process].min if limit > 0
+
+      queries = node.ordered_system_id_queries(batch_size: count_per_process)
    end

    def acquire_computing_lock!
      # make sure that any pending changes are saved.
      save
+
+     compute_state = {
+       computing_state: 'computing',
+       computing_started_at: Time.now,
+       execution_uuid: BSON::ObjectId.new
+     }
      find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
-     update_query = { '$set' =>
+     update_query = { '$set' => compute_state }
+
      # send a query directly to avoid mongoid's caching layers
      res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
+
      # reload the model data after the query above
      reload
+
      # the query is atomic so if res != nil, we acquired the lock
      !res.nil?
    end
@@ -391,20 +447,21 @@ module Dataflow
    def release_computing_lock!
      # make sure that any pending changes are saved.
      save
+
      find_query = { _id: _id }
-     update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
+     update_query = { '$set' => { computing_state: nil, computing_started_at: nil, execution_uuid: nil } }
+
      # send a query directly to avoid mongoid's caching layers
      Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
+
      # reload the model data after the query above
      reload
    end

    def await_computing!
-     start_waiting_at = Time.now
-     # TODO: should the max wait time be dependent on e.g. the recompute interval?
      max_wait_time = 15.minutes
-     while Time.now <
-       sleep
+     while Time.now < last_heartbeat_time + max_wait_time
+       sleep 5
        # reloads with the data stored on mongodb:
        # something may have been changed by another process.
        reload
@@ -436,7 +493,6 @@ module Dataflow
      update_query = { '$set' => { last_compute_starting_time: time } }
      Dataflow::Nodes::ComputeNode.where(_id: _id)
                                  .find_one_and_update(update_query)
-
    end

    ##############################
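Most custom nodes can now be written purely in terms of the new batch interface. A minimal sketch of a subclass (class and field names are illustrative):

    class DoubleValueNode < Dataflow::Nodes::ComputeNode
      # Receives batches of records from the first dependency;
      # whatever is returned is added to the data node by the framework.
      def compute_batch(records:)
        records.map { |r| { 'value' => r['value'].to_i * 2 } }
      end
    end

Because only compute_batch is overridden, the same class works locally (through process_parallel), remotely (:remote), and in distributed batches (:remote_batch).
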
data/lib/dataflow/nodes/data_node.rb CHANGED

@@ -184,6 +184,8 @@ module Dataflow
     # Adds the given records to the dataset and updates the updated_at time.
     # @param records [Array] an array of the records to be added.
     def add(records:)
+      raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
+      records = records.compact
       return if records.blank?
       db_adapter.save(records: records)
       self.updated_at = Time.now
@@ -380,7 +382,7 @@ module Dataflow
       return @postgresql_adapter
     end

-    raise
+    raise NotImplementedError, "'#{db_backend}' backend is not implemented."
   end

   def valid_dataset_names
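With the new guard, #add fails fast on anything that is not an array and silently drops nil entries:

    node.add(records: [{ 'name' => 'a' }, nil]) # nil filtered out, one record saved
    node.add(records: { 'name' => 'a' })        # => ArgumentError: records must be an array of documents...
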
data/lib/dataflow/nodes/export/to_csv_node.rb CHANGED

@@ -37,13 +37,14 @@ module Dataflow
       count_per_process = [max_per_process, equal_split_per_process].min

       queries = node.ordered_system_id_queries(batch_size: count_per_process)
+      system_id = node.send(:db_adapter).class::SYSTEM_ID

-      parallel_each(queries.each_with_index) do |query,
+      parallel_each(queries.each_with_index) do |query, idx|
         # TODO: re-enable event on_export_progressed
         # progress = (idx / queries.count.to_f * 100).ceil
         # on_export_progressed(pct_complete: progress)
-        batch = node.all(where: query.merge(where), fields: sch.keys)
-        csv_adapter.save(records: batch)
+        batch = node.all(where: query.merge(where), fields: sch.keys, sort: { system_id => 1 })
+        csv_adapter.save(records: batch, part: idx.to_s.rjust(queries.count.to_s.length, "0"))
       end

       # needed by the csv exporter to finalize in a single file
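Passing a zero-padded batch index as the part name keeps the lexicographic sort used by the adapter's file_parts consistent with the batch order, which is what makes the final concatenated CSV respect the system-id ordering. For example, with 12 batches:

    idx.to_s.rjust(queries.count.to_s.length, '0')
    # => "00", "01", ..., "11"
    # parts: my_db.users.csv.part_00 ... part_11 (file names illustrative)
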
data/lib/dataflow/nodes/read_only_data_node.rb CHANGED

@@ -3,38 +3,36 @@ module Dataflow
   module Nodes
     # Only supports read operations
     class ReadOnlyDataNode < DataNode
-
       def set_defaults
         super
         self.use_double_buffering = false
       end

-
       def handle_dataset_settings_changed
         # ignore - do not do anything
       end

-      def add(*
+      def add(*_args)
         raise_read_only_error!
       end

-      def clear(*
+      def clear(*_args)
         raise_read_only_error!
       end

-      def recreate_dataset(*
+      def recreate_dataset(*_args)
         raise_read_only_error!
       end

-      def create_unique_indexes(*
+      def create_unique_indexes(*_args)
         raise_read_only_error!
       end

-      def create_non_unique_indexes(*
+      def create_non_unique_indexes(*_args)
         raise_read_only_error!
       end

-      def read_dataset_name=(*
+      def read_dataset_name=(*_args)
         raise_read_only_error!
       end

@@ -42,21 +40,27 @@ module Dataflow
         raise_read_only_error!
       end

-      def import(*
+      def import(*_args)
         raise_read_only_error!
       end

-
       def drop_dataset!
         raise_read_only_error!
       end

+      def dump_dataset(*_args)
+        raise_read_only_error!
+      end
+
+      def restore_dataset(*_args)
+        raise_read_only_error!
+      end
+
       private

       def raise_read_only_error!
-        raise NotImplementedError, '
+        raise NotImplementedError, 'This node is read only'
       end
-
     end # class ExternalDataNode
   end # module Nodes
 end # module Dataflow
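Every mutating entry point, including the new dump_dataset/restore_dataset stubs, now funnels into the same error:

    node = Dataflow::Nodes::ReadOnlyDataNode.find_by(name: 'external') # illustrative lookup
    node.add(records: [{ 'a' => 1 }])
    # => NotImplementedError: This node is read only
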
data/lib/dataflow/nodes/runtime_query_node.rb CHANGED

@@ -1,19 +1,15 @@
 # frozen_string_literal: true
 module Dataflow
   # Interface for a node that behaves as a dataset.
-  # Does not support any operation.
+  # Does not support any write operation.
   # Inherit and override to implement custom behavior.
   module Nodes
-    class RuntimeQueryNode < DataNode
+    class RuntimeQueryNode < ReadOnlyDataNode

       after_initialize do
         self.db_backend = :none
       end

-      def handle_dataset_settings_changed
-        # do not do anything, there is no real dataset
-      end
-
       def all(*_args)
         raise NotImplementedError, 'this node does not support #all'
       end
@@ -30,13 +26,6 @@ module Dataflow
         raise NotImplementedError, 'this node does not support #all_paginated'
       end

-      def add(*_args)
-        raise NotImplementedError, 'this node does not support #add'
-      end
-
-      def clear(*_args)
-        raise NotImplementedError, 'this node does not support #clear'
-      end
     end
   end
 end
data/lib/dataflow/nodes/snapshot_node.rb CHANGED

@@ -30,6 +30,10 @@ module Dataflow
     end

     def add(records:)
+      raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
+      records = records.compact
+      return if records.blank?
+
       # TODO: create a chain of behavior "before add"
       rename_dotted_fields(records: records)
       add_internal_timestamp(records: records)
data/lib/dataflow/nodes/upsert_node.rb CHANGED

@@ -42,6 +42,8 @@ module Dataflow
     end

     def add(records:)
+      raise ArgumentError, "records must be an array of documents. Received: '#{records.class}'." unless records.is_a?(Array)
+      records = records.compact
       return if records.blank?

       # TODO: create a chain of behavior "before add"
data/lib/dataflow/remote_worker.rb ADDED

@@ -0,0 +1,73 @@
+# frozen_string_literal: true
+require 'bunny'
+require 'json'
+
+module Dataflow
+  class RemoteWorker
+    class << self
+      def work(work_queue_name = 'dataflow.ruby')
+        conn = Bunny.new(ENV['MOJACO_RABBITMQ_URI'])
+        conn.start
+
+        ch = conn.create_channel
+        queue = ch.queue(work_queue_name)
+        ch.prefetch(1)
+
+        logger.log("Accepting work on #{work_queue_name}...")
+
+        queue.subscribe(block: true, manual_ack: true) do |delivery_info, _properties, payload|
+          data = JSON.parse(payload)
+          response = process(data)
+          if response.present?
+            ch.default_exchange.publish(response.to_json, routing_key: data['completion_queue_name'])
+          end
+          ch.ack(delivery_info.delivery_tag)
+        end
+      ensure
+        conn.close
+        logger.log('Connection closed, stopped accepting work.')
+      end
+
+      def process(data)
+        node = Dataflow::Nodes::ComputeNode.find(data['node_id'])
+
+        unless node.execution_valid?(data['execution_uuid'])
+          logger.log("[#{data['msg_id']}] work on '#{node.name}' has expired. Skipping.")
+          return
+        end
+
+        errors = execute(node, data)
+        response = { msg_id: data['msg_id'] }
+        response.merge(errors[0])
+      rescue Mongoid::Errors::DocumentNotFound => e
+        { error: { message: e.message, backtrace: e.backtrace } }
+      end
+
+      def execute(node, payload_data)
+        # execute in a different process, so that once it's finished
+        # we can purge the memory
+        Parallel.map([payload_data]) do |data|
+          error = {}
+          logger.log("[#{data['msg_id']}] working on '#{node.name}'...")
+
+          begin
+            if data['is_batch']
+              node.execute_local_batch_computation(data['params'])
+            else
+              node.execute_local_computation
+            end
+          rescue StandardError => e
+            error = { error: { message: e.message, backtrace: e.backtrace } }
+          end
+
+          logger.log("[#{data['msg_id']}] done working on '#{node.name}'.")
+          error
+        end
+      end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: 'Worker')
+      end
+    end
+  end
+end
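Starting a worker is a one-liner, typically run as its own process, for example from a bin script or rake task (MOJACO_RABBITMQ_URI must be exported):

    require 'dataflow-rb'
    Dataflow::RemoteWorker.work              # default queue 'dataflow.ruby'
    Dataflow::RemoteWorker.work('gpu.queue') # or a custom queue, matched by the
                                             # node's execution_queue field (name illustrative)

With ch.prefetch(1) each worker takes a single message at a time, so scaling throughput is mostly a matter of starting more worker processes.
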
data/lib/dataflow/schema_mixin.rb CHANGED

@@ -51,6 +51,15 @@ module Dataflow
     end

     def infer_partial_schema(where:, extended: false)
+      if db_backend == :postgresql
+        # Experimental
+        sch = db_adapter.client.schema(read_dataset_name).to_h
+        sch = sch.reject { |k, v| k == :_id }.map { |k, v| [k, { type: v[:type].to_s }] }.to_h
+        self.inferred_schema = sch
+        save
+        return sch
+      end
+
       data_count = count(where: where)
       return {} if data_count == 0
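On PostgreSQL the partial schema now comes straight from the table metadata instead of sampling records. The result is a hash keyed by column name, with types as reported by Sequel's schema introspection (column names illustrative):

    node.infer_partial_schema(where: {})
    # => { name: { type: 'string' }, age: { type: 'integer' } }
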
data/lib/dataflow/version.rb CHANGED

metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.13.0
+  version: 0.14.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-06-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -290,6 +290,20 @@ dependencies:
     - - "~>"
     - !ruby/object:Gem::Version
       version: '0.10'
+- !ruby/object:Gem::Dependency
+  name: bunny
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+    - !ruby/object:Gem::Version
+      version: '2.7'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+    - !ruby/object:Gem::Version
+      version: '2.7'
 description: Helps building data pipelines. It handles recomputing dependencies and
   parallel execution.
 email:
@@ -319,8 +333,9 @@ files:
 - lib/dataflow/adapters/settings.rb
 - lib/dataflow/adapters/sql_adapter.rb
 - lib/dataflow/errors/invalid_configuration_error.rb
-- lib/dataflow/errors/not_implemented_error.rb
+- lib/dataflow/errors/remote_execution_error.rb
 - lib/dataflow/event_mixin.rb
+- lib/dataflow/executor.rb
 - lib/dataflow/extensions/mongo_driver.rb
 - lib/dataflow/extensions/msgpack.rb
 - lib/dataflow/logger.rb
@@ -344,6 +359,7 @@ files:
 - lib/dataflow/nodes/transformation/to_time_node.rb
 - lib/dataflow/nodes/upsert_node.rb
 - lib/dataflow/properties_mixin.rb
+- lib/dataflow/remote_worker.rb
 - lib/dataflow/schema_mixin.rb
 - lib/dataflow/version.rb
 homepage: https://phybbit.com