dataflow-rb 0.14.0 → 0.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/dataflow-rb.gemspec +1 -1
- data/lib/dataflow-rb.rb +81 -1
- data/lib/dataflow/adapters/csv_adapter.rb +1 -1
- data/lib/dataflow/adapters/mongo_db_adapter.rb +7 -6
- data/lib/dataflow/adapters/psql_adapter.rb +5 -6
- data/lib/dataflow/adapters/sql_adapter.rb +6 -2
- data/lib/dataflow/executor.rb +7 -3
- data/lib/dataflow/node.rb +17 -0
- data/lib/dataflow/nodes/compute_node.rb +9 -8
- data/lib/dataflow/nodes/data_node.rb +37 -8
- data/lib/dataflow/nodes/read_only_data_node.rb +11 -0
- data/lib/dataflow/remote_worker.rb +12 -6
- data/lib/dataflow/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '081f28d6c668f92bfe5da20f2301136af28949ae'
+  data.tar.gz: 39dd214829a164c21b0c8c6b0d3406f423c84e82
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 56db86c9444331cfa4d7ab41d6210066aff33ed4aabe0e139131ce42659f94be09f3a241a14580bec68871dc18bc5c371d075f3c5831689c5b08e982ff12e639
+  data.tar.gz: 05a3a5e3eab0b89aa89046f9c70c68c6cbdb4cc59c672dd327d4dc069299a964f7f1f34e3dca299e9d097c8da4c486054a2447b6f700e9958bd32a90fd6b3794
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,22 @@
 # Changelog
 
+#### 0.15.0
+
+- [4b48b74] Make sure computed data is included in the dump
+- [54fd18d] Added support for #export #import a dataflow with its data
+- [a63972f] Add #metadata to the nodes
+- [696ea35] Add #all_dependencies interface
+- [5165c71] Fix re-using the same variable in the RemoteWorker results
+- [216a066] Only warn once about a missing node
+- [c101144] Support gettng data back from remote batch workers and adding it to the data node.
+- [9a06ee3] Support remapping the dataset from which to read from on ReadOnlyDataNodes
+- [2fc623a] Setting the clean if exist option to pg restore. Set pg to 0.20
+- [205317c] Support including the system id in the data
+- [0b9b578] Fix restoring postgresql indexes and other constraints
+- [e396265] Make sure indexes are recreated in postgres after a restore
+- [426300a] Add the exported dataset idx in the dump filename. Make sure when restoring that the settings are compatible
+- [ca44a9d] Set the no owner flag when restoring to psql
+
 #### 0.14.0
 - [ef8ddcd] Do not assume a minimum of 1 dependency per compute node.
 - [b131bb1] Add type check on the data node #add methods. Filter nil values.
data/dataflow-rb.gemspec
CHANGED
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'mongoid', '~>6.0'
   spec.add_dependency 'sequel', '~>4.0'
   spec.add_dependency 'mysql2', '~>0.4'
-  spec.add_dependency 'pg', '
+  spec.add_dependency 'pg', '0.20'
   spec.add_dependency 'sequel_pg', '~>1.6'
   spec.add_dependency 'msgpack', '~>1.0'
   spec.add_dependency 'smarter_csv', '1.1.0'
data/lib/dataflow-rb.rb
CHANGED
@@ -80,6 +80,80 @@ module Dataflow
   def self.clear_tmp_datasets
     Dataflow::Nodes::DataNode.all.each(&:safely_clear_write_dataset)
   end
+
+  # Exports nodes and their data. Use #import to re-import them elsewhere.
+  def self.export(nodes:, export_dir: './flows', include_data: false)
+    raise ArgumentError, 'nodes must be an array of nodes' unless nodes.is_a?(Array)
+    # make a tmp folder with the export dir
+    archive_name = "flow_#{Time.now.strftime("%Y-%m-%d_%H-%M-%S")}"
+    tmp_dir = "#{export_dir}/#{archive_name}"
+    `mkdir -p #{tmp_dir}`
+
+    # export all the dependencies
+    all_nodes = nodes + nodes.flat_map(&:all_dependencies)
+    # and all the compute node's datasets
+    all_nodes += all_nodes.select { |x| x.is_a?(Dataflow::Nodes::ComputeNode) }
+                          .map { |x| x.data_node }
+    # get all the nodes' metadata in the yaml format
+    metadata_yaml = all_nodes.compact.uniq.map(&:metadata).to_yaml
+    File.write("#{tmp_dir}/metadata.yaml", metadata_yaml)
+
+    # add the dataset's data if necessary
+    if include_data
+      all_nodes.select { |x| x.is_a?(Dataflow::Nodes::DataNode) }
+               .each { |x| x.dump_dataset(base_folder: tmp_dir) }
+    end
+
+    # pack all the content in a tar archive
+    archive_path = "#{archive_name}.tar"
+    `(cd #{export_dir} && tar -cvf #{archive_path} #{archive_name})`
+
+    # clear the tmp folder
+    `rm -rf #{tmp_dir}`
+
+    "#{export_dir}/#{archive_path}"
+  end
+
+  def self.import(archive_path:)
+    raise ArgumentError, 'expecting a tar archive file' unless archive_path.end_with?('.tar')
+
+    # extract the tar
+    folder_name = archive_path.split('/')[-1].split('.')[0]
+    `tar -xvf #{archive_path}`
+
+    # load and restore the content in the metadata.yaml
+    metadata = YAML.load_file("#{folder_name}/metadata.yaml")
+
+    # restore the nodes
+    metadata.each do |m|
+      klass = m[:_type].constantize
+
+      # try to delete previously existing node
+      begin
+        previous_node = klass.find(m[:_id])
+        previous_node.delete
+      rescue Mongoid::Errors::DocumentNotFound
+      end
+
+      # create the node
+      klass.create(m)
+    end
+
+    # look for dataset dumps and restore them
+    filepaths = Dir["./#{folder_name}/**/*.gz"] + Dir["./#{folder_name}/**/*.dump"]
+
+    filepaths.each do |filepath|
+      # filepath: "./folder/db_name/dataset.1.gz"
+      db_name = filepath.split('/')[2]
+      dataset = filepath.split('/')[3].split('.')[0]
+      n = Dataflow::Nodes::DataNode.find_by(db_name: db_name, name: dataset)
+      n.restore_dataset(filepath: filepath)
+    end
+
+
+    # clean up the extracted folder
+    `rm -rf #{folder_name}`
+  end
 end
 
 ###############################################################################
@@ -93,7 +167,13 @@ module Dataflow
       super
     rescue NameError => e
       raise e unless e.message =~ /Dataflow::Nodes/
-
+
+      @name_errors ||= Set.new
+      unless @name_errors.include?(e.message)
+        p "Warning -- Node class not found. #{e}"
+        @name_errors << e.message
+      end
+
       Dataflow::Nodes::ComputeNode
     end
   end
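
Taken together with the #metadata and #all_dependencies additions further down, these helpers let a flow be packaged with its data and re-created elsewhere. A minimal usage sketch, assuming a compute node named 'my_compute_node' already exists (the node name and the timestamp in the comment are illustrative):

    require 'dataflow-rb'

    # On the source machine: export a node, its dependencies and their datasets.
    node = Dataflow::Nodes::ComputeNode.find_by(name: 'my_compute_node')
    archive = Dataflow.export(nodes: [node], include_data: true)
    # => e.g. "./flows/flow_2017-07-06_10-00-00.tar"

    # On the target machine: re-create the nodes and restore the dataset dumps.
    Dataflow.import(archive_path: archive)
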
data/lib/dataflow/adapters/csv_adapter.rb
CHANGED
@@ -31,7 +31,7 @@ module Dataflow
     end
 
     # retrieve all elements from a data node
-    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0, include_system_id: false)
       SmarterCSV.process(file_path, strings_as_keys: true)
     rescue Errno::ENOENT => e
       []
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED
@@ -26,6 +26,7 @@ module Dataflow
       def disconnect_clients
         @clients ||= {}
         @clients.values.each(&:close)
+        @clients = {}
       end
     end
 
@@ -48,10 +49,10 @@ module Dataflow
     end
 
     # retrieve all elements from a data node
-    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0, include_system_id: false)
       projection = fields.map { |field| [field, 1] }
 
-      unless fields.map(&:to_s).include?(SYSTEM_ID)
+      unless include_system_id || fields.map(&:to_s).include?(SYSTEM_ID)
         # by default, do not select the _id field
         projection << [SYSTEM_ID, 0].freeze
       end
@@ -225,8 +226,8 @@ module Dataflow
       }
     end
 
-    def dump(base_folder:)
-      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.gz"
+    def dump(base_folder:, read_dataset_idx:)
+      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.#{read_dataset_idx}.gz"
       options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
@@ -238,8 +239,8 @@ module Dataflow
       archive_path
     end
 
-    def restore(filepath:)
-      options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{
+    def restore(filepath:, dataset_name:)
+      options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{dataset_name} "
      options += "--host=#{@settings.db_host} " if @settings.db_host.present?
       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
       options += "--username=#{@settings.db_user} " if @settings.db_user.present?
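
For reference, the only interface change in #dump is the extra read_dataset_idx keyword, which makes the buffer index part of the archive name. A rough sketch with made-up settings (db 'app_db', dataset 'users'):

    # 0.14.0: adapter.dump(base_folder: './dump')
    #   => archive at "./dump/app_db/users.gz"
    # 0.15.0: adapter.dump(base_folder: './dump', read_dataset_idx: 2)
    #   => archive at "./dump/app_db/users.2.gz"
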
data/lib/dataflow/adapters/psql_adapter.rb
CHANGED
@@ -25,8 +25,8 @@ module Dataflow
       '~*'
     end
 
-    def dump(base_folder:)
-      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.dump"
+    def dump(base_folder:, read_dataset_idx:)
+      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.#{read_dataset_idx}.dump"
       options = "--table=public.#{@settings.read_dataset_name} "
       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
@@ -38,14 +38,13 @@ module Dataflow
       archive_path
     end
 
-    def restore(filepath:)
-      options = "--
+    def restore(filepath:, dataset_name:)
+      options = "-v --clean --if-exists --no-owner "
       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
-      options += "--username=#{@settings.db_user} " if @settings.db_user.present?
+      options += "--username=#{@settings.db_user} --role=#{@settings.db_user} " if @settings.db_user.present?
       password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
 
-      drop_dataset(@settings.read_dataset_name)
       `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
     end
   end
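
The restore path no longer drops the table itself; cleanup is handed to pg_restore. A sketch of the command now assembled, with placeholder connection settings:

    # Roughly what PsqlAdapter#restore now executes (hypothetical settings):
    #
    #   PGPASSWORD=... pg_restore -v --clean --if-exists --no-owner \
    #     --host=localhost --port=5432 --username=app --role=app \
    #     -Fc --dbname=app_db ./dump/app_db/users.1.dump
    #
    # --clean --if-exists lets pg_restore drop and re-create the objects, which
    # also brings back the indexes and constraints recorded in the dump.
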
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -53,6 +53,7 @@ module Dataflow
       def disconnect_clients
         @clients ||= {}
         @clients.values.each(&:disconnect)
+        @clients = {}
       end
     end
 
@@ -77,12 +78,15 @@ module Dataflow
     end
 
     # retrieve all elements from a data node
-    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0, include_system_id: false)
       res = client[settings.read_dataset_name.to_sym]
 
       # if there is no fields, automatically
       # select all the fields expect the system _id
-
+      if fields.blank?
+        fields = res.columns
+        fields = fields.reject { |x| x == SYSTEM_ID } unless include_system_id
+      end
 
       res = res.select(*fields.map(&:to_sym)) if fields.present?
       res = apply_query(res, where)
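
Across the CSV, Mongo and SQL adapters (and DataNode#all further down), the new keyword is an opt-in switch for keeping the storage engine's row identifier. A hedged sketch, with a hypothetical 'users' data node and made-up fields:

    node = Dataflow::Nodes::DataNode.find_by(name: 'users')

    node.all(limit: 1)
    # => [{ 'email' => 'a@example.com' }]                 # _id stripped, as before

    node.all(limit: 1, include_system_id: true)
    # => [{ '_id' => ..., 'email' => 'a@example.com' }]   # system id kept
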
data/lib/dataflow/executor.rb
CHANGED
@@ -28,7 +28,7 @@ module Dataflow
       logger.log("Opened a completion queue for '#{node.name}': #{completion_queue.name}")
 
       messages = send_execution_messages(channel, node, is_batch_execution, completion_queue.name)
-      error_data = await_execution_completion(completion_queue, messages.count)
+      error_data = await_execution_completion(node, completion_queue, messages.count)
       logger.log("Finished processing '#{node.name}'")
 
       raise Errors::RemoteExecutionError.new(error_data['message'], error_data['backtrace']) if error_data
@@ -43,7 +43,7 @@ module Dataflow
       ch = conn.create_channel
       completion_queue = ch.queue('', exclusive: true)
 
-
+      [conn, ch, completion_queue]
     end
 
     def send_execution_messages(channel, node, is_batch_execution, completion_queue_name)
@@ -76,7 +76,7 @@ module Dataflow
       end
     end
 
-    def await_execution_completion(completion_queue, expected_completion_count)
+    def await_execution_completion(node, completion_queue, expected_completion_count)
       completed_message_indexes = []
       unblock = Queue.new
 
@@ -84,6 +84,10 @@ module Dataflow
         data = JSON.parse(payload)
         unblock.enq(data['error']) if data['error'].present?
 
+        # Support adding the data to the compute's data_node is the
+        # remote process returns anything.
+        node.data_node&.add(records: data['data']) if data['data'].present?
+
         completed_message_indexes << data['msg_id']
         if completed_message_indexes.count == expected_completion_count
           unblock.enq(false)
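
The completion consumer now looks for a 'data' key next to 'msg_id' and 'error'. A sketch of the JSON payload a remote batch worker could publish on the completion queue (values are illustrative):

    completion_payload = {
      'msg_id' => 3,                                # batch message being acknowledged
      'data'   => [{ 'id' => 1, 'total' => 9.99 }], # optional records to add to the compute's data node
      'error'  => nil                               # or { 'message' => ..., 'backtrace' => [...] }
    }.to_json
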
data/lib/dataflow/node.rb
CHANGED
@@ -30,10 +30,27 @@ module Dataflow
       true
     end
 
+    def all_dependencies
+      []
+    end
+
     def required_by
       Dataflow::Nodes::ComputeNode.where(dependency_ids: _id).map { |node|
         { node: node, type: 'dependency' }
       }
     end
+
+    def metadata
+      metadata = {
+        _id: self._id,
+        _type: self._type,
+      }
+      properties_data = self.class.properties.keys.map do |property_name|
+        value = self[property_name]
+        [property_name, value]
+      end.to_h
+
+      metadata.merge(properties_data)
+    end
   end
 end
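
#metadata flattens a node into a plain hash: its id, its class name and every declared Mongoid property. This is what #export serializes into metadata.yaml and what #import feeds back to klass.create. An illustrative return value for a hypothetical data node:

    node.metadata
    # => {
    #      _id: BSON::ObjectId('...'),
    #      _type: 'Dataflow::Nodes::DataNode',
    #      name: 'users',
    #      db_name: 'app_db',
    #      use_double_buffering: true,
    #      ...
    #    }
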
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -351,8 +351,7 @@ module Dataflow
       # If you override, you may want to override the make_batch_params as well.
       def execute_local_batch_computation(batch_params)
         records = dependencies.first.all(where: batch_params)
-
-        data_node&.add(records: new_records)
+        compute_batch(records: records)
       end
 
       # Interface used to retrieve the params for scheduled batchs. Override when needed.
@@ -583,9 +582,7 @@ module Dataflow
 
       def parallel_each(itr)
         # before fork: always disconnect currently used connections.
-
-        Dataflow::Adapters::MongoDbAdapter.disconnect_clients
-        Mongoid.disconnect_clients
+        disconnect_db_clients
 
         # set to true to debug code in the iteration
         is_debugging_impl = ENV['DEBUG_DATAFLOW']
@@ -600,12 +597,16 @@ module Dataflow
 
         Parallel.each(itr, opts) do |*args|
           yield(*args)
-
-          Dataflow::Adapters::MongoDbAdapter.disconnect_clients
-          Mongoid.disconnect_clients
+          disconnect_db_clients
         end
       end
 
+      def disconnect_db_clients
+        Dataflow::Adapters::SqlAdapter.disconnect_clients
+        Dataflow::Adapters::MongoDbAdapter.disconnect_clients
+        Mongoid.disconnect_clients
+      end
+
       def logger
         @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
       end
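
Two behavioural notes fall out of this hunk: execute_local_batch_computation no longer writes its own output (it returns whatever compute_batch produces, and the caller decides where to add it), and every fork boundary now resets SQL connections as well as Mongo ones via disconnect_db_clients. A hypothetical subclass, with the compute_batch signature inferred from the call site above:

    class UpcaseNode < Dataflow::Nodes::ComputeNode
      def compute_batch(records:)
        # return the computed records; the calling worker adds them to the data node
        records.map { |r| r.merge('name' => r['name'].to_s.upcase) }
      end
    end
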
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -143,8 +143,8 @@ module Dataflow
       # can be called to stream the results rather than load everything in memory.
       # Other methods can also be called depending on the backend,
       # the downside being back-end portability (use at your own risk).
-      def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, &block)
-        db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, &block)
+      def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, include_system_id: false, &block)
+        db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, include_system_id: include_system_id, &block)
       end
 
       # Supports paginating efficiently through the dataset.
@@ -316,9 +316,9 @@ module Dataflow
       end
 
       def required_by
-        super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map
+        super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map do |node|
           { node: node, type: 'dataset' }
-
+        end
       end
 
       # this is not safe if there is some parallel processing going on
@@ -341,15 +341,44 @@ module Dataflow
       end
 
       # Dump a backup of this dataset to a file.
-      # @return [String] the filepath to the dump file.
+      # @return [String] the filepath to the dump file. The filename is
+      #   formatted as <node_name>.<read_dataset_idx>.<ext>
       def dump_dataset(base_folder: './dump')
-
+        read_idx = 0
+        read_idx = read_dataset_idx if use_double_buffering
+
+        db_adapter.dump(base_folder: base_folder, read_dataset_idx: read_idx)
       end
 
       # Restore a dump of this dataset
-      # @param files [String] the filepath to the dump file.
+      # @param files [String] the filepath to the dump file. The filename has
+      #   to be formatted as <node_name>.<read_dataset_idx>.<ext>
       def restore_dataset(filepath:)
-
+        filename = filepath.split('/')[-1]
+        read_idx = if filename.count('.') < 2
+                     # for compatibility reasons: previously we were not
+                     # exporting the read idx in the filename
+                     0
+                   else
+                     filename.split('.')[1].to_i
+                   end
+
+        raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a single buffer dataset but node '#{name}' is expecting a double buffered one." if read_idx == 0 && use_double_buffering
+        raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a double buffer dataset but node '#{name}' is expecting a single buffered one." if read_idx > 0 && !use_double_buffering
+
+        if use_double_buffering
+          dataset_name = valid_dataset_names[read_idx - 1]
+        else
+          dataset_name = name
+        end
+
+        db_adapter.restore(filepath: filepath, dataset_name: dataset_name)
+        self.read_dataset_idx = read_idx
+        save
+
+        db_adapter.update_settings(data_node: self)
+
+        true
       end
 
       private
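
The dump filename convention and its round trip through restore, sketched for a hypothetical double-buffered node:

    node = Dataflow::Nodes::DataNode.find_by(db_name: 'app_db', name: 'users')

    path = node.dump_dataset(base_folder: './dump')
    # => "./dump/app_db/users.2.gz"   # ".2": the read buffer that was dumped

    # Later, possibly on another machine with a matching node definition:
    node.restore_dataset(filepath: './dump/app_db/users.2.gz')
    # raises if the file's buffer index does not match the node's buffering mode
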
data/lib/dataflow/nodes/read_only_data_node.rb
CHANGED
@@ -3,11 +3,22 @@ module Dataflow
   module Nodes
     # Only supports read operations
     class ReadOnlyDataNode < DataNode
+
+      # Support overriding which dataset to read from.
+      # Use this to decouple the name from the dataset name
+      # it will actually access.
+      field :dataset_name, type: String
+
       def set_defaults
         super
         self.use_double_buffering = false
       end
 
+      def read_dataset_name
+        return dataset_name if dataset_name.present?
+        super
+      end
+
       def handle_dataset_settings_changed
         # ignore - do not do anyhing
       end
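
With the new dataset_name field, a read-only node can keep a friendly node name while reading from a differently named table or collection. A minimal sketch (all names hypothetical):

    Dataflow::Nodes::ReadOnlyDataNode.create(
      name: 'legacy_users',     # how the node is referenced in the flow
      db_name: 'app_db',
      dataset_name: 'users_v1'  # the collection/table actually read
    )
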
data/lib/dataflow/remote_worker.rb
CHANGED
@@ -36,9 +36,9 @@ module Dataflow
           return
         end
 
-
+        results = execute(node, data)
         response = { msg_id: data['msg_id'] }
-        response.merge(
+        response.merge(results[0])
       rescue Mongoid::Errors::DocumentNotFound => e
         { error: { message: e.message, backtrace: e.backtrace } }
       end
@@ -47,21 +47,27 @@ module Dataflow
       # execute in a different process, so that once it's finished
       # we can purge the memory
       Parallel.map([payload_data]) do |data|
-
+        result = {}
         logger.log("[#{data['msg_id']}] working on '#{node.name}'...")
 
         begin
           if data['is_batch']
-            node.execute_local_batch_computation(data['params'])
+            records = node.execute_local_batch_computation(data['params'])
+            # in ruby, we already have access to the node, so we
+            # add the data directly here instead of returning it through
+            # the queue. The default batch behavior on other languages
+            # is to return the output data in the 'data' key, e.g.:
+            # result['data] = records
+            node.data_node&.add(records: records)
          else
             node.execute_local_computation
           end
         rescue StandardError => e
-
+          result = { error: { message: e.message, backtrace: e.backtrace } }
         end
 
         logger.log("[#{data['msg_id']}] done working on '#{node.name}'.")
-
+        result
       end
     end
 
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.14.0
+  version: 0.15.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-07-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -210,16 +210,16 @@ dependencies:
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '='
       - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.20'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '='
       - !ruby/object:Gem::Version
-        version: '0.
+        version: '0.20'
 - !ruby/object:Gem::Dependency
   name: sequel_pg
   requirement: !ruby/object:Gem::Requirement