dataflow-rb 0.14.0 → 0.15.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +17 -0
- data/dataflow-rb.gemspec +1 -1
- data/lib/dataflow-rb.rb +81 -1
- data/lib/dataflow/adapters/csv_adapter.rb +1 -1
- data/lib/dataflow/adapters/mongo_db_adapter.rb +7 -6
- data/lib/dataflow/adapters/psql_adapter.rb +5 -6
- data/lib/dataflow/adapters/sql_adapter.rb +6 -2
- data/lib/dataflow/executor.rb +7 -3
- data/lib/dataflow/node.rb +17 -0
- data/lib/dataflow/nodes/compute_node.rb +9 -8
- data/lib/dataflow/nodes/data_node.rb +37 -8
- data/lib/dataflow/nodes/read_only_data_node.rb +11 -0
- data/lib/dataflow/remote_worker.rb +12 -6
- data/lib/dataflow/version.rb +1 -1
- metadata +6 -6
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '081f28d6c668f92bfe5da20f2301136af28949ae'
+  data.tar.gz: 39dd214829a164c21b0c8c6b0d3406f423c84e82
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 56db86c9444331cfa4d7ab41d6210066aff33ed4aabe0e139131ce42659f94be09f3a241a14580bec68871dc18bc5c371d075f3c5831689c5b08e982ff12e639
+  data.tar.gz: 05a3a5e3eab0b89aa89046f9c70c68c6cbdb4cc59c672dd327d4dc069299a964f7f1f34e3dca299e9d097c8da4c486054a2447b6f700e9958bd32a90fd6b3794
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,22 @@
 # Changelog
 
+#### 0.15.0
+
+- [4b48b74] Make sure computed data is included in the dump
+- [54fd18d] Added support for #export / #import of a dataflow with its data
+- [a63972f] Add #metadata to the nodes
+- [696ea35] Add #all_dependencies interface
+- [5165c71] Fix re-using the same variable in the RemoteWorker results
+- [216a066] Only warn once about a missing node
+- [c101144] Support getting data back from remote batch workers and adding it to the data node.
+- [9a06ee3] Support remapping the dataset from which ReadOnlyDataNodes read
+- [2fc623a] Set the clean-if-exists option on pg restore. Pin pg to 0.20
+- [205317c] Support including the system id in the data
+- [0b9b578] Fix restoring postgresql indexes and other constraints
+- [e396265] Make sure indexes are recreated in postgres after a restore
+- [426300a] Add the exported dataset idx to the dump filename. Make sure when restoring that the settings are compatible
+- [ca44a9d] Set the no-owner flag when restoring to psql
+
 #### 0.14.0
 - [ef8ddcd] Do not assume a minimum of 1 dependency per compute node.
 - [b131bb1] Add type check on the data node #add methods. Filter nil values.
data/dataflow-rb.gemspec
CHANGED
@@ -36,7 +36,7 @@ Gem::Specification.new do |spec|
   spec.add_dependency 'mongoid', '~>6.0'
   spec.add_dependency 'sequel', '~>4.0'
   spec.add_dependency 'mysql2', '~>0.4'
-  spec.add_dependency 'pg', '
+  spec.add_dependency 'pg', '0.20'
   spec.add_dependency 'sequel_pg', '~>1.6'
   spec.add_dependency 'msgpack', '~>1.0'
   spec.add_dependency 'smarter_csv', '1.1.0'
data/lib/dataflow-rb.rb
CHANGED
@@ -80,6 +80,80 @@ module Dataflow
   def self.clear_tmp_datasets
     Dataflow::Nodes::DataNode.all.each(&:safely_clear_write_dataset)
   end
+
+  # Exports nodes and their data. Use #import to re-import them elsewhere.
+  def self.export(nodes:, export_dir: './flows', include_data: false)
+    raise ArgumentError, 'nodes must be an array of nodes' unless nodes.is_a?(Array)
+    # make a tmp folder with the export dir
+    archive_name = "flow_#{Time.now.strftime("%Y-%m-%d_%H-%M-%S")}"
+    tmp_dir = "#{export_dir}/#{archive_name}"
+    `mkdir -p #{tmp_dir}`
+
+    # export all the dependencies
+    all_nodes = nodes + nodes.flat_map(&:all_dependencies)
+    # and all the compute node's datasets
+    all_nodes += all_nodes.select { |x| x.is_a?(Dataflow::Nodes::ComputeNode) }
+                          .map { |x| x.data_node }
+    # get all the nodes' metadata in the yaml format
+    metadata_yaml = all_nodes.compact.uniq.map(&:metadata).to_yaml
+    File.write("#{tmp_dir}/metadata.yaml", metadata_yaml)
+
+    # add the dataset's data if necessary
+    if include_data
+      all_nodes.select { |x| x.is_a?(Dataflow::Nodes::DataNode) }
+               .each { |x| x.dump_dataset(base_folder: tmp_dir) }
+    end
+
+    # pack all the content in a tar archive
+    archive_path = "#{archive_name}.tar"
+    `(cd #{export_dir} && tar -cvf #{archive_path} #{archive_name})`
+
+    # clear the tmp folder
+    `rm -rf #{tmp_dir}`
+
+    "#{export_dir}/#{archive_path}"
+  end
+
+  def self.import(archive_path:)
+    raise ArgumentError, 'expecting a tar archive file' unless archive_path.end_with?('.tar')
+
+    # extract the tar
+    folder_name = archive_path.split('/')[-1].split('.')[0]
+    `tar -xvf #{archive_path}`
+
+    # load and restore the content in the metadata.yaml
+    metadata = YAML.load_file("#{folder_name}/metadata.yaml")
+
+    # restore the nodes
+    metadata.each do |m|
+      klass = m[:_type].constantize
+
+      # try to delete previously existing node
+      begin
+        previous_node = klass.find(m[:_id])
+        previous_node.delete
+      rescue Mongoid::Errors::DocumentNotFound
+      end
+
+      # create the node
+      klass.create(m)
+    end
+
+    # look for dataset dumps and restore them
+    filepaths = Dir["./#{folder_name}/**/*.gz"] + Dir["./#{folder_name}/**/*.dump"]
+
+    filepaths.each do |filepath|
+      # filepath: "./folder/db_name/dataset.1.gz"
+      db_name = filepath.split('/')[2]
+      dataset = filepath.split('/')[3].split('.')[0]
+      n = Dataflow::Nodes::DataNode.find_by(db_name: db_name, name: dataset)
+      n.restore_dataset(filepath: filepath)
+    end
+
+    # clean up the extracted folder
+    `rm -rf #{folder_name}`
+  end
 end
 
 ###############################################################################
@@ -93,7 +167,13 @@ module Dataflow
     super
   rescue NameError => e
     raise e unless e.message =~ /Dataflow::Nodes/
-
+
+    @name_errors ||= Set.new
+    unless @name_errors.include?(e.message)
+      p "Warning -- Node class not found. #{e}"
+      @name_errors << e.message
+    end
+
     Dataflow::Nodes::ComputeNode
   end
 end
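To see how the two new entry points fit together, here is a rough usage sketch (the node name and folders are hypothetical, not part of the diff):

```ruby
# Sketch only: assumes a compute node named 'my_compute_node' already exists.
nodes = [Dataflow::Nodes::ComputeNode.find_by(name: 'my_compute_node')]

# Packs ./flows/flow_<timestamp>.tar with metadata.yaml and, because
# include_data is true, one dump file per data node.
archive = Dataflow.export(nodes: nodes, include_data: true)

# Elsewhere, with access to the target databases: recreate the nodes
# from metadata.yaml and restore the dumped datasets.
Dataflow.import(archive_path: archive)
```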
data/lib/dataflow/adapters/csv_adapter.rb
CHANGED
@@ -31,7 +31,7 @@ module Dataflow
     end
 
     # retrieve all elements from a data node
-    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0, include_system_id: false)
       SmarterCSV.process(file_path, strings_as_keys: true)
     rescue Errno::ENOENT => e
       []
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED
@@ -26,6 +26,7 @@ module Dataflow
       def disconnect_clients
         @clients ||= {}
         @clients.values.each(&:close)
+        @clients = {}
       end
     end
 
@@ -48,10 +49,10 @@ module Dataflow
     end
 
     # retrieve all elements from a data node
-    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0, include_system_id: false)
       projection = fields.map { |field| [field, 1] }
 
-      unless fields.map(&:to_s).include?(SYSTEM_ID)
+      unless include_system_id || fields.map(&:to_s).include?(SYSTEM_ID)
         # by default, do not select the _id field
         projection << [SYSTEM_ID, 0].freeze
       end
@@ -225,8 +226,8 @@ module Dataflow
       }
     end
 
-    def dump(base_folder:)
-      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.gz"
+    def dump(base_folder:, read_dataset_idx:)
+      archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.#{read_dataset_idx}.gz"
       options = "--archive=#{archive_path} --db=#{@settings.db_name} --collection=#{read_dataset_name} "
       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
@@ -238,8 +239,8 @@ module Dataflow
       archive_path
     end
 
-    def restore(filepath:)
-      options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{
+    def restore(filepath:, dataset_name:)
+      options = "--archive=#{filepath} --db=#{@settings.db_name} --collection=#{dataset_name} "
       options += "--host=#{@settings.db_host} " if @settings.db_host.present?
       options += "--port=#{@settings.db_port} " if @settings.db_port.present?
       options += "--username=#{@settings.db_user} " if @settings.db_user.present?
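A small sketch of what the new include_system_id flag changes for reads (the node name and field are made up):

```ruby
# Sketch only: 'users' is a hypothetical MongoDB-backed data node.
node = Dataflow::Nodes::DataNode.find_by(name: 'users')

node.all(fields: ['email'])                           # '_id' excluded, as before
node.all(fields: ['email'], include_system_id: true)  # records keep their '_id'
```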
data/lib/dataflow/adapters/psql_adapter.rb
CHANGED
@@ -25,8 +25,8 @@ module Dataflow
        '~*'
      end
 
-      def dump(base_folder:)
-        archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.dump"
+      def dump(base_folder:, read_dataset_idx:)
+        archive_path = "#{base_folder}/#{@settings.db_name}/#{@settings.dataset_name}.#{read_dataset_idx}.dump"
        options = "--table=public.#{@settings.read_dataset_name} "
        options += "--host=#{@settings.db_host} " if @settings.db_host.present?
        options += "--port=#{@settings.db_port} " if @settings.db_port.present?
@@ -38,14 +38,13 @@ module Dataflow
        archive_path
      end
 
-      def restore(filepath:)
-        options = "--
+      def restore(filepath:, dataset_name:)
+        options = "-v --clean --if-exists --no-owner "
        options += "--host=#{@settings.db_host} " if @settings.db_host.present?
        options += "--port=#{@settings.db_port} " if @settings.db_port.present?
-        options += "--username=#{@settings.db_user} " if @settings.db_user.present?
+        options += "--username=#{@settings.db_user} --role=#{@settings.db_user} " if @settings.db_user.present?
        password = "PGPASSWORD=#{@settings.db_password} " if @settings.db_password.present?
 
-        drop_dataset(@settings.read_dataset_name)
        `#{password}pg_restore #{options} -Fc --dbname=#{@settings.db_name} #{filepath}`
      end
    end
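For reference, with host, port, user and password all set in the node settings, the options built above roughly amount to the invocation sketched below (placeholders, not output captured from the gem):

```ruby
# Illustration only; angle-bracketed values stand for the node's settings.
#
#   PGPASSWORD=<password> pg_restore -v --clean --if-exists --no-owner \
#     --host=<host> --port=<port> --username=<user> --role=<user> \
#     -Fc --dbname=<db_name> <filepath>
#
# --clean --if-exists drops existing objects before restoring (so indexes and
# other constraints come back cleanly), while --no-owner and --role avoid
# ownership errors when the restoring user differs from the dumping user.
```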
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -53,6 +53,7 @@ module Dataflow
      def disconnect_clients
        @clients ||= {}
        @clients.values.each(&:disconnect)
+        @clients = {}
      end
    end
 
@@ -77,12 +78,15 @@ module Dataflow
    end
 
    # retrieve all elements from a data node
-    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0)
+    def all(where: {}, fields: [], sort: {}, offset: 0, limit: 0, include_system_id: false)
      res = client[settings.read_dataset_name.to_sym]
 
      # if there is no fields, automatically
      # select all the fields expect the system _id
-
+      if fields.blank?
+        fields = res.columns
+        fields = fields.reject { |x| x == SYSTEM_ID } unless include_system_id
+      end
 
      res = res.select(*fields.map(&:to_sym)) if fields.present?
      res = apply_query(res, where)
data/lib/dataflow/executor.rb
CHANGED
@@ -28,7 +28,7 @@ module Dataflow
      logger.log("Opened a completion queue for '#{node.name}': #{completion_queue.name}")
 
      messages = send_execution_messages(channel, node, is_batch_execution, completion_queue.name)
-      error_data = await_execution_completion(completion_queue, messages.count)
+      error_data = await_execution_completion(node, completion_queue, messages.count)
      logger.log("Finished processing '#{node.name}'")
 
      raise Errors::RemoteExecutionError.new(error_data['message'], error_data['backtrace']) if error_data
@@ -43,7 +43,7 @@ module Dataflow
      ch = conn.create_channel
      completion_queue = ch.queue('', exclusive: true)
 
-
+      [conn, ch, completion_queue]
    end
 
    def send_execution_messages(channel, node, is_batch_execution, completion_queue_name)
@@ -76,7 +76,7 @@ module Dataflow
      end
    end
 
-    def await_execution_completion(completion_queue, expected_completion_count)
+    def await_execution_completion(node, completion_queue, expected_completion_count)
      completed_message_indexes = []
      unblock = Queue.new
 
@@ -84,6 +84,10 @@ module Dataflow
        data = JSON.parse(payload)
        unblock.enq(data['error']) if data['error'].present?
 
+        # Support adding the data to the compute's data_node is the
+        # remote process returns anything.
+        node.data_node&.add(records: data['data']) if data['data'].present?
+
        completed_message_indexes << data['msg_id']
        if completed_message_indexes.count == expected_completion_count
          unblock.enq(false)
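The completion payload the executor now consumes looks roughly like this (a sketch of the shape implied by the code above, with made-up values, not a documented schema):

```ruby
# Parsed JSON message published by a remote batch worker.
payload = {
  'msg_id' => 3,                          # acknowledges one of the sent batch messages
  'data'   => [{ 'name' => 'example' }],  # optional: appended to node.data_node when present
  # 'error' => { 'message' => '...', 'backtrace' => [...] }  # optional: aborts the wait
}
```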
data/lib/dataflow/node.rb
CHANGED
@@ -30,10 +30,27 @@ module Dataflow
      true
    end
 
+    def all_dependencies
+      []
+    end
+
    def required_by
      Dataflow::Nodes::ComputeNode.where(dependency_ids: _id).map { |node|
        { node: node, type: 'dependency' }
      }
    end
+
+    def metadata
+      metadata = {
+        _id: self._id,
+        _type: self._type,
+      }
+      properties_data = self.class.properties.keys.map do |property_name|
+        value = self[property_name]
+        [property_name, value]
+      end.to_h
+
+      metadata.merge(properties_data)
+    end
  end
end
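A sketch of what the two new methods return for a plain node (node name and values are illustrative):

```ruby
# Sketch only: 'users' is a hypothetical DataNode.
node = Dataflow::Nodes::DataNode.find_by(name: 'users')

node.metadata
# => { _id: BSON::ObjectId('...'), _type: 'Dataflow::Nodes::DataNode',
#      name: 'users', db_name: 'my_db', ... }  # one key per Mongoid property

# The base implementation reports no dependencies; compute nodes are expected
# to return their dependency chain, which is what Dataflow.export relies on.
node.all_dependencies # => []
```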
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -351,8 +351,7 @@ module Dataflow
    # If you override, you may want to override the make_batch_params as well.
    def execute_local_batch_computation(batch_params)
      records = dependencies.first.all(where: batch_params)
-
-      data_node&.add(records: new_records)
+      compute_batch(records: records)
    end
 
    # Interface used to retrieve the params for scheduled batchs. Override when needed.
@@ -583,9 +582,7 @@ module Dataflow
 
    def parallel_each(itr)
      # before fork: always disconnect currently used connections.
-
-      Dataflow::Adapters::MongoDbAdapter.disconnect_clients
-      Mongoid.disconnect_clients
+      disconnect_db_clients
 
      # set to true to debug code in the iteration
      is_debugging_impl = ENV['DEBUG_DATAFLOW']
@@ -600,12 +597,16 @@ module Dataflow
 
      Parallel.each(itr, opts) do |*args|
        yield(*args)
-
-        Dataflow::Adapters::MongoDbAdapter.disconnect_clients
-        Mongoid.disconnect_clients
+        disconnect_db_clients
      end
    end
 
+    def disconnect_db_clients
+      Dataflow::Adapters::SqlAdapter.disconnect_clients
+      Dataflow::Adapters::MongoDbAdapter.disconnect_clients
+      Mongoid.disconnect_clients
+    end
+
    def logger
      @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
    end
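With the execute_local_batch_computation change above, a batch computation simply returns its records and the caller (remote worker or executor) decides where to write them. A minimal subclass could look like the following sketch (class and field names are hypothetical):

```ruby
# Sketch only: upper-cases a 'name' field from the first dependency's records.
class UppercaseNamesNode < Dataflow::Nodes::ComputeNode
  def compute_batch(records:)
    # The returned records are added to the data node by the worker/executor,
    # not by this method.
    records.map { |r| r.merge('name' => r['name'].to_s.upcase) }
  end
end
```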
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -143,8 +143,8 @@ module Dataflow
    # can be called to stream the results rather than load everything in memory.
    # Other methods can also be called depending on the backend,
    # the downside being back-end portability (use at your own risk).
-    def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, &block)
-      db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, &block)
+    def all(where: {}, fields: [], sort: {}, limit: 0, offset: 0, include_system_id: false, &block)
+      db_adapter.all(where: where, fields: fields, sort: sort, limit: limit, offset: offset, include_system_id: include_system_id, &block)
    end
 
    # Supports paginating efficiently through the dataset.
@@ -316,9 +316,9 @@ module Dataflow
    end
 
    def required_by
-      super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map
+      super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map do |node|
        { node: node, type: 'dataset' }
-
+      end
    end
 
    # this is not safe if there is some parallel processing going on
@@ -341,15 +341,44 @@ module Dataflow
    end
 
    # Dump a backup of this dataset to a file.
-    # @return [String] the filepath to the dump file.
+    # @return [String] the filepath to the dump file. The filename is
+    #         formatted as <node_name>.<read_dataset_idx>.<ext>
    def dump_dataset(base_folder: './dump')
-
+      read_idx = 0
+      read_idx = read_dataset_idx if use_double_buffering
+
+      db_adapter.dump(base_folder: base_folder, read_dataset_idx: read_idx)
    end
 
    # Restore a dump of this dataset
-    # @param files [String] the filepath to the dump file.
+    # @param files [String] the filepath to the dump file. The filename has
+    #        to be formatted as <node_name>.<read_dataset_idx>.<ext>
    def restore_dataset(filepath:)
-
+      filename = filepath.split('/')[-1]
+      read_idx = if filename.count('.') < 2
+                   # for compatibility reasons: previously we were not
+                   # exporting the read idx in the filename
+                   0
+                 else
+                   filename.split('.')[1].to_i
+                 end
+
+      raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a single buffer dataset but node '#{name}' is expecting a double buffered one." if read_idx == 0 && use_double_buffering
+      raise "Called #restore_dataset with incompatible datasets settings: #{filepath} contains a double buffer dataset but node '#{name}' is expecting a single buffered one." if read_idx > 0 && !use_double_buffering
+
+      if use_double_buffering
+        dataset_name = valid_dataset_names[read_idx - 1]
+      else
+        dataset_name = name
+      end
+
+      db_adapter.restore(filepath: filepath, dataset_name: dataset_name)
+      self.read_dataset_idx = read_idx
+      save
+
+      db_adapter.update_settings(data_node: self)
+
+      true
    end
 
    private
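Putting the dump/restore pieces together, a round-trip now looks roughly like this (paths and names are illustrative):

```ruby
# Sketch only: 'users' is a hypothetical MongoDB-backed data node.
node = Dataflow::Nodes::DataNode.find_by(name: 'users')

path = node.dump_dataset(base_folder: './dump')
# e.g. "./dump/<db_name>/users.2.gz" for a double-buffered node currently
# reading buffer 2; single-buffered nodes dump with index 0.

# Restoring checks that the dump's buffering mode matches this node's settings,
# restores into the matching dataset and points the node at it.
node.restore_dataset(filepath: path)
```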
data/lib/dataflow/nodes/read_only_data_node.rb
CHANGED
@@ -3,11 +3,22 @@ module Dataflow
  module Nodes
    # Only supports read operations
    class ReadOnlyDataNode < DataNode
+
+      # Support overriding which dataset to read from.
+      # Use this to decouple the name from the dataset name
+      # it will actually access.
+      field :dataset_name, type: String
+
      def set_defaults
        super
        self.use_double_buffering = false
      end
 
+      def read_dataset_name
+        return dataset_name if dataset_name.present?
+        super
+      end
+
      def handle_dataset_settings_changed
        # ignore - do not do anyhing
      end
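A sketch of the remapping in practice (all names below are made up):

```ruby
# Sketch only: a read-only node whose logical name differs from the
# physical dataset it reads.
Dataflow::Nodes::ReadOnlyDataNode.create(
  name: 'legacy_users',       # how the rest of the flow refers to it
  db_name: 'my_db',
  dataset_name: 'users_2016'  # read_dataset_name now resolves to this dataset
)
```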
data/lib/dataflow/remote_worker.rb
CHANGED
@@ -36,9 +36,9 @@ module Dataflow
        return
      end
 
-
+      results = execute(node, data)
      response = { msg_id: data['msg_id'] }
-      response.merge(
+      response.merge(results[0])
    rescue Mongoid::Errors::DocumentNotFound => e
      { error: { message: e.message, backtrace: e.backtrace } }
    end
@@ -47,21 +47,27 @@ module Dataflow
      # execute in a different process, so that once it's finished
      # we can purge the memory
      Parallel.map([payload_data]) do |data|
-
+        result = {}
        logger.log("[#{data['msg_id']}] working on '#{node.name}'...")
 
        begin
          if data['is_batch']
-            node.execute_local_batch_computation(data['params'])
+            records = node.execute_local_batch_computation(data['params'])
+            # in ruby, we already have access to the node, so we
+            # add the data directly here instead of returning it through
+            # the queue. The default batch behavior on other languages
+            # is to return the output data in the 'data' key, e.g.:
+            # result['data] = records
+            node.data_node&.add(records: records)
          else
            node.execute_local_computation
          end
        rescue StandardError => e
-
+          result = { error: { message: e.message, backtrace: e.backtrace } }
        end
 
        logger.log("[#{data['msg_id']}] done working on '#{node.name}'.")
-
+        result
      end
    end
 
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.14.0
+  version: 0.15.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-07-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -210,16 +210,16 @@ dependencies:
   name: pg
   requirement: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '='
     - !ruby/object:Gem::Version
-      version: '0.
+      version: '0.20'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
-    - -
+    - - '='
     - !ruby/object:Gem::Version
-      version: '0.
+      version: '0.20'
 - !ruby/object:Gem::Dependency
   name: sequel_pg
   requirement: !ruby/object:Gem::Requirement