dataflow-rb 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.env.test.example +6 -0
- data/.gitignore +14 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +21 -0
- data/README.md +46 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/dataflow-rb.gemspec +42 -0
- data/lib/config/mongoid.yml +21 -0
- data/lib/dataflow/adapters/csv_adapter.rb +123 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
- data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
- data/lib/dataflow/adapters/psql_adapter.rb +21 -0
- data/lib/dataflow/adapters/settings.rb +33 -0
- data/lib/dataflow/adapters/sql_adapter.rb +322 -0
- data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
- data/lib/dataflow/errors/not_implemented_error.rb +7 -0
- data/lib/dataflow/event_mixin.rb +77 -0
- data/lib/dataflow/extensions/mongo_driver.rb +21 -0
- data/lib/dataflow/extensions/msgpack.rb +19 -0
- data/lib/dataflow/logger.rb +27 -0
- data/lib/dataflow/node.rb +37 -0
- data/lib/dataflow/nodes/compute_node.rb +495 -0
- data/lib/dataflow/nodes/data_node.rb +331 -0
- data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
- data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
- data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
- data/lib/dataflow/nodes/filter/where_node.rb +44 -0
- data/lib/dataflow/nodes/join_node.rb +151 -0
- data/lib/dataflow/nodes/map_node.rb +50 -0
- data/lib/dataflow/nodes/merge_node.rb +33 -0
- data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
- data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
- data/lib/dataflow/nodes/select_keys_node.rb +39 -0
- data/lib/dataflow/nodes/snapshot_node.rb +77 -0
- data/lib/dataflow/nodes/sql_query_node.rb +50 -0
- data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
- data/lib/dataflow/nodes/upsert_node.rb +68 -0
- data/lib/dataflow/properties_mixin.rb +35 -0
- data/lib/dataflow/schema_mixin.rb +134 -0
- data/lib/dataflow/version.rb +4 -0
- data/lib/dataflow-rb.rb +72 -0
- metadata +371 -0
@@ -0,0 +1,495 @@
|
|
1
|
+
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Represents a computation. May store its output in a separate data node.
    # It depends on other data nodes to compute its own data.
    class ComputeNode
      include Mongoid::Document
      include Dataflow::Node
      include Dataflow::PropertiesMixin
      include Dataflow::EventMixin
      include Dataflow::SchemaMixin

      # Lifecycle events fired around the computing flow.
      event :computing_started # handler(node)
      event :computing_progressed # handler(node, pct_complete)
      event :computing_finished # handler(node, state)

      # Expose the backing data node's query/metadata interface directly
      # on the compute node.
      delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
               :db_backend, :db_name, :use_symbols?,
               :schema, :read_dataset_name, :write_dataset_name,
               to: :data_node

      #############################################
      # Dependencies definition
      #############################################
|
25
|
+
class << self
  # Options previously declared through ensure_dependencies.
  # @return [Hash] empty when no constraint was declared.
  def dependency_opts
    @dependency_opts || {}
  end

  # Options previously declared through ensure_data_node_exists.
  # @return [Hash] empty when no constraint was declared.
  def data_node_opts
    @data_node_opts || {}
  end

  # DSL to be used while making computed nodes. It supports enforcing validations
  # by checking whether there is exactly, at_least (min) or at_most (max)
  # a given number of dependencies. Usage:
  #   class MyComputeNode < ComputeNode
  #     ensure_dependencies exactly: 1 # could be e.g.: min: 3, or max: 5
  #   end
  # @raise [Dataflow::Errors::InvalidConfigurationError] if opts is not a
  #   hash or contains none of the supported keys.
  def ensure_dependencies(opts)
    raise Dataflow::Errors::InvalidConfigurationError, "ensure_dependencies must be given a hash. Received: #{opts.class}" unless opts.is_a?(Hash)
    valid_keys = %i(exactly min max).freeze
    # true when at least one of the valid keys is present in opts
    has_attributes = (valid_keys - opts.keys).count < valid_keys.count
    raise Dataflow::Errors::InvalidConfigurationError, "ensure_dependencies must have at least one of 'min', 'max' or 'exactly' attributes set. Given: #{opts.keys}" unless has_attributes

    add_property(:dependency_ids, opts)
    @dependency_opts = opts
  end

  # DSL to ensure that a data node must be set before a computed node
  # can be recomputed (as it will presumably use it to store data).
  def ensure_data_node_exists
    @data_node_opts = { ensure_exists: true }
  end
end
|
56
|
+
|
57
|
+
# The node name
|
58
|
+
field :name, type: String
|
59
|
+
|
60
|
+
# The data node to which we will write the computation output
|
61
|
+
field :data_node_id, type: BSON::ObjectId
|
62
|
+
|
63
|
+
# Whether to clear the data from the data node before computing
|
64
|
+
field :clear_data_on_compute, type: Boolean, default: true
|
65
|
+
|
66
|
+
# The dependencies this node requires for computing.
|
67
|
+
field :dependency_ids, type: Array, default: []
|
68
|
+
|
69
|
+
# Represents the maximum record count that should be used
|
70
|
+
# per process during computation.
|
71
|
+
field :limit_per_process, type: Integer, default: 0
|
72
|
+
|
73
|
+
# Use automatic recomputing interval. In seconds.
|
74
|
+
field :recompute_interval, type: Integer, default: 0
|
75
|
+
|
76
|
+
# Used as a computing lock. Will be set to 'computing'
|
77
|
+
# if currently computing or nil otherwise.
|
78
|
+
field :computing_state, type: String, editable: false
|
79
|
+
|
80
|
+
# When has the computing started.
|
81
|
+
field :computing_started_at, type: Time, editable: false
|
82
|
+
|
83
|
+
# Indicates the last time a successful computation has started.
|
84
|
+
field :last_compute_starting_time, type: Time, editable: false
|
85
|
+
|
86
|
+
# Necessary fields:
|
87
|
+
validates_presence_of :name
|
88
|
+
|
89
|
+
# Before create: run default initializations
|
90
|
+
before_create :set_defaults
|
91
|
+
|
92
|
+
# Sets the default parameters before creating the object.
def set_defaults
  # support setting the fields with a Document rather
  # than an ObjectId. Handle the transformations here:
  if data_node_id.present?
    self.data_node_id = data_node_id._id unless data_node_id.is_a?(BSON::ObjectId)

    # the data node use_double_buffering setting
    # must match clear_data_on_compute:
    if data_node.use_double_buffering != clear_data_on_compute
      data_node.use_double_buffering = clear_data_on_compute
      data_node.save
    end
  end

  # Again support having an ObjectId or a document.
  self.dependency_ids = dependency_ids.map { |dep|
    next dep if dep.is_a? BSON::ObjectId
    dep._id
  }

  # Update the data node schema with the required schema
  # for this computed node.
  data_node&.update_schema(required_schema)
end
|
117
|
+
|
118
|
+
# Fetch the data node if it is set.
# The lookup result is memoized after the first successful fetch.
def data_node
  return nil unless data_node_id.present?
  @data_node ||= Dataflow::Nodes::DataNode.find(data_node_id)
end
|
122
|
+
|
123
|
+
# Override the relation because self.dependencies is not ordered.
# The dependency nodes are memoized; pass reload: true to refetch them.
def dependencies(reload: false)
  if @dependencies.present? && !reload
    @dependencies
  else
    @dependencies = dependency_ids.map { |node_id| Dataflow::Node.find(node_id) }
  end
end
|
130
|
+
|
131
|
+
# Retrieve the whole dependency tree: the direct dependencies plus
# everything they transitively depend on, deduplicated.
def all_dependencies
  direct = dependencies
  (direct + direct.flat_map(&:all_dependencies)).uniq
end
|
135
|
+
|
136
|
+
# Returns false if any of our dependencies has
# been updated after our last update.
# We define a computed node's last update as the time it started its
# last successful update (instead of the time it completed it, as
# dependencies may have changed in the mean time).
# @return [Boolean]
def updated?
  # never successfully computed yet
  return false if updated_at.blank?

  dependencies.each do |dependency|
    # a dependency that is itself stale makes this node stale
    return false unless dependency.updated?
    return false if dependency.updated_at > updated_at
  end
  true
end
|
151
|
+
|
152
|
+
# Keep a uniform interface with a DataNode.
# @return [Time, nil] the start time of the last successful compute
def updated_at
  last_compute_starting_time
end

# Keep a uniform interface with a DataNode (setter counterpart).
def updated_at=(val)
  self.last_compute_starting_time = val
end
|
160
|
+
|
161
|
+
# Checks whether an automatic recomputing is needed.
# Only applies when a positive recompute_interval is configured and the
# node is neither up to date nor currently locked for computing.
# @return [Boolean]
def needs_automatic_recomputing?
  interval = recompute_interval.to_i
  return false if interval <= 0 || updated? || locked_for_computing?
  # never computed before: recompute right away
  return true if updated_at.blank?

  Time.now > updated_at + interval.seconds
end
|
172
|
+
|
173
|
+
# Update the dependencies that need to be updated
# and then compute its own data.
# @param depth [Integer] used only to indent log output.
# @param force_recompute [Boolean] if true, computes
#        even if the node is already up to date.
def recompute(depth: 0, force_recompute: false)
  logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
  start_time = Time.now

  # recompute any stale dependency first (possibly in forked processes)
  parallel_each(dependencies) do |dependency|
    logger.log "#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}..."
    if !dependency.updated? || force_recompute
      dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
    end
  end

  # Dependencies data may have changed in a child process.
  # Reload to make sure we have the latest metadata.
  logger.log "#{'>' * (depth + 1)} #{name} reloading dependencies..."
  dependencies(reload: true)

  compute(depth: depth, force_compute: force_recompute)
  logger.log "#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute."

  true
end
|
198
|
+
|
199
|
+
# Compute this node's data if not already updated.
# Acquires a computing lock before computing.
# In the eventuality that the lock is already acquired, it awaits
# until it finishes or times out.
# @param depth [Integer] used only to indent log output.
# @param force_compute [Boolean] if true, computes
#        even if the node is already up to date.
# @param source [Object] accepted but not used in this implementation.
def compute(depth: 0, force_compute: false, source: nil)
  has_compute_lock = false
  validate!

  if updated? && !force_compute
    logger.log "#{'>' * (depth + 1)} #{name} is up-to-date."
    return
  end

  has_compute_lock = acquire_computing_lock!
  if has_compute_lock
    logger.log "#{'>' * (depth + 1)} #{name} started computing."
    on_computing_started
    start_time = Time.now

    # update this node's schema with the necessary fields
    data_node&.update_schema(required_schema)

    pre_compute(force_compute: force_compute)

    if clear_data_on_compute
      # Pre-compute, we recreate the table, the unique indexes
      data_node&.recreate_dataset(dataset_type: :write)
      data_node&.create_unique_indexes(dataset_type: :write)
    end

    compute_impl

    if clear_data_on_compute
      # Post-compute, delay creating other indexes for insert speed
      data_node&.create_non_unique_indexes(dataset_type: :write)
      # swap read/write datasets
      data_node&.swap_read_write_datasets!
    end

    # record the start time so staleness checks compare against it
    self.last_compute_starting_time = start_time
    duration = Time.now - start_time
    logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
    on_computing_finished(state: 'computed')
  else
    # another process holds the lock: wait for it to finish instead
    logger.log "#{'>' * (depth + 1)} [IS AWAITING] #{name}."
    await_computing!
    logger.log "#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}."
  end

rescue StandardError => e
  on_computing_finished(state: 'error', error: e) if has_compute_lock
  logger.log "#{'>' * (depth + 1)} [ERROR] #{name} failed computing: #{e}"
  raise
ensure
  # only release the lock if this process actually acquired it
  release_computing_lock! if has_compute_lock
  true
end
|
258
|
+
|
259
|
+
# Check whether this node can compute or not.
# Errors are added to the active model errors.
# NOTE(review): when both :min and :max are declared, only :max is
# enforced here (the elsif skips the min branch) — confirm this is intended.
# @return [Boolean] true if it has no errors and can be computed.
def valid_for_computation?
  # Perform additional checks: also add errors to "self.errors"
  opts = self.class.dependency_opts
  if opts.key?(:exactly)
    ensure_exact_dependencies(count: opts[:exactly])
  elsif opts.key?(:max)
    ensure_at_most_dependencies(count: opts[:max])
  else # even if the min is not specified, we need at least 1 dependency
    ensure_at_least_dependencies(count: opts[:min] || 1)
  end
  ensure_no_cyclic_dependencies
  ensure_keys_are_set
  ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]

  # idiomatic emptiness check instead of comparing count to 0
  errors.empty?
end
|
278
|
+
|
279
|
+
# Check this node's locking status.
# @return [Boolean] whether this node is currently locked for computing.
def locked_for_computing?
  computing_state == 'computing'
end
|
284
|
+
|
285
|
+
# Force the release of this node's computing lock.
# Do not use unless there is a problem with the lock.
def force_computing_lock_release!
  release_computing_lock!
end
|
290
|
+
|
291
|
+
private

# Compute implementation:
# - recreate the table
# - compute the records
# - save them to the DB
# (the process may be overwritten on a per-node basis if needed)
# Default behavior: process the first dependency's records in parallel.
def compute_impl
  process_parallel(node: dependencies.first)
end
|
301
|
+
|
302
|
+
# Reads the given node's records in batches (one batch per process) and
# adds the computed output to the data node.
# Yields each batch of records when a block is given; otherwise
# falls back to compute_batch.
# @param node [Dataflow::Node] the source node whose records are processed.
def process_parallel(node:)
  record_count = node.count
  return if record_count == 0

  # split the workload evenly across processors...
  equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
  count_per_process = equal_split_per_process
  # ...but cap the batch size when limit_per_process is set
  limit = limit_per_process.to_i
  count_per_process = [limit, equal_split_per_process].min if limit > 0

  queries = node.ordered_system_id_queries(batch_size: count_per_process)

  parallel_each(queries.each_with_index) do |query, idx|
    # report progress as a percentage of processed batches
    progress = (idx / queries.count.to_f * 100).ceil
    on_computing_progressed(pct_complete: progress)

    records = node.all(where: query)

    new_records = if block_given?
                    yield records
                  else
                    compute_batch(records: records)
                  end

    data_node.add(records: new_records)
  end
end
|
328
|
+
|
329
|
+
# This is an interface only.
# Override with record computation logic.
# @param records [Enumerable] the batch of input records.
# @return the records to store (identity in this base implementation).
def compute_batch(records:)
  records
end
|
334
|
+
|
335
|
+
# Atomically acquire the computing lock by setting computing_state to
# 'computing' only when it is not already set to that value.
# @return [Boolean] true when this process acquired the lock.
def acquire_computing_lock!
  # make sure that any pending changes are saved.
  save
  find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
  update_query = { '$set' => { computing_state: 'computing', computing_started_at: Time.now } }
  # send a query directly to avoid mongoid's caching layers
  res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
  # reload the model data after the query above
  reload
  # the query is atomic so if res != nil, we acquired the lock
  !res.nil?
end
|
347
|
+
|
348
|
+
# Release the computing lock by clearing the computing state fields.
def release_computing_lock!
  # make sure that any pending changes are saved.
  save
  find_query = { _id: _id }
  update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
  # send a query directly to avoid mongoid's caching layers
  Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
  # reload the model data after the query above
  reload
end
|
358
|
+
|
359
|
+
# Poll (every 2 seconds) until another process releases the computing lock.
# @raise [StandardError] when the wait exceeds the 15-minute timeout.
def await_computing!
  start_waiting_at = Time.now
  # TODO: should the max wait time be dependent on e.g. the recompute interval?
  max_wait_time = 15.minutes
  while Time.now < start_waiting_at + max_wait_time
    sleep 2
    # reloads with the data stored on mongodb:
    # something may have been changed by another process.
    reload
    return unless locked_for_computing?
  end

  raise StandardError, "Awaiting computing on #{name} reached timeout."
end
|
373
|
+
|
374
|
+
# Interface only. Re-implement for node-specific behavior before computing
def pre_compute(force_compute:); end

# Override to define a required schema.
# Defaults to the data node's schema (via delegation).
def required_schema
  schema
end
|
381
|
+
|
382
|
+
##############################
# Dependency validations
##############################

# Adds a validation error for every direct dependency that can reach
# this dependency again through the dependency graph (i.e. a cycle).
def ensure_no_cyclic_dependencies
  # index every compute node by id for fast hierarchy traversal
  node_map = Dataflow::Nodes::ComputeNode.all.map { |n| [n._id, n] }.to_h

  dep_ids = (dependency_ids || [])
  dep_ids.each do |dependency_id|
    next unless has_dependency_in_hierarchy?(node_map[dependency_id], dependency_id, node_map)
    # fix: the user-facing message used to read "cylic"
    error_msg = "Dependency to node #{dependency_id} ('#{node_map[dependency_id].name}') is cyclic."
    errors.add(:dependency_ids, error_msg)
  end
end
|
396
|
+
|
397
|
+
# Walks the dependency graph starting at `node` looking for dependency_id.
# @return [Boolean] true when node, or any node reachable from it,
#   lists dependency_id as a dependency.
def has_dependency_in_hierarchy?(node, dependency_id, node_map)
  # an unknown/missing node cannot contain the dependency
  return false if node.blank?

  direct_deps = node.dependency_ids || []
  return true if direct_deps.include?(dependency_id)

  # recurse into each child until found or the hierarchy is exhausted
  direct_deps.any? do |dep_id|
    has_dependency_in_hierarchy?(node_map[dep_id], dependency_id, node_map)
  end
end
|
406
|
+
|
407
|
+
# Same as ensure_no_cyclic_dependencies, but raises on failure instead of
# only accumulating validation errors.
# NOTE(review): `raise_dependendy_errors_if_needed!` is not defined in this
# file and the name looks misspelled ("dependendy") — verify it exists
# elsewhere under exactly that spelling.
def ensure_no_cyclic_dependencies!
  ensure_no_cyclic_dependencies
  raise_dependendy_errors_if_needed!
end
|
411
|
+
|
412
|
+
# Validates that this node has exactly `count` dependencies set,
# adding a validation error otherwise.
def ensure_exact_dependencies(count:)
  # we need to use .size, not .count
  # for the mongo relation to work as expected
  actual = (dependency_ids || []).size
  return if actual == count

  errors.add(:dependency_ids, "Expecting exactly #{count} dependencies. Has #{actual} dependencies.")
end
|
421
|
+
|
422
|
+
# Validates that this node has at least `count` dependencies set,
# adding a validation error otherwise.
def ensure_at_least_dependencies(count:)
  # we need to use .size, not .count
  # for the mongo relation to work as expected
  actual = (dependency_ids || []).size
  return if actual >= count

  errors.add(:dependency_ids, "Expecting at least #{count} dependencies. Has #{actual} dependencies.")
end
|
431
|
+
|
432
|
+
# Validates that this node has at most `count` dependencies set,
# adding a validation error otherwise.
def ensure_at_most_dependencies(count:)
  # we need to use .size, not .count
  # for the mongo relation to work as expected
  actual = (dependency_ids || []).size
  return if actual <= count

  errors.add(:dependency_ids, "Expecting at most #{count} dependencies. Has #{actual} dependencies.")
end
|
441
|
+
|
442
|
+
# Validates that every property flagged with :required_for_computing is
# set, and (when the property declares a :values array) that its value is
# one of the allowed values. Errors are added to the model errors.
def ensure_keys_are_set
  required_keys = self.class.properties.select { |_k, opts| opts[:required_for_computing] }
  required_keys.each do |key, opts|
    errors.add(key, "#{self.class}.#{key} must be set for computing.") if self[key].nil?
    if opts[:values].is_a?(Array)
      # make sure the key's value is one of the possible values
      errors.add(key, "#{self.class}.#{key} must be set to one of #{opts[:values].join(', ')}. Given: #{self[key]}") unless opts[:values].include?(self[key])
    end
  end
end
|
452
|
+
|
453
|
+
# Validates that data_node_id is set and refers to an existing
# Dataflow::Nodes::DataNode document. Adds validation errors otherwise.
def ensure_data_node_exists
  if data_node_id.blank?
    error_msg = 'Expecting a data node to be set.'
    errors.add(:data_node_id, error_msg)
    return
  end

  # The data node id is present. Check that the document can be found.
  # fix: look the document up by data_node_id directly instead of going
  # through the memoized #data_node accessor, which issued a redundant
  # second find on the same id.
  Dataflow::Nodes::DataNode.find(data_node_id)
rescue Mongoid::Errors::DocumentNotFound
  # it was not found:
  error_msg = "No data node was found for Id: '#{data_node_id}'."
  errors.add(:data_node_id, error_msg)
end
|
467
|
+
|
468
|
+
# Iterates `itr`, yielding every element, normally via the Parallel gem
# (forked worker processes); runs serially when debugging in tests.
# Database clients are disconnected before forking and again inside each
# worker, since connections cannot be shared across forked processes.
def parallel_each(itr)
  # before fork: always disconnect currently used connections.
  Dataflow::Adapters::SqlAdapter.disconnect_clients
  Dataflow::Adapters::MongoDbAdapter.disconnect_clients
  Mongoid.disconnect_clients

  # set to true to debug code in the iteration
  is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
  if is_debugging_impl # || true
    # serial execution: breakpoints and exceptions behave normally
    itr.each do |*args|
      yield(*args)
    end
  else
    Parallel.each(itr) do |*args|
      yield(*args)
      Dataflow::Adapters::SqlAdapter.disconnect_clients
      Dataflow::Adapters::MongoDbAdapter.disconnect_clients
      Mongoid.disconnect_clients
    end
  end
end
|
489
|
+
|
490
|
+
# Lazily-built logger used throughout the computing flow.
def logger
  @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
end
    end # class ComputeNode
  end # module Nodes
end # module Dataflow
|