dataflow-rb 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
@@ -0,0 +1,495 @@
# frozen_string_literal: true

module Dataflow
  module Nodes
    # Represents a computation. May store its output in a separate data node.
    # It depends on other data nodes to compute its own data.
    class ComputeNode
      include Mongoid::Document
      include Dataflow::Node
      include Dataflow::PropertiesMixin
      include Dataflow::EventMixin
      include Dataflow::SchemaMixin

      event :computing_started    # handler(node)
      event :computing_progressed # handler(node, pct_complete)
      event :computing_finished   # handler(node, state)

      delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
               :db_backend, :db_name, :use_symbols?,
               :schema, :read_dataset_name, :write_dataset_name,
               to: :data_node

      #############################################
      # Dependencies definition
      #############################################
      class << self
        def dependency_opts
          @dependency_opts || {}
        end

        def data_node_opts
          @data_node_opts || {}
        end

        # DSL to be used while making computed nodes. It supports enforcing validations
        # by checking whether there is exactly, at_least (min) or at_most (max)
        # a given number of dependencies. Usage:
        #   class MyComputeNode < ComputeNode
        #     ensure_dependencies exactly: 1 # could be e.g.: min: 3, or max: 5
        #   end
        def ensure_dependencies(opts)
          raise Dataflow::Errors::InvalidConfigurationError, "ensure_dependencies must be given a hash. Received: #{opts.class}" unless opts.is_a?(Hash)
          valid_keys = %i(exactly min max).freeze
          has_attributes = (valid_keys - opts.keys).count < valid_keys.count
          raise Dataflow::Errors::InvalidConfigurationError, "ensure_dependencies must have at least one of 'min', 'max' or 'exactly' attributes set. Given: #{opts.keys}" unless has_attributes

          add_property(:dependency_ids, opts)
          @dependency_opts = opts
        end

        # DSL to ensure that a data node must be set before a computed node
        # can be recomputed (as it will presumably use it to store data).
        def ensure_data_node_exists
          @data_node_opts = { ensure_exists: true }
        end
      end
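
      # An illustrative subclass (editor's sketch, not part of the gem's source):
      #
      #   class JoinTwoNode < Dataflow::Nodes::ComputeNode
      #     ensure_data_node_exists
      #     ensure_dependencies exactly: 2
      #   end
      #
      # ensure_dependencies raises InvalidConfigurationError at class definition
      # time if given a non-hash, or a hash with none of exactly/min/max set.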

      # The node name
      field :name, type: String

      # The data node to which we will write the computation output
      field :data_node_id, type: BSON::ObjectId

      # Whether to clear the data from the data node before computing
      field :clear_data_on_compute, type: Boolean, default: true

      # The dependencies this node requires for computing.
      field :dependency_ids, type: Array, default: []

      # Represents the maximum record count that should be used
      # per process during computation.
      field :limit_per_process, type: Integer, default: 0

      # Automatic recomputing interval, in seconds. Zero disables it.
      field :recompute_interval, type: Integer, default: 0

      # Used as a computing lock. Will be set to 'computing'
      # if currently computing or nil otherwise.
      field :computing_state, type: String, editable: false

      # When the computing started.
      field :computing_started_at, type: Time, editable: false

      # Indicates the last time a successful computation has started.
      field :last_compute_starting_time, type: Time, editable: false

      # Necessary fields:
      validates_presence_of :name

      # Before create: run default initializations
      before_create :set_defaults

      # Sets the default parameters before creating the object.
      def set_defaults
        # Support setting the fields with a Document rather
        # than an ObjectId. Handle the transformations here:
        if data_node_id.present?
          self.data_node_id = data_node_id._id unless data_node_id.is_a?(BSON::ObjectId)

          # the data node's use_double_buffering setting
          # must match clear_data_on_compute:
          if data_node.use_double_buffering != clear_data_on_compute
            data_node.use_double_buffering = clear_data_on_compute
            data_node.save
          end
        end

        # Again, support having an ObjectId or a document.
        self.dependency_ids = dependency_ids.map do |dep|
          next dep if dep.is_a? BSON::ObjectId
          dep._id
        end

        # Update the data node schema with the required schema
        # for this computed node.
        data_node&.update_schema(required_schema)
      end

      # Fetch the data node if it is set
      def data_node
        @data_node ||= Dataflow::Nodes::DataNode.find(data_node_id) if data_node_id.present?
      end

      # Override the relation because self.dependencies is not ordered.
      def dependencies(reload: false)
        return @dependencies if @dependencies.present? && !reload
        @dependencies = dependency_ids.map do |x|
          Dataflow::Node.find(x)
        end
      end

      # Retrieve the whole dependency tree.
      def all_dependencies
        (dependencies + dependencies.flat_map(&:all_dependencies)).uniq
      end

      # Returns false if any of our dependencies has
      # been updated after our last update.
      # We define a computed node's last update as the time it started its
      # last successful update (instead of the time it completed it, as
      # dependencies may have changed in the meantime).
      # @return [Boolean]
      def updated?
        return false if updated_at.blank?

        dependencies.each do |dependency|
          return false unless dependency.updated?
          return false if dependency.updated_at > updated_at
        end
        true
      end
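
      # Illustrative timing (editor's note): if this node last started a
      # successful compute at 10:00 and a dependency's data was then written
      # at 10:05, updated? returns false and the node is considered stale.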

      # Keep a uniform interface with a DataNode.
      def updated_at
        last_compute_starting_time
      end

      def updated_at=(val)
        self.last_compute_starting_time = val
      end

      # Checks whether an automatic recomputing is needed.
      # @return [Boolean]
      def needs_automatic_recomputing?
        interval = recompute_interval.to_i
        return false if interval <= 0
        return false if updated?
        return false if locked_for_computing?
        return true if updated_at.blank?

        updated_at + interval.seconds < Time.now
      end
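
      # E.g. (editor's note): with recompute_interval: 3600, a node whose last
      # compute started at 09:00 becomes due again after 10:00, provided it is
      # stale (updated? is false) and not currently locked for computing.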

      # Update the dependencies that need to be updated
      # and then compute its own data.
      # @param force_recompute [Boolean] if true, computes
      #   even if the node is already up to date.
      def recompute(depth: 0, force_recompute: false)
        logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
        start_time = Time.now

        parallel_each(dependencies) do |dependency|
          logger.log "#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}..."
          if !dependency.updated? || force_recompute
            dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
          end
        end

        # Dependencies data may have changed in a child process.
        # Reload to make sure we have the latest metadata.
        logger.log "#{'>' * (depth + 1)} #{name} reloading dependencies..."
        dependencies(reload: true)

        compute(depth: depth, force_compute: force_recompute)
        logger.log "#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute."

        true
      end
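
      # Typical usage (editor's sketch; assumes a node was created and
      # persisted elsewhere, and relies on Mongoid's find_by):
      #
      #   node = Dataflow::Nodes::ComputeNode.find_by(name: 'my_node')
      #   node.recompute                        # refresh stale dependencies, then compute
      #   node.recompute(force_recompute: true) # recompute the whole tree regardless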

      # Compute this node's data if not already updated.
      # Acquires a computing lock before computing.
      # In the eventuality that the lock is already acquired, it awaits
      # until it finishes or times out.
      # @param force_compute [Boolean] if true, computes
      #   even if the node is already up to date.
      def compute(depth: 0, force_compute: false, source: nil)
        has_compute_lock = false
        validate!

        if updated? && !force_compute
          logger.log "#{'>' * (depth + 1)} #{name} is up-to-date."
          return
        end

        has_compute_lock = acquire_computing_lock!
        if has_compute_lock
          logger.log "#{'>' * (depth + 1)} #{name} started computing."
          on_computing_started
          start_time = Time.now

          # update this node's schema with the necessary fields
          data_node&.update_schema(required_schema)

          pre_compute(force_compute: force_compute)

          if clear_data_on_compute
            # Pre-compute: recreate the table and the unique indexes.
            data_node&.recreate_dataset(dataset_type: :write)
            data_node&.create_unique_indexes(dataset_type: :write)
          end

          compute_impl

          if clear_data_on_compute
            # Post-compute: delay creating other indexes for insert speed.
            data_node&.create_non_unique_indexes(dataset_type: :write)
            # swap read/write datasets
            data_node&.swap_read_write_datasets!
          end

          self.last_compute_starting_time = start_time
          duration = Time.now - start_time
          logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
          on_computing_finished(state: 'computed')
        else
          logger.log "#{'>' * (depth + 1)} [IS AWAITING] #{name}."
          await_computing!
          logger.log "#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}."
        end
      rescue StandardError => e
        on_computing_finished(state: 'error', error: e) if has_compute_lock
        logger.log "#{'>' * (depth + 1)} [ERROR] #{name} failed computing: #{e}"
        raise
      ensure
        release_computing_lock! if has_compute_lock
        true
      end

      # Checks whether this node can compute or not.
      # Errors are added to the active model errors.
      # @return [Boolean] true if it has no errors and can be computed.
      def valid_for_computation?
        # Perform additional checks: also add errors to "self.errors"
        opts = self.class.dependency_opts
        if opts.key?(:exactly)
          ensure_exact_dependencies(count: opts[:exactly])
        elsif opts.key?(:max)
          ensure_at_most_dependencies(count: opts[:max])
        else # even if the min is not specified, we need at least 1 dependency
          ensure_at_least_dependencies(count: opts[:min] || 1)
        end
        ensure_no_cyclic_dependencies
        ensure_keys_are_set
        ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]

        errors.count == 0
      end

      # Check this node's locking status.
      # @return [Boolean] Whether this node is locked or not.
      def locked_for_computing?
        computing_state == 'computing'
      end

      # Force the release of this node's computing lock.
      # Do not use unless there is a problem with the lock.
      def force_computing_lock_release!
        release_computing_lock!
      end

      private

      # Compute implementation:
      # - recreate the table
      # - compute the records
      # - save them to the DB
      # (the process may be overridden on a per-node basis if needed)
      def compute_impl
        process_parallel(node: dependencies.first)
      end

      def process_parallel(node:)
        record_count = node.count
        return if record_count == 0

        equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
        count_per_process = equal_split_per_process
        limit = limit_per_process.to_i
        count_per_process = [limit, equal_split_per_process].min if limit > 0

        queries = node.ordered_system_id_queries(batch_size: count_per_process)

        parallel_each(queries.each_with_index) do |query, idx|
          progress = (idx / queries.count.to_f * 100).ceil
          on_computing_progressed(pct_complete: progress)

          records = node.all(where: query)

          new_records = if block_given?
                          yield records
                        else
                          compute_batch(records: records)
                        end

          data_node.add(records: new_records)
        end
      end
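
      # A worked example of the batching above (editor's note): with 10_000
      # records and 4 processors, equal_split_per_process is 2_500; if
      # limit_per_process is 1_000, each batch covers at most 1_000 records
      # instead, yielding 10 batch queries rather than 4.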

      # This is an interface only.
      # Override with record computation logic.
      def compute_batch(records:)
        records
      end
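
      # A minimal override sketch (editor's example; assumes records are plain
      # hashes and uses hypothetical field names):
      #
      #   def compute_batch(records:)
      #     records.map { |r| r.merge('full_name' => "#{r['first_name']} #{r['last_name']}") }
      #   end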

      def acquire_computing_lock!
        # make sure that any pending changes are saved.
        save
        find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
        update_query = { '$set' => { computing_state: 'computing', computing_started_at: Time.now } }
        # send a query directly to avoid mongoid's caching layers
        res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
        # reload the model data after the query above
        reload
        # the query is atomic so if res != nil, we acquired the lock
        !res.nil?
      end
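
      # Editor's note: the lock relies on MongoDB's atomic findAndModify
      # semantics (find_one_and_update in the Ruby driver). If two processes
      # race, only one matches the computing_state != 'computing' filter and
      # gets a non-nil document back; the other receives nil and awaits.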

      def release_computing_lock!
        # make sure that any pending changes are saved.
        save
        find_query = { _id: _id }
        update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
        # send a query directly to avoid mongoid's caching layers
        Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
        # reload the model data after the query above
        reload
      end

      def await_computing!
        start_waiting_at = Time.now
        # TODO: should the max wait time be dependent on e.g. the recompute interval?
        max_wait_time = 15.minutes
        while Time.now < start_waiting_at + max_wait_time
          sleep 2
          # Reload with the data stored on mongodb:
          # something may have been changed by another process.
          reload
          return unless locked_for_computing?
        end

        raise StandardError, "Awaiting computing on #{name} reached timeout."
      end

      # Interface only. Re-implement for node-specific behavior before computing.
      def pre_compute(force_compute:); end

      # Override to define a required schema.
      def required_schema
        schema
      end

      ##############################
      # Dependency validations
      ##############################

      def ensure_no_cyclic_dependencies
        node_map = Dataflow::Nodes::ComputeNode.all.map { |n| [n._id, n] }.to_h

        dep_ids = (dependency_ids || [])
        dep_ids.each do |dependency_id|
          next unless has_dependency_in_hierarchy?(node_map[dependency_id], dependency_id, node_map)
          error_msg = "Dependency to node #{dependency_id} ('#{node_map[dependency_id].name}') is cyclic."
          errors.add(:dependency_ids, error_msg)
        end
      end

      def has_dependency_in_hierarchy?(node, dependency_id, node_map)
        # If we reach a node that has no more deps, then we did not find
        # the given dependency_id in the hierarchy.
        return false if node.blank?
        return true if (node.dependency_ids || []).include?(dependency_id)
        (node.dependency_ids || []).any? do |dep_id|
          has_dependency_in_hierarchy?(node_map[dep_id], dependency_id, node_map)
        end
      end
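
      # Editor's note: e.g. if node A lists B as a dependency and B lists A,
      # validating A walks B's tree (B -> A) and finds B again among A's
      # dependencies, so a cyclic-dependency error is added instead of the
      # recursion running forever at compute time.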

      def ensure_no_cyclic_dependencies!
        ensure_no_cyclic_dependencies
        raise_dependendy_errors_if_needed!
      end

      def ensure_exact_dependencies(count:)
        # we need to use .size, not .count
        # for the mongo relation to work as expected
        current_count = (dependency_ids || []).size
        return if current_count == count

        error_msg = "Expecting exactly #{count} dependencies. Has #{current_count} dependencies."
        errors.add(:dependency_ids, error_msg)
      end

      def ensure_at_least_dependencies(count:)
        # we need to use .size, not .count
        # for the mongo relation to work as expected
        current_count = (dependency_ids || []).size
        return if current_count >= count

        error_msg = "Expecting at least #{count} dependencies. Has #{current_count} dependencies."
        errors.add(:dependency_ids, error_msg)
      end

      def ensure_at_most_dependencies(count:)
        # we need to use .size, not .count
        # for the mongo relation to work as expected
        current_count = (dependency_ids || []).size
        return if current_count <= count

        error_msg = "Expecting at most #{count} dependencies. Has #{current_count} dependencies."
        errors.add(:dependency_ids, error_msg)
      end

      def ensure_keys_are_set
        required_keys = self.class.properties.select { |_k, opts| opts[:required_for_computing] }
        required_keys.each do |key, opts|
          errors.add(key, "#{self.class}.#{key} must be set for computing.") if self[key].nil?
          if opts[:values].is_a?(Array)
            # make sure the key's value is one of the possible values
            errors.add(key, "#{self.class}.#{key} must be set to one of #{opts[:values].join(', ')}. Given: #{self[key]}") unless opts[:values].include?(self[key])
          end
        end
      end
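
      # Editor's note: required_for_computing and values are property options
      # read back from PropertiesMixin. A subclass would declare something
      # like the following (hypothetical property name, inferred from the
      # add_property call in ensure_dependencies above):
      #
      #   add_property(:merge_mode, required_for_computing: true,
      #                             values: %w(append replace))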

      def ensure_data_node_exists
        if data_node_id.blank?
          error_msg = 'Expecting a data node to be set.'
          errors.add(:data_node_id, error_msg)
          return
        end

        # The data node id is present. Check that it can be found.
        Dataflow::Nodes::DataNode.find(data_node.id)
      rescue Mongoid::Errors::DocumentNotFound
        # it was not found:
        error_msg = "No data node was found for Id: '#{data_node_id}'."
        errors.add(:data_node_id, error_msg)
      end

      def parallel_each(itr)
        # before fork: always disconnect currently used connections.
        Dataflow::Adapters::SqlAdapter.disconnect_clients
        Dataflow::Adapters::MongoDbAdapter.disconnect_clients
        Mongoid.disconnect_clients

        # set to true to debug code in the iteration
        is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
        if is_debugging_impl # || true
          itr.each do |*args|
            yield(*args)
          end
        else
          Parallel.each(itr) do |*args|
            yield(*args)
            Dataflow::Adapters::SqlAdapter.disconnect_clients
            Dataflow::Adapters::MongoDbAdapter.disconnect_clients
            Mongoid.disconnect_clients
          end
        end
      end
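
      # Editor's note: Parallel.each forks worker processes by default, so
      # open database connections are dropped before the fork and again inside
      # each worker; sharing an inherited socket across forked processes would
      # otherwise corrupt the connection state.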

      def logger
        @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
      end
    end # class ComputeNode
  end # module Nodes
end # module Dataflow