dataflow-rb 0.9.0

Files changed (47)
  1. checksums.yaml +7 -0
  2. data/.env.test.example +6 -0
  3. data/.gitignore +14 -0
  4. data/.rspec +2 -0
  5. data/.travis.yml +4 -0
  6. data/Gemfile +4 -0
  7. data/LICENSE +21 -0
  8. data/README.md +46 -0
  9. data/Rakefile +6 -0
  10. data/bin/console +14 -0
  11. data/bin/setup +7 -0
  12. data/dataflow-rb.gemspec +42 -0
  13. data/lib/config/mongoid.yml +21 -0
  14. data/lib/dataflow/adapters/csv_adapter.rb +123 -0
  15. data/lib/dataflow/adapters/mongo_db_adapter.rb +307 -0
  16. data/lib/dataflow/adapters/mysql_adapter.rb +21 -0
  17. data/lib/dataflow/adapters/psql_adapter.rb +21 -0
  18. data/lib/dataflow/adapters/settings.rb +33 -0
  19. data/lib/dataflow/adapters/sql_adapter.rb +322 -0
  20. data/lib/dataflow/errors/invalid_configuration_error.rb +7 -0
  21. data/lib/dataflow/errors/not_implemented_error.rb +7 -0
  22. data/lib/dataflow/event_mixin.rb +77 -0
  23. data/lib/dataflow/extensions/mongo_driver.rb +21 -0
  24. data/lib/dataflow/extensions/msgpack.rb +19 -0
  25. data/lib/dataflow/logger.rb +27 -0
  26. data/lib/dataflow/node.rb +37 -0
  27. data/lib/dataflow/nodes/compute_node.rb +495 -0
  28. data/lib/dataflow/nodes/data_node.rb +331 -0
  29. data/lib/dataflow/nodes/export/to_csv_node.rb +54 -0
  30. data/lib/dataflow/nodes/filter/drop_while_node.rb +117 -0
  31. data/lib/dataflow/nodes/filter/newest_node.rb +66 -0
  32. data/lib/dataflow/nodes/filter/where_node.rb +44 -0
  33. data/lib/dataflow/nodes/join_node.rb +151 -0
  34. data/lib/dataflow/nodes/map_node.rb +50 -0
  35. data/lib/dataflow/nodes/merge_node.rb +33 -0
  36. data/lib/dataflow/nodes/mixin/add_internal_timestamp.rb +27 -0
  37. data/lib/dataflow/nodes/mixin/rename_dotted_fields.rb +63 -0
  38. data/lib/dataflow/nodes/select_keys_node.rb +39 -0
  39. data/lib/dataflow/nodes/snapshot_node.rb +77 -0
  40. data/lib/dataflow/nodes/sql_query_node.rb +50 -0
  41. data/lib/dataflow/nodes/transformation/to_time_node.rb +41 -0
  42. data/lib/dataflow/nodes/upsert_node.rb +68 -0
  43. data/lib/dataflow/properties_mixin.rb +35 -0
  44. data/lib/dataflow/schema_mixin.rb +134 -0
  45. data/lib/dataflow/version.rb +4 -0
  46. data/lib/dataflow-rb.rb +72 -0
  47. metadata +371 -0
data/lib/dataflow/nodes/compute_node.rb
@@ -0,0 +1,495 @@
# frozen_string_literal: true
module Dataflow
  module Nodes
    # Represents a computation. May store its output in a separate data node.
    # It depends on other data nodes to compute its own data.
    class ComputeNode
      include Mongoid::Document
      include Dataflow::Node
      include Dataflow::PropertiesMixin
      include Dataflow::EventMixin
      include Dataflow::SchemaMixin

      event :computing_started    # handler(node)
      event :computing_progressed # handler(node, pct_complete)
      event :computing_finished   # handler(node, state)

      delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
               :db_backend, :db_name, :use_symbols?,
               :schema, :read_dataset_name, :write_dataset_name,
               to: :data_node

      #############################################
      # Dependencies definition
      #############################################
      class << self
        def dependency_opts
          @dependency_opts || {}
        end

        def data_node_opts
          @data_node_opts || {}
        end

        # DSL to be used while making computed nodes. It supports enforcing
        # validations by checking whether there are exactly, at least (min)
        # or at most (max) a given number of dependencies. Usage:
        #   class MyComputeNode < ComputeNode
        #     ensure_dependencies exactly: 1 # could be e.g.: min: 3, or max: 5
        #   end
        def ensure_dependencies(opts)
          raise Dataflow::Errors::InvalidConfigurationError, "ensure_dependencies must be given a hash. Received: #{opts.class}" unless opts.is_a?(Hash)
          valid_keys = %i(exactly min max).freeze
          has_attributes = (valid_keys - opts.keys).count < valid_keys.count
          raise Dataflow::Errors::InvalidConfigurationError, "ensure_dependencies must have at least one of 'min', 'max' or 'exactly' attributes set. Given: #{opts.keys}" unless has_attributes

          add_property(:dependency_ids, opts)
          @dependency_opts = opts
        end

        # DSL to ensure that a data node must be set before a computed node
        # can be recomputed (as it will presumably use it to store data).
        def ensure_data_node_exists
          @data_node_opts = { ensure_exists: true }
        end
      end

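      # Illustrative sketch, not part of the gem: a node class combining the
      # two DSLs above. The class name and option values are hypothetical.
      #
      #   class MyComputeNode < Dataflow::Nodes::ComputeNode
      #     ensure_data_node_exists     # require a data node to write to
      #     ensure_dependencies min: 1  # require at least one dependency
      #   end
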
      # The node name
      field :name, type: String

      # The data node to which we will write the computation output
      field :data_node_id, type: BSON::ObjectId

      # Whether to clear the data from the data node before computing
      field :clear_data_on_compute, type: Boolean, default: true

      # The dependencies this node requires for computing.
      field :dependency_ids, type: Array, default: []

      # The maximum record count that should be used
      # per process during computation.
      field :limit_per_process, type: Integer, default: 0

      # Automatic recomputing interval, in seconds. 0 disables it.
      field :recompute_interval, type: Integer, default: 0

      # Used as a computing lock. Will be set to 'computing'
      # if currently computing, or nil otherwise.
      field :computing_state, type: String, editable: false

      # When the computing started.
      field :computing_started_at, type: Time, editable: false

      # The last time a successful computation started.
      field :last_compute_starting_time, type: Time, editable: false

      # Necessary fields:
      validates_presence_of :name

      # Before create: run default initializations
      before_create :set_defaults

      # Sets the default parameters before creating the object.
      def set_defaults
        # support setting the fields with a Document rather
        # than an ObjectId. Handle the transformations here:
        if data_node_id.present?
          self.data_node_id = data_node_id._id unless data_node_id.is_a?(BSON::ObjectId)

          # the data node's use_double_buffering setting
          # must match clear_data_on_compute:
          if data_node.use_double_buffering != clear_data_on_compute
            data_node.use_double_buffering = clear_data_on_compute
            data_node.save
          end
        end

        # Again, support having an ObjectId or a document.
        self.dependency_ids = dependency_ids.map { |dep|
          next dep if dep.is_a? BSON::ObjectId
          dep._id
        }

        # Update the data node schema with the required schema
        # for this computed node.
        data_node&.update_schema(required_schema)
      end

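      # Illustrative sketch, not part of the gem: thanks to set_defaults,
      # both documents and ids are accepted at creation time. The variable
      # names below are hypothetical.
      #
      #   node = Dataflow::Nodes::ComputeNode.create(
      #     name: 'my_compute_node',
      #     data_node_id: data_node,       # a DataNode document or its id
      #     dependency_ids: [source_node]  # documents or ids
      #   )
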
      # Fetch the data node if it is set
      def data_node
        @data_node ||= Dataflow::Nodes::DataNode.find(data_node_id) if data_node_id.present?
      end

      # Override the relation because self.dependencies is not ordered.
      def dependencies(reload: false)
        return @dependencies if @dependencies.present? && !reload
        @dependencies = dependency_ids.map do |x|
          Dataflow::Node.find(x)
        end
      end

      # Retrieve the whole dependency tree.
      def all_dependencies
        (dependencies + dependencies.flat_map(&:all_dependencies)).uniq
      end

      # Returns false if any of our dependencies has
      # been updated after our last update.
      # We define a computed node's last update as the time it started its
      # last successful update (instead of the time it completed it, as
      # dependencies may have changed in the meantime).
      # @return [Boolean]
      def updated?
        return false if updated_at.blank?

        dependencies.each do |dependency|
          return false unless dependency.updated?
          return false if dependency.updated_at > updated_at
        end
        true
      end

      # Keep a uniform interface with a DataNode.
      def updated_at
        last_compute_starting_time
      end

      def updated_at=(val)
        self.last_compute_starting_time = val
      end

      # Checks whether an automatic recomputing is needed.
      # @return [Boolean]
      def needs_automatic_recomputing?
        interval = recompute_interval.to_i
        return false if interval <= 0
        return false if updated?
        return false if locked_for_computing?
        return true if updated_at.blank?

        updated_at + interval.seconds < Time.now
      end

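      # Illustrative sketch, not part of the gem: a minimal polling loop
      # that could drive automatic recomputing. The interval and the scope
      # of nodes polled are hypothetical.
      #
      #   loop do
      #     Dataflow::Nodes::ComputeNode.all.each do |node|
      #       node.recompute if node.needs_automatic_recomputing?
      #     end
      #     sleep 30
      #   end
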
      # Updates the dependencies that need to be recomputed,
      # then computes this node's own data.
      # @param force_recompute [Boolean] if true, computes
      # even if the node is already up to date.
      def recompute(depth: 0, force_recompute: false)
        logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
        start_time = Time.now

        parallel_each(dependencies) do |dependency|
          logger.log "#{'>' * (depth + 1)} #{name} checking deps: #{dependency.name}..."
          if !dependency.updated? || force_recompute
            dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
          end
        end

        # Dependencies' data may have changed in a child process.
        # Reload to make sure we have the latest metadata.
        logger.log "#{'>' * (depth + 1)} #{name} reloading dependencies..."
        dependencies(reload: true)

        compute(depth: depth, force_compute: force_recompute)
        logger.log "#{'>' * (depth + 1)} #{name} took #{Time.now - start_time} seconds to recompute."

        true
      end

      # Computes this node's data if it is not already up to date.
      # Acquires a computing lock before computing.
      # If the lock is already held, waits until the other computation
      # finishes or times out.
      # @param force_compute [Boolean] if true, computes
      # even if the node is already up to date.
      def compute(depth: 0, force_compute: false, source: nil)
        has_compute_lock = false
        validate!

        if updated? && !force_compute
          logger.log "#{'>' * (depth + 1)} #{name} is up-to-date."
          return
        end

        has_compute_lock = acquire_computing_lock!
        if has_compute_lock
          logger.log "#{'>' * (depth + 1)} #{name} started computing."
          on_computing_started
          start_time = Time.now

          # update this node's schema with the necessary fields
          data_node&.update_schema(required_schema)

          pre_compute(force_compute: force_compute)

          if clear_data_on_compute
            # Pre-compute: recreate the dataset and its unique indexes.
            data_node&.recreate_dataset(dataset_type: :write)
            data_node&.create_unique_indexes(dataset_type: :write)
          end

          compute_impl

          if clear_data_on_compute
            # Post-compute: create the other indexes now (delayed for insert speed).
            data_node&.create_non_unique_indexes(dataset_type: :write)
            # swap read/write datasets
            data_node&.swap_read_write_datasets!
          end

          self.last_compute_starting_time = start_time
          duration = Time.now - start_time
          logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
          on_computing_finished(state: 'computed')
        else
          logger.log "#{'>' * (depth + 1)} [IS AWAITING] #{name}."
          await_computing!
          logger.log "#{'>' * (depth + 1)} [IS DONE AWAITING] #{name}."
        end

      rescue StandardError => e
        on_computing_finished(state: 'error', error: e) if has_compute_lock
        logger.log "#{'>' * (depth + 1)} [ERROR] #{name} failed computing: #{e}"
        raise
      ensure
        release_computing_lock! if has_compute_lock
        true
      end

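      # Illustrative sketch, not part of the gem: triggering a computation
      # by hand. The node name is hypothetical.
      #
      #   node = Dataflow::Nodes::ComputeNode.find_by(name: 'my_compute_node')
      #   node.recompute                    # refreshes stale dependencies first
      #   node.compute(force_compute: true) # recomputes this node only
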
      # Checks whether this node can compute or not.
      # Errors are added to the ActiveModel errors.
      # @return [Boolean] true if it has no errors and can be computed.
      def valid_for_computation?
        # Perform additional checks: also add errors to "self.errors"
        opts = self.class.dependency_opts
        if opts.key?(:exactly)
          ensure_exact_dependencies(count: opts[:exactly])
        elsif opts.key?(:max)
          ensure_at_most_dependencies(count: opts[:max])
        else # even if the min is not specified, we need at least 1 dependency
          ensure_at_least_dependencies(count: opts[:min] || 1)
        end
        ensure_no_cyclic_dependencies
        ensure_keys_are_set
        ensure_data_node_exists if self.class.data_node_opts[:ensure_exists]

        errors.count == 0
      end

      # Check this node's locking status.
      # @return [Boolean] Whether this node is locked or not.
      def locked_for_computing?
        computing_state == 'computing'
      end

      # Force the release of this node's computing lock.
      # Do not use unless there is a problem with the lock.
      def force_computing_lock_release!
        release_computing_lock!
      end

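      # Illustrative sketch, not part of the gem: inspecting validation
      # errors before computing.
      #
      #   unless node.valid_for_computation?
      #     puts node.errors.full_messages
      #   end
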
      private

      # Compute implementation:
      # - recreate the table
      # - compute the records
      # - save them to the DB
      # (the process may be overridden on a per-node basis if needed)
      def compute_impl
        process_parallel(node: dependencies.first)
      end

      def process_parallel(node:)
        record_count = node.count
        return if record_count == 0

        equal_split_per_process = (record_count / Parallel.processor_count.to_f).ceil
        count_per_process = equal_split_per_process
        limit = limit_per_process.to_i
        count_per_process = [limit, equal_split_per_process].min if limit > 0

        queries = node.ordered_system_id_queries(batch_size: count_per_process)

        parallel_each(queries.each_with_index) do |query, idx|
          progress = (idx / queries.count.to_f * 100).ceil
          on_computing_progressed(pct_complete: progress)

          records = node.all(where: query)

          new_records = if block_given?
                          yield records
                        else
                          compute_batch(records: records)
                        end

          data_node.add(records: new_records)
        end
      end

      # This is an interface only.
      # Override with record computation logic.
      def compute_batch(records:)
        records
      end

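      # Illustrative sketch, not part of the gem: a subclass overriding
      # compute_batch with its own record computation logic. The class and
      # field names are hypothetical.
      #
      #   class DoubledAmountNode < Dataflow::Nodes::ComputeNode
      #     ensure_dependencies exactly: 1
      #
      #     def compute_batch(records:)
      #       records.map { |r| r.merge('amount' => r['amount'].to_f * 2) }
      #     end
      #   end
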
      def acquire_computing_lock!
        # make sure that any pending changes are saved.
        save
        find_query = { _id: _id, computing_state: { '$ne' => 'computing' } }
        update_query = { '$set' => { computing_state: 'computing', computing_started_at: Time.now } }
        # send a query directly to avoid mongoid's caching layers
        res = Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
        # reload the model data after the query above
        reload
        # the query is atomic, so if res is not nil, we acquired the lock
        !res.nil?
      end

      def release_computing_lock!
        # make sure that any pending changes are saved.
        save
        find_query = { _id: _id }
        update_query = { '$set' => { computing_state: nil, computing_started_at: nil } }
        # send a query directly to avoid mongoid's caching layers
        Dataflow::Nodes::ComputeNode.where(find_query).find_one_and_update(update_query)
        # reload the model data after the query above
        reload
      end

      def await_computing!
        start_waiting_at = Time.now
        # TODO: should the max wait time be dependent on e.g. the recompute interval?
        max_wait_time = 15.minutes
        while Time.now < start_waiting_at + max_wait_time
          sleep 2
          # reload with the data stored on mongodb:
          # something may have been changed by another process.
          reload
          return unless locked_for_computing?
        end

        raise StandardError, "Awaiting computing on #{name} reached timeout."
      end

      # Interface only. Re-implement for node-specific behavior before computing.
      def pre_compute(force_compute:); end

      # Override to define a required schema.
      def required_schema
        schema
      end

      ##############################
      # Dependency validations
      ##############################

      def ensure_no_cyclic_dependencies
        node_map = Dataflow::Nodes::ComputeNode.all.map { |n| [n._id, n] }.to_h

        dep_ids = (dependency_ids || [])
        dep_ids.each do |dependency_id|
          next unless has_dependency_in_hierarchy?(node_map[dependency_id], dependency_id, node_map)
          error_msg = "Dependency to node #{dependency_id} ('#{node_map[dependency_id].name}') is cyclic."
          errors.add(:dependency_ids, error_msg)
        end
      end

      def has_dependency_in_hierarchy?(node, dependency_id, node_map)
        # if we reach a node that does not exist, this branch of the
        # hierarchy does not contain the given dependency_id
        return false if node.blank?
        # the node depends directly on dependency_id: we found it in the hierarchy
        return true if (node.dependency_ids || []).include?(dependency_id)
        (node.dependency_ids || []).any? do |dep_id|
          has_dependency_in_hierarchy?(node_map[dep_id], dependency_id, node_map)
        end
      end

      def ensure_no_cyclic_dependencies!
        ensure_no_cyclic_dependencies
        raise_dependendy_errors_if_needed!
      end

      def ensure_exact_dependencies(count:)
        # we need to use .size, not .count,
        # for the mongo relation to work as expected
        current_count = (dependency_ids || []).size
        return if current_count == count

        error_msg = "Expecting exactly #{count} dependencies. Has #{current_count} dependencies."
        errors.add(:dependency_ids, error_msg)
      end

      def ensure_at_least_dependencies(count:)
        # we need to use .size, not .count,
        # for the mongo relation to work as expected
        current_count = (dependency_ids || []).size
        return if current_count >= count

        error_msg = "Expecting at least #{count} dependencies. Has #{current_count} dependencies."
        errors.add(:dependency_ids, error_msg)
      end

      def ensure_at_most_dependencies(count:)
        # we need to use .size, not .count,
        # for the mongo relation to work as expected
        current_count = (dependency_ids || []).size
        return if current_count <= count

        error_msg = "Expecting at most #{count} dependencies. Has #{current_count} dependencies."
        errors.add(:dependency_ids, error_msg)
      end

      def ensure_keys_are_set
        required_keys = self.class.properties.select { |_k, opts| opts[:required_for_computing] }
        required_keys.each do |key, opts|
          errors.add(key, "#{self.class}.#{key} must be set for computing.") if self[key].nil?
          if opts[:values].is_a?(Array)
            # make sure the key's value is one of the possible values
            errors.add(key, "#{self.class}.#{key} must be set to one of #{opts[:values].join(', ')}. Given: #{self[key]}") unless opts[:values].include?(self[key])
          end
        end
      end

      def ensure_data_node_exists
        if data_node_id.blank?
          error_msg = 'Expecting a data node to be set.'
          errors.add(:data_node_id, error_msg)
          return
        end

        # the data node id is present. Check that it can be found:
        Dataflow::Nodes::DataNode.find(data_node.id)
      rescue Mongoid::Errors::DocumentNotFound
        # it was not found:
        error_msg = "No data node was found for Id: '#{data_node_id}'."
        errors.add(:data_node_id, error_msg)
      end

      def parallel_each(itr)
        # before fork: always disconnect currently used connections.
        Dataflow::Adapters::SqlAdapter.disconnect_clients
        Dataflow::Adapters::MongoDbAdapter.disconnect_clients
        Mongoid.disconnect_clients

        # set to true to debug code in the iteration
        is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
        if is_debugging_impl # || true
          itr.each do |*args|
            yield(*args)
          end
        else
          Parallel.each(itr) do |*args|
            yield(*args)
            Dataflow::Adapters::SqlAdapter.disconnect_clients
            Dataflow::Adapters::MongoDbAdapter.disconnect_clients
            Mongoid.disconnect_clients
          end
        end
      end

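      # Illustrative note, not part of the gem: to run the iteration
      # serially and step through it with a debugger, enable the env
      # toggles checked above, e.g.:
      #
      #   RACK_ENV=test DEBUG=1 bundle exec rspec
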
      def logger
        @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
      end
    end # class ComputeNode
  end # module Nodes
end # module Dataflow