dataflow-rb 0.10.2 → 0.11.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 27b271c3e39d4e3ead1ef50ba784ad127270fb36
- data.tar.gz: 9fcccbc62f7714b61a58dee11cdc823da6ff00f1
+ metadata.gz: d2ac7fa848d641d2c1fd0856ff92bb81f17bb670
+ data.tar.gz: 31eaf46d3785777d712739bc7f1a6d3ca328280e
  SHA512:
- metadata.gz: d44eee8bd3a364e25582d6710e3aaf9adbf8be9c247b99d7e8b7ecfbf91b702959ad455986306285f8ce50aba4debddc93a228e01d9c8e9c0d4eda3c779a543b
- data.tar.gz: 64e4c5fb8c4dedce76e7234cc3ce3d0c0460a7191ecb0222534d3025dc26b9046e4f00b6f08509fadbec0bdc8570523efd510174f6751a0dc693d903ee0ccd79
+ metadata.gz: bedf2430c023cef3e4408a7e213eee4f5cf206574f0a5264dbb2b7cad10defc85fff8ebdd4e859f7bce414ce3941bb2b7bbe30ffbf7c7e1194a3e0c716470047
+ data.tar.gz: e2470aa7d5aba0da5c67822f1eb8426564134d2ea6346bb627326a174c9198bcb89c1b598aa4d0d8cfb910071079f92a04970c12f9b4f65b0bc4bca4e39be20c
data/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
  # Changelog

+ #### 0.11.0
+ - [7c09e8a] Add data_node#drop_dataset! to completely drop the data
+ - [ba0532f] Added upsert on psql adapter
+ - [4d44bbd] Support setting the number of parallel processes
+ - [8b48a6b] Add support for double buffered schema inferrence on postgresql
+ - [49bfe1a] Add support for clearing unused datasets
+ - [aabd5e3] Added #required_by to the node interface
+ - [4fd2617] Handle forks having the same thread id
+ - [7fc3064] Add error logging and trace id
+ - [fbbd58b] Added heartbeats when recomputing the dependencies and before the pre-compute callback
+
+ #### 0.10.2
+ - [966e771] Do not crash if there is an unknown node type in the metadata.
+
+ #### 0.10.1
+ - [9ee24a4] Cleanly set the mongoid env Fix the bin/console script
+ - [7fdc6f1] Support symbols in schema keys when merging schemas in the join node
+ - [6c7ad5c] Fail silently if no table exists when fetching its metadata
+ - [6b0886e] Make the ComputeNode#schema public
+ - [03f37e2] Optimize the select keys node to avoid recomputing keys at each record.
+ - [23ae504] ComputeNode#schema returns the required schema
+
  #### 0.10.0
  - [2f6284c] Allow the pre-compute to modify the necessary schema
  - [cec8a1d] Do not crash if process_parallel is called without dependencies.
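
Two of the entries above add public surface to the node API: `#required_by` (to see which compute nodes reference a node) and `DataNode#drop_dataset!`. A minimal, hedged sketch of how they might be called from a console — the node name is invented for illustration:

```ruby
# Hypothetical console session; 'raw_orders' is an invented node name.
node = Dataflow::Nodes::DataNode.find_by(name: 'raw_orders')

# Lists compute nodes that reference this node, either as a dependency
# or as the dataset they write to (see the #required_by hunks below).
node.required_by.each do |usage|
  puts "#{usage[:node].name} uses this node as a #{usage[:type]}"
end

# Irreversibly drops the underlying dataset(s), including the read buffer
# when double buffering is enabled.
node.drop_dataset!
```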
@@ -166,11 +166,16 @@ module Dataflow
  # recreate the table/collection
  def recreate_dataset(dataset: nil)
  dataset ||= write_dataset_name
+ drop_dataset(dataset)
  collection = client[dataset]
- collection.drop
  collection.create
  end

+ def drop_dataset(dataset)
+ collection = client[dataset]
+ collection.drop
+ end
+
  # Create the indexes on this dataset.
  # @param dataset [String] Specify on which dataset the operation will be performed.
  # Default: the adatpter's settings' dataset.
@@ -110,6 +110,11 @@ module Dataflow
  end
  end

+ def all_paginated(where: {}, fields: [], cursor: nil)
+ # for now, retrieve all records at once
+ { 'data' => all(where: where, fields: fields), 'next_cursor' => '' }
+ end
+
  # Create queries that permit processing the whole dataset in parallel without using offsets.
  def ordered_system_id_queries(batch_size:)
  ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
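
The new `all_paginated` exposes a cursor-style interface even though, as its comment notes, it currently returns everything in one page with an empty `next_cursor`. A hedged sketch of how a caller might loop over it so the code keeps working if real pagination is added later — `adapter` and `process` are placeholders, not part of the gem:

```ruby
# 'adapter' is assumed to be an already-configured adapter instance;
# 'process' stands in for whatever the caller does with each record.
cursor = nil
loop do
  page = adapter.all_paginated(where: {}, fields: %w[id name], cursor: cursor)
  page['data'].each { |record| process(record) }
  cursor = page['next_cursor']
  break if cursor.nil? || cursor.empty?
end
```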
@@ -137,16 +142,31 @@ module Dataflow
  end

  # Save the given records
- # TODO: support :replace_by parameter
- def save(records:)
- dataset = client[settings.write_dataset_name.to_sym]
+ # @param replace_by [Array] if the replace_by key is provided,
+ # it will try to replace records with the matching key,
+ # or insert if none is found.
+ # NOTE: the replace_by keys must be UNIQUE indexes.
+ def save(records:, replace_by: nil)
+ dataset_name = settings.write_dataset_name.to_sym
+ dataset = client[dataset_name]
  columns = dataset.columns.reject { |x| x == SYSTEM_ID }

  tabular_data = records.map do |record|
  columns.map { |col| record[col] }
  end

- dataset.insert_ignore.import(columns, tabular_data)
+ if replace_by.present?
+ index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
+
+ # update every field on conflict
+ update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
+ dataset
+ .insert_conflict(target: index_keys, update: update_clause)
+ .import(columns, tabular_data)
+ else
+ # ignore insert conflicts
+ dataset.insert_conflict.import(columns, tabular_data)
+ end
  end

  # Delete records that match the options.
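
The `replace_by` path builds on Sequel's `insert_conflict`, so on PostgreSQL the import should translate into an `INSERT ... ON CONFLICT (...) DO UPDATE` statement; `:excluded__col` is Sequel's symbol notation for `excluded.col` in the Sequel version targeted here. A hedged usage sketch — the adapter instance, the records, and the unique `:id` column are illustrative only:

```ruby
# Assumes 'adapter' is a configured psql adapter whose table has columns
# id/name and a UNIQUE index on :id (replace_by keys must be unique indexes).
adapter.save(records: [{ 'id' => 1, 'name' => 'Ada' }])
adapter.save(records: [{ 'id' => 1, 'name' => 'Ada Lovelace' }], replace_by: :id)
# The second call upserts, roughly:
#   INSERT INTO ... (id, name) VALUES (1, 'Ada Lovelace')
#   ON CONFLICT (id) DO UPDATE SET id = excluded.id, name = excluded.name
```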
@@ -162,10 +182,15 @@ module Dataflow
  # recreate the table/collection
  def recreate_dataset(dataset: nil)
  dataset ||= settings.write_dataset_name.to_sym
- client.drop_table?(dataset)
+ drop_dataset(dataset)
  create_table(dataset, @schema)
  end

+ # drops the given dataset
+ def drop_dataset(dataset)
+ client.drop_table?(dataset)
+ end
+
  # Create the indexes on this dataset.
  # @param dataset [String] Specify on which dataset the operation will be performed.
  # Default: the adatpter's settings' dataset.
@@ -195,7 +220,7 @@ module Dataflow
195
220
 
196
221
  # log columns not found but do not raise an error
197
222
  if e.wrapped_exception.is_a?(PG::UndefinedColumn)
198
- logger.log("[Error] add_index on #{dataset} failed. #{e}")
223
+ logger.error(custom_message: "add_index on #{dataset} failed.", error: e)
199
224
  next
200
225
  end
201
226
 
@@ -312,8 +337,8 @@ module Dataflow
  params
  end

- def retrieve_collection_indexes(collection)
- psql_indexes = client.indexes(collection)
+ def retrieve_collection_indexes(dataset_name)
+ psql_indexes = client.indexes(dataset_name)
  psql_indexes.values.map do |idx|
  cols = idx[:columns].map(&:to_s)
  index = { 'key' => cols }
@@ -66,7 +66,7 @@ module Dataflow
  begin
  handler.call(self, *args)
  rescue StandardError => e
- @logger&.log("ERROR IN HANDLER [on_#{event_name}]: #{e}")
+ @logger&.error(error: e, custom_message: "ERROR IN HANDLER [on_#{event_name}]")
  # ignore error in handlers
  end
  end
@@ -13,11 +13,24 @@ module Dataflow
  def log(str)
  return if ENV['RACK_ENV'] == 'test'
  now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
- message = "[#{now}] #{prefix} :: #{str}"
+ message = "[#{now}][#{trace_id}] #{prefix} | #{str}"
  logger_impl = @@impl
  logger_impl.log(message)
  end

+ def error(error:, custom_message: '')
+ first_line = "[ERROR => #{error.class}: '#{error.message}']"
+ first_line += " #{custom_message}" if custom_message.present?
+ first_line += ' Backtrace: '
+ log(first_line)
+ log('--')
+ (error.backtrace || []).each_with_index { |line, idx| log("#{idx}: #{line}") }
+ end
+
+ def trace_id
+ (Process.pid + Thread.current.object_id).to_s(16)[-8..-1]
+ end
+
  class LoggerImpl
  def log(message)
  puts message
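
Because `trace_id` is derived from the process id and the current thread, every line a given worker logs carries the same eight-hex-character tag, which makes interleaved output from parallel processes easy to group. A hedged sketch of `#error` in use; the timestamp, trace id and backtrace in the comment are invented:

```ruby
logger = Dataflow::Logger.new(prefix: 'Dataflow')
begin
  raise ArgumentError, 'missing key'
rescue StandardError => e
  logger.error(error: e, custom_message: 'node_1 failed computing.')
end
# Possible output (timestamp/trace id invented):
#   [17-04-25 10:12:03][3f9a12bc] Dataflow | [ERROR => ArgumentError: 'missing key'] node_1 failed computing. Backtrace:
#   [17-04-25 10:12:03][3f9a12bc] Dataflow | --
#   [17-04-25 10:12:03][3f9a12bc] Dataflow | 0: example.rb:3:in `<main>'
```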
data/lib/dataflow/node.rb CHANGED
@@ -33,5 +33,11 @@ module Dataflow
  raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
  true
  end
+
+ def required_by
+ Dataflow::Nodes::ComputeNode.where(dependency_ids: _id).map { |node|
+ { node: node, type: 'dependency' }
+ }
+ end
  end
  end
@@ -70,6 +70,9 @@ module Dataflow
  # per process during computation.
  field :limit_per_process, type: Integer, default: 0

+ # Maximum number of processes to use in parallel. Use 1 per core when 0.
+ field :max_parallel_processes, type: Integer, default: 0
+
  # Use automatic recomputing interval. In seconds.
  field :recompute_interval, type: Integer, default: 0

@@ -179,6 +182,7 @@ module Dataflow
  # @param force_recompute [Boolean] if true, computes
  # even if the node is already up to date.
  def recompute(depth: 0, force_recompute: false)
+ send_heartbeat
  logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
  start_time = Time.now

@@ -187,6 +191,7 @@ module Dataflow
  if !dependency.updated? || force_recompute
  dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
  end
+ send_heartbeat
  end

  # Dependencies data may have changed in a child process.
@@ -226,13 +231,13 @@ module Dataflow
  data_node.use_double_buffering = clear_data_on_compute
  data_node.save
  end
-
+
+ send_heartbeat
  pre_compute(force_compute: force_compute)

  # update this node's schema with the necessary fields
  data_node&.update_schema(required_schema)

-
  if clear_data_on_compute
  # Pre-compute, we recreate the table, the unique indexes
  data_node&.recreate_dataset(dataset_type: :write)
@@ -250,6 +255,7 @@ module Dataflow
  end

  self.last_compute_starting_time = start_time
+ save
  duration = Time.now - start_time
  logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
  on_computing_finished(state: 'computed')
@@ -261,7 +267,7 @@ module Dataflow

  rescue StandardError => e
  on_computing_finished(state: 'error', error: e) if has_compute_lock
- logger.log "#{'>' * (depth + 1)} [ERROR] #{name} failed computing: #{e}"
+ logger.error(error: e, custom_message: "#{name} failed computing.")
  raise
  ensure
  release_computing_lock! if has_compute_lock
@@ -503,7 +509,13 @@ module Dataflow
  yield(*args)
  end
  else
- Parallel.each(itr) do |*args|
+ opts = if max_parallel_processes > 0
+ { in_processes: max_parallel_processes }
+ else
+ {}
+ end
+
+ Parallel.each(itr, opts) do |*args|
  yield(*args)
  Dataflow::Adapters::SqlAdapter.disconnect_clients
  Dataflow::Adapters::MongoDbAdapter.disconnect_clients
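
The new `max_parallel_processes` field (declared in the hunk further above) feeds directly into the parallel gem: when it is greater than zero it is passed as `:in_processes`, otherwise the empty options hash keeps Parallel's default of one process per core. A hedged sketch of both halves — the node name is invented, and the standalone call just illustrates the underlying option:

```ruby
# Cap a compute node at 4 worker processes ('join_orders' is an invented name).
node = Dataflow::Nodes::ComputeNode.find_by(name: 'join_orders')
node.max_parallel_processes = 4
node.save

# The underlying behaviour, using the parallel gem directly:
require 'parallel'
Parallel.each(1..8, in_processes: 4) do |batch|
  puts "processing batch #{batch} in pid #{Process.pid}"
end
```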
@@ -297,6 +297,32 @@ module Dataflow
  (db_backend.to_s =~ /sql/).present?
  end

+
+ def required_by
+ super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
+ { node: node, type: 'dataset' }
+ }
+ end
+
+ # this is not safe if there is some parallel processing going on
+ def safely_clear_write_dataset
+ # we can only clear the write dataset if we're using double buffering
+ return unless use_double_buffering
+ # check if there is any node that is currently computing to this dataset
+ used_by = required_by.select { |x| x[:type] == 'dataset' && x[:node].locked_for_computing? }
+ return if used_by.present?
+
+ logger.log("Dropping #{db_name}.#{write_dataset_name} on #{db_backend}.")
+ # TODO: lock the node?
+ db_adapter.drop_dataset(write_dataset_name)
+ end
+
+ def drop_dataset!
+ db_adapter.drop_dataset(write_dataset_name)
+ return unless use_double_buffering
+ db_adapter.drop_dataset(read_dataset_name)
+ end
+
  private

  def db_adapter(connection_opts = {})
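
`safely_clear_write_dataset` is deliberately conservative: it only acts on double-buffered nodes, where the write dataset is a scratch buffer rather than the data being served, and it skips any dataset that a currently-locked compute node writes to. A hedged sketch of calling it on a single node — the node name is invented, and as the comment in the diff says, it is still not safe while computations run in parallel:

```ruby
# 'orders_report' is an invented node name; this is a no-op unless the node
# uses double buffering and no compute node is currently writing to it.
node = Dataflow::Nodes::DataNode.find_by(name: 'orders_report')
node.safely_clear_write_dataset
```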
@@ -337,6 +363,10 @@ module Dataflow
  [name]
  end
  end
+
+ def logger
+ @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
+ end
  end # class DataNode
  end # module Nodes
  end # module Dataflow
@@ -13,7 +13,7 @@ module Dataflow
  def infer_schema(samples_count: 0, extended: false)
  if db_backend == :postgresql
  # Experimental
- sch = db_adapter.client.schema(name).to_h
+ sch = db_adapter.client.schema(read_dataset_name).to_h
  sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
  self.inferred_schema = sch
  save
@@ -1,4 +1,4 @@
  # frozen_string_literal: true
  module Dataflow
- VERSION = '0.10.2'
+ VERSION = '0.11.0'
  end
data/lib/dataflow-rb.rb CHANGED
@@ -69,8 +69,14 @@ module Dataflow
  rescue Mongoid::Errors::DocumentNotFound
  Dataflow::Nodes::ComputeNode.find_by(name: id)
  end
- end

+ # helper that helps clearing un-used datasets
+ # NOTE: although there is a best attempt to not delete datasets that are
+ # currently being written to, this is not safe to use while executing in parallel.
+ def self.clear_tmp_datasets
+ Dataflow::Nodes::DataNode.all.each(&:safely_clear_write_dataset)
+ end
+ end

  ###############################################################################
  # Override the #constantize in active_support/inflector/methods.rb
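
`Dataflow.clear_tmp_datasets` applies that per-node check to every data node, so it fits an occasional maintenance task rather than anything run while pipelines are computing. A hedged example of a one-off cleanup, assuming the host application has already configured its database connections:

```ruby
require 'dataflow-rb'

# Best-effort cleanup of unused write buffers across all data nodes.
Dataflow.clear_tmp_datasets
```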
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: dataflow-rb
  version: !ruby/object:Gem::Version
- version: 0.10.2
+ version: 0.11.0
  platform: ruby
  authors:
  - Eurico Doirado
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2017-03-15 00:00:00.000000000 Z
+ date: 2017-04-25 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler
@@ -364,10 +364,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  version: '0'
  requirements: []
  rubyforge_project:
- rubygems_version: 2.5.2
+ rubygems_version: 2.6.11
  signing_key:
  specification_version: 4
  summary: Helps building data and automation pipelines. It handles recomputing dependencies
  and parallel execution.
  test_files: []
- has_rdoc: