dataflow-rb 0.10.2 → 0.11.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 27b271c3e39d4e3ead1ef50ba784ad127270fb36
-  data.tar.gz: 9fcccbc62f7714b61a58dee11cdc823da6ff00f1
+  metadata.gz: d2ac7fa848d641d2c1fd0856ff92bb81f17bb670
+  data.tar.gz: 31eaf46d3785777d712739bc7f1a6d3ca328280e
 SHA512:
-  metadata.gz: d44eee8bd3a364e25582d6710e3aaf9adbf8be9c247b99d7e8b7ecfbf91b702959ad455986306285f8ce50aba4debddc93a228e01d9c8e9c0d4eda3c779a543b
-  data.tar.gz: 64e4c5fb8c4dedce76e7234cc3ce3d0c0460a7191ecb0222534d3025dc26b9046e4f00b6f08509fadbec0bdc8570523efd510174f6751a0dc693d903ee0ccd79
+  metadata.gz: bedf2430c023cef3e4408a7e213eee4f5cf206574f0a5264dbb2b7cad10defc85fff8ebdd4e859f7bce414ce3941bb2b7bbe30ffbf7c7e1194a3e0c716470047
+  data.tar.gz: e2470aa7d5aba0da5c67822f1eb8426564134d2ea6346bb627326a174c9198bcb89c1b598aa4d0d8cfb910071079f92a04970c12f9b4f65b0bc4bca4e39be20c
data/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
 # Changelog
 
+#### 0.11.0
+- [7c09e8a] Add data_node#drop_dataset! to completely drop the data
+- [ba0532f] Added upsert on psql adapter
+- [4d44bbd] Support setting the number of parallel processes
+- [8b48a6b] Add support for double buffered schema inference on postgresql
+- [49bfe1a] Add support for clearing unused datasets
+- [aabd5e3] Added #required_by to the node interface
+- [4fd2617] Handle forks having the same thread id
+- [7fc3064] Add error logging and trace id
+- [fbbd58b] Added heartbeats when recomputing the dependencies and before the pre-compute callback
+
+#### 0.10.2
+- [966e771] Do not crash if there is an unknown node type in the metadata.
+
+#### 0.10.1
+- [9ee24a4] Cleanly set the mongoid env; fix the bin/console script
+- [7fdc6f1] Support symbols in schema keys when merging schemas in the join node
+- [6c7ad5c] Fail silently if no table exists when fetching its metadata
+- [6b0886e] Make the ComputeNode#schema public
+- [03f37e2] Optimize the select keys node to avoid recomputing keys at each record.
+- [23ae504] ComputeNode#schema returns the required schema
+
 #### 0.10.0
 - [2f6284c] Allow the pre-compute to modify the necessary schema
 - [cec8a1d] Do not crash if process_parallel is called without dependencies.
@@ -166,11 +166,16 @@ module Dataflow
       # recreate the table/collection
       def recreate_dataset(dataset: nil)
         dataset ||= write_dataset_name
+        drop_dataset(dataset)
         collection = client[dataset]
-        collection.drop
         collection.create
       end
 
+      def drop_dataset(dataset)
+        collection = client[dataset]
+        collection.drop
+      end
+
       # Create the indexes on this dataset.
       # @param dataset [String] Specify on which dataset the operation will be performed.
       #        Default: the adapter's settings' dataset.
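The MongoDB adapter change above just extracts the drop into its own `drop_dataset` method so other callers (such as the new `DataNode#drop_dataset!`) can reuse it. For reference, a minimal standalone sketch of the underlying driver calls; the connection URL and dataset name are placeholders:

```ruby
require 'mongo'

# Placeholder connection; the adapter normally derives this from its settings.
client = Mongo::Client.new('mongodb://127.0.0.1:27017/dataflow_example')

collection = client['my_dataset']
collection.drop    # what drop_dataset does: remove the collection entirely
collection.create  # recreate_dataset then recreates it empty
```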
@@ -110,6 +110,11 @@ module Dataflow
         end
       end
 
+      def all_paginated(where: {}, fields: [], cursor: nil)
+        # for now, retrieve all records at once
+        { 'data' => all(where: where, fields: fields), 'next_cursor' => '' }
+      end
+
       # Create queries that permit processing the whole dataset in parallel without using offsets.
       def ordered_system_id_queries(batch_size:)
         ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
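`all_paginated` gives the SQL adapter a paginated read interface without changing behaviour yet: it returns all matching records in a single page and an empty `next_cursor`. A hedged sketch of how a caller could drain it, assuming `adapter` is an instance of this adapter; the where/fields values are placeholders:

```ruby
# Illustrative only: loop until the adapter stops returning a cursor.
# With the implementation above this runs exactly once.
records = []
cursor = nil
loop do
  page = adapter.all_paginated(where: { 'status' => 'active' },
                               fields: %w(id status),
                               cursor: cursor)
  records.concat(page['data'])
  cursor = page['next_cursor']
  break if cursor.nil? || cursor.empty?
end
```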
@@ -137,16 +142,31 @@ module Dataflow
       end
 
       # Save the given records
-      # TODO: support :replace_by parameter
-      def save(records:)
-        dataset = client[settings.write_dataset_name.to_sym]
+      # @param replace_by [Array] if the replace_by key is provided,
+      #        it will try to replace records with the matching key,
+      #        or insert if none is found.
+      #        NOTE: the replace_by keys must be UNIQUE indexes.
+      def save(records:, replace_by: nil)
+        dataset_name = settings.write_dataset_name.to_sym
+        dataset = client[dataset_name]
         columns = dataset.columns.reject { |x| x == SYSTEM_ID }
 
         tabular_data = records.map do |record|
           columns.map { |col| record[col] }
         end
 
-        dataset.insert_ignore.import(columns, tabular_data)
+        if replace_by.present?
+          index_keys = Array(replace_by).map { |c| c.to_sym }.uniq
+
+          # update every field on conflict
+          update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
+          dataset
+            .insert_conflict(target: index_keys, update: update_clause)
+            .import(columns, tabular_data)
+        else
+          # ignore insert conflicts
+          dataset.insert_conflict.import(columns, tabular_data)
+        end
       end
 
       # Delete records that match the options.
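The upsert path is built on Sequel's `insert_conflict`, i.e. PostgreSQL's `INSERT ... ON CONFLICT`: the `:"excluded__#{k}"` symbols reference the `excluded` pseudo-table, so on a key conflict every column is overwritten with the incoming row's value, which is why `replace_by` must name columns backed by a unique index. A hedged, self-contained sketch of the same pattern outside the adapter; the connection URL, table and columns are placeholders, and the `excluded__*` double-underscore notation follows the adapter's Sequel 4-era style (newer Sequel would spell it `Sequel[:excluded][:col]`):

```ruby
require 'sequel'

# Placeholder database and table.
db = Sequel.connect('postgres://localhost/dataflow_example')
dataset = db[:users]

columns = [:unique_id, :email]
rows = [[1, 'a@example.com'], [2, 'b@example.com']]

# Upsert: on a unique_id conflict, update every column from the incoming row
# (INSERT ... ON CONFLICT (unique_id) DO UPDATE SET ...).
update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
dataset
  .insert_conflict(target: :unique_id, update: update_clause)
  .import(columns, rows)

# Without replace_by, the adapter now uses insert_conflict with no arguments,
# which maps to INSERT ... ON CONFLICT DO NOTHING.
```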
@@ -162,10 +182,15 @@ module Dataflow
       # recreate the table/collection
       def recreate_dataset(dataset: nil)
         dataset ||= settings.write_dataset_name.to_sym
-        client.drop_table?(dataset)
+        drop_dataset(dataset)
         create_table(dataset, @schema)
       end
 
+      # drops the given dataset
+      def drop_dataset(dataset)
+        client.drop_table?(dataset)
+      end
+
       # Create the indexes on this dataset.
       # @param dataset [String] Specify on which dataset the operation will be performed.
       #        Default: the adapter's settings' dataset.
@@ -195,7 +220,7 @@ module Dataflow
 
           # log columns not found but do not raise an error
           if e.wrapped_exception.is_a?(PG::UndefinedColumn)
-            logger.log("[Error] add_index on #{dataset} failed. #{e}")
+            logger.error(custom_message: "add_index on #{dataset} failed.", error: e)
             next
           end
 
@@ -312,8 +337,8 @@ module Dataflow
         params
       end
 
-      def retrieve_collection_indexes(collection)
-        psql_indexes = client.indexes(collection)
+      def retrieve_collection_indexes(dataset_name)
+        psql_indexes = client.indexes(dataset_name)
         psql_indexes.values.map do |idx|
           cols = idx[:columns].map(&:to_s)
           index = { 'key' => cols }
@@ -66,7 +66,7 @@ module Dataflow
         begin
           handler.call(self, *args)
         rescue StandardError => e
-          @logger&.log("ERROR IN HANDLER [on_#{event_name}]: #{e}")
+          @logger&.error(error: e, custom_message: "ERROR IN HANDLER [on_#{event_name}]")
           # ignore error in handlers
         end
       end
@@ -13,11 +13,24 @@ module Dataflow
     def log(str)
       return if ENV['RACK_ENV'] == 'test'
       now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
-      message = "[#{now}] #{prefix} :: #{str}"
+      message = "[#{now}][#{trace_id}] #{prefix} | #{str}"
       logger_impl = @@impl
       logger_impl.log(message)
     end
 
+    def error(error:, custom_message: '')
+      first_line = "[ERROR => #{error.class}: '#{error.message}']"
+      first_line += " #{custom_message}" if custom_message.present?
+      first_line += ' Backtrace: '
+      log(first_line)
+      log('--')
+      (error.backtrace || []).each_with_index { |line, idx| log("#{idx}: #{line}") }
+    end
+
+    def trace_id
+      (Process.pid + Thread.current.object_id).to_s(16)[-8..-1]
+    end
+
     class LoggerImpl
       def log(message)
         puts message
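The trace id is an eight-hex-digit tag derived from the process id and current thread, so lines emitted by different parallel workers can be grouped after the fact, and `#error` routes the exception class, message, optional context and backtrace through the same `log` path. A short hedged usage sketch; the prefix and message are placeholders:

```ruby
logger = Dataflow::Logger.new(prefix: 'Demo')

begin
  raise ArgumentError, 'bad input'
rescue StandardError => e
  # Emits something like:
  #   [17-04-25 10:30:00][a1b2c3d4] Demo | [ERROR => ArgumentError: 'bad input'] while demoing Backtrace:
  # followed by '--' and one line per backtrace frame, all sharing the trace id.
  logger.error(error: e, custom_message: 'while demoing')
end
```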
data/lib/dataflow/node.rb CHANGED
@@ -33,5 +33,11 @@ module Dataflow
       raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
       true
     end
+
+    def required_by
+      Dataflow::Nodes::ComputeNode.where(dependency_ids: _id).map { |node|
+        { node: node, type: 'dependency' }
+      }
+    end
   end
 end
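`#required_by` is a reverse-dependency lookup: every `ComputeNode` whose `dependency_ids` include this node. A hedged sketch of using it to audit a node before removing it; the node name is a placeholder:

```ruby
# Illustrative only; 'raw_events' is a placeholder node name.
node = Dataflow::Nodes::DataNode.find_by(name: 'raw_events')

node.required_by.each do |entry|
  # The base implementation lists compute nodes that depend on this node;
  # DataNode (further below) also reports nodes writing to it as type 'dataset'.
  puts "#{entry[:node].name} (#{entry[:type]})"
end
```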
@@ -70,6 +70,9 @@ module Dataflow
       # per process during computation.
       field :limit_per_process, type: Integer, default: 0
 
+      # Maximum number of processes to use in parallel. Use 1 per core when 0.
+      field :max_parallel_processes, type: Integer, default: 0
+
       # Use automatic recomputing interval. In seconds.
       field :recompute_interval, type: Integer, default: 0
 
@@ -179,6 +182,7 @@ module Dataflow
       # @param force_recompute [Boolean] if true, computes
       #        even if the node is already up to date.
       def recompute(depth: 0, force_recompute: false)
+        send_heartbeat
         logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
         start_time = Time.now
 
@@ -187,6 +191,7 @@ module Dataflow
           if !dependency.updated? || force_recompute
             dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
           end
+          send_heartbeat
         end
 
         # Dependencies data may have changed in a child process.
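`send_heartbeat` itself is not part of this hunk; these calls simply refresh the node's liveness after each dependency finishes, before any real work starts. As a rough, hypothetical illustration only (the `last_heartbeat_time` field below is an assumption, not necessarily what the gem does):

```ruby
# Hypothetical sketch, not the gem's confirmed implementation:
# a heartbeat persists a "still alive" timestamp so monitoring or
# lock-expiry logic can tell a long-running compute from a dead one.
def send_heartbeat
  set(last_heartbeat_time: Time.now) # Mongoid atomic $set on this document
end
```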
@@ -226,13 +231,13 @@ module Dataflow
           data_node.use_double_buffering = clear_data_on_compute
           data_node.save
         end
-
+
+        send_heartbeat
         pre_compute(force_compute: force_compute)
 
         # update this node's schema with the necessary fields
         data_node&.update_schema(required_schema)
 
-
         if clear_data_on_compute
           # Pre-compute, we recreate the table, the unique indexes
           data_node&.recreate_dataset(dataset_type: :write)
@@ -250,6 +255,7 @@ module Dataflow
         end
 
         self.last_compute_starting_time = start_time
+        save
         duration = Time.now - start_time
         logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
         on_computing_finished(state: 'computed')
@@ -261,7 +267,7 @@ module Dataflow
 
       rescue StandardError => e
         on_computing_finished(state: 'error', error: e) if has_compute_lock
-        logger.log "#{'>' * (depth + 1)} [ERROR] #{name} failed computing: #{e}"
+        logger.error(error: e, custom_message: "#{name} failed computing.")
         raise
       ensure
         release_computing_lock! if has_compute_lock
@@ -503,7 +509,13 @@ module Dataflow
             yield(*args)
           end
         else
-          Parallel.each(itr) do |*args|
+          opts = if max_parallel_processes > 0
+                   { in_processes: max_parallel_processes }
+                 else
+                   {}
+                 end
+
+          Parallel.each(itr, opts) do |*args|
             yield(*args)
             Dataflow::Adapters::SqlAdapter.disconnect_clients
             Dataflow::Adapters::MongoDbAdapter.disconnect_clients
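`in_processes` is the standard option of the `parallel` gem for bounding the process pool, which is how `max_parallel_processes` takes effect; the explicit client disconnects remain because database connections cannot be shared across forked workers. A minimal standalone sketch of the same pattern; the workload is a placeholder:

```ruby
require 'parallel'

items = (1..10).to_a

# Default behaviour: one worker process per CPU core.
Parallel.each(items) { |i| puts "item #{i} handled by pid #{Process.pid}" }

# Bounded pool: the effect of max_parallel_processes = 2 on a compute node.
Parallel.each(items, in_processes: 2) { |i| puts "item #{i} handled by pid #{Process.pid}" }
```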
@@ -297,6 +297,32 @@ module Dataflow
         (db_backend.to_s =~ /sql/).present?
       end
 
+
+      def required_by
+        super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
+          { node: node, type: 'dataset' }
+        }
+      end
+
+      # this is not safe if there is some parallel processing going on
+      def safely_clear_write_dataset
+        # we can only clear the write dataset if we're using double buffering
+        return unless use_double_buffering
+        # check if there is any node that is currently computing to this dataset
+        used_by = required_by.select { |x| x[:type] == 'dataset' && x[:node].locked_for_computing? }
+        return if used_by.present?
+
+        logger.log("Dropping #{db_name}.#{write_dataset_name} on #{db_backend}.")
+        # TODO: lock the node?
+        db_adapter.drop_dataset(write_dataset_name)
+      end
+
+      def drop_dataset!
+        db_adapter.drop_dataset(write_dataset_name)
+        return unless use_double_buffering
+        db_adapter.drop_dataset(read_dataset_name)
+      end
+
       private
 
       def db_adapter(connection_opts = {})
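A hedged usage sketch of the two new DataNode entry points; the node name is a placeholder. `safely_clear_write_dataset` only drops the inactive write buffer, and only when double buffering is on and no compute node is currently locked for computing into it, whereas `drop_dataset!` is unconditional and removes both buffers:

```ruby
# Illustrative only; 'daily_rollup_data' is a placeholder node name.
node = Dataflow::Nodes::DataNode.find_by(name: 'daily_rollup_data')

# Reclaim space: drop the inactive write buffer if nothing is computing into it.
node.safely_clear_write_dataset

# Destructive: drop the write dataset, and the read dataset too when double buffering.
node.drop_dataset!
```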
@@ -337,6 +363,10 @@ module Dataflow
           [name]
         end
       end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
+      end
     end # class DataNode
   end # module Nodes
 end # module Dataflow
@@ -13,7 +13,7 @@ module Dataflow
     def infer_schema(samples_count: 0, extended: false)
       if db_backend == :postgresql
         # Experimental
-        sch = db_adapter.client.schema(name).to_h
+        sch = db_adapter.client.schema(read_dataset_name).to_h
         sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
         self.inferred_schema = sch
         save
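Reading the catalog via `read_dataset_name` matters with double buffering, where the physical table currently being read is not simply the node's `name`. For reference, a hedged sketch of the Sequel call the inference relies on; the connection URL and table are placeholders:

```ruby
require 'sequel'

# Placeholder connection/table: shows the raw data infer_schema maps into its schema hash.
db = Sequel.connect('postgres://localhost/dataflow_example')

db.schema(:users).to_h.each do |column, info|
  next if column == :_id
  puts "#{column}: #{info[:type]}"  # e.g. "email: string", "age: integer"
end
```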
@@ -1,4 +1,4 @@
 # frozen_string_literal: true
 module Dataflow
-  VERSION = '0.10.2'
+  VERSION = '0.11.0'
 end
data/lib/dataflow-rb.rb CHANGED
@@ -69,8 +69,14 @@ module Dataflow
   rescue Mongoid::Errors::DocumentNotFound
     Dataflow::Nodes::ComputeNode.find_by(name: id)
   end
-end
 
+  # helper for clearing un-used datasets
+  # NOTE: although there is a best attempt to not delete datasets that are
+  # currently being written to, this is not safe to use while executing in parallel.
+  def self.clear_tmp_datasets
+    Dataflow::Nodes::DataNode.all.each(&:safely_clear_write_dataset)
+  end
+end
 
 ###############################################################################
 # Override the #constantize in active_support/inflector/methods.rb
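Usage is a one-liner: the helper walks every data node and asks each one to clear its inactive write buffer, which makes it a cheap space-reclamation step between pipeline runs, with the caveat from the note above about parallel execution:

```ruby
require 'dataflow-rb'

# Drops the unused write buffers of all double-buffered data nodes.
# Do not run this while computations are executing in parallel.
Dataflow.clear_tmp_datasets
```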
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.10.2
+  version: 0.11.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-03-15 00:00:00.000000000 Z
+date: 2017-04-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -364,10 +364,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.5.2
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Helps building data and automation pipelines. It handles recomputing dependencies
   and parallel execution.
 test_files: []
-has_rdoc: