dataflow-rb 0.10.2 → 0.11.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +6 -1
- data/lib/dataflow/adapters/sql_adapter.rb +33 -8
- data/lib/dataflow/event_mixin.rb +1 -1
- data/lib/dataflow/logger.rb +14 -1
- data/lib/dataflow/node.rb +6 -0
- data/lib/dataflow/nodes/compute_node.rb +16 -4
- data/lib/dataflow/nodes/data_node.rb +30 -0
- data/lib/dataflow/schema_mixin.rb +1 -1
- data/lib/dataflow/version.rb +1 -1
- data/lib/dataflow-rb.rb +7 -1
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d2ac7fa848d641d2c1fd0856ff92bb81f17bb670
+  data.tar.gz: 31eaf46d3785777d712739bc7f1a6d3ca328280e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: bedf2430c023cef3e4408a7e213eee4f5cf206574f0a5264dbb2b7cad10defc85fff8ebdd4e859f7bce414ce3941bb2b7bbe30ffbf7c7e1194a3e0c716470047
+  data.tar.gz: e2470aa7d5aba0da5c67822f1eb8426564134d2ea6346bb627326a174c9198bcb89c1b598aa4d0d8cfb910071079f92a04970c12f9b4f65b0bc4bca4e39be20c
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,27 @@
 # Changelog
 
+#### 0.11.0
+- [7c09e8a] Add data_node#drop_dataset! to completely drop the data
+- [ba0532f] Added upsert on psql adapter
+- [4d44bbd] Support setting the number of parallel processes
+- [8b48a6b] Add support for double buffered schema inferrence on postgresql
+- [49bfe1a] Add support for clearing unused datasets
+- [aabd5e3] Added #required_by to the node interface
+- [4fd2617] Handle forks having the same thread id
+- [7fc3064] Add error logging and trace id
+- [fbbd58b] Added heartbeats when recomputing the dependencies and before the pre-compute callback
+
+#### 0.10.2
+- [966e771] Do not crash if there is an unknown node type in the metadata.
+
+#### 0.10.1
+- [9ee24a4] Cleanly set the mongoid env Fix the bin/console script
+- [7fdc6f1] Support symbols in schema keys when merging schemas in the join node
+- [6c7ad5c] Fail silently if no table exists when fetching its metadata
+- [6b0886e] Make the ComputeNode#schema public
+- [03f37e2] Optimize the select keys node to avoid recomputing keys at each record.
+- [23ae504] ComputeNode#schema returns the required schema
+
 #### 0.10.0
 - [2f6284c] Allow the pre-compute to modify the necessary schema
 - [cec8a1d] Do not crash if process_parallel is called without dependencies.
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED
@@ -166,11 +166,16 @@ module Dataflow
       # recreate the table/collection
       def recreate_dataset(dataset: nil)
         dataset ||= write_dataset_name
+        drop_dataset(dataset)
         collection = client[dataset]
-        collection.drop
         collection.create
       end
 
+      def drop_dataset(dataset)
+        collection = client[dataset]
+        collection.drop
+      end
+
       # Create the indexes on this dataset.
       # @param dataset [String] Specify on which dataset the operation will be performed.
       #        Default: the adatpter's settings' dataset.
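For orientation, here is a standalone sketch of what `drop_dataset`/`recreate_dataset` amount to in terms of the Mongo Ruby driver. Only `collection.drop` and `collection.create` come from the hunk above; the host, database, and collection names are made up.

```ruby
# Illustrative only: in the gem the client and dataset name come from a
# DataNode's settings rather than being built by hand.
require 'mongo'

client = Mongo::Client.new(['127.0.0.1:27017'], database: 'dataflow_example')

collection = client['people_tmp']
collection.drop    # drop_dataset: remove the collection (and its indexes) if present
collection.create  # recreate_dataset then recreates it empty for the next write buffer
```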
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -110,6 +110,11 @@ module Dataflow
         end
       end
 
+      def all_paginated(where: {}, fields: [], cursor: nil)
+        # for now, retrieve all records at once
+        { 'data' => all(where: where, fields: fields), 'next_cursor' => '' }
+      end
+
       # Create queries that permit processing the whole dataset in parallel without using offsets.
       def ordered_system_id_queries(batch_size:)
         ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
@@ -137,16 +142,31 @@ module Dataflow
       end
 
       # Save the given records
-      #
-
-
+      # @param replace_by [Array] if the replace_by key is provided,
+      #        it will try to replace records with the matching key,
+      #        or insert if none is found.
+      # NOTE: the replace_by keys must be UNIQUE indexes.
+      def save(records:, replace_by: nil)
+        dataset_name = settings.write_dataset_name.to_sym
+        dataset = client[dataset_name]
         columns = dataset.columns.reject { |x| x == SYSTEM_ID }
 
         tabular_data = records.map do |record|
           columns.map { |col| record[col] }
         end
 
-
+        if replace_by.present?
+          index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
+
+          # update every field on conflict
+          update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
+          dataset
+            .insert_conflict(target: index_keys, update: update_clause)
+            .import(columns, tabular_data)
+        else
+          # ignore insert conflicts
+          dataset.insert_conflict.import(columns, tabular_data)
+        end
       end
 
       # Delete records that match the options.
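The `replace_by` branch above is an upsert built on Sequel's `insert_conflict`. Below is a self-contained sketch of the same pattern; it uses an in-memory SQLite database only to stay runnable (the gem targets PostgreSQL, where the calls are identical, and targeted `ON CONFLICT` on SQLite needs 3.24+ with a recent Sequel), and it writes the qualified `excluded` columns as `Sequel[:excluded][col]`, the current spelling of the `:"excluded__#{k}"` symbols used in the diff.

```ruby
# Sketch of the upsert pattern, not gem code.
require 'sequel'

db = Sequel.sqlite # in-memory database
db.create_table(:people) do
  String :email, unique: true
  String :name
end

dataset = db[:people]
columns = %i[email name]
rows    = [['a@example.com', 'Alice'], ['b@example.com', 'Bob']]

# Without replace_by: bulk insert, silently skipping conflicting rows.
dataset.insert_conflict.import(columns, rows)

# With replace_by: on a conflict on :email, overwrite every column with the
# incoming ("excluded") value -- the same update clause the adapter builds.
update_clause = columns.map { |k| [k, Sequel[:excluded][k]] }.to_h
dataset
  .insert_conflict(target: :email, update: update_clause)
  .import(columns, [['a@example.com', 'Alice Updated']])
```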
@@ -162,10 +182,15 @@ module Dataflow
       # recreate the table/collection
       def recreate_dataset(dataset: nil)
         dataset ||= settings.write_dataset_name.to_sym
-
+        drop_dataset(dataset)
         create_table(dataset, @schema)
       end
 
+      # drops the given dataset
+      def drop_dataset(dataset)
+        client.drop_table?(dataset)
+      end
+
       # Create the indexes on this dataset.
       # @param dataset [String] Specify on which dataset the operation will be performed.
       #        Default: the adatpter's settings' dataset.
@@ -195,7 +220,7 @@ module Dataflow
 
           # log columns not found but do not raise an error
           if e.wrapped_exception.is_a?(PG::UndefinedColumn)
-            logger.
+            logger.error(custom_message: "add_index on #{dataset} failed.", error: e)
             next
           end
 
@@ -312,8 +337,8 @@ module Dataflow
         params
       end
 
-      def retrieve_collection_indexes(
-        psql_indexes = client.indexes(
+      def retrieve_collection_indexes(dataset_name)
+        psql_indexes = client.indexes(dataset_name)
         psql_indexes.values.map do |idx|
           cols = idx[:columns].map(&:to_s)
           index = { 'key' => cols }
data/lib/dataflow/event_mixin.rb
CHANGED
@@ -66,7 +66,7 @@ module Dataflow
           begin
             handler.call(self, *args)
           rescue StandardError => e
-            @logger&.
+            @logger&.error(error: e, custom_message: "ERROR IN HANDLER [on_#{event_name}]")
             # ignore error in handlers
           end
         end
data/lib/dataflow/logger.rb
CHANGED
@@ -13,11 +13,24 @@ module Dataflow
     def log(str)
       return if ENV['RACK_ENV'] == 'test'
       now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
-      message = "[#{now}] #{prefix}
+      message = "[#{now}][#{trace_id}] #{prefix} | #{str}"
       logger_impl = @@impl
       logger_impl.log(message)
     end
 
+    def error(error:, custom_message: '')
+      first_line = "[ERROR => #{error.class}: '#{error.message}']"
+      first_line += " #{custom_message}" if custom_message.present?
+      first_line += ' Backtrace: '
+      log(first_line)
+      log('--')
+      (error.backtrace || []).each_with_index { |line, idx| log("#{idx}: #{line}") }
+    end
+
+    def trace_id
+      (Process.pid + Thread.current.object_id).to_s(16)[-8..-1]
+    end
+
     class LoggerImpl
       def log(message)
         puts message
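To make the new logging surface concrete, a hedged usage sketch of `Logger#error` and `trace_id`, both taken from the hunk above. The printed lines are only indicative; `log` stays silent when `RACK_ENV` is `test`, and `trace_id` mixes `Process.pid` into the id so forked workers stop sharing a trace (the "forks having the same thread id" fix from the changelog).

```ruby
# Assumes dataflow-rb is loaded and configured as in its README.
logger = Dataflow::Logger.new(prefix: 'Dataflow')

begin
  raise ArgumentError, 'boom'
rescue StandardError => e
  logger.error(error: e, custom_message: 'while computing node_1')
end
# Roughly (timestamp and trace id will differ):
#   [17-04-25 10:00:00][3f2a9c1e] Dataflow | [ERROR => ArgumentError: 'boom'] while computing node_1 Backtrace:
#   [17-04-25 10:00:00][3f2a9c1e] Dataflow | --
#   [17-04-25 10:00:00][3f2a9c1e] Dataflow | 0: example.rb:5:in `<main>'
```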
data/lib/dataflow/node.rb
CHANGED
@@ -33,5 +33,11 @@ module Dataflow
       raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
       true
     end
+
+    def required_by
+      Dataflow::Nodes::ComputeNode.where(dependency_ids: _id).map { |node|
+        { node: node, type: 'dependency' }
+      }
+    end
   end
 end
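A hedged example of querying the new `#required_by`; the node name is invented and a configured Mongoid connection is assumed. On a plain node it lists the compute nodes that declare it as a dependency; `DataNode` (later in this diff) extends it with the compute nodes that write to its dataset.

```ruby
# Node name and the inspected results are illustrative.
node = Dataflow::Nodes::DataNode.find_by(name: 'users')

node.required_by
# => [{ node: #<Dataflow::Nodes::ComputeNode name: "users_report">, type: 'dependency' },
#     { node: #<Dataflow::Nodes::ComputeNode name: "users">, type: 'dataset' }]
```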
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -70,6 +70,9 @@ module Dataflow
       #   per process during computation.
       field :limit_per_process, type: Integer, default: 0
 
+      # Maximum number of processes to use in parallel. Use 1 per core when 0.
+      field :max_parallel_processes, type: Integer, default: 0
+
       # Use automatic recomputing interval. In seconds.
       field :recompute_interval, type: Integer, default: 0
 
@@ -179,6 +182,7 @@ module Dataflow
       # @param force_recompute [Boolean] if true, computes
       #   even if the node is already up to date.
       def recompute(depth: 0, force_recompute: false)
+        send_heartbeat
         logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
         start_time = Time.now
 
@@ -187,6 +191,7 @@ module Dataflow
           if !dependency.updated? || force_recompute
             dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
           end
+          send_heartbeat
         end
 
         # Dependencies data may have changed in a child process.
@@ -226,13 +231,13 @@ module Dataflow
           data_node.use_double_buffering = clear_data_on_compute
           data_node.save
         end
-
+
+        send_heartbeat
         pre_compute(force_compute: force_compute)
 
         # update this node's schema with the necessary fields
         data_node&.update_schema(required_schema)
 
-
         if clear_data_on_compute
           # Pre-compute, we recreate the table, the unique indexes
           data_node&.recreate_dataset(dataset_type: :write)
@@ -250,6 +255,7 @@ module Dataflow
         end
 
         self.last_compute_starting_time = start_time
+        save
         duration = Time.now - start_time
         logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
         on_computing_finished(state: 'computed')
@@ -261,7 +267,7 @@ module Dataflow
 
       rescue StandardError => e
         on_computing_finished(state: 'error', error: e) if has_compute_lock
-        logger.
+        logger.error(error: e, custom_message: "#{name} failed computing.")
         raise
       ensure
         release_computing_lock! if has_compute_lock
@@ -503,7 +509,13 @@ module Dataflow
             yield(*args)
           end
         else
-
+          opts = if max_parallel_processes > 0
+                   { in_processes: max_parallel_processes }
+                 else
+                   {}
+                 end
+
+          Parallel.each(itr, opts) do |*args|
             yield(*args)
             Dataflow::Adapters::SqlAdapter.disconnect_clients
             Dataflow::Adapters::MongoDbAdapter.disconnect_clients
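The new `max_parallel_processes` field feeds straight into the `parallel` gem: when positive it becomes the `in_processes:` option, otherwise Parallel's default of one worker per core applies. A minimal standalone sketch of that option handling:

```ruby
# Not gem code: a stripped-down version of the opts handling shown above.
require 'parallel'

max_parallel_processes = 2 # stands in for the node's field
items = (1..8).to_a

opts = max_parallel_processes > 0 ? { in_processes: max_parallel_processes } : {}

Parallel.each(items, opts) do |i|
  # each iteration runs in one of (at most) max_parallel_processes forked workers
  puts "processed #{i} in pid #{Process.pid}"
end
```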
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -297,6 +297,32 @@ module Dataflow
         (db_backend.to_s =~ /sql/).present?
       end
 
+
+      def required_by
+        super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
+          { node: node, type: 'dataset' }
+        }
+      end
+
+      # this is not safe if there is some parallel processing going on
+      def safely_clear_write_dataset
+        # we can only clear the write dataset if we're using double buffering
+        return unless use_double_buffering
+        # check if there is any node that is currently computing to this dataset
+        used_by = required_by.select { |x| x[:type] == 'dataset' && x[:node].locked_for_computing? }
+        return if used_by.present?
+
+        logger.log("Dropping #{db_name}.#{write_dataset_name} on #{db_backend}.")
+        # TODO: lock the node?
+        db_adapter.drop_dataset(write_dataset_name)
+      end
+
+      def drop_dataset!
+        db_adapter.drop_dataset(write_dataset_name)
+        return unless use_double_buffering
+        db_adapter.drop_dataset(read_dataset_name)
+      end
+
       private
 
         def db_adapter(connection_opts = {})
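A hedged sketch of the two new dataset-dropping entry points on `DataNode`; the node name is invented and a configured backend is assumed. `safely_clear_write_dataset` only touches the inactive write buffer, and only when double buffering is on and no compute node is writing into it, while `drop_dataset!` removes everything backing the node.

```ruby
node = Dataflow::Nodes::DataNode.find_by(name: 'users') # illustrative name

# Reclaim the unused write buffer, skipping nodes that are currently computed into.
node.safely_clear_write_dataset

# Drop the write dataset, plus the read dataset when double buffering is enabled.
node.drop_dataset!
```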
@@ -337,6 +363,10 @@ module Dataflow
             [name]
           end
         end
+
+        def logger
+          @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
+        end
     end # class DataNode
   end # module Nodes
 end # module Dataflow
data/lib/dataflow/schema_mixin.rb
CHANGED
@@ -13,7 +13,7 @@ module Dataflow
     def infer_schema(samples_count: 0, extended: false)
       if db_backend == :postgresql
         # Experimental
-        sch = db_adapter.client.schema(
+        sch = db_adapter.client.schema(read_dataset_name).to_h
         sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
         self.inferred_schema = sch
         save
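The experimental PostgreSQL path above leans on Sequel's `Database#schema`, which returns `[column_name, info]` pairs, so `.to_h` keys the info by column. A standalone illustration follows; SQLite is used only to keep it runnable, whereas the gem runs this against PostgreSQL with the node's `read_dataset_name`.

```ruby
# Sketch of the Sequel schema call, not gem code.
require 'sequel'

db = Sequel.sqlite
db.create_table(:people) do
  String  :email
  Integer :age
end

sch = db.schema(:people).to_h
sch = sch.reject { |k, _| k == :_id }.map { |k, v| [k, { type: v[:type].to_s }] }.to_h
# => { email: { type: "string" }, age: { type: "integer" } }
```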
data/lib/dataflow/version.rb
CHANGED
data/lib/dataflow-rb.rb
CHANGED
@@ -69,8 +69,14 @@ module Dataflow
   rescue Mongoid::Errors::DocumentNotFound
     Dataflow::Nodes::ComputeNode.find_by(name: id)
   end
-end
 
+  # helper that helps clearing un-used datasets
+  # NOTE: although there is a best attempt to not delete datasets that are
+  # currently being written to, this is not safe to use while executing in parallel.
+  def self.clear_tmp_datasets
+    Dataflow::Nodes::DataNode.all.each(&:safely_clear_write_dataset)
+  end
+end
 
 ###############################################################################
 # Override the #constantize in active_support/inflector/methods.rb
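Using the new top-level helper is a one-liner; as the note in the diff warns, this is a best-effort cleanup and should not run while computations are executing in parallel.

```ruby
require 'dataflow-rb' # assumes the environment is configured as in the gem's README

# Drops the unused write buffer of every double-buffered DataNode.
Dataflow.clear_tmp_datasets
```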
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.11.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-04-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -364,10 +364,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Helps building data and automation pipelines. It handles recomputing dependencies
   and parallel execution.
 test_files: []
-has_rdoc: