dataflow-rb 0.10.2 → 0.11.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +22 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +6 -1
- data/lib/dataflow/adapters/sql_adapter.rb +33 -8
- data/lib/dataflow/event_mixin.rb +1 -1
- data/lib/dataflow/logger.rb +14 -1
- data/lib/dataflow/node.rb +6 -0
- data/lib/dataflow/nodes/compute_node.rb +16 -4
- data/lib/dataflow/nodes/data_node.rb +30 -0
- data/lib/dataflow/schema_mixin.rb +1 -1
- data/lib/dataflow/version.rb +1 -1
- data/lib/dataflow-rb.rb +7 -1
- metadata +3 -4
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: d2ac7fa848d641d2c1fd0856ff92bb81f17bb670
+  data.tar.gz: 31eaf46d3785777d712739bc7f1a6d3ca328280e
 SHA512:
-  metadata.gz: …
-  data.tar.gz: …
+  metadata.gz: bedf2430c023cef3e4408a7e213eee4f5cf206574f0a5264dbb2b7cad10defc85fff8ebdd4e859f7bce414ce3941bb2b7bbe30ffbf7c7e1194a3e0c716470047
+  data.tar.gz: e2470aa7d5aba0da5c67822f1eb8426564134d2ea6346bb627326a174c9198bcb89c1b598aa4d0d8cfb910071079f92a04970c12f9b4f65b0bc4bca4e39be20c
data/CHANGELOG.md CHANGED
@@ -1,5 +1,27 @@
 # Changelog

+#### 0.11.0
+- [7c09e8a] Add data_node#drop_dataset! to completely drop the data
+- [ba0532f] Added upsert on psql adapter
+- [4d44bbd] Support setting the number of parallel processes
+- [8b48a6b] Add support for double buffered schema inference on postgresql
+- [49bfe1a] Add support for clearing unused datasets
+- [aabd5e3] Added #required_by to the node interface
+- [4fd2617] Handle forks having the same thread id
+- [7fc3064] Add error logging and trace id
+- [fbbd58b] Added heartbeats when recomputing the dependencies and before the pre-compute callback
+
+#### 0.10.2
+- [966e771] Do not crash if there is an unknown node type in the metadata.
+
+#### 0.10.1
+- [9ee24a4] Cleanly set the mongoid env; fix the bin/console script
+- [7fdc6f1] Support symbols in schema keys when merging schemas in the join node
+- [6c7ad5c] Fail silently if no table exists when fetching its metadata
+- [6b0886e] Make the ComputeNode#schema public
+- [03f37e2] Optimize the select keys node to avoid recomputing keys at each record.
+- [23ae504] ComputeNode#schema returns the required schema
+
 #### 0.10.0
 - [2f6284c] Allow the pre-compute to modify the necessary schema
 - [cec8a1d] Do not crash if process_parallel is called without dependencies.
data/lib/dataflow/adapters/mongo_db_adapter.rb CHANGED
@@ -166,11 +166,16 @@ module Dataflow
       # recreate the table/collection
       def recreate_dataset(dataset: nil)
        dataset ||= write_dataset_name
+       drop_dataset(dataset)
        collection = client[dataset]
-       collection.drop
        collection.create
      end

+     def drop_dataset(dataset)
+       collection = client[dataset]
+       collection.drop
+     end
+
      # Create the indexes on this dataset.
      # @param dataset [String] Specify on which dataset the operation will be performed.
      # Default: the adatpter's settings' dataset.
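`recreate_dataset` now delegates the drop step, so callers can also drop a collection without recreating it. A minimal usage sketch of the two entry points (`adapter` stands for an already-configured `MongoDbAdapter`; the dataset name is invented):

# Sketch only: `adapter` is an already-configured MongoDbAdapter.
adapter.drop_dataset('tmp_dataset')              # drop the collection, nothing else
adapter.recreate_dataset(dataset: 'tmp_dataset') # drop, then create afresh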
data/lib/dataflow/adapters/sql_adapter.rb CHANGED
@@ -110,6 +110,11 @@ module Dataflow
        end
      end

+     def all_paginated(where: {}, fields: [], cursor: nil)
+       # for now, retrieve all records at once
+       { 'data' => all(where: where, fields: fields), 'next_cursor' => '' }
+     end
+
      # Create queries that permit processing the whole dataset in parallel without using offsets.
      def ordered_system_id_queries(batch_size:)
        ids = all(fields: [SYSTEM_ID], sort: { SYSTEM_ID => 1 }).map { |x| x[SYSTEM_ID] }
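`all_paginated` introduces a cursor-shaped contract even though this adapter currently returns everything in a single page. A hedged consumer sketch (`adapter` and `process` are placeholders):

cursor = nil
loop do
  page = adapter.all_paginated(where: {}, fields: %w[id name], cursor: cursor)
  process(page['data'])                  # placeholder for real record handling
  cursor = page['next_cursor']
  break if cursor.nil? || cursor.empty?  # '' signals the last page
end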
@@ -137,16 +142,31 @@ module Dataflow
      end

      # Save the given records
-     #
-
-
+     # @param replace_by [Array] if the replace_by key is provided,
+     #        it will try to replace records with the matching key,
+     #        or insert if none is found.
+     # NOTE: the replace_by keys must be UNIQUE indexes.
+     def save(records:, replace_by: nil)
+       dataset_name = settings.write_dataset_name.to_sym
+       dataset = client[dataset_name]
        columns = dataset.columns.reject { |x| x == SYSTEM_ID }

        tabular_data = records.map do |record|
          columns.map { |col| record[col] }
        end

-
+       if replace_by.present?
+         index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
+
+         # update every field on conflict
+         update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
+         dataset
+           .insert_conflict(target: index_keys, update: update_clause)
+           .import(columns, tabular_data)
+       else
+         # ignore insert conflicts
+         dataset.insert_conflict.import(columns, tabular_data)
+       end
      end

      # Delete records that match the options.
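The `replace_by` branch is Sequel's PostgreSQL upsert: `insert_conflict(target:, update:)` compiles to `INSERT ... ON CONFLICT (...) DO UPDATE`, and the legacy `:"excluded__#{k}"` symbols qualify each column against PostgreSQL's `excluded` pseudo-table. A standalone sketch of the same pattern, with invented table and column names:

require 'sequel'

db = Sequel.connect('postgres://localhost/example') # invented connection
# Assumes a UNIQUE index on :email, as the NOTE above requires.
db[:users]
  .insert_conflict(target: :email, update: { name: Sequel[:excluded][:name] })
  .import(%i[email name], [['a@b.io', 'Ada'], ['c@d.io', 'Carl']])
# Roughly: INSERT INTO users (email, name) VALUES ...
#          ON CONFLICT (email) DO UPDATE SET name = excluded.name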
@@ -162,10 +182,15 @@ module Dataflow
      # recreate the table/collection
      def recreate_dataset(dataset: nil)
        dataset ||= settings.write_dataset_name.to_sym
-
+       drop_dataset(dataset)
        create_table(dataset, @schema)
      end

+     # drops the given dataset
+     def drop_dataset(dataset)
+       client.drop_table?(dataset)
+     end
+
      # Create the indexes on this dataset.
      # @param dataset [String] Specify on which dataset the operation will be performed.
      # Default: the adatpter's settings' dataset.
@@ -195,7 +220,7 @@ module Dataflow

        # log columns not found but do not raise an error
        if e.wrapped_exception.is_a?(PG::UndefinedColumn)
-         logger.
+         logger.error(custom_message: "add_index on #{dataset} failed.", error: e)
          next
        end

@@ -312,8 +337,8 @@ module Dataflow
        params
      end

-     def retrieve_collection_indexes(
-       psql_indexes = client.indexes(
+     def retrieve_collection_indexes(dataset_name)
+       psql_indexes = client.indexes(dataset_name)
        psql_indexes.values.map do |idx|
          cols = idx[:columns].map(&:to_s)
          index = { 'key' => cols }
data/lib/dataflow/event_mixin.rb CHANGED
@@ -66,7 +66,7 @@ module Dataflow
          begin
            handler.call(self, *args)
          rescue StandardError => e
-           @logger&.
+           @logger&.error(error: e, custom_message: "ERROR IN HANDLER [on_#{event_name}]")
            # ignore error in handlers
          end
        end
data/lib/dataflow/logger.rb CHANGED
@@ -13,11 +13,24 @@ module Dataflow
    def log(str)
      return if ENV['RACK_ENV'] == 'test'
      now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
-     message = "[#{now}] #{prefix}
+     message = "[#{now}][#{trace_id}] #{prefix} | #{str}"
      logger_impl = @@impl
      logger_impl.log(message)
    end

+   def error(error:, custom_message: '')
+     first_line = "[ERROR => #{error.class}: '#{error.message}']"
+     first_line += " #{custom_message}" if custom_message.present?
+     first_line += ' Backtrace: '
+     log(first_line)
+     log('--')
+     (error.backtrace || []).each_with_index { |line, idx| log("#{idx}: #{line}") }
+   end
+
+   def trace_id
+     (Process.pid + Thread.current.object_id).to_s(16)[-8..-1]
+   end
+
    class LoggerImpl
      def log(message)
        puts message
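`trace_id` tags every log line with an 8-hex-digit value derived from both the process id and the thread object id, so forked workers that inherit the same thread id still produce distinct tags (the [4fd2617] fix above). An illustration (actual values are machine-dependent):

tag = (Process.pid + Thread.current.object_id).to_s(16)[-8..-1]
puts tag # e.g. "ab34f210" -- stable within one process/thread pair
# A forked child keeps the same thread object_id but gets a new pid,
# so its tag differs from the parent's.
fork { puts (Process.pid + Thread.current.object_id).to_s(16)[-8..-1] }
Process.wait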
data/lib/dataflow/node.rb CHANGED
@@ -33,5 +33,11 @@ module Dataflow
      raise Dataflow::Errors::InvalidConfigurationError, errors.messages unless valid
      true
    end
+
+   def required_by
+     Dataflow::Nodes::ComputeNode.where(dependency_ids: _id).map { |node|
+       { node: node, type: 'dependency' }
+     }
+   end
  end
end
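`required_by` inverts the dependency relation with a Mongoid query over every ComputeNode's `dependency_ids`. A hedged usage sketch (the node name is invented):

node = Dataflow::Nodes::DataNode.find_by(name: 'raw_data') # invented name
node.required_by.each do |dep|
  puts "#{dep[:node].name} -> needed as a #{dep[:type]}"
end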
data/lib/dataflow/nodes/compute_node.rb CHANGED
@@ -70,6 +70,9 @@ module Dataflow
      # per process during computation.
      field :limit_per_process, type: Integer, default: 0

+     # Maximum number of processes to use in parallel. Use 1 per core when 0.
+     field :max_parallel_processes, type: Integer, default: 0
+
      # Use automatic recomputing interval. In seconds.
      field :recompute_interval, type: Integer, default: 0

@@ -179,6 +182,7 @@ module Dataflow
      # @param force_recompute [Boolean] if true, computes
      # even if the node is already up to date.
      def recompute(depth: 0, force_recompute: false)
+       send_heartbeat
        logger.log "#{'>' * (depth + 1)} #{name} started recomputing..."
        start_time = Time.now

@@ -187,6 +191,7 @@ module Dataflow
          if !dependency.updated? || force_recompute
            dependency.recompute(depth: depth + 1, force_recompute: force_recompute)
          end
+         send_heartbeat
        end

        # Dependencies data may have changed in a child process.
@@ -226,13 +231,13 @@ module Dataflow
          data_node.use_double_buffering = clear_data_on_compute
          data_node.save
        end
-
+
+       send_heartbeat
        pre_compute(force_compute: force_compute)

        # update this node's schema with the necessary fields
        data_node&.update_schema(required_schema)

-
        if clear_data_on_compute
          # Pre-compute, we recreate the table, the unique indexes
          data_node&.recreate_dataset(dataset_type: :write)
@@ -250,6 +255,7 @@ module Dataflow
        end

        self.last_compute_starting_time = start_time
+       save
        duration = Time.now - start_time
        logger.log "#{'>' * (depth + 1)} #{name} took #{duration} seconds to compute."
        on_computing_finished(state: 'computed')
@@ -261,7 +267,7 @@ module Dataflow

      rescue StandardError => e
        on_computing_finished(state: 'error', error: e) if has_compute_lock
-       logger.
+       logger.error(error: e, custom_message: "#{name} failed computing.")
        raise
      ensure
        release_computing_lock! if has_compute_lock
@@ -503,7 +509,13 @@ module Dataflow
          yield(*args)
        end
      else
-
+       opts = if max_parallel_processes > 0
+                { in_processes: max_parallel_processes }
+              else
+                {}
+              end
+
+       Parallel.each(itr, opts) do |*args|
          yield(*args)
          Dataflow::Adapters::SqlAdapter.disconnect_clients
          Dataflow::Adapters::MongoDbAdapter.disconnect_clients
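With the field at its default of 0, the `parallel` gem sizes the worker pool to the machine's cores; any positive `max_parallel_processes` caps it via `in_processes`. A standalone sketch of the same dispatch (inputs invented):

require 'parallel'

max_parallel_processes = 4 # e.g. a node's configured field value
opts = max_parallel_processes > 0 ? { in_processes: max_parallel_processes } : {}
Parallel.each(1..100, opts) do |i|
  # Runs in forked workers -- hence the disconnect_clients calls above:
  # each fork must re-open its own DB connections.
  i * 2
end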
data/lib/dataflow/nodes/data_node.rb CHANGED
@@ -297,6 +297,32 @@ module Dataflow
        (db_backend.to_s =~ /sql/).present?
      end

+
+     def required_by
+       super + Dataflow::Nodes::ComputeNode.where(data_node_id: _id).map { |node|
+         { node: node, type: 'dataset' }
+       }
+     end
+
+     # this is not safe if there is some parallel processing going on
+     def safely_clear_write_dataset
+       # we can only clear the write dataset if we're using double buffering
+       return unless use_double_buffering
+       # check if there is any node that is currently computing to this dataset
+       used_by = required_by.select { |x| x[:type] == 'dataset' && x[:node].locked_for_computing? }
+       return if used_by.present?
+
+       logger.log("Dropping #{db_name}.#{write_dataset_name} on #{db_backend}.")
+       # TODO: lock the node?
+       db_adapter.drop_dataset(write_dataset_name)
+     end
+
+     def drop_dataset!
+       db_adapter.drop_dataset(write_dataset_name)
+       return unless use_double_buffering
+       db_adapter.drop_dataset(read_dataset_name)
+     end
+
      private

      def db_adapter(connection_opts = {})
@@ -337,6 +363,10 @@ module Dataflow
          [name]
        end
      end
+
+     def logger
+       @logger ||= Dataflow::Logger.new(prefix: 'Dataflow')
+     end
    end # class DataNode
  end # module Nodes
end # module Dataflow
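These additions give two levels of cleanup: `safely_clear_write_dataset` drops only the inactive write buffer, and only when double buffering is on and no compute node is writing to it, while `drop_dataset!` removes both buffers unconditionally. A hedged sketch (node name invented):

node = Dataflow::Nodes::DataNode.find_by(name: 'intermediate') # invented name
node.safely_clear_write_dataset # no-op unless double-buffered and idle
node.drop_dataset!              # destructive: drops the write AND read datasets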
data/lib/dataflow/schema_mixin.rb CHANGED
@@ -13,7 +13,7 @@ module Dataflow
    def infer_schema(samples_count: 0, extended: false)
      if db_backend == :postgresql
        # Experimental
-       sch = db_adapter.client.schema(
+       sch = db_adapter.client.schema(read_dataset_name).to_h
        sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
        self.inferred_schema = sch
        save
data/lib/dataflow/version.rb CHANGED
data/lib/dataflow-rb.rb CHANGED
@@ -69,8 +69,14 @@ module Dataflow
  rescue Mongoid::Errors::DocumentNotFound
    Dataflow::Nodes::ComputeNode.find_by(name: id)
  end
-end

+ # helper that helps clearing un-used datasets
+ # NOTE: although there is a best attempt to not delete datasets that are
+ # currently being written to, this is not safe to use while executing in parallel.
+ def self.clear_tmp_datasets
+   Dataflow::Nodes::DataNode.all.each(&:safely_clear_write_dataset)
+ end
+end

###############################################################################
# Override the #constantize in active_support/inflector/methods.rb
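This makes dataset cleanup a one-liner, e.g. from `bin/console` or a scheduled task; per the NOTE above it should not run while pipelines are computing in parallel. Usage:

require 'dataflow-rb'
Dataflow.clear_tmp_datasets # walks every DataNode, dropping idle write buffers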
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.10.2
+  version: 0.11.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-04-25 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -364,10 +364,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.
+rubygems_version: 2.6.11
 signing_key:
 specification_version: 4
 summary: Helps building data and automation pipelines. It handles recomputing dependencies
   and parallel execution.
 test_files: []
-has_rdoc: