dataflow-rb 0.9.2 → 0.10.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/lib/dataflow/adapters/sql_adapter.rb +22 -14
- data/lib/dataflow/nodes/compute_node.rb +23 -3
- data/lib/dataflow/nodes/data_node.rb +13 -2
- data/lib/dataflow/nodes/export/to_csv_node.rb +9 -8
- data/lib/dataflow/nodes/join_node.rb +42 -8
- data/lib/dataflow/nodes/select_keys_node.rb +2 -2
- data/lib/dataflow/schema_mixin.rb +11 -2
- data/lib/dataflow/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5ccef7eb0d8bf531e4e19c62562763dff31f85f4
+  data.tar.gz: 9c5e496748df17dcc3fd41ea90c4c3b5dd4df43a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2c62ef0c24df46a551d5a90bd8d7cf4b6b4ece57bbfd3dad720c7399b4b325ae949d10f3f644403413f2534f1ae08ae797dcaf8d925bea28d2da8b23c7ca0b3a
+  data.tar.gz: 95f0b3764d07d39cb15ef06a5fcea00e71942d02560b0b4e3475adf3a83b8b70d719f42d260decb496ad44668cdc3947fc32567b84b7eb0fd5a35b0e356e97a0
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,19 @@
 # Changelog
 
-
+#### 0.10.0
+- [2f6284c] Allow the pre-compute to modify the necessary schema
+- [cec8a1d] Do not crash if process_parallel is called without dependencies.
+- [83e1bb5] Various fixes related to the csv export feature
+- [61e74d7] Force the data node to use double buffering when necessary.
+- [553b1ea] Fix documentation
+- [be21031] Added an heartbeat to the compute node
+- [78308c0] Added tests to the join node. Add multi key support on Postgresql impl and select_keys support on software join.
+- [090c81f] Experimental: fetch the schema directly from the DB.
+- [46a7915] Fix: use the samples count when inferring a schema
+- [dcd7750] Add support for selecting which keys to include in a join.
+- [9005b6c] Set a default updated_at when creating a data node. Do not change the dataset immediately if we're using double buffering. Wait for the next buffer to be created instead.
+- [d98d9c1] Do not crash if an index cannot be added. Use the logger instead of the stdout for the sql adapter.
+- [cc40642] Catch DatabaseError.
 
 #### 0.9.2
 - [2f3129c] Fix bug when joining datasets directly in SQL
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -72,15 +72,11 @@ module Dataflow
       def initialize(args)
         update_settings(args)
         @client = SqlAdapter.client(settings)
-        @schema = settings.schema || [] # TODO: detect if the table schema has a mis-match
       end
 
       def update_settings(args)
         @settings = Dataflow::Adapters::Settings.new(args)
-      end
-
-      def set_schema(schema)
-        @schema = schema
+        @schema = @settings.schema
       end
 
       # retrieve a single element from a data node
@@ -167,12 +163,6 @@ module Dataflow
      def recreate_dataset(dataset: nil)
        dataset ||= settings.write_dataset_name.to_sym
        client.drop_table?(dataset)
-
-        unless @schema.present?
-          p 'WARNING: recreate dataset aborted: no schema'
-          return
-        end
-
        create_table(dataset, @schema)
      end
 
@@ -201,7 +191,16 @@ module Dataflow
          client.add_index(dataset, *params)
        rescue Sequel::DatabaseError => e
          # ignore index already exists
-
+          next if e.wrapped_exception.is_a?(PG::DuplicateTable)
+
+          # log columns not found but do not raise an error
+          if e.wrapped_exception.is_a?(PG::UndefinedColumn)
+            logger.log("[Error] add_index on #{dataset} failed. #{e}")
+            next
+          end
+
+          # re-raise for everything else
+          raise e
        end
      end
    end
@@ -236,6 +235,8 @@ module Dataflow
          end
        when 'time'
          col_type = 'timestamp'
+        when 'datetime'
+          col_type = 'timestamp with time zone'
        when 'integer'
          max_size ||= MAX_INT + 1
          col_type = if max_size <= MAX_INT
@@ -246,10 +247,13 @@ module Dataflow
        when 'numeric'
          col_type = 'real'
        when 'array', 'hash'
-
+          puts "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
          col_type = 'json'
+        when 'date', 'time'
+          # keep as-is
+          col_type = type
        else
-
+          puts "[Error] unexpected type '#{type}'. Keeping as-is."
          col_type = type
        end
 
@@ -317,6 +321,10 @@ module Dataflow
          index
        end.compact
      end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
+      end
    end
  end
end
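For quick reference, the schema-to-Postgres type mapping after this change, summarized from the case statement in the hunks above (a reading of the diff, not code shipped in the gem):

# 'time'          => 'timestamp'
# 'datetime'      => 'timestamp with time zone'   (new in 0.10.0)
# 'integer'       => sized from max_size (e.g. 'integer' when max_size <= MAX_INT)
# 'numeric'       => 'real'
# 'array', 'hash' => 'json' (now prints a warning first)
# 'date'          => kept as-is                    (new in 0.10.0)
# anything else   => kept as-is, now with an error message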
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -11,7 +11,7 @@ module Dataflow
      include Dataflow::SchemaMixin
 
      event :computing_started # handler(node)
-      event :computing_progressed # handler(node, pct_complete)
+      event :computing_progressed # handler(node, pct_complete:)
      event :computing_finished # handler(node, state)
 
      delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
@@ -31,7 +31,7 @@ module Dataflow
        @data_node_opts || {}
      end
 
-      # DSL to be used while making
+      # DSL to be used while making compute nodes. It supports enforcing validations
      # by checking whether there is exactly, at_least (min) or at_most (max)
      # a given number of dependencies. Usage:
      # class MyComputeNode < ComputeNode
@@ -83,6 +83,10 @@ module Dataflow
      # Indicates the last time a successful computation has started.
      field :last_compute_starting_time, type: Time, editable: false
 
+      # The last time an heartbeat was received.
+      # Useful to detect stale computation that need to be reaped.
+      field :last_heartbeat_time, type: Time, editable: false
+
      # Necessary fields:
      validates_presence_of :name
 
@@ -217,10 +221,17 @@ module Dataflow
        on_computing_started
        start_time = Time.now
 
+        if data_node.present? && clear_data_on_compute != data_node.use_double_buffering
+          # make sure the data node has a compatible settings
+          data_node.use_double_buffering = clear_data_on_compute
+          data_node.save
+        end
+
+        pre_compute(force_compute: force_compute)
+
        # update this node's schema with the necessary fields
        data_node&.update_schema(required_schema)
 
-        pre_compute(force_compute: force_compute)
 
        if clear_data_on_compute
          # Pre-compute, we recreate the table, the unique indexes
@@ -228,6 +239,7 @@ module Dataflow
          data_node&.create_unique_indexes(dataset_type: :write)
        end
 
+        send_heartbeat
        compute_impl
 
        if clear_data_on_compute
@@ -300,6 +312,7 @@ module Dataflow
      end
 
      def process_parallel(node:)
+        return if node.blank?
        record_count = node.count
        return if record_count == 0
 
@@ -311,6 +324,7 @@ module Dataflow
        queries = node.ordered_system_id_queries(batch_size: count_per_process)
 
        parallel_each(queries.each_with_index) do |query, idx|
+          send_heartbeat
          progress = (idx / queries.count.to_f * 100).ceil
          on_computing_progressed(pct_complete: progress)
 
@@ -379,6 +393,12 @@ module Dataflow
        schema
      end
 
+      def send_heartbeat
+        update_query = { '$set' => { last_heartbeat_time: Time.now } }
+        Dataflow::Nodes::ComputeNode.where(_id: _id)
+                                    .find_one_and_update(update_query)
+      end
+
      ##############################
      # Dependency validations
      ##############################
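send_heartbeat stamps last_heartbeat_time with a direct find_one_and_update, so a long-running computation keeps signalling liveness before compute_impl and inside each parallel batch. A minimal sketch of how an external monitor might use that field; the reap_stale_computations helper and the 10-minute threshold are illustrative assumptions, not part of the gem:

require 'dataflow-rb'

# Find compute nodes whose heartbeat is older than `threshold` seconds.
# Assumes Mongoid-style queries on Dataflow::Nodes::ComputeNode.
def reap_stale_computations(threshold: 10 * 60)
  cutoff = Time.now - threshold
  Dataflow::Nodes::ComputeNode.where(:last_heartbeat_time.lt => cutoff).each do |node|
    # What "reaping" means is up to the caller (alerting, clearing a lock, ...);
    # this sketch only reports the stale node.
    puts "Stale compute node: #{node.name} (last heartbeat: #{node.last_heartbeat_time})"
  end
end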
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -76,6 +76,9 @@ module Dataflow
      # Use the schema as the inferred schema if none is provided.
      # This useful when there is no need to infer schemas (e.g. in SQL)
      self.inferred_schema ||= schema
+
+      # This is needed for the flow to compute properly
+      self.updated_at = Time.now
    end
 
    # Callback: after creation make sure the underlying dataset matches this node's properties.
@@ -95,6 +98,10 @@ module Dataflow
    def handle_dataset_settings_changed
      db_adapter.update_settings(data_node: self)
 
+      # if we're using double buffering, just wait for the next buffer
+      # to be created to apply the changes.
+      return if use_double_buffering
+
      # recreate the dataset if there is no data
      if db_adapter.count.zero?
        db_adapter.recreate_dataset(dataset: read_dataset_name)
@@ -257,10 +264,14 @@ module Dataflow
      add(records: records)
    end
 
-    def export(connection_opts: { db_backend: :csv }, keys:
+    def export(connection_opts: { db_backend: :csv }, keys: [], where: {})
      on_export_started(connection_opts: connection_opts, keys: keys)
      # instanciate and export without saving anything
-      Export::ToCsvNode.new(
+      Export::ToCsvNode.new(
+        dependency_ids: [self],
+        query: where.to_json,
+        keys: keys
+      ).compute_impl
      on_export_finished
    end
 
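With this change, DataNode#export takes an optional keys whitelist and a where filter and runs an Export::ToCsvNode without persisting it. A usage sketch; the node variable, column names and filter values below are illustrative, only the method signature comes from the diff:

# `node` is assumed to be an existing Dataflow::Nodes::DataNode.
# Export only two columns of the matching rows to CSV:
node.export(keys: %w(id updated_at), where: { 'status' => 'active' })

# Export everything; keys defaults to [] and the schema is then inferred:
node.export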
data/lib/dataflow/nodes/export/to_csv_node.rb
CHANGED
@@ -9,17 +9,19 @@ module Dataflow
      # A JSON encoded query to pass along.
      field :query, type: String, default: {}.to_json
 
+      # Which fields to export
+      field :keys, type: Array, default: []
+
      def compute_impl
        node = dependencies.first
        where = JSON.parse(query)
 
        # fetch the schema
-        sch =
-
-
-
-
-        end
+        sch = if keys.present?
+                keys.map { |k| [k, { type: 'string' }] }.to_h
+              else
+                node.infer_partial_schema(where: where, extended: true)
+              end
 
        # create the dataset
        csv_adapter = Adapters::CsvAdapter.new(data_node: node)
@@ -40,8 +42,7 @@ module Dataflow
        # TODO: re-enabled event on_export_progressed
        # progress = (idx / queries.count.to_f * 100).ceil
        # on_export_progressed(pct_complete: progress)
-
-        batch = node.all(where: query.merge(where))
+        batch = node.all(where: query.merge(where), fields: sch.keys)
        csv_adapter.save(records: batch)
      end
 
data/lib/dataflow/nodes/join_node.rb
CHANGED
@@ -11,6 +11,10 @@ module Dataflow
      # other_keys_1 and 2 must match in length
      field :other_keys1, type: Array, default: []
      field :other_keys2, type: Array, default: []
+      # Which keys to select on each dataset
+      field :select_keys1, type: Array, default: []
+      field :select_keys2, type: Array, default: []
+
      # How to prefix each key
      field :prefix1, type: String, default: ''
      field :prefix2, type: String, default: ''
@@ -30,8 +34,13 @@ module Dataflow
      return {} unless dependencies.count == 2
 
      # merge both dependencies schemas
-
-
+      sch1 = dependencies.first.schema || {}
+      sch1 = sch1.select { |k,v| select_keys1.include?(k) } if select_keys1.present?
+      sch2 = dependencies.second.schema || {}
+      sch2 = sch2.select { |k,v| select_keys2.include?(k) } if select_keys2.present?
+      sch = sch1.merge(sch2)
+
+      sch
    end
 
    def compute_impl
@@ -53,17 +62,35 @@ module Dataflow
    private
 
    def sql_join_query
-
-
-
+      d0_keys = dataset_keys(idx: 0)
+      # only select the remaining keys as we don't support yet prefixing fields
+      d1_keys = dataset_keys(idx: 1) - d0_keys
+      insert_keys = d0_keys + d1_keys
+      select_keys = d0_keys.map { |x| "d0.#{x}" } + d1_keys.map { |x| "d1.#{x}" }
+      query = "INSERT INTO #{write_dataset_name} (#{insert_keys.join(',')})
              SELECT #{select_keys.join(', ')}
-              FROM #{dependencies[0].read_dataset_name} as
-
-              ON
+              FROM #{dependencies[0].read_dataset_name} as d0
+              #{join_type.upcase} JOIN #{dependencies[1].read_dataset_name} as d1
+              ON d0.#{key1} = d1.#{key2}"
+
+      if has_multiple_keys?
+        join_keys = other_keys1.each_with_index.map { |k, idx| "d0.#{k} = d1.#{other_keys2[idx]}" }
+        query = "#{query}
+        AND #{join_keys.join("\nAND ")}"
+      end
+
+      query
+    end
+
+    def dataset_keys(idx:)
+      keys = send("select_keys#{idx + 1}")
+      keys = dependencies[idx].schema.keys if keys.blank?
+      keys
    end
 
    def execute_sql_join
      query = sql_join_query
+      logger.log(query)
      # TODO: work on a better way to interface this
      sql_adapter = data_node.send(:db_adapter)
      sql_adapter.client[query].to_a
@@ -105,6 +132,9 @@ module Dataflow
      end
 
      # for each datum in dataset1, find the corresponding datum in dataset2
+      select_keys_set1 = select_keys1.to_set
+      select_keys_set2 = select_keys2.to_set
+
      n1_records.map do |d1|
        join_value = d1.dig(*tokens_key1)
        next if join_value.nil?
@@ -122,6 +152,10 @@ module Dataflow
        # there might be the case that nothing was found after-all
        d2 ||= {}
 
+        # only keep the needed keys
+        d1 = d1.select { |k| select_keys_set1.include?(k) } if select_keys_set1.present?
+        d2 = d2.select { |k| select_keys_set2.include?(k) } if select_keys_set2.present?
+
        # prefix if needed
        d1 = Hash[d1.map { |k, v| ["#{prefix1}#{k}", v] }] if prefix1.present?
        d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }] if prefix2.present?
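Taken together, the join node changes add multi-key joins on the Postgres implementation and key selection (select_keys1/select_keys2) on both the SQL and the in-memory join. A configuration sketch; the node names, key values and join_type value are illustrative assumptions, not taken from the gem's docs:

# Illustrative only: `orders_node` and `users_node` are assumed to be existing data nodes.
# (A target data node is also required in practice; omitted here for brevity.)
join = Dataflow::Nodes::JoinNode.create(
  name: 'orders_with_users',
  dependency_ids: [orders_node, users_node],
  key1: 'user_id',                  # joined as d0.user_id = d1.id
  key2: 'id',
  select_keys1: %w(order_id user_id total),
  select_keys2: %w(email),
  join_type: 'inner'
)
# On a SQL backend, sql_join_query would now generate roughly:
#   INSERT INTO <write_dataset> (order_id,user_id,total,email)
#   SELECT d0.order_id, d0.user_id, d0.total, d1.email
#   FROM <orders_dataset> as d0
#   INNER JOIN <users_dataset> as d1
#   ON d0.user_id = d1.id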
data/lib/dataflow/nodes/select_keys_node.rb
CHANGED
@@ -8,8 +8,8 @@ module Dataflow
      ensure_data_node_exists
      ensure_dependencies exactly: 1
 
-      def export
-
+      def export
+        data_node.export(keys: keys)
      end
 
      private
data/lib/dataflow/schema_mixin.rb
CHANGED
@@ -11,17 +11,26 @@ module Dataflow
    # @return [Hash] with one entry per 'column'/'field'. The values
    # contains information about the type and usage.
    def infer_schema(samples_count: 0, extended: false)
+      if db_backend == :postgresql
+        # Experimental
+        sch = db_adapter.client.schema(name).to_h
+        sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
+        self.inferred_schema = sch
+        save
+        return sch
+      end
+
      data_count = samples_count == 0 ? count : samples_count # invoked in the base class
      return {} if data_count == 0
 
      # find out how many batches are needed
      max_per_process = 1000
-      max_per_process = limit_per_process if respond_to?
+      max_per_process = limit_per_process if respond_to?(:limit_per_process) && limit_per_process > 0
 
      equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
      count_per_process = [max_per_process, equal_split_per_process].min
 
-      queries = ordered_system_id_queries(batch_size: count_per_process)
+      queries = ordered_system_id_queries(batch_size: count_per_process)[0...data_count]
 
      self.inferred_schema_at = Time.now
      self.inferred_schema_from = samples_count
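In the experimental Postgres branch above, the schema comes straight from Sequel's Database#schema rather than from sampling records. For a table with, say, id, email and created_at columns, the stored inferred_schema would look roughly like the hash below; this is an illustration, and the exact type strings depend on Sequel's type mapping:

# db_adapter.client.schema(name) yields pairs like [:email, { type: :string, ... }];
# after the reject/map above they are reduced to a plain hash keyed by column name:
{
  id:         { type: 'integer' },
  email:      { type: 'string' },
  created_at: { type: 'datetime' }
}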
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.10.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler