dataflow-rb 0.9.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -1
- data/lib/dataflow/adapters/sql_adapter.rb +22 -14
- data/lib/dataflow/nodes/compute_node.rb +23 -3
- data/lib/dataflow/nodes/data_node.rb +13 -2
- data/lib/dataflow/nodes/export/to_csv_node.rb +9 -8
- data/lib/dataflow/nodes/join_node.rb +42 -8
- data/lib/dataflow/nodes/select_keys_node.rb +2 -2
- data/lib/dataflow/schema_mixin.rb +11 -2
- data/lib/dataflow/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 5ccef7eb0d8bf531e4e19c62562763dff31f85f4
+  data.tar.gz: 9c5e496748df17dcc3fd41ea90c4c3b5dd4df43a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2c62ef0c24df46a551d5a90bd8d7cf4b6b4ece57bbfd3dad720c7399b4b325ae949d10f3f644403413f2534f1ae08ae797dcaf8d925bea28d2da8b23c7ca0b3a
+  data.tar.gz: 95f0b3764d07d39cb15ef06a5fcea00e71942d02560b0b4e3475adf3a83b8b70d719f42d260decb496ad44668cdc3947fc32567b84b7eb0fd5a35b0e356e97a0
data/CHANGELOG.md
CHANGED
@@ -1,6 +1,19 @@
 # Changelog
 
-
+#### 0.10.0
+- [2f6284c] Allow the pre-compute to modify the necessary schema
+- [cec8a1d] Do not crash if process_parallel is called without dependencies.
+- [83e1bb5] Various fixes related to the csv export feature
+- [61e74d7] Force the data node to use double buffering when necessary.
+- [553b1ea] Fix documentation
+- [be21031] Added an heartbeat to the compute node
+- [78308c0] Added tests to the join node. Add multi key support on Postgresql impl and select_keys support on software join.
+- [090c81f] Experimental: fetch the schema directly from the DB.
+- [46a7915] Fix: use the samples count when inferring a schema
+- [dcd7750] Add support for selecting which keys to include in a join.
+- [9005b6c] Set a default updated_at when creating a data node. Do not change the dataset immediately if we're using double buffering. Wait for the next buffer to be created instead.
+- [d98d9c1] Do not crash if an index cannot be added. Use the logger instead of the stdout for the sql adapter.
+- [cc40642] Catch DatabaseError.
 
 #### 0.9.2
 - [2f3129c] Fix bug when joining datasets directly in SQL
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED
@@ -72,15 +72,11 @@ module Dataflow
       def initialize(args)
         update_settings(args)
         @client = SqlAdapter.client(settings)
-        @schema = settings.schema || [] # TODO: detect if the table schema has a mis-match
       end
 
       def update_settings(args)
         @settings = Dataflow::Adapters::Settings.new(args)
-
-
-      def set_schema(schema)
-        @schema = schema
+        @schema = @settings.schema
       end
 
       # retrieve a single element from a data node
@@ -167,12 +163,6 @@ module Dataflow
       def recreate_dataset(dataset: nil)
         dataset ||= settings.write_dataset_name.to_sym
         client.drop_table?(dataset)
-
-        unless @schema.present?
-          p 'WARNING: recreate dataset aborted: no schema'
-          return
-        end
-
         create_table(dataset, @schema)
       end
 
@@ -201,7 +191,16 @@ module Dataflow
           client.add_index(dataset, *params)
         rescue Sequel::DatabaseError => e
           # ignore index already exists
-
+          next if e.wrapped_exception.is_a?(PG::DuplicateTable)
+
+          # log columns not found but do not raise an error
+          if e.wrapped_exception.is_a?(PG::UndefinedColumn)
+            logger.log("[Error] add_index on #{dataset} failed. #{e}")
+            next
+          end
+
+          # re-raise for everything else
+          raise e
         end
       end
     end
@@ -236,6 +235,8 @@ module Dataflow
         end
       when 'time'
        col_type = 'timestamp'
+      when 'datetime'
+        col_type = 'timestamp with time zone'
       when 'integer'
        max_size ||= MAX_INT + 1
        col_type = if max_size <= MAX_INT
@@ -246,10 +247,13 @@ module Dataflow
       when 'numeric'
        col_type = 'real'
       when 'array', 'hash'
-
+        puts "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
        col_type = 'json'
+      when 'date', 'time'
+        # keep as-is
+        col_type = type
       else
-
+        puts "[Error] unexpected type '#{type}'. Keeping as-is."
        col_type = type
       end
 
@@ -317,6 +321,10 @@ module Dataflow
           index
         end.compact
       end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
+      end
     end
   end
 end
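The column-type mapping above gains an explicit 'datetime' case (mapped to 'timestamp with time zone') and keeps unknown types as-is while logging. Below is a minimal, standalone sketch of that mapping. The method name sql_type_for and its inputs are illustrative and not part of the gem; the real logic lives inside the adapter's table-creation code and also handles integer sizing.

    # Illustrative sketch only: mirrors the case/when added in this version,
    # ignoring the integer max_size handling done by the real adapter.
    def sql_type_for(type)
      case type
      when 'time'          then 'timestamp'
      when 'datetime'      then 'timestamp with time zone' # new in 0.10.0
      when 'numeric'       then 'real'
      when 'array', 'hash' then 'json'
      when 'date'          then type # kept as-is
      else type                      # unexpected types are kept as-is (and logged)
      end
    end

    puts sql_type_for('datetime') # => "timestamp with time zone"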
data/lib/dataflow/nodes/compute_node.rb
CHANGED
@@ -11,7 +11,7 @@ module Dataflow
     include Dataflow::SchemaMixin
 
     event :computing_started # handler(node)
-    event :computing_progressed # handler(node, pct_complete)
+    event :computing_progressed # handler(node, pct_complete:)
     event :computing_finished # handler(node, state)
 
     delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
@@ -31,7 +31,7 @@ module Dataflow
       @data_node_opts || {}
     end
 
-    # DSL to be used while making
+    # DSL to be used while making computeqd nodes. It supports enforcing validations
     # by checking whether there is exactly, at_least (min) or at_most (max)
     # a given number of dependencies. Usage:
     # class MyComputeNode < ComputeNode
@@ -83,6 +83,10 @@ module Dataflow
     # Indicates the last time a successful computation has started.
     field :last_compute_starting_time, type: Time, editable: false
 
+    # The last time an heartbeat was received.
+    # Useful to detect stale computation that need to be reaped.
+    field :last_heartbeat_time, type: Time, editable: false
+
     # Necessary fields:
     validates_presence_of :name
 
@@ -217,10 +221,17 @@ module Dataflow
       on_computing_started
       start_time = Time.now
 
+      if data_node.present? && clear_data_on_compute != data_node.use_double_buffering
+        # make sure the data node has a compatible settings
+        data_node.use_double_buffering = clear_data_on_compute
+        data_node.save
+      end
+
+      pre_compute(force_compute: force_compute)
+
       # update this node's schema with the necessary fields
       data_node&.update_schema(required_schema)
 
-      pre_compute(force_compute: force_compute)
 
       if clear_data_on_compute
         # Pre-compute, we recreate the table, the unique indexes
@@ -228,6 +239,7 @@ module Dataflow
         data_node&.create_unique_indexes(dataset_type: :write)
       end
 
+      send_heartbeat
       compute_impl
 
       if clear_data_on_compute
@@ -300,6 +312,7 @@ module Dataflow
     end
 
     def process_parallel(node:)
+      return if node.blank?
       record_count = node.count
       return if record_count == 0
 
@@ -311,6 +324,7 @@ module Dataflow
       queries = node.ordered_system_id_queries(batch_size: count_per_process)
 
       parallel_each(queries.each_with_index) do |query, idx|
+        send_heartbeat
         progress = (idx / queries.count.to_f * 100).ceil
         on_computing_progressed(pct_complete: progress)
 
@@ -379,6 +393,12 @@ module Dataflow
       schema
     end
 
+    def send_heartbeat
+      update_query = { '$set' => { last_heartbeat_time: Time.now } }
+      Dataflow::Nodes::ComputeNode.where(_id: _id)
+                                  .find_one_and_update(update_query)
+    end
+
     ##############################
     # Dependency validations
     ##############################
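The new last_heartbeat_time field and the send_heartbeat calls above record liveness right before compute_impl and inside each parallel batch. The gem itself only records the timestamp; the following is a purely hypothetical, standalone sketch of how a caller might use that timestamp to decide whether a computation looks stale. The threshold and helper name are invented for illustration.

    # Hypothetical helper (not part of dataflow-rb): decide whether a compute
    # node looks stale based on the last_heartbeat_time it recorded.
    STALE_AFTER_SECONDS = 15 * 60 # arbitrary threshold for this sketch

    def stale_computation?(last_heartbeat_time, now: Time.now)
      last_heartbeat_time.nil? || (now - last_heartbeat_time) > STALE_AFTER_SECONDS
    end

    puts stale_computation?(Time.now - 3600) # => true
    puts stale_computation?(Time.now - 60)   # => false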
data/lib/dataflow/nodes/data_node.rb
CHANGED
@@ -76,6 +76,9 @@ module Dataflow
       # Use the schema as the inferred schema if none is provided.
       # This useful when there is no need to infer schemas (e.g. in SQL)
       self.inferred_schema ||= schema
+
+      # This is needed for the flow to compute properly
+      self.updated_at = Time.now
     end
 
     # Callback: after creation make sure the underlying dataset matches this node's properties.
@@ -95,6 +98,10 @@ module Dataflow
     def handle_dataset_settings_changed
       db_adapter.update_settings(data_node: self)
 
+      # if we're using double buffering, just wait for the next buffer
+      # to be created to apply the changes.
+      return if use_double_buffering
+
       # recreate the dataset if there is no data
       if db_adapter.count.zero?
         db_adapter.recreate_dataset(dataset: read_dataset_name)
@@ -257,10 +264,14 @@ module Dataflow
       add(records: records)
     end
 
-    def export(connection_opts: { db_backend: :csv }, keys:
+    def export(connection_opts: { db_backend: :csv }, keys: [], where: {})
       on_export_started(connection_opts: connection_opts, keys: keys)
       # instanciate and export without saving anything
-      Export::ToCsvNode.new(
+      Export::ToCsvNode.new(
+        dependency_ids: [self],
+        query: where.to_json,
+        keys: keys
+      ).compute_impl
       on_export_finished
     end
 
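With the change above, DataNode#export accepts keys and where parameters and forwards them to an Export::ToCsvNode. A usage sketch, assuming dataflow-rb is installed and a populated data node already exists; the node name, key names, and query below are illustrative only.

    require 'dataflow-rb'

    # Hypothetical lookup: assumes a data node named 'my_dataset' exists.
    node = Dataflow::Nodes::DataNode.find_by(name: 'my_dataset')

    # Export only the listed fields; the `where` hash is serialized to JSON
    # and passed to the underlying Export::ToCsvNode as its query.
    node.export(keys: %w(id updated_at), where: { 'id' => { '$gte' => 100 } })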
data/lib/dataflow/nodes/export/to_csv_node.rb
CHANGED
@@ -9,17 +9,19 @@ module Dataflow
     # A JSON encoded query to pass along.
     field :query, type: String, default: {}.to_json
 
+    # Which fields to export
+    field :keys, type: Array, default: []
+
     def compute_impl
       node = dependencies.first
       where = JSON.parse(query)
 
       # fetch the schema
-      sch =
-
-
-
-
-      end
+      sch = if keys.present?
+              keys.map { |k| [k, { type: 'string' }] }.to_h
+            else
+              node.infer_partial_schema(where: where, extended: true)
+            end
 
       # create the dataset
       csv_adapter = Adapters::CsvAdapter.new(data_node: node)
@@ -40,8 +42,7 @@ module Dataflow
         # TODO: re-enabled event on_export_progressed
         # progress = (idx / queries.count.to_f * 100).ceil
         # on_export_progressed(pct_complete: progress)
-
-        batch = node.all(where: query.merge(where))
+        batch = node.all(where: query.merge(where), fields: sch.keys)
         csv_adapter.save(records: batch)
       end
 
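When keys are given, compute_impl above no longer infers a schema from the data: it builds one directly from the requested keys, typing every field as a string. A standalone illustration of that one-liner, with made-up key names:

    keys = %w(id name updated_at) # illustrative keys
    sch  = keys.map { |k| [k, { type: 'string' }] }.to_h
    puts sch.inspect
    # => {"id"=>{:type=>"string"}, "name"=>{:type=>"string"}, "updated_at"=>{:type=>"string"}}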
data/lib/dataflow/nodes/join_node.rb
CHANGED
@@ -11,6 +11,10 @@ module Dataflow
     # other_keys_1 and 2 must match in length
     field :other_keys1, type: Array, default: []
     field :other_keys2, type: Array, default: []
+    # Which keys to select on each dataset
+    field :select_keys1, type: Array, default: []
+    field :select_keys2, type: Array, default: []
+    # How to prefix each key
     field :prefix1, type: String, default: ''
     field :prefix2, type: String, default: ''
 
@@ -30,8 +34,13 @@ module Dataflow
       return {} unless dependencies.count == 2
 
       # merge both dependencies schemas
-
-
+      sch1 = dependencies.first.schema || {}
+      sch1 = sch1.select { |k,v| select_keys1.include?(k) } if select_keys1.present?
+      sch2 = dependencies.second.schema || {}
+      sch2 = sch2.select { |k,v| select_keys2.include?(k) } if select_keys2.present?
+      sch = sch1.merge(sch2)
+
+      sch
     end
 
     def compute_impl
@@ -53,17 +62,35 @@ module Dataflow
     private
 
     def sql_join_query
-
-
-
+      d0_keys = dataset_keys(idx: 0)
+      # only select the remaining keys as we don't support yet prefixing fields
+      d1_keys = dataset_keys(idx: 1) - d0_keys
+      insert_keys = d0_keys + d1_keys
+      select_keys = d0_keys.map { |x| "d0.#{x}" } + d1_keys.map { |x| "d1.#{x}" }
+      query = "INSERT INTO #{write_dataset_name} (#{insert_keys.join(',')})
              SELECT #{select_keys.join(', ')}
-             FROM #{dependencies[0].read_dataset_name} as
-
-             ON
+             FROM #{dependencies[0].read_dataset_name} as d0
+             #{join_type.upcase} JOIN #{dependencies[1].read_dataset_name} as d1
+             ON d0.#{key1} = d1.#{key2}"
+
+      if has_multiple_keys?
+        join_keys = other_keys1.each_with_index.map { |k, idx| "d0.#{k} = d1.#{other_keys2[idx]}" }
+        query = "#{query}
+                 AND #{join_keys.join("\nAND ")}"
+      end
+
+      query
+    end
+
+    def dataset_keys(idx:)
+      keys = send("select_keys#{idx + 1}")
+      keys = dependencies[idx].schema.keys if keys.blank?
+      keys
    end
 
    def execute_sql_join
      query = sql_join_query
+      logger.log(query)
      # TODO: work on a better way to interface this
      sql_adapter = data_node.send(:db_adapter)
      sql_adapter.client[query].to_a
@@ -105,6 +132,9 @@ module Dataflow
      end
 
      # for each datum in dataset1, find the corresponding datum in dataset2
+      select_keys_set1 = select_keys1.to_set
+      select_keys_set2 = select_keys2.to_set
+
      n1_records.map do |d1|
        join_value = d1.dig(*tokens_key1)
        next if join_value.nil?
@@ -122,6 +152,10 @@ module Dataflow
        # there might be the case that nothing was found after-all
        d2 ||= {}
 
+        # only keep the needed keys
+        d1 = d1.select { |k| select_keys_set1.include?(k) } if select_keys_set1.present?
+        d2 = d2.select { |k| select_keys_set2.include?(k) } if select_keys_set2.present?
+
        # prefix if needed
        d1 = Hash[d1.map { |k, v| ["#{prefix1}#{k}", v] }] if prefix1.present?
        d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }] if prefix2.present?
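sql_join_query above now restricts the inserted columns to select_keys1/select_keys2 (falling back to each dependency's schema keys) and supports joining on multiple key pairs. Here is a standalone sketch of the kind of SQL string it assembles; the table and column names are invented for illustration, while the real node derives them from its dependencies and settings.

    # Illustrative only: mimics the string construction in sql_join_query.
    write_dataset_name = 'users_with_countries'     # made-up target table
    d0_keys = %w(id name country_id)                # e.g. select_keys1
    d1_keys = %w(country_id country_name) - d0_keys # duplicates dropped (no prefixing in SQL yet)
    insert_keys = d0_keys + d1_keys
    select_keys = d0_keys.map { |x| "d0.#{x}" } + d1_keys.map { |x| "d1.#{x}" }

    query = "INSERT INTO #{write_dataset_name} (#{insert_keys.join(',')})
             SELECT #{select_keys.join(', ')}
             FROM users as d0
             INNER JOIN countries as d1
             ON d0.country_id = d1.id"
    puts query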
data/lib/dataflow/nodes/select_keys_node.rb
CHANGED
@@ -8,8 +8,8 @@ module Dataflow
      ensure_data_node_exists
      ensure_dependencies exactly: 1
 
-      def export
-
+      def export
+        data_node.export(keys: keys)
      end
 
      private
data/lib/dataflow/schema_mixin.rb
CHANGED
@@ -11,17 +11,26 @@ module Dataflow
    # @return [Hash] with one entry per 'column'/'field'. The values
    # contains information about the type and usage.
    def infer_schema(samples_count: 0, extended: false)
+      if db_backend == :postgresql
+        # Experimental
+        sch = db_adapter.client.schema(name).to_h
+        sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
+        self.inferred_schema = sch
+        save
+        return sch
+      end
+
      data_count = samples_count == 0 ? count : samples_count # invoked in the base class
      return {} if data_count == 0
 
      # find out how many batches are needed
      max_per_process = 1000
-      max_per_process = limit_per_process if respond_to?
+      max_per_process = limit_per_process if respond_to?(:limit_per_process) && limit_per_process > 0
 
      equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
      count_per_process = [max_per_process, equal_split_per_process].min
 
-      queries = ordered_system_id_queries(batch_size: count_per_process)
+      queries = ordered_system_id_queries(batch_size: count_per_process)[0...data_count]
 
      self.inferred_schema_at = Time.now
      self.inferred_schema_from = samples_count
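The experimental branch above asks Sequel for the table's schema instead of sampling records when the backend is PostgreSQL. Below is a standalone illustration of the reshaping it applies; `raw` mimics the [column, info] pairs returned by Sequel's Database#schema, with invented column names and types.

    # Mimics the shape of Sequel::Database#schema output (values invented).
    raw = [
      [:_id,     { type: :integer }],
      [:name,    { type: :string }],
      [:seen_at, { type: :datetime }]
    ]

    sch = raw.to_h
             .reject { |k, _v| k == :_id }
             .map { |k, v| [k, { type: v[:type].to_s }] }
             .to_h
    puts sch.inspect
    # => {:name=>{:type=>"string"}, :seen_at=>{:type=>"datetime"}}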
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.10.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler