dataflow-rb 0.9.2 → 0.10.0

This diff compares the contents of two package versions as they were published to a supported public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: da8a0cc4aa93a9a282f672e830d2ab8931e6fe58
-  data.tar.gz: a4a205460bcda2715d1e5bd16b4fe0982a0f652c
+  metadata.gz: 5ccef7eb0d8bf531e4e19c62562763dff31f85f4
+  data.tar.gz: 9c5e496748df17dcc3fd41ea90c4c3b5dd4df43a
 SHA512:
-  metadata.gz: 14cdd199d230e5048d599372798343274bc130cc906dcb4f39449bb4dd54eec89bd06047ef16560e0cedc15588d333701550bdc0ad5ba37d6511b9935d7b7d5d
-  data.tar.gz: 1d1658b28845cd78128d44e0f9acae8848117ebc304ec37f059f326faa2d22f9547c47405773becfa49f99a7071c6c11ce02b73926b14994f9c6c4f0c7643489
+  metadata.gz: 2c62ef0c24df46a551d5a90bd8d7cf4b6b4ece57bbfd3dad720c7399b4b325ae949d10f3f644403413f2534f1ae08ae797dcaf8d925bea28d2da8b23c7ca0b3a
+  data.tar.gz: 95f0b3764d07d39cb15ef06a5fcea00e71942d02560b0b4e3475adf3a83b8b70d719f42d260decb496ad44668cdc3947fc32567b84b7eb0fd5a35b0e356e97a0
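For reference, these are the digests RubyGems records for the gem's two payload files, and they can be recomputed locally. A minimal sketch (the local `.gem` path is hypothetical):

```ruby
require 'digest'
require 'rubygems/package'

# A .gem file is a tar archive whose entries include metadata.gz and data.tar.gz;
# checksums.yaml stores their SHA1/SHA512 digests. Print the SHA512 of each payload
# so it can be compared against the values listed above.
File.open('dataflow-rb-0.10.0.gem', 'rb') do |gem_file|
  Gem::Package::TarReader.new(gem_file).each do |entry|
    next unless %w[metadata.gz data.tar.gz].include?(entry.full_name)
    puts "#{entry.full_name}: #{Digest::SHA512.hexdigest(entry.read)}"
  end
end
```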
data/CHANGELOG.md CHANGED
@@ -1,6 +1,19 @@
 # Changelog
 
-
+#### 0.10.0
+- [2f6284c] Allow the pre-compute to modify the necessary schema
+- [cec8a1d] Do not crash if process_parallel is called without dependencies.
+- [83e1bb5] Various fixes related to the csv export feature
+- [61e74d7] Force the data node to use double buffering when necessary.
+- [553b1ea] Fix documentation
+- [be21031] Added an heartbeat to the compute node
+- [78308c0] Added tests to the join node. Add multi key support on Postgresql impl and select_keys support on software join.
+- [090c81f] Experimental: fetch the schema directly from the DB.
+- [46a7915] Fix: use the samples count when inferring a schema
+- [dcd7750] Add support for selecting which keys to include in a join.
+- [9005b6c] Set a default updated_at when creating a data node. Do not change the dataset immediately if we're using double buffering. Wait for the next buffer to be created instead.
+- [d98d9c1] Do not crash if an index cannot be added. Use the logger instead of the stdout for the sql adapter.
+- [cc40642] Catch DatabaseError.
 
 #### 0.9.2
 - [2f3129c] Fix bug when joining datasets directly in SQL
@@ -72,15 +72,11 @@ module Dataflow
       def initialize(args)
         update_settings(args)
         @client = SqlAdapter.client(settings)
-        @schema = settings.schema || [] # TODO: detect if the table schema has a mis-match
       end
 
       def update_settings(args)
         @settings = Dataflow::Adapters::Settings.new(args)
-      end
-
-      def set_schema(schema)
-        @schema = schema
+        @schema = @settings.schema
       end
 
       # retrieve a single element from a data node
@@ -167,12 +163,6 @@ module Dataflow
       def recreate_dataset(dataset: nil)
         dataset ||= settings.write_dataset_name.to_sym
         client.drop_table?(dataset)
-
-        unless @schema.present?
-          p 'WARNING: recreate dataset aborted: no schema'
-          return
-        end
-
         create_table(dataset, @schema)
       end
 
@@ -201,7 +191,16 @@ module Dataflow
           client.add_index(dataset, *params)
         rescue Sequel::DatabaseError => e
           # ignore index already exists
-          raise e unless e.wrapped_exception.is_a?(PG::DuplicateTable)
+          next if e.wrapped_exception.is_a?(PG::DuplicateTable)
+
+          # log columns not found but do not raise an error
+          if e.wrapped_exception.is_a?(PG::UndefinedColumn)
+            logger.log("[Error] add_index on #{dataset} failed. #{e}")
+            next
+          end
+
+          # re-raise for everything else
+          raise e
         end
       end
     end
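The reworked rescue above relies on Sequel exposing the underlying pg exception through `Sequel::DatabaseError#wrapped_exception`. A minimal sketch of that behaviour, assuming a throwaway local PostgreSQL database (connection string and table are hypothetical):

```ruby
require 'sequel'

db = Sequel.connect('postgres://localhost/dataflow_test')
db.create_table?(:events) { primary_key :id; String :name }

begin
  db.add_index(:events, :name)
  db.add_index(:events, :name) # the index already exists at this point
rescue Sequel::DatabaseError => e
  # Sequel wraps the driver-level error, so the adapter can branch on its class.
  puts e.wrapped_exception.class # => PG::DuplicateTable
end
```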
@@ -236,6 +235,8 @@ module Dataflow
           end
         when 'time'
           col_type = 'timestamp'
+        when 'datetime'
+          col_type = 'timestamp with time zone'
         when 'integer'
           max_size ||= MAX_INT + 1
           col_type = if max_size <= MAX_INT
@@ -246,10 +247,13 @@ module Dataflow
         when 'numeric'
           col_type = 'real'
         when 'array', 'hash'
-          p "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
+          puts "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
           col_type = 'json'
+        when 'date', 'time'
+          # keep as-is
+          col_type = type
         else
-          p "Error: unexpected type '#{type}'. Keeping as-is."
+          puts "[Error] unexpected type '#{type}'. Keeping as-is."
           col_type = type
         end
 
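Taken together, the two hunks above extend the mapping from dataflow schema types to PostgreSQL column types. As a standalone illustration (not the gem's API, just the relevant branches restated; integer/string sizing is omitted):

```ruby
# Map a dataflow schema type to a PostgreSQL column type.
def pg_column_type(type)
  case type
  when 'time'          then 'timestamp'
  when 'datetime'      then 'timestamp with time zone'
  when 'numeric'       then 'real'
  when 'array', 'hash' then 'json'
  else type # e.g. 'date' is kept as-is
  end
end

pg_column_type('datetime') # => "timestamp with time zone"
```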
@@ -317,6 +321,10 @@ module Dataflow
           index
         end.compact
       end
+
+      def logger
+        @logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
+      end
     end
   end
 end
@@ -11,7 +11,7 @@ module Dataflow
       include Dataflow::SchemaMixin
 
       event :computing_started # handler(node)
-      event :computing_progressed # handler(node, pct_complete)
+      event :computing_progressed # handler(node, pct_complete:)
       event :computing_finished # handler(node, state)
 
       delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
@@ -31,7 +31,7 @@ module Dataflow
         @data_node_opts || {}
       end
 
-      # DSL to be used while making computed nodes. It supports enforcing validations
+      # DSL to be used while making computeqd nodes. It supports enforcing validations
       # by checking whether there is exactly, at_least (min) or at_most (max)
       # a given number of dependencies. Usage:
       # class MyComputeNode < ComputeNode
@@ -83,6 +83,10 @@ module Dataflow
       # Indicates the last time a successful computation has started.
       field :last_compute_starting_time, type: Time, editable: false
 
+      # The last time an heartbeat was received.
+      # Useful to detect stale computation that need to be reaped.
+      field :last_heartbeat_time, type: Time, editable: false
+
       # Necessary fields:
       validates_presence_of :name
 
@@ -217,10 +221,17 @@ module Dataflow
         on_computing_started
         start_time = Time.now
 
+        if data_node.present? && clear_data_on_compute != data_node.use_double_buffering
+          # make sure the data node has a compatible settings
+          data_node.use_double_buffering = clear_data_on_compute
+          data_node.save
+        end
+
+        pre_compute(force_compute: force_compute)
+
         # update this node's schema with the necessary fields
         data_node&.update_schema(required_schema)
 
-        pre_compute(force_compute: force_compute)
 
         if clear_data_on_compute
           # Pre-compute, we recreate the table, the unique indexes
@@ -228,6 +239,7 @@ module Dataflow
           data_node&.create_unique_indexes(dataset_type: :write)
         end
 
+        send_heartbeat
         compute_impl
 
         if clear_data_on_compute
@@ -300,6 +312,7 @@ module Dataflow
       end
 
       def process_parallel(node:)
+        return if node.blank?
         record_count = node.count
         return if record_count == 0
 
@@ -311,6 +324,7 @@ module Dataflow
         queries = node.ordered_system_id_queries(batch_size: count_per_process)
 
         parallel_each(queries.each_with_index) do |query, idx|
+          send_heartbeat
           progress = (idx / queries.count.to_f * 100).ceil
           on_computing_progressed(pct_complete: progress)
 
@@ -379,6 +393,12 @@ module Dataflow
         schema
       end
 
+      def send_heartbeat
+        update_query = { '$set' => { last_heartbeat_time: Time.now } }
+        Dataflow::Nodes::ComputeNode.where(_id: _id)
+                                    .find_one_and_update(update_query)
+      end
+
       ##############################
       # Dependency validations
       ##############################
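With `send_heartbeat` persisting `last_heartbeat_time` during long computations, stale runs can be detected from outside the process. A hedged sketch of such a check (the 30-minute threshold and the maintenance-script context are illustrative, not part of the gem):

```ruby
require 'dataflow-rb'

# List compute nodes whose last heartbeat is older than an arbitrary threshold.
threshold = Time.now - 30 * 60
Dataflow::Nodes::ComputeNode.where(:last_heartbeat_time.lt => threshold).each do |node|
  puts "Possibly stale: #{node.name} (last heartbeat: #{node.last_heartbeat_time})"
end
```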
@@ -76,6 +76,9 @@ module Dataflow
         # Use the schema as the inferred schema if none is provided.
         # This useful when there is no need to infer schemas (e.g. in SQL)
         self.inferred_schema ||= schema
+
+        # This is needed for the flow to compute properly
+        self.updated_at = Time.now
       end
 
       # Callback: after creation make sure the underlying dataset matches this node's properties.
@@ -95,6 +98,10 @@ module Dataflow
       def handle_dataset_settings_changed
         db_adapter.update_settings(data_node: self)
 
+        # if we're using double buffering, just wait for the next buffer
+        # to be created to apply the changes.
+        return if use_double_buffering
+
         # recreate the dataset if there is no data
         if db_adapter.count.zero?
           db_adapter.recreate_dataset(dataset: read_dataset_name)
@@ -257,10 +264,14 @@ module Dataflow
         add(records: records)
       end
 
-      def export(connection_opts: { db_backend: :csv }, keys: nil, where: {})
+      def export(connection_opts: { db_backend: :csv }, keys: [], where: {})
         on_export_started(connection_opts: connection_opts, keys: keys)
         # instanciate and export without saving anything
-        Export::ToCsvNode.new(dependency_ids: [self], query: where.to_json).compute_impl
+        Export::ToCsvNode.new(
+          dependency_ids: [self],
+          query: where.to_json,
+          keys: keys
+        ).compute_impl
         on_export_finished
       end
 
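A hedged usage sketch of the updated `export` signature, where `keys` now defaults to `[]` and is forwarded to the CSV export node (the node name, key list and filter below are hypothetical):

```ruby
require 'dataflow-rb'

# Export a subset of fields from a data node to CSV, filtered by a query.
node = Dataflow::Nodes::DataNode.find_by(name: 'clients')
node.export(keys: %w[id name created_at], where: { 'active' => true })
```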
@@ -9,17 +9,19 @@ module Dataflow
         # A JSON encoded query to pass along.
         field :query, type: String, default: {}.to_json
 
+        # Which fields to export
+        field :keys, type: Array, default: []
+
         def compute_impl
           node = dependencies.first
           where = JSON.parse(query)
 
           # fetch the schema
-          sch = node.infer_partial_schema(where: where, extended: true)
-
-          # re-order the schema if needed
-          if node.respond_to? :keys
-            sch = node.keys.map { |k| [k, sch[k]] }.to_h if keys.present?
-          end
+          sch = if keys.present?
+                  keys.map { |k| [k, { type: 'string' }] }.to_h
+                else
+                  node.infer_partial_schema(where: where, extended: true)
+                end
 
           # create the dataset
           csv_adapter = Adapters::CsvAdapter.new(data_node: node)
@@ -40,8 +42,7 @@ module Dataflow
             # TODO: re-enabled event on_export_progressed
             # progress = (idx / queries.count.to_f * 100).ceil
             # on_export_progressed(pct_complete: progress)
-
-            batch = node.all(where: query.merge(where))
+            batch = node.all(where: query.merge(where), fields: sch.keys)
             csv_adapter.save(records: batch)
           end
 
@@ -11,6 +11,10 @@ module Dataflow
       # other_keys_1 and 2 must match in length
       field :other_keys1, type: Array, default: []
       field :other_keys2, type: Array, default: []
+      # Which keys to select on each dataset
+      field :select_keys1, type: Array, default: []
+      field :select_keys2, type: Array, default: []
+      # How to prefix each key
       field :prefix1, type: String, default: ''
       field :prefix2, type: String, default: ''
 
@@ -30,8 +34,13 @@ module Dataflow
         return {} unless dependencies.count == 2
 
         # merge both dependencies schemas
-        sch = dependencies.first.schema || {}
-        sch.merge(dependencies.second.schema || {})
+        sch1 = dependencies.first.schema || {}
+        sch1 = sch1.select { |k,v| select_keys1.include?(k) } if select_keys1.present?
+        sch2 = dependencies.second.schema || {}
+        sch2 = sch2.select { |k,v| select_keys2.include?(k) } if select_keys2.present?
+        sch = sch1.merge(sch2)
+
+        sch
       end
 
       def compute_impl
@@ -53,17 +62,35 @@ module Dataflow
       private
 
       def sql_join_query
-        fields = required_schema.keys
-        select_keys = dependencies[0].schema.keys.map { |x| "d1.#{x}" } + (dependencies[1].schema.keys - dependencies[0].schema.keys).map { |x| "d2.#{x}" }
-        query = "INSERT INTO #{write_dataset_name} (#{fields.join(',')})
+        d0_keys = dataset_keys(idx: 0)
+        # only select the remaining keys as we don't support yet prefixing fields
+        d1_keys = dataset_keys(idx: 1) - d0_keys
+        insert_keys = d0_keys + d1_keys
+        select_keys = d0_keys.map { |x| "d0.#{x}" } + d1_keys.map { |x| "d1.#{x}" }
+        query = "INSERT INTO #{write_dataset_name} (#{insert_keys.join(',')})
                 SELECT #{select_keys.join(', ')}
-                FROM #{dependencies[0].read_dataset_name} as d1
-                INNER JOIN #{dependencies[1].read_dataset_name} as d2
-                ON d1.#{key1} = d2.#{key2}"
+                FROM #{dependencies[0].read_dataset_name} as d0
+                #{join_type.upcase} JOIN #{dependencies[1].read_dataset_name} as d1
+                ON d0.#{key1} = d1.#{key2}"
+
+        if has_multiple_keys?
+          join_keys = other_keys1.each_with_index.map { |k, idx| "d0.#{k} = d1.#{other_keys2[idx]}" }
+          query = "#{query}
+          AND #{join_keys.join("\nAND ")}"
+        end
+
+        query
+      end
+
+      def dataset_keys(idx:)
+        keys = send("select_keys#{idx + 1}")
+        keys = dependencies[idx].schema.keys if keys.blank?
+        keys
       end
 
       def execute_sql_join
         query = sql_join_query
+        logger.log(query)
         # TODO: work on a better way to interface this
         sql_adapter = data_node.send(:db_adapter)
         sql_adapter.client[query].to_a
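For illustration, with hypothetical settings of `key1 = 'id'`, `key2 = 'client_id'`, `join_type = 'inner'`, `select_keys1 = ['id', 'name']`, `select_keys2 = ['client_id', 'total']`, and datasets named `clients` and `orders` writing into `clients_orders`, `sql_join_query` would produce a statement along these lines:

```ruby
# Roughly the generated query for the hypothetical settings above: d1_keys drops
# any key already selected from d0, and extra ON clauses are appended only when
# other_keys1/other_keys2 are set.
query = <<-SQL
  INSERT INTO clients_orders (id,name,client_id,total)
  SELECT d0.id, d0.name, d1.client_id, d1.total
  FROM clients as d0
  INNER JOIN orders as d1
  ON d0.id = d1.client_id
SQL
```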
@@ -105,6 +132,9 @@ module Dataflow
         end
 
         # for each datum in dataset1, find the corresponding datum in dataset2
+        select_keys_set1 = select_keys1.to_set
+        select_keys_set2 = select_keys2.to_set
+
         n1_records.map do |d1|
           join_value = d1.dig(*tokens_key1)
           next if join_value.nil?
@@ -122,6 +152,10 @@ module Dataflow
           # there might be the case that nothing was found after-all
           d2 ||= {}
 
+          # only keep the needed keys
+          d1 = d1.select { |k| select_keys_set1.include?(k) } if select_keys_set1.present?
+          d2 = d2.select { |k| select_keys_set2.include?(k) } if select_keys_set2.present?
+
           # prefix if needed
           d1 = Hash[d1.map { |k, v| ["#{prefix1}#{k}", v] }] if prefix1.present?
           d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }] if prefix2.present?
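A worked illustration of the in-memory (software) join filtering and prefixing above, with hypothetical records and settings (the final merge shows what the combined joined record would look like):

```ruby
require 'set'

d1 = { 'id' => 1, 'name' => 'ACME', 'internal_notes' => 'skip me' }
d2 = { 'client_id' => 1, 'total' => 42, 'raw_payload' => 'skip me too' }

select_keys_set1 = ['id', 'name'].to_set
select_keys_set2 = ['total'].to_set
prefix2 = 'order_'

d1 = d1.select { |k| select_keys_set1.include?(k) } # => {"id"=>1, "name"=>"ACME"}
d2 = d2.select { |k| select_keys_set2.include?(k) } # => {"total"=>42}
d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }]  # => {"order_total"=>42}

d1.merge(d2) # => {"id"=>1, "name"=>"ACME", "order_total"=>42}
```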
@@ -8,8 +8,8 @@ module Dataflow
       ensure_data_node_exists
       ensure_dependencies exactly: 1
 
-      def export(connection_opts: { db_backend: :csv }, keys: nil)
-        super(connection_opts: connection_opts, keys: keys || self.keys)
+      def export
+        data_node.export(keys: keys)
       end
 
       private
@@ -11,17 +11,26 @@ module Dataflow
     # @return [Hash] with one entry per 'column'/'field'. The values
     # contains information about the type and usage.
     def infer_schema(samples_count: 0, extended: false)
+      if db_backend == :postgresql
+        # Experimental
+        sch = db_adapter.client.schema(name).to_h
+        sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
+        self.inferred_schema = sch
+        save
+        return sch
+      end
+
       data_count = samples_count == 0 ? count : samples_count # invoked in the base class
       return {} if data_count == 0
 
       # find out how many batches are needed
       max_per_process = 1000
-      max_per_process = limit_per_process if respond_to? :limit_per_process
+      max_per_process = limit_per_process if respond_to?(:limit_per_process) && limit_per_process > 0
 
       equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
       count_per_process = [max_per_process, equal_split_per_process].min
 
-      queries = ordered_system_id_queries(batch_size: count_per_process)
+      queries = ordered_system_id_queries(batch_size: count_per_process)[0...data_count]
 
       self.inferred_schema_at = Time.now
       self.inferred_schema_from = samples_count
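The experimental branch above leans on Sequel's `Database#schema`, which returns an array of `[column_name, info]` pairs; the mixin keeps only the `:type` entry, as a string. A sketch of the transformation, assuming a local PostgreSQL database and a table named `clients`:

```ruby
require 'sequel'

db = Sequel.connect('postgres://localhost/dataflow_test')

# Database#schema returns e.g.
# [[:_id,  { type: :integer, db_type: "integer", ... }],
#  [:name, { type: :string,  db_type: "text",    ... }]]
raw = db.schema(:clients)

inferred = raw.to_h
              .reject { |k, _| k == :_id }
              .map    { |k, v| [k, { type: v[:type].to_s }] }
              .to_h
puts inferred # => { name: { type: "string" }, ... }
```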
@@ -1,4 +1,4 @@
 # frozen_string_literal: true
 module Dataflow
-  VERSION = '0.9.2'
+  VERSION = '0.10.0'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.9.2
+  version: 0.10.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-02-14 00:00:00.000000000 Z
+date: 2017-03-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler