dataflow-rb 0.9.2 → 0.10.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: da8a0cc4aa93a9a282f672e830d2ab8931e6fe58
- data.tar.gz: a4a205460bcda2715d1e5bd16b4fe0982a0f652c
+ metadata.gz: 5ccef7eb0d8bf531e4e19c62562763dff31f85f4
+ data.tar.gz: 9c5e496748df17dcc3fd41ea90c4c3b5dd4df43a
  SHA512:
- metadata.gz: 14cdd199d230e5048d599372798343274bc130cc906dcb4f39449bb4dd54eec89bd06047ef16560e0cedc15588d333701550bdc0ad5ba37d6511b9935d7b7d5d
- data.tar.gz: 1d1658b28845cd78128d44e0f9acae8848117ebc304ec37f059f326faa2d22f9547c47405773becfa49f99a7071c6c11ce02b73926b14994f9c6c4f0c7643489
+ metadata.gz: 2c62ef0c24df46a551d5a90bd8d7cf4b6b4ece57bbfd3dad720c7399b4b325ae949d10f3f644403413f2534f1ae08ae797dcaf8d925bea28d2da8b23c7ca0b3a
+ data.tar.gz: 95f0b3764d07d39cb15ef06a5fcea00e71942d02560b0b4e3475adf3a83b8b70d719f42d260decb496ad44668cdc3947fc32567b84b7eb0fd5a35b0e356e97a0
data/CHANGELOG.md CHANGED
@@ -1,6 +1,19 @@
  # Changelog

-
+ #### 0.10.0
+ - [2f6284c] Allow the pre-compute to modify the necessary schema
+ - [cec8a1d] Do not crash if process_parallel is called without dependencies.
+ - [83e1bb5] Various fixes related to the csv export feature
+ - [61e74d7] Force the data node to use double buffering when necessary.
+ - [553b1ea] Fix documentation
+ - [be21031] Added an heartbeat to the compute node
+ - [78308c0] Added tests to the join node. Add multi key support on Postgresql impl and select_keys support on software join.
+ - [090c81f] Experimental: fetch the schema directly from the DB.
+ - [46a7915] Fix: use the samples count when inferring a schema
+ - [dcd7750] Add support for selecting which keys to include in a join.
+ - [9005b6c] Set a default updated_at when creating a data node. Do not change the dataset immediately if we're using double buffering. Wait for the next buffer to be created instead.
+ - [d98d9c1] Do not crash if an index cannot be added. Use the logger instead of the stdout for the sql adapter.
+ - [cc40642] Catch DatabaseError.

  #### 0.9.2
  - [2f3129c] Fix bug when joining datasets directly in SQL
@@ -72,15 +72,11 @@ module Dataflow
  def initialize(args)
  update_settings(args)
  @client = SqlAdapter.client(settings)
- @schema = settings.schema || [] # TODO: detect if the table schema has a mis-match
  end

  def update_settings(args)
  @settings = Dataflow::Adapters::Settings.new(args)
- end
-
- def set_schema(schema)
- @schema = schema
+ @schema = @settings.schema
  end

  # retrieve a single element from a data node
@@ -167,12 +163,6 @@ module Dataflow
  def recreate_dataset(dataset: nil)
  dataset ||= settings.write_dataset_name.to_sym
  client.drop_table?(dataset)
-
- unless @schema.present?
- p 'WARNING: recreate dataset aborted: no schema'
- return
- end
-
  create_table(dataset, @schema)
  end

@@ -201,7 +191,16 @@ module Dataflow
  client.add_index(dataset, *params)
  rescue Sequel::DatabaseError => e
  # ignore index already exists
- raise e unless e.wrapped_exception.is_a?(PG::DuplicateTable)
+ next if e.wrapped_exception.is_a?(PG::DuplicateTable)
+
+ # log columns not found but do not raise an error
+ if e.wrapped_exception.is_a?(PG::UndefinedColumn)
+ logger.log("[Error] add_index on #{dataset} failed. #{e}")
+ next
+ end
+
+ # re-raise for everything else
+ raise e
  end
  end
  end
@@ -236,6 +235,8 @@ module Dataflow
  end
  when 'time'
  col_type = 'timestamp'
+ when 'datetime'
+ col_type = 'timestamp with time zone'
  when 'integer'
  max_size ||= MAX_INT + 1
  col_type = if max_size <= MAX_INT
@@ -246,10 +247,13 @@ module Dataflow
  when 'numeric'
  col_type = 'real'
  when 'array', 'hash'
- p "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
+ puts "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
  col_type = 'json'
+ when 'date', 'time'
+ # keep as-is
+ col_type = type
  else
- p "Error: unexpected type '#{type}'. Keeping as-is."
+ puts "[Error] unexpected type '#{type}'. Keeping as-is."
  col_type = type
  end

@@ -317,6 +321,10 @@ module Dataflow
  index
  end.compact
  end
+
+ def logger
+ @logger ||= Dataflow::Logger.new(prefix: "Dataflow[#{settings.dataset_name}]")
+ end
  end
  end
  end
@@ -11,7 +11,7 @@ module Dataflow
  include Dataflow::SchemaMixin

  event :computing_started # handler(node)
- event :computing_progressed # handler(node, pct_complete)
+ event :computing_progressed # handler(node, pct_complete:)
  event :computing_finished # handler(node, state)

  delegate :find, :all, :all_paginated, :count, :ordered_system_id_queries,
@@ -31,7 +31,7 @@ module Dataflow
  @data_node_opts || {}
  end

- # DSL to be used while making computed nodes. It supports enforcing validations
+ # DSL to be used while making computeqd nodes. It supports enforcing validations
  # by checking whether there is exactly, at_least (min) or at_most (max)
  # a given number of dependencies. Usage:
  # class MyComputeNode < ComputeNode
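The `Usage:` comment above is cut off by the hunk boundary. As a hedged sketch only — the `exactly:` option is confirmed by another node later in this diff, `min:`/`max:` follow the comment's wording, and everything else is illustrative — a compute node using this DSL might look like:

```ruby
# Illustrative sketch of the dependency-validation DSL described above.
class MyComputeNode < Dataflow::Nodes::ComputeNode
  ensure_data_node_exists        # a data node must be attached to receive the output
  ensure_dependencies exactly: 1 # or, per the comment: min: / max: counts

  def compute_impl
    process_parallel(node: dependencies.first)
  end
end
```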
@@ -83,6 +83,10 @@ module Dataflow
  # Indicates the last time a successful computation has started.
  field :last_compute_starting_time, type: Time, editable: false

+ # The last time an heartbeat was received.
+ # Useful to detect stale computation that need to be reaped.
+ field :last_heartbeat_time, type: Time, editable: false
+
  # Necessary fields:
  validates_presence_of :name

@@ -217,10 +221,17 @@ module Dataflow
  on_computing_started
  start_time = Time.now

+ if data_node.present? && clear_data_on_compute != data_node.use_double_buffering
+ # make sure the data node has a compatible settings
+ data_node.use_double_buffering = clear_data_on_compute
+ data_node.save
+ end
+
+ pre_compute(force_compute: force_compute)
+
  # update this node's schema with the necessary fields
  data_node&.update_schema(required_schema)

- pre_compute(force_compute: force_compute)

  if clear_data_on_compute
  # Pre-compute, we recreate the table, the unique indexes
@@ -228,6 +239,7 @@
  data_node&.create_unique_indexes(dataset_type: :write)
  end

+ send_heartbeat
  compute_impl

  if clear_data_on_compute
@@ -300,6 +312,7 @@ module Dataflow
  end

  def process_parallel(node:)
+ return if node.blank?
  record_count = node.count
  return if record_count == 0

@@ -311,6 +324,7 @@ module Dataflow
  queries = node.ordered_system_id_queries(batch_size: count_per_process)

  parallel_each(queries.each_with_index) do |query, idx|
+ send_heartbeat
  progress = (idx / queries.count.to_f * 100).ceil
  on_computing_progressed(pct_complete: progress)

@@ -379,6 +393,12 @@ module Dataflow
  schema
  end

+ def send_heartbeat
+ update_query = { '$set' => { last_heartbeat_time: Time.now } }
+ Dataflow::Nodes::ComputeNode.where(_id: _id)
+ .find_one_and_update(update_query)
+ end
+
  ##############################
  # Dependency validations
  ##############################
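The new `last_heartbeat_time` field is written by `send_heartbeat` above, once right before `compute_impl` and once per batch inside `process_parallel`, so long-running computations keep refreshing it. This diff does not ship a reaper; purely as a hedged illustration of the "stale computation" detection the field comment hints at, a monitoring script could query it along these lines (the one-hour threshold is arbitrary):

```ruby
# Illustrative sketch, not part of dataflow-rb: list compute nodes whose
# heartbeat is older than one hour and may need to be reaped.
stale_threshold = Time.now - 3600
Dataflow::Nodes::ComputeNode
  .where(:last_heartbeat_time.lt => stale_threshold)
  .each { |node| puts "stale: #{node.name} (last heartbeat #{node.last_heartbeat_time})" }
```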
@@ -76,6 +76,9 @@ module Dataflow
  # Use the schema as the inferred schema if none is provided.
  # This useful when there is no need to infer schemas (e.g. in SQL)
  self.inferred_schema ||= schema
+
+ # This is needed for the flow to compute properly
+ self.updated_at = Time.now
  end

  # Callback: after creation make sure the underlying dataset matches this node's properties.
@@ -95,6 +98,10 @@ module Dataflow
  def handle_dataset_settings_changed
  db_adapter.update_settings(data_node: self)

+ # if we're using double buffering, just wait for the next buffer
+ # to be created to apply the changes.
+ return if use_double_buffering
+
  # recreate the dataset if there is no data
  if db_adapter.count.zero?
  db_adapter.recreate_dataset(dataset: read_dataset_name)
@@ -257,10 +264,14 @@ module Dataflow
  add(records: records)
  end

- def export(connection_opts: { db_backend: :csv }, keys: nil, where: {})
+ def export(connection_opts: { db_backend: :csv }, keys: [], where: {})
  on_export_started(connection_opts: connection_opts, keys: keys)
  # instanciate and export without saving anything
- Export::ToCsvNode.new(dependency_ids: [self], query: where.to_json).compute_impl
+ Export::ToCsvNode.new(
+ dependency_ids: [self],
+ query: where.to_json,
+ keys: keys
+ ).compute_impl
  on_export_finished
  end

@@ -9,17 +9,19 @@ module Dataflow
  # A JSON encoded query to pass along.
  field :query, type: String, default: {}.to_json

+ # Which fields to export
+ field :keys, type: Array, default: []
+
  def compute_impl
  node = dependencies.first
  where = JSON.parse(query)

  # fetch the schema
- sch = node.infer_partial_schema(where: where, extended: true)
-
- # re-order the schema if needed
- if node.respond_to? :keys
- sch = node.keys.map { |k| [k, sch[k]] }.to_h if keys.present?
- end
+ sch = if keys.present?
+ keys.map { |k| [k, { type: 'string' }] }.to_h
+ else
+ node.infer_partial_schema(where: where, extended: true)
+ end

  # create the dataset
  csv_adapter = Adapters::CsvAdapter.new(data_node: node)
@@ -40,8 +42,7 @@ module Dataflow
  # TODO: re-enabled event on_export_progressed
  # progress = (idx / queries.count.to_f * 100).ceil
  # on_export_progressed(pct_complete: progress)
-
- batch = node.all(where: query.merge(where))
+ batch = node.all(where: query.merge(where), fields: sch.keys)
  csv_adapter.save(records: batch)
  end

@@ -11,6 +11,10 @@ module Dataflow
  # other_keys_1 and 2 must match in length
  field :other_keys1, type: Array, default: []
  field :other_keys2, type: Array, default: []
+ # Which keys to select on each dataset
+ field :select_keys1, type: Array, default: []
+ field :select_keys2, type: Array, default: []
+ # How to prefix each key
  field :prefix1, type: String, default: ''
  field :prefix2, type: String, default: ''

@@ -30,8 +34,13 @@ module Dataflow
  return {} unless dependencies.count == 2

  # merge both dependencies schemas
- sch = dependencies.first.schema || {}
- sch.merge(dependencies.second.schema || {})
+ sch1 = dependencies.first.schema || {}
+ sch1 = sch1.select { |k,v| select_keys1.include?(k) } if select_keys1.present?
+ sch2 = dependencies.second.schema || {}
+ sch2 = sch2.select { |k,v| select_keys2.include?(k) } if select_keys2.present?
+ sch = sch1.merge(sch2)
+
+ sch
  end

  def compute_impl
@@ -53,17 +62,35 @@ module Dataflow
  private

  def sql_join_query
- fields = required_schema.keys
- select_keys = dependencies[0].schema.keys.map { |x| "d1.#{x}" } + (dependencies[1].schema.keys - dependencies[0].schema.keys).map { |x| "d2.#{x}" }
- query = "INSERT INTO #{write_dataset_name} (#{fields.join(',')})
+ d0_keys = dataset_keys(idx: 0)
+ # only select the remaining keys as we don't support yet prefixing fields
+ d1_keys = dataset_keys(idx: 1) - d0_keys
+ insert_keys = d0_keys + d1_keys
+ select_keys = d0_keys.map { |x| "d0.#{x}" } + d1_keys.map { |x| "d1.#{x}" }
+ query = "INSERT INTO #{write_dataset_name} (#{insert_keys.join(',')})
  SELECT #{select_keys.join(', ')}
- FROM #{dependencies[0].read_dataset_name} as d1
- INNER JOIN #{dependencies[1].read_dataset_name} as d2
- ON d1.#{key1} = d2.#{key2}"
+ FROM #{dependencies[0].read_dataset_name} as d0
+ #{join_type.upcase} JOIN #{dependencies[1].read_dataset_name} as d1
+ ON d0.#{key1} = d1.#{key2}"
+
+ if has_multiple_keys?
+ join_keys = other_keys1.each_with_index.map { |k, idx| "d0.#{k} = d1.#{other_keys2[idx]}" }
+ query = "#{query}
+ AND #{join_keys.join("\nAND ")}"
+ end
+
+ query
+ end
+
+ def dataset_keys(idx:)
+ keys = send("select_keys#{idx + 1}")
+ keys = dependencies[idx].schema.keys if keys.blank?
+ keys
  end

  def execute_sql_join
  query = sql_join_query
+ logger.log(query)
  # TODO: work on a better way to interface this
  sql_adapter = data_node.send(:db_adapter)
  sql_adapter.client[query].to_a
@@ -105,6 +132,9 @@ module Dataflow
  end

  # for each datum in dataset1, find the corresponding datum in dataset2
+ select_keys_set1 = select_keys1.to_set
+ select_keys_set2 = select_keys2.to_set
+
  n1_records.map do |d1|
  join_value = d1.dig(*tokens_key1)
  next if join_value.nil?
@@ -122,6 +152,10 @@ module Dataflow
  # there might be the case that nothing was found after-all
  d2 ||= {}

+ # only keep the needed keys
+ d1 = d1.select { |k| select_keys_set1.include?(k) } if select_keys_set1.present?
+ d2 = d2.select { |k| select_keys_set2.include?(k) } if select_keys_set2.present?
+
  # prefix if needed
  d1 = Hash[d1.map { |k, v| ["#{prefix1}#{k}", v] }] if prefix1.present?
  d2 = Hash[d2.map { |k, v| ["#{prefix2}#{k}", v] }] if prefix2.present?
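Taken together, the join changes above add multi-key joins (`key1`/`key2` plus `other_keys1`/`other_keys2`) and per-side key selection (`select_keys1`/`select_keys2`) to both the SQL join and the in-memory ("software") join. A hedged configuration sketch, assuming the join node class lives at `Dataflow::Nodes::JoinNode`; the dataset and field names are purely illustrative:

```ruby
# Illustrative sketch of the new join options; users_node/orders_node and the
# field names are made up for the example.
join = Dataflow::Nodes::JoinNode.new(
  name:           'users_with_orders',
  dependency_ids: [users_node, orders_node],
  key1: 'user_id', key2: 'user_id',             # primary join key on each side
  other_keys1: ['year'], other_keys2: ['year'], # extra join keys, matched pairwise
  select_keys1: %w(user_id year name),          # keys kept from the first dataset
  select_keys2: %w(total)                       # keys kept from the second dataset
)
```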
@@ -8,8 +8,8 @@ module Dataflow
  ensure_data_node_exists
  ensure_dependencies exactly: 1

- def export(connection_opts: { db_backend: :csv }, keys: nil)
- super(connection_opts: connection_opts, keys: keys || self.keys)
+ def export
+ data_node.export(keys: keys)
  end

  private
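With this change the `export` method above no longer takes arguments; it simply forwards the node's configured `keys` to `DataNode#export` (shown earlier in this diff), which in turn builds an `Export::ToCsvNode` with those keys. A hedged usage sketch of the data-node side, with illustrative dataset, key and query names:

```ruby
# Illustrative only: export two columns of an existing data node to CSV.
users = Dataflow::Nodes::DataNode.find_by(name: 'users')
users.export(keys: %w(id email))                           # keys: [] exports the inferred schema
users.export(keys: %w(id email), where: { 'active' => true })  # where is serialized via to_json
```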
@@ -11,17 +11,26 @@ module Dataflow
  # @return [Hash] with one entry per 'column'/'field'. The values
  # contains information about the type and usage.
  def infer_schema(samples_count: 0, extended: false)
+ if db_backend == :postgresql
+ # Experimental
+ sch = db_adapter.client.schema(name).to_h
+ sch = sch.reject{ |k, v| k == :_id }.map { |k,v| [k, {type: v[:type].to_s}] }.to_h
+ self.inferred_schema = sch
+ save
+ return sch
+ end
+
  data_count = samples_count == 0 ? count : samples_count # invoked in the base class
  return {} if data_count == 0

  # find out how many batches are needed
  max_per_process = 1000
- max_per_process = limit_per_process if respond_to? :limit_per_process
+ max_per_process = limit_per_process if respond_to?(:limit_per_process) && limit_per_process > 0

  equal_split_per_process = (data_count / Parallel.processor_count.to_f).ceil
  count_per_process = [max_per_process, equal_split_per_process].min

- queries = ordered_system_id_queries(batch_size: count_per_process)
+ queries = ordered_system_id_queries(batch_size: count_per_process)[0...data_count]

  self.inferred_schema_at = Time.now
  self.inferred_schema_from = samples_count
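Under the experimental branch above, a PostgreSQL-backed node now reads column metadata straight from Sequel's `Database#schema` instead of sampling records. As a hedged illustration only (the column names are made up and the exact type strings depend on Sequel's type mapping), the returned hash is shaped roughly like:

```ruby
# Illustrative shape of infer_schema on a :postgresql-backed data node:
node.infer_schema
# => { name: { type: 'string' }, total: { type: 'integer' }, created_at: { type: 'datetime' } }
```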
@@ -1,4 +1,4 @@
  # frozen_string_literal: true
  module Dataflow
- VERSION = '0.9.2'
+ VERSION = '0.10.0'
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: dataflow-rb
  version: !ruby/object:Gem::Version
- version: 0.9.2
+ version: 0.10.0
  platform: ruby
  authors:
  - Eurico Doirado
  autorequire:
  bindir: exe
  cert_chain: []
- date: 2017-02-14 00:00:00.000000000 Z
+ date: 2017-03-03 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: bundler