dataflow-rb 0.11.0 → 0.12.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d2ac7fa848d641d2c1fd0856ff92bb81f17bb670
4
- data.tar.gz: 31eaf46d3785777d712739bc7f1a6d3ca328280e
3
+ metadata.gz: 10b9fa54ef77a143c26b7b7a9f86a5b72ea80e49
4
+ data.tar.gz: 1bc1e1246308da00b7113d058b2f6acb86fb0624
5
5
  SHA512:
6
- metadata.gz: bedf2430c023cef3e4408a7e213eee4f5cf206574f0a5264dbb2b7cad10defc85fff8ebdd4e859f7bce414ce3941bb2b7bbe30ffbf7c7e1194a3e0c716470047
7
- data.tar.gz: e2470aa7d5aba0da5c67822f1eb8426564134d2ea6346bb627326a174c9198bcb89c1b598aa4d0d8cfb910071079f92a04970c12f9b4f65b0bc4bca4e39be20c
6
+ metadata.gz: 4e425087b3b6e1610433f907df70c46b3661cdee5e8fb91f881b409afa21620fe3176399ee63a3b2783be8aa945263f82c0ef90b06099399c5287100b92a12e2
7
+ data.tar.gz: 2a892c113fbf8f667b72dcda841a9a0ddac2c1dfd797f51654f9f9c2195e45870d3b5e4c7b861a361861b21857b258e36f2f8a7d33917e596f4e801d7cab95ab
data/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ #### 0.12.0
4
+ - [4a510df] Add support for case insensitive regex matching on mysql
5
+ - [63b0771] Add logging to understand the current computation batch progress
6
+ - [df86157] Add support for pg array types
7
+ - [ce04cb3] Add the loose count extension for Sequel Postgres
8
+ - [3618060] Fix Sequel deprecation warnings
9
+ - [1cea32e] Skip logging during test sessions
10
+ - [fdddf23] Add support for regex matching
11
+ - [b4717c5] Move and refactor the mongo batch insert
12
+ - [e2897df] Use named indexes to reduce their name size
13
+ - [bc4f598] Revert to insert_ignore to support mysql adapter
14
+
3
15
  #### 0.11.0
4
16
  - [7c09e8a] Add data_node#drop_dataset! to completely drop the data
5
17
  - [ba0532f] Added upsert on psql adapter
@@ -153,7 +153,13 @@ module Dataflow
153
153
  end
154
154
  client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
155
155
  else
156
- save_many(records: records)
156
+ client[write_dataset_name].insert_many(records, ordered: false)
157
+ end
158
+ rescue Mongo::Error::BulkWriteError => e
159
+ dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
160
+ # don't raise if it is errors about duplicated keys
161
+ unless dup_key_error
162
+ raise e
157
163
  end
158
164
  end
159
165
 
@@ -256,6 +262,10 @@ module Dataflow
256
262
  sanitized_opts[k]['$gt'] = try_cast_value(k, value)
257
263
  when '>='
258
264
  sanitized_opts[k]['$gte'] = try_cast_value(k, value)
265
+ when '~*' # match regex /regex/i (case insensitive)
266
+ sanitized_opts[k]['$regex'] = /#{value}/i
267
+ when '~' # match regex /regex/ (case sensitive)
268
+ sanitized_opts[k]['$regex'] = /#{value}/
259
269
  end
260
270
  end
261
271
  else
@@ -275,14 +285,6 @@ module Dataflow
275
285
  value
276
286
  end
277
287
 
278
- def save_many(records:)
279
- client[write_dataset_name].insert_many(records, ordered: false)
280
- rescue Mongo::Error::BulkWriteError => e
281
- dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
282
- # don't raise if it is errors about duplicated keys
283
- raise e unless dup_key_error
284
- end
285
-
286
288
  # Required index format for mongodb:
287
289
  # { :key => { name: 1 }, :unique => true },
288
290
  def format_index(dataset_index)
@@ -291,7 +293,8 @@ module Dataflow
291
293
  index_key = {}
292
294
  keys = Array(dataset_index[:key])
293
295
  keys.each { |k| index_key[k] = 1 }
294
- index = { key: index_key }
296
+ name = keys.map { |k| k[0..1] }.push(SecureRandom.hex(4)).join('_')
297
+ index = { key: index_key, name: name }
295
298
  index[:unique] = true if dataset_index[:unique]
296
299
  index
297
300
  end
@@ -16,6 +16,14 @@ module Dataflow
16
16
  storage: 0
17
17
  }
18
18
  end
19
+
20
+ def regex_case_senstive_op
21
+ raise NotImplementedError, 'Mysql does not support a case sensitive regex matching operator'
22
+ end
23
+
24
+ def regex_case_insensitive_op
25
+ 'REGEXP'
26
+ end
19
27
  end
20
28
  end
21
29
  end
@@ -16,6 +16,14 @@ module Dataflow
16
16
  storage: 0
17
17
  }
18
18
  end
19
+
20
+ def regex_case_senstive_op
21
+ '~'
22
+ end
23
+
24
+ def regex_case_insensitive_op
25
+ '~*'
26
+ end
19
27
  end
20
28
  end
21
29
  end
@@ -39,7 +39,9 @@ module Dataflow
39
39
  try_create_db(uri, db_name, user, password) unless is_external_db
40
40
 
41
41
  # then, create the connection object
42
- @clients[connection_uri] ||= Sequel.connect("#{connection_uri}?encoding=utf8")
42
+ db = Sequel.connect("#{connection_uri}?encoding=utf8")
43
+ add_extensions(settings, db)
44
+ @clients[connection_uri] = db
43
45
  end
44
46
 
45
47
  # Used internally to try to create the DB automatically.
@@ -56,6 +58,15 @@ module Dataflow
56
58
  false
57
59
  end
58
60
 
61
+ # load Sequel extensions based on the type
62
+ def add_extensions(settings, db)
63
+ if settings.adapter_type == 'postgresql'
64
+ db.extension(:pg_array)
65
+ # db.extension(:pg_json)
66
+ db.extension(:pg_loose_count)
67
+ end
68
+ end
69
+
59
70
  # Force the clients to disconnect their connections.
60
71
  # Use before forking.
61
72
  def disconnect_clients
@@ -158,14 +169,15 @@ module Dataflow
158
169
  if replace_by.present?
159
170
  index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
160
171
 
161
- # update every field on conflict
162
- update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
172
+ # On conflict update every field. On Postgresql we can refer
173
+ # to the "conflicting" rows using the "excluded_" prefix:
174
+ update_clause = columns.map { |k| [k, Sequel.qualify('excluded', k)] }.to_h
163
175
  dataset
164
176
  .insert_conflict(target: index_keys, update: update_clause)
165
177
  .import(columns, tabular_data)
166
178
  else
167
179
  # ignore insert conflicts
168
- dataset.insert_conflict.import(columns, tabular_data)
180
+ dataset.insert_ignore.import(columns, tabular_data)
169
181
  end
170
182
  end
171
183
 
@@ -272,13 +284,13 @@ module Dataflow
272
284
  when 'numeric'
273
285
  col_type = 'real'
274
286
  when 'array', 'hash'
275
- puts "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
287
+ logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
276
288
  col_type = 'json'
277
289
  when 'date', 'time'
278
290
  # keep as-is
279
291
  col_type = type
280
292
  else
281
- puts "[Error] unexpected type '#{type}'. Keeping as-is."
293
+ logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
282
294
  col_type = type
283
295
  end
284
296
 
@@ -305,18 +317,16 @@ module Dataflow
305
317
  case operator
306
318
  when '!='
307
319
  if value.is_a? Array
308
- ["#{k} NOT IN ?", value]
320
+ Sequel.lit("#{k} NOT IN ?", value)
309
321
  else
310
- ["#{k} <> ?", value]
322
+ Sequel.lit("#{k} <> ?", value)
311
323
  end
312
- when '<'
313
- ["#{k} < ?", value]
314
- when '<='
315
- ["#{k} <= ?", value]
316
- when '>'
317
- ["#{k} > ?", value]
318
- when '>='
319
- ["#{k} >= ?", value]
324
+ when '<', '<=', '>', '>='
325
+ Sequel.lit("#{k} #{operator} ?", value)
326
+ when '~'
327
+ Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
328
+ when '~*'
329
+ Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
320
330
  end
321
331
  end
322
332
  else
@@ -11,7 +11,7 @@ module Dataflow
11
11
  end
12
12
 
13
13
  def log(str)
14
- return if ENV['RACK_ENV'] == 'test'
14
+ return if ENV['DATAFLOW_SKIP_LOGGING']
15
15
  now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
16
16
  message = "[#{now}][#{trace_id}] #{prefix} | #{str}"
17
17
  logger_impl = @@impl
@@ -333,11 +333,13 @@ module Dataflow
333
333
  count_per_process = [limit, equal_split_per_process].min if limit > 0
334
334
 
335
335
  queries = node.ordered_system_id_queries(batch_size: count_per_process)
336
+ queries_count = queries.count
336
337
 
337
338
  parallel_each(queries.each_with_index) do |query, idx|
338
339
  send_heartbeat
339
- progress = (idx / queries.count.to_f * 100).ceil
340
+ progress = (idx / queries_count.to_f * 100).ceil
340
341
  on_computing_progressed(pct_complete: progress)
342
+ logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
341
343
 
342
344
  records = node.all(where: query)
343
345
 
@@ -504,7 +506,7 @@ module Dataflow
504
506
 
505
507
  # set to true to debug code in the iteration
506
508
  is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
507
- if is_debugging_impl # || true
509
+ if is_debugging_impl || true
508
510
  itr.each do |*args|
509
511
  yield(*args)
510
512
  end
@@ -1,4 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
  module Dataflow
3
- VERSION = '0.11.0'
3
+ VERSION = '0.12.0'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataflow-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eurico Doirado
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-04-25 00:00:00.000000000 Z
11
+ date: 2017-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler