dataflow-rb 0.11.0 → 0.12.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d2ac7fa848d641d2c1fd0856ff92bb81f17bb670
4
- data.tar.gz: 31eaf46d3785777d712739bc7f1a6d3ca328280e
3
+ metadata.gz: 10b9fa54ef77a143c26b7b7a9f86a5b72ea80e49
4
+ data.tar.gz: 1bc1e1246308da00b7113d058b2f6acb86fb0624
5
5
  SHA512:
6
- metadata.gz: bedf2430c023cef3e4408a7e213eee4f5cf206574f0a5264dbb2b7cad10defc85fff8ebdd4e859f7bce414ce3941bb2b7bbe30ffbf7c7e1194a3e0c716470047
7
- data.tar.gz: e2470aa7d5aba0da5c67822f1eb8426564134d2ea6346bb627326a174c9198bcb89c1b598aa4d0d8cfb910071079f92a04970c12f9b4f65b0bc4bca4e39be20c
6
+ metadata.gz: 4e425087b3b6e1610433f907df70c46b3661cdee5e8fb91f881b409afa21620fe3176399ee63a3b2783be8aa945263f82c0ef90b06099399c5287100b92a12e2
7
+ data.tar.gz: 2a892c113fbf8f667b72dcda841a9a0ddac2c1dfd797f51654f9f9c2195e45870d3b5e4c7b861a361861b21857b258e36f2f8a7d33917e596f4e801d7cab95ab
data/CHANGELOG.md CHANGED
@@ -1,5 +1,17 @@
1
1
  # Changelog
2
2
 
3
+ #### 0.12.0
4
+ - [4a510df] Add support for case insensitive regex matching on mysql
5
+ - [63b0771] Add logging to understand the current computation batch progress
6
+ - [df86157] Add support for pg array types
7
+ - [ce04cb3] Add the loose count extension for Sequel Postgres
8
+ - [3618060] Fix Sequel deprecation warnings
9
+ - [1cea32e] Skip logging during test sessions
10
+ - [fdddf23] Add support for regex matching
11
+ - [b4717c5] Move and refactor the mongo batch insert
12
+ - [e2897df] Use named indexes to reduce their name size
13
+ - [bc4f598] Revert to insert_ignore to support mysql adapter
14
+
3
15
  #### 0.11.0
4
16
  - [7c09e8a] Add data_node#drop_dataset! to completely drop the data
5
17
  - [ba0532f] Added upsert on psql adapter
@@ -153,7 +153,13 @@ module Dataflow
153
153
  end
154
154
  client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
155
155
  else
156
- save_many(records: records)
156
+ client[write_dataset_name].insert_many(records, ordered: false)
157
+ end
158
+ rescue Mongo::Error::BulkWriteError => e
159
+ dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
160
+ # don't raise if it is errors about duplicated keys
161
+ unless dup_key_error
162
+ raise e
157
163
  end
158
164
  end
159
165
 
@@ -256,6 +262,10 @@ module Dataflow
256
262
  sanitized_opts[k]['$gt'] = try_cast_value(k, value)
257
263
  when '>='
258
264
  sanitized_opts[k]['$gte'] = try_cast_value(k, value)
265
+ when '~*' # match regex /regex/i (case insensitive)
266
+ sanitized_opts[k]['$regex'] = /#{value}/i
267
+ when '~' # match regex /regex/ (case sensitive)
268
+ sanitized_opts[k]['$regex'] = /#{value}/
259
269
  end
260
270
  end
261
271
  else
@@ -275,14 +285,6 @@ module Dataflow
275
285
  value
276
286
  end
277
287
 
278
- def save_many(records:)
279
- client[write_dataset_name].insert_many(records, ordered: false)
280
- rescue Mongo::Error::BulkWriteError => e
281
- dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
282
- # don't raise if it is errors about duplicated keys
283
- raise e unless dup_key_error
284
- end
285
-
286
288
  # Required index format for mongodb:
287
289
  # { :key => { name: 1 }, :unique => true },
288
290
  def format_index(dataset_index)
@@ -291,7 +293,8 @@ module Dataflow
291
293
  index_key = {}
292
294
  keys = Array(dataset_index[:key])
293
295
  keys.each { |k| index_key[k] = 1 }
294
- index = { key: index_key }
296
+ name = keys.map { |k| k[0..1] }.push(SecureRandom.hex(4)).join('_')
297
+ index = { key: index_key, name: name }
295
298
  index[:unique] = true if dataset_index[:unique]
296
299
  index
297
300
  end
@@ -16,6 +16,14 @@ module Dataflow
16
16
  storage: 0
17
17
  }
18
18
  end
19
+
20
+ def regex_case_senstive_op
21
+ raise NotImplementedError, 'Mysql does not support a case sensitive regex matching operator'
22
+ end
23
+
24
+ def regex_case_insensitive_op
25
+ 'REGEXP'
26
+ end
19
27
  end
20
28
  end
21
29
  end
@@ -16,6 +16,14 @@ module Dataflow
16
16
  storage: 0
17
17
  }
18
18
  end
19
+
20
+ def regex_case_senstive_op
21
+ '~'
22
+ end
23
+
24
+ def regex_case_insensitive_op
25
+ '~*'
26
+ end
19
27
  end
20
28
  end
21
29
  end
@@ -39,7 +39,9 @@ module Dataflow
39
39
  try_create_db(uri, db_name, user, password) unless is_external_db
40
40
 
41
41
  # then, create the connection object
42
- @clients[connection_uri] ||= Sequel.connect("#{connection_uri}?encoding=utf8")
42
+ db = Sequel.connect("#{connection_uri}?encoding=utf8")
43
+ add_extensions(settings, db)
44
+ @clients[connection_uri] = db
43
45
  end
44
46
 
45
47
  # Used internally to try to create the DB automatically.
@@ -56,6 +58,15 @@ module Dataflow
56
58
  false
57
59
  end
58
60
 
61
+ # load Sequel extensions based on the type
62
+ def add_extensions(settings, db)
63
+ if settings.adapter_type == 'postgresql'
64
+ db.extension(:pg_array)
65
+ # db.extension(:pg_json)
66
+ db.extension(:pg_loose_count)
67
+ end
68
+ end
69
+
59
70
  # Force the clients to disconnect their connections.
60
71
  # Use before forking.
61
72
  def disconnect_clients
@@ -158,14 +169,15 @@ module Dataflow
158
169
  if replace_by.present?
159
170
  index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
160
171
 
161
- # update every field on conflict
162
- update_clause = columns.map { |k| [k, :"excluded__#{k}"] }.to_h
172
+ # On conflict update every field. On Postgresql we can refer
173
+ # to the "conflicting" rows using the "excluded_" prefix:
174
+ update_clause = columns.map { |k| [k, Sequel.qualify('excluded', k)] }.to_h
163
175
  dataset
164
176
  .insert_conflict(target: index_keys, update: update_clause)
165
177
  .import(columns, tabular_data)
166
178
  else
167
179
  # ignore insert conflicts
168
- dataset.insert_conflict.import(columns, tabular_data)
180
+ dataset.insert_ignore.import(columns, tabular_data)
169
181
  end
170
182
  end
171
183
 
@@ -272,13 +284,13 @@ module Dataflow
272
284
  when 'numeric'
273
285
  col_type = 'real'
274
286
  when 'array', 'hash'
275
- puts "Check type of field #{column} (given: #{type}). Not expecting to use JSON."
287
+ logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
276
288
  col_type = 'json'
277
289
  when 'date', 'time'
278
290
  # keep as-is
279
291
  col_type = type
280
292
  else
281
- puts "[Error] unexpected type '#{type}'. Keeping as-is."
293
+ logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
282
294
  col_type = type
283
295
  end
284
296
 
@@ -305,18 +317,16 @@ module Dataflow
305
317
  case operator
306
318
  when '!='
307
319
  if value.is_a? Array
308
- ["#{k} NOT IN ?", value]
320
+ Sequel.lit("#{k} NOT IN ?", value)
309
321
  else
310
- ["#{k} <> ?", value]
322
+ Sequel.lit("#{k} <> ?", value)
311
323
  end
312
- when '<'
313
- ["#{k} < ?", value]
314
- when '<='
315
- ["#{k} <= ?", value]
316
- when '>'
317
- ["#{k} > ?", value]
318
- when '>='
319
- ["#{k} >= ?", value]
324
+ when '<', '<=', '>', '>='
325
+ Sequel.lit("#{k} #{operator} ?", value)
326
+ when '~'
327
+ Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
328
+ when '~*'
329
+ Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
320
330
  end
321
331
  end
322
332
  else
@@ -11,7 +11,7 @@ module Dataflow
11
11
  end
12
12
 
13
13
  def log(str)
14
- return if ENV['RACK_ENV'] == 'test'
14
+ return if ENV['DATAFLOW_SKIP_LOGGING']
15
15
  now = DateTime.now.strftime('%y-%m-%d %H:%M:%S')
16
16
  message = "[#{now}][#{trace_id}] #{prefix} | #{str}"
17
17
  logger_impl = @@impl
@@ -333,11 +333,13 @@ module Dataflow
333
333
  count_per_process = [limit, equal_split_per_process].min if limit > 0
334
334
 
335
335
  queries = node.ordered_system_id_queries(batch_size: count_per_process)
336
+ queries_count = queries.count
336
337
 
337
338
  parallel_each(queries.each_with_index) do |query, idx|
338
339
  send_heartbeat
339
- progress = (idx / queries.count.to_f * 100).ceil
340
+ progress = (idx / queries_count.to_f * 100).ceil
340
341
  on_computing_progressed(pct_complete: progress)
342
+ logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
341
343
 
342
344
  records = node.all(where: query)
343
345
 
@@ -504,7 +506,7 @@ module Dataflow
504
506
 
505
507
  # set to true to debug code in the iteration
506
508
  is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
507
- if is_debugging_impl # || true
509
+ if is_debugging_impl || true
508
510
  itr.each do |*args|
509
511
  yield(*args)
510
512
  end
@@ -1,4 +1,4 @@
1
1
  # frozen_string_literal: true
2
2
  module Dataflow
3
- VERSION = '0.11.0'
3
+ VERSION = '0.12.0'
4
4
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: dataflow-rb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.11.0
4
+ version: 0.12.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Eurico Doirado
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-04-25 00:00:00.000000000 Z
11
+ date: 2017-05-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler