dataflow-rb 0.11.0 → 0.12.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +13 -10
- data/lib/dataflow/adapters/mysql_adapter.rb +8 -0
- data/lib/dataflow/adapters/psql_adapter.rb +8 -0
- data/lib/dataflow/adapters/sql_adapter.rb +26 -16
- data/lib/dataflow/logger.rb +1 -1
- data/lib/dataflow/nodes/compute_node.rb +4 -2
- data/lib/dataflow/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 10b9fa54ef77a143c26b7b7a9f86a5b72ea80e49
|
4
|
+
data.tar.gz: 1bc1e1246308da00b7113d058b2f6acb86fb0624
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4e425087b3b6e1610433f907df70c46b3661cdee5e8fb91f881b409afa21620fe3176399ee63a3b2783be8aa945263f82c0ef90b06099399c5287100b92a12e2
|
7
|
+
data.tar.gz: 2a892c113fbf8f667b72dcda841a9a0ddac2c1dfd797f51654f9f9c2195e45870d3b5e4c7b861a361861b21857b258e36f2f8a7d33917e596f4e801d7cab95ab
|
data/CHANGELOG.md
CHANGED
@@ -1,5 +1,17 @@
|
|
1
1
|
# Changelog
|
2
2
|
|
3
|
+
#### 0.12.0
|
4
|
+
- [4a510df] Add support for case insensitive regex matching on mysql
|
5
|
+
- [63b0771] Add logging to understand the current computation batch progress
|
6
|
+
- [df86157] Add support for pg array types
|
7
|
+
- [ce04cb3] Add the loose count extension for Sequel Postgres
|
8
|
+
- [3618060] Fix Sequel deprecation warnings
|
9
|
+
- [1cea32e] Skip logging during test sessions
|
10
|
+
- [fdddf23] Add support for regex matching
|
11
|
+
- [b4717c5] Move and refactor the mongo batch insert
|
12
|
+
- [e2897df] Use named indexes to reduce their name size
|
13
|
+
- [bc4f598] Revert to insert_ignore to support mysql adapter
|
14
|
+
|
3
15
|
#### 0.11.0
|
4
16
|
- [7c09e8a] Add data_node#drop_dataset! to completely drop the data
|
5
17
|
- [ba0532f] Added upsert on psql adapter
|
@@ -153,7 +153,13 @@ module Dataflow
|
|
153
153
|
end
|
154
154
|
client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
|
155
155
|
else
|
156
|
-
|
156
|
+
client[write_dataset_name].insert_many(records, ordered: false)
|
157
|
+
end
|
158
|
+
rescue Mongo::Error::BulkWriteError => e
|
159
|
+
dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
|
160
|
+
# don't raise if all of the write errors are duplicate key errors (code 11000)
|
161
|
+
unless dup_key_error
|
162
|
+
raise e
|
157
163
|
end
|
158
164
|
end
|
159
165
|
|
@@ -256,6 +262,10 @@ module Dataflow
|
|
256
262
|
sanitized_opts[k]['$gt'] = try_cast_value(k, value)
|
257
263
|
when '>='
|
258
264
|
sanitized_opts[k]['$gte'] = try_cast_value(k, value)
|
265
|
+
when '~*' # match regex /regex/i (case insensitive)
|
266
|
+
sanitized_opts[k]['$regex'] = /#{value}/i
|
267
|
+
when '~' # match regex /regex/ (case sensitive)
|
268
|
+
sanitized_opts[k]['$regex'] = /#{value}/
|
259
269
|
end
|
260
270
|
end
|
261
271
|
else
|
@@ -275,14 +285,6 @@ module Dataflow
|
|
275
285
|
value
|
276
286
|
end
|
277
287
|
|
278
|
-
def save_many(records:)
|
279
|
-
client[write_dataset_name].insert_many(records, ordered: false)
|
280
|
-
rescue Mongo::Error::BulkWriteError => e
|
281
|
-
dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
|
282
|
-
# don't raise if it is errors about duplicated keys
|
283
|
-
raise e unless dup_key_error
|
284
|
-
end
|
285
|
-
|
286
288
|
# Required index format for mongodb:
|
287
289
|
# { :key => { name: 1 }, :unique => true },
|
288
290
|
def format_index(dataset_index)
|
@@ -291,7 +293,8 @@ module Dataflow
|
|
291
293
|
index_key = {}
|
292
294
|
keys = Array(dataset_index[:key])
|
293
295
|
keys.each { |k| index_key[k] = 1 }
|
294
|
-
|
296
|
+
name = keys.map { |k| k[0..1] }.push(SecureRandom.hex(4)).join('_')
|
297
|
+
index = { key: index_key, name: name }
|
295
298
|
index[:unique] = true if dataset_index[:unique]
|
296
299
|
index
|
297
300
|
end
|
@@ -16,6 +16,14 @@ module Dataflow
|
|
16
16
|
storage: 0
|
17
17
|
}
|
18
18
|
end
|
19
|
+
|
20
|
+
def regex_case_senstive_op
|
21
|
+
raise NotImplementedError, 'Mysql does not support a case sensitive regex matching operator'
|
22
|
+
end
|
23
|
+
|
24
|
+
def regex_case_insensitive_op
|
25
|
+
'REGEXP'
|
26
|
+
end
|
19
27
|
end
|
20
28
|
end
|
21
29
|
end
|
@@ -39,7 +39,9 @@ module Dataflow
|
|
39
39
|
try_create_db(uri, db_name, user, password) unless is_external_db
|
40
40
|
|
41
41
|
# then, create the connection object
|
42
|
-
|
42
|
+
db = Sequel.connect("#{connection_uri}?encoding=utf8")
|
43
|
+
add_extensions(settings, db)
|
44
|
+
@clients[connection_uri] = db
|
43
45
|
end
|
44
46
|
|
45
47
|
# Used internally to try to create the DB automatically.
|
@@ -56,6 +58,15 @@ module Dataflow
|
|
56
58
|
false
|
57
59
|
end
|
58
60
|
|
61
|
+
# load Sequel extensions based on the type
|
62
|
+
def add_extensions(settings, db)
|
63
|
+
if settings.adapter_type == 'postgresql'
|
64
|
+
db.extension(:pg_array)
|
65
|
+
# db.extension(:pg_json)
|
66
|
+
db.extension(:pg_loose_count)
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
59
70
|
# Force the clients to disconnect their connections.
|
60
71
|
# Use before forking.
|
61
72
|
def disconnect_clients
|
@@ -158,14 +169,15 @@ module Dataflow
|
|
158
169
|
if replace_by.present?
|
159
170
|
index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
|
160
171
|
|
161
|
-
# update every field
|
162
|
-
|
172
|
+
# On conflict update every field. On Postgresql we can refer
|
173
|
+
# to the "conflicting" rows through the "excluded" table qualifier:
|
174
|
+
update_clause = columns.map { |k| [k, Sequel.qualify('excluded', k)] }.to_h
|
163
175
|
dataset
|
164
176
|
.insert_conflict(target: index_keys, update: update_clause)
|
165
177
|
.import(columns, tabular_data)
|
166
178
|
else
|
167
179
|
# ignore insert conflicts
|
168
|
-
dataset.
|
180
|
+
dataset.insert_ignore.import(columns, tabular_data)
|
169
181
|
end
|
170
182
|
end
|
171
183
|
|
@@ -272,13 +284,13 @@ module Dataflow
|
|
272
284
|
when 'numeric'
|
273
285
|
col_type = 'real'
|
274
286
|
when 'array', 'hash'
|
275
|
-
|
287
|
+
logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
|
276
288
|
col_type = 'json'
|
277
289
|
when 'date', 'time'
|
278
290
|
# keep as-is
|
279
291
|
col_type = type
|
280
292
|
else
|
281
|
-
|
293
|
+
logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
|
282
294
|
col_type = type
|
283
295
|
end
|
284
296
|
|
@@ -305,18 +317,16 @@ module Dataflow
|
|
305
317
|
case operator
|
306
318
|
when '!='
|
307
319
|
if value.is_a? Array
|
308
|
-
|
320
|
+
Sequel.lit("#{k} NOT IN ?", value)
|
309
321
|
else
|
310
|
-
|
322
|
+
Sequel.lit("#{k} <> ?", value)
|
311
323
|
end
|
312
|
-
when '<'
|
313
|
-
|
314
|
-
when '
|
315
|
-
|
316
|
-
when '
|
317
|
-
|
318
|
-
when '>='
|
319
|
-
["#{k} >= ?", value]
|
324
|
+
when '<', '<=', '>', '>='
|
325
|
+
Sequel.lit("#{k} #{operator} ?", value)
|
326
|
+
when '~'
|
327
|
+
Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
|
328
|
+
when '~*'
|
329
|
+
Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
|
320
330
|
end
|
321
331
|
end
|
322
332
|
else
|
data/lib/dataflow/logger.rb
CHANGED
@@ -333,11 +333,13 @@ module Dataflow
|
|
333
333
|
count_per_process = [limit, equal_split_per_process].min if limit > 0
|
334
334
|
|
335
335
|
queries = node.ordered_system_id_queries(batch_size: count_per_process)
|
336
|
+
queries_count = queries.count
|
336
337
|
|
337
338
|
parallel_each(queries.each_with_index) do |query, idx|
|
338
339
|
send_heartbeat
|
339
|
-
progress = (idx /
|
340
|
+
progress = (idx / queries_count.to_f * 100).ceil
|
340
341
|
on_computing_progressed(pct_complete: progress)
|
342
|
+
logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
|
341
343
|
|
342
344
|
records = node.all(where: query)
|
343
345
|
|
@@ -504,7 +506,7 @@ module Dataflow
|
|
504
506
|
|
505
507
|
# set to true to debug code in the iteration
|
506
508
|
is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
|
507
|
-
if is_debugging_impl
|
509
|
+
if is_debugging_impl || true
|
508
510
|
itr.each do |*args|
|
509
511
|
yield(*args)
|
510
512
|
end
|
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dataflow-rb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.11.0
|
4
|
+
version: 0.12.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Eurico Doirado
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|