dataflow-rb 0.11.0 → 0.12.0
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/lib/dataflow/adapters/mongo_db_adapter.rb +13 -10
- data/lib/dataflow/adapters/mysql_adapter.rb +8 -0
- data/lib/dataflow/adapters/psql_adapter.rb +8 -0
- data/lib/dataflow/adapters/sql_adapter.rb +26 -16
- data/lib/dataflow/logger.rb +1 -1
- data/lib/dataflow/nodes/compute_node.rb +4 -2
- data/lib/dataflow/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 10b9fa54ef77a143c26b7b7a9f86a5b72ea80e49
+  data.tar.gz: 1bc1e1246308da00b7113d058b2f6acb86fb0624
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 4e425087b3b6e1610433f907df70c46b3661cdee5e8fb91f881b409afa21620fe3176399ee63a3b2783be8aa945263f82c0ef90b06099399c5287100b92a12e2
+  data.tar.gz: 2a892c113fbf8f667b72dcda841a9a0ddac2c1dfd797f51654f9f9c2195e45870d3b5e4c7b861a361861b21857b258e36f2f8a7d33917e596f4e801d7cab95ab
data/CHANGELOG.md
CHANGED

@@ -1,5 +1,17 @@
 # Changelog
 
+#### 0.12.0
+- [4a510df] Add support for case insensitive regex matching on mysql
+- [63b0771] Add logging to understand the current computation batch progress
+- [df86157] Add support for pg array types
+- [ce04cb3] Add the loose count extension for Sequel Postgres
+- [3618060] Fix Sequel deprecation warnings
+- [1cea32e] Skip logging during test sessions
+- [fdddf23] Add support for regex matching
+- [b4717c5] Move and refactor the mongo batch insert
+- [e2897df] Use named indexes to reduce their name size
+- [bc4f598] Revert to insert_ignore to support the mysql adapter
+
 #### 0.11.0
 - [7c09e8a] Add data_node#drop_dataset! to completely drop the data
 - [ba0532f] Added upsert on psql adapter
data/lib/dataflow/adapters/mongo_db_adapter.rb
CHANGED

@@ -153,7 +153,13 @@ module Dataflow
         end
         client[write_dataset_name].bulk_write(bulk_ops, ordered: false)
       else
-        save_many(records: records)
+        client[write_dataset_name].insert_many(records, ordered: false)
+      end
+    rescue Mongo::Error::BulkWriteError => e
+      dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
+      # don't raise if it is errors about duplicated keys
+      unless dup_key_error
+        raise e
       end
     end
 
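The rescue above makes batch inserts tolerant of duplicate keys: with `ordered: false`, Mongo keeps inserting past failing documents, and the handler re-raises only when something other than a duplicate key (error code 11000) went wrong. A minimal standalone sketch of the same pattern (connection URI, collection name, and records are illustrative, not from the gem):

```ruby
require 'mongo'

client = Mongo::Client.new('mongodb://localhost:27017/dataflow_example')
records = [{ _id: 1 }, { _id: 1 }, { _id: 2 }] # second document is a duplicate

begin
  # ordered: false => one failing document does not abort the batch
  client[:users].insert_many(records, ordered: false)
rescue Mongo::Error::BulkWriteError => e
  # 11000 is MongoDB's duplicate-key error code; anything else is re-raised
  raise e unless e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
end
```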
@@ -256,6 +262,10 @@ module Dataflow
           sanitized_opts[k]['$gt'] = try_cast_value(k, value)
         when '>='
           sanitized_opts[k]['$gte'] = try_cast_value(k, value)
+        when '~*' # match regex /regex/i (case insensitive)
+          sanitized_opts[k]['$regex'] = /#{value}/i
+        when '~' # match regex /regex/ (case sensitive)
+          sanitized_opts[k]['$regex'] = /#{value}/
         end
       end
     else
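With these two operators, a regex condition in the adapter's query hash is rewritten into a Mongo `$regex` filter. A sketch of the mapping, mirroring the `case` above (field name and pattern are made up):

```ruby
# What a caller might pass:
where = { 'email' => { '~*' => 'gmail\.com$' } }

# What the sanitization above produces for Mongo:
value = where['email']['~*']
mongo_filter = { 'email' => { '$regex' => /#{value}/i } }
# => { 'email' => { '$regex' => /gmail\.com$/i } }
```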
@@ -275,14 +285,6 @@ module Dataflow
         value
       end
 
-      def save_many(records:)
-        client[write_dataset_name].insert_many(records, ordered: false)
-      rescue Mongo::Error::BulkWriteError => e
-        dup_key_error = e.result['writeErrors'].all? { |x| x['code'] == 11_000 }
-        # don't raise if it is errors about duplicated keys
-        raise e unless dup_key_error
-      end
-
       # Required index format for mongodb:
       # { :key => { name: 1 }, :unique => true },
       def format_index(dataset_index)
@@ -291,7 +293,8 @@ module Dataflow
       index_key = {}
       keys = Array(dataset_index[:key])
       keys.each { |k| index_key[k] = 1 }
-      index = { key: index_key }
+      name = keys.map { |k| k[0..1] }.push(SecureRandom.hex(4)).join('_')
+      index = { key: index_key, name: name }
       index[:unique] = true if dataset_index[:unique]
       index
     end
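The generated `name` keeps index names short: two characters per indexed key plus a random hex suffix, instead of MongoDB's default of concatenating every full field name. A worked example (the key names are illustrative):

```ruby
require 'securerandom'

keys = %i[first_name last_name]
name = keys.map { |k| k[0..1] }.push(SecureRandom.hex(4)).join('_')
# => e.g. "fi_la_9f8a2b1c"

# format_index would then return something like:
# { key: { first_name: 1, last_name: 1 }, name: "fi_la_9f8a2b1c", unique: true }
```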
data/lib/dataflow/adapters/mysql_adapter.rb
CHANGED

@@ -16,6 +16,14 @@ module Dataflow
           storage: 0
         }
       end
+
+      def regex_case_senstive_op
+        raise NotImplementedError, 'Mysql does not support a case sensitive regex matching operator'
+      end
+
+      def regex_case_insensitive_op
+        'REGEXP'
+      end
     end
   end
 end
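MySQL's `REGEXP` operator is case-insensitive under the default collations, so it can stand in for `~*` but not for `~`; raising `NotImplementedError` is safer than silently returning case-insensitive matches. A sketch of what the insensitive path generates through Sequel (the `DB` handle and table are illustrative):

```ruby
DB[:users].where(Sequel.lit('name REGEXP ?', 'john')).sql
# => SELECT * FROM `users` WHERE (name REGEXP 'john')
```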
data/lib/dataflow/adapters/sql_adapter.rb
CHANGED

@@ -39,7 +39,9 @@ module Dataflow
       try_create_db(uri, db_name, user, password) unless is_external_db
 
       # then, create the connection object
-      @clients[connection_uri] = Sequel.connect("#{connection_uri}?encoding=utf8")
+      db = Sequel.connect("#{connection_uri}?encoding=utf8")
+      add_extensions(settings, db)
+      @clients[connection_uri] = db
     end
 
     # Used internally to try to create the DB automatically.
@@ -56,6 +58,15 @@ module Dataflow
         false
       end
 
+      # load Sequel extensions based on the type
+      def add_extensions(settings, db)
+        if settings.adapter_type == 'postgresql'
+          db.extension(:pg_array)
+          # db.extension(:pg_json)
+          db.extension(:pg_loose_count)
+        end
+      end
+
       # Force the clients to disconnect their connections.
       # Use before forking.
       def disconnect_clients
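`pg_loose_count` adds `Database#loose_count`, which reads the planner's `pg_class.reltuples` estimate instead of scanning the table, and `pg_array` lets PostgreSQL array columns round-trip as Ruby arrays. A usage sketch (connection URI, table, and column are illustrative):

```ruby
require 'sequel'

DB = Sequel.connect('postgres://localhost/dataflow_example')
DB.extension(:pg_array)
DB.extension(:pg_loose_count)

DB.loose_count(:events)                                   # fast, approximate row count
DB[:events].insert(tags: Sequel.pg_array(%w[a b], :text)) # array column round-trip
```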
@@ -158,14 +169,15 @@ module Dataflow
       if replace_by.present?
         index_keys = Array(replace_by).map { |c| c.to_sym}.uniq
 
-        # update every field
-
+        # On conflict update every field. On Postgresql we can refer
+        # to the "conflicting" rows using the "excluded_" prefix:
+        update_clause = columns.map { |k| [k, Sequel.qualify('excluded', k)] }.to_h
         dataset
           .insert_conflict(target: index_keys, update: update_clause)
           .import(columns, tabular_data)
       else
         # ignore insert conflicts
-        dataset.
+        dataset.insert_ignore.import(columns, tabular_data)
       end
     end
 
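On PostgreSQL, `insert_conflict` with an `update` clause compiles to `INSERT ... ON CONFLICT ... DO UPDATE`, where `excluded` refers to the row that failed to insert. The same construction shown standalone (table and columns are illustrative):

```ruby
columns = %i[email name]
update_clause = columns.map { |k| [k, Sequel.qualify('excluded', k)] }.to_h

DB[:people]
  .insert_conflict(target: :email, update: update_clause)
  .import(columns, [['ann@example.com', 'Ann']])
# INSERT INTO "people" ("email", "name") VALUES ('ann@example.com', 'Ann')
# ON CONFLICT ("email") DO UPDATE
#   SET "email" = excluded."email", "name" = excluded."name"
```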
@@ -272,13 +284,13 @@ module Dataflow
       when 'numeric'
         col_type = 'real'
       when 'array', 'hash'
-
+        logger.log("Check type of field #{column} (given: #{type}). Not expecting to use JSON.")
         col_type = 'json'
       when 'date', 'time'
         # keep as-is
         col_type = type
       else
-
+        logger.log("[Error] unexpected type '#{type}'. Keeping as-is.")
         col_type = type
       end
 
@@ -305,18 +317,16 @@ module Dataflow
           case operator
           when '!='
             if value.is_a? Array
-              ["#{k} NOT IN ?", value]
+              Sequel.lit("#{k} NOT IN ?", value)
             else
-              ["#{k} <> ?", value]
+              Sequel.lit("#{k} <> ?", value)
             end
-          when '<'
-            ["#{k} < ?", value]
-          when '<='
-            ["#{k} <= ?", value]
-          when '>'
-            ["#{k} > ?", value]
-          when '>='
-            ["#{k} >= ?", value]
+          when '<', '<=', '>', '>='
+            Sequel.lit("#{k} #{operator} ?", value)
+          when '~'
+            Sequel.lit("#{k} #{regex_case_senstive_op} ?", value)
+          when '~*'
+            Sequel.lit("#{k} #{regex_case_insensitive_op} ?", value)
           end
         end
       else
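Wrapping the fragments in `Sequel.lit` replaces the bare `["sql ?", value]` array-filter form, which Sequel deprecated; both produce a placeholder literal whose value is escaped safely. For example (`DB` and the table are illustrative):

```ruby
DB[:items].where(Sequel.lit('price >= ?', 10)).sql
# => SELECT * FROM "items" WHERE (price >= 10)
```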
data/lib/dataflow/nodes/compute_node.rb
CHANGED

@@ -333,11 +333,13 @@ module Dataflow
       count_per_process = [limit, equal_split_per_process].min if limit > 0
 
       queries = node.ordered_system_id_queries(batch_size: count_per_process)
+      queries_count = queries.count
 
       parallel_each(queries.each_with_index) do |query, idx|
         send_heartbeat
-        progress = (idx / queries.count.to_f * 100).ceil
+        progress = (idx / queries_count.to_f * 100).ceil
         on_computing_progressed(pct_complete: progress)
+        logger.log("Executing #{name} [Batch #{idx}/#{queries_count}]")
 
         records = node.all(where: query)
 
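Caching `queries.count` avoids recounting inside the loop, and the new log line makes batch progress visible. The arithmetic, worked through (the counts are illustrative):

```ruby
queries_count = 40
idx = 10
(idx / queries_count.to_f * 100).ceil # => 25 (percent complete)
# logged as: "Executing <node name> [Batch 10/40]"
```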
@@ -504,7 +506,7 @@ module Dataflow
 
       # set to true to debug code in the iteration
       is_debugging_impl = (ENV['RACK_ENV'] == 'test' && ENV['DEBUG'])
-      if is_debugging_impl
+      if is_debugging_impl || true
         itr.each do |*args|
           yield(*args)
         end
data/lib/dataflow/version.rb
CHANGED
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dataflow-rb
 version: !ruby/object:Gem::Version
-  version: 0.11.0
+  version: 0.12.0
 platform: ruby
 authors:
 - Eurico Doirado
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2017-
+date: 2017-05-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler