idata 0.1.28 → 0.1.29
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +15 -3
- data/bin/ivalidate +38 -10
- data/lib/idata/version.rb +1 -1
- data/sample.sh +1 -2
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA1:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: d37af89a0b0962c026e20d63637f3a750030429f
|
|
4
|
+
data.tar.gz: a67cdfd07638f57788c837667c01e4f259867fa6
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 58eec49a072286d87b643f58be7c456e0d9140c7341b0816fc46b790241a9ef07fff0c2e86f72b2eda17f3c76defebb3259bd0fc50f912a9e90ff0fda2cd0511
|
|
7
|
+
data.tar.gz: bd83cd1f605863ed36bdf6c71236233bed41a0830013683b6906c9834fe23b12d482807eef38d318f6dec364ab14d55d8017734555ec01a63b6faa0270d67f10
|
data/README.md
CHANGED
|
@@ -1,14 +1,26 @@
|
|
|
1
|
-
#
|
|
1
|
+
# Overview
|
|
2
2
|
We provide some useful utilities for validating data in a PostgreSQL data table
|
|
3
3
|
These utilities can be used as simple terminal commands and can be installed by:
|
|
4
4
|
|
|
5
5
|
gem install idata
|
|
6
6
|
|
|
7
|
+
idata comes along with the following commands:
|
|
8
|
+
* iload
|
|
9
|
+
* ivalidate
|
|
10
|
+
* ipatch
|
|
11
|
+
* ieval
|
|
12
|
+
* iexport
|
|
13
|
+
* imerge
|
|
14
|
+
* isanitize
|
|
15
|
+
|
|
16
|
+
Run a command with --help switch for the details
|
|
17
|
+
|
|
7
18
|
Prerequisites:
|
|
8
19
|
* PostgreSQL 9.0 or above
|
|
9
20
|
* Ruby 2.0 or above
|
|
21
|
+
* An auto ID field is required for data table to be validated using ivalidate
|
|
10
22
|
|
|
11
|
-
#
|
|
23
|
+
# Usage
|
|
12
24
|
Suppose we have an `items` table, and we want to validate its records against certain criteria like:
|
|
13
25
|
|
|
14
26
|
* `item_id` must not be null
|
|
@@ -65,7 +77,7 @@ For example, the following two checks are equivalent:
|
|
|
65
77
|
Note: run `ivalidate --help` to see the full list of supported switches
|
|
66
78
|
|
|
67
79
|
|
|
68
|
-
#
|
|
80
|
+
# Put it all together
|
|
69
81
|
You can put several `ivalidate` commands (for several data tables) in one single bash/sh file.
|
|
70
82
|
Besides `ivalidate`, we also support some other utilities to:
|
|
71
83
|
+ Load data from text files to SQL tables
|
data/bin/ivalidate
CHANGED
|
@@ -73,6 +73,10 @@ parser = OptionParser.new("", 24) do |opts|
|
|
|
73
73
|
opts.on("--pretty", "Use more human-readable error message") do |v|
|
|
74
74
|
$options[:pretty] = v
|
|
75
75
|
end
|
|
76
|
+
|
|
77
|
+
opts.on("--case-insensitive", "Use more human-readable error message") do |v|
|
|
78
|
+
$options[:case_insensitive] = v
|
|
79
|
+
end
|
|
76
80
|
|
|
77
81
|
opts.on("-h", "--host HOST", "PostgreSQL host") do |v|
|
|
78
82
|
$options[:host] = v
|
|
@@ -185,6 +189,15 @@ class String
|
|
|
185
189
|
|
|
186
190
|
"(#{sql})"
|
|
187
191
|
end
|
|
192
|
+
|
|
193
|
+
def lower
|
|
194
|
+
a = self.split(/\s*,\s*/)
|
|
195
|
+
sql = a.map{|s|
|
|
196
|
+
"lower(#{s})"
|
|
197
|
+
}.join(",")
|
|
198
|
+
|
|
199
|
+
sql
|
|
200
|
+
end
|
|
188
201
|
end
|
|
189
202
|
|
|
190
203
|
|
|
@@ -215,11 +228,15 @@ ActiveRecord::Base.connection.execute(pre_sql)
|
|
|
215
228
|
$options[:unique].each do |field|
|
|
216
229
|
begin
|
|
217
230
|
puts "Checking uniqueness: #{field}"
|
|
218
|
-
|
|
231
|
+
if $options[:case_insensitive]
|
|
232
|
+
f_lower = field.lower
|
|
233
|
+
else
|
|
234
|
+
f_lower = field
|
|
235
|
+
end
|
|
219
236
|
uniq_sql = <<-eos
|
|
220
237
|
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('[#{field}] is not unique', ' || '), ' || ')
|
|
221
238
|
WHERE id IN (
|
|
222
|
-
SELECT unnest(array_agg(id)) FROM #{$options[:table]} GROUP BY #{field}
|
|
239
|
+
SELECT unnest(array_agg(id)) FROM #{$options[:table]} GROUP BY #{f_lower}
|
|
223
240
|
HAVING count(*) > 1
|
|
224
241
|
) AND #{field.not_null_sql};
|
|
225
242
|
eos
|
|
@@ -239,19 +256,24 @@ $options[:consistent_by].each do |fields|
|
|
|
239
256
|
|
|
240
257
|
raise "input must be in field1|field2 format" if fields.count != 2
|
|
241
258
|
|
|
242
|
-
f1 = fields.first
|
|
243
|
-
f2 = fields.last
|
|
259
|
+
f1_case = f1 = fields.first
|
|
260
|
+
f2_case = f2 = fields.last
|
|
261
|
+
|
|
262
|
+
if $options[:case_insensitive]
|
|
263
|
+
f1_case = f1_case.lower
|
|
264
|
+
f2_case = f2_case.lower
|
|
265
|
+
end
|
|
244
266
|
|
|
245
|
-
puts "Checking
|
|
267
|
+
puts "Checking consistency: #{f1} | #{f2}"
|
|
246
268
|
|
|
247
269
|
uniq_sql = <<-eos
|
|
248
|
-
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('same [#{f2}] but with different #{f1}', ' || '), ' || ')
|
|
270
|
+
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('same [#{f2}] but with different [#{f1}]', ' || '), ' || ')
|
|
249
271
|
WHERE id IN
|
|
250
272
|
(
|
|
251
273
|
SELECT unnest(array_agg(id)) FROM #{$options[:table]}
|
|
252
274
|
WHERE #{f1.not_null_sql} AND #{f2.not_null_sql}
|
|
253
|
-
GROUP BY #{f2}
|
|
254
|
-
HAVING COUNT(distinct #{f1}) > 1
|
|
275
|
+
GROUP BY #{f2_case}
|
|
276
|
+
HAVING COUNT(distinct #{f1_case}) > 1
|
|
255
277
|
);
|
|
256
278
|
eos
|
|
257
279
|
|
|
@@ -340,19 +362,25 @@ $options[:cross_reference].each do |value|
|
|
|
340
362
|
raise "Error: Wrong argument for --cross-reference switch"
|
|
341
363
|
exit(0)
|
|
342
364
|
end
|
|
343
|
-
|
|
365
|
+
|
|
344
366
|
field = values[0]
|
|
345
367
|
ref_table = values[1]
|
|
346
368
|
ref_field = values[2]
|
|
347
369
|
|
|
348
370
|
puts "Checking data integrity: #{value}"
|
|
371
|
+
|
|
372
|
+
if $options[:case_insensitive]
|
|
373
|
+
join_condition = "on lower(origin.#{field}) = lower(target.#{ref_field})"
|
|
374
|
+
else
|
|
375
|
+
join_condition = "on origin.#{field} = target.#{ref_field}"
|
|
376
|
+
end
|
|
349
377
|
|
|
350
378
|
# @todo: poor performance here, think of a better SQL!!!
|
|
351
379
|
ref_sql = <<-eos
|
|
352
380
|
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('#{field} does not reference #{values[1]}.#{values[2]}', ' || '), ' || ')
|
|
353
381
|
WHERE #{field} IN (
|
|
354
382
|
SELECT origin.#{field} from #{$options[:table]} origin LEFT JOIN #{ref_table} target
|
|
355
|
-
|
|
383
|
+
#{join_condition}
|
|
356
384
|
where target.#{ref_field} is null
|
|
357
385
|
) AND #{field} IS NOT NULL AND length(trim(#{field})) <> 0;
|
|
358
386
|
eos
|
data/lib/idata/version.rb
CHANGED
data/sample.sh
CHANGED
|
@@ -94,7 +94,7 @@ ivalidate --table=$ITEM \
|
|
|
94
94
|
# Step 3 - Generate summary report
|
|
95
95
|
###################################################################################
|
|
96
96
|
# After the validation step above, an additional field named validation_errors
|
|
97
|
-
# is added to every table. In case the record does not pass a validation
|
|
97
|
+
# is added to every table. In case the record does not pass a validation criterion, a corresponding error shall be logged to this field
|
|
98
98
|
# One record may have more than one error logged
|
|
99
99
|
#
|
|
100
100
|
# You can simply look at the validation_errors field to see errors associated to a record
|
|
@@ -104,7 +104,6 @@ ivalidate --table=$ITEM \
|
|
|
104
104
|
# 2 Extract the first 1000 sample records for every error
|
|
105
105
|
# 3 Put all together into one single Excel report
|
|
106
106
|
|
|
107
|
-
|
|
108
107
|
# 1) Create error summary report table and write to /tmp/summary.csv
|
|
109
108
|
# This can be done using the iexport utility which can generate a CSV file from a data table or from a custom query
|
|
110
109
|
# Run iexport --help for more information
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: idata
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.1.28
|
|
4
|
+
version: 0.1.29
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Nghi Pham
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2014-07-
|
|
11
|
+
date: 2014-07-24 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: bundler
|