idata 0.1.28 → 0.1.29
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +15 -3
- data/bin/ivalidate +38 -10
- data/lib/idata/version.rb +1 -1
- data/sample.sh +1 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d37af89a0b0962c026e20d63637f3a750030429f
|
4
|
+
data.tar.gz: a67cdfd07638f57788c837667c01e4f259867fa6
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 58eec49a072286d87b643f58be7c456e0d9140c7341b0816fc46b790241a9ef07fff0c2e86f72b2eda17f3c76defebb3259bd0fc50f912a9e90ff0fda2cd0511
|
7
|
+
data.tar.gz: bd83cd1f605863ed36bdf6c71236233bed41a0830013683b6906c9834fe23b12d482807eef38d318f6dec364ab14d55d8017734555ec01a63b6faa0270d67f10
|
data/README.md
CHANGED
@@ -1,14 +1,26 @@
|
|
1
|
-
#
|
1
|
+
# Overview
|
2
2
|
We provide some useful utilities for validating data in a PostgreSQL data table
|
3
3
|
These utilities can be used as simple terminal commands and can be installed by:
|
4
4
|
|
5
5
|
gem install idata
|
6
6
|
|
7
|
+
idata comes along with the following commands:
|
8
|
+
* iload
|
9
|
+
* ivalidate
|
10
|
+
* ipatch
|
11
|
+
* ieval
|
12
|
+
* iexport
|
13
|
+
* imerge
|
14
|
+
* isanitize
|
15
|
+
|
16
|
+
Run a command with the --help switch for details
|
17
|
+
|
7
18
|
Prerequisites:
|
8
19
|
* PostgreSQL 9.0 or above
|
9
20
|
* Ruby 2.0 or above
|
21
|
+
* An auto-generated `id` field is required on any data table to be validated using ivalidate
|
10
22
|
|
11
|
-
#
|
23
|
+
# Usage
|
12
24
|
Suppose we have an `items` table, and we want to validate its records against certain criteria like:
|
13
25
|
|
14
26
|
* `item_id` must not be null
|
@@ -65,7 +77,7 @@ For example, the following two checks are equivalent:
|
|
65
77
|
Note: run `ivalidate --help` to see the full list of supported switches
|
66
78
|
|
67
79
|
|
68
|
-
#
|
80
|
+
# Put it all together
|
69
81
|
You can put several `ivalidate` commands (for several data tables) in one single bash/sh file.
|
70
82
|
Besides `ivalidate`, we also support some other utilities to:
|
71
83
|
+ Load data from text files to SQL tables
|
data/bin/ivalidate
CHANGED
@@ -73,6 +73,10 @@ parser = OptionParser.new("", 24) do |opts|
|
|
73
73
|
opts.on("--pretty", "Use more human-readable error message") do |v|
|
74
74
|
$options[:pretty] = v
|
75
75
|
end
|
76
|
+
|
77
|
+
opts.on("--case-insensitive", "Ignore letter case when comparing values in uniqueness, consistency and cross-reference checks") do |v|
|
78
|
+
$options[:case_insensitive] = v
|
79
|
+
end
|
76
80
|
|
77
81
|
opts.on("-h", "--host HOST", "PostgreSQL host") do |v|
|
78
82
|
$options[:host] = v
|
@@ -185,6 +189,15 @@ class String
|
|
185
189
|
|
186
190
|
"(#{sql})"
|
187
191
|
end
|
192
|
+
|
193
|
+
def lower
|
194
|
+
a = self.split(/\s*,\s*/)
|
195
|
+
sql = a.map{|s|
|
196
|
+
"lower(#{s})"
|
197
|
+
}.join(",")
|
198
|
+
|
199
|
+
sql
|
200
|
+
end
|
188
201
|
end
|
189
202
|
|
190
203
|
|
@@ -215,11 +228,15 @@ ActiveRecord::Base.connection.execute(pre_sql)
|
|
215
228
|
$options[:unique].each do |field|
|
216
229
|
begin
|
217
230
|
puts "Checking uniqueness: #{field}"
|
218
|
-
|
231
|
+
if $options[:case_insensitive]
|
232
|
+
f_lower = field.lower
|
233
|
+
else
|
234
|
+
f_lower = field
|
235
|
+
end
|
219
236
|
uniq_sql = <<-eos
|
220
237
|
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('[#{field}] is not unique', ' || '), ' || ')
|
221
238
|
WHERE id IN (
|
222
|
-
SELECT unnest(array_agg(id)) FROM #{$options[:table]} GROUP BY #{field}
|
239
|
+
SELECT unnest(array_agg(id)) FROM #{$options[:table]} GROUP BY #{f_lower}
|
223
240
|
HAVING count(*) > 1
|
224
241
|
) AND #{field.not_null_sql};
|
225
242
|
eos
|
@@ -239,19 +256,24 @@ $options[:consistent_by].each do |fields|
|
|
239
256
|
|
240
257
|
raise "input must be in field1|field2 format" if fields.count != 2
|
241
258
|
|
242
|
-
f1 = fields.first
|
243
|
-
f2 = fields.last
|
259
|
+
f1_case = f1 = fields.first
|
260
|
+
f2_case = f2 = fields.last
|
261
|
+
|
262
|
+
if $options[:case_insensitive]
|
263
|
+
f1_case = f1_case.lower
|
264
|
+
f2_case = f2_case.lower
|
265
|
+
end
|
244
266
|
|
245
|
-
puts "Checking
|
267
|
+
puts "Checking consistency: #{f1} | #{f2}"
|
246
268
|
|
247
269
|
uniq_sql = <<-eos
|
248
|
-
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('same [#{f2}] but with different #{f1}', ' || '), ' || ')
|
270
|
+
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('same [#{f2}] but with different [#{f1}]', ' || '), ' || ')
|
249
271
|
WHERE id IN
|
250
272
|
(
|
251
273
|
SELECT unnest(array_agg(id)) FROM #{$options[:table]}
|
252
274
|
WHERE #{f1.not_null_sql} AND #{f2.not_null_sql}
|
253
|
-
GROUP BY #{f2}
|
254
|
-
HAVING COUNT(distinct #{f1}) > 1
|
275
|
+
GROUP BY #{f2_case}
|
276
|
+
HAVING COUNT(distinct #{f1_case}) > 1
|
255
277
|
);
|
256
278
|
eos
|
257
279
|
|
@@ -340,19 +362,25 @@ $options[:cross_reference].each do |value|
|
|
340
362
|
raise "Error: Wrong argument for --cross-reference switch"
|
341
363
|
exit(0)
|
342
364
|
end
|
343
|
-
|
365
|
+
|
344
366
|
field = values[0]
|
345
367
|
ref_table = values[1]
|
346
368
|
ref_field = values[2]
|
347
369
|
|
348
370
|
puts "Checking data integrity: #{value}"
|
371
|
+
|
372
|
+
if $options[:case_insensitive]
|
373
|
+
join_condition = "on lower(origin.#{field}) = lower(target.#{ref_field})"
|
374
|
+
else
|
375
|
+
join_condition = "on origin.#{field} = target.#{ref_field}"
|
376
|
+
end
|
349
377
|
|
350
378
|
# @todo: poor performance here, think of a better SQL!!!
|
351
379
|
ref_sql = <<-eos
|
352
380
|
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('#{field} does not reference #{values[1]}.#{values[2]}', ' || '), ' || ')
|
353
381
|
WHERE #{field} IN (
|
354
382
|
SELECT origin.#{field} from #{$options[:table]} origin LEFT JOIN #{ref_table} target
|
355
|
-
|
383
|
+
#{join_condition}
|
356
384
|
where target.#{ref_field} is null
|
357
385
|
) AND #{field} IS NOT NULL AND length(trim(#{field})) <> 0;
|
358
386
|
eos
|
data/lib/idata/version.rb
CHANGED
data/sample.sh
CHANGED
@@ -94,7 +94,7 @@ ivalidate --table=$ITEM \
|
|
94
94
|
# Step 3 - Generate summary report
|
95
95
|
###################################################################################
|
96
96
|
# After the validation step above, an additional field named validation_errors
|
97
|
-
# is added to every table. In case the record does not pass a validation
|
97
|
+
# is added to every table. In case the record does not pass a validation criterion, a corresponding error shall be logged to this field
|
98
98
|
# One record may have more than one error logged
|
99
99
|
#
|
100
100
|
# You can simply look at the validation_errors field to see errors associated with a record
|
@@ -104,7 +104,6 @@ ivalidate --table=$ITEM \
|
|
104
104
|
# 2 Extract the first 1000 sample records for every error
|
105
105
|
# 3 Put all together into one single Excel report
|
106
106
|
|
107
|
-
|
108
107
|
# 1) Create error summary report table and write to /tmp/summary.csv
|
109
108
|
# This can be done using the iexport utility which can generate a CSV file from a data table or from a custom query
|
110
109
|
# Run iexport --help for more information
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: idata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.28
|
4
|
+
version: 0.1.29
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nghi Pham
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|