idata 0.1.26 → 0.1.27
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +65 -17
- data/bin/ivalidate +17 -3
- data/lib/idata/version.rb +1 -1
- data/sample.sh +143 -0
- metadata +3 -3
- data/guideline/Guideline.docx +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a77794735a66403f2d15fb44b25a24ae954a07d0
|
4
|
+
data.tar.gz: 6ffa6f9e348842a760bdc4b1ea5895fad91bd1c8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: add59efee5a88c2f914008e585d0e3bb6f988a25b56e31a3314155ec9b93c99390d659f4f9967645d4857345815486ebdbf01cbb4af66cb24f7bcd4d40a56b19
|
7
|
+
data.tar.gz: 9b0d4e913980fc6fdefad215a5bafe3d59effce4da0ac256e17da63b6a53d09a71b4c42c964418dea3d17b0b54ec73ea5203915c6d7faffa2c73c181b9e9cec4
|
data/README.md
CHANGED
@@ -1,29 +1,77 @@
|
|
1
|
-
#
|
1
|
+
# OVERVIEW
|
2
|
+
We provide some useful utilities for validating data in a PostgreSQL data table
|
3
|
+
These utilities can be used as simple terminal commands and can be installed by:
|
2
4
|
|
3
|
-
|
5
|
+
gem install idata
|
4
6
|
|
5
|
-
|
7
|
+
Prerequisites:
|
8
|
+
* PostgreSQL 9.0 or above
|
9
|
+
* Ruby 2.0 or above
|
6
10
|
|
7
|
-
|
11
|
+
# USAGE
|
12
|
+
Suppose we have an `items` table, and we want to validate its records against certain criteria like:
|
8
13
|
|
9
|
-
|
14
|
+
* `item_id` must not be null
|
15
|
+
* `item_title` must not be null
|
16
|
+
* The composite `[item_id, item_title]` must be unique
|
17
|
+
* One `item_id` corresponds to only ONE `item_title` (in other words, there must not be two items with different titles but with the same `item_id`)
|
18
|
+
and vice-versa
|
19
|
+
* `vendor_code` must reference the `code` column in the `vendors` table
|
10
20
|
|
11
|
-
|
21
|
+
Then the validation command could be:
|
22
|
+
```
|
23
|
+
ivalidate --host=localhost --user=postgres --database=mydb --table=items --log-to=validation_errors \
|
24
|
+
--not-null="vendor_id" \
|
25
|
+
--not-null="vendor_name" \
|
26
|
+
--unique="vendor_id,vendor_name" \
|
27
|
+
--consistent-by="vendor_id|vendor_name" \
|
28
|
+
--consistent-by="vendor_id|vendor_name" \
|
29
|
+
--cross-reference="vendor_code|vendors.code"
|
30
|
+
```
|
31
|
+
Validation results for every single record are logged to an additional column named `validation_errors`
|
32
|
+
of the `items` table, as specified by the `--log-to` switch
|
12
33
|
|
13
|
-
|
34
|
+
As you can see, most common checks can be performed using the supported switches:
|
35
|
+
```
|
36
|
+
--not-null
|
37
|
+
--unique
|
38
|
+
--consistent-by
|
39
|
+
--cross-reference
|
40
|
+
```
|
41
|
+
For more generic check, we support some other switches:
|
14
42
|
|
15
|
-
|
43
|
+
The `--match="field/pattern/"` switch tells the program to check if value of a `field` matches the provided `pattern` (which is a regular expression).
|
44
|
+
For example:
|
45
|
+
```
|
46
|
+
# Check if item_id is a number:
|
47
|
+
ivalidate --match="item_id/[0-9]+/"
|
48
|
+
|
49
|
+
# Check if value of status is either 'A' or 'I' (any other value is not allowed)
|
50
|
+
ivalidate --match="status/^(A|I)$/"
|
51
|
+
```
|
52
|
+
In case you need even more customized validation other than the supported ones (match, unique, not-null, cross-reference...)
|
53
|
+
then `--query` switch may be helpful. For example:
|
54
|
+
```
|
55
|
+
ivalidate --query="string_to_date(start_date) >= '01/02/2014' -- invalid date"
|
56
|
+
```
|
57
|
+
You can also use `--rquery` which is the reversed counterpart of `--query`
|
58
|
+
For example, the following two checks are equivalent:
|
59
|
+
```
|
60
|
+
ivalidate --query="string_to_date(start_date) >= '01/02/2014' -- invalid date"
|
61
|
+
ivalidate --rquery="string_to_date(start_date) < '01/02/2014' -- invalid date"
|
62
|
+
```
|
63
|
+
(mark any record whose `start_date < '01/02/2014'` as "invalid date")
|
16
64
|
|
17
|
-
|
65
|
+
Note: run `ivalidate --help` to see the full list of supported switches
|
18
66
|
|
19
|
-
## Usage
|
20
67
|
|
21
|
-
|
68
|
+
# PUT IT ALL TOGETHER
|
69
|
+
You can put several `ivalidate` commands (for several data tables) in one single bash/sh file.
|
70
|
+
Besides `ivalidate`, we also support some other utilities to:
|
71
|
+
+ Load data from text files to SQL tables
|
72
|
+
+ Modify data tables
|
73
|
+
+ Generate summary reports
|
74
|
+
|
75
|
+
For a full example, see our `sample.sh`
|
22
76
|
|
23
|
-
## Contributing
|
24
77
|
|
25
|
-
1. Fork it
|
26
|
-
2. Create your feature branch (`git checkout -b my-new-feature`)
|
27
|
-
3. Commit your changes (`git commit -am 'Add some feature'`)
|
28
|
-
4. Push to the branch (`git push origin my-new-feature`)
|
29
|
-
5. Create new Pull Request
|
data/bin/ivalidate
CHANGED
@@ -69,6 +69,10 @@ parser = OptionParser.new("", 24) do |opts|
|
|
69
69
|
opts.on("--log-to FIELD", "Field to log error to") do |v|
|
70
70
|
$options[:log_to] = v
|
71
71
|
end
|
72
|
+
|
73
|
+
opts.on("--pretty", "Use more human-readable error message") do |v|
|
74
|
+
$options[:pretty] = v
|
75
|
+
end
|
72
76
|
|
73
77
|
opts.on("-h", "--host HOST", "PostgreSQL host") do |v|
|
74
78
|
$options[:host] = v
|
@@ -270,8 +274,16 @@ $options[:not_null].each do |field|
|
|
270
274
|
eos
|
271
275
|
|
272
276
|
ActiveRecord::Base.connection.execute(not_null_sql)
|
277
|
+
|
273
278
|
rescue Exception => ex
|
274
|
-
|
279
|
+
if ex.message.include?('PG::UndefinedColumn')
|
280
|
+
not_null_sql = <<-eos
|
281
|
+
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('Column `#{field}` is missing', ' || '), ' || ')
|
282
|
+
eos
|
283
|
+
ActiveRecord::Base.connection.execute(not_null_sql)
|
284
|
+
else
|
285
|
+
puts " --> *** ERROR ***: #{ex.message.split("\n").first }"
|
286
|
+
end
|
275
287
|
end
|
276
288
|
end
|
277
289
|
|
@@ -283,9 +295,11 @@ $options[:match].each do |value|
|
|
283
295
|
field = value[/^[^\/]+/]
|
284
296
|
regexp = value[/(?<=\/).*(?=\/)/]
|
285
297
|
puts "Checking REGEXP matching: #{field} ~ #{regexp}"
|
286
|
-
|
298
|
+
|
299
|
+
msg = $options[:pretty] ? "Invalid #{field}" : "#{field} does not match [#{regexp.gsub("'", "''")}]"
|
300
|
+
|
287
301
|
match_sql = <<-eos
|
288
|
-
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('#{
|
302
|
+
UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('#{msg}', ' || '), ' || ')
|
289
303
|
WHERE #{field} IS NOT NULL AND length(trim(#{field})) <> 0 AND #{field} !~ '#{regexp}';
|
290
304
|
eos
|
291
305
|
|
data/lib/idata/version.rb
CHANGED
data/sample.sh
ADDED
@@ -0,0 +1,143 @@
|
|
1
|
+
# Idea: to validate data in a text file, we first load it to a data table
|
2
|
+
# then use the validation utilities to validate the table
|
3
|
+
#
|
4
|
+
# The followings are performed by this script:
|
5
|
+
# * 1 Load raw text files into corresponding SQL tables
|
6
|
+
# * 2 Perform validation
|
7
|
+
# * 3 Generate reports
|
8
|
+
#
|
9
|
+
# @author NghiPM
|
10
|
+
# @date May 2014
|
11
|
+
|
12
|
+
###################################################################################
|
13
|
+
# SET UP ENVIRONMENT VARIABLES
|
14
|
+
###################################################################################
|
15
|
+
# Instead of passing PostgreSQL credentials as parameters to every validation command,
|
16
|
+
# you can set the corresponding environment variables which can be used by those commands
|
17
|
+
export HOST="localhost"
|
18
|
+
export USERNAME="postgres"
|
19
|
+
export PASSWORD="postgres"
|
20
|
+
export DATABASE="northeast_georgia"
|
21
|
+
export LISTEN=5432
|
22
|
+
|
23
|
+
# Input file paths and corresponding table names
|
24
|
+
FVENDOR="VendorMaster.csv"
|
25
|
+
VENDOR="vendors"
|
26
|
+
|
27
|
+
FITEM="ItemMaster.csv"
|
28
|
+
ITEM="items"
|
29
|
+
|
30
|
+
# Specify a temp folder for writing temporary outputs
|
31
|
+
# Specify the path to the output summary report
|
32
|
+
TMP="/tmp"
|
33
|
+
REPORT="/tmp/report.xls"
|
34
|
+
|
35
|
+
###################################################################################
|
36
|
+
# STEP 1 - Load raw text files into corresponding SQL tables
|
37
|
+
###################################################################################
|
38
|
+
|
39
|
+
# Load data from VendorMaster.csv to the corresponding vendors table
|
40
|
+
# and from ItemMaster.csv to items table.
|
41
|
+
# Note: instead of using the iload utility, you can use the COPY command of PostgreSQL (psql \copy)
|
42
|
+
iload -i "$FVENDOR" -t "$VENDOR" -f csv
|
43
|
+
iload -i "$FITEM" -t "$ITEM" -f csv
|
44
|
+
|
45
|
+
###################################################################################
|
46
|
+
# STEP 2 - Perform validation, log the results to an additional field
|
47
|
+
###################################################################################
|
48
|
+
# validate the vendors table
|
49
|
+
ivalidate --table=$VENDOR --log-to=validation_errors \
|
50
|
+
--not-null="vendor_code" \
|
51
|
+
--not-null="vendor_name" \
|
52
|
+
--unique="vendor_code" \
|
53
|
+
--unique="vendor_name" \
|
54
|
+
--match="vendor_code/[a-zA-Z0-9]/" \
|
55
|
+
--match="vendor_name/[a-zA-Z0-9]/" \
|
56
|
+
--consistent-by="vendor_code|vendor_name" \
|
57
|
+
--consistent-by="vendor_name|vendor_code" \
|
58
|
+
--consistent-by="country_code|country_name" \
|
59
|
+
--consistent-by="country_name|country_code"
|
60
|
+
|
61
|
+
# validate the items table
|
62
|
+
ivalidate --table=$ITEM \
|
63
|
+
--log-to=validation_errors \
|
64
|
+
--not-null="item_id" \
|
65
|
+
--match="item_id/[a-zA-Z0-9]/" \
|
66
|
+
--not-null="item_desc" \
|
67
|
+
--match="item_desc/[a-zA-Z0-9]/" \
|
68
|
+
--not-null="item_uom" \
|
69
|
+
--not-null="default_uom" \
|
70
|
+
--not-null="item_price" \
|
71
|
+
--not-null="item_qoe" \
|
72
|
+
--not-null="corp_id" \
|
73
|
+
--not-null="corp_name" \
|
74
|
+
--not-null="vendor_code" \
|
75
|
+
--not-null="vendor_name" \
|
76
|
+
--not-null="mfr_number" \
|
77
|
+
--not-null="mfr_name" \
|
78
|
+
--not-null="active" \
|
79
|
+
--match="corp_id/[a-zA-Z0-9]/" \
|
80
|
+
--match="corp_name/[a-zA-Z0-9]/" \
|
81
|
+
--match="vendor_code/[a-zA-Z0-9]/" \
|
82
|
+
--match="vendor_name/[a-zA-Z0-9]/" \
|
83
|
+
--match="mfr_number/[a-zA-Z0-9]/" \
|
84
|
+
--match="mfr_name/[a-zA-Z0-9]/" \
|
85
|
+
--match="active/^(1|2|3|A|I)$/" \
|
86
|
+
--consistent-by="corp_id|corp_name" \
|
87
|
+
--consistent-by="corp_name|corp_id" \
|
88
|
+
--consistent-by="vendor_code|vendor_name" \
|
89
|
+
--consistent-by="vendor_name|vendor_code" \
|
90
|
+
--cross-reference="vendor_code|$VENDOR.vendor_code" \
|
91
|
+
--cross-reference="vendor_name|$VENDOR.vendor_name"
|
92
|
+
|
93
|
+
###################################################################################
|
94
|
+
# Step 3 - Generate summary report
|
95
|
+
###################################################################################
|
96
|
+
# After the validation step above, an additional field named validation_errors
|
97
|
+
# is added to every table. In case a record does not pass a validation criterion, a corresponding error shall be logged to this field
|
98
|
+
# One record may have more than one error logged
|
99
|
+
#
|
100
|
+
# You can simply look at the validation_errors field to see errors associated to a record
|
101
|
+
#
|
102
|
+
# Just to make a MORE comprehensive report, we can:
|
103
|
+
# 1 Create a summary table which tells us how many errors were found, how many records are associated with each...
|
104
|
+
# 2 Extract the first 1000 sample records for every error
|
105
|
+
# 3 Put all together into one single Excel report
|
106
|
+
|
107
|
+
|
108
|
+
# 1) Create error summary report table and write to /tmp/summary.csv
|
109
|
+
# This can be done using the iexport utility which can generate a CSV file from a data table or from a custom query
|
110
|
+
# Run iexport --help for more information
|
111
|
+
iexport --output="$TMP/summary.csv" -f csv --no-quote-empty --quotes --headers \
|
112
|
+
--query="(select '$FVENDOR' as input_file, unnest(string_to_array(validation_errors, ' || ')) as error, count(*), round((count(*) * 100)::numeric / (select count(*) from $VENDOR), 2)::varchar || '%' as percentage from $VENDOR group by error order by error) union
|
113
|
+
(select '$FITEM' as input_file, unnest(string_to_array(validation_errors, ' || ')) as error, count(*), round((count(*) * 100)::numeric / (select count(*) from $ITEM), 2)::varchar || '%' as percentage from $ITEM group by error order by error)"
|
114
|
+
|
115
|
+
# Export the first 1000 records of every error in the items table
|
116
|
+
# Write the results to /tmp/items.csv
|
117
|
+
iexport --table=$VENDOR --output="$TMP/$VENDOR.csv" -f csv --no-quote-empty --quotes --headers \
|
118
|
+
--query="select * from (select ROW_NUMBER() OVER (PARTITION BY error) AS group_index, *
|
119
|
+
FROM ( select unnest(string_to_array(validation_errors, ' || ')) as error, * from
|
120
|
+
$VENDOR order by id ) as main) as tmp
|
121
|
+
where group_index <= 1000" \
|
122
|
+
--exclude="id, validation_errors, group_index"
|
123
|
+
|
124
|
+
# 2) Export the first 1000 records of every error in the vendors table
|
125
|
+
# Write the results to /tmp/vendors.csv
|
126
|
+
iexport --table=$ITEM --output="$TMP/$ITEM.csv" -f csv --no-quote-empty --quotes --headers \
|
127
|
+
--query="select * from (select ROW_NUMBER() OVER (PARTITION BY error) AS group_index, *
|
128
|
+
FROM ( select unnest(string_to_array(validation_errors, ' || ')) as error, * from
|
129
|
+
$ITEM order by id ) as main) as tmp
|
130
|
+
where group_index <= 1000" \
|
131
|
+
--exclude="id, validation_errors, group_index"
|
132
|
+
|
133
|
+
# 3) Put the above 3 CSV files into one Excel file /tmp/report.xls
|
134
|
+
# This can be done using imerge, which takes a list of CSV files and puts them into corresponding sheets
|
135
|
+
# of one single Excel file
|
136
|
+
imerge --output=$REPORT \
|
137
|
+
--input="Summary:$TMP/summary.csv" \
|
138
|
+
--input="$FVENDOR:$TMP/$VENDOR.csv" \
|
139
|
+
--input="ItemMaster:$TMP/$ITEM.csv"
|
140
|
+
|
141
|
+
# CLEANUP
|
142
|
+
# Remember to drop the temporary tables you create (items and vendors)
|
143
|
+
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: idata
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.27
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nghi Pham
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-07-
|
11
|
+
date: 2014-07-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -92,10 +92,10 @@ files:
|
|
92
92
|
- bin/ipatch
|
93
93
|
- bin/isanitize
|
94
94
|
- bin/ivalidate
|
95
|
-
- guideline/Guideline.docx
|
96
95
|
- idata.gemspec
|
97
96
|
- lib/idata.rb
|
98
97
|
- lib/idata/version.rb
|
98
|
+
- sample.sh
|
99
99
|
homepage: http://bolero.vn
|
100
100
|
licenses:
|
101
101
|
- MIT
|
data/guideline/Guideline.docx
DELETED
Binary file
|