idata 0.1.26 → 0.1.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +65 -17
- data/bin/ivalidate +17 -3
- data/lib/idata/version.rb +1 -1
- data/sample.sh +143 -0
- metadata +3 -3
- data/guideline/Guideline.docx +0 -0
checksums.yaml
CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a77794735a66403f2d15fb44b25a24ae954a07d0
+  data.tar.gz: 6ffa6f9e348842a760bdc4b1ea5895fad91bd1c8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: add59efee5a88c2f914008e585d0e3bb6f988a25b56e31a3314155ec9b93c99390d659f4f9967645d4857345815486ebdbf01cbb4af66cb24f7bcd4d40a56b19
+  data.tar.gz: 9b0d4e913980fc6fdefad215a5bafe3d59effce4da0ac256e17da63b6a53d09a71b4c42c964418dea3d17b0b54ec73ea5203915c6d7faffa2c73c181b9e9cec4
```
data/README.md
CHANGED

````diff
@@ -1,29 +1,77 @@
-#
+# OVERVIEW
+We provide some useful utilities for validating data in a PostgreSQL data table
+These utilities can be used as simple terminal commands and can be installed by:
 
-
+    gem install idata
 
-
+Prequisites:
+* PostgreSQL 9.0 or above
+* Ruby 2.0 or above
 
-
+# USAGE
+Suppose we have an `items` table, and we want to validate its records against certain criteria like:
 
-
+* `item_id` must not be null
+* `item_title` must not be null
+* The composite `[item_id, item_title]` must be unique
+* One `item_id` corresponds to only ONE `item_title` (in other words, there must not be two items with different titles but with the same `item_id`)
+and vice-versa
+* `vendor_code` must reference the `code` column in the `vendors` table
 
-
+Then the validation command could be:
+```
+ivalidate --host=localhost --user=postgres --database=mydb --table=items --log-to=validation_errors \
+  --not-null="vendor_id" \
+  --not-null="vendor_name" \
+  --unique="vendor_id,vendor_name" \
+  --consistent-by="vendor_id|vendor_name" \
+  --consistent-by="vendor_id|vendor_name" \
+  --cross-reference="vendor_code|vendors.code"
+```
+Validation results for every single record are logged to an additional column named `validation_errors`
+of the `items` table, as specified by the `--log-to` switch
 
-
+As you can see, most common checks can be performed using the supported switches:
+```
+--not-null
+--unique
+--consistent-by
+--cross-reference
+```
+For more generic check, we support some other switches:
 
-
+The `--match="field/pattern/"` switch tells the program to check if value of a `field` matches the provided `pattern` (which is a regular expression).
+For example:
+```
+# Check if item_id is a number:
+ivalidate --match="item_id/[0-9]+/"
+
+# Check if value of status is either 'A' or 'I' (any other value is not allowed)
+ivalidate --match="status/^(A|I)$/"
+```
+In case you need even more customized validation other than the supported ones (match, unique, not-null, cross-reference...)
+then `--query` switch may be helpful. For example:
+```
+ivalidate --query="string_to_date(start_date) >= '01/02/2014' -- invalid date"
+```
+You can also use `--rquery` which is the reversed counterpart of `--query`
+For example, the following two checks are equivalent:
+```
+ivalidate --query="string_to_date(start_date) >= '01/02/2014' -- invalid date"
+ivalidate --rquery="string_to_date(start_date) < '01/02/2014' -- invalid date"
+```
+(mark any record whose `start_date < '01/02/2014'` as "invalid date")
 
-
+Note: run `ivalidate --help` to see the full list of supported switches
 
-## Usage
 
-
+# PUT IT ALL TOGETHER
+You can put several `ivalidate` commands (for several data tables) in one single bash/sh file.
+Besides `ivalidate`, we also support some other utilities to:
++ Load data from text files to SQL tables
++ Modify data tables
++ Generate summary reports
+
+For a full example, see our `sample.sh`
 
-## Contributing
 
-1. Fork it
-2. Create your feature branch (`git checkout -b my-new-feature`)
-3. Commit your changes (`git commit -am 'Add some feature'`)
-4. Push to the branch (`git push origin my-new-feature`)
-5. Create new Pull Request
````
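The README describes errors being logged to a single `validation_errors` column per record. The format is one `' || '`-separated string of messages. The following is an illustration only (not code from the gem, and the error messages are made up) of how such a value accumulates, mirroring the `array_to_string(string_to_array(...) || string_to_array(...), ' || ')` pattern the tool uses in SQL:

```ruby
# Illustration only (not gem code): how a ' || '-separated
# validation_errors value grows as successive checks fail.
SEPARATOR = ' || '.freeze

# Append one message to an existing log value (nil means "no errors yet").
def append_error(log, message)
  ((log || '').split(SEPARATOR) + [message]).join(SEPARATOR)
end

log = nil
log = append_error(log, 'vendor_id is null')    # made-up message
log = append_error(log, 'vendor_name is null')  # made-up message
puts log  # vendor_id is null || vendor_name is null
```

A record that passes every check keeps an empty/NULL log, so filtering failed records is a simple `WHERE validation_errors IS NOT NULL` style query.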
data/bin/ivalidate
CHANGED

```diff
@@ -69,6 +69,10 @@ parser = OptionParser.new("", 24) do |opts|
   opts.on("--log-to FIELD", "Field to log error to") do |v|
     $options[:log_to] = v
   end
+
+  opts.on("--pretty", "Use more human-readable error message") do |v|
+    $options[:pretty] = v
+  end
 
   opts.on("-h", "--host HOST", "PostgreSQL host") do |v|
     $options[:host] = v
@@ -270,8 +274,16 @@ $options[:not_null].each do |field|
   eos
 
   ActiveRecord::Base.connection.execute(not_null_sql)
+
 rescue Exception => ex
-
+  if ex.message.include?('PG::UndefinedColumn')
+    not_null_sql = <<-eos
+      UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('Column `#{field}` is missing', ' || '), ' || ')
+    eos
+    ActiveRecord::Base.connection.execute(not_null_sql)
+  else
+    puts " --> *** ERROR ***: #{ex.message.split("\n").first }"
+  end
 end
 end
 
@@ -283,9 +295,11 @@ $options[:match].each do |value|
   field = value[/^[^\/]+/]
   regexp = value[/(?<=\/).*(?=\/)/]
   puts "Checking REGEXP matching: #{field} ~ #{regexp}"
-
+
+  msg = $options[:pretty] ? "Invalid #{field}" : "#{field} does not match [#{regexp.gsub("'", "''")}]"
+
   match_sql = <<-eos
-    UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('#{
+    UPDATE #{$options[:table]} SET #{$options[:log_to]} = array_to_string(string_to_array(#{$options[:log_to]}, ' || ') || string_to_array('#{msg}', ' || '), ' || ')
     WHERE #{field} IS NOT NULL AND length(trim(#{field})) <> 0 AND #{field} !~ '#{regexp}';
   eos
```
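The match hunk above parses each `--match` value of the form `field/pattern/` with two Ruby regexps, and the new `--pretty` switch shortens the logged message. Here is a standalone rerun of exactly that parsing logic (the sample value is taken from the README's `--match="status/^(A|I)$/"` example):

```ruby
# Re-running the parsing shown in the diff above on a sample --match value.
value  = 'status/^(A|I)$/'
field  = value[/^[^\/]+/]          # everything before the first '/'
regexp = value[/(?<=\/).*(?=\/)/]  # everything between the first and last '/'

# Without --pretty, single quotes in the pattern are doubled so the message
# embeds safely inside the generated SQL string literal.
pretty = false
msg = pretty ? "Invalid #{field}" : "#{field} does not match [#{regexp.gsub("'", "''")}]"

puts field   # status
puts regexp  # ^(A|I)$
puts msg     # status does not match [^(A|I)$]
```

Because the second regexp is greedy between the first and last slash, slashes inside the pattern itself are preserved; only the outer delimiters are stripped.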
data/lib/idata/version.rb
CHANGED
data/sample.sh
ADDED (143 lines)

```sh
# Idea: to validate data in a text file, we first load it to a data table
# then use the validation utilities to validate the table
#
# The followings are performed by this script:
# * 1 Load raw text files into corresponding SQL tables
# * 2 Perform validation
# * 3 Generate reports
#
# @author NghiPM
# @date May 2014

###################################################################################
# SET UP ENVIRONMENT VARIABLES
###################################################################################
# Instead of passing PostgreSQL credentials as parameters to every validation command,
# you can set the corresponding environment variables which can be used by the those commands
export HOST="localhost"
export USERNAME="postgres"
export PASSWORD="postgres"
export DATABASE="northeast_georgia"
export LISTEN=5432

# Input file paths and corresponding table names
FVENDOR="VendorMaster.csv"
VENDOR="vendors"

FITEM="ItemMaster.csv"
ITEM="items"

# Specify a temp folder for writing temporary outputs
# Specify the path to the output summary report
TMP="/tmp"
REPORT="/tmp/report.xls"

###################################################################################
# STEP 1 - Load raw text files into corresponding SQL tables
###################################################################################

# Load data from VendorMaster.csv to the corresponding vendors table
# and from ItemMaster.csv to items table.
# Note: instead of using iload utility,you can use the PSQL COPY of PostgreSQL
iload -i "$FVENDOR" -t "$VENDOR" -f csv
iload -i "$FITEM" -t "$ITEM" -f csv

###################################################################################
# STEP 2 - Perform validation, log the results to an additional field
###################################################################################
# validate the vendors table
ivalidate --table=$VENDOR --log-to=validation_errors \
  --not-null="vendor_code" \
  --not-null="vendor_name" \
  --unique="vendor_code" \
  --unique="vendor_name" \
  --match="vendor_code/[a-zA-Z0-9]/" \
  --match="vendor_name/[a-zA-Z0-9]/" \
  --consistent-by="vendor_code|vendor_name" \
  --consistent-by="vendor_name|vendor_code" \
  --consistent-by="country_code|country_name" \
  --consistent-by="country_name|country_code"

# validate the items table
ivalidate --table=$ITEM \
  --log-to=validation_errors \
  --not-null="item_id" \
  --match="item_id/[a-zA-Z0-9]/" \
  --not-null="item_desc" \
  --match="item_desc/[a-zA-Z0-9]/" \
  --not-null="item_uom" \
  --not-null="default_uom" \
  --not-null="item_price" \
  --not-null="item_qoe" \
  --not-null="corp_id" \
  --not-null="corp_name" \
  --not-null="vendor_code" \
  --not-null="vendor_name" \
  --not-null="mfr_number" \
  --not-null="mfr_name" \
  --not-null="active" \
  --match="corp_id/[a-zA-Z0-9]/" \
  --match="corp_name/[a-zA-Z0-9]/" \
  --match="vendor_code/[a-zA-Z0-9]/" \
  --match="vendor_name/[a-zA-Z0-9]/" \
  --match="mfr_number/[a-zA-Z0-9]/" \
  --match="mfr_name/[a-zA-Z0-9]/" \
  --match="active/^(1|2|3|A|I)$/" \
  --consisten-by="corp_id|corp_name" \
  --consisten-by="corp_name|corp_id" \
  --consisten-by="vendor_code|vendor_name" \
  --consisten-by="vendor_name|vendor_code" \
  --cross-reference="vendor_code|$VENDOR.vendor_code" \
  --cross-reference="vendor_name|$VENDOR.vendor_name"

###################################################################################
# Step 3 - Generate summary report
###################################################################################
# After the validation step above, an additional field named validation_errors
# is added to every table. In case the record does not pass a validation creterion, a corresponding error shall be logged to this field
# One record may have more than one error logged
#
# You can simply look at the validation_errors field to see errors associated to a record
#
# Just to make a MORE comprehensive report, we can:
# 1 Create a summary table which tells us how many errors found, how many records associated with each...
# 2 Extract the first 1000 sample records for every error
# 3 Put all together into one single Excel report


# 1) Create error summary report table and write to /tmp/summary.csv
# This can be done using the iexport utility which can generate a CSV file from a data table or from a custom query
# Run iexport --help for more information
iexport --output="$TMP/summary.csv" -f csv --no-quote-empty --quotes --headers \
  --query="(select '$FVENDOR' as input_file, unnest(string_to_array(validation_errors, ' || ')) as error, count(*), round((count(*) * 100)::numeric / (select count(*) from $VENDOR), 2)::varchar || '%' as percentage from $VENDOR group by error order by error) union
           (select '$FITEM' as input_file, unnest(string_to_array(validation_errors, ' || ')) as error, count(*), round((count(*) * 100)::numeric / (select count(*) from $ITEM), 2)::varchar || '%' as percentage from $ITEM group by error order by error)"

# Export the first 1000 records of every error in the items table
# Write the results to /tmp/items.csv
iexport --table=$VENDOR --output="$TMP/$VENDOR.csv" -f csv --no-quote-empty --quotes --headers \
  --query="select * from (select ROW_NUMBER() OVER (PARTITION BY error) AS group_index, *
           FROM ( select unnest(string_to_array(validation_errors, ' || ')) as error, * from
           $VENDOR order by id ) as main) as tmp
           where group_index <= 1000" \
  --exclude="id, validation_errors, group_index"

# 2) Export the first 1000 records of every error in the vendors table
# Write the results to /tmp/vendors.csv
iexport --table=$ITEM --output="$TMP/$ITEM.csv" -f csv --no-quote-empty --quotes --headers \
  --query="select * from (select ROW_NUMBER() OVER (PARTITION BY error) AS group_index, *
           FROM ( select unnest(string_to_array(validation_errors, ' || ')) as error, * from
           $ITEM order by id ) as main) as tmp
           where group_index <= 1000" \
  --exclude="id, validation_errors, group_index"

# 3) Put the above 3 CSV files into one Excel file /tmp/report.xls
# This can be done using imerge which takes a list of CSV files put them to corresponding sheets
# of one single Excel file
imerge --output=$REPORT \
  --input="Summary:$TMP/summary.csv" \
  --input="$FVENDOR:$TMP/$VENDOR.csv" \
  --input="ItemMaster:$TMP/$ITEM.csv"

# CLEANUP
# Remember to drop the temporary tables you create (items and vendors)
```
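STEP 3's summary query unnests each record's `' || '`-separated `validation_errors` value, then counts occurrences per error and reports each count as a share of total records. The same aggregation, sketched in plain Ruby on made-up rows (no database or gem involved; row contents are invented for illustration):

```ruby
# Sketch of the STEP 3 aggregation on made-up data: split each record's
# ' || '-separated error log, count per distinct error, report percentages.
SEPARATOR = ' || '.freeze
rows = [
  'vendor_code is null',
  'vendor_code is null || vendor_name is null',
  nil,  # a clean record logs no errors
  'vendor_name is null',
]

counts = Hash.new(0)
rows.each do |log|
  (log || '').split(SEPARATOR).each { |error| counts[error] += 1 }
end

counts.sort.each do |error, n|
  percentage = (n * 100.0 / rows.size).round(2)
  puts "#{error}: #{n} (#{percentage}%)"
end
```

This is the in-memory analogue of `unnest(string_to_array(validation_errors, ' || ')) ... group by error` in the script's `iexport` query.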
metadata
CHANGED

```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: idata
 version: !ruby/object:Gem::Version
-  version: 0.1.
+  version: 0.1.27
 platform: ruby
 authors:
 - Nghi Pham
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2014-07-
+date: 2014-07-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -92,10 +92,10 @@ files:
 - bin/ipatch
 - bin/isanitize
 - bin/ivalidate
-- guideline/Guideline.docx
 - idata.gemspec
 - lib/idata.rb
 - lib/idata/version.rb
+- sample.sh
 homepage: http://bolero.vn
 licenses:
 - MIT
```
data/guideline/Guideline.docx
DELETED

Binary file (no diff shown)