RubyGems - csv-utils - Versions diffs - 0.3.15 → 0.3.16 - Mend

csv-utils 0.3.15 → 0.3.16

Files changed (4) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0b511a7a2cac6926477ed165212fa2009947dc84040f7d0451f7fbd7fdd9543b
-  data.tar.gz: 6661363cabaebcd2f21a41159f7ec0712b7729da5dde0cc7b1818415d8f0bc81
+  metadata.gz: cda417e397f1791d177bb6eadd107edfdcd31a6680a3d2fef07b5be6df91dd6c
+  data.tar.gz: 7dee11a2b59738a8ad1ea9f692ef2da6472bab65039fbeed630c34d369a952c0
 SHA512:
-  metadata.gz: cb860bdbe29726e44e1af6c528623daf7acc186f48eecefdbd47530d4ae7ff3472a2c7044cef8de079e70d4d9ada0b1bfbff65923fc3fcb9c6236d3d2e9606d8
-  data.tar.gz: fe7a67353de3c0e4dd5e36b71345abb373c19ca3e23bc0a8624f7f4147058911e487db17218c35e2ae2e77dfaed61185ca5ffea77736c410a3b18d1334514cd8
+  metadata.gz: af9f536d693cdcac8db949cbec1981b38637149927910b59aa570becef637cbae54c5371bc9d6031698c993e536c063b0ba89a0ca1be6c6d6ca7056e4326cc83
+  data.tar.gz: f48853f7756f3a7e36c34e37fd38ce611db880fb5e083340eedf817800f094dcd7900b89f3f7edab2eb8eac6e2c837e9209ec3dba1cc79d95d2f065f36b15d45

data/bin/csv-diff ADDED Viewed

@@ -0,0 +1,125 @@
+#!/usr/bin/env ruby
+require 'optparse'
+options = {
+  unique_headers: [],
+  ignore_headers: [],
+  sort_batch_size: 1_000_000
+}
+OptionParser.new do |opts|
+  opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file>'
+  opts.on('-h', '--help', 'Prints this help') do
+    puts opts
+    exit
+  end
+  opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
+    options[:unique_headers] = v.split(',')
+  end
+  opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
+    options[:ignore_headers] = v.split(',')
+  end
+  opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
+    opts[:sort_batch_size] = v
+  end
+end.parse!
+require 'csv-utils'
+csv1 = CSVUtils::CSVIterator.new(ARGV[0])
+csv2 = CSVUtils::CSVIterator.new(ARGV[1])
+unless csv1.first.keys == csv2.first.keys
+  $stderr.puts("headers do not match #{ARGV[0]} headers #{csv1.first.keys}, #{ARGV[1]} headers #{csv2.first.keys}")
+  exit 1
+end
+unknown_unique_headers = options[:unique_headers] - csv1.first.keys
+unless unknown_unique_headers.empty?
+  $stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
+  exit 1
+end
+unknown_ignore_headers = options[:ignore_headers] - csv1.first.keys
+unless unknown_ignore_headers.empty?
+  $stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
+  exit 1
+end
+options[:unique_headers] = [csv1.first.keys.first] if options[:unique_headers].empty?
+puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"
+unique_header_indexes = []
+csv1.first.keys.each_with_index do |header, idx|
+  unique_header_indexes << idx if options[:unique_headers].include?(header)
+end
+sort_compare_proc = proc do |csv1_row, csv2_row|
+  result = 0
+  unique_header_indexes.each do |idx|
+    result = csv1_row[idx] <=> csv2_row[idx]
+    break unless result == 0
+  end
+  if result == 0
+    csv1_row.each_with_index do |csv1_col, idx|
+      csv2_col = csv2_row[idx]
+      result = csv1_col <=> csv2_col
+      break unless result == 0
+    end
+  end
+  result
+end
+csv1_sorted_file_name = ARGV[0] + '.sorted'
+csv2_sorted_file_name = ARGV[1] + '.sorted'
+puts "sorting #{ARGV[0]}"
+sorter = CSVUtils::CSVSort.new(ARGV[0], csv1_sorted_file_name)
+sorter.sort(options[:sort_batch_size], &sort_compare_proc)
+puts "sorting #{ARGV[1]}"
+sorter = CSVUtils::CSVSort.new(ARGV[1], csv2_sorted_file_name)
+sorter.sort(options[:sort_batch_size], &sort_compare_proc)
+options[:unique_headers] = [csv1.first.keys.first]
+update_comparison_columns = csv1.first.keys - options[:unique_headers]
+update_comparison_columns -= options[:ignore_headers]
+comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
+  result = 0
+  options[:unique_headers].each do |header|
+    result = csv1_row[header] <=> csv2_row[header]
+    break unless result == 0
+  end
+  result
+end
+stats = Hash.new(0)
+puts "comparing #{ARGV[0]} with #{ARGV[1]}"
+diff_file_name = 'diff-results-' + ARGV[0]
+CSV.open(diff_file_name, 'wb') do |out|
+  out << ['Result'] + csv1.first.keys
+  comparer.compare(csv2_sorted_file_name) do |action, record|
+    stats[action] += 1
+    out << [action] + record.values
+  end
+end
+puts "differences found #{stats}"
+File.unlink(csv1_sorted_file_name)
+File.unlink(csv2_sorted_file_name)
+if stats.empty?
+  puts "files were identical"
+  File.unlink(diff_file_name)
+else
+  puts "results can be found in #{diff_file_name}"
+end

data/csv-utils.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name        = 'csv-utils'
-  s.version     = '0.3.15'
+  s.version     = '0.3.16'
   s.licenses    = ['MIT']
   s.summary     = 'CSV Utils'
   s.description = 'Tools for debugging malformed CSV files'

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: csv-utils
 version: !ruby/object:Gem::Version
-  version: 0.3.15
+  version: 0.3.16
 platform: ruby
 authors:
 - Doug Youch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-08-25 00:00:00.000000000 Z
+date: 2022-09-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: inheritance-helper
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
 email: dougyouch@gmail.com
 executables:
 - csv-change-eol
+- csv-diff
 - csv-duplicate-finder
 - csv-explorer
 - csv-find-error
@@ -44,6 +45,7 @@ files:
 - LICENSE
 - README.md
 - bin/csv-change-eol
+- bin/csv-diff
 - bin/csv-duplicate-finder
 - bin/csv-explorer
 - bin/csv-find-error