RubyGems - csv-utils - Versions diffs - 0.3.15 → 0.3.17 - Mend

csv-utils 0.3.15 → 0.3.17

Files changed (5) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 0b511a7a2cac6926477ed165212fa2009947dc84040f7d0451f7fbd7fdd9543b
-  data.tar.gz: 6661363cabaebcd2f21a41159f7ec0712b7729da5dde0cc7b1818415d8f0bc81
+  metadata.gz: 64f2fb9f8282fcb5ba93856ab707e9797ddb4f0195c46ae5564b455527315157
+  data.tar.gz: a6ccb029fd622dfa4906d644a32ec98bd5ff4deecd35869f0adb76f00445ce2b
 SHA512:
-  metadata.gz: cb860bdbe29726e44e1af6c528623daf7acc186f48eecefdbd47530d4ae7ff3472a2c7044cef8de079e70d4d9ada0b1bfbff65923fc3fcb9c6236d3d2e9606d8
-  data.tar.gz: fe7a67353de3c0e4dd5e36b71345abb373c19ca3e23bc0a8624f7f4147058911e487db17218c35e2ae2e77dfaed61185ca5ffea77736c410a3b18d1334514cd8
+  metadata.gz: 84721ad624e58a68fe4027a596748982531df61640e944b1bb52e2b6adae3ca64d36584a98a3ea74306272f03da3deaec9a32d89497ac73b4051adb559597fb0
+  data.tar.gz: 1f0097b68da9fee5b4765f7280e8a249318f9d6c2c215ac1174f9a035992b497b31f150e9caf1fd439bffe27fc868bd120ea1b9bf4a521786dc84b29aa82e030

data/bin/csv-diff ADDED Viewed

@@ -0,0 +1,125 @@
+#!/usr/bin/env ruby
+require 'optparse'
+options = {
+  unique_headers: [],
+  ignore_headers: [],
+  sort_batch_size: 1_000_000
+}
+OptionParser.new do |opts|
+  opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file>'
+  opts.on('-h', '--help', 'Prints this help') do
+    puts opts
+    exit
+  end
+  opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
+    options[:unique_headers] = v.split(',')
+  end
+  opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
+    options[:ignore_headers] = v.split(',')
+  end
+  opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
+    opts[:sort_batch_size] = v
+  end
+end.parse!
+require 'csv-utils'
+csv1 = CSVUtils::CSVIterator.new(ARGV[0])
+csv2 = CSVUtils::CSVIterator.new(ARGV[1])
+unless csv1.first.keys == csv2.first.keys
+  $stderr.puts("headers do not match #{ARGV[0]} headers #{csv1.first.keys}, #{ARGV[1]} headers #{csv2.first.keys}")
+  exit 1
+end
+unknown_unique_headers = options[:unique_headers] - csv1.first.keys
+unless unknown_unique_headers.empty?
+  $stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
+  exit 1
+end
+unknown_ignore_headers = options[:ignore_headers] - csv1.first.keys
+unless unknown_ignore_headers.empty?
+  $stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
+  exit 1
+end
+options[:unique_headers] = [csv1.first.keys.first] if options[:unique_headers].empty?
+puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"
+unique_header_indexes = []
+csv1.first.keys.each_with_index do |header, idx|
+  unique_header_indexes << idx if options[:unique_headers].include?(header)
+end
+sort_compare_proc = proc do |csv1_row, csv2_row|
+  result = 0
+  unique_header_indexes.each do |idx|
+    result = csv1_row[idx] <=> csv2_row[idx]
+    break unless result == 0
+  end
+  if result == 0
+    csv1_row.each_with_index do |csv1_col, idx|
+      csv2_col = csv2_row[idx]
+      result = csv1_col <=> csv2_col
+      break unless result == 0
+    end
+  end
+  result
+end
+csv1_sorted_file_name = ARGV[0] + '.sorted'
+csv2_sorted_file_name = ARGV[1] + '.sorted'
+puts "sorting #{ARGV[0]}"
+sorter = CSVUtils::CSVSort.new(ARGV[0], csv1_sorted_file_name)
+sorter.sort(options[:sort_batch_size], &sort_compare_proc)
+puts "sorting #{ARGV[1]}"
+sorter = CSVUtils::CSVSort.new(ARGV[1], csv2_sorted_file_name)
+sorter.sort(options[:sort_batch_size], &sort_compare_proc)
+options[:unique_headers] = [csv1.first.keys.first]
+update_comparison_columns = csv1.first.keys - options[:unique_headers]
+update_comparison_columns -= options[:ignore_headers]
+comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
+  result = 0
+  options[:unique_headers].each do |header|
+    result = csv1_row[header] <=> csv2_row[header]
+    break unless result == 0
+  end
+  result
+end
+stats = Hash.new(0)
+puts "comparing #{ARGV[0]} with #{ARGV[1]}"
+diff_file_name = 'diff-results-' + File.basename(ARGV[0])
+CSV.open(diff_file_name, 'wb') do |out|
+  out << ['Result'] + csv1.first.keys
+  comparer.compare(csv2_sorted_file_name) do |action, record|
+    stats[action] += 1
+    out << [action] + record.values
+  end
+end
+puts "differences found #{stats}"
+File.unlink(csv1_sorted_file_name)
+File.unlink(csv2_sorted_file_name)
+if stats.empty?
+  puts "files were identical"
+  File.unlink(diff_file_name)
+else
+  puts "results can be found in #{diff_file_name}"
+end

data/csv-utils.gemspec CHANGED Viewed

@@ -2,7 +2,7 @@
 Gem::Specification.new do |s|
   s.name        = 'csv-utils'
-  s.version     = '0.3.15'
+  s.version     = '0.3.17'
   s.licenses    = ['MIT']
   s.summary     = 'CSV Utils'
   s.description = 'Tools for debugging malformed CSV files'

data/lib/csv_utils/csv_compare.rb CHANGED Viewed

@@ -1,5 +1,3 @@
-# frozen_string_literal: true
 # CSVUtils::CSVCompare's purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
 # **requires both CSV files to be sorted on the same columns; CSVUtils::CSVSort can accomplish this
 # In order to receive updates, update_comparison_columns must be configured, or use inheritance and change the update_row? method
@@ -19,10 +17,12 @@ class CSVUtils::CSVCompare
   end
   def compare(secondary_data_file)
-    src = CSV.open(primary_data_file)
+    src = CSV.open(primary_data_file, 'rb')
     src_headers = src.shift
-    dest = CSV.open(secondary_data_file)
+    strip_bom!(src_headers[0])
+    dest = CSV.open(secondary_data_file, 'rb')
     dest_headers = dest.shift
+    strip_bom!(dest_headers[0])
     read_next_src = true
     read_next_dest = true
@@ -80,4 +80,8 @@ class CSVUtils::CSVCompare
     false
   end
+  def strip_bom!(col)
+    col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
+  end
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: csv-utils
 version: !ruby/object:Gem::Version
-  version: 0.3.15
+  version: 0.3.17
 platform: ruby
 authors:
 - Doug Youch
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-08-25 00:00:00.000000000 Z
+date: 2022-09-28 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: inheritance-helper
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
 email: dougyouch@gmail.com
 executables:
 - csv-change-eol
+- csv-diff
 - csv-duplicate-finder
 - csv-explorer
 - csv-find-error
@@ -44,6 +45,7 @@ files:
 - LICENSE
 - README.md
 - bin/csv-change-eol
+- bin/csv-diff
 - bin/csv-duplicate-finder
 - bin/csv-explorer
 - bin/csv-find-error