csv-utils 0.3.15 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/bin/csv-diff +125 -0
  3. data/csv-utils.gemspec +1 -1
  4. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0b511a7a2cac6926477ed165212fa2009947dc84040f7d0451f7fbd7fdd9543b
4
- data.tar.gz: 6661363cabaebcd2f21a41159f7ec0712b7729da5dde0cc7b1818415d8f0bc81
3
+ metadata.gz: cda417e397f1791d177bb6eadd107edfdcd31a6680a3d2fef07b5be6df91dd6c
4
+ data.tar.gz: 7dee11a2b59738a8ad1ea9f692ef2da6472bab65039fbeed630c34d369a952c0
5
5
  SHA512:
6
- metadata.gz: cb860bdbe29726e44e1af6c528623daf7acc186f48eecefdbd47530d4ae7ff3472a2c7044cef8de079e70d4d9ada0b1bfbff65923fc3fcb9c6236d3d2e9606d8
7
- data.tar.gz: fe7a67353de3c0e4dd5e36b71345abb373c19ca3e23bc0a8624f7f4147058911e487db17218c35e2ae2e77dfaed61185ca5ffea77736c410a3b18d1334514cd8
6
+ metadata.gz: af9f536d693cdcac8db949cbec1981b38637149927910b59aa570becef637cbae54c5371bc9d6031698c993e536c063b0ba89a0ca1be6c6d6ca7056e4326cc83
7
+ data.tar.gz: f48853f7756f3a7e36c34e37fd38ce611db880fb5e083340eedf817800f094dcd7900b89f3f7edab2eb8eac6e2c837e9209ec3dba1cc79d95d2f065f36b15d45
data/bin/csv-diff ADDED
@@ -0,0 +1,125 @@
#!/usr/bin/env ruby

# csv-diff: compare two CSV files that have identical headers.
#
# Steps, as implemented below:
#   1. Parse options and check that both files expose the same headers.
#   2. Sort each file into "<file>.sorted" on the uniqueness key columns
#      (remaining columns are used as a tie-breaker).
#   3. Walk both sorted files with CSVUtils::CSVCompare and write every
#      reported difference to "diff-results-<first file name>".
#   4. Remove the temporary sorted files; remove the results file as well
#      when no differences were reported.
#
# Exit status is 1 for usage or header validation errors.

require 'optparse'

options = {
  unique_headers: [],
  ignore_headers: [],
  sort_batch_size: 1_000_000
}
option_parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{File.basename(__FILE__)} [options] <csv file 1> <csv file 2>"

  opts.on('-h', '--help', 'Prints this help') do
    puts opts
    exit
  end

  opts.on('-u', '--unique HEADERS', 'Comma separated list of headers that generate a unique key per row, uses the 1st column by default') do |v|
    options[:unique_headers] = v.split(',')
  end

  opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore during row comparison') do |v|
    options[:ignore_headers] = v.split(',')
  end

  opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
    # Store on the options hash; the block variable `opts` is the parser itself.
    options[:sort_batch_size] = v
  end
end
option_parser.parse!

# Two positional arguments are required; extra arguments are ignored.
if ARGV.size < 2
  $stderr.puts(option_parser)
  exit 1
end

# Loaded after option parsing, as in the original script
# (presumably so that --help does not need the library).
require 'csv'
require 'csv-utils'

csv1_file_name, csv2_file_name = ARGV

csv1 = CSVUtils::CSVIterator.new(csv1_file_name)
csv2 = CSVUtils::CSVIterator.new(csv2_file_name)

# Read the headers once instead of calling `first` on the iterators repeatedly.
# NOTE(review): assumes both files contain at least one row - confirm how
# CSVIterator#first behaves on an empty file.
csv1_headers = csv1.first.keys
csv2_headers = csv2.first.keys

unless csv1_headers == csv2_headers
  $stderr.puts("headers do not match #{csv1_file_name} headers #{csv1_headers}, #{csv2_file_name} headers #{csv2_headers}")
  exit 1
end

unknown_unique_headers = options[:unique_headers] - csv1_headers
unless unknown_unique_headers.empty?
  $stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
  exit 1
end

unknown_ignore_headers = options[:ignore_headers] - csv1_headers
unless unknown_ignore_headers.empty?
  $stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
  exit 1
end

# Default the uniqueness key to the first column.
options[:unique_headers] = [csv1_headers.first] if options[:unique_headers].empty?

puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"

# Column positions of the key headers, in the order the user listed them, so the
# sort order below agrees with the key comparison handed to CSVCompare.
# Every header was validated above, so `index` cannot return nil here.
unique_header_indexes = options[:unique_headers].map { |header| csv1_headers.index(header) }

# Orders two rows (indexed by column position) by the uniqueness key, then by
# every column so rows with equal keys still sort deterministically.
# Cells are compared as strings: `nil <=> "x"` is nil, which is not a valid
# sort result (assumes empty cells may arrive as nil - TODO confirm for CSVSort).
sort_compare_proc = proc do |csv1_row, csv2_row|
  result = 0
  unique_header_indexes.each do |idx|
    result = csv1_row[idx].to_s <=> csv2_row[idx].to_s
    break unless result.zero?
  end

  if result.zero?
    csv1_row.each_with_index do |csv1_col, idx|
      result = csv1_col.to_s <=> csv2_row[idx].to_s
      break unless result.zero?
    end
  end

  result
end

csv1_sorted_file_name = "#{csv1_file_name}.sorted"
csv2_sorted_file_name = "#{csv2_file_name}.sorted"

# Prefix only the file name, not the whole path, so an input such as
# "dir/a.csv" produces "dir/diff-results-a.csv" rather than a path inside a
# non-existent "diff-results-dir" directory.
csv1_dir, csv1_base = File.split(csv1_file_name)
diff_file_name = "diff-results-#{csv1_base}"
diff_file_name = File.join(csv1_dir, diff_file_name) unless csv1_dir == '.'

stats = Hash.new(0)

begin
  puts "sorting #{csv1_file_name}"
  sorter = CSVUtils::CSVSort.new(csv1_file_name, csv1_sorted_file_name)
  sorter.sort(options[:sort_batch_size], &sort_compare_proc)

  puts "sorting #{csv2_file_name}"
  sorter = CSVUtils::CSVSort.new(csv2_file_name, csv2_sorted_file_name)
  sorter.sort(options[:sort_batch_size], &sort_compare_proc)

  # Columns whose values are compared between rows with matching keys:
  # everything except the key columns and the explicitly ignored ones.
  update_comparison_columns = csv1_headers - options[:unique_headers] - options[:ignore_headers]

  # The block compares the uniqueness key of two rows (indexed by header name).
  comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
    result = 0
    options[:unique_headers].each do |header|
      result = csv1_row[header].to_s <=> csv2_row[header].to_s
      break unless result.zero?
    end

    result
  end

  puts "comparing #{csv1_file_name} with #{csv2_file_name}"
  CSV.open(diff_file_name, 'wb') do |out|
    out << ['Result'] + csv1_headers
    comparer.compare(csv2_sorted_file_name) do |action, record|
      stats[action] += 1
      out << [action] + record.values
    end
  end
  puts "differences found #{stats}"
ensure
  # Always remove the temporary sorted files, even when sorting or comparing raised.
  [csv1_sorted_file_name, csv2_sorted_file_name].each do |file_name|
    File.unlink(file_name) if File.exist?(file_name)
  end
end

if stats.empty?
  puts "files were identical"
  File.unlink(diff_file_name)
else
  puts "results can be found in #{diff_file_name}"
end
data/csv-utils.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.3.15'
5
+ s.version = '0.3.16'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.15
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-25 00:00:00.000000000 Z
11
+ date: 2022-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
28
28
  email: dougyouch@gmail.com
29
29
  executables:
30
30
  - csv-change-eol
31
+ - csv-diff
31
32
  - csv-duplicate-finder
32
33
  - csv-explorer
33
34
  - csv-find-error
@@ -44,6 +45,7 @@ files:
44
45
  - LICENSE
45
46
  - README.md
46
47
  - bin/csv-change-eol
48
+ - bin/csv-diff
47
49
  - bin/csv-duplicate-finder
48
50
  - bin/csv-explorer
49
51
  - bin/csv-find-error