csv-utils 0.3.15 → 0.3.16

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +4 -4
  2. data/bin/csv-diff +125 -0
  3. data/csv-utils.gemspec +1 -1
  4. metadata +4 -2
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0b511a7a2cac6926477ed165212fa2009947dc84040f7d0451f7fbd7fdd9543b
4
- data.tar.gz: 6661363cabaebcd2f21a41159f7ec0712b7729da5dde0cc7b1818415d8f0bc81
3
+ metadata.gz: cda417e397f1791d177bb6eadd107edfdcd31a6680a3d2fef07b5be6df91dd6c
4
+ data.tar.gz: 7dee11a2b59738a8ad1ea9f692ef2da6472bab65039fbeed630c34d369a952c0
5
5
  SHA512:
6
- metadata.gz: cb860bdbe29726e44e1af6c528623daf7acc186f48eecefdbd47530d4ae7ff3472a2c7044cef8de079e70d4d9ada0b1bfbff65923fc3fcb9c6236d3d2e9606d8
7
- data.tar.gz: fe7a67353de3c0e4dd5e36b71345abb373c19ca3e23bc0a8624f7f4147058911e487db17218c35e2ae2e77dfaed61185ca5ffea77736c410a3b18d1334514cd8
6
+ metadata.gz: af9f536d693cdcac8db949cbec1981b38637149927910b59aa570becef637cbae54c5371bc9d6031698c993e536c063b0ba89a0ca1be6c6d6ca7056e4326cc83
7
+ data.tar.gz: f48853f7756f3a7e36c34e37fd38ce611db880fb5e083340eedf817800f094dcd7900b89f3f7edab2eb8eac6e2c837e9209ec3dba1cc79d95d2f065f36b15d45
data/bin/csv-diff ADDED
@@ -0,0 +1,125 @@
#!/usr/bin/env ruby

# csv-diff: reports row-level differences between two CSV files that share
# the same headers.
#
# Usage: csv-diff [options] <csv file 1> <csv file 2>
#
# Steps performed by this script:
#   1. Verify both files expose the same headers and that every header named
#      via --unique / --ignore exists.
#   2. Sort each file into a temporary "<file>.sorted" copy, ordered by the
#      uniqueness columns first and by every column as a tie-breaker.
#   3. Feed the sorted files to CSVUtils::CSVCompare and write each
#      (action, record) pair it yields to "diff-results-<csv file 1>".
#   4. Remove the temporary sorted files; remove the results file as well
#      when the comparer reported nothing.

require 'optparse'

options = {
  unique_headers: [],
  ignore_headers: [],
  sort_batch_size: 1_000_000
}

parser = OptionParser.new do |opts|
  opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file 1> <csv file 2>'

  opts.on('-h', '--help', 'Prints this help') do
    puts opts
    exit
  end

  opts.on('-u', '--unique HEADERS', 'Comma separated list of headers that generate a unique key per a row, use 1st column by default') do |v|
    options[:unique_headers] = v.split(',')
  end

  opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore during row comparison') do |v|
    options[:ignore_headers] = v.split(',')
  end

  opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
    # Must be stored on the options hash: `opts` is the OptionParser itself
    # and does not respond to []=.
    options[:sort_batch_size] = v
  end
end
parser.parse!

# Both input files are required; fail with the usage text instead of an
# obscure error from the CSV classes.
if ARGV.size < 2
  $stderr.puts(parser)
  exit 1
end

require 'csv-utils'

csv1_file_name = ARGV[0]
csv2_file_name = ARGV[1]

csv1 = CSVUtils::CSVIterator.new(csv1_file_name)
csv2 = CSVUtils::CSVIterator.new(csv2_file_name)

# Read the headers once instead of re-fetching the first row for every check.
headers = csv1.first.keys
csv2_headers = csv2.first.keys

unless headers == csv2_headers
  $stderr.puts("headers do not match #{csv1_file_name} headers #{headers}, #{csv2_file_name} headers #{csv2_headers}")
  exit 1
end

unknown_unique_headers = options[:unique_headers] - headers
unless unknown_unique_headers.empty?
  $stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
  exit 1
end

unknown_ignore_headers = options[:ignore_headers] - headers
unless unknown_ignore_headers.empty?
  $stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
  exit 1
end

# Default the uniqueness key to the first column, then normalise it to file
# column order so the sort comparator (which walks column indexes) and the
# compare block (which walks header names) evaluate the key identically.
options[:unique_headers] = [headers.first] if options[:unique_headers].empty?
options[:unique_headers] = headers & options[:unique_headers]

puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"

unique_header_indexes = headers.each_index.select do |idx|
  options[:unique_headers].include?(headers[idx])
end

# Orders rows by the uniqueness columns, falling back to a column-by-column
# comparison of the whole row so rows with equal keys sort deterministically.
# Rows are indexed by integer position here.
sort_compare_proc = proc do |csv1_row, csv2_row|
  result = 0
  unique_header_indexes.each do |idx|
    result = csv1_row[idx] <=> csv2_row[idx]
    break unless result == 0
  end

  if result == 0
    csv1_row.each_with_index do |csv1_col, idx|
      result = csv1_col <=> csv2_row[idx]
      break unless result == 0
    end
  end

  result
end

csv1_sorted_file_name = csv1_file_name + '.sorted'
csv2_sorted_file_name = csv2_file_name + '.sorted'

puts "sorting #{csv1_file_name}"
sorter = CSVUtils::CSVSort.new(csv1_file_name, csv1_sorted_file_name)
sorter.sort(options[:sort_batch_size], &sort_compare_proc)

puts "sorting #{csv2_file_name}"
sorter = CSVUtils::CSVSort.new(csv2_file_name, csv2_sorted_file_name)
sorter.sort(options[:sort_batch_size], &sort_compare_proc)

# Columns checked for changes: everything that is neither part of the
# uniqueness key nor explicitly ignored. The key keeps the value chosen above
# (it is NOT reset to the first column), so it matches the sort order.
update_comparison_columns = headers - options[:unique_headers]
update_comparison_columns -= options[:ignore_headers]

# Rows handed to this block are indexed by header name.
comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
  result = 0
  options[:unique_headers].each do |header|
    result = csv1_row[header] <=> csv2_row[header]
    break unless result == 0
  end

  result
end

# Tally of yielded actions; stays empty when the comparer yields nothing.
stats = Hash.new(0)
puts "comparing #{csv1_file_name} with #{csv2_file_name}"
diff_file_name = 'diff-results-' + csv1_file_name
CSV.open(diff_file_name, 'wb') do |out|
  out << ['Result'] + headers
  comparer.compare(csv2_sorted_file_name) do |action, record|
    stats[action] += 1
    out << [action] + record.values
  end
end
puts "differences found #{stats}"

File.unlink(csv1_sorted_file_name)
File.unlink(csv2_sorted_file_name)

if stats.empty?
  puts "files were identical"
  File.unlink(diff_file_name)
else
  puts "results can be found in #{diff_file_name}"
end
data/csv-utils.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.3.15'
5
+ s.version = '0.3.16'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.15
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-25 00:00:00.000000000 Z
11
+ date: 2022-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
28
28
  email: dougyouch@gmail.com
29
29
  executables:
30
30
  - csv-change-eol
31
+ - csv-diff
31
32
  - csv-duplicate-finder
32
33
  - csv-explorer
33
34
  - csv-find-error
@@ -44,6 +45,7 @@ files:
44
45
  - LICENSE
45
46
  - README.md
46
47
  - bin/csv-change-eol
48
+ - bin/csv-diff
47
49
  - bin/csv-duplicate-finder
48
50
  - bin/csv-explorer
49
51
  - bin/csv-find-error