csv-utils 0.3.15 → 0.3.16
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/csv-diff +125 -0
- data/csv-utils.gemspec +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: cda417e397f1791d177bb6eadd107edfdcd31a6680a3d2fef07b5be6df91dd6c
|
4
|
+
data.tar.gz: 7dee11a2b59738a8ad1ea9f692ef2da6472bab65039fbeed630c34d369a952c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af9f536d693cdcac8db949cbec1981b38637149927910b59aa570becef637cbae54c5371bc9d6031698c993e536c063b0ba89a0ca1be6c6d6ca7056e4326cc83
|
7
|
+
data.tar.gz: f48853f7756f3a7e36c34e37fd38ce611db880fb5e083340eedf817800f094dcd7900b89f3f7edab2eb8eac6e2c837e9209ec3dba1cc79d95d2f065f36b15d45
|
data/bin/csv-diff
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
options = {
|
6
|
+
unique_headers: [],
|
7
|
+
ignore_headers: [],
|
8
|
+
sort_batch_size: 1_000_000
|
9
|
+
}
|
10
|
+
OptionParser.new do |opts|
|
11
|
+
opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file>'
|
12
|
+
|
13
|
+
opts.on('-h', '--help', 'Prints this help') do
|
14
|
+
puts opts
|
15
|
+
exit
|
16
|
+
end
|
17
|
+
|
18
|
+
opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
|
19
|
+
options[:unique_headers] = v.split(',')
|
20
|
+
end
|
21
|
+
|
22
|
+
opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
|
23
|
+
options[:ignore_headers] = v.split(',')
|
24
|
+
end
|
25
|
+
|
26
|
+
opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
|
27
|
+
options[:sort_batch_size] = v # store on the options hash; `opts` is the OptionParser and has no []=
|
28
|
+
end
|
29
|
+
end.parse!
|
30
|
+
|
31
|
+
require 'csv-utils'
|
32
|
+
|
33
|
+
csv1 = CSVUtils::CSVIterator.new(ARGV[0])
|
34
|
+
csv2 = CSVUtils::CSVIterator.new(ARGV[1])
|
35
|
+
|
36
|
+
unless csv1.first.keys == csv2.first.keys
|
37
|
+
$stderr.puts("headers do not match #{ARGV[0]} headers #{csv1.first.keys}, #{ARGV[1]} headers #{csv2.first.keys}")
|
38
|
+
exit 1
|
39
|
+
end
|
40
|
+
|
41
|
+
unknown_unique_headers = options[:unique_headers] - csv1.first.keys
|
42
|
+
unless unknown_unique_headers.empty?
|
43
|
+
$stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
|
44
|
+
exit 1
|
45
|
+
end
|
46
|
+
|
47
|
+
unknown_ignore_headers = options[:ignore_headers] - csv1.first.keys
|
48
|
+
unless unknown_ignore_headers.empty?
|
49
|
+
$stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
|
50
|
+
exit 1
|
51
|
+
end
|
52
|
+
|
53
|
+
options[:unique_headers] = [csv1.first.keys.first] if options[:unique_headers].empty?
|
54
|
+
|
55
|
+
puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"
|
56
|
+
|
57
|
+
unique_header_indexes = []
|
58
|
+
csv1.first.keys.each_with_index do |header, idx|
|
59
|
+
unique_header_indexes << idx if options[:unique_headers].include?(header)
|
60
|
+
end
|
61
|
+
|
62
|
+
sort_compare_proc = proc do |csv1_row, csv2_row|
|
63
|
+
result = 0
|
64
|
+
unique_header_indexes.each do |idx|
|
65
|
+
result = csv1_row[idx] <=> csv2_row[idx]
|
66
|
+
break unless result == 0
|
67
|
+
end
|
68
|
+
|
69
|
+
if result == 0
|
70
|
+
csv1_row.each_with_index do |csv1_col, idx|
|
71
|
+
csv2_col = csv2_row[idx]
|
72
|
+
result = csv1_col <=> csv2_col
|
73
|
+
break unless result == 0
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
result
|
78
|
+
end
|
79
|
+
|
80
|
+
csv1_sorted_file_name = ARGV[0] + '.sorted'
|
81
|
+
csv2_sorted_file_name = ARGV[1] + '.sorted'
|
82
|
+
|
83
|
+
puts "sorting #{ARGV[0]}"
|
84
|
+
sorter = CSVUtils::CSVSort.new(ARGV[0], csv1_sorted_file_name)
|
85
|
+
sorter.sort(options[:sort_batch_size], &sort_compare_proc)
|
86
|
+
|
87
|
+
puts "sorting #{ARGV[1]}"
|
88
|
+
sorter = CSVUtils::CSVSort.new(ARGV[1], csv2_sorted_file_name)
|
89
|
+
sorter.sort(options[:sort_batch_size], &sort_compare_proc)
|
90
|
+
|
91
|
+
options[:unique_headers] = [csv1.first.keys.first] if options[:unique_headers].empty? # keep user-supplied -u headers so the compare key matches the sort key
|
92
|
+
update_comparison_columns = csv1.first.keys - options[:unique_headers]
|
93
|
+
update_comparison_columns -= options[:ignore_headers]
|
94
|
+
|
95
|
+
comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
|
96
|
+
result = 0
|
97
|
+
options[:unique_headers].each do |header|
|
98
|
+
result = csv1_row[header] <=> csv2_row[header]
|
99
|
+
break unless result == 0
|
100
|
+
end
|
101
|
+
|
102
|
+
result
|
103
|
+
end
|
104
|
+
|
105
|
+
stats = Hash.new(0)
|
106
|
+
puts "comparing #{ARGV[0]} with #{ARGV[1]}"
|
107
|
+
diff_file_name = 'diff-results-' + File.basename(ARGV[0]) # basename so an input like dir/a.csv does not yield the invalid path diff-results-dir/a.csv
|
108
|
+
CSV.open(diff_file_name, 'wb') do |out|
|
109
|
+
out << ['Result'] + csv1.first.keys
|
110
|
+
comparer.compare(csv2_sorted_file_name) do |action, record|
|
111
|
+
stats[action] += 1
|
112
|
+
out << [action] + record.values
|
113
|
+
end
|
114
|
+
end
|
115
|
+
puts "differences found #{stats}"
|
116
|
+
|
117
|
+
File.unlink(csv1_sorted_file_name)
|
118
|
+
File.unlink(csv2_sorted_file_name)
|
119
|
+
|
120
|
+
if stats.empty?
|
121
|
+
puts "files were identical"
|
122
|
+
File.unlink(diff_file_name)
|
123
|
+
else
|
124
|
+
puts "results can be found in #{diff_file_name}"
|
125
|
+
end
|
data/csv-utils.gemspec
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.15
|
4
|
+
version: 0.3.16
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Doug Youch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-09-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: inheritance-helper
|
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
|
|
28
28
|
email: dougyouch@gmail.com
|
29
29
|
executables:
|
30
30
|
- csv-change-eol
|
31
|
+
- csv-diff
|
31
32
|
- csv-duplicate-finder
|
32
33
|
- csv-explorer
|
33
34
|
- csv-find-error
|
@@ -44,6 +45,7 @@ files:
|
|
44
45
|
- LICENSE
|
45
46
|
- README.md
|
46
47
|
- bin/csv-change-eol
|
48
|
+
- bin/csv-diff
|
47
49
|
- bin/csv-duplicate-finder
|
48
50
|
- bin/csv-explorer
|
49
51
|
- bin/csv-find-error
|