csv-utils 0.3.15 → 0.3.17
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/csv-diff +125 -0
- data/csv-utils.gemspec +1 -1
- data/lib/csv_utils/csv_compare.rb +8 -4
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 64f2fb9f8282fcb5ba93856ab707e9797ddb4f0195c46ae5564b455527315157
|
4
|
+
data.tar.gz: a6ccb029fd622dfa4906d644a32ec98bd5ff4deecd35869f0adb76f00445ce2b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 84721ad624e58a68fe4027a596748982531df61640e944b1bb52e2b6adae3ca64d36584a98a3ea74306272f03da3deaec9a32d89497ac73b4051adb559597fb0
|
7
|
+
data.tar.gz: 1f0097b68da9fee5b4765f7280e8a249318f9d6c2c215ac1174f9a035992b497b31f150e9caf1fd439bffe27fc868bd120ea1b9bf4a521786dc84b29aa82e030
|
data/bin/csv-diff
ADDED
@@ -0,0 +1,125 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'optparse'
|
4
|
+
|
5
|
+
options = {
|
6
|
+
unique_headers: [],
|
7
|
+
ignore_headers: [],
|
8
|
+
sort_batch_size: 1_000_000
|
9
|
+
}
|
10
|
+
OptionParser.new do |opts|
|
11
|
+
opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file>'
|
12
|
+
|
13
|
+
opts.on('-h', '--help', 'Prints this help') do
|
14
|
+
puts opts
|
15
|
+
exit
|
16
|
+
end
|
17
|
+
|
18
|
+
opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
|
19
|
+
options[:unique_headers] = v.split(',')
|
20
|
+
end
|
21
|
+
|
22
|
+
opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
|
23
|
+
options[:ignore_headers] = v.split(',')
|
24
|
+
end
|
25
|
+
|
26
|
+
opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
|
27
|
+
opts[:sort_batch_size] = v
|
28
|
+
end
|
29
|
+
end.parse!
|
30
|
+
|
31
|
+
require 'csv-utils'
|
32
|
+
|
33
|
+
csv1 = CSVUtils::CSVIterator.new(ARGV[0])
|
34
|
+
csv2 = CSVUtils::CSVIterator.new(ARGV[1])
|
35
|
+
|
36
|
+
unless csv1.first.keys == csv2.first.keys
|
37
|
+
$stderr.puts("headers do not match #{ARGV[0]} headers #{csv1.first.keys}, #{ARGV[1]} headers #{csv2.first.keys}")
|
38
|
+
exit 1
|
39
|
+
end
|
40
|
+
|
41
|
+
unknown_unique_headers = options[:unique_headers] - csv1.first.keys
|
42
|
+
unless unknown_unique_headers.empty?
|
43
|
+
$stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
|
44
|
+
exit 1
|
45
|
+
end
|
46
|
+
|
47
|
+
unknown_ignore_headers = options[:ignore_headers] - csv1.first.keys
|
48
|
+
unless unknown_ignore_headers.empty?
|
49
|
+
$stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
|
50
|
+
exit 1
|
51
|
+
end
|
52
|
+
|
53
|
+
options[:unique_headers] = [csv1.first.keys.first] if options[:unique_headers].empty?
|
54
|
+
|
55
|
+
puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"
|
56
|
+
|
57
|
+
unique_header_indexes = []
|
58
|
+
csv1.first.keys.each_with_index do |header, idx|
|
59
|
+
unique_header_indexes << idx if options[:unique_headers].include?(header)
|
60
|
+
end
|
61
|
+
|
62
|
+
sort_compare_proc = proc do |csv1_row, csv2_row|
|
63
|
+
result = 0
|
64
|
+
unique_header_indexes.each do |idx|
|
65
|
+
result = csv1_row[idx] <=> csv2_row[idx]
|
66
|
+
break unless result == 0
|
67
|
+
end
|
68
|
+
|
69
|
+
if result == 0
|
70
|
+
csv1_row.each_with_index do |csv1_col, idx|
|
71
|
+
csv2_col = csv2_row[idx]
|
72
|
+
result = csv1_col <=> csv2_col
|
73
|
+
break unless result == 0
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
result
|
78
|
+
end
|
79
|
+
|
80
|
+
csv1_sorted_file_name = ARGV[0] + '.sorted'
|
81
|
+
csv2_sorted_file_name = ARGV[1] + '.sorted'
|
82
|
+
|
83
|
+
puts "sorting #{ARGV[0]}"
|
84
|
+
sorter = CSVUtils::CSVSort.new(ARGV[0], csv1_sorted_file_name)
|
85
|
+
sorter.sort(options[:sort_batch_size], &sort_compare_proc)
|
86
|
+
|
87
|
+
puts "sorting #{ARGV[1]}"
|
88
|
+
sorter = CSVUtils::CSVSort.new(ARGV[1], csv2_sorted_file_name)
|
89
|
+
sorter.sort(options[:sort_batch_size], &sort_compare_proc)
|
90
|
+
|
91
|
+
options[:unique_headers] = [csv1.first.keys.first]
|
92
|
+
update_comparison_columns = csv1.first.keys - options[:unique_headers]
|
93
|
+
update_comparison_columns -= options[:ignore_headers]
|
94
|
+
|
95
|
+
comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
|
96
|
+
result = 0
|
97
|
+
options[:unique_headers].each do |header|
|
98
|
+
result = csv1_row[header] <=> csv2_row[header]
|
99
|
+
break unless result == 0
|
100
|
+
end
|
101
|
+
|
102
|
+
result
|
103
|
+
end
|
104
|
+
|
105
|
+
stats = Hash.new(0)
|
106
|
+
puts "comparing #{ARGV[0]} with #{ARGV[1]}"
|
107
|
+
diff_file_name = 'diff-results-' + File.basename(ARGV[0])
|
108
|
+
CSV.open(diff_file_name, 'wb') do |out|
|
109
|
+
out << ['Result'] + csv1.first.keys
|
110
|
+
comparer.compare(csv2_sorted_file_name) do |action, record|
|
111
|
+
stats[action] += 1
|
112
|
+
out << [action] + record.values
|
113
|
+
end
|
114
|
+
end
|
115
|
+
puts "differences found #{stats}"
|
116
|
+
|
117
|
+
File.unlink(csv1_sorted_file_name)
|
118
|
+
File.unlink(csv2_sorted_file_name)
|
119
|
+
|
120
|
+
if stats.empty?
|
121
|
+
puts "files were identical"
|
122
|
+
File.unlink(diff_file_name)
|
123
|
+
else
|
124
|
+
puts "results can be found in #{diff_file_name}"
|
125
|
+
end
|
data/csv-utils.gemspec
CHANGED
(diff for this file was not captured in this extract)
data/lib/csv_utils/csv_compare.rb
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
# frozen_string_literal: true
|
2
|
-
|
3
1
|
# CSVUtils::CSVCompare purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
|
4
2
|
# **requires both CSV files to be sorted on the same columns, CSVUtils::CSVSort can accomplish this
|
5
3
|
# In order to receive updates, update_comparison_columns must configured or use inheritance and change the update_row? method
|
@@ -19,10 +17,12 @@ class CSVUtils::CSVCompare
|
|
19
17
|
end
|
20
18
|
|
21
19
|
def compare(secondary_data_file)
|
22
|
-
src = CSV.open(primary_data_file)
|
20
|
+
src = CSV.open(primary_data_file, 'rb')
|
23
21
|
src_headers = src.shift
|
24
|
-
|
22
|
+
strip_bom!(src_headers[0])
|
23
|
+
dest = CSV.open(secondary_data_file, 'rb')
|
25
24
|
dest_headers = dest.shift
|
25
|
+
strip_bom!(dest_headers[0])
|
26
26
|
|
27
27
|
read_next_src = true
|
28
28
|
read_next_dest = true
|
@@ -80,4 +80,8 @@ class CSVUtils::CSVCompare
|
|
80
80
|
|
81
81
|
false
|
82
82
|
end
|
83
|
+
|
84
|
+
def strip_bom!(col)
|
85
|
+
col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
|
86
|
+
end
|
83
87
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.15
|
4
|
+
version: 0.3.17
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Doug Youch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2022-
|
11
|
+
date: 2022-09-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: inheritance-helper
|
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
|
|
28
28
|
email: dougyouch@gmail.com
|
29
29
|
executables:
|
30
30
|
- csv-change-eol
|
31
|
+
- csv-diff
|
31
32
|
- csv-duplicate-finder
|
32
33
|
- csv-explorer
|
33
34
|
- csv-find-error
|
@@ -44,6 +45,7 @@ files:
|
|
44
45
|
- LICENSE
|
45
46
|
- README.md
|
46
47
|
- bin/csv-change-eol
|
48
|
+
- bin/csv-diff
|
47
49
|
- bin/csv-duplicate-finder
|
48
50
|
- bin/csv-explorer
|
49
51
|
- bin/csv-find-error
|