csv-utils 0.3.15 → 0.3.17

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0b511a7a2cac6926477ed165212fa2009947dc84040f7d0451f7fbd7fdd9543b
4
- data.tar.gz: 6661363cabaebcd2f21a41159f7ec0712b7729da5dde0cc7b1818415d8f0bc81
3
+ metadata.gz: 64f2fb9f8282fcb5ba93856ab707e9797ddb4f0195c46ae5564b455527315157
4
+ data.tar.gz: a6ccb029fd622dfa4906d644a32ec98bd5ff4deecd35869f0adb76f00445ce2b
5
5
  SHA512:
6
- metadata.gz: cb860bdbe29726e44e1af6c528623daf7acc186f48eecefdbd47530d4ae7ff3472a2c7044cef8de079e70d4d9ada0b1bfbff65923fc3fcb9c6236d3d2e9606d8
7
- data.tar.gz: fe7a67353de3c0e4dd5e36b71345abb373c19ca3e23bc0a8624f7f4147058911e487db17218c35e2ae2e77dfaed61185ca5ffea77736c410a3b18d1334514cd8
6
+ metadata.gz: 84721ad624e58a68fe4027a596748982531df61640e944b1bb52e2b6adae3ca64d36584a98a3ea74306272f03da3deaec9a32d89497ac73b4051adb559597fb0
7
+ data.tar.gz: 1f0097b68da9fee5b4765f7280e8a249318f9d6c2c215ac1174f9a035992b497b31f150e9caf1fd439bffe27fc868bd120ea1b9bf4a521786dc84b29aa82e030
data/bin/csv-diff ADDED
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
5
+ options = {
6
+ unique_headers: [],
7
+ ignore_headers: [],
8
+ sort_batch_size: 1_000_000
9
+ }
10
+ OptionParser.new do |opts|
11
+ opts.banner = 'Usage: ' + File.basename(__FILE__) + ' [options] <csv file>'
12
+
13
+ opts.on('-h', '--help', 'Prints this help') do
14
+ puts opts
15
+ exit
16
+ end
17
+
18
+ opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
19
+ options[:unique_headers] = v.split(',')
20
+ end
21
+
22
+ opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
23
+ options[:ignore_headers] = v.split(',')
24
+ end
25
+
26
+ opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
27
+ opts[:sort_batch_size] = v
28
+ end
29
+ end.parse!
30
+
31
+ require 'csv-utils'
32
+
33
+ csv1 = CSVUtils::CSVIterator.new(ARGV[0])
34
+ csv2 = CSVUtils::CSVIterator.new(ARGV[1])
35
+
36
+ unless csv1.first.keys == csv2.first.keys
37
+ $stderr.puts("headers do not match #{ARGV[0]} headers #{csv1.first.keys}, #{ARGV[1]} headers #{csv2.first.keys}")
38
+ exit 1
39
+ end
40
+
41
+ unknown_unique_headers = options[:unique_headers] - csv1.first.keys
42
+ unless unknown_unique_headers.empty?
43
+ $stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
44
+ exit 1
45
+ end
46
+
47
+ unknown_ignore_headers = options[:ignore_headers] - csv1.first.keys
48
+ unless unknown_ignore_headers.empty?
49
+ $stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
50
+ exit 1
51
+ end
52
+
53
+ options[:unique_headers] = [csv1.first.keys.first] if options[:unique_headers].empty?
54
+
55
+ puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"
56
+
57
+ unique_header_indexes = []
58
+ csv1.first.keys.each_with_index do |header, idx|
59
+ unique_header_indexes << idx if options[:unique_headers].include?(header)
60
+ end
61
+
62
+ sort_compare_proc = proc do |csv1_row, csv2_row|
63
+ result = 0
64
+ unique_header_indexes.each do |idx|
65
+ result = csv1_row[idx] <=> csv2_row[idx]
66
+ break unless result == 0
67
+ end
68
+
69
+ if result == 0
70
+ csv1_row.each_with_index do |csv1_col, idx|
71
+ csv2_col = csv2_row[idx]
72
+ result = csv1_col <=> csv2_col
73
+ break unless result == 0
74
+ end
75
+ end
76
+
77
+ result
78
+ end
79
+
80
+ csv1_sorted_file_name = ARGV[0] + '.sorted'
81
+ csv2_sorted_file_name = ARGV[1] + '.sorted'
82
+
83
+ puts "sorting #{ARGV[0]}"
84
+ sorter = CSVUtils::CSVSort.new(ARGV[0], csv1_sorted_file_name)
85
+ sorter.sort(options[:sort_batch_size], &sort_compare_proc)
86
+
87
+ puts "sorting #{ARGV[1]}"
88
+ sorter = CSVUtils::CSVSort.new(ARGV[1], csv2_sorted_file_name)
89
+ sorter.sort(options[:sort_batch_size], &sort_compare_proc)
90
+
91
+ options[:unique_headers] = [csv1.first.keys.first]
92
+ update_comparison_columns = csv1.first.keys - options[:unique_headers]
93
+ update_comparison_columns -= options[:ignore_headers]
94
+
95
+ comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
96
+ result = 0
97
+ options[:unique_headers].each do |header|
98
+ result = csv1_row[header] <=> csv2_row[header]
99
+ break unless result == 0
100
+ end
101
+
102
+ result
103
+ end
104
+
105
+ stats = Hash.new(0)
106
+ puts "comparing #{ARGV[0]} with #{ARGV[1]}"
107
+ diff_file_name = 'diff-results-' + File.basename(ARGV[0])
108
+ CSV.open(diff_file_name, 'wb') do |out|
109
+ out << ['Result'] + csv1.first.keys
110
+ comparer.compare(csv2_sorted_file_name) do |action, record|
111
+ stats[action] += 1
112
+ out << [action] + record.values
113
+ end
114
+ end
115
+ puts "differences found #{stats}"
116
+
117
+ File.unlink(csv1_sorted_file_name)
118
+ File.unlink(csv2_sorted_file_name)
119
+
120
+ if stats.empty?
121
+ puts "files were identical"
122
+ File.unlink(diff_file_name)
123
+ else
124
+ puts "results can be found in #{diff_file_name}"
125
+ end
data/csv-utils.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.3.15'
5
+ s.version = '0.3.17'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
@@ -1,5 +1,3 @@
1
- # frozen_string_literal: true
2
-
3
1
  # CSVUtils::CSVCompare's purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
4
2
  # **requires both CSV files to be sorted on the same columns, CSVUtils::CSVSort can accomplish this
5
3
  # In order to receive updates, update_comparison_columns must be configured or use inheritance and override the update_row? method
@@ -19,10 +17,12 @@ class CSVUtils::CSVCompare
19
17
  end
20
18
 
21
19
  def compare(secondary_data_file)
22
- src = CSV.open(primary_data_file)
20
+ src = CSV.open(primary_data_file, 'rb')
23
21
  src_headers = src.shift
24
- dest = CSV.open(secondary_data_file)
22
+ strip_bom!(src_headers[0])
23
+ dest = CSV.open(secondary_data_file, 'rb')
25
24
  dest_headers = dest.shift
25
+ strip_bom!(dest_headers[0])
26
26
 
27
27
  read_next_src = true
28
28
  read_next_dest = true
@@ -80,4 +80,8 @@ class CSVUtils::CSVCompare
80
80
 
81
81
  false
82
82
  end
83
+
84
+ def strip_bom!(col)
85
+ col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
86
+ end
83
87
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.15
4
+ version: 0.3.17
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-08-25 00:00:00.000000000 Z
11
+ date: 2022-09-28 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
28
28
  email: dougyouch@gmail.com
29
29
  executables:
30
30
  - csv-change-eol
31
+ - csv-diff
31
32
  - csv-duplicate-finder
32
33
  - csv-explorer
33
34
  - csv-find-error
@@ -44,6 +45,7 @@ files:
44
45
  - LICENSE
45
46
  - README.md
46
47
  - bin/csv-change-eol
48
+ - bin/csv-diff
47
49
  - bin/csv-duplicate-finder
48
50
  - bin/csv-explorer
49
51
  - bin/csv-find-error