csv-utils 0.3.14 → 0.3.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: a15793e118aa3bd4175b59c6a17e7800dd322390f95ce55af29170a6422ae0c3
4
- data.tar.gz: 481c775dc66bc47fec11f5ab39f98a1a35020cb00f6e565761b47badc4d53ad8
3
+ metadata.gz: cda417e397f1791d177bb6eadd107edfdcd31a6680a3d2fef07b5be6df91dd6c
4
+ data.tar.gz: 7dee11a2b59738a8ad1ea9f692ef2da6472bab65039fbeed630c34d369a952c0
5
5
  SHA512:
6
- metadata.gz: e987380b48bc309fbccb7bfd4311c8664077c8bac6ae31cf0c8c83f8035366f40fbdf60c8bf5b476e2a84e1899d6adada995fe81cd08531376511501700e2dca
7
- data.tar.gz: 020e3c5d08bbc1025b72fbde2de838efd45c24cf41ba89c9fb93cceb619e4184ec0c1671bf26b3ae1d9dad8978ce206359c53672bce843a3ac8651cc6ccf0412
6
+ metadata.gz: af9f536d693cdcac8db949cbec1981b38637149927910b59aa570becef637cbae54c5371bc9d6031698c993e536c063b0ba89a0ca1be6c6d6ca7056e4326cc83
7
+ data.tar.gz: f48853f7756f3a7e36c34e37fd38ce611db880fb5e083340eedf817800f094dcd7900b89f3f7edab2eb8eac6e2c837e9209ec3dba1cc79d95d2f065f36b15d45
data/.ruby-version CHANGED
@@ -1 +1 @@
1
- 3.0.0
1
+ 3.1.0
data/Gemfile.lock CHANGED
@@ -2,51 +2,51 @@ GEM
2
2
  remote: http://rubygems.org/
3
3
  specs:
4
4
  ast (2.4.2)
5
- diff-lcs (1.4.4)
6
- docile (1.3.5)
7
- inheritance-helper (0.1.5)
8
- parallel (1.20.1)
9
- parser (3.0.0.0)
5
+ diff-lcs (1.5.0)
6
+ docile (1.4.0)
7
+ inheritance-helper (0.2.5)
8
+ parallel (1.22.1)
9
+ parser (3.1.1.0)
10
10
  ast (~> 2.4.1)
11
- rainbow (3.0.0)
12
- rake (13.0.3)
13
- regexp_parser (2.1.1)
14
- rexml (3.2.4)
15
- rspec (3.10.0)
16
- rspec-core (~> 3.10.0)
17
- rspec-expectations (~> 3.10.0)
18
- rspec-mocks (~> 3.10.0)
19
- rspec-core (3.10.1)
20
- rspec-support (~> 3.10.0)
21
- rspec-expectations (3.10.1)
11
+ rainbow (3.1.1)
12
+ rake (13.0.6)
13
+ regexp_parser (2.2.1)
14
+ rexml (3.2.5)
15
+ rspec (3.11.0)
16
+ rspec-core (~> 3.11.0)
17
+ rspec-expectations (~> 3.11.0)
18
+ rspec-mocks (~> 3.11.0)
19
+ rspec-core (3.11.0)
20
+ rspec-support (~> 3.11.0)
21
+ rspec-expectations (3.11.0)
22
22
  diff-lcs (>= 1.2.0, < 2.0)
23
- rspec-support (~> 3.10.0)
24
- rspec-mocks (3.10.2)
23
+ rspec-support (~> 3.11.0)
24
+ rspec-mocks (3.11.0)
25
25
  diff-lcs (>= 1.2.0, < 2.0)
26
- rspec-support (~> 3.10.0)
27
- rspec-support (3.10.2)
28
- rubocop (1.11.0)
26
+ rspec-support (~> 3.11.0)
27
+ rspec-support (3.11.0)
28
+ rubocop (1.26.1)
29
29
  parallel (~> 1.10)
30
- parser (>= 3.0.0.0)
30
+ parser (>= 3.1.0.0)
31
31
  rainbow (>= 2.2.2, < 4.0)
32
32
  regexp_parser (>= 1.8, < 3.0)
33
33
  rexml
34
- rubocop-ast (>= 1.2.0, < 2.0)
34
+ rubocop-ast (>= 1.16.0, < 2.0)
35
35
  ruby-progressbar (~> 1.7)
36
36
  unicode-display_width (>= 1.4.0, < 3.0)
37
- rubocop-ast (1.4.1)
38
- parser (>= 2.7.1.5)
37
+ rubocop-ast (1.16.0)
38
+ parser (>= 3.1.1.0)
39
39
  ruby-progressbar (1.11.0)
40
40
  simplecov (0.21.2)
41
41
  docile (~> 1.1)
42
42
  simplecov-html (~> 0.11)
43
43
  simplecov_json_formatter (~> 0.1)
44
44
  simplecov-html (0.12.3)
45
- simplecov_json_formatter (0.1.2)
46
- unicode-display_width (2.0.0)
45
+ simplecov_json_formatter (0.1.4)
46
+ unicode-display_width (2.1.0)
47
47
 
48
48
  PLATFORMS
49
- x86_64-darwin-20
49
+ x86_64-darwin-21
50
50
 
51
51
  DEPENDENCIES
52
52
  inheritance-helper
@@ -56,4 +56,4 @@ DEPENDENCIES
56
56
  simplecov
57
57
 
58
58
  BUNDLED WITH
59
- 2.2.3
59
+ 2.3.3
data/bin/csv-diff ADDED
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
# Parses the csv-diff command line switches out of +argv+.
#
# Recognised switches are removed from +argv+ in place (OptionParser#parse!),
# so only the positional arguments remain afterwards.
#
# @param argv [Array<String>] raw command line arguments; mutated in place
# @return [Hash] with keys :unique_headers, :ignore_headers and :sort_batch_size
def parse_csv_diff_options(argv)
  options = {
    unique_headers: [],
    ignore_headers: [],
    sort_batch_size: 1_000_000
  }

  OptionParser.new do |opts|
    opts.banner = "Usage: #{File.basename(__FILE__)} [options] <csv file 1> <csv file 2>"

    opts.on('-h', '--help', 'Prints this help') do
      puts opts
      exit
    end

    opts.on('-u', '--unique HEADERS', 'Comma separated list of headers that generate a unique key per row, use 1st column by default') do |v|
      options[:unique_headers] = v.split(',')
    end

    opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore during row comparison') do |v|
      options[:ignore_headers] = v.split(',')
    end

    opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
      # Must be stored on the options hash; +opts+ is the parser object itself.
      options[:sort_batch_size] = v
    end
  end.parse!(argv)

  options
end

options = parse_csv_diff_options(ARGV)
30
+
31
+ require 'csv-utils'
32
+
# Both input files are mandatory; fail early with a readable message instead
# of passing nil file names further down.
if ARGV.size < 2
  $stderr.puts('two csv files are required')
  exit 1
end

csv1 = CSVUtils::CSVIterator.new(ARGV[0])
csv2 = CSVUtils::CSVIterator.new(ARGV[1])

# NOTE(review): the first row of each iterator is treated as a Hash keyed by
# header name (the code calls #keys on it) — confirm against CSVIterator.
csv1_headers = csv1.first.keys
csv2_headers = csv2.first.keys

# The two files can only be compared when their header rows are identical.
unless csv1_headers == csv2_headers
  $stderr.puts("headers do not match #{ARGV[0]} headers #{csv1_headers}, #{ARGV[1]} headers #{csv2_headers}")
  exit 1
end

# Every header named with -u must exist in the files.
unknown_unique_headers = options[:unique_headers] - csv1_headers
unless unknown_unique_headers.empty?
  $stderr.puts("specified unique headers are unknown #{unknown_unique_headers}")
  exit 1
end

# Every header named with -i must exist in the files.
unknown_ignore_headers = options[:ignore_headers] - csv1_headers
unless unknown_ignore_headers.empty?
  $stderr.puts("specified headers to ignore are unknown #{unknown_ignore_headers}")
  exit 1
end

# Without an explicit -u the first column acts as the row key.
options[:unique_headers] = [csv1_headers.first] if options[:unique_headers].empty?

puts "uniqueness header(s) are #{options[:unique_headers].join(', ')}"

# Positions of the key columns, in file column order; used for sorting rows
# that are addressed by column index.
unique_header_indexes = csv1_headers.each_index.select do |idx|
  options[:unique_headers].include?(csv1_headers[idx])
end
61
+
# Comparator used when sorting CSV rows.
#
# Rows are addressed by column position. The key columns listed in
# +unique_header_indexes+ are compared first; when all of them are equal the
# rows are compared column by column so the ordering is deterministic.
#
# Returns -1, 0 or 1.
sort_compare_proc = proc do |csv1_row, csv2_row|
  # <=> yields nil for incomparable values (e.g. a nil cell against a String),
  # which would make the sort fail; fall back to comparing string forms then.
  compare_cells = ->(left, right) { (left <=> right) || (left.to_s <=> right.to_s) }

  result = 0
  unique_header_indexes.each do |idx|
    result = compare_cells.call(csv1_row[idx], csv2_row[idx])
    break unless result == 0
  end

  # Key columns differ: that decides the order.
  next result unless result == 0

  # Keys are equal: the first differing column decides.
  csv1_row.each_with_index do |csv1_col, idx|
    result = compare_cells.call(csv1_col, csv2_row[idx])
    break unless result == 0
  end

  result
end
79
+
# Each input is sorted into a sibling file named "<input>.sorted".
csv1_sorted_file_name = ARGV[0] + '.sorted'
csv2_sorted_file_name = ARGV[1] + '.sorted'

# Sort both inputs the same way: same batch size, same row comparator.
[
  [ARGV[0], csv1_sorted_file_name],
  [ARGV[1], csv2_sorted_file_name]
].each do |source_file_name, sorted_file_name|
  puts "sorting #{source_file_name}"
  CSVUtils::CSVSort
    .new(source_file_name, sorted_file_name)
    .sort(options[:sort_batch_size], &sort_compare_proc)
end
90
+
# Columns checked for changes once two rows share the same key: everything
# except the key columns themselves and the columns the user asked to ignore.
#
# options[:unique_headers] is deliberately NOT reset here: it already holds
# either the -u selection or the first-column default, and it must match the
# key the files were sorted by.
update_comparison_columns = csv1.first.keys - options[:unique_headers] - options[:ignore_headers]

# The block orders two rows by their key columns and returns -1, 0 or 1.
# Rows are addressed by header name here.
comparer = CSVUtils::CSVCompare.new(csv1_sorted_file_name, update_comparison_columns) do |csv1_row, csv2_row|
  result = 0
  options[:unique_headers].each do |header|
    left = csv1_row[header]
    right = csv2_row[header]
    # <=> is nil for incomparable values (e.g. nil against a String); compare
    # the string forms in that case so a usable ordering is always returned.
    result = (left <=> right) || (left.to_s <=> right.to_s)
    break unless result == 0
  end

  result
end
104
+
# Per-action counters; stays empty when the comparer reports no differences.
stats = Hash.new(0)
puts "comparing #{ARGV[0]} with #{ARGV[1]}"

# Use only the file name part of the first input so the result file is written
# to the current directory even when the input was given with a directory.
diff_file_name = "diff-results-#{File.basename(ARGV[0])}"

CSV.open(diff_file_name, 'wb') do |out|
  # Header row: the reported action followed by the original columns.
  out << ['Result', *csv1.first.keys]
  comparer.compare(csv2_sorted_file_name) do |action, record|
    stats[action] += 1
    out << [action, *record.values]
  end
end
puts "differences found #{stats}"

# The sorted copies were only needed for the comparison.
File.unlink(csv1_sorted_file_name)
File.unlink(csv2_sorted_file_name)

if stats.empty?
  puts "files were identical"
  # Nothing was reported, so the result file holds only the header row.
  File.unlink(diff_file_name)
else
  puts "results can be found in #{diff_file_name}"
end
@@ -0,0 +1,69 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'optparse'
4
+
# Parses the csv-duplicate-finder command line switches out of +argv+.
#
# Recognised switches are removed from +argv+ in place (OptionParser#parse!),
# so only the positional arguments remain afterwards.
#
# @param argv [Array<String>] raw command line arguments; mutated in place
# @return [Hash] with key :ignore_columns (Array<String>)
def parse_duplicate_finder_options(argv)
  options = {
    ignore_columns: []
  }

  OptionParser.new do |opts|
    opts.banner = "Usage: #{File.basename(__FILE__)} [options] <csv file>"

    opts.on('-h', '--help', 'Prints this help') do
      puts opts
      exit
    end

    opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore') do |v|
      options[:ignore_columns] = v.split(',')
    end
  end.parse!(argv)

  options
end

options = parse_duplicate_finder_options(ARGV)
20
+
21
+ require 'digest/sha2'
22
+ require 'json'
23
+ require 'csv-utils'
24
+
# The input file is mandatory; fail with a readable message when it is absent.
raise('a csv file is required') if ARGV.empty?

csv = CSVUtils::CSVIterator.new(ARGV[0])

# Every column named with -i must exist in the file.
# NOTE(review): the first row is treated as a Hash keyed by header name (the
# code calls #keys on it) — confirm against CSVIterator.
missing_headers = options[:ignore_columns] - csv.first.keys
unless missing_headers.empty?
  raise("unknown headers #{missing_headers.join(', ')} configured as ignore headers")
end
31
+
# Pass 1: fingerprint every row (ignored columns blanked out) and remember the
# row numbers that produced each fingerprint.
row_numbers_by_digest = {}

csv.each_with_index do |row, idx|
  options[:ignore_columns].each { |ignore_column| row[ignore_column] = '' }
  digest = Digest::SHA256.hexdigest(row.to_json)
  (row_numbers_by_digest[digest] ||= []) << idx
end

# Only fingerprints seen more than once are duplicates.
repeated = row_numbers_by_digest.select { |_digest, row_numbers| row_numbers.size > 1 }

# duplicate_rows: row number => fingerprint
# hashed_rows:    fingerprint => { row number => row }, rows filled in by pass 2
duplicate_rows = {}
hashed_rows = repeated.each_with_object({}) do |(digest, row_numbers), grouped|
  grouped[digest] = row_numbers.each_with_object({}) do |row_number, slots|
    duplicate_rows[row_number] = digest
    slots[row_number] = nil
  end
end

# Pass 2: walk the file again and keep the rows that belong to a duplicate
# group, as yielded by this second iteration.
csv.each_with_index do |row, idx|
  digest = duplicate_rows[idx]
  hashed_rows[digest][idx] = row if digest
end
60
+
# Write every duplicate group to "duplicates-<input file name>" in the current
# directory: one line per row, prefixed with the group fingerprint and the
# row number.
duplicates_file_name = 'duplicates-' + File.basename(ARGV[0])

CSV.open(duplicates_file_name, 'wb') do |out|
  out << ['duplicate_key', 'row_no', *csv.first.keys]

  hashed_rows.each do |key, rows|
    rows.each { |idx, row| out << [key, idx, *row.values] }
  end
end
data/csv-utils.gemspec CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.3.14'
5
+ s.version = '0.3.16'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.14
4
+ version: 0.3.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-07-23 00:00:00.000000000 Z
11
+ date: 2022-09-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -28,6 +28,8 @@ description: Tools for debugging malformed CSV files
28
28
  email: dougyouch@gmail.com
29
29
  executables:
30
30
  - csv-change-eol
31
+ - csv-diff
32
+ - csv-duplicate-finder
31
33
  - csv-explorer
32
34
  - csv-find-error
33
35
  - csv-readline
@@ -43,6 +45,8 @@ files:
43
45
  - LICENSE
44
46
  - README.md
45
47
  - bin/csv-change-eol
48
+ - bin/csv-diff
49
+ - bin/csv-duplicate-finder
46
50
  - bin/csv-explorer
47
51
  - bin/csv-find-error
48
52
  - bin/csv-readline
@@ -78,7 +82,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
78
82
  - !ruby/object:Gem::Version
79
83
  version: '0'
80
84
  requirements: []
81
- rubygems_version: 3.2.3
85
+ rubygems_version: 3.3.3
82
86
  signing_key:
83
87
  specification_version: 4
84
88
  summary: CSV Utils