csv-utils 0.3.25 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/ci.yml +53 -0
- data/.rubocop.yml +81 -0
- data/ARCHITECTURE.md +154 -0
- data/CLAUDE.md +63 -0
- data/Gemfile +2 -1
- data/Gemfile.lock +5 -0
- data/README.md +238 -16
- data/bin/csv-diff +3 -3
- data/bin/csv-duplicate-finder +1 -1
- data/bin/csv-grep +3 -3
- data/bin/csv-readline +4 -5
- data/bin/csv-splitter +1 -1
- data/bin/csv-validator +38 -36
- data/csv-utils.gemspec +6 -5
- data/lib/csv-utils.rb +3 -0
- data/lib/csv_utils/csv_compare.rb +77 -71
- data/lib/csv_utils/csv_extender.rb +45 -41
- data/lib/csv_utils/csv_iterator.rb +90 -75
- data/lib/csv_utils/csv_options.rb +11 -11
- data/lib/csv_utils/csv_report.rb +5 -2
- data/lib/csv_utils/csv_row.rb +3 -1
- data/lib/csv_utils/csv_row_matcher.rb +34 -0
- data/lib/csv_utils/csv_sort.rb +110 -96
- data/lib/csv_utils/csv_transformer.rb +95 -92
- data/lib/csv_utils/csv_wrapper.rb +40 -36
- metadata +13 -6
- data/docs/ARCHITECTURE.md +0 -134
data/bin/csv-diff
CHANGED
|
@@ -15,16 +15,16 @@ OptionParser.new do |opts|
|
|
|
15
15
|
exit
|
|
16
16
|
end
|
|
17
17
|
|
|
18
|
-
opts.on('-u', '--unique HEADERS', '
|
|
18
|
+
opts.on('-u', '--unique HEADERS', 'Comma separated list of headers that generate a unique key per row, uses 1st column by default') do |v|
|
|
19
19
|
options[:unique_headers] = v.split(',')
|
|
20
20
|
end
|
|
21
21
|
|
|
22
|
-
opts.on('-i', '--ignore HEADERS', '
|
|
22
|
+
opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore during row comparison') do |v|
|
|
23
23
|
options[:ignore_headers] = v.split(',')
|
|
24
24
|
end
|
|
25
25
|
|
|
26
26
|
opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
|
|
27
|
-
|
|
27
|
+
options[:sort_batch_size] = v
|
|
28
28
|
end
|
|
29
29
|
end.parse!
|
|
30
30
|
|
data/bin/csv-duplicate-finder
CHANGED
|
@@ -26,7 +26,7 @@ csv = CSVUtils::CSVIterator.new(ARGV[0])
|
|
|
26
26
|
|
|
27
27
|
missing_headers = options[:ignore_columns] - csv.first.keys
|
|
28
28
|
unless missing_headers.empty?
|
|
29
|
-
raise("
|
|
29
|
+
raise("unknown headers #{missing_headers.join(', ')} in configured ignore headers")
|
|
30
30
|
end
|
|
31
31
|
|
|
32
32
|
hashed_rows = {}
|
data/bin/csv-grep
CHANGED
|
@@ -34,7 +34,7 @@ OptionParser.new do |opts|
|
|
|
34
34
|
options[:limit] = v
|
|
35
35
|
end
|
|
36
36
|
|
|
37
|
-
opts.on('-i', '--ignore-case', 'Ignore case') do
|
|
37
|
+
opts.on('-i', '--ignore-case', 'Ignore case') do
|
|
38
38
|
options[:search_regex_options] = Regexp::IGNORECASE
|
|
39
39
|
end
|
|
40
40
|
end.parse!
|
|
@@ -55,7 +55,7 @@ search_regex =
|
|
|
55
55
|
headers =
|
|
56
56
|
case options[:headers]
|
|
57
57
|
when :first
|
|
58
|
-
[csv
|
|
58
|
+
[csv.headers.first]
|
|
59
59
|
when :all
|
|
60
60
|
csv.headers
|
|
61
61
|
else
|
|
@@ -63,7 +63,7 @@ headers =
|
|
|
63
63
|
end
|
|
64
64
|
|
|
65
65
|
missing_headers = headers - csv.headers
|
|
66
|
-
raise("unknown headers #{
|
|
66
|
+
raise("unknown headers #{missing_headers.join(', ')}") unless missing_headers.empty?
|
|
67
67
|
|
|
68
68
|
matching_row_proc = proc do |row|
|
|
69
69
|
result = false
|
data/bin/csv-readline
CHANGED
|
@@ -114,16 +114,15 @@ OptionParser.new do |opts|
|
|
|
114
114
|
end
|
|
115
115
|
end.parse!
|
|
116
116
|
|
|
117
|
-
file = File.open(ARGV[0], 'rb')
|
|
118
117
|
lineno = ARGV[1].to_i
|
|
119
118
|
number_of_lines = (ARGV[2] || 1).to_i
|
|
120
119
|
|
|
121
120
|
raise "no lineno specified" unless lineno > 0
|
|
122
121
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
122
|
+
data = File.open(ARGV[0], 'rb') do |file|
|
|
123
|
+
headers = strip_byte_order_mark(file.readline.strip).split(',')
|
|
124
|
+
headers.zip(parse_csv_row(file, lineno, number_of_lines))
|
|
125
|
+
end
|
|
127
126
|
|
|
128
127
|
cnt = 0
|
|
129
128
|
data.each do |k, (v, status)|
|
data/bin/csv-splitter
CHANGED
data/bin/csv-validator
CHANGED
|
@@ -29,53 +29,55 @@ def strip_bom!(col)
|
|
|
29
29
|
col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
|
|
30
30
|
end
|
|
31
31
|
|
|
32
|
-
csv = CSV.open(ARGV[0], 'rb')
|
|
33
32
|
id_column_name = ARGV[1]
|
|
33
|
+
csv = CSV.open(ARGV[0], 'rb')
|
|
34
|
+
out = nil
|
|
34
35
|
|
|
35
|
-
|
|
36
|
-
|
|
36
|
+
begin
|
|
37
|
+
headers = csv.shift
|
|
38
|
+
strip_bom!(headers[0])
|
|
37
39
|
|
|
38
|
-
id_column_name ||= headers[0]
|
|
39
|
-
unless headers.include?(id_column_name)
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
end
|
|
40
|
+
id_column_name ||= headers[0]
|
|
41
|
+
unless headers.include?(id_column_name)
|
|
42
|
+
$stderr.puts("header #{id_column_name} not found in current set of headers")
|
|
43
|
+
exit 1
|
|
44
|
+
end
|
|
43
45
|
|
|
44
|
-
id_column_num = headers.index(id_column_name)
|
|
46
|
+
id_column_num = headers.index(id_column_name)
|
|
45
47
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
end
|
|
48
|
+
out_proc = proc do |row|
|
|
49
|
+
out ||=
|
|
50
|
+
begin
|
|
51
|
+
out = CSV.open('utf8-correction.csv', 'wb')
|
|
52
|
+
out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
|
|
53
|
+
out
|
|
54
|
+
end
|
|
54
55
|
|
|
55
|
-
|
|
56
|
-
end
|
|
56
|
+
out << row
|
|
57
|
+
end
|
|
57
58
|
|
|
58
|
-
csv_lineno = 1
|
|
59
|
+
csv_lineno = 1
|
|
59
60
|
|
|
60
|
-
while (row = csv.shift)
|
|
61
|
-
|
|
61
|
+
while (row = csv.shift)
|
|
62
|
+
csv_lineno += 1
|
|
62
63
|
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
64
|
+
unless row.size == headers.size
|
|
65
|
+
$stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
|
|
66
|
+
end
|
|
66
67
|
|
|
67
|
-
|
|
68
|
-
|
|
68
|
+
row.each_with_index do |col, idx|
|
|
69
|
+
next if col.nil? || utf8?(col)
|
|
69
70
|
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
71
|
+
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
|
|
72
|
+
if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
|
|
73
|
+
puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
|
|
74
|
+
out_proc.call [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
|
|
75
|
+
else
|
|
76
|
+
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
|
|
77
|
+
end
|
|
76
78
|
end
|
|
77
79
|
end
|
|
80
|
+
ensure
|
|
81
|
+
csv.close
|
|
82
|
+
out&.close
|
|
78
83
|
end
|
|
79
|
-
|
|
80
|
-
csv.close
|
|
81
|
-
out.close if out
|
data/csv-utils.gemspec
CHANGED
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
Gem::Specification.new do |s|
|
|
4
4
|
s.name = 'csv-utils'
|
|
5
|
-
s.version = '0.
|
|
5
|
+
s.version = '0.5.0'
|
|
6
6
|
s.licenses = ['MIT']
|
|
7
|
-
s.summary = 'CSV
|
|
8
|
-
s.description = '
|
|
7
|
+
s.summary = 'Comprehensive CSV manipulation and debugging utilities for Ruby'
|
|
8
|
+
s.description = 'A Ruby library for CSV file processing featuring comparison, transformation, sorting, and validation. Includes CLI tools for debugging malformed CSVs, auto-detection of encodings and separators, and efficient handling of large files.'
|
|
9
9
|
s.authors = ['Doug Youch']
|
|
10
10
|
s.email = 'dougyouch@gmail.com'
|
|
11
11
|
s.homepage = 'https://github.com/dougyouch/csv-utils'
|
|
@@ -13,6 +13,7 @@ Gem::Specification.new do |s|
|
|
|
13
13
|
s.bindir = 'bin'
|
|
14
14
|
s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
|
15
15
|
|
|
16
|
-
s.
|
|
17
|
-
s.
|
|
16
|
+
s.add_dependency 'csv'
|
|
17
|
+
s.add_dependency 'inheritance-helper'
|
|
18
|
+
s.metadata['rubygems_mfa_required'] = 'true'
|
|
18
19
|
end
|
data/lib/csv-utils.rb
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
require 'csv'
|
|
2
4
|
|
|
3
5
|
# Collection of tools for working with CSV files.
|
|
@@ -8,6 +10,7 @@ module CSVUtils
|
|
|
8
10
|
autoload :CSVOptions, 'csv_utils/csv_options'
|
|
9
11
|
autoload :CSVReport, 'csv_utils/csv_report'
|
|
10
12
|
autoload :CSVRow, 'csv_utils/csv_row'
|
|
13
|
+
autoload :CSVRowMatcher, 'csv_utils/csv_row_matcher'
|
|
11
14
|
autoload :CSVSort, 'csv_utils/csv_sort'
|
|
12
15
|
autoload :CSVTransformer, 'csv_utils/csv_transformer'
|
|
13
16
|
autoload :CSVWrapper, 'csv_utils/csv_wrapper'
|
|
@@ -1,87 +1,93 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
# CSVUtils::CSVCompare purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
|
|
2
4
|
# **requires both CSV files to be sorted on the same columns, CSVUtils::CSVSort can accomplish this
|
|
3
5
|
# In order to receive updates, update_comparison_columns must configured or use inheritance and change the update_row? method
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
def compare(secondary_data_file)
|
|
20
|
-
src = CSV.open(primary_data_file, 'rb')
|
|
21
|
-
src_headers = src.shift
|
|
22
|
-
strip_bom!(src_headers[0])
|
|
23
|
-
dest = CSV.open(secondary_data_file, 'rb')
|
|
24
|
-
dest_headers = dest.shift
|
|
25
|
-
strip_bom!(dest_headers[0])
|
|
26
|
-
|
|
27
|
-
read_next_src = true
|
|
28
|
-
read_next_dest = true
|
|
29
|
-
|
|
30
|
-
while(!src.eof? || !dest.eof?)
|
|
31
|
-
src_record = next_record_from_file(src_headers, src) if read_next_src
|
|
32
|
-
dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
|
|
33
|
-
|
|
34
|
-
if ! src_record
|
|
35
|
-
read_next_src = false
|
|
36
|
-
read_next_dest = true
|
|
37
|
-
|
|
38
|
-
yield :delete, dest_record
|
|
39
|
-
elsif ! dest_record
|
|
40
|
-
read_next_src = true
|
|
41
|
-
read_next_dest = false
|
|
42
|
-
|
|
43
|
-
yield :create, src_record
|
|
44
|
-
elsif compare_proc.call(src_record, dest_record) == 0
|
|
45
|
-
read_next_src = true
|
|
46
|
-
read_next_dest = true
|
|
47
|
-
|
|
48
|
-
yield(:update, src_record) if update_row?(src_record, dest_record)
|
|
49
|
-
elsif compare_proc.call(src_record, dest_record) > 0
|
|
50
|
-
read_next_src = false
|
|
51
|
-
read_next_dest = true
|
|
52
|
-
|
|
53
|
-
yield :delete, dest_record
|
|
54
|
-
else
|
|
55
|
-
read_next_src = true
|
|
56
|
-
read_next_dest = false
|
|
6
|
+
module CSVUtils
|
|
7
|
+
class CSVCompare
|
|
8
|
+
# primary_data_file is the source of truth
|
|
9
|
+
# compare_proc used to compare the id column(s)
|
|
10
|
+
# update_comparison_columns column(s) to compare for equality, ex: updated_at, timestamp, hash
|
|
11
|
+
# caveat: update_comparison_columns need to be in both csv files
|
|
12
|
+
attr_reader :primary_data_file,
|
|
13
|
+
:update_comparison_columns,
|
|
14
|
+
:compare_proc
|
|
15
|
+
|
|
16
|
+
def initialize(primary_data_file, update_comparison_columns = nil, &block)
|
|
17
|
+
@primary_data_file = primary_data_file
|
|
18
|
+
@update_comparison_columns = update_comparison_columns
|
|
19
|
+
@compare_proc = block
|
|
20
|
+
end
|
|
57
21
|
|
|
58
|
-
|
|
22
|
+
# rubocop:disable Metrics/MethodLength
|
|
23
|
+
def compare(secondary_data_file)
|
|
24
|
+
src = CSV.open(primary_data_file, 'rb')
|
|
25
|
+
begin
|
|
26
|
+
src_headers = src.shift
|
|
27
|
+
strip_bom!(src_headers[0])
|
|
28
|
+
dest = CSV.open(secondary_data_file, 'rb')
|
|
29
|
+
begin
|
|
30
|
+
dest_headers = dest.shift
|
|
31
|
+
strip_bom!(dest_headers[0])
|
|
32
|
+
|
|
33
|
+
read_next_src = true
|
|
34
|
+
read_next_dest = true
|
|
35
|
+
|
|
36
|
+
while !src.eof? || !dest.eof?
|
|
37
|
+
src_record = next_record_from_file(src_headers, src) if read_next_src
|
|
38
|
+
dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
|
|
39
|
+
|
|
40
|
+
if !src_record
|
|
41
|
+
read_next_src = false
|
|
42
|
+
read_next_dest = true
|
|
43
|
+
yield :delete, dest_record
|
|
44
|
+
elsif !dest_record
|
|
45
|
+
read_next_src = true
|
|
46
|
+
read_next_dest = false
|
|
47
|
+
yield :create, src_record
|
|
48
|
+
elsif compare_proc.call(src_record, dest_record).zero?
|
|
49
|
+
read_next_src = true
|
|
50
|
+
read_next_dest = true
|
|
51
|
+
yield(:update, src_record) if update_row?(src_record, dest_record)
|
|
52
|
+
elsif compare_proc.call(src_record, dest_record).positive?
|
|
53
|
+
read_next_src = false
|
|
54
|
+
read_next_dest = true
|
|
55
|
+
yield :delete, dest_record
|
|
56
|
+
else
|
|
57
|
+
read_next_src = true
|
|
58
|
+
read_next_dest = false
|
|
59
|
+
yield :create, src_record
|
|
60
|
+
end
|
|
61
|
+
end
|
|
62
|
+
ensure
|
|
63
|
+
dest.close
|
|
64
|
+
end
|
|
65
|
+
ensure
|
|
66
|
+
src.close
|
|
59
67
|
end
|
|
60
68
|
end
|
|
69
|
+
# rubocop:enable Metrics/MethodLength
|
|
61
70
|
|
|
62
|
-
|
|
63
|
-
dest.close
|
|
64
|
-
end
|
|
71
|
+
private
|
|
65
72
|
|
|
66
|
-
|
|
73
|
+
def next_record_from_file(headers, file)
|
|
74
|
+
return nil if file.eof?
|
|
67
75
|
|
|
68
|
-
|
|
69
|
-
|
|
76
|
+
headers.zip(file.shift).to_h
|
|
77
|
+
end
|
|
70
78
|
|
|
71
|
-
|
|
72
|
-
|
|
79
|
+
def update_row?(src_record, dest_record)
|
|
80
|
+
return false unless update_comparison_columns
|
|
73
81
|
|
|
74
|
-
|
|
75
|
-
|
|
82
|
+
update_comparison_columns.each do |column_name|
|
|
83
|
+
return true unless src_record[column_name] == dest_record[column_name]
|
|
84
|
+
end
|
|
76
85
|
|
|
77
|
-
|
|
78
|
-
return true unless src_record[column_name] == dest_record[column_name]
|
|
86
|
+
false
|
|
79
87
|
end
|
|
80
88
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
def strip_bom!(col)
|
|
85
|
-
col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
|
|
89
|
+
def strip_bom!(col)
|
|
90
|
+
col.sub!((+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT'), '')
|
|
91
|
+
end
|
|
86
92
|
end
|
|
87
93
|
end
|
|
@@ -1,63 +1,67 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
1
3
|
# Utility class for appending data to a csv file.
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
module CSVUtils
|
|
5
|
+
class CSVExtender
|
|
6
|
+
def initialize(src_csv, dest_csv, csv_options = {})
|
|
7
|
+
@src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
|
|
8
|
+
@dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
|
|
9
|
+
end
|
|
7
10
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
11
|
+
def append(additional_headers)
|
|
12
|
+
process(additional_headers) do |current_headers|
|
|
13
|
+
while (row = @src_csv.shift)
|
|
14
|
+
additional_columns = yield row, current_headers
|
|
15
|
+
@dest_csv << (row + additional_columns)
|
|
16
|
+
end
|
|
13
17
|
end
|
|
14
18
|
end
|
|
15
|
-
end
|
|
16
19
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
+
def append_in_batches(additional_headers, batch_size = 1_000)
|
|
21
|
+
process(additional_headers) do |current_headers|
|
|
22
|
+
batch = []
|
|
23
|
+
|
|
24
|
+
process_batch_proc = proc do
|
|
25
|
+
additional_rows = yield batch, current_headers
|
|
20
26
|
|
|
21
|
-
|
|
22
|
-
|
|
27
|
+
batch.each_with_index do |row, idx|
|
|
28
|
+
@dest_csv << (row + additional_rows[idx])
|
|
29
|
+
end
|
|
23
30
|
|
|
24
|
-
|
|
25
|
-
@dest_csv << (row + additional_rows[idx])
|
|
31
|
+
batch = []
|
|
26
32
|
end
|
|
27
33
|
|
|
28
|
-
|
|
29
|
-
|
|
34
|
+
while (row = @src_csv.shift)
|
|
35
|
+
batch << row
|
|
30
36
|
|
|
31
|
-
|
|
32
|
-
|
|
37
|
+
process_batch_proc.call if batch.size >= batch_size
|
|
38
|
+
end
|
|
33
39
|
|
|
34
|
-
process_batch_proc.call if batch.size
|
|
40
|
+
process_batch_proc.call if batch.size.positive?
|
|
35
41
|
end
|
|
36
|
-
|
|
37
|
-
process_batch_proc.call if batch.size > 0
|
|
38
42
|
end
|
|
39
|
-
end
|
|
40
43
|
|
|
41
|
-
|
|
44
|
+
private
|
|
42
45
|
|
|
43
|
-
|
|
44
|
-
|
|
46
|
+
def process(additional_headers)
|
|
47
|
+
current_headers = append_headers(additional_headers)
|
|
45
48
|
|
|
46
|
-
|
|
49
|
+
yield current_headers
|
|
47
50
|
|
|
48
|
-
|
|
49
|
-
|
|
51
|
+
close
|
|
52
|
+
end
|
|
50
53
|
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
54
|
+
def close
|
|
55
|
+
@src_csv.close
|
|
56
|
+
@dest_csv.close
|
|
57
|
+
end
|
|
55
58
|
|
|
56
|
-
|
|
57
|
-
|
|
59
|
+
def append_headers(additional_headers)
|
|
60
|
+
return nil unless additional_headers
|
|
58
61
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
+
current_headers = @src_csv.shift
|
|
63
|
+
@dest_csv << (current_headers + additional_headers)
|
|
64
|
+
current_headers
|
|
65
|
+
end
|
|
62
66
|
end
|
|
63
67
|
end
|