csv-utils 0.3.25 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/csv-diff CHANGED
@@ -15,16 +15,16 @@ OptionParser.new do |opts|
15
15
  exit
16
16
  end
17
17
 
18
- opts.on('-u', '--unique HEADERS', 'Comman separated list of headers that genrate a unique key per a row, use 1st column by default') do |v|
18
+ opts.on('-u', '--unique HEADERS', 'Comma separated list of headers that generate a unique key per row, uses 1st column by default') do |v|
19
19
  options[:unique_headers] = v.split(',')
20
20
  end
21
21
 
22
- opts.on('-i', '--ignore HEADERS', 'Comman separated list of headers to ignore during row comparison') do |v|
22
+ opts.on('-i', '--ignore HEADERS', 'Comma separated list of headers to ignore during row comparison') do |v|
23
23
  options[:ignore_headers] = v.split(',')
24
24
  end
25
25
 
26
26
  opts.on('--sort-batch-size SIZE', Integer, 'Number of rows to load into memory while sorting') do |v|
27
- opts[:sort_batch_size] = v
27
+ options[:sort_batch_size] = v
28
28
  end
29
29
  end.parse!
30
30
 
@@ -26,7 +26,7 @@ csv = CSVUtils::CSVIterator.new(ARGV[0])
26
26
 
27
27
  missing_headers = options[:ignore_columns] - csv.first.keys
28
28
  unless missing_headers.empty?
29
- raise("unkown headers #{missing_headers.join(', ')} configured ingnore headers")
29
+ raise("unknown headers #{missing_headers.join(', ')} in configured ignore headers")
30
30
  end
31
31
 
32
32
  hashed_rows = {}
data/bin/csv-grep CHANGED
@@ -34,7 +34,7 @@ OptionParser.new do |opts|
34
34
  options[:limit] = v
35
35
  end
36
36
 
37
- opts.on('-i', '--ignore-case', 'Ignore case') do |v|
37
+ opts.on('-i', '--ignore-case', 'Ignore case') do
38
38
  options[:search_regex_options] = Regexp::IGNORECASE
39
39
  end
40
40
  end.parse!
@@ -55,7 +55,7 @@ search_regex =
55
55
  headers =
56
56
  case options[:headers]
57
57
  when :first
58
- [csv .headers.first]
58
+ [csv.headers.first]
59
59
  when :all
60
60
  csv.headers
61
61
  else
@@ -63,7 +63,7 @@ headers =
63
63
  end
64
64
 
65
65
  missing_headers = headers - csv.headers
66
- raise("unknown headers #{headers.join(', ')}") unless missing_headers.empty?
66
+ raise("unknown headers #{missing_headers.join(', ')}") unless missing_headers.empty?
67
67
 
68
68
  matching_row_proc = proc do |row|
69
69
  result = false
data/bin/csv-readline CHANGED
@@ -114,16 +114,15 @@ OptionParser.new do |opts|
114
114
  end
115
115
  end.parse!
116
116
 
117
- file = File.open(ARGV[0], 'rb')
118
117
  lineno = ARGV[1].to_i
119
118
  number_of_lines = (ARGV[2] || 1).to_i
120
119
 
121
120
  raise "no lineno specified" unless lineno > 0
122
121
 
123
- headers = strip_byte_order_mark(file.readline.strip).split(',')
124
-
125
- data = headers.zip(parse_csv_row(file, lineno, number_of_lines))
126
- file.close
122
+ data = File.open(ARGV[0], 'rb') do |file|
123
+ headers = strip_byte_order_mark(file.readline.strip).split(',')
124
+ headers.zip(parse_csv_row(file, lineno, number_of_lines))
125
+ end
127
126
 
128
127
  cnt = 0
129
128
  data.each do |k, (v, status)|
data/bin/csv-splitter CHANGED
@@ -78,4 +78,4 @@ while (row = csv.shift)
78
78
  append_row_proc.call(row)
79
79
  end
80
80
 
81
- out.close
81
+ out&.close
data/bin/csv-validator CHANGED
@@ -29,53 +29,55 @@ def strip_bom!(col)
29
29
  col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
30
30
  end
31
31
 
32
- csv = CSV.open(ARGV[0], 'rb')
33
32
  id_column_name = ARGV[1]
33
+ csv = CSV.open(ARGV[0], 'rb')
34
+ out = nil
34
35
 
35
- headers = csv.shift
36
- strip_bom!(headers[0])
36
+ begin
37
+ headers = csv.shift
38
+ strip_bom!(headers[0])
37
39
 
38
- id_column_name ||= headers[0]
39
- unless headers.include?(id_column_name)
40
- $stderr.puts("header #{id_column_name} not found in current set of headers")
41
- exit 1
42
- end
40
+ id_column_name ||= headers[0]
41
+ unless headers.include?(id_column_name)
42
+ $stderr.puts("header #{id_column_name} not found in current set of headers")
43
+ exit 1
44
+ end
43
45
 
44
- id_column_num = headers.index(id_column_name)
46
+ id_column_num = headers.index(id_column_name)
45
47
 
46
- out = nil
47
- out_proc = Proc.new do |row|
48
- out ||=
49
- begin
50
- out = CSV.open('utf8-correctsion.csv', 'wb')
51
- out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
52
- out
53
- end
48
+ out_proc = proc do |row|
49
+ out ||=
50
+ begin
51
+ out = CSV.open('utf8-correction.csv', 'wb')
52
+ out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
53
+ out
54
+ end
54
55
 
55
- out << row
56
- end
56
+ out << row
57
+ end
57
58
 
58
- csv_lineno = 1
59
+ csv_lineno = 1
59
60
 
60
- while (row = csv.shift)
61
- csv_lineno += 1
61
+ while (row = csv.shift)
62
+ csv_lineno += 1
62
63
 
63
- unless row.size == headers.size
64
- $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
65
- end
64
+ unless row.size == headers.size
65
+ $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
66
+ end
66
67
 
67
- row.each_with_index do |col, idx|
68
- next if col.nil? || utf8?(col)
68
+ row.each_with_index do |col, idx|
69
+ next if col.nil? || utf8?(col)
69
70
 
70
- $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
71
- if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
72
- puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
73
- out_proc.call [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
74
- else
75
- $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
71
+ $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
72
+ if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
73
+ puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
74
+ out_proc.call [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
75
+ else
76
+ $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
77
+ end
76
78
  end
77
79
  end
80
+ ensure
81
+ csv.close
82
+ out&.close
78
83
  end
79
-
80
- csv.close
81
- out.close if out
data/csv-utils.gemspec CHANGED
@@ -2,10 +2,10 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.3.25'
5
+ s.version = '0.5.0'
6
6
  s.licenses = ['MIT']
7
- s.summary = 'CSV Utils'
8
- s.description = 'Tools for debugging malformed CSV files'
7
+ s.summary = 'Comprehensive CSV manipulation and debugging utilities for Ruby'
8
+ s.description = 'A Ruby library for CSV file processing featuring comparison, transformation, sorting, and validation. Includes CLI tools for debugging malformed CSVs, auto-detection of encodings and separators, and efficient handling of large files.'
9
9
  s.authors = ['Doug Youch']
10
10
  s.email = 'dougyouch@gmail.com'
11
11
  s.homepage = 'https://github.com/dougyouch/csv-utils'
@@ -13,6 +13,7 @@ Gem::Specification.new do |s|
13
13
  s.bindir = 'bin'
14
14
  s.executables = s.files.grep(%r{^bin/}) { |f| File.basename(f) }
15
15
 
16
- s.add_runtime_dependency 'csv'
17
- s.add_runtime_dependency 'inheritance-helper'
16
+ s.add_dependency 'csv'
17
+ s.add_dependency 'inheritance-helper'
18
+ s.metadata['rubygems_mfa_required'] = 'true'
18
19
  end
data/lib/csv-utils.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'csv'
2
4
 
3
5
  # Collection of tools for working with CSV files.
@@ -8,6 +10,7 @@ module CSVUtils
8
10
  autoload :CSVOptions, 'csv_utils/csv_options'
9
11
  autoload :CSVReport, 'csv_utils/csv_report'
10
12
  autoload :CSVRow, 'csv_utils/csv_row'
13
+ autoload :CSVRowMatcher, 'csv_utils/csv_row_matcher'
11
14
  autoload :CSVSort, 'csv_utils/csv_sort'
12
15
  autoload :CSVTransformer, 'csv_utils/csv_transformer'
13
16
  autoload :CSVWrapper, 'csv_utils/csv_wrapper'
@@ -1,87 +1,93 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # CSVUtils::CSVCompare purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
2
4
  # **requires both CSV files to be sorted on the same columns, CSVUtils::CSVSort can accomplish this
3
5
  # In order to receive updates, update_comparison_columns must configured or use inheritance and change the update_row? method
4
- class CSVUtils::CSVCompare
5
- # primary_data_file is the source of truth
6
- # compare_proc used to compare the id column(s)
7
- # update_comparison_columns column(s) to compare for equality, ex: updated_at, timestamp, hash
8
- # caveat: update_comparison_columns need to be in both csv files
9
- attr_reader :primary_data_file,
10
- :update_comparison_columns,
11
- :compare_proc
12
-
13
- def initialize(primary_data_file, update_comparison_columns=nil, &block)
14
- @primary_data_file = primary_data_file
15
- @update_comparison_columns = update_comparison_columns
16
- @compare_proc = block
17
- end
18
-
19
- def compare(secondary_data_file)
20
- src = CSV.open(primary_data_file, 'rb')
21
- src_headers = src.shift
22
- strip_bom!(src_headers[0])
23
- dest = CSV.open(secondary_data_file, 'rb')
24
- dest_headers = dest.shift
25
- strip_bom!(dest_headers[0])
26
-
27
- read_next_src = true
28
- read_next_dest = true
29
-
30
- while(!src.eof? || !dest.eof?)
31
- src_record = next_record_from_file(src_headers, src) if read_next_src
32
- dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
33
-
34
- if ! src_record
35
- read_next_src = false
36
- read_next_dest = true
37
-
38
- yield :delete, dest_record
39
- elsif ! dest_record
40
- read_next_src = true
41
- read_next_dest = false
42
-
43
- yield :create, src_record
44
- elsif compare_proc.call(src_record, dest_record) == 0
45
- read_next_src = true
46
- read_next_dest = true
47
-
48
- yield(:update, src_record) if update_row?(src_record, dest_record)
49
- elsif compare_proc.call(src_record, dest_record) > 0
50
- read_next_src = false
51
- read_next_dest = true
52
-
53
- yield :delete, dest_record
54
- else
55
- read_next_src = true
56
- read_next_dest = false
6
+ module CSVUtils
7
+ class CSVCompare
8
+ # primary_data_file is the source of truth
9
+ # compare_proc used to compare the id column(s)
10
+ # update_comparison_columns column(s) to compare for equality, ex: updated_at, timestamp, hash
11
+ # caveat: update_comparison_columns need to be in both csv files
12
+ attr_reader :primary_data_file,
13
+ :update_comparison_columns,
14
+ :compare_proc
15
+
16
+ def initialize(primary_data_file, update_comparison_columns = nil, &block)
17
+ @primary_data_file = primary_data_file
18
+ @update_comparison_columns = update_comparison_columns
19
+ @compare_proc = block
20
+ end
57
21
 
58
- yield :create, src_record
22
+ # rubocop:disable Metrics/MethodLength
23
+ def compare(secondary_data_file)
24
+ src = CSV.open(primary_data_file, 'rb')
25
+ begin
26
+ src_headers = src.shift
27
+ strip_bom!(src_headers[0])
28
+ dest = CSV.open(secondary_data_file, 'rb')
29
+ begin
30
+ dest_headers = dest.shift
31
+ strip_bom!(dest_headers[0])
32
+
33
+ read_next_src = true
34
+ read_next_dest = true
35
+
36
+ while !src.eof? || !dest.eof?
37
+ src_record = next_record_from_file(src_headers, src) if read_next_src
38
+ dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
39
+
40
+ if !src_record
41
+ read_next_src = false
42
+ read_next_dest = true
43
+ yield :delete, dest_record
44
+ elsif !dest_record
45
+ read_next_src = true
46
+ read_next_dest = false
47
+ yield :create, src_record
48
+ elsif compare_proc.call(src_record, dest_record).zero?
49
+ read_next_src = true
50
+ read_next_dest = true
51
+ yield(:update, src_record) if update_row?(src_record, dest_record)
52
+ elsif compare_proc.call(src_record, dest_record).positive?
53
+ read_next_src = false
54
+ read_next_dest = true
55
+ yield :delete, dest_record
56
+ else
57
+ read_next_src = true
58
+ read_next_dest = false
59
+ yield :create, src_record
60
+ end
61
+ end
62
+ ensure
63
+ dest.close
64
+ end
65
+ ensure
66
+ src.close
59
67
  end
60
68
  end
69
+ # rubocop:enable Metrics/MethodLength
61
70
 
62
- src.close
63
- dest.close
64
- end
71
+ private
65
72
 
66
- private
73
+ def next_record_from_file(headers, file)
74
+ return nil if file.eof?
67
75
 
68
- def next_record_from_file(headers, file)
69
- return nil if file.eof?
76
+ headers.zip(file.shift).to_h
77
+ end
70
78
 
71
- Hash[headers.zip(file.shift)]
72
- end
79
+ def update_row?(src_record, dest_record)
80
+ return false unless update_comparison_columns
73
81
 
74
- def update_row?(src_record, dest_record)
75
- return false unless update_comparison_columns
82
+ update_comparison_columns.each do |column_name|
83
+ return true unless src_record[column_name] == dest_record[column_name]
84
+ end
76
85
 
77
- update_comparison_columns.each do |column_name|
78
- return true unless src_record[column_name] == dest_record[column_name]
86
+ false
79
87
  end
80
88
 
81
- false
82
- end
83
-
84
- def strip_bom!(col)
85
- col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
89
+ def strip_bom!(col)
90
+ col.sub!((+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT'), '')
91
+ end
86
92
  end
87
93
  end
@@ -1,63 +1,67 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Utility class for appending data to a csv file.
2
- class CSVUtils::CSVExtender
3
- def initialize(src_csv, dest_csv, csv_options = {})
4
- @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
5
- @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
6
- end
4
+ module CSVUtils
5
+ class CSVExtender
6
+ def initialize(src_csv, dest_csv, csv_options = {})
7
+ @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
8
+ @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
9
+ end
7
10
 
8
- def append(additional_headers)
9
- process(additional_headers) do |current_headers|
10
- while (row = @src_csv.shift)
11
- additional_columns = yield row, current_headers
12
- @dest_csv << (row + additional_columns)
11
+ def append(additional_headers)
12
+ process(additional_headers) do |current_headers|
13
+ while (row = @src_csv.shift)
14
+ additional_columns = yield row, current_headers
15
+ @dest_csv << (row + additional_columns)
16
+ end
13
17
  end
14
18
  end
15
- end
16
19
 
17
- def append_in_batches(additional_headers, batch_size = 1_000)
18
- process(additional_headers) do |current_headers|
19
- batch = []
20
+ def append_in_batches(additional_headers, batch_size = 1_000)
21
+ process(additional_headers) do |current_headers|
22
+ batch = []
23
+
24
+ process_batch_proc = proc do
25
+ additional_rows = yield batch, current_headers
20
26
 
21
- process_batch_proc = Proc.new do
22
- additional_rows = yield batch, current_headers
27
+ batch.each_with_index do |row, idx|
28
+ @dest_csv << (row + additional_rows[idx])
29
+ end
23
30
 
24
- batch.each_with_index do |row, idx|
25
- @dest_csv << (row + additional_rows[idx])
31
+ batch = []
26
32
  end
27
33
 
28
- batch = []
29
- end
34
+ while (row = @src_csv.shift)
35
+ batch << row
30
36
 
31
- while (row = @src_csv.shift)
32
- batch << row
37
+ process_batch_proc.call if batch.size >= batch_size
38
+ end
33
39
 
34
- process_batch_proc.call if batch.size >= batch_size
40
+ process_batch_proc.call if batch.size.positive?
35
41
  end
36
-
37
- process_batch_proc.call if batch.size > 0
38
42
  end
39
- end
40
43
 
41
- private
44
+ private
42
45
 
43
- def process(additional_headers)
44
- current_headers = append_headers(additional_headers)
46
+ def process(additional_headers)
47
+ current_headers = append_headers(additional_headers)
45
48
 
46
- yield current_headers
49
+ yield current_headers
47
50
 
48
- close
49
- end
51
+ close
52
+ end
50
53
 
51
- def close
52
- @src_csv.close
53
- @dest_csv.close
54
- end
54
+ def close
55
+ @src_csv.close
56
+ @dest_csv.close
57
+ end
55
58
 
56
- def append_headers(additional_headers)
57
- return nil unless additional_headers
59
+ def append_headers(additional_headers)
60
+ return nil unless additional_headers
58
61
 
59
- current_headers = @src_csv.shift
60
- @dest_csv << (current_headers + additional_headers)
61
- current_headers
62
+ current_headers = @src_csv.shift
63
+ @dest_csv << (current_headers + additional_headers)
64
+ current_headers
65
+ end
62
66
  end
63
67
  end