csv-utils 0.3.4 → 0.3.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.ruby-version +1 -1
- data/Gemfile.lock +33 -31
- data/bin/csv-explorer +12 -0
- data/bin/csv-find-error +3 -1
- data/bin/csv-validator +17 -13
- data/csv-utils.gemspec +1 -1
- data/lib/csv-utils.rb +2 -0
- data/lib/csv_utils/csv_compare.rb +83 -0
- data/lib/csv_utils/csv_iterator.rb +51 -0
- data/lib/csv_utils/csv_report.rb +10 -5
- data/lib/csv_utils/csv_sort.rb +6 -6
- data/lib/csv_utils/csv_wrapper.rb +5 -1
- metadata +7 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b5b6f30da64b899586ef7d801904b8da9b01b6053f9d975f0702edcc5a7b65e0
|
4
|
+
data.tar.gz: e7e5006c6b63a9b8472e4658c748071619d09adf0ca73a84109b43b8862f1bcd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7ded9a4318f44f77b6c2ccf72a844a7cec734745052c62be5e19c2dbadc8e989b3c89528058fabfd63cbcb59a1575293b9cc87a343bb4d68dbad3051ba04616d
|
7
|
+
data.tar.gz: dd8f0d7cd953eee05489423d0ef55e0b486500d2499d442d7d523636ced12858a2e9cec7ad60ce9d0126516121674d3c1e2a879ea89bb3a93cf7100709204b9d
|
data/.ruby-version
CHANGED
@@ -1 +1 @@
|
|
1
|
-
|
1
|
+
3.0.0
|
data/Gemfile.lock
CHANGED
@@ -1,50 +1,52 @@
|
|
1
1
|
GEM
|
2
2
|
remote: http://rubygems.org/
|
3
3
|
specs:
|
4
|
-
ast (2.4.
|
5
|
-
diff-lcs (1.
|
6
|
-
docile (1.3.
|
4
|
+
ast (2.4.2)
|
5
|
+
diff-lcs (1.4.4)
|
6
|
+
docile (1.3.5)
|
7
7
|
inheritance-helper (0.1.5)
|
8
|
-
parallel (1.
|
9
|
-
parser (
|
8
|
+
parallel (1.20.1)
|
9
|
+
parser (3.0.0.0)
|
10
10
|
ast (~> 2.4.1)
|
11
11
|
rainbow (3.0.0)
|
12
|
-
rake (13.0.
|
13
|
-
regexp_parser (1.
|
12
|
+
rake (13.0.3)
|
13
|
+
regexp_parser (2.1.1)
|
14
14
|
rexml (3.2.4)
|
15
|
-
rspec (3.
|
16
|
-
rspec-core (~> 3.
|
17
|
-
rspec-expectations (~> 3.
|
18
|
-
rspec-mocks (~> 3.
|
19
|
-
rspec-core (3.
|
20
|
-
rspec-support (~> 3.
|
21
|
-
rspec-expectations (3.
|
15
|
+
rspec (3.10.0)
|
16
|
+
rspec-core (~> 3.10.0)
|
17
|
+
rspec-expectations (~> 3.10.0)
|
18
|
+
rspec-mocks (~> 3.10.0)
|
19
|
+
rspec-core (3.10.1)
|
20
|
+
rspec-support (~> 3.10.0)
|
21
|
+
rspec-expectations (3.10.1)
|
22
22
|
diff-lcs (>= 1.2.0, < 2.0)
|
23
|
-
rspec-support (~> 3.
|
24
|
-
rspec-mocks (3.
|
23
|
+
rspec-support (~> 3.10.0)
|
24
|
+
rspec-mocks (3.10.2)
|
25
25
|
diff-lcs (>= 1.2.0, < 2.0)
|
26
|
-
rspec-support (~> 3.
|
27
|
-
rspec-support (3.
|
28
|
-
rubocop (
|
26
|
+
rspec-support (~> 3.10.0)
|
27
|
+
rspec-support (3.10.2)
|
28
|
+
rubocop (1.11.0)
|
29
29
|
parallel (~> 1.10)
|
30
|
-
parser (>=
|
30
|
+
parser (>= 3.0.0.0)
|
31
31
|
rainbow (>= 2.2.2, < 4.0)
|
32
|
-
regexp_parser (>= 1.
|
32
|
+
regexp_parser (>= 1.8, < 3.0)
|
33
33
|
rexml
|
34
|
-
rubocop-ast (>=
|
34
|
+
rubocop-ast (>= 1.2.0, < 2.0)
|
35
35
|
ruby-progressbar (~> 1.7)
|
36
|
-
unicode-display_width (>= 1.4.0, <
|
37
|
-
rubocop-ast (
|
38
|
-
parser (>= 2.7.
|
39
|
-
ruby-progressbar (1.
|
40
|
-
simplecov (0.
|
36
|
+
unicode-display_width (>= 1.4.0, < 3.0)
|
37
|
+
rubocop-ast (1.4.1)
|
38
|
+
parser (>= 2.7.1.5)
|
39
|
+
ruby-progressbar (1.11.0)
|
40
|
+
simplecov (0.21.2)
|
41
41
|
docile (~> 1.1)
|
42
42
|
simplecov-html (~> 0.11)
|
43
|
-
|
44
|
-
|
43
|
+
simplecov_json_formatter (~> 0.1)
|
44
|
+
simplecov-html (0.12.3)
|
45
|
+
simplecov_json_formatter (0.1.2)
|
46
|
+
unicode-display_width (2.0.0)
|
45
47
|
|
46
48
|
PLATFORMS
|
47
|
-
|
49
|
+
x86_64-darwin-20
|
48
50
|
|
49
51
|
DEPENDENCIES
|
50
52
|
inheritance-helper
|
@@ -54,4 +56,4 @@ DEPENDENCIES
|
|
54
56
|
simplecov
|
55
57
|
|
56
58
|
BUNDLED WITH
|
57
|
-
|
59
|
+
2.2.3
|
data/bin/csv-explorer
ADDED
data/bin/csv-find-error
CHANGED
@@ -3,10 +3,12 @@
|
|
3
3
|
require 'csv'
|
4
4
|
require 'shellwords'
|
5
5
|
|
6
|
+
prev_row = nil
|
6
7
|
begin
|
7
|
-
CSV.open(ARGV[0], 'rb').each { }
|
8
|
+
CSV.open(ARGV[0], 'rb').each { |row| prev_row = row }
|
8
9
|
rescue CSV::MalformedCSVError => e
|
9
10
|
puts e.class.to_s + ': ' + e.message
|
11
|
+
puts "previous row was #{prev_row}"
|
10
12
|
if e.message =~ /line (\d+)/
|
11
13
|
lineno = $1.to_i
|
12
14
|
cmd = "csv-readline #{Shellwords.escape(ARGV[0])} #{lineno}"
|
data/bin/csv-validator
CHANGED
@@ -35,20 +35,24 @@ id_column_name = ARGV[1]
|
|
35
35
|
headers = csv.shift
|
36
36
|
strip_bom!(headers[0])
|
37
37
|
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
exit 1
|
43
|
-
end
|
44
|
-
|
45
|
-
id_column_num = headers.index(id_column_name)
|
38
|
+
id_column_name ||= headers[0]
|
39
|
+
unless headers.include?(id_column_name)
|
40
|
+
$stderr.puts("header #{id_column_name} not found in current set of headers")
|
41
|
+
exit 1
|
46
42
|
end
|
47
43
|
|
44
|
+
id_column_num = headers.index(id_column_name)
|
45
|
+
|
48
46
|
out = nil
|
49
|
-
|
50
|
-
out
|
51
|
-
|
47
|
+
out_proc = Proc.new do |row|
|
48
|
+
out ||=
|
49
|
+
begin
|
50
|
+
out = CSV.open('utf8-correctsion.csv', 'wb')
|
51
|
+
out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
|
52
|
+
out
|
53
|
+
end
|
54
|
+
|
55
|
+
out << row
|
52
56
|
end
|
53
57
|
|
54
58
|
csv_lineno = 1
|
@@ -61,12 +65,12 @@ while (row = csv.shift)
|
|
61
65
|
end
|
62
66
|
|
63
67
|
row.each_with_index do |col, idx|
|
64
|
-
next if utf8?(col)
|
68
|
+
next if col.nil? || utf8?(col)
|
65
69
|
|
66
70
|
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: none UTF-8 characters found in \"#{col}\""
|
67
71
|
if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
|
68
72
|
puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
|
69
|
-
|
73
|
+
out_proc.call [row[id_column_num], csv_lineno, (idx + 1), headers[idx], col_utf8_encoded]
|
70
74
|
else
|
71
75
|
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}: unknown character encoding"
|
72
76
|
end
|
data/csv-utils.gemspec
CHANGED
data/lib/csv-utils.rb
CHANGED
@@ -2,7 +2,9 @@ require 'csv'
|
|
2
2
|
|
3
3
|
# Collection of tools for working with CSV files.
|
4
4
|
module CSVUtils
|
5
|
+
autoload :CSVCompare, 'csv_utils/csv_compare'
|
5
6
|
autoload :CSVExtender, 'csv_utils/csv_extender'
|
7
|
+
autoload :CSVIterator, 'csv_utils/csv_iterator'
|
6
8
|
autoload :CSVOptions, 'csv_utils/csv_options'
|
7
9
|
autoload :CSVReport, 'csv_utils/csv_report'
|
8
10
|
autoload :CSVRow, 'csv_utils/csv_row'
|
@@ -0,0 +1,83 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# CSVUtils::CSVCompare purpose is to determine which rows in the secondary_data_file need to be created, deleted or updated
|
4
|
+
# **requires both CSV files to be sorted on the same columns, CSVUtils::CSVSort can accomplish this
|
5
|
+
# In order to receive updates, update_comparison_columns must configured or use inheritance and change the update_row? method
|
6
|
+
class CSVUtils::CSVCompare
|
7
|
+
# primary_data_file is the source of truth
|
8
|
+
# compare_proc used to compare the id column(s)
|
9
|
+
# update_comparison_columns column(s) to compare for equality, ex: updated_at, timestamp, hash
|
10
|
+
# caveat: update_comparison_columns need to be in both csv files
|
11
|
+
attr_reader :primary_data_file,
|
12
|
+
:update_comparison_columns,
|
13
|
+
:compare_proc
|
14
|
+
|
15
|
+
def initialize(primary_data_file, update_comparison_columns=nil, &block)
|
16
|
+
@primary_data_file = primary_data_file
|
17
|
+
@update_comparison_columns = update_comparison_columns
|
18
|
+
@compare_proc = block
|
19
|
+
end
|
20
|
+
|
21
|
+
def compare(secondary_data_file)
|
22
|
+
src = CSV.open(primary_data_file)
|
23
|
+
src_headers = src.shift
|
24
|
+
dest = CSV.open(secondary_data_file)
|
25
|
+
dest_headers = dest.shift
|
26
|
+
|
27
|
+
read_next_src = true
|
28
|
+
read_next_dest = true
|
29
|
+
|
30
|
+
while(!src.eof? || !dest.eof?)
|
31
|
+
src_record = next_record_from_file(src_headers, src) if read_next_src
|
32
|
+
dest_record = next_record_from_file(dest_headers, dest) if read_next_dest
|
33
|
+
|
34
|
+
if ! src_record
|
35
|
+
read_next_src = false
|
36
|
+
read_next_dest = true
|
37
|
+
|
38
|
+
yield :delete, dest_record
|
39
|
+
elsif ! dest_record
|
40
|
+
read_next_src = true
|
41
|
+
read_next_dest = false
|
42
|
+
|
43
|
+
yield :create, src_record
|
44
|
+
elsif compare_proc.call(src_record, dest_record) == 0
|
45
|
+
read_next_src = true
|
46
|
+
read_next_dest = true
|
47
|
+
|
48
|
+
yield(:update, src_record) if update_row?(src_record, dest_record)
|
49
|
+
elsif compare_proc.call(src_record, dest_record) > 0
|
50
|
+
read_next_src = false
|
51
|
+
read_next_dest = true
|
52
|
+
|
53
|
+
yield :delete, dest_record
|
54
|
+
else
|
55
|
+
read_next_src = true
|
56
|
+
read_next_dest = false
|
57
|
+
|
58
|
+
yield :create, src_record
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
src.close
|
63
|
+
dest.close
|
64
|
+
end
|
65
|
+
|
66
|
+
private
|
67
|
+
|
68
|
+
def next_record_from_file(headers, file)
|
69
|
+
return nil if file.eof?
|
70
|
+
|
71
|
+
Hash[headers.zip(file.shift)]
|
72
|
+
end
|
73
|
+
|
74
|
+
def update_row?(src_record, dest_record)
|
75
|
+
return false unless update_comparison_columns
|
76
|
+
|
77
|
+
update_comparison_columns.each do |column_name|
|
78
|
+
return true unless src_record[column_name] == dest_record[column_name]
|
79
|
+
end
|
80
|
+
|
81
|
+
false
|
82
|
+
end
|
83
|
+
end
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module CSVUtils
  # Enumerates the rows of a CSV source, yielding each row as a RowWrapper
  # (a Hash of header => value that also remembers its row number).
  # Includes Enumerable, so map/select/find etc. are available.
  class CSVIterator
    include Enumerable

    # UTF-8 byte order mark, kept as raw bytes so it can be matched against a
    # header cell regardless of that cell's encoding.
    UTF8_BOM = "\xEF\xBB\xBF".b.freeze
    private_constant :UTF8_BOM

    # The raw row (Array) yielded before the current one; nil while the first
    # data row is being yielded, the last row once iteration has finished.
    attr_reader :prev_row

    # Hash of header => value for one CSV row, plus the row's number.
    class RowWrapper < Hash
      attr_accessor :lineno

      # Builds a wrapper by pairing headers with the row's values.
      def self.create(headers, row, lineno)
        row_wrapper = RowWrapper[headers.zip(row)]
        row_wrapper.lineno = lineno
        row_wrapper
      end

      # Renders the non-blank columns, one per line, numbered from 1.
      # nil values (empty or missing CSV fields) are treated as blank.
      def to_pretty_s
        reject { |_, v| v.to_s.strip.empty? }
          .each_with_index
          .map { |(k, v), idx| format(' %-3d %s: %s', idx + 1, k, v) }
          .join("\n") + "\n"
      end
    end

    # @param src_csv the CSV source, handed to CSVUtils::CSVWrapper together
    #   with mode 'rb' and csv_options
    # @param csv_options [Hash] options forwarded to CSVUtils::CSVWrapper
    def initialize(src_csv, csv_options = {})
      @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
    end

    # Rewinds the source and yields every data row as a RowWrapper.
    #
    # @param headers [Array<String>, nil] header names to use; when nil the
    #   first row of the source is consumed as the header row (a leading
    #   UTF-8 BOM is stripped from its first cell)
    # @yield [RowWrapper] one wrapper per data row; lineno counts rows read,
    #   including the header row when it was taken from the source
    # @return [Enumerator, nil] an Enumerator when no block is given
    def each(headers = nil)
      return enum_for(:each, headers) unless block_given?

      @src_csv.rewind
      @prev_row = nil

      lineno = 0
      unless headers
        headers = @src_csv.shift
        return unless headers # empty source: nothing to iterate

        strip_bom!(headers[0])
        lineno += 1
      end

      while (row = @src_csv.shift)
        lineno += 1
        yield RowWrapper.create(headers, row, lineno)
        # updated only after the yield so the block sees the preceding row
        @prev_row = row
      end
    end

    private

    # Removes a leading UTF-8 BOM from col in place. The comparison is done
    # on raw bytes so it works for binary and UTF-8 encoded cells alike.
    def strip_bom!(col)
      return col unless col && col.b.start_with?(UTF8_BOM)

      col.replace(col.byteslice(UTF8_BOM.bytesize, col.bytesize - UTF8_BOM.bytesize))
    end
  end
end
|
data/lib/csv_utils/csv_report.rb
CHANGED
@@ -9,19 +9,20 @@ module CSVUtils
|
|
9
9
|
if csv.is_a?(String)
|
10
10
|
@must_close = true
|
11
11
|
mode = csv_options.delete(:mode) || 'wb'
|
12
|
-
CSV.open(csv, mode, csv_options)
|
12
|
+
CSV.open(csv, mode, **csv_options)
|
13
13
|
else
|
14
14
|
@must_close = false
|
15
15
|
csv
|
16
16
|
end
|
17
17
|
|
18
|
-
|
18
|
+
add_headers(headers) if headers
|
19
|
+
|
20
|
+
generate(&block) if block
|
19
21
|
end
|
20
22
|
|
21
|
-
def generate
|
22
|
-
add_headers(headers) if headers
|
23
|
+
def generate
|
23
24
|
yield self
|
24
|
-
|
25
|
+
close if @must_close
|
25
26
|
end
|
26
27
|
|
27
28
|
def append(csv_row)
|
@@ -37,5 +38,9 @@ module CSVUtils
|
|
37
38
|
def add_headers(csv_row)
|
38
39
|
append(csv_row.is_a?(Array) ? csv_row : csv_row.csv_headers)
|
39
40
|
end
|
41
|
+
|
42
|
+
def close
|
43
|
+
@csv.close
|
44
|
+
end
|
40
45
|
end
|
41
46
|
end
|
data/lib/csv_utils/csv_sort.rb
CHANGED
@@ -25,9 +25,9 @@ class CSVUtils::CSVSort
|
|
25
25
|
private
|
26
26
|
|
27
27
|
def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
|
28
|
-
src1 = CSV.open(src_csv_file1, 'rb', csv_options)
|
29
|
-
src2 = CSV.open(src_csv_file2, 'rb', csv_options)
|
30
|
-
dest = CSV.open(dest_csv_file, 'wb', csv_options)
|
28
|
+
src1 = CSV.open(src_csv_file1, 'rb', **csv_options)
|
29
|
+
src2 = CSV.open(src_csv_file2, 'rb', **csv_options)
|
30
|
+
dest = CSV.open(dest_csv_file, 'wb', **csv_options)
|
31
31
|
|
32
32
|
if @headers
|
33
33
|
dest << @headers
|
@@ -66,7 +66,7 @@ class CSVUtils::CSVSort
|
|
66
66
|
end
|
67
67
|
|
68
68
|
def create_sorted_csv_part_files(batch_size, &block)
|
69
|
-
src = CSV.open(csv_file, 'rb', csv_options)
|
69
|
+
src = CSV.open(csv_file, 'rb', **csv_options)
|
70
70
|
|
71
71
|
@headers = src.shift if has_headers
|
72
72
|
|
@@ -74,7 +74,7 @@ class CSVUtils::CSVSort
|
|
74
74
|
create_batch_part_proc = Proc.new do
|
75
75
|
batch.sort!(&block)
|
76
76
|
@csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
|
77
|
-
CSV.open(@csv_part_files.last, 'wb', csv_options) do |csv|
|
77
|
+
CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
|
78
78
|
csv << @headers if @headers
|
79
79
|
batch.each { |row| csv << row }
|
80
80
|
end
|
@@ -107,6 +107,6 @@ class CSVUtils::CSVSort
|
|
107
107
|
File.unlink(csv_part_file2)
|
108
108
|
end
|
109
109
|
|
110
|
-
FileUtils.mv(@csv_part_files.last, new_csv_file)
|
110
|
+
FileUtils.mv(@csv_part_files.last || @csv_file, new_csv_file)
|
111
111
|
end
|
112
112
|
end
|
@@ -20,7 +20,7 @@ class CSVUtils::CSVWrapper
|
|
20
20
|
def open(csv, mode, csv_options)
|
21
21
|
if csv.is_a?(String)
|
22
22
|
@close_when_done = true
|
23
|
-
@csv = CSV.open(csv, mode, csv_options)
|
23
|
+
@csv = CSV.open(csv, mode, **csv_options)
|
24
24
|
else
|
25
25
|
@close_when_done = false
|
26
26
|
@csv = csv
|
@@ -35,6 +35,10 @@ class CSVUtils::CSVWrapper
|
|
35
35
|
csv.shift
|
36
36
|
end
|
37
37
|
|
38
|
+
def rewind
|
39
|
+
csv.rewind
|
40
|
+
end
|
41
|
+
|
38
42
|
def close
|
39
43
|
csv.close if close_when_done?
|
40
44
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.9
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Doug Youch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-03-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: inheritance-helper
|
@@ -28,6 +28,7 @@ description: Tools for debugging malformed CSV files
|
|
28
28
|
email: dougyouch@gmail.com
|
29
29
|
executables:
|
30
30
|
- csv-change-eol
|
31
|
+
- csv-explorer
|
31
32
|
- csv-find-error
|
32
33
|
- csv-readline
|
33
34
|
- csv-validator
|
@@ -42,12 +43,15 @@ files:
|
|
42
43
|
- LICENSE
|
43
44
|
- README.md
|
44
45
|
- bin/csv-change-eol
|
46
|
+
- bin/csv-explorer
|
45
47
|
- bin/csv-find-error
|
46
48
|
- bin/csv-readline
|
47
49
|
- bin/csv-validator
|
48
50
|
- csv-utils.gemspec
|
49
51
|
- lib/csv-utils.rb
|
52
|
+
- lib/csv_utils/csv_compare.rb
|
50
53
|
- lib/csv_utils/csv_extender.rb
|
54
|
+
- lib/csv_utils/csv_iterator.rb
|
51
55
|
- lib/csv_utils/csv_options.rb
|
52
56
|
- lib/csv_utils/csv_report.rb
|
53
57
|
- lib/csv_utils/csv_row.rb
|
@@ -74,7 +78,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
74
78
|
- !ruby/object:Gem::Version
|
75
79
|
version: '0'
|
76
80
|
requirements: []
|
77
|
-
rubygems_version: 3.
|
81
|
+
rubygems_version: 3.2.3
|
78
82
|
signing_key:
|
79
83
|
specification_version: 4
|
80
84
|
summary: CSV Utils
|