csv-utils 0.2.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/csv-validator +47 -0
- data/csv-utils.gemspec +1 -1
- data/lib/csv-utils.rb +4 -0
- data/lib/csv_utils/csv_extender.rb +63 -0
- data/lib/csv_utils/csv_report.rb +4 -3
- data/lib/csv_utils/csv_row.rb +13 -7
- data/lib/csv_utils/csv_sort.rb +112 -0
- data/lib/csv_utils/csv_transformer.rb +119 -0
- data/lib/csv_utils/csv_wrapper.rb +47 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '019dcd269f036bc21e93019e567e8a0223d8436e87c56519d74af02383640bdf'
|
4
|
+
data.tar.gz: 34b4e8035a533e897c395943e892de0bae16fbdc3847a4990cc0281225d21bd4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e770276baa097fa30551266882910818f331890c6e9bfd7fa92ab01654826a14ad3b67f151f2ce858c816d6d628170174fcc872038636998c15c818dc129130a
|
7
|
+
data.tar.gz: a1689e7404f5d9b70f092b7cda83c8f200df0af1725d27ecc222d798bafcbb1ea1612cbff74dbef493fe427a12ff8330f7828ed608becb4a3d3cd7267179319e
|
data/bin/csv-validator
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
require 'rchardet'
|
5
|
+
|
6
|
+
# Reports whether +str+ holds a valid UTF-8 byte sequence.
#
# The string is re-tagged as UTF-8 in place (its bytes are not changed), so the
# caller's object carries the UTF-8 encoding tag afterwards.
#
# @param str [String, nil] raw column value; CSV returns nil for empty fields
# @return [Boolean] true for nil or valid UTF-8, false otherwise
def utf8?(str)
  # A missing field has no bytes at all, so it cannot contain invalid ones.
  return true if str.nil?

  str.force_encoding('utf-8').valid_encoding?
end
|
11
|
+
|
12
|
+
# Re-interprets +str+ as +current_encoding+ and transcodes it to UTF-8.
#
# The string is re-tagged in place before transcoding. Any failure to
# interpret or convert the bytes is reported as nil rather than an exception,
# so the caller has a single "could not convert" outcome to handle.
#
# @param str [String, nil] raw column value
# @param current_encoding [String, Encoding, nil] encoding the bytes are believed to be in
# @return [String, nil] a UTF-8 copy, or nil when the value or encoding is
#   missing, the encoding name is unknown, the bytes are invalid for that
#   encoding, or a character has no UTF-8 mapping
def convert_to_utf8(str, current_encoding)
  return nil if str.nil? || current_encoding.nil?

  str.force_encoding(current_encoding)
  return nil unless str.valid_encoding?

  str.encode('utf-8')
rescue ArgumentError, EncodingError
  # ArgumentError: unknown encoding name; EncodingError: conversion failed.
  nil
end
|
18
|
+
|
19
|
+
# Asks CharDet for its best guess at the character encoding of +col+.
#
# @param col [String] raw column value
# @return [String, nil] whatever CharDet stored under the 'encoding' key
def detect_encoding(col)
  detection = CharDet.detect(col)
  detection['encoding']
end
|
22
|
+
|
23
|
+
# Validates the CSV file named by the first command line argument.
#
# For every data row this reports, on stderr, a column count that differs from
# the header row and any column that is not valid UTF-8. For an invalid column
# it then tries the encoding guessed by detect_encoding and prints the
# converted value on stdout when that succeeds.
abort "usage: #{File.basename($PROGRAM_NAME)} <csv file>" unless ARGV[0]

# The block form closes the file even when a row fails to parse.
CSV.open(ARGV[0], 'rb') do |csv|
  headers = csv.shift
  csv_lineno = 1

  while (row = csv.shift)
    csv_lineno += 1

    unless row.size == headers.size
      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
    end

    row.each_with_index do |col, idx|
      # Empty fields are parsed as nil and have nothing to validate.
      next if col.nil? || utf8?(col)

      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): none UTF-8 characters found in \"#{col}\""

      # Detect once, before convert_to_utf8 re-tags the string, so the
      # encoding that is reported is the one that was actually used.
      encoding = detect_encoding(col)
      col_utf8_encoded = encoding && convert_to_utf8(col, encoding)

      if col_utf8_encoded
        puts "row(#{csv_lineno}),col(#{idx + 1}): converted to UTF-8 from #{encoding} \"#{col_utf8_encoded}\""
      else
        $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): unknown character encoding"
      end
    end
  end
end
|
data/csv-utils.gemspec
CHANGED
data/lib/csv-utils.rb
CHANGED
@@ -2,7 +2,11 @@ require 'csv'
|
|
2
2
|
|
3
3
|
# Collection of tools for working with CSV files.
#
# Each constant below is registered for autoloading, so its file is only
# required the first time the constant is referenced.
module CSVUtils
  {
    CSVExtender: 'csv_utils/csv_extender',
    CSVOptions: 'csv_utils/csv_options',
    CSVReport: 'csv_utils/csv_report',
    CSVRow: 'csv_utils/csv_row',
    CSVSort: 'csv_utils/csv_sort',
    CSVTransformer: 'csv_utils/csv_transformer',
    CSVWrapper: 'csv_utils/csv_wrapper'
  }.each do |const_name, path|
    autoload const_name, path
  end
end
|
@@ -0,0 +1,63 @@
|
|
1
|
+
module CSVUtils
  # Utility class for appending data to a csv file.
  #
  # Rows are read from a source CSV, the caller's block supplies the extra
  # columns, and the widened rows are written to a destination CSV. Both ends
  # are held through CSVUtils::CSVWrapper and are closed when processing
  # finishes, including when the caller's block raises.
  class CSVExtender
    # @param src_csv [Object] source, handed to CSVWrapper in 'rb' mode
    # @param dest_csv [Object] destination, handed to CSVWrapper in 'wb' mode
    # @param csv_options [Hash] options forwarded to both wrappers
    def initialize(src_csv, dest_csv, csv_options = {})
      @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
      @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
    end

    # Appends columns one row at a time.
    #
    # @param additional_headers [Array, nil] header cells for the new columns;
    #   nil means the source has no header row and none is written
    # @yield [row, current_headers] the source row and the source header row
    #   (nil when additional_headers is nil)
    # @yieldreturn [Array] cells to append to the row
    # @return [nil]
    def append(additional_headers)
      process(additional_headers) do |current_headers|
        while (row = @src_csv.shift)
          additional_columns = yield row, current_headers
          @dest_csv << (row + additional_columns)
        end
      end
    end

    # Appends columns, handing the block up to batch_size rows at a time.
    #
    # @param additional_headers [Array, nil] see #append
    # @param batch_size [Integer] maximum number of rows per yield
    # @yield [batch, current_headers] the rows of the batch and the source header row
    # @yieldreturn [Array<Array>] one array of extra cells per row, in batch order
    # @return [nil]
    def append_in_batches(additional_headers, batch_size = 1_000)
      process(additional_headers) do |current_headers|
        batch = []

        while (row = @src_csv.shift)
          batch << row
          next if batch.size < batch_size

          write_batch(batch, yield(batch, current_headers))
          batch = []
        end

        # Flush the final, partially filled batch.
        write_batch(batch, yield(batch, current_headers)) unless batch.empty?
      end
    end

    private

    # Writes each row of the batch followed by its extra cells.
    def write_batch(batch, additional_rows)
      batch.each_with_index do |row, idx|
        @dest_csv << (row + additional_rows[idx])
      end
    end

    # Copies the header row (if any), yields it, and always closes both ends.
    def process(additional_headers)
      current_headers = append_headers(additional_headers)

      yield current_headers

      nil
    ensure
      close
    end

    # Closes the destination even if closing the source fails.
    def close
      @src_csv.close
    ensure
      @dest_csv.close
    end

    # Reads the source header row and writes it, widened, to the destination.
    # Returns the source header row, or nil when no headers were requested.
    def append_headers(additional_headers)
      return nil unless additional_headers

      current_headers = @src_csv.shift
      @dest_csv << (current_headers + additional_headers)
      current_headers
    end
  end
end
|
data/lib/csv_utils/csv_report.rb
CHANGED
@@ -4,7 +4,7 @@ module CSVUtils
|
|
4
4
|
attr_reader :csv,
|
5
5
|
:must_close
|
6
6
|
|
7
|
-
def initialize(csv, csv_options = {}, &block)
|
7
|
+
def initialize(csv, headers = nil, csv_options = {}, &block)
|
8
8
|
@csv =
|
9
9
|
if csv.is_a?(String)
|
10
10
|
@must_close = true
|
@@ -15,10 +15,11 @@ module CSVUtils
|
|
15
15
|
csv
|
16
16
|
end
|
17
17
|
|
18
|
-
generate(&block) if block
|
18
|
+
generate(headers, &block) if block
|
19
19
|
end
|
20
20
|
|
21
|
-
def generate
|
21
|
+
def generate(headers = nil)
|
22
|
+
add_headers(headers) if headers
|
22
23
|
yield self
|
23
24
|
@csv.close if @must_close
|
24
25
|
end
|
data/lib/csv_utils/csv_row.rb
CHANGED
@@ -23,10 +23,16 @@ module CSVUtils
|
|
23
23
|
|
24
24
|
add_value_to_class_method(:csv_columns, header => options)
|
25
25
|
end
|
26
|
-
end
|
27
26
|
|
28
|
-
|
29
|
-
|
27
|
+
def csv_headers
|
28
|
+
csv_columns.values.map { |column_options| csv_column_header(column_options) }
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def csv_column_header(column_options)
|
34
|
+
column_options[:header]
|
35
|
+
end
|
30
36
|
end
|
31
37
|
|
32
38
|
def csv_row
|
@@ -34,12 +40,12 @@ module CSVUtils
|
|
34
40
|
end
|
35
41
|
alias_method :to_a, :csv_row
|
36
42
|
|
37
|
-
|
38
|
-
|
39
|
-
def csv_column_header(column_options)
|
40
|
-
column_options[:header]
|
43
|
+
def csv_headers
|
44
|
+
self.class.csv_headers
|
41
45
|
end
|
42
46
|
|
47
|
+
private
|
48
|
+
|
43
49
|
def csv_column_value(column_options)
|
44
50
|
if column_options[:proc]
|
45
51
|
instance_eval(&column_options[:proc])
|
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
module CSVUtils
  # Utility class for sorting the rows for a csv file
  #
  # The source is read in batches. Each batch is sorted in memory and written
  # to a "<new_csv_file>.part.N" file, then the part files are merged pairwise
  # into "<new_csv_file>.merge.N" files until one remains, which is moved to
  # new_csv_file. Intermediate files are deleted as they are consumed.
  class CSVSort
    attr_reader :csv_file,
                :new_csv_file,
                :has_headers,
                :csv_options,
                :headers

    # @param csv_file [String] path of the file to sort
    # @param new_csv_file [String] path of the sorted output
    # @param has_headers [Boolean] whether the first row is a header row
    # @param csv_options [Hash] options passed to every CSV.open call
    def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
      @csv_file = csv_file
      @new_csv_file = new_csv_file
      @has_headers = has_headers
      @csv_options = csv_options
      @csv_part_files = []
      @files_to_delete = []
    end

    # Sorts csv_file into new_csv_file.
    #
    # @param batch_size [Integer] number of rows sorted in memory at a time
    # @yield [row_a, row_b] comparator returning a negative, zero or positive
    #   Integer; defaults to comparing the rows with <=>
    def sort(batch_size = 100_000, &block)
      # The merge step needs a callable comparator even when none was given.
      block ||= ->(row_a, row_b) { row_a <=> row_b }
      # Start from a clean list so a second call does not reuse stale paths.
      @csv_part_files = []

      create_sorted_csv_part_files(batch_size, &block)
      merge_csv_part_files(&block)
    end

    private

    # Merges two sorted part files into dest_csv_file using the comparator.
    def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
      CSV.open(src_csv_file1, 'rb', **csv_options) do |src1|
        CSV.open(src_csv_file2, 'rb', **csv_options) do |src2|
          CSV.open(dest_csv_file, 'wb', **csv_options) do |dest|
            if @headers
              dest << @headers
              # Every part file starts with a copy of the header row.
              src1.shift
              src2.shift
            end

            row1 = src1.shift
            row2 = src2.shift

            while row1 || row2
              # Take from the first file when the second is exhausted or the
              # comparator ranks row1 at or before row2.
              if row2.nil? || (row1 && yield(row1, row2) <= 0)
                dest << row1
                row1 = src1.shift
              else
                dest << row2
                row2 = src2.shift
              end
            end
          end
        end
      end
    end

    # Splits the source into sorted part files of at most batch_size rows.
    def create_sorted_csv_part_files(batch_size, &block)
      CSV.open(csv_file, 'rb', **csv_options) do |src|
        @headers = has_headers ? src.shift : nil

        batch = []
        while (row = src.shift)
          batch << row
          next if batch.size < batch_size

          write_sorted_part_file(batch, &block)
          batch = []
        end

        write_sorted_part_file(batch, &block) unless batch.empty?
      end
    end

    # Sorts the batch in place and writes it, with headers, to a new part file.
    def write_sorted_part_file(batch, &block)
      batch.sort!(&block)

      part_file = "#{new_csv_file}.part.#{@csv_part_files.size}"
      @csv_part_files << part_file

      CSV.open(part_file, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
        batch.each { |row| csv << row }
      end
    end

    # Repeatedly merges the two oldest part files until a single file is left.
    def merge_csv_part_files(&block)
      # No data rows means no part files; still produce a valid output file.
      return write_headers_only_file if @csv_part_files.empty?

      file_merge_cnt = 0

      while @csv_part_files.size > 1
        file_merge_cnt += 1

        csv_part_file1 = @csv_part_files.shift
        csv_part_file2 = @csv_part_files.shift
        @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"

        merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)

        File.unlink(csv_part_file1)
        File.unlink(csv_part_file2)
      end

      FileUtils.mv(@csv_part_files.last, new_csv_file)
    end

    # Writes an output file containing only the header row (if there is one).
    def write_headers_only_file
      CSV.open(new_csv_file, 'wb', **csv_options) do |csv|
        csv << @headers if @headers
      end
    end
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
module CSVUtils
  # Transforms a CSV given a series of steps
  #
  # Steps are registered with the chainable methods below and run, in
  # registration order, against each batch of rows when #process is called.
  # Each step remembers the header row that was current when it was
  # registered and receives it as its second block argument.
  class CSVTransformer
    attr_reader :headers

    # @param src_csv [Object] source, handed to CSVWrapper in 'rb' mode
    # @param dest_csv [Object] destination, handed to CSVWrapper in 'wb' mode
    # @param csv_options [Hash] options forwarded to both wrappers
    def initialize(src_csv, dest_csv, csv_options = {})
      @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
      @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
    end

    # Consumes the next source row as the header row.
    def read_headers
      @headers = @src_csv.shift
      self
    end

    # Registers a step whose block receives (batch, headers); its result is
    # passed as the third argument to the blocks of later steps.
    def additional_data(&block)
      steps << [:additional_data, @headers, block]
      self
    end

    # Registers a step keeping only rows for which the block is truthy.
    def select(&block)
      steps << [:select, @headers, block]
      self
    end

    # Registers a step dropping rows for which the block is truthy.
    def reject(&block)
      steps << [:reject, @headers, block]
      self
    end

    # Registers a step replacing each row with the block's result and makes
    # new_headers the current header row for later steps and the output.
    def map(new_headers, &block)
      steps << [:map, @headers, block]
      @headers = new_headers
      self
    end

    # Registers a step appending the block's result to each row.
    #
    # @param additional_headers [Array, nil] header cells for the new columns;
    #   nil clears the header row
    def append(additional_headers, &block)
      steps << [:append, @headers, block]

      # Build a new array so steps registered earlier keep their own header
      # row. When no header row is tracked there is nothing to extend.
      @headers = additional_headers && @headers ? @headers + additional_headers : nil

      self
    end

    # Registers a step calling the block for each row, ignoring its result.
    def each(&block)
      steps << [:each, @headers, block]
      self
    end

    # Replaces the current header row without consuming a source row.
    def set_headers(headers)
      @headers = headers
      self
    end

    # Writes the header row (if any), then runs every step over the source in
    # batches and writes the surviving rows. Both ends are closed afterwards,
    # including when a step raises.
    #
    # @param batch_size [Integer] maximum number of rows per batch
    # @return [nil]
    def process(batch_size = 10_000)
      @dest_csv << @headers if @headers

      batch = []
      while (row = @src_csv.shift)
        batch << row
        next if batch.size < batch_size

        flush_batch(batch)
        batch = []
      end

      # Flush the final, partially filled batch.
      flush_batch(batch) unless batch.empty?

      nil
    ensure
      close
    end

    private

    def steps
      @steps ||= []
    end

    # Closes the destination even if closing the source fails.
    def close
      @src_csv.close
    ensure
      @dest_csv.close
    end

    # Runs all steps over the batch and writes the resulting rows.
    def flush_batch(batch)
      steps.each do |step_type, current_headers, step_block|
        batch = process_step(step_type, current_headers, batch, &step_block)
      end

      batch.each { |row| @dest_csv << row }
    end

    # Applies one step to the batch in place and returns the batch.
    def process_step(step_type, current_headers, batch, &block)
      case step_type
      when :select
        batch.select! { |row| block.call(row, current_headers, @additional_data) }
      when :reject
        batch.reject! { |row| block.call(row, current_headers, @additional_data) }
      when :map
        batch.map! { |row| block.call(row, current_headers, @additional_data) }
      when :append
        batch.map! { |row| row + block.call(row, current_headers, @additional_data) }
      when :additional_data
        @additional_data = block.call(batch, current_headers)
      when :each
        batch.each { |row| block.call(row, current_headers, @additional_data) }
      end

      # select!/reject! return nil when nothing changed, so return the batch itself.
      batch
    end
  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
module CSVUtils
  # Wraps a CSV object, if wrapper opens the csv file it will close it
  #
  # Given a String the wrapper opens that path with CSV.open and owns the
  # handle. Given anything else it uses the object as-is and leaves closing it
  # to whoever created it.
  class CSVWrapper
    attr_reader :csv

    # @param csv [String, Object] file path, or an already opened CSV-like object
    # @param mode [String] file mode, used only when csv is a path
    # @param csv_options [Hash] CSV options, used only when csv is a path
    def initialize(csv, mode, csv_options)
      open(csv, mode, csv_options)
    end

    # Builds a wrapper. With a block, yields the wrapper, closes it afterwards
    # (also when the block raises) and returns the block's value; without a
    # block, returns the wrapper.
    def self.open(file, mode, csv_options = {})
      wrapper = new(file, mode, csv_options)
      return wrapper unless block_given?

      begin
        yield wrapper
      ensure
        wrapper.close
      end
    end

    # Binds the wrapper to a path (opened here) or to an existing CSV object.
    def open(csv, mode, csv_options)
      @close_when_done = csv.is_a?(String)
      # Options must be splatted: CSV.open takes them as keyword arguments, and
      # Ruby 3 no longer converts a trailing positional Hash into keywords.
      @csv = @close_when_done ? CSV.open(csv, mode, **csv_options) : csv
    end

    # Appends a row to the underlying CSV.
    def <<(row)
      csv << row
    end

    # Reads the next row from the underlying CSV.
    def shift
      csv.shift
    end

    # Closes the underlying CSV only if this wrapper opened it.
    def close
      csv.close if close_when_done?
    end

    private

    def close_when_done?
      @close_when_done
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Doug Youch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: inheritance-helper
|
@@ -30,6 +30,7 @@ executables:
|
|
30
30
|
- csv-change-eol
|
31
31
|
- csv-find-error
|
32
32
|
- csv-readline
|
33
|
+
- csv-validator
|
33
34
|
extensions: []
|
34
35
|
extra_rdoc_files: []
|
35
36
|
files:
|
@@ -43,11 +44,16 @@ files:
|
|
43
44
|
- bin/csv-change-eol
|
44
45
|
- bin/csv-find-error
|
45
46
|
- bin/csv-readline
|
47
|
+
- bin/csv-validator
|
46
48
|
- csv-utils.gemspec
|
47
49
|
- lib/csv-utils.rb
|
50
|
+
- lib/csv_utils/csv_extender.rb
|
48
51
|
- lib/csv_utils/csv_options.rb
|
49
52
|
- lib/csv_utils/csv_report.rb
|
50
53
|
- lib/csv_utils/csv_row.rb
|
54
|
+
- lib/csv_utils/csv_sort.rb
|
55
|
+
- lib/csv_utils/csv_transformer.rb
|
56
|
+
- lib/csv_utils/csv_wrapper.rb
|
51
57
|
- script/console
|
52
58
|
homepage: https://github.com/dougyouch/csv-utils
|
53
59
|
licenses:
|