csv-utils 0.2.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/csv-validator +47 -0
- data/csv-utils.gemspec +1 -1
- data/lib/csv-utils.rb +4 -0
- data/lib/csv_utils/csv_extender.rb +63 -0
- data/lib/csv_utils/csv_report.rb +4 -3
- data/lib/csv_utils/csv_row.rb +13 -7
- data/lib/csv_utils/csv_sort.rb +112 -0
- data/lib/csv_utils/csv_transformer.rb +119 -0
- data/lib/csv_utils/csv_wrapper.rb +47 -0
- metadata +8 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: '019dcd269f036bc21e93019e567e8a0223d8436e87c56519d74af02383640bdf'
|
4
|
+
data.tar.gz: 34b4e8035a533e897c395943e892de0bae16fbdc3847a4990cc0281225d21bd4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e770276baa097fa30551266882910818f331890c6e9bfd7fa92ab01654826a14ad3b67f151f2ce858c816d6d628170174fcc872038636998c15c818dc129130a
|
7
|
+
data.tar.gz: a1689e7404f5d9b70f092b7cda83c8f200df0af1725d27ecc222d798bafcbb1ea1612cbff74dbef493fe427a12ff8330f7828ed608becb4a3d3cd7267179319e
|
data/bin/csv-validator
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'csv'
|
4
|
+
require 'rchardet'
|
5
|
+
|
6
|
+
def utf8?(str)
|
7
|
+
str
|
8
|
+
.force_encoding('utf-8')
|
9
|
+
.valid_encoding?
|
10
|
+
end
|
11
|
+
|
12
|
+
def convert_to_utf8(str, current_encoding)
|
13
|
+
str.force_encoding(current_encoding)
|
14
|
+
return nil unless str.valid_encoding?
|
15
|
+
|
16
|
+
str.encode('utf-8')
|
17
|
+
end
|
18
|
+
|
19
|
+
def detect_encoding(col)
|
20
|
+
CharDet.detect(col)['encoding']
|
21
|
+
end
|
22
|
+
|
23
|
+
csv = CSV.open(ARGV[0], 'rb')
|
24
|
+
|
25
|
+
headers = csv.shift
|
26
|
+
csv_lineno = 1
|
27
|
+
|
28
|
+
while (row = csv.shift)
|
29
|
+
csv_lineno += 1
|
30
|
+
|
31
|
+
unless row.size == headers.size
|
32
|
+
$stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
|
33
|
+
end
|
34
|
+
|
35
|
+
row.each_with_index do |col, idx|
|
36
|
+
next if utf8?(col)
|
37
|
+
|
38
|
+
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): none UTF-8 characters found in \"#{col}\""
|
39
|
+
if (col_utf8_encoded = convert_to_utf8(col, detect_encoding(col)))
|
40
|
+
puts "row(#{csv_lineno}),col(#{idx + 1}): converted to UTF-8 from #{detect_encoding(col)} \"#{col_utf8_encoded}\""
|
41
|
+
else
|
42
|
+
$stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): unknown character encoding"
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
csv.close
|
data/csv-utils.gemspec
CHANGED
data/lib/csv-utils.rb
CHANGED
@@ -2,7 +2,11 @@ require 'csv'
|
|
2
2
|
|
3
3
|
# Collection of tools for working with CSV files.
|
4
4
|
module CSVUtils
|
5
|
+
autoload :CSVExtender, 'csv_utils/csv_extender'
|
5
6
|
autoload :CSVOptions, 'csv_utils/csv_options'
|
6
7
|
autoload :CSVReport, 'csv_utils/csv_report'
|
7
8
|
autoload :CSVRow, 'csv_utils/csv_row'
|
9
|
+
autoload :CSVSort, 'csv_utils/csv_sort'
|
10
|
+
autoload :CSVTransformer, 'csv_utils/csv_transformer'
|
11
|
+
autoload :CSVWrapper, 'csv_utils/csv_wrapper'
|
8
12
|
end
|
data/lib/csv_utils/csv_extender.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# Utility class for appending data to a csv file.
|
2
|
+
class CSVUtils::CSVExtender
|
3
|
+
def initialize(src_csv, dest_csv, csv_options = {})
|
4
|
+
@src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
|
5
|
+
@dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
|
6
|
+
end
|
7
|
+
|
8
|
+
def append(additional_headers)
|
9
|
+
process(additional_headers) do |current_headers|
|
10
|
+
while (row = @src_csv.shift)
|
11
|
+
additional_columns = yield row, current_headers
|
12
|
+
@dest_csv << (row + additional_columns)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
def append_in_batches(additional_headers, batch_size = 1_000)
|
18
|
+
process(additional_headers) do |current_headers|
|
19
|
+
batch = []
|
20
|
+
|
21
|
+
process_batch_proc = Proc.new do
|
22
|
+
additional_rows = yield batch, current_headers
|
23
|
+
|
24
|
+
batch.each_with_index do |row, idx|
|
25
|
+
@dest_csv << (row + additional_rows[idx])
|
26
|
+
end
|
27
|
+
|
28
|
+
batch = []
|
29
|
+
end
|
30
|
+
|
31
|
+
while (row = @src_csv.shift)
|
32
|
+
batch << row
|
33
|
+
|
34
|
+
process_batch_proc.call if batch.size >= batch_size
|
35
|
+
end
|
36
|
+
|
37
|
+
process_batch_proc.call if batch.size > 0
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def process(additional_headers)
|
44
|
+
current_headers = append_headers(additional_headers)
|
45
|
+
|
46
|
+
yield current_headers
|
47
|
+
|
48
|
+
close
|
49
|
+
end
|
50
|
+
|
51
|
+
def close
|
52
|
+
@src_csv.close
|
53
|
+
@dest_csv.close
|
54
|
+
end
|
55
|
+
|
56
|
+
def append_headers(additional_headers)
|
57
|
+
return nil unless additional_headers
|
58
|
+
|
59
|
+
current_headers = @src_csv.shift
|
60
|
+
@dest_csv << (current_headers + additional_headers)
|
61
|
+
current_headers
|
62
|
+
end
|
63
|
+
end
|
data/lib/csv_utils/csv_report.rb
CHANGED
@@ -4,7 +4,7 @@ module CSVUtils
|
|
4
4
|
attr_reader :csv,
|
5
5
|
:must_close
|
6
6
|
|
7
|
-
def initialize(csv, csv_options = {}, &block)
|
7
|
+
def initialize(csv, headers = nil, csv_options = {}, &block)
|
8
8
|
@csv =
|
9
9
|
if csv.is_a?(String)
|
10
10
|
@must_close = true
|
@@ -15,10 +15,11 @@ module CSVUtils
|
|
15
15
|
csv
|
16
16
|
end
|
17
17
|
|
18
|
-
generate(&block) if block
|
18
|
+
generate(headers, &block) if block
|
19
19
|
end
|
20
20
|
|
21
|
-
def generate
|
21
|
+
def generate(headers = nil)
|
22
|
+
add_headers(headers) if headers
|
22
23
|
yield self
|
23
24
|
@csv.close if @must_close
|
24
25
|
end
|
data/lib/csv_utils/csv_row.rb
CHANGED
@@ -23,10 +23,16 @@ module CSVUtils
|
|
23
23
|
|
24
24
|
add_value_to_class_method(:csv_columns, header => options)
|
25
25
|
end
|
26
|
-
end
|
27
26
|
|
28
|
-
|
29
|
-
|
27
|
+
def csv_headers
|
28
|
+
csv_columns.values.map { |column_options| csv_column_header(column_options) }
|
29
|
+
end
|
30
|
+
|
31
|
+
private
|
32
|
+
|
33
|
+
def csv_column_header(column_options)
|
34
|
+
column_options[:header]
|
35
|
+
end
|
30
36
|
end
|
31
37
|
|
32
38
|
def csv_row
|
@@ -34,12 +40,12 @@ module CSVUtils
|
|
34
40
|
end
|
35
41
|
alias_method :to_a, :csv_row
|
36
42
|
|
37
|
-
|
38
|
-
|
39
|
-
def csv_column_header(column_options)
|
40
|
-
column_options[:header]
|
43
|
+
def csv_headers
|
44
|
+
self.class.csv_headers
|
41
45
|
end
|
42
46
|
|
47
|
+
private
|
48
|
+
|
43
49
|
def csv_column_value(column_options)
|
44
50
|
if column_options[:proc]
|
45
51
|
instance_eval(&column_options[:proc])
|
data/lib/csv_utils/csv_sort.rb
ADDED
@@ -0,0 +1,112 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
|
3
|
+
# Utility class for sorting the rows for a csv file
|
4
|
+
class CSVUtils::CSVSort
|
5
|
+
attr_reader :csv_file,
|
6
|
+
:new_csv_file,
|
7
|
+
:has_headers,
|
8
|
+
:csv_options,
|
9
|
+
:headers
|
10
|
+
|
11
|
+
def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
|
12
|
+
@csv_file = csv_file
|
13
|
+
@new_csv_file = new_csv_file
|
14
|
+
@has_headers = has_headers
|
15
|
+
@csv_options = csv_options
|
16
|
+
@csv_part_files = []
|
17
|
+
@files_to_delete = []
|
18
|
+
end
|
19
|
+
|
20
|
+
def sort(batch_size = 100_000, &block)
|
21
|
+
create_sorted_csv_part_files(batch_size, &block)
|
22
|
+
merge_csv_part_files(&block)
|
23
|
+
end
|
24
|
+
|
25
|
+
private
|
26
|
+
|
27
|
+
def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
|
28
|
+
src1 = CSV.open(src_csv_file1, 'rb', csv_options)
|
29
|
+
src2 = CSV.open(src_csv_file2, 'rb', csv_options)
|
30
|
+
dest = CSV.open(dest_csv_file, 'wb', csv_options)
|
31
|
+
|
32
|
+
if @headers
|
33
|
+
dest << @headers
|
34
|
+
src1.shift
|
35
|
+
src2.shift
|
36
|
+
end
|
37
|
+
|
38
|
+
row1 = src1.shift
|
39
|
+
row2 = src2.shift
|
40
|
+
|
41
|
+
append_row1_proc = Proc.new do
|
42
|
+
dest << row1
|
43
|
+
row1 = src1.shift
|
44
|
+
end
|
45
|
+
|
46
|
+
append_row2_proc = Proc.new do
|
47
|
+
dest << row2
|
48
|
+
row2 = src2.shift
|
49
|
+
end
|
50
|
+
|
51
|
+
while row1 || row2
|
52
|
+
if row1.nil?
|
53
|
+
append_row2_proc.call
|
54
|
+
elsif row2.nil?
|
55
|
+
append_row1_proc.call
|
56
|
+
elsif yield(row1, row2) <= 0
|
57
|
+
append_row1_proc.call
|
58
|
+
else
|
59
|
+
append_row2_proc.call
|
60
|
+
end
|
61
|
+
end
|
62
|
+
|
63
|
+
src1.close
|
64
|
+
src2.close
|
65
|
+
dest.close
|
66
|
+
end
|
67
|
+
|
68
|
+
def create_sorted_csv_part_files(batch_size, &block)
|
69
|
+
src = CSV.open(csv_file, 'rb', csv_options)
|
70
|
+
|
71
|
+
@headers = src.shift if has_headers
|
72
|
+
|
73
|
+
batch = []
|
74
|
+
create_batch_part_proc = Proc.new do
|
75
|
+
batch.sort!(&block)
|
76
|
+
@csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
|
77
|
+
CSV.open(@csv_part_files.last, 'wb', csv_options) do |csv|
|
78
|
+
csv << @headers if @headers
|
79
|
+
batch.each { |row| csv << row }
|
80
|
+
end
|
81
|
+
batch = []
|
82
|
+
end
|
83
|
+
|
84
|
+
while (row = src.shift)
|
85
|
+
batch << row
|
86
|
+
create_batch_part_proc.call if batch.size >= batch_size
|
87
|
+
end
|
88
|
+
|
89
|
+
create_batch_part_proc.call if batch.size > 0
|
90
|
+
|
91
|
+
src.close
|
92
|
+
end
|
93
|
+
|
94
|
+
def merge_csv_part_files(&block)
|
95
|
+
file_merge_cnt = 0
|
96
|
+
|
97
|
+
while @csv_part_files.size > 1
|
98
|
+
file_merge_cnt += 1
|
99
|
+
|
100
|
+
csv_part_file1 = @csv_part_files.shift
|
101
|
+
csv_part_file2 = @csv_part_files.shift
|
102
|
+
@csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
|
103
|
+
|
104
|
+
merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)
|
105
|
+
|
106
|
+
File.unlink(csv_part_file1)
|
107
|
+
File.unlink(csv_part_file2)
|
108
|
+
end
|
109
|
+
|
110
|
+
FileUtils.mv(@csv_part_files.last, new_csv_file)
|
111
|
+
end
|
112
|
+
end
|
data/lib/csv_utils/csv_transformer.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
# Transforms a CSV given a series of steps
|
2
|
+
class CSVUtils::CSVTransformer
|
3
|
+
attr_reader :headers
|
4
|
+
|
5
|
+
def initialize(src_csv, dest_csv, csv_options = {})
|
6
|
+
@src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
|
7
|
+
@dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
|
8
|
+
end
|
9
|
+
|
10
|
+
def read_headers
|
11
|
+
@headers = @src_csv.shift
|
12
|
+
self
|
13
|
+
end
|
14
|
+
|
15
|
+
def additional_data(&block)
|
16
|
+
steps << [:additional_data, @headers, block]
|
17
|
+
self
|
18
|
+
end
|
19
|
+
|
20
|
+
def select(&block)
|
21
|
+
steps << [:select, @headers, block]
|
22
|
+
self
|
23
|
+
end
|
24
|
+
|
25
|
+
def reject(&block)
|
26
|
+
steps << [:reject, @headers, block]
|
27
|
+
self
|
28
|
+
end
|
29
|
+
|
30
|
+
def map(new_headers, &block)
|
31
|
+
steps << [:map, @headers, block]
|
32
|
+
@headers = new_headers
|
33
|
+
self
|
34
|
+
end
|
35
|
+
|
36
|
+
def append(additional_headers, &block)
|
37
|
+
steps << [:append, @headers, block]
|
38
|
+
|
39
|
+
if additional_headers
|
40
|
+
@headers += additional_headers
|
41
|
+
else
|
42
|
+
@headers = nil
|
43
|
+
end
|
44
|
+
|
45
|
+
self
|
46
|
+
end
|
47
|
+
|
48
|
+
def each(&block)
|
49
|
+
steps << [:each, @headers, block]
|
50
|
+
self
|
51
|
+
end
|
52
|
+
|
53
|
+
def set_headers(headers)
|
54
|
+
@headers = headers
|
55
|
+
self
|
56
|
+
end
|
57
|
+
|
58
|
+
def process(batch_size = 10_000, &block)
|
59
|
+
batch = []
|
60
|
+
|
61
|
+
@dest_csv << @headers if @headers
|
62
|
+
|
63
|
+
steps_proc = Proc.new do
|
64
|
+
steps.each do |step_type, current_headers, proc|
|
65
|
+
batch = process_step(step_type, current_headers, batch, &proc)
|
66
|
+
end
|
67
|
+
|
68
|
+
batch.each { |row| @dest_csv << row }
|
69
|
+
|
70
|
+
batch = []
|
71
|
+
end
|
72
|
+
|
73
|
+
while (row = @src_csv.shift)
|
74
|
+
batch << row
|
75
|
+
steps_proc.call if batch.size >= batch_size
|
76
|
+
end
|
77
|
+
|
78
|
+
steps_proc.call if batch.size > 0
|
79
|
+
|
80
|
+
@src_csv.close
|
81
|
+
@dest_csv.close
|
82
|
+
end
|
83
|
+
|
84
|
+
private
|
85
|
+
|
86
|
+
def steps
|
87
|
+
@steps ||= []
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
def process_step(step_type, current_headers, batch, &block)
|
92
|
+
case step_type
|
93
|
+
when :select
|
94
|
+
batch.select! do |row|
|
95
|
+
block.call row, current_headers, @additional_data
|
96
|
+
end
|
97
|
+
when :reject
|
98
|
+
batch.reject! do |row|
|
99
|
+
block.call row, current_headers, @additional_data
|
100
|
+
end
|
101
|
+
when :map
|
102
|
+
batch.map! do |row|
|
103
|
+
block.call row, current_headers, @additional_data
|
104
|
+
end
|
105
|
+
when :append
|
106
|
+
batch.map! do |row|
|
107
|
+
row + block.call(row, current_headers, @additional_data)
|
108
|
+
end
|
109
|
+
when :additional_data
|
110
|
+
@additional_data = block.call(batch, current_headers)
|
111
|
+
when :each
|
112
|
+
batch.each do |row|
|
113
|
+
block.call(row, current_headers, @additional_data)
|
114
|
+
end
|
115
|
+
end
|
116
|
+
|
117
|
+
batch
|
118
|
+
end
|
119
|
+
end
|
data/lib/csv_utils/csv_wrapper.rb
ADDED
@@ -0,0 +1,47 @@
|
|
1
|
+
# Wraps a CSV object, if wrapper opens the csv file it will close it
|
2
|
+
class CSVUtils::CSVWrapper
|
3
|
+
attr_reader :csv
|
4
|
+
|
5
|
+
def initialize(csv, mode, csv_options)
|
6
|
+
open(csv, mode, csv_options)
|
7
|
+
end
|
8
|
+
|
9
|
+
def self.open(file, mode, csv_options = {})
|
10
|
+
csv = new(file, mode, csv_options)
|
11
|
+
|
12
|
+
if block_given?
|
13
|
+
yield csv
|
14
|
+
csv.close
|
15
|
+
else
|
16
|
+
csv
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
def open(csv, mode, csv_options)
|
21
|
+
if csv.is_a?(String)
|
22
|
+
@close_when_done = true
|
23
|
+
@csv = CSV.open(csv, mode, csv_options)
|
24
|
+
else
|
25
|
+
@close_when_done = false
|
26
|
+
@csv = csv
|
27
|
+
end
|
28
|
+
end
|
29
|
+
|
30
|
+
def <<(row)
|
31
|
+
csv << row
|
32
|
+
end
|
33
|
+
|
34
|
+
def shift
|
35
|
+
csv.shift
|
36
|
+
end
|
37
|
+
|
38
|
+
def close
|
39
|
+
csv.close if close_when_done?
|
40
|
+
end
|
41
|
+
|
42
|
+
private
|
43
|
+
|
44
|
+
def close_when_done?
|
45
|
+
@close_when_done
|
46
|
+
end
|
47
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: csv-utils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.0
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Doug Youch
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-
|
11
|
+
date: 2020-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: inheritance-helper
|
@@ -30,6 +30,7 @@ executables:
|
|
30
30
|
- csv-change-eol
|
31
31
|
- csv-find-error
|
32
32
|
- csv-readline
|
33
|
+
- csv-validator
|
33
34
|
extensions: []
|
34
35
|
extra_rdoc_files: []
|
35
36
|
files:
|
@@ -43,11 +44,16 @@ files:
|
|
43
44
|
- bin/csv-change-eol
|
44
45
|
- bin/csv-find-error
|
45
46
|
- bin/csv-readline
|
47
|
+
- bin/csv-validator
|
46
48
|
- csv-utils.gemspec
|
47
49
|
- lib/csv-utils.rb
|
50
|
+
- lib/csv_utils/csv_extender.rb
|
48
51
|
- lib/csv_utils/csv_options.rb
|
49
52
|
- lib/csv_utils/csv_report.rb
|
50
53
|
- lib/csv_utils/csv_row.rb
|
54
|
+
- lib/csv_utils/csv_sort.rb
|
55
|
+
- lib/csv_utils/csv_transformer.rb
|
56
|
+
- lib/csv_utils/csv_wrapper.rb
|
51
57
|
- script/console
|
52
58
|
homepage: https://github.com/dougyouch/csv-utils
|
53
59
|
licenses:
|