csv-utils 0.2.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ecb75f60c8e9b9db4cc3eb0e4ca3a0ac53aad67726ed995b7e8c341cd0dc76a3
4
- data.tar.gz: 5138b5cc82eec0b7667c9e3435c2662bbfa1de51e469582372a158b943e57d7f
3
+ metadata.gz: '019dcd269f036bc21e93019e567e8a0223d8436e87c56519d74af02383640bdf'
4
+ data.tar.gz: 34b4e8035a533e897c395943e892de0bae16fbdc3847a4990cc0281225d21bd4
5
5
  SHA512:
6
- metadata.gz: 1a7d685b0db28805833596b32793fca968c6a5a1f57346223b487579622423d0151a122253c69806e377015fe0cf9cf02381bafbee37cb0ba54fa290a857c1cc
7
- data.tar.gz: 7c572f9e7c74d626084612afa188bb15b036377bbac16ed5374d7fcff300e58fab8100a77729c2fa428e4294ac6b3b643feb00b2e87e7de78883c8548193de54
6
+ metadata.gz: e770276baa097fa30551266882910818f331890c6e9bfd7fa92ab01654826a14ad3b67f151f2ce858c816d6d628170174fcc872038636998c15c818dc129130a
7
+ data.tar.gz: a1689e7404f5d9b70f092b7cda83c8f200df0af1725d27ecc222d798bafcbb1ea1612cbff74dbef493fe427a12ff8330f7828ed608becb4a3d3cd7267179319e
@@ -0,0 +1,47 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'csv'
4
+ require 'rchardet'
5
+
6
# Reports whether +str+ holds a valid UTF-8 byte sequence.
#
# The string is re-tagged as UTF-8 in place (force_encoding changes the
# receiver's encoding label, not its bytes) before it is validated.
#
# @param str [String, nil] raw field value; nil is what CSV yields for an
#   empty unquoted field
# @return [Boolean] true when str is nil or its bytes are valid UTF-8
def utf8?(str)
  # An empty field has no bytes that could be invalid.
  return true if str.nil?

  str
    .force_encoding('utf-8')
    .valid_encoding?
end
11
+
12
# Re-interprets the bytes of +str+ as +current_encoding+ and transcodes them
# to UTF-8.
#
# Works on a copy, so the caller's string keeps its bytes and encoding label.
#
# @param str [String, nil] raw bytes to convert
# @param current_encoding [String, Encoding, nil] encoding the bytes are
#   believed to be in; nil means the encoding could not be determined
# @return [String, nil] the UTF-8 string, or nil when the input or encoding is
#   missing, the encoding name is unknown, the bytes are not valid in that
#   encoding, or they cannot be transcoded to UTF-8
def convert_to_utf8(str, current_encoding)
  return nil if str.nil? || current_encoding.nil?

  candidate = str.dup.force_encoding(current_encoding)
  return nil unless candidate.valid_encoding?

  candidate.encode('utf-8')
rescue ArgumentError, EncodingError
  # ArgumentError: force_encoding was given an encoding name Ruby does not know.
  # EncodingError: no converter exists or a byte has no UTF-8 mapping.
  nil
end
18
+
19
# Asks CharDet (from the rchardet gem required by this script) to guess the
# character encoding of +col+ and returns the value stored under the
# 'encoding' key of the detection result.
def detect_encoding(col)
  detection = CharDet.detect(col)
  detection['encoding']
end
22
+
23
# Validates the CSV file named by the first command-line argument.
#
# Reports on stderr every row whose column count differs from the header row
# and every field that is not valid UTF-8. For such a field the detected
# encoding is used to print a UTF-8 conversion on stdout when possible.
abort 'usage: csv-validator <csv-file>' unless ARGV[0]

# The block form closes the file even when parsing raises part-way through.
CSV.open(ARGV[0], 'rb') do |csv|
  headers = csv.shift
  csv_lineno = 1

  while (row = csv.shift)
    csv_lineno += 1

    unless row.size == headers.size
      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
    end

    row.each_with_index do |col, idx|
      # Empty unquoted fields are parsed as nil and have no bytes to validate.
      next if col.nil? || utf8?(col)

      $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): non UTF-8 characters found in \"#{col}\""

      # Detect once and reuse the result for both the conversion and the message.
      # A nil detection result means the encoding is unknown, so skip conversion.
      encoding = detect_encoding(col)
      col_utf8_encoded = encoding && convert_to_utf8(col, encoding)

      if col_utf8_encoded
        puts "row(#{csv_lineno}),col(#{idx + 1}): converted to UTF-8 from #{encoding} \"#{col_utf8_encoded}\""
      else
        $stderr.puts "row(#{csv_lineno}),col(#{idx + 1}): unknown character encoding"
      end
    end
  end
end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.2.0'
5
+ s.version = '0.3.1'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
@@ -2,7 +2,11 @@ require 'csv'
2
2
 
3
3
  # Collection of tools for working with CSV files.
4
4
  module CSVUtils
5
+ autoload :CSVExtender, 'csv_utils/csv_extender'
5
6
  autoload :CSVOptions, 'csv_utils/csv_options'
6
7
  autoload :CSVReport, 'csv_utils/csv_report'
7
8
  autoload :CSVRow, 'csv_utils/csv_row'
9
+ autoload :CSVSort, 'csv_utils/csv_sort'
10
+ autoload :CSVTransformer, 'csv_utils/csv_transformer'
11
+ autoload :CSVWrapper, 'csv_utils/csv_wrapper'
8
12
  end
@@ -0,0 +1,63 @@
1
# Utility class for appending data to a csv file.
#
# Rows are read from a source CSV and written to a destination CSV with extra
# columns supplied by the caller's block. Both CSVs are wrapped in
# CSVUtils::CSVWrapper and are closed once processing finishes, including when
# the caller's block raises.
class CSVUtils::CSVExtender
  # @param src_csv [String, Object] source handed to CSVUtils::CSVWrapper (opened with mode 'rb')
  # @param dest_csv [String, Object] destination handed to CSVUtils::CSVWrapper (opened with mode 'wb')
  # @param csv_options [Hash] options forwarded to both wrappers
  def initialize(src_csv, dest_csv, csv_options = {})
    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
  end

  # Copies every source row to the destination with extra columns appended.
  #
  # @param additional_headers [Array, nil] headers for the new columns; when
  #   nil no header row is read from the source or written to the destination
  # @yield [row, current_headers] called once per row; current_headers is the
  #   source header row (nil when additional_headers is nil)
  # @yieldreturn [Array] the columns to append to the row
  # @return [nil]
  def append(additional_headers)
    process(additional_headers) do |current_headers|
      while (row = @src_csv.shift)
        additional_columns = yield row, current_headers
        @dest_csv << (row + additional_columns)
      end
    end
  end

  # Same as #append, but the block receives the rows in batches.
  #
  # @param additional_headers [Array, nil] see #append
  # @param batch_size [Integer] number of rows collected before the block is called
  # @yield [batch, current_headers] called once per batch of rows
  # @yieldreturn [Array<Array>] the columns to append, one entry per row of
  #   the batch and in the same order
  # @return [nil]
  def append_in_batches(additional_headers, batch_size = 1_000)
    process(additional_headers) do |current_headers|
      batch = []

      while (row = @src_csv.shift)
        batch << row
        next if batch.size < batch_size

        write_batch(batch, yield(batch, current_headers))
        # Start a fresh array so the one handed to the block is never reused.
        batch = []
      end

      # Flush the final, partially filled batch.
      write_batch(batch, yield(batch, current_headers)) unless batch.empty?
    end
  end

  private

  # Writes the header row (if any), runs the caller's row loop and always
  # closes both CSVs afterwards.
  def process(additional_headers)
    current_headers = append_headers(additional_headers)

    yield current_headers

    nil
  ensure
    close
  end

  # Writes each row of the batch followed by its matching additional columns.
  def write_batch(batch, additional_rows)
    batch.each_with_index do |row, idx|
      @dest_csv << (row + additional_rows[idx])
    end
  end

  # Closes the destination even when closing the source fails.
  def close
    @src_csv.close
  ensure
    @dest_csv.close
  end

  # Reads the source header row and writes it, extended by additional_headers,
  # to the destination. Returns the source header row, or nil when
  # additional_headers is nil (nothing is read or written in that case).
  def append_headers(additional_headers)
    return nil unless additional_headers

    current_headers = @src_csv.shift
    @dest_csv << (current_headers + additional_headers)
    current_headers
  end
end
@@ -4,7 +4,7 @@ module CSVUtils
4
4
  attr_reader :csv,
5
5
  :must_close
6
6
 
7
- def initialize(csv, csv_options = {}, &block)
7
+ def initialize(csv, headers = nil, csv_options = {}, &block)
8
8
  @csv =
9
9
  if csv.is_a?(String)
10
10
  @must_close = true
@@ -15,10 +15,11 @@ module CSVUtils
15
15
  csv
16
16
  end
17
17
 
18
- generate(&block) if block
18
+ generate(headers, &block) if block
19
19
  end
20
20
 
21
- def generate
21
+ def generate(headers = nil)
22
+ add_headers(headers) if headers
22
23
  yield self
23
24
  @csv.close if @must_close
24
25
  end
@@ -23,10 +23,16 @@ module CSVUtils
23
23
 
24
24
  add_value_to_class_method(:csv_columns, header => options)
25
25
  end
26
- end
27
26
 
28
- def csv_headers
29
- self.class.csv_columns.values.map { |column_options| csv_column_header(column_options) }
27
+ def csv_headers
28
+ csv_columns.values.map { |column_options| csv_column_header(column_options) }
29
+ end
30
+
31
+ private
32
+
33
+ def csv_column_header(column_options)
34
+ column_options[:header]
35
+ end
30
36
  end
31
37
 
32
38
  def csv_row
@@ -34,12 +40,12 @@ module CSVUtils
34
40
  end
35
41
  alias_method :to_a, :csv_row
36
42
 
37
- private
38
-
39
- def csv_column_header(column_options)
40
- column_options[:header]
43
+ def csv_headers
44
+ self.class.csv_headers
41
45
  end
42
46
 
47
+ private
48
+
43
49
  def csv_column_value(column_options)
44
50
  if column_options[:proc]
45
51
  instance_eval(&column_options[:proc])
@@ -0,0 +1,112 @@
1
+ require 'fileutils'
2
+
3
# Utility class for sorting the rows for a csv file.
#
# The source is read in batches; each batch is sorted in memory and written to
# a "<new_csv_file>.part.N" file. The part files are then merged two at a time
# into "<new_csv_file>.merge.N" files until a single file is left, which is
# moved to new_csv_file.
class CSVUtils::CSVSort
  attr_reader :csv_file,
              :new_csv_file,
              :has_headers,
              :csv_options,
              :headers

  # @param csv_file [String] path of the csv file to sort
  # @param new_csv_file [String] path the sorted csv file is written to
  # @param has_headers [Boolean] whether the first row is a header row that
  #   must stay first
  # @param csv_options [Hash] options forwarded to every CSV.open call
  def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
    @csv_file = csv_file
    @new_csv_file = new_csv_file
    @has_headers = has_headers
    @csv_options = csv_options
    @csv_part_files = []
  end

  # Sorts csv_file into new_csv_file.
  #
  # @param batch_size [Integer] maximum number of rows sorted in memory at once
  # @yield [row1, row2] comparator returning a negative number, zero or a
  #   positive number, like <=>; defaults to comparing the rows with <=>
  def sort(batch_size = 100_000, &block)
    # The merge step needs a comparator it can call, so supply the same default
    # ordering that Array#sort! uses when no block is given.
    block ||= proc { |row1, row2| row1 <=> row2 }
    # Forget part files left over from a previous run of this instance.
    @csv_part_files = []

    create_sorted_csv_part_files(batch_size, &block)
    merge_csv_part_files(&block)
  end

  private

  # Opens a csv file with the configured options. CSV.open takes its options
  # as keywords, so the Hash is splatted rather than passed positionally
  # (a positional Hash raises ArgumentError on Ruby 3+).
  def open_csv(path, mode, &block)
    CSV.open(path, mode, **csv_options, &block)
  end

  # Merges two sorted csv files into dest_csv_file using the comparator block.
  def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
    src1 = src2 = dest = nil
    src1 = open_csv(src_csv_file1, 'rb')
    src2 = open_csv(src_csv_file2, 'rb')
    dest = open_csv(dest_csv_file, 'wb')

    if @headers
      dest << @headers
      # Every part file starts with the header row; skip it in both sources.
      src1.shift
      src2.shift
    end

    row1 = src1.shift
    row2 = src2.shift

    while row1 || row2
      # Take from src1 when src2 is exhausted or row1 does not sort after row2.
      if row2.nil? || (row1 && yield(row1, row2) <= 0)
        dest << row1
        row1 = src1.shift
      else
        dest << row2
        row2 = src2.shift
      end
    end
  ensure
    # Close whatever was opened, also when the comparator or the parser raised.
    [src1, src2, dest].compact.each(&:close)
  end

  # Splits the source into sorted part files of at most batch_size rows.
  def create_sorted_csv_part_files(batch_size, &block)
    open_csv(csv_file, 'rb') do |src|
      @headers = src.shift if has_headers

      batch = []
      while (row = src.shift)
        batch << row
        next if batch.size < batch_size

        write_csv_part_file(batch, &block)
        batch = []
      end

      # Write the final, partially filled batch.
      write_csv_part_file(batch, &block) unless batch.empty?
    end
  end

  # Sorts the batch in place and writes it, preceded by the headers when
  # present, to the next part file.
  def write_csv_part_file(batch, &block)
    batch.sort!(&block)

    part_file = "#{new_csv_file}.part.#{@csv_part_files.size}"
    @csv_part_files << part_file

    open_csv(part_file, 'wb') do |csv|
      csv << @headers if @headers
      batch.each { |row| csv << row }
    end
  end

  # Merges the part files pairwise until one is left and moves it into place.
  def merge_csv_part_files(&block)
    # A source without data rows produces no part file to move.
    return write_csv_without_rows if @csv_part_files.empty?

    file_merge_cnt = 0

    while @csv_part_files.size > 1
      file_merge_cnt += 1

      csv_part_file1 = @csv_part_files.shift
      csv_part_file2 = @csv_part_files.shift
      merged_file = "#{new_csv_file}.merge.#{file_merge_cnt}"
      @csv_part_files << merged_file

      merge_sort_csv_files(csv_part_file1, csv_part_file2, merged_file, &block)

      File.unlink(csv_part_file1)
      File.unlink(csv_part_file2)
    end

    FileUtils.mv(@csv_part_files.last, new_csv_file)
  end

  # Writes new_csv_file containing only the header row (when there is one).
  def write_csv_without_rows
    open_csv(new_csv_file, 'wb') do |csv|
      csv << @headers if @headers
    end
  end
end
@@ -0,0 +1,119 @@
1
# Transforms a CSV given a series of steps.
#
# Steps are registered with the chainable methods below and run, in
# registration order, on batches of rows when #process is called. Each step is
# stored together with the value of #headers at the time it was registered,
# and that value is what the step's block later receives as current_headers.
class CSVUtils::CSVTransformer
  attr_reader :headers

  # @param src_csv [String, Object] source handed to CSVUtils::CSVWrapper (opened with mode 'rb')
  # @param dest_csv [String, Object] destination handed to CSVUtils::CSVWrapper (opened with mode 'wb')
  # @param csv_options [Hash] options forwarded to both wrappers
  def initialize(src_csv, dest_csv, csv_options = {})
    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
  end

  # Consumes the next source row and uses it as the current headers.
  def read_headers
    @headers = @src_csv.shift
    self
  end

  # Registers a step whose block is called with (batch, current_headers); its
  # return value is passed as the third argument to the row-level steps.
  def additional_data(&block)
    steps << [:additional_data, @headers, block]
    self
  end

  # Registers a step keeping only rows for which the block returns a truthy value.
  def select(&block)
    steps << [:select, @headers, block]
    self
  end

  # Registers a step dropping rows for which the block returns a truthy value.
  def reject(&block)
    steps << [:reject, @headers, block]
    self
  end

  # Registers a step replacing each row with the block's return value;
  # new_headers become the headers for later steps and for the output.
  def map(new_headers, &block)
    steps << [:map, @headers, block]
    @headers = new_headers
    self
  end

  # Registers a step appending the block's return value to each row.
  #
  # @param additional_headers [Array, nil] headers for the appended columns;
  #   nil clears the current headers
  def append(additional_headers, &block)
    steps << [:append, @headers, block]

    # Headers stay known only when both the current and the additional headers
    # are known; adding to nil headers would raise NoMethodError.
    @headers = @headers && additional_headers ? @headers + additional_headers : nil

    self
  end

  # Registers a step calling the block for each row; the return value is ignored.
  def each(&block)
    steps << [:each, @headers, block]
    self
  end

  # Overrides the current headers.
  def set_headers(headers)
    @headers = headers
    self
  end

  # Writes the current headers (if any), then runs all steps over the source
  # rows in batches and writes the resulting rows. Both CSVs are closed
  # afterwards, also when a step raises.
  #
  # @param batch_size [Integer] number of rows processed per batch
  # @return [nil]
  def process(batch_size = 10_000, &_block) # the block is accepted but not used
    @dest_csv << @headers if @headers

    batch = []
    while (row = @src_csv.shift)
      batch << row
      next if batch.size < batch_size

      write_batch(batch)
      batch = []
    end

    # Flush the final, partially filled batch.
    write_batch(batch) unless batch.empty?

    nil
  ensure
    close
  end

  private

  def steps
    @steps ||= []
  end

  # Closes the destination even when closing the source fails.
  def close
    @src_csv.close
  ensure
    @dest_csv.close
  end

  # Runs every registered step over the batch and writes the surviving rows.
  def write_batch(batch)
    rows = steps.reduce(batch) do |current_rows, (step_type, current_headers, step_block)|
      process_step(step_type, current_headers, current_rows, &step_block)
    end

    rows.each { |row| @dest_csv << row }
  end

  # Applies a single step to the batch in place and returns the batch.
  def process_step(step_type, current_headers, batch, &block)
    case step_type
    when :select
      batch.select! do |row|
        block.call row, current_headers, @additional_data
      end
    when :reject
      batch.reject! do |row|
        block.call row, current_headers, @additional_data
      end
    when :map
      batch.map! do |row|
        block.call row, current_headers, @additional_data
      end
    when :append
      batch.map! do |row|
        row + block.call(row, current_headers, @additional_data)
      end
    when :additional_data
      @additional_data = block.call(batch, current_headers)
    when :each
      batch.each do |row|
        block.call(row, current_headers, @additional_data)
      end
    end

    batch
  end
end
@@ -0,0 +1,47 @@
1
# Wraps a CSV object, if wrapper opens the csv file it will close it.
#
# When given a String the wrapper opens that path with CSV.open and owns the
# handle; any other object is used as-is and is left open by #close.
class CSVUtils::CSVWrapper
  attr_reader :csv

  # @param csv [String, Object] path to open, or an object to wrap that
  #   provides the <<, shift and close calls used through this wrapper
  # @param mode [String] mode passed to CSV.open when csv is a path
  # @param csv_options [Hash] options passed to CSV.open when csv is a path
  def initialize(csv, mode, csv_options = {})
    open(csv, mode, csv_options)
  end

  # Builds a wrapper. With a block, yields the wrapper, closes it afterwards
  # (also when the block raises) and returns the block's value; without a
  # block, returns the wrapper.
  def self.open(file, mode, csv_options = {})
    csv = new(file, mode, csv_options)
    return csv unless block_given?

    begin
      yield csv
    ensure
      csv.close
    end
  end

  # Points the wrapper at csv, opening it first when it is a path.
  def open(csv, mode, csv_options = {})
    if csv.is_a?(String)
      @close_when_done = true
      # CSV.open takes its options as keywords; a positional Hash raises
      # ArgumentError on Ruby 3+, so splat it.
      @csv = CSV.open(csv, mode, **csv_options)
    else
      @close_when_done = false
      @csv = csv
    end
  end

  # Appends a row to the wrapped csv.
  def <<(row)
    csv << row
  end

  # Reads the next row from the wrapped csv.
  def shift
    csv.shift
  end

  # Closes the wrapped csv only when this wrapper opened it.
  def close
    csv.close if close_when_done?
  end

  private

  def close_when_done?
    @close_when_done
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.0
4
+ version: 0.3.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-06-23 00:00:00.000000000 Z
11
+ date: 2020-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -30,6 +30,7 @@ executables:
30
30
  - csv-change-eol
31
31
  - csv-find-error
32
32
  - csv-readline
33
+ - csv-validator
33
34
  extensions: []
34
35
  extra_rdoc_files: []
35
36
  files:
@@ -43,11 +44,16 @@ files:
43
44
  - bin/csv-change-eol
44
45
  - bin/csv-find-error
45
46
  - bin/csv-readline
47
+ - bin/csv-validator
46
48
  - csv-utils.gemspec
47
49
  - lib/csv-utils.rb
50
+ - lib/csv_utils/csv_extender.rb
48
51
  - lib/csv_utils/csv_options.rb
49
52
  - lib/csv_utils/csv_report.rb
50
53
  - lib/csv_utils/csv_row.rb
54
+ - lib/csv_utils/csv_sort.rb
55
+ - lib/csv_utils/csv_transformer.rb
56
+ - lib/csv_utils/csv_wrapper.rb
51
57
  - script/console
52
58
  homepage: https://github.com/dougyouch/csv-utils
53
59
  licenses: