csv-utils 0.2.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8b1634cd26b21129c05cf1c5655e46ff3b81307135ce8dd686022cd8a4ff5adc
4
- data.tar.gz: 53312f6841e32ddcae0e97cd9aab4513a47580c35a3ec39964f8933fdbbcff10
3
+ metadata.gz: 8e649e8e220856a0676e01ef58811ab9df3822a1757eae4e776a45256063c510
4
+ data.tar.gz: fad8bcac595659bc5d91b4ebc19ee8f18ebbdf910568e75ff812768e1c23515f
5
5
  SHA512:
6
- metadata.gz: 5733c0ff8b730e957c46fc73b1080690dd79fb481b30f9591f4256507abc3ea46139d9758605c4c4e143e23781468b9adbd623812ceb105bb610a61943ffbc57
7
- data.tar.gz: 4ce9256c44ed50fc289fbe2b0ce970b2798310d50b4f69dffe9fa57098b0acab0539049bd5c945b43768ea7cb3d97b2421e54bf4c069df7751bd3704b68db9ec
6
+ metadata.gz: 4e4a26c3494c466a0099f72db534b8bd73910444c6eadf1b1faa78213d225b6752e9e8b60818bbeabe57f4542bd235da1969a9b6a3bad968328622531f89f077
7
+ data.tar.gz: d35d7a46f4f58a0fe87e10c6bcfec3ededc840d5d3c7ce0f1e5162e22f6f9ddb1516ed23a59b93877529ae326d597dfbde5089d9676393d488e476f6a9ad0924
@@ -0,0 +1,77 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'csv'
4
+ begin
5
+ require 'rchardet'
6
+ rescue LoadError
7
+ $stderr.puts 'gem install rchardet'
8
+ exit 1
9
+ end
10
+
11
# Reports whether +str+ holds a valid UTF-8 byte sequence.
#
# The string is re-tagged as UTF-8 in place (via +force_encoding+), so the
# caller's object keeps that encoding afterwards.
#
# @param str [String, nil] cell value to check
# @return [Boolean] true for nil or a valid UTF-8 string, false otherwise
def utf8?(str)
  # CSV parses an empty cell as nil; there are no bytes that could be invalid.
  return true if str.nil?

  str.force_encoding('utf-8').valid_encoding?
end
16
+
17
# Re-interprets +str+ as +current_encoding+ and transcodes it to UTF-8.
#
# +str+ is re-tagged in place with +current_encoding+ before the conversion.
#
# @param str [String, nil] raw cell value
# @param current_encoding [String, nil] encoding name the bytes are believed
#   to be in
# @return [String, nil] a new UTF-8 string, or nil when the value cannot be
#   converted (no encoding given, unknown encoding name, bytes invalid for
#   that encoding, or no UTF-8 mapping for a byte)
def convert_to_utf8(str, current_encoding)
  return nil if str.nil? || current_encoding.nil?

  str.force_encoding(current_encoding)
  return nil unless str.valid_encoding?

  str.encode('utf-8')
rescue ArgumentError, EncodingError
  # ArgumentError: encoding name unknown to Ruby.
  # EncodingError: undefined conversion / no converter available.
  nil
end
23
+
24
# Asks CharDet to guess the character encoding of +col+ and returns the
# value stored under the 'encoding' key of its result.
def detect_encoding(col)
  detection = CharDet.detect(col)
  detection['encoding']
end
27
+
28
# Removes a leading UTF-8 byte order mark (EF BB BF) from +col+ in place.
#
# Only a BOM at the very start of the string is removed, and the comparison
# is done on raw bytes so it works regardless of the string's encoding tag.
#
# @param col [String, nil] value to clean up
# @return [String, nil] +col+ when a BOM was removed, nil when nothing changed
def strip_bom!(col)
  return nil if col.nil?

  bom = "\xEF\xBB\xBF".b
  return nil unless col.byteslice(0, bom.bytesize).b == bom

  # byteslice keeps the original encoding tag; replace mutates the receiver.
  col.replace(col.byteslice(bom.bytesize, col.bytesize))
end
31
+
32
# Entry point: csv-validator CSV_FILE [ID_COLUMN_NAME]
#
# Reads CSV_FILE in binary mode and reports on stderr every cell that is not
# valid UTF-8. For each such cell the encoding is guessed; when the cell can
# be converted, the converted value is printed on stdout. When ID_COLUMN_NAME
# is given, each conversion is also written to utf8-correctsion.csv together
# with that column's value for the row.
csv_path = ARGV[0]
id_column_name = ARGV[1]

unless csv_path
  $stderr.puts "usage: #{File.basename($PROGRAM_NAME)} CSV_FILE [ID_COLUMN_NAME]"
  exit 1
end

csv = CSV.open(csv_path, 'rb')
out = nil

begin
  headers = csv.shift
  unless headers
    $stderr.puts "#{csv_path}: no header row found"
    exit 1
  end

  # The first header is nil when the header row starts with an empty cell.
  strip_bom!(headers[0]) if headers[0]

  id_column_num = nil
  if id_column_name
    id_column_num = headers.index(id_column_name)
    unless id_column_num
      $stderr.puts("header #{id_column_name} not found in current set of headers")
      exit 1
    end

    # The corrections file is only produced when rows can be identified.
    out = CSV.open('utf8-correctsion.csv', 'wb')
    out << [id_column_name, 'Row', 'Col', 'Header', 'Value']
  end

  # The header row counts as row 1.
  csv_lineno = 1

  while (row = csv.shift)
    csv_lineno += 1

    unless row.size == headers.size
      $stderr.puts "row(#{csv_lineno}): invalid number of columns, expected #{headers.size} got #{row.size}"
    end

    row.each_with_index do |col, idx|
      # Empty cells are parsed as nil and have nothing to validate.
      next if col.nil? || utf8?(col)

      location = "row(#{csv_lineno}),col(#{idx + 1}) #{headers[idx]}"
      $stderr.puts "#{location}: none UTF-8 characters found in \"#{col}\""

      # Detect once so the reported encoding is the one used for conversion.
      encoding = detect_encoding(col)
      col_utf8_encoded = encoding && convert_to_utf8(col, encoding)

      if col_utf8_encoded
        puts "#{location}: converted to UTF-8 from #{encoding} \"#{col_utf8_encoded}\""
        # out is nil when no id column was requested.
        out << [row[id_column_num], csv_lineno, idx + 1, headers[idx], col_utf8_encoded] if out
      else
        $stderr.puts "#{location}: unknown character encoding"
      end
    end
  end
ensure
  # Runs on normal completion, on exit, and on errors.
  csv.close
  out.close if out
end
@@ -2,7 +2,7 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = 'csv-utils'
5
- s.version = '0.2.3'
5
+ s.version = '0.3.4'
6
6
  s.licenses = ['MIT']
7
7
  s.summary = 'CSV Utils'
8
8
  s.description = 'Tools for debugging malformed CSV files'
@@ -7,4 +7,6 @@ module CSVUtils
7
7
  autoload :CSVReport, 'csv_utils/csv_report'
8
8
  autoload :CSVRow, 'csv_utils/csv_row'
9
9
  autoload :CSVSort, 'csv_utils/csv_sort'
10
+ autoload :CSVTransformer, 'csv_utils/csv_transformer'
11
+ autoload :CSVWrapper, 'csv_utils/csv_wrapper'
10
12
  end
@@ -1,20 +1,15 @@
1
1
  # Utility class for appending data to a csv file.
2
2
  class CSVUtils::CSVExtender
3
- attr_reader :csv_file,
4
- :new_csv_file,
5
- :csv_options
6
-
7
- def initialize(csv_file, new_csv_file, csv_options = {})
8
- @csv_file = csv_file
9
- @new_csv_file = new_csv_file
10
- @csv_options = csv_options
3
+ def initialize(src_csv, dest_csv, csv_options = {})
4
+ @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
5
+ @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
11
6
  end
12
7
 
13
8
  def append(additional_headers)
14
9
  process(additional_headers) do |current_headers|
15
- while (row = src.shift)
10
+ while (row = @src_csv.shift)
16
11
  additional_columns = yield row, current_headers
17
- dest << (row + additional_columns)
12
+ @dest_csv << (row + additional_columns)
18
13
  end
19
14
  end
20
15
  end
@@ -27,13 +22,13 @@ class CSVUtils::CSVExtender
27
22
  additional_rows = yield batch, current_headers
28
23
 
29
24
  batch.each_with_index do |row, idx|
30
- dest << (row + additional_rows[idx])
25
+ @dest_csv << (row + additional_rows[idx])
31
26
  end
32
27
 
33
28
  batch = []
34
29
  end
35
30
 
36
- while (row = src.shift)
31
+ while (row = @src_csv.shift)
37
32
  batch << row
38
33
 
39
34
  process_batch_proc.call if batch.size >= batch_size
@@ -43,6 +38,8 @@ class CSVUtils::CSVExtender
43
38
  end
44
39
  end
45
40
 
41
+ private
42
+
46
43
  def process(additional_headers)
47
44
  current_headers = append_headers(additional_headers)
48
45
 
@@ -51,26 +48,16 @@ class CSVUtils::CSVExtender
51
48
  close
52
49
  end
53
50
 
54
- def src
55
- @src ||= CSV.open(csv_file, 'rb', csv_options)
56
- end
57
-
58
- def dest
59
- @dest ||= CSV.open(new_csv_file, 'wb', csv_options)
60
- end
61
-
62
51
  def close
63
- src.close
64
- dest.close
52
+ @src_csv.close
53
+ @dest_csv.close
65
54
  end
66
55
 
67
- private
68
-
69
56
  def append_headers(additional_headers)
70
57
  return nil unless additional_headers
71
58
 
72
- current_headers = src.shift
73
- dest << (current_headers + additional_headers)
59
+ current_headers = @src_csv.shift
60
+ @dest_csv << (current_headers + additional_headers)
74
61
  current_headers
75
62
  end
76
63
  end
@@ -0,0 +1,119 @@
1
# Transforms a CSV given a series of steps.
#
# Steps are registered with the chainable methods below and are run, in
# registration order, over batches of source rows when #process is called.
# Each step remembers the headers that were current when it was registered
# and passes them to its block.
class CSVUtils::CSVTransformer
  attr_reader :headers

  # @param src_csv source, handed to CSVUtils::CSVWrapper in 'rb' mode
  # @param dest_csv destination, handed to CSVUtils::CSVWrapper in 'wb' mode
  # @param csv_options [Hash] options forwarded to both wrappers
  def initialize(src_csv, dest_csv, csv_options = {})
    @src_csv = CSVUtils::CSVWrapper.new(src_csv, 'rb', csv_options)
    @dest_csv = CSVUtils::CSVWrapper.new(dest_csv, 'wb', csv_options)
  end

  # Consumes the next source row and uses it as the current headers.
  #
  # @return [self]
  def read_headers
    @headers = @src_csv.shift
    self
  end

  # Registers a step whose block receives (batch, headers); its result is
  # stored and passed as the third argument to the blocks of row-level steps.
  #
  # @return [self]
  def additional_data(&block)
    steps << [:additional_data, @headers, block]
    self
  end

  # Registers a step that keeps only rows for which the block is truthy.
  # The block receives (row, headers, additional_data).
  #
  # @return [self]
  def select(&block)
    steps << [:select, @headers, block]
    self
  end

  # Registers a step that drops rows for which the block is truthy.
  # The block receives (row, headers, additional_data).
  #
  # @return [self]
  def reject(&block)
    steps << [:reject, @headers, block]
    self
  end

  # Registers a step that replaces each row with the block's result and
  # switches the current headers to +new_headers+ for later steps.
  #
  # @return [self]
  def map(new_headers, &block)
    steps << [:map, @headers, block]
    @headers = new_headers
    self
  end

  # Registers a step that appends the block's result (an array of columns)
  # to each row. When +additional_headers+ is nil the current headers are
  # cleared; otherwise they are extended with +additional_headers+.
  #
  # @return [self]
  def append(additional_headers, &block)
    steps << [:append, @headers, block]

    if additional_headers
      @headers += additional_headers
    else
      @headers = nil
    end

    self
  end

  # Registers a step that calls the block for every row without changing
  # the batch. The block receives (row, headers, additional_data).
  #
  # @return [self]
  def each(&block)
    steps << [:each, @headers, block]
    self
  end

  # Overrides the current headers.
  #
  # @return [self]
  def set_headers(headers)
    @headers = headers
    self
  end

  # Runs all registered steps over the source in batches and writes the
  # resulting rows (preceded by the current headers, if any) to the
  # destination. Both wrappers are closed afterwards, even when a step or a
  # read/write raises.
  #
  # @param batch_size [Integer] number of rows collected before the steps run
  # @return [nil]
  def process(batch_size = 10_000, &_block)
    batch = []

    @dest_csv << @headers if @headers

    flush_batch = lambda do
      steps.each do |step_type, current_headers, step_block|
        batch = process_step(step_type, current_headers, batch, &step_block)
      end

      batch.each { |row| @dest_csv << row }

      batch = []
    end

    while (row = @src_csv.shift)
      batch << row
      flush_batch.call if batch.size >= batch_size
    end

    # Flush the final, partially filled batch.
    flush_batch.call unless batch.empty?

    nil
  ensure
    close
  end

  private

  # Registered steps as [type, headers_at_registration, block] triples.
  def steps
    @steps ||= []
  end

  # Closes both wrappers; the destination is closed even if closing the
  # source raises.
  def close
    @src_csv.close
  ensure
    @dest_csv.close
  end

  # Applies one step to +batch+ and returns the batch to hand to the next
  # step. Row-level steps mutate +batch+ in place.
  def process_step(step_type, current_headers, batch, &block)
    case step_type
    when :select
      batch.select! do |row|
        block.call row, current_headers, @additional_data
      end
    when :reject
      batch.reject! do |row|
        block.call row, current_headers, @additional_data
      end
    when :map
      batch.map! do |row|
        block.call row, current_headers, @additional_data
      end
    when :append
      batch.map! do |row|
        row + block.call(row, current_headers, @additional_data)
      end
    when :additional_data
      @additional_data = block.call(batch, current_headers)
    when :each
      batch.each do |row|
        block.call(row, current_headers, @additional_data)
      end
    end

    batch
  end
end
@@ -0,0 +1,47 @@
1
# Wraps a CSV object. If the wrapper opens the CSV file itself it will also
# close it; an object supplied by the caller is left open on #close.
class CSVUtils::CSVWrapper
  attr_reader :csv

  # @param csv [String, Object] a file path to open, or an already opened
  #   CSV-like object responding to #<<, #shift and #close
  # @param mode [String] file mode used when +csv+ is a path
  # @param csv_options [Hash, nil] options forwarded to CSV.open
  def initialize(csv, mode, csv_options)
    open(csv, mode, csv_options)
  end

  # Builds a wrapper. Without a block the wrapper is returned. With a block
  # the wrapper is yielded, closed afterwards (even if the block raises), and
  # the block's value is returned.
  def self.open(file, mode, csv_options = {})
    csv = new(file, mode, csv_options)
    return csv unless block_given?

    begin
      yield csv
    ensure
      csv.close
    end
  end

  # Opens +csv+ when it is a String path; otherwise adopts the given object
  # and remembers not to close it.
  def open(csv, mode, csv_options)
    if csv.is_a?(String)
      @close_when_done = true
      # CSV.open takes its options as keywords; a positional Hash is rejected
      # on Ruby 3, so splat it.
      @csv = CSV.open(csv, mode, **(csv_options || {}))
    else
      @close_when_done = false
      @csv = csv
    end
  end

  # Writes +row+ to the wrapped CSV.
  def <<(row)
    csv << row
  end

  # Reads the next row from the wrapped CSV.
  def shift
    csv.shift
  end

  # Closes the wrapped CSV only when this wrapper opened it.
  def close
    csv.close if close_when_done?
  end

  private

  def close_when_done?
    @close_when_done
  end
end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: csv-utils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.3
4
+ version: 0.3.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Doug Youch
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-07-08 00:00:00.000000000 Z
11
+ date: 2020-07-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: inheritance-helper
@@ -30,6 +30,7 @@ executables:
30
30
  - csv-change-eol
31
31
  - csv-find-error
32
32
  - csv-readline
33
+ - csv-validator
33
34
  extensions: []
34
35
  extra_rdoc_files: []
35
36
  files:
@@ -43,6 +44,7 @@ files:
43
44
  - bin/csv-change-eol
44
45
  - bin/csv-find-error
45
46
  - bin/csv-readline
47
+ - bin/csv-validator
46
48
  - csv-utils.gemspec
47
49
  - lib/csv-utils.rb
48
50
  - lib/csv_utils/csv_extender.rb
@@ -50,6 +52,8 @@ files:
50
52
  - lib/csv_utils/csv_report.rb
51
53
  - lib/csv_utils/csv_row.rb
52
54
  - lib/csv_utils/csv_sort.rb
55
+ - lib/csv_utils/csv_transformer.rb
56
+ - lib/csv_utils/csv_wrapper.rb
53
57
  - script/console
54
58
  homepage: https://github.com/dougyouch/csv-utils
55
59
  licenses: