csv-utils 0.3.25 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,102 +1,117 @@
1
- # Search a CSV given a series of steps
2
- class CSVUtils::CSVIterator
3
- include Enumerable
4
-
5
- attr_reader :prev_row
1
+ # frozen_string_literal: true
6
2
 
7
- class RowWrapper < Hash
8
- attr_accessor :lineno
3
+ # Search a CSV given a series of steps
4
+ module CSVUtils
5
+ class CSVIterator
6
+ include Enumerable
7
+
8
+ BYTE_ORDER_MARKS = [
9
+ (+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT'), # UTF-8
10
+ (+"\xFE\xFF").force_encoding('ASCII-8BIT'), # UTF-16 BE
11
+ (+"\xFF\xFE").force_encoding('ASCII-8BIT'), # UTF-16 LE
12
+ (+"\x00\x00\xFE\xFF").force_encoding('ASCII-8BIT'), # UTF-32 BE
13
+ (+"\xFF\xFE\x00\x00").force_encoding('ASCII-8BIT') # UTF-32 LE
14
+ ].freeze
15
+
16
+ attr_reader :prev_row
17
+
18
+ class RowWrapper < Hash
19
+ attr_accessor :lineno
20
+
21
+ def self.create(headers, row, lineno)
22
+ row_wrapper = RowWrapper[headers.zip(row)]
23
+ row_wrapper.lineno = lineno
24
+ row_wrapper
25
+ end
9
26
 
10
- def self.create(headers, row, lineno)
11
- row_wrapper = RowWrapper[headers.zip(row)]
12
- row_wrapper.lineno = lineno
13
- row_wrapper
27
+ def to_pretty_s
28
+ reject { |_, v| v.nil? || v.strip.empty? }
29
+ .each_with_index
30
+ .map { |(k, v), idx| format(' %-3d %s: %s', idx + 1, k, v) }
31
+ .join("\n") + "\n"
32
+ end
14
33
  end
15
34
 
16
- def to_pretty_s
17
- reject { |_, v| v.nil? || v.strip.empty? }
18
- .each_with_index
19
- .map { |(k, v), idx| sprintf(' %-3d %s: %s', idx+1, k, v) }
20
- .join("\n") + "\n"
35
+ def initialize(src_csv, csv_options = {}, mode = 'rb')
36
+ @src_csv = CSVUtils::CSVWrapper.new(src_csv, mode, csv_options)
21
37
  end
22
- end
23
38
 
24
- def initialize(src_csv, csv_options = {}, mode = 'rb')
25
- @src_csv = CSVUtils::CSVWrapper.new(src_csv, mode, csv_options)
26
- end
39
+ def each(headers = nil)
40
+ @src_csv.rewind
27
41
 
28
- def each(headers = nil)
29
- @src_csv.rewind
42
+ lineno = 0
43
+ unless headers
44
+ headers = @src_csv.shift
45
+ strip_bom!(headers[0])
46
+ lineno += 1
47
+ end
30
48
 
31
- lineno = 0
32
- unless headers
33
- headers = @src_csv.shift
34
- strip_bom!(headers[0])
35
- lineno += 1
49
+ @prev_row = nil
50
+ while (row = @src_csv.shift)
51
+ lineno += 1
52
+ yield RowWrapper.create(headers, row, lineno)
53
+ @prev_row = row
54
+ end
36
55
  end
37
56
 
38
- @prev_row = nil
39
- while (row = @src_csv.shift)
40
- lineno += 1
41
- yield RowWrapper.create(headers, row, lineno)
42
- @prev_row = row
57
+ def headers
58
+ @src_csv.rewind
59
+ headers = @src_csv.shift
60
+ strip_bom!(headers[0])
61
+ headers
43
62
  end
44
- end
45
63
 
46
- def headers
47
- @src_csv.rewind
48
- headers = @src_csv.shift
49
- strip_bom!(headers[0])
50
- headers
51
- end
64
+ def to_hash(key, value = nil, &)
65
+ raise("header #{key} not found in #{headers}") unless headers.include?(key)
66
+ raise("headers #{value} not found in #{headers}") if value && !headers.include?(value)
52
67
 
53
- def to_hash(key, value = nil)
54
- raise("header #{key} not found in #{headers}") unless headers.include?(key)
55
- raise("headers #{value} not found in #{headers}") if value && !headers.include?(value)
68
+ value_proc =
69
+ if value
70
+ proc { |row| row[value] }
71
+ else
72
+ proc(&)
73
+ end
56
74
 
57
- value_proc =
58
- if value
59
- proc { |row| row[value] }
60
- else
61
- proc { |row| yield(row) }
75
+ each_with_object({}) do |row, hsh|
76
+ hsh[row[key]] = value_proc.call(row)
62
77
  end
63
-
64
- each_with_object({}) do |row, hsh|
65
- hsh[row[key]] = value_proc.call(row)
66
78
  end
67
- end
68
79
 
69
- def size
70
- @src_csv.rewind
71
- @src_csv.shift
72
- cnt = 0
73
- while @src_csv.shift
74
- cnt +=1
80
+ def size
81
+ @src_csv.rewind
82
+ @src_csv.shift
83
+ cnt = 0
84
+ cnt += 1 while @src_csv.shift
85
+ cnt
75
86
  end
76
- cnt
77
- end
78
87
 
79
- def each_batch(batch_size = 1_000)
80
- batch = []
81
-
82
- process_batch_proc = Proc.new do
83
- yield batch
88
+ def each_batch(batch_size = 1_000)
84
89
  batch = []
85
- end
86
90
 
87
- each do |row|
88
- batch << row
89
- process_batch_proc.call if batch.size >= batch_size
90
- end
91
+ process_batch_proc = proc do
92
+ yield batch
93
+ batch = []
94
+ end
91
95
 
92
- process_batch_proc.call if batch.size > 0
96
+ each do |row|
97
+ batch << row
98
+ process_batch_proc.call if batch.size >= batch_size
99
+ end
93
100
 
94
- nil
95
- end
101
+ process_batch_proc.call if batch.size.positive?
96
102
 
97
- private
103
+ nil
104
+ end
98
105
 
99
- def strip_bom!(col)
100
- col.sub!("\xEF\xBB\xBF".force_encoding('ASCII-8BIT'), '')
106
+ private
107
+
108
+ def strip_bom!(col)
109
+ BYTE_ORDER_MARKS.each do |bom|
110
+ if col.start_with?(bom)
111
+ col.sub!(bom, '')
112
+ break
113
+ end
114
+ end
115
+ end
101
116
  end
102
117
  end
@@ -1,35 +1,35 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Auto detect a csv files options
2
4
  module CSVUtils
3
5
  class CSVOptions
4
-
5
6
  # this list is from https://en.wikipedia.org/wiki/Byte_order_mark
6
7
  BYTE_ORDER_MARKS = {
7
- "\xEF\xBB\xBF".force_encoding('ASCII-8BIT') => 'UTF-8',
8
- "\xFE\xFF".force_encoding('ASCII-8BIT') => 'UTF-16',
9
- "\xFF\xFE".force_encoding('ASCII-8BIT') => 'UTF-16',
10
- "\x00\x00\xFE\xFF".force_encoding('ASCII-8BIT') => 'UTF-32',
11
- "\xFF\xFE\x00\x00".force_encoding('ASCII-8BIT') => 'UTF-32'
12
- }
8
+ (+"\xEF\xBB\xBF").force_encoding('ASCII-8BIT') => 'UTF-8',
9
+ (+"\xFE\xFF").force_encoding('ASCII-8BIT') => 'UTF-16',
10
+ (+"\xFF\xFE").force_encoding('ASCII-8BIT') => 'UTF-16',
11
+ (+"\x00\x00\xFE\xFF").force_encoding('ASCII-8BIT') => 'UTF-32',
12
+ (+"\xFF\xFE\x00\x00").force_encoding('ASCII-8BIT') => 'UTF-32'
13
+ }.freeze
13
14
 
14
15
  COL_SEPARATORS = [
15
16
  "\x02",
16
17
  "\t",
17
18
  '|',
18
19
  ','
19
- ]
20
+ ].freeze
20
21
 
21
22
  ROW_SEPARATORS = [
22
23
  "\r\n",
23
24
  "\n",
24
25
  "\r"
25
- ]
26
+ ].freeze
26
27
 
27
28
  attr_reader :columns,
28
29
  :byte_order_mark,
29
30
  :encoding,
30
31
  :col_separator,
31
32
  :row_separator
32
-
33
33
 
34
34
  def initialize(io)
35
35
  line =
@@ -81,7 +81,7 @@ module CSVUtils
81
81
  end
82
82
 
83
83
  def strip_byte_order_marks(header)
84
- @byte_order_marks ? header.sub(@byte_order_marks, '') : header
84
+ @byte_order_mark ? header.sub(@byte_order_mark, '') : header
85
85
  end
86
86
  end
87
87
  end
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Builds a csv file from csv rows
2
4
  module CSVUtils
3
5
  class CSVReport
@@ -8,8 +10,9 @@ module CSVUtils
8
10
  @csv =
9
11
  if csv.is_a?(String)
10
12
  @must_close = true
11
- mode = csv_options.delete(:mode) || 'wb'
12
- CSV.open(csv, mode, **csv_options)
13
+ opts = csv_options.dup
14
+ mode = opts.delete(:mode) || 'wb'
15
+ CSV.open(csv, mode, **opts)
13
16
  else
14
17
  @must_close = false
15
18
  csv
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'inheritance-helper'
2
4
 
3
5
  module CSVUtils
@@ -38,7 +40,7 @@ module CSVUtils
38
40
  def csv_row
39
41
  self.class.csv_columns.values.map { |column_options| csv_column_value(column_options) }
40
42
  end
41
- alias_method :to_a, :csv_row
43
+ alias to_a csv_row
42
44
 
43
45
  def csv_headers
44
46
  self.class.csv_headers
@@ -0,0 +1,34 @@
1
+ # frozen_string_literal: true
2
+
3
+ module CSVUtils
4
+ class CSVRowMatcher
5
+ attr_accessor :regex,
6
+ :columns
7
+
8
+ def initialize(regex, columns = :all)
9
+ self.regex = regex
10
+ self.columns = columns
11
+ end
12
+
13
+ def match?(row)
14
+ if columns == :all
15
+ row.each_value do |value|
16
+ return true if value&.match?(regex)
17
+ end
18
+ else
19
+ columns.each do |column_name|
20
+ value = row[column_name]
21
+ return true if value&.match?(regex)
22
+ end
23
+ end
24
+
25
+ false
26
+ end
27
+
28
+ def to_proc
29
+ proc do |row|
30
+ match?(row)
31
+ end
32
+ end
33
+ end
34
+ end
@@ -1,116 +1,130 @@
1
+ # frozen_string_literal: true
2
+
1
3
  require 'fileutils'
2
4
 
3
5
  # Utility class for sorting the rows for a csv file
4
- class CSVUtils::CSVSort
5
- attr_reader :csv_file,
6
- :new_csv_file,
7
- :has_headers,
8
- :csv_options,
9
- :headers
10
-
11
- def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
12
- @csv_file = csv_file
13
- @new_csv_file = new_csv_file
14
- @has_headers = has_headers
15
- @csv_options = csv_options
16
- @csv_part_files = []
17
- @files_to_delete = []
18
- end
19
-
20
- def sort(batch_size = 100_000, &block)
21
- create_sorted_csv_part_files(batch_size, &block)
22
- merge_csv_part_files(&block)
23
- end
24
-
25
- private
26
-
27
- def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
28
- src1 = CSV.open(src_csv_file1, 'rb', **csv_options)
29
- src2 = CSV.open(src_csv_file2, 'rb', **csv_options)
30
- dest = CSV.open(dest_csv_file, 'wb', **csv_options)
31
-
32
- if @headers
33
- dest << @headers
34
- src1.shift
35
- src2.shift
6
+ module CSVUtils
7
+ class CSVSort
8
+ attr_reader :csv_file,
9
+ :new_csv_file,
10
+ :has_headers,
11
+ :csv_options,
12
+ :headers
13
+
14
+ def initialize(csv_file, new_csv_file, has_headers = true, csv_options = {})
15
+ @csv_file = csv_file
16
+ @new_csv_file = new_csv_file
17
+ @has_headers = has_headers
18
+ @csv_options = csv_options
19
+ @csv_part_files = []
20
+ @files_to_delete = []
36
21
  end
37
22
 
38
- row1 = src1.shift
39
- row2 = src2.shift
40
-
41
- append_row1_proc = Proc.new do
42
- dest << row1
43
- row1 = src1.shift
44
- end
45
-
46
- append_row2_proc = Proc.new do
47
- dest << row2
48
- row2 = src2.shift
23
+ def sort(batch_size = 100_000, &)
24
+ create_sorted_csv_part_files(batch_size, &)
25
+ merge_csv_part_files(&)
49
26
  end
50
27
 
51
- while row1 || row2
52
- if row1.nil?
53
- append_row2_proc.call
54
- elsif row2.nil?
55
- append_row1_proc.call
56
- elsif yield(row1, row2) <= 0
57
- append_row1_proc.call
58
- else
59
- append_row2_proc.call
28
+ private
29
+
30
+ # rubocop:disable Metrics/MethodLength
31
+ def merge_sort_csv_files(src_csv_file1, src_csv_file2, dest_csv_file)
32
+ src1 = CSV.open(src_csv_file1, 'rb', **csv_options)
33
+ begin
34
+ src2 = CSV.open(src_csv_file2, 'rb', **csv_options)
35
+ begin
36
+ dest = CSV.open(dest_csv_file, 'wb', **csv_options)
37
+ begin
38
+ if @headers
39
+ dest << @headers
40
+ src1.shift
41
+ src2.shift
42
+ end
43
+
44
+ row1 = src1.shift
45
+ row2 = src2.shift
46
+
47
+ append_row1_proc = proc do
48
+ dest << row1
49
+ row1 = src1.shift
50
+ end
51
+
52
+ append_row2_proc = proc do
53
+ dest << row2
54
+ row2 = src2.shift
55
+ end
56
+
57
+ while row1 || row2
58
+ if row1.nil?
59
+ append_row2_proc.call
60
+ elsif row2.nil?
61
+ append_row1_proc.call
62
+ elsif yield(row1, row2) <= 0
63
+ append_row1_proc.call
64
+ else
65
+ append_row2_proc.call
66
+ end
67
+ end
68
+ ensure
69
+ dest.close
70
+ end
71
+ ensure
72
+ src2.close
73
+ end
74
+ ensure
75
+ src1.close
60
76
  end
61
77
  end
62
-
63
- src1.close
64
- src2.close
65
- dest.close
66
- end
67
-
68
- def create_sorted_csv_part_files(batch_size, &block)
69
- src = CSV.open(csv_file, 'rb', **csv_options)
70
-
71
- @headers = src.shift if has_headers
72
-
73
- batch = []
74
- create_batch_part_proc = Proc.new do
75
- batch.sort!(&block)
76
- @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
77
- CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
78
- csv << @headers if @headers
79
- batch.each { |row| csv << row }
78
+ # rubocop:enable Metrics/MethodLength
79
+
80
+ def create_sorted_csv_part_files(batch_size, &block)
81
+ src = CSV.open(csv_file, 'rb', **csv_options)
82
+ begin
83
+ @headers = src.shift if has_headers
84
+
85
+ batch = []
86
+ create_batch_part_proc = proc do
87
+ batch.sort!(&block)
88
+ @csv_part_files << "#{new_csv_file}.part.#{@csv_part_files.size}"
89
+ CSV.open(@csv_part_files.last, 'wb', **csv_options) do |csv|
90
+ csv << @headers if @headers
91
+ batch.each { |row| csv << row }
92
+ end
93
+ batch = []
94
+ end
95
+
96
+ while (row = src.shift)
97
+ batch << row
98
+ create_batch_part_proc.call if batch.size >= batch_size
99
+ end
100
+
101
+ create_batch_part_proc.call if batch.size.positive?
102
+ ensure
103
+ src.close
80
104
  end
81
- batch = []
82
105
  end
83
106
 
84
- while (row = src.shift)
85
- batch << row
86
- create_batch_part_proc.call if batch.size >= batch_size
87
- end
88
-
89
- create_batch_part_proc.call if batch.size > 0
90
-
91
- src.close
92
- end
93
-
94
- def merge_csv_part_files(&block)
95
- file_merge_cnt = 0
107
+ def merge_csv_part_files(&)
108
+ file_merge_cnt = 0
96
109
 
97
- while @csv_part_files.size > 1
98
- file_merge_cnt += 1
110
+ while @csv_part_files.size > 1
111
+ file_merge_cnt += 1
99
112
 
100
- csv_part_file1 = @csv_part_files.shift
101
- csv_part_file2 = @csv_part_files.shift
102
- @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
113
+ csv_part_file1 = @csv_part_files.shift
114
+ csv_part_file2 = @csv_part_files.shift
115
+ @csv_part_files << "#{new_csv_file}.merge.#{file_merge_cnt}"
103
116
 
104
- merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &block)
117
+ merge_sort_csv_files(csv_part_file1, csv_part_file2, @csv_part_files.last, &)
105
118
 
106
- File.unlink(csv_part_file1)
107
- File.unlink(csv_part_file2)
108
- end
119
+ File.unlink(csv_part_file1)
120
+ File.unlink(csv_part_file2)
121
+ end
109
122
 
110
- if @csv_part_files.size > 0
111
- FileUtils.mv(@csv_part_files.last, new_csv_file)
112
- else
113
- FileUtils.cp(@csv_file, new_csv_file)
123
+ if @csv_part_files.size.positive?
124
+ FileUtils.mv(@csv_part_files.last, new_csv_file)
125
+ else
126
+ FileUtils.cp(@csv_file, new_csv_file)
127
+ end
114
128
  end
115
129
  end
116
130
  end