smarter_csv 1.9.2 → 1.9.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 3e4032569303bd062a92b3c3f45f5166346808291667dda9ebd91af123f532ef
4
- data.tar.gz: 78b73abc411d8ed866feae600b87b72c3c99fd3b00b67c81eac227c17f8d38ea
3
+ metadata.gz: 5f35e10ff8bc0e79ff1ed9bea8e413f746f51128a6f6a9622d246873fd588366
4
+ data.tar.gz: 5cc30cf6f4422dd16f3019915bc5305a92aaaa4b99665e4c4c525d3bbf489cfd
5
5
  SHA512:
6
- metadata.gz: 1712951a2ce4f6e8ad93a6e76a105a3a8d4890babacfbb9ae3eead11ac638962d9da3d45421a327049e87c9d54b43c0dca1327f11a13bbd54440d3a7fefc6253
7
- data.tar.gz: 3d8b81f04c8eb16a7b2ab9ddf27bdaf2b2bfdd2ee3a8b70765a88f809fc9869500debe950d8ec27e3a6af818e6f1e415d96d078e52784d638f1363619088faa3
6
+ metadata.gz: 057472a73ae0be95318b16428b276ecffba384a68479af715c5ec3ca7601405ca73928b0fbf245c9b3f46fd33b82a8c6d9c9e6330ddb0305b83ae23f58173df0
7
+ data.tar.gz: 319b12a53875c1963eed6d27aa67850135d33a5b3a9f70607e6d812906733b711ade6c3ee6e789d78c2e159004a879e59e700145224134745b16d279039ac38a
data/CHANGELOG.md CHANGED
@@ -1,6 +1,11 @@
1
1
 
2
2
  # SmarterCSV 1.x Change Log
3
3
 
4
+ ## 1.9.3 (2023-12-16)
5
+ * raise SmarterCSV::IncorrectOption when `user_provided_headers` are empty
6
+ * code refactor / no functional changes
7
+ * added test cases
8
+
4
9
  ## 1.9.2 (2023-11-12)
5
10
  * fixed bug with '\\' at end of line (issue #252, thanks to averycrespi-moz)
6
11
  * fixed require statements (issue #249, thanks to PikachuEXE, courtsimas)
data/README.md CHANGED
@@ -300,7 +300,7 @@ And header and data validations will also be supported in 2.x
300
300
  | Option | Default | Explanation |
301
301
  ---------------------------------------------------------------------------------------------------------------------------------
302
302
  | :key_mapping | nil | a hash which maps headers from the CSV file to keys in the result hash |
303
- | :silence_missing_key | false | ignore missing keys in `key_mapping` |
303
+ | :silence_missing_keys | false | ignore missing keys in `key_mapping` |
304
304
  | | | if set to true: makes all mapped keys optional |
305
305
  | | | if given an array, makes only the keys listed in it optional |
306
306
  | :required_keys | nil | An array. Specify the required names AFTER header transformation. |
@@ -0,0 +1,73 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ protected
6
+
7
+ # If file has headers, then guesses column separator from headers.
8
+ # Otherwise guesses column separator from contents.
9
+ # Raises exception if none is found.
10
+ def guess_column_separator(filehandle, options)
11
+ skip_lines(filehandle, options)
12
+
13
+ delimiters = [',', "\t", ';', ':', '|']
14
+
15
+ line = nil
16
+ has_header = options[:headers_in_file]
17
+ candidates = Hash.new(0)
18
+ count = has_header ? 1 : 5
19
+ count.times do
20
+ line = readline_with_counts(filehandle, options)
21
+ delimiters.each do |d|
22
+ candidates[d] += line.scan(d).count
23
+ end
24
+ rescue EOFError # short files
25
+ break
26
+ end
27
+ rewind(filehandle)
28
+
29
+ if candidates.values.max == 0
30
+ # if the header only contains a single word (one column, no delimiter), default to ','
31
+ return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
32
+
33
+ raise SmarterCSV::NoColSepDetected
34
+ end
35
+
36
+ candidates.key(candidates.values.max)
37
+ end
38
+
39
+ # limitation: this currently reads the whole file in before making a decision
40
+ def guess_line_ending(filehandle, options)
41
+ counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
42
+ quoted_char = false
43
+
44
+ # count how many of the pre-defined line-endings we find
45
+ # ignoring those contained within quote characters
46
+ last_char = nil
47
+ lines = 0
48
+ filehandle.each_char do |c|
49
+ quoted_char = !quoted_char if c == options[:quote_char]
50
+ next if quoted_char
51
+
52
+ if last_char == "\r"
53
+ if c == "\n"
54
+ counts["\r\n"] += 1
55
+ else
56
+ counts["\r"] += 1 # \r are counted after they appeared
57
+ end
58
+ elsif c == "\n"
59
+ counts["\n"] += 1
60
+ end
61
+ last_char = c
62
+ lines += 1
63
+ break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
64
+ end
65
+ rewind(filehandle)
66
+
67
+ counts["\r"] += 1 if last_char == "\r"
68
+ # find the most frequent key/value pair:
69
+ most_frequent_key, _count = counts.max_by{|_, v| v}
70
+ most_frequent_key
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,50 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ protected
6
+
7
+ def readline_with_counts(filehandle, options)
8
+ line = filehandle.readline(options[:row_sep])
9
+ @file_line_count += 1
10
+ @csv_line_count += 1
11
+ line = remove_bom(line) if @csv_line_count == 1
12
+ line
13
+ end
14
+
15
+ def skip_lines(filehandle, options)
16
+ options[:skip_lines].to_i.times do
17
+ readline_with_counts(filehandle, options)
18
+ end
19
+ end
20
+
21
+ def rewind(filehandle)
22
+ @file_line_count = 0
23
+ @csv_line_count = 0
24
+ filehandle.rewind
25
+ end
26
+
27
+ private
28
+
29
+ UTF_32_BOM = %w[0 0 fe ff].freeze
30
+ UTF_32LE_BOM = %w[ff fe 0 0].freeze
31
+ UTF_8_BOM = %w[ef bb bf].freeze
32
+ UTF_16_BOM = %w[fe ff].freeze
33
+ UTF_16LE_BOM = %w[ff fe].freeze
34
+
35
+ def remove_bom(str)
36
+ str_as_hex = str.bytes.map{|x| x.to_s(16)}
37
+ # if string does not start with one of the bytes, there is no BOM
38
+ return str unless %w[ef fe ff 0].include?(str_as_hex[0])
39
+
40
+ return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
41
+ return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
42
+ return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
43
+
44
+ # :nocov:
45
+ puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
46
+ str
47
+ # :nocov:
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,160 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ def process_headers(filehandle, options)
6
+ @raw_header = nil # header as it appears in the file
7
+ @headers = nil # the processed headers
8
+ header_array = []
9
+ file_header_size = nil
10
+
11
+ # if headers_in_file, get the headers -> We get the number of columns, even when user provided headers
12
+ if options[:headers_in_file] # extract the header line
13
+ # process the header line in the CSV file..
14
+ # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
15
+ header_line = @raw_header = readline_with_counts(filehandle, options)
16
+ header_line = preprocess_header_line(header_line, options)
17
+ file_header_array, file_header_size = parse_and_modify_headers(header_line, options)
18
+ else
19
+ unless options[:user_provided_headers]
20
+ raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
21
+ end
22
+ end
23
+
24
+ if options[:user_provided_headers]
25
+ unless options[:user_provided_headers].is_a?(Array) && !options[:user_provided_headers].empty?
26
+ raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for user_provided_headers! Expecting array with headers.")
27
+ end
28
+
29
+ # use user-provided headers
30
+ user_header_array = options[:user_provided_headers]
31
+ # user_provided_headers: their count should match the headers_in_file if any
32
+ if defined?(file_header_size) && !file_header_size.nil?
33
+ if user_header_array.size != file_header_size
34
+ raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{user_header_array.size} headers != CSV-file has #{file_header_size} headers"
35
+ else
36
+ # we could print out the mapping of file_header_array to header_array here
37
+ end
38
+ end
39
+ header_array = user_header_array
40
+ else
41
+ header_array = file_header_array
42
+ end
43
+
44
+ # detect duplicate headers and disambiguate
45
+ header_array = disambiguate_headers(header_array, options) if options[:duplicate_header_suffix]
46
+
47
+ # symbolize headers
48
+ header_array.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
49
+
50
+ # wouldn't make sense to re-map user provided headers
51
+ header_array = remap_headers(header_array, options) if options[:key_mapping] && !options[:user_provided_headers]
52
+
53
+ validate_and_deprecate_headers(header_array, options)
54
+
55
+ [header_array, header_array.size]
56
+ end
57
+
58
+ private
59
+
60
+ def preprocess_header_line(header_line, options)
61
+ header_line = enforce_utf8_encoding(header_line, options)
62
+ header_line = remove_comments_from_header(header_line, options)
63
+ header_line = header_line.chomp(options[:row_sep])
64
+ header_line.gsub!(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
65
+ header_line
66
+ end
67
+
68
+ def parse_and_modify_headers(header_line, options)
69
+ file_header_array, file_header_size = parse(header_line, options)
70
+
71
+ file_header_array.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
72
+ file_header_array.map!{|x| x.strip} if options[:strip_whitespace]
73
+
74
+ unless options[:keep_original_headers]
75
+ file_header_array.map!{|x| x.gsub(/\s+|-+/, '_')}
76
+ file_header_array.map!{|x| x.downcase} if options[:downcase_header]
77
+ end
78
+ [file_header_array, file_header_size]
79
+ end
80
+
81
+ def disambiguate_headers(headers, options)
82
+ counts = Hash.new(0)
83
+ headers.map do |header|
84
+ counts[header] += 1
85
+ counts[header] > 1 ? "#{header}#{options[:duplicate_header_suffix]}#{counts[header]}" : header
86
+ end
87
+ end
88
+
89
+ # do some key mapping on the keys in the file header
90
+ # if you want to completely delete a key, then map it to nil or to ''
91
+ def remap_headers(headers, options)
92
+ key_mapping = options[:key_mapping]
93
+ if key_mapping.empty? || !key_mapping.is_a?(Hash) || key_mapping.keys.empty?
94
+ raise(SmarterCSV::IncorrectOption, "ERROR: incorrect format for key_mapping! Expecting hash with from -> to mappings")
95
+ end
96
+
97
+ key_mapping = options[:key_mapping]
98
+ # if silence_missing_keys are not set, raise error if missing header
99
+ missing_keys = key_mapping.keys - headers
100
+ # if the user passes a list of specific mapped keys that are optional
101
+ missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
102
+
103
+ unless missing_keys.empty? || options[:silence_missing_keys] == true
104
+ raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
105
+ end
106
+
107
+ headers.map! do |header|
108
+ if key_mapping.has_key?(header)
109
+ key_mapping[header].nil? ? nil : key_mapping[header]
110
+ elsif options[:remove_unmapped_keys]
111
+ nil
112
+ else
113
+ header
114
+ end
115
+ end
116
+ headers
117
+ end
118
+
119
+ # header_validations
120
+ def validate_and_deprecate_headers(headers, options)
121
+ duplicate_headers = []
122
+ headers.compact.each do |k|
123
+ duplicate_headers << k if headers.select{|x| x == k}.size > 1
124
+ end
125
+
126
+ unless options[:user_provided_headers] || duplicate_headers.empty?
127
+ raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
128
+ end
129
+
130
+ # deprecate required_headers
131
+ unless options[:required_headers].nil?
132
+ puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
133
+ if options[:required_keys].nil?
134
+ options[:required_keys] = options[:required_headers]
135
+ options[:required_headers] = nil
136
+ end
137
+ end
138
+
139
+ if options[:required_keys] && options[:required_keys].is_a?(Array)
140
+ missing_keys = []
141
+ options[:required_keys].each do |k|
142
+ missing_keys << k unless headers.include?(k)
143
+ end
144
+ raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
145
+ end
146
+ end
147
+
148
+ def enforce_utf8_encoding(header, options)
149
+ return header unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
150
+
151
+ header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
152
+ end
153
+
154
+ def remove_comments_from_header(header, options)
155
+ return header unless options[:comment_regexp]
156
+
157
+ header.sub(options[:comment_regexp], '')
158
+ end
159
+ end
160
+ end
@@ -0,0 +1,90 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ protected
6
+
7
+ ###
8
+ ### Thin wrapper around C-extension
9
+ ###
10
+ def parse(line, options, header_size = nil)
11
+ # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
12
+
13
+ if options[:acceleration] && has_acceleration?
14
+ # :nocov:
15
+ has_quotes = line =~ /#{options[:quote_char]}/
16
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
17
+ elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
18
+ [elements, elements.size]
19
+ # :nocov:
20
+ else
21
+ # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
22
+ parse_csv_line_ruby(line, options, header_size)
23
+ end
24
+ end
25
+
26
+ # ------------------------------------------------------------------
27
+ # Ruby equivalent of the C-extension for parse_line
28
+ #
29
+ # parses a single line: either a CSV header or a body line
30
+ # - quoting rules compared to RFC-4180 are somewhat relaxed
31
+ # - we are not assuming that quotes inside a field need to be doubled
32
+ # - we are not assuming that all fields need to be quoted (0 is even)
33
+ # - works with multi-char col_sep
34
+ # - if header_size is given, only up to header_size fields are parsed
35
+ #
36
+ # We use header_size for parsing the body lines to make sure we always match the number of headers
37
+ # in case there are trailing col_sep characters in line
38
+ #
39
+ # Our convention is that empty fields are returned as empty strings, not as nil.
40
+ #
41
+ #
42
+ # the purpose of the header_size parameter is to handle a corner case where
43
+ # CSV lines contain more fields than the header.
44
+ # In which case the remaining fields in the line are ignored
45
+ #
46
+ def parse_csv_line_ruby(line, options, header_size = nil)
47
+ return [] if line.nil?
48
+
49
+ line_size = line.size
50
+ col_sep = options[:col_sep]
51
+ col_sep_size = col_sep.size
52
+ quote = options[:quote_char]
53
+ quote_count = 0
54
+ elements = []
55
+ start = 0
56
+ i = 0
57
+
58
+ previous_char = ''
59
+ while i < line_size
60
+ if line[i...i+col_sep_size] == col_sep && quote_count.even?
61
+ break if !header_size.nil? && elements.size >= header_size
62
+
63
+ elements << cleanup_quotes(line[start...i], quote)
64
+ previous_char = line[i]
65
+ i += col_sep.size
66
+ start = i
67
+ else
68
+ quote_count += 1 if line[i] == quote && previous_char != '\\'
69
+ previous_char = line[i]
70
+ i += 1
71
+ end
72
+ end
73
+ elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
74
+ [elements, elements.size]
75
+ end
76
+
77
+ def cleanup_quotes(field, quote)
78
+ return field if field.nil?
79
+
80
+ # return if field !~ /#{quote}/ # this check can probably be eliminated
81
+
82
+ if field.start_with?(quote) && field.end_with?(quote)
83
+ field.delete_prefix!(quote)
84
+ field.delete_suffix!(quote)
85
+ end
86
+ field.gsub!("#{quote}#{quote}", quote)
87
+ field
88
+ end
89
+ end
90
+ end
@@ -14,10 +14,8 @@ module SmarterCSV
14
14
  def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
15
  options = process_options(given_options)
16
16
 
17
- headerA = []
18
- result = []
19
- @file_line_count = 0
20
- @csv_line_count = 0
17
+ initialize_variables
18
+
21
19
  has_rails = !!defined?(Rails)
22
20
  begin
23
21
  fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
@@ -33,13 +31,14 @@ module SmarterCSV
33
31
 
34
32
  skip_lines(fh, options)
35
33
 
36
- headerA, header_size = process_headers(fh, options)
34
+ @headers, header_size = process_headers(fh, options)
35
+ @headerA = @headers # @headerA is deprecated, use @headers
37
36
 
38
37
  # in case we use chunking.. we'll need to set it up..
39
- if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
38
+ if options[:chunk_size].to_i > 0
40
39
  use_chunks = true
41
40
  chunk_size = options[:chunk_size].to_i
42
- chunk_count = 0
41
+ @chunk_count = 0
43
42
  chunk = []
44
43
  else
45
44
  use_chunks = false
@@ -78,7 +77,7 @@ module SmarterCSV
78
77
  # if all values are blank, then ignore this line
79
78
  next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
80
79
 
81
- hash = Hash.zip(headerA, dataA) # from Facets of Ruby library
80
+ hash = @headers.zip(dataA).to_h
82
81
 
83
82
  # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
84
83
  hash.delete(nil)
@@ -95,7 +94,7 @@ module SmarterCSV
95
94
  if options[:convert_values_to_numeric]
96
95
  hash.each do |k, v|
97
96
  # deal with the :only / :except options to :convert_values_to_numeric
98
- next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
97
+ next if limit_execution_for_only_or_except(options, :convert_values_to_numeric, k)
99
98
 
100
99
  # convert if it's a numeric value:
101
100
  case v
@@ -128,9 +127,9 @@ module SmarterCSV
128
127
  if block_given?
129
128
  yield chunk # do something with the hashes in the chunk in the block
130
129
  else
131
- result << chunk # not sure yet, why anybody would want to do this without a block
130
+ @result << chunk # not sure yet, why anybody would want to do this without a block
132
131
  end
133
- chunk_count += 1
132
+ @chunk_count += 1
134
133
  chunk = [] # initialize for next chunk of data
135
134
  else
136
135
 
@@ -144,7 +143,7 @@ module SmarterCSV
144
143
  if block_given?
145
144
  yield [hash] # do something with the hash in the block (better to use chunking here)
146
145
  else
147
- result << hash
146
+ @result << hash
148
147
  end
149
148
  end
150
149
  end
@@ -158,34 +157,23 @@ module SmarterCSV
158
157
  if block_given?
159
158
  yield chunk # do something with the hashes in the chunk in the block
160
159
  else
161
- result << chunk # not sure yet, why anybody would want to do this without a block
160
+ @result << chunk # not sure yet, why anybody would want to do this without a block
162
161
  end
163
- chunk_count += 1
162
+ @chunk_count += 1
164
163
  # chunk = [] # initialize for next chunk of data
165
164
  end
166
165
  ensure
167
166
  fh.close if fh.respond_to?(:close)
168
167
  end
168
+
169
169
  if block_given?
170
- chunk_count # when we do processing through a block we only care how many chunks we processed
170
+ @chunk_count # when we do processing through a block we only care how many chunks we processed
171
171
  else
172
- result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
172
+ @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
173
173
  end
174
174
  end
175
175
 
176
176
  class << self
177
- def has_acceleration?
178
- @has_acceleration ||= !!defined?(parse_csv_line_c)
179
- end
180
-
181
- def raw_header
182
- @raw_header
183
- end
184
-
185
- def headers
186
- @headers
187
- end
188
-
189
177
  # * the `scan` method iterates through the string and finds all occurrences of the pattern
190
178
  # * The regular expression:
191
179
  # - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
@@ -198,111 +186,22 @@ module SmarterCSV
198
186
  line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
199
187
  end
200
188
 
201
- protected
202
-
203
- def readline_with_counts(filehandle, options)
204
- line = filehandle.readline(options[:row_sep])
205
- @file_line_count += 1
206
- @csv_line_count += 1
207
- line = remove_bom(line) if @csv_line_count == 1
208
- line
209
- end
210
-
211
- def skip_lines(filehandle, options)
212
- return unless options[:skip_lines].to_i > 0
213
-
214
- options[:skip_lines].to_i.times do
215
- readline_with_counts(filehandle, options)
216
- end
217
- end
218
-
219
- def rewind(filehandle)
220
- @file_line_count = 0
221
- @csv_line_count = 0
222
- filehandle.rewind
189
+ def has_acceleration?
190
+ @has_acceleration ||= !!defined?(parse_csv_line_c)
223
191
  end
224
192
 
225
- ###
226
- ### Thin wrapper around C-extension
227
- ###
228
- def parse(line, options, header_size = nil)
229
- # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
230
-
231
- if options[:acceleration] && has_acceleration?
232
- # :nocov:
233
- has_quotes = line =~ /#{options[:quote_char]}/
234
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
235
- elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
236
- [elements, elements.size]
237
- # :nocov:
238
- else
239
- # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
240
- parse_csv_line_ruby(line, options, header_size)
241
- end
242
- end
193
+ protected
243
194
 
244
- # ------------------------------------------------------------------
245
- # Ruby equivalent of the C-extension for parse_line
246
- #
247
- # parses a single line: either a CSV header and body line
248
- # - quoting rules compared to RFC-4180 are somewhat relaxed
249
- # - we are not assuming that quotes inside a fields need to be doubled
250
- # - we are not assuming that all fields need to be quoted (0 is even)
251
- # - works with multi-char col_sep
252
- # - if header_size is given, only up to header_size fields are parsed
253
- #
254
- # We use header_size for parsing the body lines to make sure we always match the number of headers
255
- # in case there are trailing col_sep characters in line
256
- #
257
- # Our convention is that empty fields are returned as empty strings, not as nil.
258
- #
259
- #
260
- # the purpose of the max_size parameter is to handle a corner case where
261
- # CSV lines contain more fields than the header.
262
- # In which case the remaining fields in the line are ignored
263
- #
264
- def parse_csv_line_ruby(line, options, header_size = nil)
265
- return [] if line.nil?
266
-
267
- line_size = line.size
268
- col_sep = options[:col_sep]
269
- col_sep_size = col_sep.size
270
- quote = options[:quote_char]
271
- quote_count = 0
272
- elements = []
273
- start = 0
274
- i = 0
275
-
276
- previous_char = ''
277
- while i < line_size
278
- if line[i...i+col_sep_size] == col_sep && quote_count.even?
279
- break if !header_size.nil? && elements.size >= header_size
280
-
281
- elements << cleanup_quotes(line[start...i], quote)
282
- previous_char = line[i]
283
- i += col_sep.size
284
- start = i
285
- else
286
- quote_count += 1 if line[i] == quote && previous_char != '\\'
287
- previous_char = line[i]
288
- i += 1
195
+ # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
196
+ def limit_execution_for_only_or_except(options, option_name, key)
197
+ if options[option_name].is_a?(Hash)
198
+ if options[option_name].has_key?(:except)
199
+ return true if Array(options[option_name][:except]).include?(key)
200
+ elsif options[option_name].has_key?(:only)
201
+ return true unless Array(options[option_name][:only]).include?(key)
289
202
  end
290
203
  end
291
- elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
292
- [elements, elements.size]
293
- end
294
-
295
- def cleanup_quotes(field, quote)
296
- return field if field.nil?
297
-
298
- # return if field !~ /#{quote}/ # this check can probably eliminated
299
-
300
- if field.start_with?(quote) && field.end_with?(quote)
301
- field.delete_prefix!(quote)
302
- field.delete_suffix!(quote)
303
- end
304
- field.gsub!("#{quote}#{quote}", quote)
305
- field
204
+ false
306
205
  end
307
206
 
308
207
  # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
@@ -340,217 +239,5 @@ module SmarterCSV
340
239
  false
341
240
  end
342
241
  end
343
-
344
- # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
345
- def only_or_except_limit_execution(options, option_name, key)
346
- if options[option_name].is_a?(Hash)
347
- if options[option_name].has_key?(:except)
348
- return true if Array(options[option_name][:except]).include?(key)
349
- elsif options[option_name].has_key?(:only)
350
- return true unless Array(options[option_name][:only]).include?(key)
351
- end
352
- end
353
- false
354
- end
355
-
356
- # If file has headers, then guesses column separator from headers.
357
- # Otherwise guesses column separator from contents.
358
- # Raises exception if none is found.
359
- def guess_column_separator(filehandle, options)
360
- skip_lines(filehandle, options)
361
-
362
- delimiters = [',', "\t", ';', ':', '|']
363
-
364
- line = nil
365
- has_header = options[:headers_in_file]
366
- candidates = Hash.new(0)
367
- count = has_header ? 1 : 5
368
- count.times do
369
- line = readline_with_counts(filehandle, options)
370
- delimiters.each do |d|
371
- candidates[d] += line.scan(d).count
372
- end
373
- rescue EOFError # short files
374
- break
375
- end
376
- rewind(filehandle)
377
-
378
- if candidates.values.max == 0
379
- # if the header only contains
380
- return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
381
-
382
- raise SmarterCSV::NoColSepDetected
383
- end
384
-
385
- candidates.key(candidates.values.max)
386
- end
387
-
388
- # limitation: this currently reads the whole file in before making a decision
389
- def guess_line_ending(filehandle, options)
390
- counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
391
- quoted_char = false
392
-
393
- # count how many of the pre-defined line-endings we find
394
- # ignoring those contained within quote characters
395
- last_char = nil
396
- lines = 0
397
- filehandle.each_char do |c|
398
- quoted_char = !quoted_char if c == options[:quote_char]
399
- next if quoted_char
400
-
401
- if last_char == "\r"
402
- if c == "\n"
403
- counts["\r\n"] += 1
404
- else
405
- counts["\r"] += 1 # \r are counted after they appeared
406
- end
407
- elsif c == "\n"
408
- counts["\n"] += 1
409
- end
410
- last_char = c
411
- lines += 1
412
- break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
413
- end
414
- rewind(filehandle)
415
-
416
- counts["\r"] += 1 if last_char == "\r"
417
- # find the most frequent key/value pair:
418
- most_frequent_key, _count = counts.max_by{|_, v| v}
419
- most_frequent_key
420
- end
421
-
422
- def process_headers(filehandle, options)
423
- @raw_header = nil
424
- @headers = nil
425
- if options[:headers_in_file] # extract the header line
426
- # process the header line in the CSV file..
427
- # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
428
- header = readline_with_counts(filehandle, options)
429
- @raw_header = header
430
-
431
- header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
432
- header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
433
- header = header.chomp(options[:row_sep])
434
-
435
- header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
436
-
437
- file_headerA, file_header_size = parse(header, options)
438
-
439
- file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
440
- file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
441
-
442
- unless options[:keep_original_headers]
443
- file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
444
- file_headerA.map!{|x| x.downcase} if options[:downcase_header]
445
- end
446
- else
447
- raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
448
- end
449
- if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
450
- # use user-provided headers
451
- headerA = options[:user_provided_headers]
452
- if defined?(file_header_size) && !file_header_size.nil?
453
- if headerA.size != file_header_size
454
- raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
455
- else
456
- # we could print out the mapping of file_headerA to headerA here
457
- end
458
- end
459
- else
460
- headerA = file_headerA
461
- end
462
-
463
- # detect duplicate headers and disambiguate
464
- headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
465
- header_size = headerA.size # used for splitting lines
466
-
467
- headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
468
-
469
- unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
470
- key_mappingH = options[:key_mapping]
471
-
472
- # do some key mapping on the keys in the file header
473
- # if you want to completely delete a key, then map it to nil or to ''
474
- if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
475
- # if silence_missing_keys are not set, raise error if missing header
476
- missing_keys = key_mappingH.keys - headerA
477
- # if the user passes a list of speciffic mapped keys that are optional
478
- missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
479
-
480
- unless missing_keys.empty? || options[:silence_missing_keys] == true
481
- raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
482
- end
483
-
484
- headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
485
- end
486
- end
487
-
488
- # header_validations
489
- duplicate_headers = []
490
- headerA.compact.each do |k|
491
- duplicate_headers << k if headerA.select{|x| x == k}.size > 1
492
- end
493
-
494
- unless options[:user_provided_headers] || duplicate_headers.empty?
495
- raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
496
- end
497
-
498
- # deprecate required_headers
499
- unless options[:required_headers].nil?
500
- puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
501
- if options[:required_keys].nil?
502
- options[:required_keys] = options[:required_headers]
503
- options[:required_headers] = nil
504
- end
505
- end
506
-
507
- if options[:required_keys] && options[:required_keys].is_a?(Array)
508
- missing_keys = []
509
- options[:required_keys].each do |k|
510
- missing_keys << k unless headerA.include?(k)
511
- end
512
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
513
- end
514
-
515
- @headers = headerA
516
- [headerA, header_size]
517
- end
518
-
519
- def process_duplicate_headers(headers, options)
520
- counts = Hash.new(0)
521
- result = []
522
- headers.each do |key|
523
- counts[key] += 1
524
- if counts[key] == 1
525
- result << key
526
- else
527
- result << [key, options[:duplicate_header_suffix], counts[key]].join
528
- end
529
- end
530
- result
531
- end
532
-
533
- private
534
-
535
- UTF_32_BOM = %w[0 0 fe ff].freeze
536
- UTF_32LE_BOM = %w[ff fe 0 0].freeze
537
- UTF_8_BOM = %w[ef bb bf].freeze
538
- UTF_16_BOM = %w[fe ff].freeze
539
- UTF_16LE_BOM = %w[ff fe].freeze
540
-
541
- def remove_bom(str)
542
- str_as_hex = str.bytes.map{|x| x.to_s(16)}
543
- # if string does not start with one of the bytes, there is no BOM
544
- return str unless %w[ef fe ff 0].include?(str_as_hex[0])
545
-
546
- return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
547
- return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
548
- return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
549
-
550
- # :nocov:
551
- puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
552
- str
553
- # :nocov:
554
- end
555
242
  end
556
243
  end
@@ -0,0 +1,26 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ attr_reader :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
+
7
+ def initialize_variables
8
+ @csv_line_count = 0
9
+ @chunk_count = 0
10
+ @errors = {}
11
+ @file_line_count = 0
12
+ @headerA = []
13
+ @headers = nil
14
+ @raw_header = nil # header as it appears in the file
15
+ @result = []
16
+ @warnings = {}
17
+ end
18
+
19
+ # :nocov:
20
+ def headerA
21
+ warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
22
+ @headerA
23
+ end
24
+ # :nocov:
25
+ end
26
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.9.2"
4
+ VERSION = "1.9.3"
5
5
  end
data/lib/smarter_csv.rb CHANGED
@@ -1,9 +1,12 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require "core_ext/hash"
4
-
5
3
  require "smarter_csv/version"
4
+ require "smarter_csv/file_io"
6
5
  require "smarter_csv/options_processing"
6
+ require "smarter_csv/auto_detection"
7
+ require "smarter_csv/variables"
8
+ require "smarter_csv/headers"
9
+ require "smarter_csv/parse"
7
10
 
8
11
  case RUBY_ENGINE
9
12
  when 'ruby'
@@ -11,9 +14,11 @@ when 'ruby'
11
14
  if `uname -s`.chomp == 'Darwin'
12
15
  require 'smarter_csv/smarter_csv.bundle'
13
16
  else
17
+ # :nocov:
14
18
  require_relative "smarter_csv/smarter_csv"
19
+ # :nocov:
15
20
  end
16
- rescue Exception
21
+ rescue Exception # rubocop:disable Lint/RescueException
17
22
  # require_relative 'smarter_csv/smarter_csv'
18
23
  end
19
24
  # :nocov:
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.9.2
4
+ version: 1.9.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-11-12 00:00:00.000000000 Z
11
+ date: 2023-12-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: awesome_print
@@ -115,10 +115,14 @@ files:
115
115
  - TO_DO_v2.md
116
116
  - ext/smarter_csv/extconf.rb
117
117
  - ext/smarter_csv/smarter_csv.c
118
- - lib/core_ext/hash.rb
119
118
  - lib/smarter_csv.rb
119
+ - lib/smarter_csv/auto_detection.rb
120
+ - lib/smarter_csv/file_io.rb
121
+ - lib/smarter_csv/headers.rb
120
122
  - lib/smarter_csv/options_processing.rb
123
+ - lib/smarter_csv/parse.rb
121
124
  - lib/smarter_csv/smarter_csv.rb
125
+ - lib/smarter_csv/variables.rb
122
126
  - lib/smarter_csv/version.rb
123
127
  - smarter_csv.gemspec
124
128
  homepage: https://github.com/tilo/smarter_csv
data/lib/core_ext/hash.rb DELETED
@@ -1,9 +0,0 @@
1
- # frozen_string_literal: true
2
-
3
- # the following extension for class Hash is needed (from Facets of Ruby library):
4
-
5
- class Hash
6
- def self.zip(keys, values) # from Facets of Ruby library
7
- keys.zip(values).to_h
8
- end
9
- end