smarter_csv 1.9.2 → 1.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,65 +12,81 @@ module SmarterCSV
12
12
 
13
13
  # first parameter: filename or input object which responds to readline method
14
14
  def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
+ initialize_variables
16
+
15
17
  options = process_options(given_options)
16
18
 
17
- headerA = []
18
- result = []
19
- @file_line_count = 0
20
- @csv_line_count = 0
21
- has_rails = !!defined?(Rails)
19
+ @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
20
+ @verbose = options[:verbose]
21
+
22
22
  begin
23
23
  fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
24
24
 
25
+ if @enforce_utf8 && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
26
+ puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
27
+ end
28
+
25
29
  # auto-detect the row separator
26
30
  options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
27
31
  # attempt to auto-detect column separator
28
32
  options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
29
33
 
30
- if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
31
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
32
- end
33
-
34
34
  skip_lines(fh, options)
35
35
 
36
- headerA, header_size = process_headers(fh, options)
36
+ @headers, header_size = process_headers(fh, options)
37
+ @headerA = @headers # @headerA is deprecated, use @headers
38
+
39
+ puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
40
+
41
+ header_validations(@headers, options)
37
42
 
38
43
  # in case we use chunking.. we'll need to set it up..
39
- if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
44
+ if options[:chunk_size].to_i > 0
40
45
  use_chunks = true
41
46
  chunk_size = options[:chunk_size].to_i
42
- chunk_count = 0
47
+ @chunk_count = 0
43
48
  chunk = []
44
49
  else
45
50
  use_chunks = false
46
51
  end
47
52
 
48
53
  # now on to processing all the rest of the lines in the CSV file:
54
+ # fh.each_line |line|
49
55
  until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
50
56
  line = readline_with_counts(fh, options)
51
57
 
52
58
  # replace invalid byte sequence in UTF-8 with question mark to avoid errors
53
- line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
59
+ line = enforce_utf8_encoding(line, options) if @enforce_utf8
54
60
 
55
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]
61
+ print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
56
62
 
57
63
  next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
58
64
 
59
65
  # cater for the quoted csv data containing the row separator carriage return character
60
66
  # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
61
67
  # by detecting the existence of an uneven number of quote characters
68
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
62
69
 
63
- multiline = count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
64
- while count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
70
+ while multiline
65
71
  next_line = fh.readline(options[:row_sep])
66
- next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
72
+ next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
67
73
  line += next_line
68
74
  @file_line_count += 1
75
+
76
+ break if fh.eof? # Exit loop if end of file is reached
77
+
78
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
69
79
  end
70
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline
80
+
81
+ # :nocov:
82
+ if multiline && @verbose
83
+ print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
84
+ end
85
+ # :nocov:
71
86
 
72
87
  line.chomp!(options[:row_sep])
73
88
 
89
+ # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
74
90
  dataA, _data_size = parse(line, options, header_size)
75
91
 
76
92
  dataA.map!{|x| x.strip} if options[:strip_whitespace]
@@ -78,48 +94,25 @@ module SmarterCSV
78
94
  # if all values are blank, then ignore this line
79
95
  next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
80
96
 
81
- hash = Hash.zip(headerA, dataA) # from Facets of Ruby library
97
+ # --- HASH TRANSFORMATIONS ------------------------------------------------------------
98
+ hash = @headers.zip(dataA).to_h
82
99
 
83
- # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
84
- hash.delete(nil)
85
- hash.delete('')
86
- hash.delete(:"")
87
-
88
- if options[:remove_empty_values] == true
89
- hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
90
- end
100
+ hash = hash_transformations(hash, options)
91
101
 
92
- hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
93
- hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]
94
-
95
- if options[:convert_values_to_numeric]
96
- hash.each do |k, v|
97
- # deal with the :only / :except options to :convert_values_to_numeric
98
- next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
99
-
100
- # convert if it's a numeric value:
101
- case v
102
- when /^[+-]?\d+\.\d+$/
103
- hash[k] = v.to_f
104
- when /^[+-]?\d+$/
105
- hash[k] = v.to_i
106
- end
107
- end
108
- end
109
-
110
- if options[:value_converters]
111
- hash.each do |k, v|
112
- converter = options[:value_converters][k]
113
- next unless converter
114
-
115
- hash[k] = converter.convert(v)
116
- end
117
- end
102
+ # --- HASH VALIDATIONS ----------------------------------------------------------------
103
+ # will go here, and be able to:
104
+ # - validate correct format of the values for fields
105
+ # - required fields to be non-empty
106
+ # - ...
107
+ # -------------------------------------------------------------------------------------
118
108
 
119
109
  next if options[:remove_empty_hashes] && hash.empty?
120
110
 
111
+ puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
112
+ # optional adding of csv_line_number to the hash to help debugging
121
113
  hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
122
114
 
115
+ # process the chunks or the resulting hash
123
116
  if use_chunks
124
117
  chunk << hash # append temp result to chunk
125
118
 
@@ -128,183 +121,74 @@ module SmarterCSV
128
121
  if block_given?
129
122
  yield chunk # do something with the hashes in the chunk in the block
130
123
  else
131
- result << chunk # not sure yet, why anybody would want to do this without a block
124
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
132
125
  end
133
- chunk_count += 1
134
- chunk = [] # initialize for next chunk of data
126
+ @chunk_count += 1
127
+ chunk.clear # re-initialize for next chunk of data
135
128
  else
136
-
137
- # the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)
138
-
129
+ # the last chunk may contain partial data, which is handled below
139
130
  end
140
-
141
131
  # while a chunk is being filled up we don't need to do anything else here
142
132
 
143
133
  else # no chunk handling
144
134
  if block_given?
145
135
  yield [hash] # do something with the hash in the block (better to use chunking here)
146
136
  else
147
- result << hash
137
+ @result << hash
148
138
  end
149
139
  end
150
140
  end
151
141
 
152
142
  # print new line to retain last processing line message
153
- print "\n" if options[:verbose]
143
+ print "\n" if @verbose
154
144
 
155
- # last chunk:
145
+ # handling of last chunk:
156
146
  if !chunk.nil? && chunk.size > 0
157
147
  # do something with the chunk
158
148
  if block_given?
159
149
  yield chunk # do something with the hashes in the chunk in the block
160
150
  else
161
- result << chunk # not sure yet, why anybody would want to do this without a block
151
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
162
152
  end
163
- chunk_count += 1
153
+ @chunk_count += 1
164
154
  # chunk = [] # initialize for next chunk of data
165
155
  end
166
156
  ensure
167
157
  fh.close if fh.respond_to?(:close)
168
158
  end
159
+
169
160
  if block_given?
170
- chunk_count # when we do processing through a block we only care how many chunks we processed
161
+ @chunk_count # when we do processing through a block we only care how many chunks we processed
171
162
  else
172
- result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
163
+ @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
173
164
  end
174
165
  end
175
166
 
176
167
  class << self
177
- def has_acceleration?
178
- @has_acceleration ||= !!defined?(parse_csv_line_c)
179
- end
180
-
181
- def raw_header
182
- @raw_header
183
- end
184
-
185
- def headers
186
- @headers
187
- end
188
-
189
- # * the `scan` method iterates through the string and finds all occurrences of the pattern
190
- # * The regular expression:
191
- # - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
192
- # - (?:\\\\)* : Non-capturing group for an even number of backslashes (escaped backslashes).
193
- # This allows for any number of escaped backslashes before the quote character.
194
- # - #{Regexp.escape(quote_char)} : Dynamically inserts the quote_char into the regex,
195
- # ensuring it's properly escaped for use in the regex.
196
- #
197
168
  def count_quote_chars(line, quote_char)
198
- line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
199
- end
200
-
201
- protected
202
-
203
- def readline_with_counts(filehandle, options)
204
- line = filehandle.readline(options[:row_sep])
205
- @file_line_count += 1
206
- @csv_line_count += 1
207
- line = remove_bom(line) if @csv_line_count == 1
208
- line
209
- end
169
+ return 0 if line.nil? || quote_char.nil? || quote_char.empty?
210
170
 
211
- def skip_lines(filehandle, options)
212
- return unless options[:skip_lines].to_i > 0
171
+ count = 0
172
+ escaped = false
213
173
 
214
- options[:skip_lines].to_i.times do
215
- readline_with_counts(filehandle, options)
216
- end
217
- end
218
-
219
- def rewind(filehandle)
220
- @file_line_count = 0
221
- @csv_line_count = 0
222
- filehandle.rewind
223
- end
224
-
225
- ###
226
- ### Thin wrapper around C-extension
227
- ###
228
- def parse(line, options, header_size = nil)
229
- # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
230
-
231
- if options[:acceleration] && has_acceleration?
232
- # :nocov:
233
- has_quotes = line =~ /#{options[:quote_char]}/
234
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
235
- elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
236
- [elements, elements.size]
237
- # :nocov:
238
- else
239
- # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
240
- parse_csv_line_ruby(line, options, header_size)
241
- end
242
- end
243
-
244
- # ------------------------------------------------------------------
245
- # Ruby equivalent of the C-extension for parse_line
246
- #
247
- # parses a single line: either a CSV header and body line
248
- # - quoting rules compared to RFC-4180 are somewhat relaxed
249
- # - we are not assuming that quotes inside a fields need to be doubled
250
- # - we are not assuming that all fields need to be quoted (0 is even)
251
- # - works with multi-char col_sep
252
- # - if header_size is given, only up to header_size fields are parsed
253
- #
254
- # We use header_size for parsing the body lines to make sure we always match the number of headers
255
- # in case there are trailing col_sep characters in line
256
- #
257
- # Our convention is that empty fields are returned as empty strings, not as nil.
258
- #
259
- #
260
- # the purpose of the max_size parameter is to handle a corner case where
261
- # CSV lines contain more fields than the header.
262
- # In which case the remaining fields in the line are ignored
263
- #
264
- def parse_csv_line_ruby(line, options, header_size = nil)
265
- return [] if line.nil?
266
-
267
- line_size = line.size
268
- col_sep = options[:col_sep]
269
- col_sep_size = col_sep.size
270
- quote = options[:quote_char]
271
- quote_count = 0
272
- elements = []
273
- start = 0
274
- i = 0
275
-
276
- previous_char = ''
277
- while i < line_size
278
- if line[i...i+col_sep_size] == col_sep && quote_count.even?
279
- break if !header_size.nil? && elements.size >= header_size
280
-
281
- elements << cleanup_quotes(line[start...i], quote)
282
- previous_char = line[i]
283
- i += col_sep.size
284
- start = i
174
+ line.each_char do |char|
175
+ if char == '\\' && !escaped
176
+ escaped = true
285
177
  else
286
- quote_count += 1 if line[i] == quote && previous_char != '\\'
287
- previous_char = line[i]
288
- i += 1
178
+ count += 1 if char == quote_char && !escaped
179
+ escaped = false
289
180
  end
290
181
  end
291
- elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
292
- [elements, elements.size]
293
- end
294
-
295
- def cleanup_quotes(field, quote)
296
- return field if field.nil?
297
182
 
298
- # return if field !~ /#{quote}/ # this check can probably eliminated
183
+ count
184
+ end
299
185
 
300
- if field.start_with?(quote) && field.end_with?(quote)
301
- field.delete_prefix!(quote)
302
- field.delete_suffix!(quote)
303
- end
304
- field.gsub!("#{quote}#{quote}", quote)
305
- field
186
+ def has_acceleration?
187
+ @has_acceleration ||= !!defined?(parse_csv_line_c)
306
188
  end
307
189
 
190
+ protected
191
+
308
192
  # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
309
193
  # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
310
194
  BLANK_RE = /\A\s*\z/.freeze
@@ -312,245 +196,24 @@ module SmarterCSV
312
196
  def blank?(value)
313
197
  case value
314
198
  when String
315
- value.empty? || BLANK_RE.match?(value)
316
-
199
+ BLANK_RE.match?(value)
317
200
  when NilClass
318
201
  true
319
-
320
202
  when Array
321
- value.empty? || value.inject(true){|result, x| result && elem_blank?(x)}
322
-
203
+ value.all? { |elem| blank?(elem) }
323
204
  when Hash
324
- value.empty? || value.values.inject(true){|result, x| result && elem_blank?(x)}
325
-
205
+ value.values.all? { |elem| blank?(elem) } # Focus on values only
326
206
  else
327
207
  false
328
208
  end
329
209
  end
330
210
 
331
- def elem_blank?(value)
332
- case value
333
- when String
334
- value.empty? || BLANK_RE.match?(value)
335
-
336
- when NilClass
337
- true
338
-
339
- else
340
- false
341
- end
342
- end
343
-
344
- # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
345
- def only_or_except_limit_execution(options, option_name, key)
346
- if options[option_name].is_a?(Hash)
347
- if options[option_name].has_key?(:except)
348
- return true if Array(options[option_name][:except]).include?(key)
349
- elsif options[option_name].has_key?(:only)
350
- return true unless Array(options[option_name][:only]).include?(key)
351
- end
352
- end
353
- false
354
- end
355
-
356
- # If file has headers, then guesses column separator from headers.
357
- # Otherwise guesses column separator from contents.
358
- # Raises exception if none is found.
359
- def guess_column_separator(filehandle, options)
360
- skip_lines(filehandle, options)
361
-
362
- delimiters = [',', "\t", ';', ':', '|']
363
-
364
- line = nil
365
- has_header = options[:headers_in_file]
366
- candidates = Hash.new(0)
367
- count = has_header ? 1 : 5
368
- count.times do
369
- line = readline_with_counts(filehandle, options)
370
- delimiters.each do |d|
371
- candidates[d] += line.scan(d).count
372
- end
373
- rescue EOFError # short files
374
- break
375
- end
376
- rewind(filehandle)
377
-
378
- if candidates.values.max == 0
379
- # if the header only contains a single word (no delimiter was found at all), fall back to ','
380
- return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
381
-
382
- raise SmarterCSV::NoColSepDetected
383
- end
384
-
385
- candidates.key(candidates.values.max)
386
- end
387
-
388
- # limitation: this currently reads the whole file in before making a decision
389
- def guess_line_ending(filehandle, options)
390
- counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
391
- quoted_char = false
392
-
393
- # count how many of the pre-defined line-endings we find
394
- # ignoring those contained within quote characters
395
- last_char = nil
396
- lines = 0
397
- filehandle.each_char do |c|
398
- quoted_char = !quoted_char if c == options[:quote_char]
399
- next if quoted_char
400
-
401
- if last_char == "\r"
402
- if c == "\n"
403
- counts["\r\n"] += 1
404
- else
405
- counts["\r"] += 1 # \r are counted after they appeared
406
- end
407
- elsif c == "\n"
408
- counts["\n"] += 1
409
- end
410
- last_char = c
411
- lines += 1
412
- break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
413
- end
414
- rewind(filehandle)
415
-
416
- counts["\r"] += 1 if last_char == "\r"
417
- # find the most frequent key/value pair:
418
- most_frequent_key, _count = counts.max_by{|_, v| v}
419
- most_frequent_key
420
- end
421
-
422
- def process_headers(filehandle, options)
423
- @raw_header = nil
424
- @headers = nil
425
- if options[:headers_in_file] # extract the header line
426
- # process the header line in the CSV file..
427
- # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
428
- header = readline_with_counts(filehandle, options)
429
- @raw_header = header
430
-
431
- header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
432
- header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
433
- header = header.chomp(options[:row_sep])
434
-
435
- header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
436
-
437
- file_headerA, file_header_size = parse(header, options)
438
-
439
- file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
440
- file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
441
-
442
- unless options[:keep_original_headers]
443
- file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
444
- file_headerA.map!{|x| x.downcase} if options[:downcase_header]
445
- end
446
- else
447
- raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
448
- end
449
- if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
450
- # use user-provided headers
451
- headerA = options[:user_provided_headers]
452
- if defined?(file_header_size) && !file_header_size.nil?
453
- if headerA.size != file_header_size
454
- raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
455
- else
456
- # we could print out the mapping of file_headerA to headerA here
457
- end
458
- end
459
- else
460
- headerA = file_headerA
461
- end
462
-
463
- # detect duplicate headers and disambiguate
464
- headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
465
- header_size = headerA.size # used for splitting lines
466
-
467
- headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
468
-
469
- unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
470
- key_mappingH = options[:key_mapping]
471
-
472
- # do some key mapping on the keys in the file header
473
- # if you want to completely delete a key, then map it to nil or to ''
474
- if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
475
- # if silence_missing_keys are not set, raise error if missing header
476
- missing_keys = key_mappingH.keys - headerA
477
- # if the user passes a list of specific mapped keys that are optional
478
- missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
479
-
480
- unless missing_keys.empty? || options[:silence_missing_keys] == true
481
- raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
482
- end
483
-
484
- headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
485
- end
486
- end
487
-
488
- # header_validations
489
- duplicate_headers = []
490
- headerA.compact.each do |k|
491
- duplicate_headers << k if headerA.select{|x| x == k}.size > 1
492
- end
493
-
494
- unless options[:user_provided_headers] || duplicate_headers.empty?
495
- raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
496
- end
497
-
498
- # deprecate required_headers
499
- unless options[:required_headers].nil?
500
- puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
501
- if options[:required_keys].nil?
502
- options[:required_keys] = options[:required_headers]
503
- options[:required_headers] = nil
504
- end
505
- end
506
-
507
- if options[:required_keys] && options[:required_keys].is_a?(Array)
508
- missing_keys = []
509
- options[:required_keys].each do |k|
510
- missing_keys << k unless headerA.include?(k)
511
- end
512
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
513
- end
514
-
515
- @headers = headerA
516
- [headerA, header_size]
517
- end
518
-
519
- def process_duplicate_headers(headers, options)
520
- counts = Hash.new(0)
521
- result = []
522
- headers.each do |key|
523
- counts[key] += 1
524
- if counts[key] == 1
525
- result << key
526
- else
527
- result << [key, options[:duplicate_header_suffix], counts[key]].join
528
- end
529
- end
530
- result
531
- end
532
-
533
211
  private
534
212
 
535
- UTF_32_BOM = %w[0 0 fe ff].freeze
536
- UTF_32LE_BOM = %w[ff fe 0 0].freeze
537
- UTF_8_BOM = %w[ef bb bf].freeze
538
- UTF_16_BOM = %w[fe ff].freeze
539
- UTF_16LE_BOM = %w[ff fe].freeze
540
-
541
- def remove_bom(str)
542
- str_as_hex = str.bytes.map{|x| x.to_s(16)}
543
- # if string does not start with one of the bytes, there is no BOM
544
- return str unless %w[ef fe ff 0].include?(str_as_hex[0])
545
-
546
- return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
547
- return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
548
- return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
549
-
550
- # :nocov:
551
- puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
552
- str
553
- # :nocov:
213
+ def enforce_utf8_encoding(line, options)
214
+ # return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
215
+
216
+ line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
554
217
  end
555
218
  end
556
219
  end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class << self
5
+ attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
6
+
7
+ def initialize_variables
8
+ @has_rails = !!defined?(Rails)
9
+ @csv_line_count = 0
10
+ @chunk_count = 0
11
+ @errors = {}
12
+ @file_line_count = 0
13
+ @headerA = []
14
+ @headers = nil
15
+ @raw_header = nil # header as it appears in the file
16
+ @result = []
17
+ @warnings = {}
18
+ @enforce_utf8 = false # only set to true if needed (after options parsing)
19
+ end
20
+
21
+ # :nocov:
22
+ # rubocop:disable Naming/MethodName
23
+ def headerA
24
+ warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
25
+ @headerA
26
+ end
27
+ # rubocop:enable Naming/MethodName
28
+ # :nocov:
29
+ end
30
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.9.2"
4
+ VERSION = "1.10.0"
5
5
  end