smarter_csv 1.9.2 → 1.10.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -12,65 +12,81 @@ module SmarterCSV
12
12
 
13
13
  # first parameter: filename or input object which responds to readline method
14
14
  def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
+ initialize_variables
16
+
15
17
  options = process_options(given_options)
16
18
 
17
- headerA = []
18
- result = []
19
- @file_line_count = 0
20
- @csv_line_count = 0
21
- has_rails = !!defined?(Rails)
19
+ @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
20
+ @verbose = options[:verbose]
21
+
22
22
  begin
23
23
  fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
24
24
 
25
+ if @enforce_utf8 && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
26
+ puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
27
+ end
28
+
25
29
  # auto-detect the row separator
26
30
  options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
27
31
  # attempt to auto-detect column separator
28
32
  options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
29
33
 
30
- if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
31
- puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
32
- end
33
-
34
34
  skip_lines(fh, options)
35
35
 
36
- headerA, header_size = process_headers(fh, options)
36
+ @headers, header_size = process_headers(fh, options)
37
+ @headerA = @headers # @headerA is deprecated, use @headers
38
+
39
+ puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
40
+
41
+ header_validations(@headers, options)
37
42
 
38
43
  # in case we use chunking.. we'll need to set it up..
39
- if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
44
+ if options[:chunk_size].to_i > 0
40
45
  use_chunks = true
41
46
  chunk_size = options[:chunk_size].to_i
42
- chunk_count = 0
47
+ @chunk_count = 0
43
48
  chunk = []
44
49
  else
45
50
  use_chunks = false
46
51
  end
47
52
 
48
53
  # now on to processing all the rest of the lines in the CSV file:
54
+ # fh.each_line |line|
49
55
  until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
50
56
  line = readline_with_counts(fh, options)
51
57
 
52
58
  # replace invalid byte sequence in UTF-8 with question mark to avoid errors
53
- line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
59
+ line = enforce_utf8_encoding(line, options) if @enforce_utf8
54
60
 
55
- print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]
61
+ print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
56
62
 
57
63
  next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
58
64
 
59
65
  # cater for the quoted csv data containing the row separator carriage return character
60
66
  # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
61
67
  # by detecting the existence of an uneven number of quote characters
68
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
62
69
 
63
- multiline = count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
64
- while count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
70
+ while multiline
65
71
  next_line = fh.readline(options[:row_sep])
66
- next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
72
+ next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
67
73
  line += next_line
68
74
  @file_line_count += 1
75
+
76
+ break if fh.eof? # Exit loop if end of file is reached
77
+
78
+ multiline = count_quote_chars(line, options[:quote_char]).odd?
69
79
  end
70
- print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline
80
+
81
+ # :nocov:
82
+ if multiline && @verbose
83
+ print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
84
+ end
85
+ # :nocov:
71
86
 
72
87
  line.chomp!(options[:row_sep])
73
88
 
89
+ # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
74
90
  dataA, _data_size = parse(line, options, header_size)
75
91
 
76
92
  dataA.map!{|x| x.strip} if options[:strip_whitespace]
@@ -78,48 +94,25 @@ module SmarterCSV
78
94
  # if all values are blank, then ignore this line
79
95
  next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
80
96
 
81
- hash = Hash.zip(headerA, dataA) # from Facets of Ruby library
97
+ # --- HASH TRANSFORMATIONS ------------------------------------------------------------
98
+ hash = @headers.zip(dataA).to_h
82
99
 
83
- # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
84
- hash.delete(nil)
85
- hash.delete('')
86
- hash.delete(:"")
87
-
88
- if options[:remove_empty_values] == true
89
- hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
90
- end
100
+ hash = hash_transformations(hash, options)
91
101
 
92
- hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
93
- hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]
94
-
95
- if options[:convert_values_to_numeric]
96
- hash.each do |k, v|
97
- # deal with the :only / :except options to :convert_values_to_numeric
98
- next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
99
-
100
- # convert if it's a numeric value:
101
- case v
102
- when /^[+-]?\d+\.\d+$/
103
- hash[k] = v.to_f
104
- when /^[+-]?\d+$/
105
- hash[k] = v.to_i
106
- end
107
- end
108
- end
109
-
110
- if options[:value_converters]
111
- hash.each do |k, v|
112
- converter = options[:value_converters][k]
113
- next unless converter
114
-
115
- hash[k] = converter.convert(v)
116
- end
117
- end
102
+ # --- HASH VALIDATIONS ----------------------------------------------------------------
103
+ # will go here, and be able to:
104
+ # - validate correct format of the values for fields
105
+ # - required fields to be non-empty
106
+ # - ...
107
+ # -------------------------------------------------------------------------------------
118
108
 
119
109
  next if options[:remove_empty_hashes] && hash.empty?
120
110
 
111
+ puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
112
+ # optional adding of csv_line_number to the hash to help debugging
121
113
  hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
122
114
 
115
+ # process the chunks or the resulting hash
123
116
  if use_chunks
124
117
  chunk << hash # append temp result to chunk
125
118
 
@@ -128,183 +121,74 @@ module SmarterCSV
128
121
  if block_given?
129
122
  yield chunk # do something with the hashes in the chunk in the block
130
123
  else
131
- result << chunk # not sure yet, why anybody would want to do this without a block
124
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
132
125
  end
133
- chunk_count += 1
134
- chunk = [] # initialize for next chunk of data
126
+ @chunk_count += 1
127
+ chunk.clear # re-initialize for next chunk of data
135
128
  else
136
-
137
- # the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)
138
-
129
+ # the last chunk may contain partial data, which is handled below
139
130
  end
140
-
141
131
  # while a chunk is being filled up we don't need to do anything else here
142
132
 
143
133
  else # no chunk handling
144
134
  if block_given?
145
135
  yield [hash] # do something with the hash in the block (better to use chunking here)
146
136
  else
147
- result << hash
137
+ @result << hash
148
138
  end
149
139
  end
150
140
  end
151
141
 
152
142
  # print new line to retain last processing line message
153
- print "\n" if options[:verbose]
143
+ print "\n" if @verbose
154
144
 
155
- # last chunk:
145
+ # handling of last chunk:
156
146
  if !chunk.nil? && chunk.size > 0
157
147
  # do something with the chunk
158
148
  if block_given?
159
149
  yield chunk # do something with the hashes in the chunk in the block
160
150
  else
161
- result << chunk # not sure yet, why anybody would want to do this without a block
151
+ @result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
162
152
  end
163
- chunk_count += 1
153
+ @chunk_count += 1
164
154
  # chunk = [] # initialize for next chunk of data
165
155
  end
166
156
  ensure
167
157
  fh.close if fh.respond_to?(:close)
168
158
  end
159
+
169
160
  if block_given?
170
- chunk_count # when we do processing through a block we only care how many chunks we processed
161
+ @chunk_count # when we do processing through a block we only care how many chunks we processed
171
162
  else
172
- result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
163
+ @result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
173
164
  end
174
165
  end
175
166
 
176
167
  class << self
177
- def has_acceleration?
178
- @has_acceleration ||= !!defined?(parse_csv_line_c)
179
- end
180
-
181
- def raw_header
182
- @raw_header
183
- end
184
-
185
- def headers
186
- @headers
187
- end
188
-
189
- # * the `scan` method iterates through the string and finds all occurrences of the pattern
190
- # * The reqular expression:
191
- # - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
192
- # - (?:\\\\)* : Non-capturing group for an even number of backslashes (escaped backslashes).
193
- # This allows for any number of escaped backslashes before the quote character.
194
- # - #{Regexp.escape(quote_char)} : Dynamically inserts the quote_char into the regex,
195
- # ensuring it's properly escaped for use in the regex.
196
- #
197
168
  def count_quote_chars(line, quote_char)
198
- line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
199
- end
200
-
201
- protected
202
-
203
- def readline_with_counts(filehandle, options)
204
- line = filehandle.readline(options[:row_sep])
205
- @file_line_count += 1
206
- @csv_line_count += 1
207
- line = remove_bom(line) if @csv_line_count == 1
208
- line
209
- end
169
+ return 0 if line.nil? || quote_char.nil? || quote_char.empty?
210
170
 
211
- def skip_lines(filehandle, options)
212
- return unless options[:skip_lines].to_i > 0
171
+ count = 0
172
+ escaped = false
213
173
 
214
- options[:skip_lines].to_i.times do
215
- readline_with_counts(filehandle, options)
216
- end
217
- end
218
-
219
- def rewind(filehandle)
220
- @file_line_count = 0
221
- @csv_line_count = 0
222
- filehandle.rewind
223
- end
224
-
225
- ###
226
- ### Thin wrapper around C-extension
227
- ###
228
- def parse(line, options, header_size = nil)
229
- # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
230
-
231
- if options[:acceleration] && has_acceleration?
232
- # :nocov:
233
- has_quotes = line =~ /#{options[:quote_char]}/
234
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
235
- elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
236
- [elements, elements.size]
237
- # :nocov:
238
- else
239
- # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
240
- parse_csv_line_ruby(line, options, header_size)
241
- end
242
- end
243
-
244
- # ------------------------------------------------------------------
245
- # Ruby equivalent of the C-extension for parse_line
246
- #
247
- # parses a single line: either a CSV header and body line
248
- # - quoting rules compared to RFC-4180 are somewhat relaxed
249
- # - we are not assuming that quotes inside a fields need to be doubled
250
- # - we are not assuming that all fields need to be quoted (0 is even)
251
- # - works with multi-char col_sep
252
- # - if header_size is given, only up to header_size fields are parsed
253
- #
254
- # We use header_size for parsing the body lines to make sure we always match the number of headers
255
- # in case there are trailing col_sep characters in line
256
- #
257
- # Our convention is that empty fields are returned as empty strings, not as nil.
258
- #
259
- #
260
- # the purpose of the max_size parameter is to handle a corner case where
261
- # CSV lines contain more fields than the header.
262
- # In which case the remaining fields in the line are ignored
263
- #
264
- def parse_csv_line_ruby(line, options, header_size = nil)
265
- return [] if line.nil?
266
-
267
- line_size = line.size
268
- col_sep = options[:col_sep]
269
- col_sep_size = col_sep.size
270
- quote = options[:quote_char]
271
- quote_count = 0
272
- elements = []
273
- start = 0
274
- i = 0
275
-
276
- previous_char = ''
277
- while i < line_size
278
- if line[i...i+col_sep_size] == col_sep && quote_count.even?
279
- break if !header_size.nil? && elements.size >= header_size
280
-
281
- elements << cleanup_quotes(line[start...i], quote)
282
- previous_char = line[i]
283
- i += col_sep.size
284
- start = i
174
+ line.each_char do |char|
175
+ if char == '\\' && !escaped
176
+ escaped = true
285
177
  else
286
- quote_count += 1 if line[i] == quote && previous_char != '\\'
287
- previous_char = line[i]
288
- i += 1
178
+ count += 1 if char == quote_char && !escaped
179
+ escaped = false
289
180
  end
290
181
  end
291
- elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
292
- [elements, elements.size]
293
- end
294
-
295
- def cleanup_quotes(field, quote)
296
- return field if field.nil?
297
182
 
298
- # return if field !~ /#{quote}/ # this check can probably eliminated
183
+ count
184
+ end
299
185
 
300
- if field.start_with?(quote) && field.end_with?(quote)
301
- field.delete_prefix!(quote)
302
- field.delete_suffix!(quote)
303
- end
304
- field.gsub!("#{quote}#{quote}", quote)
305
- field
186
# Reports whether a +parse_csv_line_c+ method is visible from here
# (presumably provided by the gem's C extension -- confirm against the
# extension source).
#
# A truthy answer is cached in @has_acceleration. Because +||=+ is used,
# a false answer is not cached and the check is repeated on each call.
#
# @return [Boolean]
def has_acceleration?
  @has_acceleration ||= !!defined?(parse_csv_line_c)
end
307
189
 
190
+ protected
191
+
308
192
  # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
309
193
  # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
310
194
  BLANK_RE = /\A\s*\z/.freeze
@@ -312,245 +196,24 @@ module SmarterCSV
312
196
  def blank?(value)
313
197
  case value
314
198
  when String
315
- value.empty? || BLANK_RE.match?(value)
316
-
199
+ BLANK_RE.match?(value)
317
200
  when NilClass
318
201
  true
319
-
320
202
  when Array
321
- value.empty? || value.inject(true){|result, x| result && elem_blank?(x)}
322
-
203
+ value.all? { |elem| blank?(elem) }
323
204
  when Hash
324
- value.empty? || value.values.inject(true){|result, x| result && elem_blank?(x)}
325
-
205
+ value.values.all? { |elem| blank?(elem) } # Focus on values only
326
206
  else
327
207
  false
328
208
  end
329
209
  end
330
210
 
331
- def elem_blank?(value)
332
- case value
333
- when String
334
- value.empty? || BLANK_RE.match?(value)
335
-
336
- when NilClass
337
- true
338
-
339
- else
340
- false
341
- end
342
- end
343
-
344
- # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
345
- def only_or_except_limit_execution(options, option_name, key)
346
- if options[option_name].is_a?(Hash)
347
- if options[option_name].has_key?(:except)
348
- return true if Array(options[option_name][:except]).include?(key)
349
- elsif options[option_name].has_key?(:only)
350
- return true unless Array(options[option_name][:only]).include?(key)
351
- end
352
- end
353
- false
354
- end
355
-
356
- # If file has headers, then guesses column separator from headers.
357
- # Otherwise guesses column separator from contents.
358
- # Raises exception if none is found.
359
- def guess_column_separator(filehandle, options)
360
- skip_lines(filehandle, options)
361
-
362
- delimiters = [',', "\t", ';', ':', '|']
363
-
364
- line = nil
365
- has_header = options[:headers_in_file]
366
- candidates = Hash.new(0)
367
- count = has_header ? 1 : 5
368
- count.times do
369
- line = readline_with_counts(filehandle, options)
370
- delimiters.each do |d|
371
- candidates[d] += line.scan(d).count
372
- end
373
- rescue EOFError # short files
374
- break
375
- end
376
- rewind(filehandle)
377
-
378
- if candidates.values.max == 0
379
- # if the header only contains
380
- return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
381
-
382
- raise SmarterCSV::NoColSepDetected
383
- end
384
-
385
- candidates.key(candidates.values.max)
386
- end
387
-
388
- # limitation: this currently reads the whole file in before making a decision
389
- def guess_line_ending(filehandle, options)
390
- counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
391
- quoted_char = false
392
-
393
- # count how many of the pre-defined line-endings we find
394
- # ignoring those contained within quote characters
395
- last_char = nil
396
- lines = 0
397
- filehandle.each_char do |c|
398
- quoted_char = !quoted_char if c == options[:quote_char]
399
- next if quoted_char
400
-
401
- if last_char == "\r"
402
- if c == "\n"
403
- counts["\r\n"] += 1
404
- else
405
- counts["\r"] += 1 # \r are counted after they appeared
406
- end
407
- elsif c == "\n"
408
- counts["\n"] += 1
409
- end
410
- last_char = c
411
- lines += 1
412
- break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
413
- end
414
- rewind(filehandle)
415
-
416
- counts["\r"] += 1 if last_char == "\r"
417
- # find the most frequent key/value pair:
418
- most_frequent_key, _count = counts.max_by{|_, v| v}
419
- most_frequent_key
420
- end
421
-
422
- def process_headers(filehandle, options)
423
- @raw_header = nil
424
- @headers = nil
425
- if options[:headers_in_file] # extract the header line
426
- # process the header line in the CSV file..
427
- # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
428
- header = readline_with_counts(filehandle, options)
429
- @raw_header = header
430
-
431
- header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
432
- header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
433
- header = header.chomp(options[:row_sep])
434
-
435
- header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
436
-
437
- file_headerA, file_header_size = parse(header, options)
438
-
439
- file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
440
- file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
441
-
442
- unless options[:keep_original_headers]
443
- file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
444
- file_headerA.map!{|x| x.downcase} if options[:downcase_header]
445
- end
446
- else
447
- raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
448
- end
449
- if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
450
- # use user-provided headers
451
- headerA = options[:user_provided_headers]
452
- if defined?(file_header_size) && !file_header_size.nil?
453
- if headerA.size != file_header_size
454
- raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
455
- else
456
- # we could print out the mapping of file_headerA to headerA here
457
- end
458
- end
459
- else
460
- headerA = file_headerA
461
- end
462
-
463
- # detect duplicate headers and disambiguate
464
- headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
465
- header_size = headerA.size # used for splitting lines
466
-
467
- headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
468
-
469
- unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
470
- key_mappingH = options[:key_mapping]
471
-
472
- # do some key mapping on the keys in the file header
473
- # if you want to completely delete a key, then map it to nil or to ''
474
- if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
475
- # if silence_missing_keys are not set, raise error if missing header
476
- missing_keys = key_mappingH.keys - headerA
477
- # if the user passes a list of speciffic mapped keys that are optional
478
- missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
479
-
480
- unless missing_keys.empty? || options[:silence_missing_keys] == true
481
- raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
482
- end
483
-
484
- headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
485
- end
486
- end
487
-
488
- # header_validations
489
- duplicate_headers = []
490
- headerA.compact.each do |k|
491
- duplicate_headers << k if headerA.select{|x| x == k}.size > 1
492
- end
493
-
494
- unless options[:user_provided_headers] || duplicate_headers.empty?
495
- raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
496
- end
497
-
498
- # deprecate required_headers
499
- unless options[:required_headers].nil?
500
- puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
501
- if options[:required_keys].nil?
502
- options[:required_keys] = options[:required_headers]
503
- options[:required_headers] = nil
504
- end
505
- end
506
-
507
- if options[:required_keys] && options[:required_keys].is_a?(Array)
508
- missing_keys = []
509
- options[:required_keys].each do |k|
510
- missing_keys << k unless headerA.include?(k)
511
- end
512
- raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
513
- end
514
-
515
- @headers = headerA
516
- [headerA, header_size]
517
- end
518
-
519
- def process_duplicate_headers(headers, options)
520
- counts = Hash.new(0)
521
- result = []
522
- headers.each do |key|
523
- counts[key] += 1
524
- if counts[key] == 1
525
- result << key
526
- else
527
- result << [key, options[:duplicate_header_suffix], counts[key]].join
528
- end
529
- end
530
- result
531
- end
532
-
533
211
  private
534
212
 
535
- UTF_32_BOM = %w[0 0 fe ff].freeze
536
- UTF_32LE_BOM = %w[ff fe 0 0].freeze
537
- UTF_8_BOM = %w[ef bb bf].freeze
538
- UTF_16_BOM = %w[fe ff].freeze
539
- UTF_16LE_BOM = %w[ff fe].freeze
540
-
541
- def remove_bom(str)
542
- str_as_hex = str.bytes.map{|x| x.to_s(16)}
543
- # if string does not start with one of the bytes, there is no BOM
544
- return str unless %w[ef fe ff 0].include?(str_as_hex[0])
545
-
546
- return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
547
- return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
548
- return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
549
-
550
- # :nocov:
551
- puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
552
- str
553
- # :nocov:
213
# Returns +line+ as valid UTF-8.
#
# The string is first re-tagged as UTF-8 (+force_encoding+ changes the
# receiver's encoding in place, without touching its bytes), then encoded
# to UTF-8 with invalid and undefined sequences replaced by
# options[:invalid_byte_sequence].
#
# The caller is expected to decide whether enforcement is needed at all;
# this method applies it unconditionally.
#
# @param line [String] the raw line read from the input
# @param options [Hash] processing options; reads :invalid_byte_sequence
# @return [String] the UTF-8 encoded line
def enforce_utf8_encoding(line, options)
  line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
end
555
218
  end
556
219
  end
@@ -0,0 +1,30 @@
1
+ # frozen_string_literal: true
2
+
3
module SmarterCSV
  class << self
    # Read-only access to the module-level state of the most recent run.
    attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings

    # Resets all module-level state to its starting values.
    # SmarterCSV.process calls this first, before options are processed.
    def initialize_variables
      @has_rails = !!defined?(Rails)
      @csv_line_count = 0
      @chunk_count = 0
      @errors = {}
      @file_line_count = 0
      @headerA = []
      @headers = nil
      @raw_header = nil # header as it appears in the file
      @result = []
      @warnings = {}
      @enforce_utf8 = false # only set to true if needed (after options parsing)
    end

    # :nocov:
    # rubocop:disable Naming/MethodName
    # Deprecated reader kept for backwards compatibility; prints a
    # deprecation warning via Kernel#warn and returns @headerA.
    # Use +headers+ instead.
    def headerA
      warn "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
      @headerA
    end
    # rubocop:enable Naming/MethodName
    # :nocov:
  end
end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.9.2"
4
+ VERSION = "1.10.0"
5
5
  end