smarter_csv 1.9.0 → 1.9.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,556 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
# Root of the SmarterCSV error hierarchy; rescue this to catch any error
# class declared below.
class SmarterCSVException < StandardError
end

# Raised when the number of user-provided headers differs from the number of
# headers found in the file.
class HeaderSizeMismatch < SmarterCSVException
end

# Raised when the given options are inconsistent.
class IncorrectOption < SmarterCSVException
end

class ValidationError < SmarterCSVException
end

# Raised when the resulting header contains the same key more than once.
class DuplicateHeaders < SmarterCSVException
end

# Raised when required keys are absent (previously known as MissingHeaders).
class MissingKeys < SmarterCSVException
end

# Raised when the column separator could not be auto-detected.
class NoColSepDetected < SmarterCSVException
end

# Raised when :key_mapping refers to headers that are not present.
class KeyMappingError < SmarterCSVException
end
12
+
13
# Reads CSV data and converts every data line into a Hash keyed by the header.
#
# input         - a filename, or an object responding to `readline`
#                 (used as-is; it is closed in the `ensure` clause as well)
# given_options - user options; normalized through `process_options`
# block         - optional; receives Arrays of hashes (a full chunk when
#                 :chunk_size is set, otherwise a one-element Array per line)
#
# Returns, without a block, an Array of Hashes (or an Array of chunks when
# :chunk_size is set). With a block it returns the number of chunks yielded;
# that counter is only initialized in chunk mode, so it is nil otherwise.
#
# Quoted fields that span several physical lines are re-assembled by reading
# further lines until the number of quote characters is even. The loop stops
# at end of input, so an unbalanced quote in the last record no longer raises
# EOFError.
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
  options = process_options(given_options)

  headerA = []
  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !!defined?(Rails)
  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    skip_lines(fh, options)

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # loop-invariant: whether every line read has to be re-encoded as UTF-8
    enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

    # now on to processing all the rest of the lines in the CSV file:
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if enforce_utf8

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters

      quotes_unbalanced = count_quote_chars(line, options[:quote_char]).odd?
      multiline = quotes_unbalanced
      # stop at EOF: a dangling quote in the last record must not raise EOFError
      while quotes_unbalanced && !fh.eof?
        next_line = fh.readline(options[:row_sep])
        next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if enforce_utf8
        line += next_line
        @file_line_count += 1
        quotes_unbalanced = count_quote_chars(line, options[:quote_char]).odd?
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

      # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
      end

      hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
      hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

          # convert if it's a numeric value:
          case v
          when /^[+-]?\d+\.\d+$/
            hash[k] = v.to_f
          when /^[+-]?\d+$/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      if use_chunks
        chunk << hash # append temp result to chunk

        if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
          # do something with the chunk
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            result << chunk # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = [] # initialize for next chunk of data
        else

          # the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)

        end

        # while a chunk is being filled up we don't need to do anything else here

      else # no chunk handling
        if block_given?
          yield [hash] # do something with the hash in the block (better to use chunking here)
        else
          result << hash
        end
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk:
    if !chunk.nil? && chunk.size > 0
      # do something with the chunk
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        result << chunk # not sure yet, why anybody would want to do this without a block
      end
      chunk_count += 1
      # chunk = [] # initialize for next chunk of data
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end
  if block_given?
    chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end
175
+
176
+ class << self
177
# Reports whether `parse_csv_line_c` is defined (the accelerated line parser).
# A truthy answer is cached in @has_acceleration; a false answer is
# re-evaluated on the next call.
def has_acceleration?
  @has_acceleration ||= (defined?(parse_csv_line_c) ? true : false)
end
180
+
181
# Returns the header line as stored by `process_headers`, i.e. the line as
# read from the input before any re-encoding, comment stripping or parsing.
# nil when no header line has been read from the input.
def raw_header
  @raw_header
end
184
+
185
# Returns the final list of header keys stored by `process_headers`
# (after user-provided headers, key mapping, etc. were applied).
# nil until `process_headers` has completed.
def headers
  @headers
end
188
+
189
# Counts the un-escaped occurrences of quote_char in line.
#
# * `scan` walks the string and collects every match of the pattern
# * the regular expression:
#   - (?<!\\)  : negative lookbehind, the match must not start right after a backslash
#   - (?:\\\\)* : non-capturing group consuming any number of backslash pairs
#                 (escaped backslashes) directly in front of the quote character
#   - #{Regexp.escape(quote_char)} : the quote character itself, escaped so it is
#                 always matched literally
#   A quote preceded by an odd number of backslashes is therefore not counted.
#
# Returns 0 when quote_char is nil or empty (quoting disabled), so callers can
# test `.odd?` without special-casing a missing quote character.
def count_quote_chars(line, quote_char)
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
end
200
+
201
+ protected
202
+
203
# Reads the next record (terminated by options[:row_sep]) from filehandle and
# bumps both line counters. The first line read after the counters were reset
# additionally has a leading byte order mark removed.
def readline_with_counts(filehandle, options)
  raw = filehandle.readline(options[:row_sep])
  @file_line_count += 1
  @csv_line_count += 1
  @csv_line_count == 1 ? remove_bom(raw) : raw
end
210
+
211
# Consumes options[:skip_lines] lines from filehandle (counting them), so that
# processing starts after them. Does nothing when the option is unset or not
# positive.
def skip_lines(filehandle, options)
  lines_to_skip = options[:skip_lines].to_i
  return unless lines_to_skip > 0

  lines_to_skip.times { readline_with_counts(filehandle, options) }
end
218
+
219
# Resets both line counters and moves filehandle back to its beginning.
# Returns whatever filehandle.rewind returns.
def rewind(filehandle)
  @file_line_count, @csv_line_count = 0, 0
  filehandle.rewind
end
224
+
225
###
### Thin wrapper around C-extension
###
# Splits one line into its fields. Uses `parse_csv_line_c` when
# options[:acceleration] is set and the extension is available, otherwise
# `parse_csv_line_ruby`.
#
# line        - the line to split, row separator already removed
# options     - processed options; reads :acceleration, :col_sep, :quote_char
# header_size - optional upper limit for the number of fields
#
# Returns [fields, number_of_fields].
def parse(line, options, header_size = nil)
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

  if options[:acceleration] && has_acceleration?
    # :nocov:
    quote = options[:quote_char]
    # literal substring test: interpolating the quote char into a Regexp would
    # misread regex metacharacters, and a nil quote char would match every line
    has_quotes = !quote.nil? && line.include?(quote)
    elements = parse_csv_line_c(line, options[:col_sep], quote, header_size)
    elements.map!{|x| cleanup_quotes(x, quote)} if has_quotes
    [elements, elements.size]
    # :nocov:
  else
    # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
    parse_csv_line_ruby(line, options, header_size)
  end
end
243
+
244
# ------------------------------------------------------------------
# Ruby equivalent of the C-extension for parse_line
#
# Parses a single line, either the CSV header or a body line:
# - quoting rules are somewhat relaxed compared to RFC-4180
# - quotes inside a field are not required to be doubled
# - fields are not required to be quoted (0 quotes is an even count)
# - works with multi-character col_sep
# - a quote directly preceded by a backslash is not counted
#
# A column separator only splits the line while the number of quote characters
# seen so far is even, i.e. outside of a quoted section.
#
# header_size, when given, caps the number of fields: body lines are parsed
# with it so that trailing col_sep characters or surplus fields never produce
# more values than there are headers; the remainder of the line is ignored.
#
# Empty fields are returned as empty strings, not as nil.
#
# Returns [fields, number_of_fields]; returns [] when line is nil.
def parse_csv_line_ruby(line, options, header_size = nil)
  return [] if line.nil?

  separator = options[:col_sep]
  separator_width = separator.size
  quote = options[:quote_char]
  limited = !header_size.nil?
  total = line.size

  fields = []
  field_start = 0
  pos = 0
  quotes_seen = 0
  prev = ''

  until pos >= total
    at_separator = quotes_seen.even? && line[pos, separator_width] == separator

    if at_separator
      # enough fields collected; ignore the rest of the line
      break if limited && fields.size >= header_size

      fields << cleanup_quotes(line[field_start...pos], quote)
      prev = line[pos]
      pos += separator_width
      field_start = pos
    else
      current = line[pos]
      quotes_seen += 1 if current == quote && prev != '\\'
      prev = current
      pos += 1
    end
  end

  # whatever follows the last separator is the final field
  fields << cleanup_quotes(line[field_start..-1], quote) unless limited && fields.size >= header_size
  [fields, fields.size]
end
294
+
295
# Removes CSV quoting from a single field:
# - one enclosing pair of quote characters is stripped
# - doubled quote characters are collapsed into a single one
#
# field - the raw field; nil is passed through
# quote - the quote character; nil or empty means "no quoting", the field is
#         returned unchanged
#
# Returns a new String and never modifies `field`, so frozen input is
# accepted.
def cleanup_quotes(field, quote)
  return field if field.nil? || quote.nil? || quote.empty?

  # return if field !~ /#{quote}/ # this check can probably eliminated

  cleaned = field
  if cleaned.start_with?(quote) && cleaned.end_with?(quote)
    cleaned = cleaned.delete_prefix(quote).delete_suffix(quote)
  end
  cleaned.gsub("#{quote}#{quote}", quote)
end
307
+
308
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
# and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
BLANK_RE = /\A\s*\z/.freeze

# Blank-check that works without Rails.
# - nil, empty and whitespace-only Strings are blank
# - an Array is blank when it is empty or all of its elements are blank
# - a Hash is blank when it is empty or all of its values are blank
# - anything else is not blank
# Elements / values are judged by `elem_blank?`, i.e. not recursively.
def blank?(value)
  case value
  when Array then value.all? { |item| elem_blank?(item) }
  when Hash then value.each_value.all? { |item| elem_blank?(item) }
  else elem_blank?(value)
  end
end

# Blank-check for a single scalar: true for nil and for empty or
# whitespace-only Strings, false for everything else.
def elem_blank?(value)
  return true if value.nil?
  return false unless value.is_a?(String)

  value.empty? || BLANK_RE.match?(value)
end
343
+
344
# Road-block used while iterating over the key/value pairs of a CSV hash:
# tells whether processing of `key` has to be skipped for the given option.
#
# options[option_name] may be a Hash holding either
#   :except - key(s) that must be skipped, or
#   :only   - the sole key(s) that may be processed.
# When both are present only :except is evaluated. Any non-Hash option value
# never limits execution.
#
# Returns true when `key` must be skipped, false otherwise.
def only_or_except_limit_execution(options, option_name, key)
  limits = options[option_name]
  return false unless limits.is_a?(Hash)

  if limits.has_key?(:except)
    Array(limits[:except]).include?(key)
  elsif limits.has_key?(:only)
    !Array(limits[:only]).include?(key)
  else
    false
  end
end
355
+
356
# Guesses the column separator by counting candidate delimiters.
# With headers in the file only the header line is sampled, otherwise up to
# five lines of content. The input is rewound afterwards.
#
# Returns the candidate with the highest count. If no candidate occurs at all
# and the last sampled line is a single word, ',' is returned (one-column
# file); otherwise SmarterCSV::NoColSepDetected is raised.
def guess_column_separator(filehandle, options)
  skip_lines(filehandle, options)

  delimiters = [',', "\t", ';', ':', '|']
  tallies = Hash.new(0)
  sample = nil
  lines_to_sample = options[:headers_in_file] ? 1 : 5

  begin
    lines_to_sample.times do
      sample = readline_with_counts(filehandle, options)
      delimiters.each { |delimiter| tallies[delimiter] += sample.scan(delimiter).count }
    end
  rescue EOFError
    # short files: work with the lines we got
  end
  rewind(filehandle)

  best_count = tallies.values.max
  if best_count == 0
    # a line consisting of word characters only: single-column file
    return ',' if sample.chomp(options[:row_sep]) =~ /^\w+$/

    raise SmarterCSV::NoColSepDetected
  end

  tallies.key(best_count)
end
387
+
388
# Guesses the row separator by counting "\n", "\r" and "\r\n" occurrences.
# Characters are ignored while inside a quoted section (tracked by toggling on
# every options[:quote_char]).
#
# limitation: unless options[:auto_row_sep_chars] is a positive number, the
# whole input is read before a decision is made; with the option set, scanning
# stops after that many examined characters.
#
# The input is rewound afterwards. Returns the most frequent line ending.
def guess_line_ending(filehandle, options)
  tally = {"\n" => 0, "\r" => 0, "\r\n" => 0}
  in_quotes = false
  previous = nil
  examined = 0
  limit = options[:auto_row_sep_chars]

  filehandle.each_char do |char|
    in_quotes = !in_quotes if char == options[:quote_char]
    next if in_quotes

    if previous == "\r"
      # a "\r" can only be classified once the following character is known
      ending = char == "\n" ? "\r\n" : "\r"
      tally[ending] += 1
    elsif char == "\n"
      tally["\n"] += 1
    end
    previous = char
    examined += 1
    break if limit && limit > 0 && examined >= limit
  end
  rewind(filehandle)

  # a trailing "\r" has not been counted inside the loop
  tally["\r"] += 1 if previous == "\r"
  # find the most frequent key/value pair:
  tally.max_by { |_ending, count| count }.first
end
421
+
422
# Determines the hash keys used for every data line.
#
# With options[:headers_in_file] the next line of filehandle is read as the
# header (kept untouched in @raw_header), re-encoded if needed, stripped of the
# comment prefix, row separator and unwanted characters, parsed, and then
# normalized (quotes removed, whitespace stripped, spaces/dashes replaced by
# underscores, downcased) according to the options. Non-empty Array
# options[:user_provided_headers] replace the file headers. Afterwards
# duplicate disambiguation, symbolization, key mapping and the
# duplicate / required-key validations are applied.
#
# The user-provided header Array is copied before it is modified, so the
# caller's option value is never mutated.
#
# Raises SmarterCSV::IncorrectOption, HeaderSizeMismatch, KeyMappingError,
# DuplicateHeaders or MissingKeys when the respective validation fails.
#
# Returns [header_keys, header_size]; header_keys is also stored in @headers.
def process_headers(filehandle, options)
  @raw_header = nil
  @headers = nil
  if options[:headers_in_file] # extract the header line
    # process the header line in the CSV file..
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header = readline_with_counts(filehandle, options)
    @raw_header = header

    header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
    header = header.chomp(options[:row_sep])

    header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

    file_headerA, file_header_size = parse(header, options)

    # remove quote characters literally; building a Regexp from the quote char
    # would misinterpret regex metacharacters
    quote = options[:quote_char]
    file_headerA.map!{|x| x.gsub(quote, '')} unless quote.nil? || quote.empty?
    file_headerA.map!{|x| x.strip} if options[:strip_whitespace]

    unless options[:keep_original_headers]
      file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
      file_headerA.map!{|x| x.downcase} if options[:downcase_header]
    end
  else
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
  end
  if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
    # use user-provided headers (a copy: headerA is modified in place below)
    headerA = options[:user_provided_headers].dup
    unless file_header_size.nil?
      if headerA.size != file_header_size
        raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
      else
        # we could print out the mapping of file_headerA to headerA here
      end
    end
  else
    headerA = file_headerA
  end

  # detect duplicate headers and disambiguate
  headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
  header_size = headerA.size # used for splitting lines

  headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]

  unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
    key_mappingH = options[:key_mapping]

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
      # if silence_missing_keys are not set, raise error if missing header
      missing_keys = key_mappingH.keys - headerA
      # if the user passes a list of specific mapped keys that are optional
      missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)

      unless missing_keys.empty? || options[:silence_missing_keys] == true
        raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
      end

      # mapped keys get their new name (nil deletes the column later on);
      # unmapped keys are kept unless :remove_unmapped_keys is set
      headerA.map! do |x|
        if key_mappingH.has_key?(x)
          key_mappingH[x]
        else
          options[:remove_unmapped_keys] ? nil : x
        end
      end
    end
  end

  # header_validations
  duplicate_headers = []
  headerA.compact.each do |k|
    duplicate_headers << k if headerA.select{|x| x == k}.size > 1
  end

  unless options[:user_provided_headers] || duplicate_headers.empty?
    raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
  end

  # deprecate required_headers
  unless options[:required_headers].nil?
    puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
    if options[:required_keys].nil?
      options[:required_keys] = options[:required_headers]
      options[:required_headers] = nil
    end
  end

  if options[:required_keys] && options[:required_keys].is_a?(Array)
    missing_keys = []
    options[:required_keys].each do |k|
      missing_keys << k unless headerA.include?(k)
    end
    raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
  end

  @headers = headerA
  [headerA, header_size]
end
518
+
519
# Makes repeated header names unique. The first occurrence of a name is kept
# as is; every later occurrence gets options[:duplicate_header_suffix] and its
# occurrence number appended (e.g. "name", "name_2", "name_3" for suffix "_").
#
# Returns a new Array; `headers` is not modified.
def process_duplicate_headers(headers, options)
  seen = Hash.new(0)
  suffix = options[:duplicate_header_suffix]

  headers.map do |header|
    occurrence = (seen[header] += 1)
    occurrence == 1 ? header : [header, suffix, occurrence].join
  end
end
532
+
533
+ private
534
+
535
# Byte order marks, as lists of lower-case hex byte values without padding
# (the format produced by Integer#to_s(16)).
UTF_32_BOM = %w[0 0 fe ff].freeze
UTF_32LE_BOM = %w[ff fe 0 0].freeze
UTF_8_BOM = %w[ef bb bf].freeze
UTF_16_BOM = %w[fe ff].freeze
UTF_16LE_BOM = %w[ff fe].freeze

# Strips a leading byte order mark from str.
# The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 marks because
# the UTF-32LE mark starts with the UTF-16LE mark.
#
# Returns str itself when no BOM is present, otherwise the remainder after the
# BOM. When the first byte could start a BOM but no known BOM matches, a
# notice is printed and str is returned unchanged.
def remove_bom(str)
  leading = str.each_byte.first(4).map { |byte| byte.to_s(16) }
  # if string does not start with one of the bytes, there is no BOM
  return str unless %w[ef fe ff 0].include?(leading[0])

  return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(leading)
  return str.byteslice(3..-1) if leading.first(3) == UTF_8_BOM
  return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(leading.first(2))

  # :nocov:
  puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
  str
  # :nocov:
end
555
+ end
556
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.9.0"
4
+ VERSION = "1.9.2"
5
5
  end