smarter_csv 1.8.5 → 1.9.2.pre01

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,556 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ class SmarterCSVException < StandardError; end # common ancestor of all error classes declared below
5
+ class HeaderSizeMismatch < SmarterCSVException; end # raised by process_headers when :user_provided_headers and the file header differ in size
6
+ class IncorrectOption < SmarterCSVException; end # raised by process_headers when :headers_in_file is false and :user_provided_headers is missing
7
+ class ValidationError < SmarterCSVException; end # NOTE(review): not raised in the code visible here
8
+ class DuplicateHeaders < SmarterCSVException; end # raised by process_headers when a header name occurs more than once
9
+ class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders; raised when :required_keys are absent from the headers
10
+ class NoColSepDetected < SmarterCSVException; end # raised by guess_column_separator when none of its candidate delimiters is found
11
+ class KeyMappingError < SmarterCSVException; end # raised by process_headers when :key_mapping names headers that are not present
12
+ # Reads CSV data from `input` line by line and builds one Hash per data line, keyed by the processed headers.
13
+ # first parameter: filename or input object which responds to readline method
14
+ def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
15
+ options = process_options(given_options)
16
+ # headerA receives the header keys; result collects the hashes (or chunks of hashes) that are returned when no block is given
17
+ headerA = []
18
+ result = []
19
+ @file_line_count = 0
20
+ @csv_line_count = 0
21
+ has_rails = !!defined?(Rails)
22
+ begin
23
+ fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
24
+ # an input without a readline method is treated as a path and opened with options[:file_encoding]
25
+ # auto-detect the row separator
26
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
27
+ # attempt to auto-detect column separator
28
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
29
+ # warn when :force_utf8 is set or :file_encoding names utf-8 while the handle reports a non-UTF-8 external_encoding / encoding
30
+ if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
31
+ puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
32
+ end
33
+ # consume the first options[:skip_lines] lines before the header is read
34
+ skip_lines(fh, options)
35
+ # header_size is handed to parse below so that no more fields than headers are taken from a data line
36
+ headerA, header_size = process_headers(fh, options)
37
+
38
+ # in case we use chunking.. we'll need to set it up..
39
+ if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
40
+ use_chunks = true
41
+ chunk_size = options[:chunk_size].to_i
42
+ chunk_count = 0
43
+ chunk = []
44
+ else
45
+ use_chunks = false
46
+ end
47
+
48
+ # now on to processing all the rest of the lines in the CSV file:
49
+ until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
50
+ line = readline_with_counts(fh, options)
51
+
52
+ # replace invalid byte sequence in UTF-8 with question mark to avoid errors
53
+ line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
54
+
55
+ print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]
56
+
57
+ next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
58
+
59
+ # cater for the quoted csv data containing the row separator carriage return character
60
+ # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
61
+ # by detecting the existence of an uneven number of quote characters
62
+
63
+ multiline = count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
64
+ while count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
65
+ next_line = fh.readline(options[:row_sep])
66
+ next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
67
+ line += next_line
68
+ @file_line_count += 1
69
+ end
70
+ print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline
71
+ # drop the trailing row separator before the line is split into fields
72
+ line.chomp!(options[:row_sep])
73
+
74
+ dataA, _data_size = parse(line, options, header_size)
75
+
76
+ dataA.map!{|x| x.strip} if options[:strip_whitespace]
77
+
78
+ # if all values are blank, then ignore this line
79
+ next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
80
+ # pair every header key with the value in the same column; Hash.zip is not core Ruby and is not defined in the code shown here
81
+ hash = Hash.zip(headerA, dataA) # from Facets of Ruby library
82
+
83
+ # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
84
+ hash.delete(nil)
85
+ hash.delete('')
86
+ hash.delete(:"")
87
+
88
+ if options[:remove_empty_values] == true
89
+ hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
90
+ end
91
+
92
+ hash.delete_if{|_k, v| !v.nil? && v =~ /^(0+|0+\.0+)$/} if options[:remove_zero_values] # values are Strings
93
+ hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]
94
+
95
+ if options[:convert_values_to_numeric]
96
+ hash.each do |k, v|
97
+ # deal with the :only / :except options to :convert_values_to_numeric
98
+ next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
99
+
100
+ # convert if it's a numeric value:
101
+ case v
102
+ when /^[+-]?\d+\.\d+$/
103
+ hash[k] = v.to_f
104
+ when /^[+-]?\d+$/
105
+ hash[k] = v.to_i
106
+ end
107
+ end
108
+ end
109
+
110
+ if options[:value_converters]
111
+ hash.each do |k, v|
112
+ converter = options[:value_converters][k]
113
+ next unless converter
114
+
115
+ hash[k] = converter.convert(v)
116
+ end
117
+ end
118
+ # the filters above may have removed every key; such a row is dropped when :remove_empty_hashes is set
119
+ next if options[:remove_empty_hashes] && hash.empty?
120
+
121
+ hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
122
+
123
+ if use_chunks
124
+ chunk << hash # append temp result to chunk
125
+
126
+ if chunk.size >= chunk_size || fh.eof? # if chunk if full, or EOF reached
127
+ # do something with the chunk
128
+ if block_given?
129
+ yield chunk # do something with the hashes in the chunk in the block
130
+ else
131
+ result << chunk # not sure yet, why anybody would want to do this without a block
132
+ end
133
+ chunk_count += 1
134
+ chunk = [] # initialize for next chunk of data
135
+ else
136
+
137
+ # the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)
138
+
139
+ end
140
+
141
+ # while a chunk is being filled up we don't need to do anything else here
142
+
143
+ else # no chunk handling
144
+ if block_given?
145
+ yield [hash] # do something with the hash in the block (better to use chunking here)
146
+ else
147
+ result << hash
148
+ end
149
+ end
150
+ end
151
+
152
+ # print new line to retain last processing line message
153
+ print "\n" if options[:verbose]
154
+
155
+ # last chunk:
156
+ if !chunk.nil? && chunk.size > 0
157
+ # do something with the chunk
158
+ if block_given?
159
+ yield chunk # do something with the hashes in the chunk in the block
160
+ else
161
+ result << chunk # not sure yet, why anybody would want to do this without a block
162
+ end
163
+ chunk_count += 1
164
+ # chunk = [] # initialize for next chunk of data
165
+ end
166
+ ensure
167
+ fh.close if fh.respond_to?(:close)
168
+ end
169
+ if block_given?
170
+ chunk_count # when we do processing through a block we only care how many chunks we processed; NOTE(review): chunk_count is only assigned when :chunk_size is set, so this is nil otherwise
171
+ else
172
+ result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
173
+ end
174
+ end
175
+
176
+ class << self
177
# Reports whether `parse_csv_line_c` is callable from here.
# A positive answer is cached in @has_acceleration; a negative one is checked again on the next call.
def has_acceleration?
  return @has_acceleration if @has_acceleration

  @has_acceleration = !!defined?(parse_csv_line_c)
end
180
+
181
# Header line as stored in @raw_header by process_headers; nil when no header line was read.
def raw_header
  instance_variable_get(:@raw_header)
end
184
+
185
# Header keys as stored in @headers by process_headers; nil until process_headers has finished.
def headers
  instance_variable_get(:@headers)
end
188
+
189
+ # * the `scan` method iterates through the string and finds all occurrences of the pattern
190
+ # * The regular expression:
191
+ # - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
192
+ # - (?:\\\\)* : Non-capturing group for an even number of backslashes (escaped backslashes).
193
+ # This allows for any number of escaped backslashes before the quote character.
194
+ # - #{Regexp.escape(quote_char)} : Dynamically inserts the quote_char into the regex,
195
+ # ensuring it's properly escaped for use in the regex.
196
+ #
197
# Counts the quote characters in `line` that are not escaped by a backslash.
# A quote preceded by an even number of backslashes is counted, because those backslashes escape each other.
# Returns 0 when `line` is nil or `quote_char` is nil or empty, so callers may pass an unset quote character.
def count_quote_chars(line, quote_char)
  # Regexp.escape(nil) raises, and an empty pattern would match at every position
  return 0 if line.nil? || quote_char.nil? || quote_char.empty?

  line.scan(/(?<!\\)(?:\\\\)*#{Regexp.escape(quote_char)}/).count
end
200
+
201
+ protected
202
+
203
# Reads the next line (terminated by options[:row_sep]) from `filehandle` and increments both line counters.
# The first line counted (@csv_line_count == 1) is passed through remove_bom before it is returned.
def readline_with_counts(filehandle, options)
  raw = filehandle.readline(options[:row_sep])
  @file_line_count += 1
  @csv_line_count += 1
  @csv_line_count == 1 ? remove_bom(raw) : raw
end
210
+
211
# Reads and discards options[:skip_lines] lines via readline_with_counts, so the counters keep advancing.
# Does nothing (and returns nil) when the option is missing or not positive.
def skip_lines(filehandle, options)
  lines_to_skip = options[:skip_lines].to_i
  if lines_to_skip > 0
    lines_to_skip.times { readline_with_counts(filehandle, options) }
  end
end
218
+
219
# Resets both line counters to zero and rewinds `filehandle`; returns the result of `filehandle.rewind`.
def rewind(filehandle)
  @file_line_count = @csv_line_count = 0
  filehandle.rewind
end
224
+
225
+ ###
226
+ ### Thin wrapper around C-extension
227
+ ###
228
# Splits one CSV line into its fields.
# Uses the C function parse_csv_line_c when options[:acceleration] is set and has_acceleration? is true,
# otherwise delegates to parse_csv_line_ruby.
# Returns a two-element Array: [fields, number_of_fields].
def parse(line, options, header_size = nil)
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

  if options[:acceleration] && has_acceleration?
    # :nocov:
    quote = options[:quote_char]
    # Literal substring test: interpolating the quote character into a Regexp would treat
    # characters such as "|", "(" or "." as regex syntax. A nil or empty quote needs no clean-up.
    has_quotes = line && quote && !quote.empty? && line.include?(quote)
    elements = parse_csv_line_c(line, options[:col_sep], quote, header_size)
    elements.map! { |x| cleanup_quotes(x, quote) } if has_quotes
    [elements, elements.size]
    # :nocov:
  else
    # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
    parse_csv_line_ruby(line, options, header_size)
  end
end
243
+
244
+ # ------------------------------------------------------------------
245
+ # Ruby equivalent of the C-extension for parse_line
246
+ #
247
+ # parses a single line: either a CSV header or a body line
248
+ # - quoting rules compared to RFC-4180 are somewhat relaxed
249
+ # - we are not assuming that quotes inside a field need to be doubled
250
+ # - we are not assuming that all fields need to be quoted (0 is even)
251
+ # - works with multi-char col_sep
252
+ # - if header_size is given, only up to header_size fields are parsed
253
+ #
254
+ # We use header_size for parsing the body lines to make sure we always match the number of headers
255
+ # in case there are trailing col_sep characters in line
256
+ #
257
+ # Our convention is that empty fields are returned as empty strings, not as nil.
258
+ #
259
+ #
260
+ # the purpose of the header_size parameter is to handle a corner case where
261
+ # CSV lines contain more fields than the header.
262
+ # In which case the remaining fields in the line are ignored
263
+ #
264
# Splits `line` at options[:col_sep], ignoring separators that sit inside quoted sections.
# A quote character toggles quoting only when it is preceded by an even number of backslashes,
# which is the same rule count_quote_chars applies.
# With `header_size` given, at most that many fields are returned and the rest of the line is ignored.
# Returns [fields, number_of_fields]; a nil line yields [].
def parse_csv_line_ruby(line, options, header_size = nil)
  return [] if line.nil?

  line_size = line.size
  col_sep = options[:col_sep]
  col_sep_size = col_sep.size
  quote = options[:quote_char]
  quote_count = 0
  backslash_count = 0 # length of the backslash run directly before the current character
  elements = []
  start = 0
  i = 0

  while i < line_size
    if quote_count.even? && line[i, col_sep_size] == col_sep
      break if !header_size.nil? && elements.size >= header_size

      elements << cleanup_quotes(line[start...i], quote)
      i += col_sep_size
      start = i
      backslash_count = 0 # a new field starts with no pending backslashes
    else
      char = line[i]
      # an even run of backslashes escapes itself and leaves the quote unescaped
      quote_count += 1 if char == quote && backslash_count.even?
      backslash_count = char == '\\' ? backslash_count + 1 : 0
      i += 1
    end
  end
  # the remainder after the last separator is the final field, unless the header limit is already reached
  elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
  [elements, elements.size]
end
294
+
295
# Removes one pair of enclosing quote characters from `field` (only when it both starts and ends with one)
# and collapses every doubled quote character into a single one.
# Modifies `field` in place and returns it; nil is passed through.
def cleanup_quotes(field, quote)
  unless field.nil?
    wrapped = field.start_with?(quote) && field.end_with?(quote)
    if wrapped
      field.delete_prefix!(quote)
      field.delete_suffix!(quote)
    end
    field.gsub!(quote * 2, quote)
  end
  field
end
307
+
308
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
309
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
310
BLANK_RE = /\A\s*\z/.freeze

# Tells whether `value` carries no content:
# nil, an empty or whitespace-only String, or an Array / Hash whose elements / values are all blank per elem_blank?.
# Any other object is never blank.
def blank?(value)
  case value
  when nil then true
  when String then value.empty? || BLANK_RE.match?(value)
  when Array then value.all? { |element| elem_blank?(element) }
  when Hash then value.values.all? { |element| elem_blank?(element) }
  else false
  end
end
330
+
331
# Blank check for a single element: true for nil and for empty or whitespace-only Strings, false for everything else.
def elem_blank?(value)
  return true if value.nil?
  return false unless value.is_a?(String)

  value.empty? || BLANK_RE.match?(value)
end
343
+
344
+ # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
345
# Decides whether processing of `key` must be skipped for the option `option_name`.
# Only applies when the option value is a Hash: with :except, listed keys are skipped;
# otherwise with :only, every key that is not listed is skipped. :except wins when both are present.
# Returns true to skip, false to proceed.
def only_or_except_limit_execution(options, option_name, key)
  limits = options[option_name]
  return false unless limits.is_a?(Hash)

  if limits.key?(:except)
    Array(limits[:except]).include?(key)
  elsif limits.key?(:only)
    !Array(limits[:only]).include?(key)
  else
    false
  end
end
355
+
356
+ # If file has headers, then guesses column separator from headers.
357
+ # Otherwise guesses column separator from contents.
358
+ # Raises exception if none is found.
359
# Picks the most frequent candidate delimiter (comma, tab, semicolon, colon, pipe).
# Samples one line when options[:headers_in_file] is set, otherwise up to five lines; shorter inputs are fine.
# The filehandle is rewound afterwards.
# Falls back to ',' when no candidate occurs and the last sampled line is a single word;
# otherwise raises SmarterCSV::NoColSepDetected.
def guess_column_separator(filehandle, options)
  skip_lines(filehandle, options)

  tally = Hash.new(0)
  last_line = nil
  sample_size = options[:headers_in_file] ? 1 : 5

  sample_size.times do
    begin
      last_line = readline_with_counts(filehandle, options)
    rescue EOFError # short files
      break
    end
    [',', "\t", ';', ':', '|'].each do |separator|
      tally[separator] += last_line.scan(separator).count
    end
  end
  rewind(filehandle)

  best = tally.values.max
  return tally.key(best) unless best == 0

  # no candidate found: a header consisting of a single word still counts as one comma-separated column
  return ',' if last_line.chomp(options[:row_sep]) =~ /^\w+$/

  raise SmarterCSV::NoColSepDetected
end
387
+
388
+ # limitation: this currently reads the whole file in before making a decision
389
# Counts "\n", "\r" and "\r\n" occurrences outside of quoted sections and returns the most frequent one.
# Stops after options[:auto_row_sep_chars] unquoted characters when that option is positive.
# The filehandle is rewound afterwards.
def guess_line_ending(filehandle, options)
  tallies = {"\n" => 0, "\r" => 0, "\r\n" => 0}
  inside_quotes = false
  previous = nil
  seen = 0
  limit = options[:auto_row_sep_chars]

  filehandle.each_char do |char|
    inside_quotes = !inside_quotes if char == options[:quote_char]
    next if inside_quotes

    if previous == "\r"
      # a "\r" is only classified once the following character is known
      tallies[char == "\n" ? "\r\n" : "\r"] += 1
    elsif char == "\n"
      tallies["\n"] += 1
    end
    previous = char
    seen += 1
    break if limit && limit > 0 && seen >= limit
  end
  rewind(filehandle)

  # a trailing "\r" has no successor and is counted here
  tallies["\r"] += 1 if previous == "\r"
  tallies.max_by { |_ending, hits| hits }.first
end
421
+
422
# Determines the keys used for every data row and stores them in @headers.
#
# With options[:headers_in_file] the first line is read (kept in @raw_header) and normalized:
# comment prefix, row separator, :strip_chars_from_headers and quote characters are removed,
# whitespace is stripped (:strip_whitespace) and, unless :keep_original_headers is set,
# runs of whitespace / dashes become "_" and the names are downcased (:downcase_header).
# A non-empty Array in :user_provided_headers replaces the file headers.
# :duplicate_header_suffix, symbol conversion, :key_mapping and the :required_keys check are applied afterwards.
#
# Returns [keys, header_size]; header_size is taken before key mapping.
# Raises SmarterCSV::IncorrectOption, HeaderSizeMismatch, KeyMappingError, DuplicateHeaders or MissingKeys.
# Side effect: a deprecated :required_headers option is moved into options[:required_keys].
def process_headers(filehandle, options)
  @raw_header = nil
  @headers = nil
  if options[:headers_in_file] # extract the header line
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header = readline_with_counts(filehandle, options)
    @raw_header = header

    needs_reencoding = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if needs_reencoding
    header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
    header = header.chomp(options[:row_sep])

    header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

    file_headerA, file_header_size = parse(header, options)

    # Remove the quote character literally; interpolating it into a Regexp would give
    # characters such as "|" or "(" a regex meaning.
    quote_char = options[:quote_char]
    file_headerA.map! { |x| x.gsub(quote_char, '') } unless quote_char.nil? || quote_char.empty?
    file_headerA.map!(&:strip) if options[:strip_whitespace]

    unless options[:keep_original_headers]
      file_headerA.map! { |x| x.gsub(/\s+|-+/, '_') }
      file_headerA.map!(&:downcase) if options[:downcase_header]
    end
  else
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
  end

  user_headers = options[:user_provided_headers]
  if user_headers.instance_of?(Array) && !user_headers.empty?
    # work on a copy: the map! calls below must not modify (or fail on a frozen) caller-owned array
    headerA = user_headers.dup
    if !file_header_size.nil? && headerA.size != file_header_size
      raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
    end
  else
    headerA = file_headerA
  end

  # detect duplicate headers and disambiguate
  headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
  header_size = headerA.size # used for splitting lines

  headerA.map!(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]

  unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
    key_mappingH = options[:key_mapping]

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    if key_mappingH.instance_of?(Hash) && !key_mappingH.empty?
      # unless silenced, every key named in the mapping must exist in the header
      missing_keys = key_mappingH.keys - headerA
      # the user may pass a list of specific mapped keys that are optional
      missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)

      unless missing_keys.empty? || options[:silence_missing_keys] == true
        raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
      end

      headerA.map! do |x|
        if key_mappingH.key?(x)
          key_mappingH[x]
        elsif options[:remove_unmapped_keys]
          nil
        else
          x
        end
      end
    end
  end

  # header_validations: every occurrence of a repeated name is listed, so "a,a" yields "a,a"
  duplicate_headers = headerA.compact.select { |k| headerA.count(k) > 1 }

  unless options[:user_provided_headers] || duplicate_headers.empty?
    raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
  end

  # deprecate required_headers
  unless options[:required_headers].nil?
    puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
    if options[:required_keys].nil?
      options[:required_keys] = options[:required_headers]
      options[:required_headers] = nil
    end
  end

  if options[:required_keys] && options[:required_keys].is_a?(Array)
    missing_keys = options[:required_keys].reject { |k| headerA.include?(k) }
    raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
  end

  @headers = headerA
  [headerA, header_size]
end
518
+
519
# Makes repeated header names unique: the first occurrence is kept as is, the n-th occurrence (n >= 2)
# becomes "<name><options[:duplicate_header_suffix]><n>". Returns a new Array; `headers` is not modified.
def process_duplicate_headers(headers, options)
  seen = Hash.new(0)
  headers.map do |key|
    occurrence = (seen[key] += 1)
    occurrence == 1 ? key : [key, options[:duplicate_header_suffix], occurrence].join
  end
end
532
+
533
+ private
534
+
535
# Byte order marks, written as the lowercase hex strings that Integer#to_s(16) produces for each byte.
UTF_32_BOM = %w[0 0 fe ff].freeze
UTF_32LE_BOM = %w[ff fe 0 0].freeze
UTF_8_BOM = %w[ef bb bf].freeze
UTF_16_BOM = %w[fe ff].freeze
UTF_16LE_BOM = %w[ff fe].freeze

# Returns `str` without a leading UTF-32, UTF-8 or UTF-16 byte order mark.
# `str` itself is returned when it does not start with a possible BOM byte.
# Prints a notice and returns `str` unchanged when the first byte looks like a BOM but no known mark matches.
def remove_bom(str)
  # no BOM is longer than four bytes, so only those are converted instead of the whole line
  leading_hex = str.byteslice(0, 4).bytes.map { |x| x.to_s(16) }
  # if string does not start with one of the bytes, there is no BOM
  return str unless %w[ef fe ff 0].include?(leading_hex[0])

  # the 4-byte marks are tested first because UTF-32LE begins with the UTF-16LE mark
  return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(leading_hex[0..3])
  return str.byteslice(3..-1) if leading_hex[0..2] == UTF_8_BOM
  return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(leading_hex[0..1])

  # :nocov:
  puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
  str
  # :nocov:
end
555
+ end
556
+ end
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.8.5"
4
+ VERSION = "1.9.2.pre01" # this is a pretty odd situation
5
5
  end