smarter_csv 1.6.0 → 1.7.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -0
  3. data/CHANGELOG.md +28 -0
  4. data/CONTRIBUTORS.md +3 -0
  5. data/Gemfile +7 -4
  6. data/README.md +30 -26
  7. data/Rakefile +15 -13
  8. data/ext/smarter_csv/extconf.rb +14 -0
  9. data/ext/smarter_csv/smarter_csv.c +86 -0
  10. data/lib/extensions/hash.rb +4 -2
  11. data/lib/smarter_csv/version.rb +3 -1
  12. data/lib/smarter_csv.rb +524 -10
  13. data/smarter_csv.gemspec +22 -7
  14. metadata +55 -177
  15. data/.gitignore +0 -10
  16. data/.rspec +0 -2
  17. data/.travis.yml +0 -27
  18. data/lib/smarter_csv/smarter_csv.rb +0 -461
  19. data/spec/fixtures/additional_separator.csv +0 -6
  20. data/spec/fixtures/basic.csv +0 -8
  21. data/spec/fixtures/binary.csv +0 -1
  22. data/spec/fixtures/carriage_returns_n.csv +0 -18
  23. data/spec/fixtures/carriage_returns_quoted.csv +0 -3
  24. data/spec/fixtures/carriage_returns_r.csv +0 -1
  25. data/spec/fixtures/carriage_returns_rn.csv +0 -18
  26. data/spec/fixtures/chunk_cornercase.csv +0 -10
  27. data/spec/fixtures/duplicate_headers.csv +0 -3
  28. data/spec/fixtures/empty.csv +0 -5
  29. data/spec/fixtures/empty_columns_1.csv +0 -2
  30. data/spec/fixtures/empty_columns_2.csv +0 -2
  31. data/spec/fixtures/hard_sample.csv +0 -2
  32. data/spec/fixtures/ignore_comments.csv +0 -11
  33. data/spec/fixtures/ignore_comments2.csv +0 -3
  34. data/spec/fixtures/key_mapping.csv +0 -2
  35. data/spec/fixtures/line_endings_n.csv +0 -4
  36. data/spec/fixtures/line_endings_r.csv +0 -1
  37. data/spec/fixtures/line_endings_rn.csv +0 -4
  38. data/spec/fixtures/lots_of_columns.csv +0 -2
  39. data/spec/fixtures/malformed.csv +0 -3
  40. data/spec/fixtures/malformed_header.csv +0 -3
  41. data/spec/fixtures/money.csv +0 -3
  42. data/spec/fixtures/no_header.csv +0 -7
  43. data/spec/fixtures/numeric.csv +0 -5
  44. data/spec/fixtures/pets.csv +0 -5
  45. data/spec/fixtures/problematic.csv +0 -8
  46. data/spec/fixtures/quote_char.csv +0 -9
  47. data/spec/fixtures/quoted.csv +0 -5
  48. data/spec/fixtures/quoted2.csv +0 -4
  49. data/spec/fixtures/separator_colon.csv +0 -4
  50. data/spec/fixtures/separator_comma.csv +0 -4
  51. data/spec/fixtures/separator_pipe.csv +0 -4
  52. data/spec/fixtures/separator_semi.csv +0 -4
  53. data/spec/fixtures/separator_tab.csv +0 -4
  54. data/spec/fixtures/skip_lines.csv +0 -8
  55. data/spec/fixtures/trading.csv +0 -3
  56. data/spec/fixtures/user_import.csv +0 -3
  57. data/spec/fixtures/valid_unicode.csv +0 -5
  58. data/spec/fixtures/with_dashes.csv +0 -8
  59. data/spec/fixtures/with_dates.csv +0 -4
  60. data/spec/smarter_csv/additional_separator_spec.rb +0 -45
  61. data/spec/smarter_csv/binary_file2_spec.rb +0 -24
  62. data/spec/smarter_csv/binary_file_spec.rb +0 -22
  63. data/spec/smarter_csv/blank_spec.rb +0 -55
  64. data/spec/smarter_csv/carriage_return_spec.rb +0 -190
  65. data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
  66. data/spec/smarter_csv/close_file_spec.rb +0 -15
  67. data/spec/smarter_csv/column_separator_spec.rb +0 -95
  68. data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
  69. data/spec/smarter_csv/duplicate_headers_spec.rb +0 -76
  70. data/spec/smarter_csv/empty_columns_spec.rb +0 -74
  71. data/spec/smarter_csv/extenstions_spec.rb +0 -17
  72. data/spec/smarter_csv/hard_sample_spec.rb +0 -24
  73. data/spec/smarter_csv/header_transformation_spec.rb +0 -21
  74. data/spec/smarter_csv/ignore_comments_spec.rb +0 -45
  75. data/spec/smarter_csv/invalid_headers_spec.rb +0 -38
  76. data/spec/smarter_csv/keep_headers_spec.rb +0 -24
  77. data/spec/smarter_csv/key_mapping_spec.rb +0 -56
  78. data/spec/smarter_csv/line_ending_spec.rb +0 -43
  79. data/spec/smarter_csv/load_basic_spec.rb +0 -20
  80. data/spec/smarter_csv/malformed_spec.rb +0 -25
  81. data/spec/smarter_csv/no_header_spec.rb +0 -29
  82. data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
  83. data/spec/smarter_csv/parse/column_separator_spec.rb +0 -61
  84. data/spec/smarter_csv/parse/old_csv_library_spec.rb +0 -74
  85. data/spec/smarter_csv/parse/rfc4180_and_more_spec.rb +0 -170
  86. data/spec/smarter_csv/problematic.rb +0 -34
  87. data/spec/smarter_csv/quoted_spec.rb +0 -52
  88. data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
  89. data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
  90. data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
  91. data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
  92. data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
  93. data/spec/smarter_csv/skip_lines_spec.rb +0 -29
  94. data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
  95. data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
  96. data/spec/smarter_csv/trading_spec.rb +0 -25
  97. data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
  98. data/spec/smarter_csv/value_converters_spec.rb +0 -52
  99. data/spec/spec/spec_helper.rb +0 -17
  100. data/spec/spec.opts +0 -2
  101. data/spec/spec_helper.rb +0 -21
data/lib/smarter_csv.rb CHANGED
@@ -1,12 +1,526 @@
1
- if ENV['COVERAGE']
2
- require 'simplecov'
3
- SimpleCov.start do
4
- add_filter "/spec/"
5
- add_filter "/pkg/"
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "extensions/hash"
4
+ require_relative "smarter_csv/version"
5
+
6
+ require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
7
+ # require 'smarter_csv.bundle' unless ENV['CI'] # does not compile/link in CI?
8
+
9
+ module SmarterCSV
10
# Base class for every error raised by SmarterCSV, so callers can rescue them all at once.
class SmarterCSVException < StandardError
end

# Raised when :user_provided_headers has a different number of entries than the header line in the file.
class HeaderSizeMismatch < SmarterCSVException
end

# Raised for invalid option combinations (e.g. no headers in the file and none provided).
class IncorrectOption < SmarterCSVException
end

# Raised when the processed headers contain duplicates.
class DuplicateHeaders < SmarterCSVException
end

# Raised when one of the :required_headers is not present.
class MissingHeaders < SmarterCSVException
end

# Raised when col_sep auto-detection finds none of the candidate separators.
class NoColSepDetected < SmarterCSVException
end

class KeyMappingError < SmarterCSVException
end

class MalformedCSVError < SmarterCSVException
end
18
+
19
# Reads CSV data and converts every data line into a Hash keyed by the processed headers.
#
# input   - a filename, or an input object which responds to +readline+
# options - Hash of options; merged over +default_options+
# block   - optional; is yielded an Array of Hashes (a full chunk when :chunk_size is set,
#           otherwise a one-element Array per CSV line)
#
# Returns, without a block, the result Array: an Array of Hashes, or an Array of Arrays
# of Hashes in chunked mode. With a block it returns the chunk counter, which is only
# maintained in chunked mode (it is nil when :chunk_size is not set).
#
# The input is closed on the way out if it responds to +close+, even when it was
# passed in already opened.
def SmarterCSV.process(input, options = {}, &block)
  options = default_options.merge(options)
  options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
  puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]

  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !!defined?(Rails)

  # lines are re-encoded when UTF-8 is forced or the declared file encoding is not UTF-8;
  # invalid / undefined byte sequences are replaced with options[:invalid_byte_sequence]
  reencode = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
  to_utf8 = lambda do |str|
    str.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
  end

  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    wants_utf8 = options[:force_utf8] || options[:file_encoding] =~ /utf-8/i
    not_utf8 = (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8')) ||
               (fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
    if wants_utf8 && not_utf8
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # skipped lines are still counted by readline_with_counts
    options[:skip_lines].to_i.times { readline_with_counts(fh, options) } if options[:skip_lines].to_i > 0

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    use_chunks = !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
    if use_chunks
      chunk_size = options[:chunk_size].to_i
      chunk_count = 0
      chunk = []
    end

    # now on to processing all the rest of the lines in the CSV file:
    # (we can't use fh.readlines() here, because this would read the whole file into memory at once)
    until fh.eof?
      line = readline_with_counts(fh, options)
      line = to_utf8.call(line) if reencode

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # Quoted CSV data may contain the row separator, in which case one CSV row spans
      # several file lines. This is detected by an uneven number of quote characters:
      # keep appending file lines until the quotes are balanced.
      # NOTE(review): a nil :quote_char is not handled here, and an unterminated quote
      # presumably ends with readline raising at end of input - confirm desired behavior.
      multiline = line.count(options[:quote_char]).odd?
      while line.count(options[:quote_char]).odd?
        next_line = fh.readline(options[:row_sep])
        next_line = to_utf8.call(next_line) if reencode
        line += next_line
        @file_line_count += 1 # only the file line count advances; it is still the same CSV line
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)
      dataA.map!(&:strip) if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

      # delete the key/value pairs of columns the user mapped to nil or to an empty key
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if { |_k, v| has_rails ? v.blank? : blank?(v) }
      end

      hash.delete_if { |_k, v| !v.nil? && v =~ /^(\d+|\d+\.\d+)$/ && v.to_f == 0 } if options[:remove_zero_values] # values are typically Strings!
      hash.delete_if { |_k, v| v =~ options[:remove_values_matching] } if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

          # convert if it's a numeric value:
          case v
          when /^[+-]?\d+\.\d+$/
            hash[k] = v.to_f
          when /^[+-]?\d+$/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]

      if use_chunks
        chunk << hash # append temp result to chunk

        # hand the chunk over once it is full, or when EOF is reached
        if chunk.size >= chunk_size || fh.eof?
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            result << chunk # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = [] # initialize for next chunk of data
        end
        # while a chunk is being filled up we don't need to do anything else here
      elsif block_given?
        yield [hash] # do something with the hash in the block (better to use chunking here)
      else
        result << hash
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk: it may contain partial data (e.g. when the final lines were skipped
    # by one of the `next` statements above), which also needs to be returned
    if chunk && !chunk.empty?
      if block_given?
        yield chunk
      else
        result << chunk
      end
      chunk_count += 1
      chunk = []
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  if block_given?
    return chunk_count # when we do processing through a block we only care how many chunks we processed
  else
    return result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  end
end
7
- end
8
188
 
9
- require 'csv'
10
- require "smarter_csv/version"
11
- require "extensions/hash.rb"
12
- require "smarter_csv/smarter_csv.rb"
189
+ class << self
190
# Tells whether the C implementation of the line parser (parse_csv_line_c) is available.
# A positive answer is cached in @has_acceleration; a negative one is re-checked on each call.
def has_acceleration?
  @has_acceleration = !!defined?(parse_csv_line_c) unless @has_acceleration
  @has_acceleration
end
193
+
194
# The header line as it was read from the input by the most recent process_headers call,
# or nil when no header line was read.
def raw_header
  @raw_header
end
197
+
198
# The final, processed headers stored by the most recent process_headers call,
# or nil when header processing has not completed.
def headers
  @headers
end
201
+
202
+ protected
203
+
204
# Default values for all options; SmarterCSV.process merges the user's options over these.
#
# NOTE: this is not called when "parse" methods are tested by themselves
#
# Returns a new Hash on every call.
def default_options
  {
    acceleration: true, # use the C parser when it is available
    auto_row_sep_chars: 500, # max. number of characters inspected when row_sep is :auto
    chunk_size: nil,
    col_sep: ',',
    comment_regexp: nil, # was: /\A#/,
    convert_values_to_numeric: true,
    downcase_header: true,
    duplicate_header_suffix: nil,
    file_encoding: 'utf-8',
    force_simple_split: false,
    force_utf8: false,
    headers_in_file: true,
    invalid_byte_sequence: '',
    keep_original_headers: false,
    key_mapping: nil, # this is the key header processing reads for re-mapping headers
    key_mapping_hash: nil, # kept for backwards compatibility
    quote_char: '"',
    remove_empty_hashes: true,
    remove_empty_values: true,
    remove_unmapped_keys: false,
    remove_values_matching: nil,
    remove_zero_values: false,
    required_headers: nil,
    row_sep: $/,
    skip_lines: nil,
    strings_as_keys: false,
    strip_chars_from_headers: nil,
    strip_whitespace: true,
    user_provided_headers: nil,
    value_converters: nil,
    verbose: false,
    with_line_numbers: false,
  }
end
240
+
241
# Reads the next line (terminated by options[:row_sep]) from filehandle and bumps both
# the file line counter and the CSV line counter. The counters are only touched after
# the read succeeded.
#
# Returns the line that was read.
def readline_with_counts(filehandle, options)
  filehandle.readline(options[:row_sep]).tap do
    @file_line_count += 1
    @csv_line_count += 1
  end
end
247
+
248
###
### Thin wrapper around C-extension
###
# Splits one line into its fields, using the C implementation when options[:acceleration]
# is set and the extension is available, and the pure-Ruby implementation otherwise.
#
# line        - the String to split
# options     - Hash; :col_sep, :quote_char and :acceleration are read here
# header_size - optional maximum number of fields to return
#
# Returns a two-element Array: [fields, number_of_fields].
def parse(line, options, header_size = nil)
  # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]

  if options[:acceleration] && has_acceleration?
    # :nocov:
    quote_char = options[:quote_char]
    # literal substring test: the quote character must not be interpreted as a regular
    # expression (a quote_char such as "(" or "|" would break or always match)
    has_quotes = !line.nil? && line.include?(quote_char.to_s)
    elements = parse_csv_line_c(line, options[:col_sep], quote_char, header_size)
    elements.map! { |x| cleanup_quotes(x, quote_char) } if has_quotes
    return [elements, elements.size]
    # :nocov:
  else
    # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
    return parse_csv_line_ruby(line, options, header_size)
  end
end
266
+
267
# ------------------------------------------------------------------
# Ruby equivalent of the C-extension for parse_line
#
# Parses a single line: either a CSV header line or a body line.
# - quoting rules compared to RFC-4180 are somewhat relaxed
# - we are not assuming that quotes inside a field need to be doubled
# - we are not assuming that all fields need to be quoted (0 is even)
# - works with multi-char col_sep
# - if header_size is given, only up to header_size fields are parsed
#
# header_size is passed for body lines so that the number of fields never exceeds the
# number of headers: when a line contains more fields than that (e.g. because of
# trailing col_sep characters), the remaining fields are ignored.
#
# Our convention is that empty fields are returned as empty strings, not as nil.
#
# Returns [fields, number_of_fields]; a nil line returns an empty Array.
def parse_csv_line_ruby(line, options, header_size = nil)
  return [] if line.nil?

  separator = options[:col_sep]
  separator_length = separator.size
  quote = options[:quote_char]
  fields = []
  inside_quotes = false # true while an odd number of quote chars has been seen
  field_start = 0
  pos = 0
  limit_reached = -> { !header_size.nil? && fields.size >= header_size }

  until pos >= line.size
    if !inside_quotes && line[pos, separator_length] == separator
      # a separator outside of quotes ends the current field
      break if limit_reached.call

      fields << cleanup_quotes(line[field_start...pos], quote)
      pos += separator_length
      field_start = pos
    else
      inside_quotes = !inside_quotes if line[pos] == quote
      pos += 1
    end
  end

  # whatever follows the last separator is the final field
  fields << cleanup_quotes(line[field_start..-1], quote) unless limit_reached.call
  [fields, fields.size]
end
314
+
315
# Removes the quoting from a single field: one pair of enclosing quote characters is
# stripped (only if the field both starts and ends with the quote), and doubled quote
# characters are collapsed into a single one.
#
# The field passed in is never modified, so frozen strings are fine; when there is
# nothing to clean up, the very same object is returned.
#
# field - the raw field String, or nil
# quote - the quote character (String)
#
# Returns the cleaned-up String, or nil when field is nil.
def cleanup_quotes(field, quote)
  return field if field.nil?

  if field.start_with?(quote) && field.end_with?(quote)
    field = field.delete_prefix(quote).delete_suffix(quote)
  end

  doubled_quote = "#{quote}#{quote}"
  field.include?(doubled_quote) ? field.gsub(doubled_quote, quote) : field
end
327
+
328
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
329
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
330
+ BLANK_RE = /\A\s*\z/.freeze
331
+
332
# Tells whether a value counts as blank:
# - nil is blank
# - a String is blank when it is empty or consists of whitespace only
# - an Array is blank when all of its elements are blank (or it is empty)
# - a Hash is blank when all of its values are blank (or it is empty)
# - anything else is not blank
#
# Elements of Arrays / Hashes are checked with elem_blank?, i.e. not recursively.
def blank?(value)
  if value.nil?
    true
  elsif value.is_a?(String)
    value.empty? || BLANK_RE.match?(value)
  elsif value.is_a?(Array)
    value.all? { |element| elem_blank?(element) }
  elsif value.is_a?(Hash)
    value.values.all? { |element| elem_blank?(element) }
  else
    false
  end
end
350
+
351
# Blank check for a single element: nil is blank, a String is blank when it is empty
# or consists of whitespace only, every other value is not blank.
def elem_blank?(value)
  return true if value.nil?
  return false unless value.is_a?(String)

  value.empty? || BLANK_RE.match?(value)
end
363
+
364
# Acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash.
#
# The option named option_name may be a Hash with an :except or an :only entry
# (a single key or an Array of keys). When :except is present, :only is not consulted.
#
# Returns true when processing must be skipped for key, false otherwise
# (always false when the option is not a Hash).
def only_or_except_limit_execution(options, option_name, key)
  setting = options[option_name]
  return false unless setting.is_a?(Hash)

  return Array(setting[:except]).include?(key) if setting.has_key?(:except)
  return !Array(setting[:only]).include?(key) if setting.has_key?(:only)

  false
end
375
+
376
# Guesses the column separator by counting the candidate characters in up to the first
# five lines of the input and picking the most frequent one (on a tie, the candidate
# listed first wins). The filehandle is rewound afterwards.
#
# Returns the detected separator String.
# Raises SmarterCSV::NoColSepDetected if none of the candidates occurs at all -
# this includes empty input.
def guess_column_separator(filehandle, options)
  candidates = [',', "\t", ';', ':', '|']
  counts = Hash.new(0)

  5.times do
    line = filehandle.readline(options[:row_sep])
    candidates.each { |candidate| counts[candidate] += line.count(candidate) }
  rescue EOFError # short files
    break
  end

  filehandle.rewind

  # counts is empty (max is nil) when not a single line could be read
  best = counts.values.max
  raise SmarterCSV::NoColSepDetected if best.nil? || best.zero?

  counts.key(best)
end
395
+
396
# Guesses the row separator by counting occurrences of "\n", "\r" and "\r\n" in the input,
# ignoring everything between quote characters.
#
# At most options[:auto_row_sep_chars] characters outside of quotes are inspected;
# the whole input is read when that option is nil or not positive.
# The filehandle is rewound afterwards.
#
# Returns the most frequent of the three line endings.
def guess_line_ending(filehandle, options)
  tally = {"\n" => 0, "\r" => 0, "\r\n" => 0}
  inside_quotes = false
  previous = nil
  inspected = 0
  limit = options[:auto_row_sep_chars]

  filehandle.each_char do |char|
    inside_quotes = !inside_quotes if char == options[:quote_char]
    next if inside_quotes

    if previous == "\r"
      # a "\r" can only be classified once the character following it is known
      tally[char == "\n" ? "\r\n" : "\r"] += 1
    elsif char == "\n"
      tally["\n"] += 1
    end
    previous = char
    inspected += 1
    break if limit && limit > 0 && inspected >= limit
  end
  filehandle.rewind

  # a "\r" that was the very last inspected character has not been counted yet
  tally["\r"] += 1 if previous == "\r"

  # find the most frequent key/value pair:
  tally.max_by { |_, count| count }.first
end
429
+
430
# Determines the headers (hash keys) used for all data lines.
#
# With options[:headers_in_file] the first line of the input is read and normalized
# (comment prefix, :strip_chars_from_headers, quote chars, whitespace, down-casing);
# :user_provided_headers, if given, take precedence over the headers found in the file.
# Afterwards duplicates are disambiguated (:duplicate_header_suffix), headers are turned
# into Symbols (unless :strings_as_keys / :keep_original_headers), and :key_mapping is
# applied (not for user-provided headers).
#
# Stores the raw header line in @raw_header and the final headers in @headers.
#
# Returns [headers, header_size]; header_size is taken before key mapping is applied.
# Raises SmarterCSV::IncorrectOption, HeaderSizeMismatch, DuplicateHeaders or
# MissingHeaders when the corresponding validation fails.
def process_headers(filehandle, options)
  @raw_header = nil
  @headers = nil
  if options[:headers_in_file] # extract the header line
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header = readline_with_counts(filehandle, options)
    @raw_header = header

    header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
    header = header.chomp(options[:row_sep])

    header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

    file_headerA, file_header_size = parse(header, options)

    # remove any remaining quote chars; a String pattern is matched literally, so a
    # quote_char which is a regexp meta-character can not break this
    quote_char = options[:quote_char].to_s
    file_headerA.map! { |x| x.gsub(quote_char, '') }
    file_headerA.map!(&:strip) if options[:strip_whitespace]
    unless options[:keep_original_headers]
      file_headerA.map! { |x| x.gsub(/\s+|-+/, '_') }
      file_headerA.map!(&:downcase) if options[:downcase_header]
    end
  else
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
  end

  user_headers = options[:user_provided_headers]
  if user_headers.is_a?(Array) && !user_headers.empty?
    # use user-provided headers; work on a copy, because the headers are modified in place below
    headerA = user_headers.dup
    # file_header_size is nil when no header line was read from the file
    if !file_header_size.nil? && headerA.size != file_header_size
      raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
    end
  else
    headerA = file_headerA
  end

  # detect duplicate headers and disambiguate
  headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
  header_size = headerA.size # used for splitting lines

  headerA.map!(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]

  unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
    key_mappingH = options[:key_mapping]

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    if key_mappingH.is_a?(Hash) && !key_mappingH.empty?
      # we can't map keys that are not there
      missing_keys = key_mappingH.keys - headerA
      puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?

      headerA.map! do |x|
        if key_mappingH.has_key?(x)
          key_mappingH[x]
        else
          options[:remove_unmapped_keys] ? nil : x
        end
      end
    end
  end

  # header_validations
  # (nil headers are columns which were mapped away; they don't count as duplicates)
  duplicate_headers = headerA.compact.select { |k| headerA.count(k) > 1 }

  unless options[:user_provided_headers] || duplicate_headers.empty?
    raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
  end

  if options[:required_headers].is_a?(Array)
    missing_headers = options[:required_headers].reject { |k| headerA.include?(k) }
    raise SmarterCSV::MissingHeaders, "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
  end

  @headers = headerA
  [headerA, header_size]
end
511
+
512
# Disambiguates repeated headers: the first occurrence of a header is kept as is, every
# further occurrence gets options[:duplicate_header_suffix] and its occurrence number
# appended (counting starts at 2).
#
# Returns a new Array; the headers passed in are not modified.
def process_duplicate_headers(headers, options)
  occurrences = Hash.new(0)
  headers.map do |name|
    seen = (occurrences[name] += 1)
    seen == 1 ? name : [name, options[:duplicate_header_suffix], seen].join
  end
end
525
+ end
526
+ end
data/smarter_csv.gemspec CHANGED
@@ -12,14 +12,29 @@ Gem::Specification.new do |spec|
12
12
  spec.homepage = "https://github.com/tilo/smarter_csv"
13
13
  spec.license = 'MIT'
14
14
 
15
- spec.files = `git ls-files`.split($\)
15
+ spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["source_code_uri"] = spec.homepage
17
+ spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
21
+ spec.files = Dir.chdir(__dir__) do
22
+ `git ls-files -z`.split("\x0").reject do |f|
23
+ (f == __FILE__) ||
24
+ f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}) || f.match(/\.h\z/)
25
+ end
26
+ end
27
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
28
+
16
29
  spec.executables = spec.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
- spec.require_paths = ["lib"]
30
+ spec.require_paths = ["lib"] # add ext here?
31
+ spec.extensions = ["ext/smarter_csv/extconf.rb"]
32
+
33
+
34
+ spec.add_development_dependency "awesome_print"
35
+ spec.add_development_dependency "codecov"
36
+ spec.add_development_dependency "pry"
19
37
  spec.add_development_dependency "rspec"
38
+ spec.add_development_dependency "rubocop"
20
39
  spec.add_development_dependency "simplecov"
21
- spec.add_development_dependency "awesome_print"
22
- # spec.add_development_dependency "guard-rspec"
23
-
24
- spec.metadata["homepage_uri"] = spec.homepage
25
40
  end