smarter_csv 1.6.1 → 1.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -0
  3. data/CHANGELOG.md +22 -1
  4. data/CONTRIBUTORS.md +3 -0
  5. data/Gemfile +7 -4
  6. data/README.md +8 -6
  7. data/Rakefile +15 -13
  8. data/ext/smarter_csv/extconf.rb +14 -0
  9. data/ext/smarter_csv/smarter_csv.c +86 -0
  10. data/lib/extensions/hash.rb +4 -2
  11. data/lib/smarter_csv/version.rb +3 -1
  12. data/lib/smarter_csv.rb +519 -10
  13. data/smarter_csv.gemspec +22 -7
  14. metadata +54 -176
  15. data/.gitignore +0 -10
  16. data/.rspec +0 -2
  17. data/.travis.yml +0 -27
  18. data/lib/smarter_csv/smarter_csv.rb +0 -461
  19. data/spec/fixtures/additional_separator.csv +0 -6
  20. data/spec/fixtures/basic.csv +0 -8
  21. data/spec/fixtures/binary.csv +0 -1
  22. data/spec/fixtures/carriage_returns_n.csv +0 -18
  23. data/spec/fixtures/carriage_returns_quoted.csv +0 -3
  24. data/spec/fixtures/carriage_returns_r.csv +0 -1
  25. data/spec/fixtures/carriage_returns_rn.csv +0 -18
  26. data/spec/fixtures/chunk_cornercase.csv +0 -10
  27. data/spec/fixtures/duplicate_headers.csv +0 -3
  28. data/spec/fixtures/empty.csv +0 -5
  29. data/spec/fixtures/empty_columns_1.csv +0 -2
  30. data/spec/fixtures/empty_columns_2.csv +0 -2
  31. data/spec/fixtures/hard_sample.csv +0 -2
  32. data/spec/fixtures/ignore_comments.csv +0 -11
  33. data/spec/fixtures/ignore_comments2.csv +0 -3
  34. data/spec/fixtures/key_mapping.csv +0 -2
  35. data/spec/fixtures/line_endings_n.csv +0 -4
  36. data/spec/fixtures/line_endings_r.csv +0 -1
  37. data/spec/fixtures/line_endings_rn.csv +0 -4
  38. data/spec/fixtures/lots_of_columns.csv +0 -2
  39. data/spec/fixtures/malformed.csv +0 -3
  40. data/spec/fixtures/malformed_header.csv +0 -3
  41. data/spec/fixtures/money.csv +0 -3
  42. data/spec/fixtures/no_header.csv +0 -7
  43. data/spec/fixtures/numeric.csv +0 -5
  44. data/spec/fixtures/pets.csv +0 -5
  45. data/spec/fixtures/problematic.csv +0 -8
  46. data/spec/fixtures/quote_char.csv +0 -9
  47. data/spec/fixtures/quoted.csv +0 -5
  48. data/spec/fixtures/quoted2.csv +0 -4
  49. data/spec/fixtures/separator_colon.csv +0 -4
  50. data/spec/fixtures/separator_comma.csv +0 -4
  51. data/spec/fixtures/separator_pipe.csv +0 -4
  52. data/spec/fixtures/separator_semi.csv +0 -4
  53. data/spec/fixtures/separator_tab.csv +0 -4
  54. data/spec/fixtures/skip_lines.csv +0 -8
  55. data/spec/fixtures/trading.csv +0 -3
  56. data/spec/fixtures/user_import.csv +0 -3
  57. data/spec/fixtures/valid_unicode.csv +0 -5
  58. data/spec/fixtures/with_dashes.csv +0 -8
  59. data/spec/fixtures/with_dates.csv +0 -4
  60. data/spec/smarter_csv/additional_separator_spec.rb +0 -45
  61. data/spec/smarter_csv/binary_file2_spec.rb +0 -24
  62. data/spec/smarter_csv/binary_file_spec.rb +0 -22
  63. data/spec/smarter_csv/blank_spec.rb +0 -55
  64. data/spec/smarter_csv/carriage_return_spec.rb +0 -190
  65. data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
  66. data/spec/smarter_csv/close_file_spec.rb +0 -15
  67. data/spec/smarter_csv/column_separator_spec.rb +0 -95
  68. data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
  69. data/spec/smarter_csv/duplicate_headers_spec.rb +0 -76
  70. data/spec/smarter_csv/empty_columns_spec.rb +0 -74
  71. data/spec/smarter_csv/extenstions_spec.rb +0 -17
  72. data/spec/smarter_csv/hard_sample_spec.rb +0 -24
  73. data/spec/smarter_csv/header_transformation_spec.rb +0 -21
  74. data/spec/smarter_csv/ignore_comments_spec.rb +0 -45
  75. data/spec/smarter_csv/invalid_headers_spec.rb +0 -38
  76. data/spec/smarter_csv/keep_headers_spec.rb +0 -24
  77. data/spec/smarter_csv/key_mapping_spec.rb +0 -56
  78. data/spec/smarter_csv/line_ending_spec.rb +0 -43
  79. data/spec/smarter_csv/load_basic_spec.rb +0 -20
  80. data/spec/smarter_csv/malformed_spec.rb +0 -25
  81. data/spec/smarter_csv/no_header_spec.rb +0 -29
  82. data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
  83. data/spec/smarter_csv/parse/column_separator_spec.rb +0 -61
  84. data/spec/smarter_csv/parse/old_csv_library_spec.rb +0 -74
  85. data/spec/smarter_csv/parse/rfc4180_and_more_spec.rb +0 -170
  86. data/spec/smarter_csv/problematic.rb +0 -34
  87. data/spec/smarter_csv/quoted_spec.rb +0 -52
  88. data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
  89. data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
  90. data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
  91. data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
  92. data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
  93. data/spec/smarter_csv/skip_lines_spec.rb +0 -29
  94. data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
  95. data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
  96. data/spec/smarter_csv/trading_spec.rb +0 -25
  97. data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
  98. data/spec/smarter_csv/value_converters_spec.rb +0 -52
  99. data/spec/spec/spec_helper.rb +0 -17
  100. data/spec/spec.opts +0 -2
  101. data/spec/spec_helper.rb +0 -21
data/lib/smarter_csv.rb CHANGED
@@ -1,12 +1,521 @@
1
- if ENV['COVERAGE']
2
- require 'simplecov'
3
- SimpleCov.start do
4
- add_filter "/spec/"
5
- add_filter "/pkg/"
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "extensions/hash"
4
+ require_relative "smarter_csv/version"
5
+ require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
6
+
7
+ module SmarterCSV
8
+ class SmarterCSVException < StandardError; end
9
+ class HeaderSizeMismatch < SmarterCSVException; end
10
+ class IncorrectOption < SmarterCSVException; end
11
+ class DuplicateHeaders < SmarterCSVException; end
12
+ class MissingHeaders < SmarterCSVException; end
13
+ class NoColSepDetected < SmarterCSVException; end
14
+ class KeyMappingError < SmarterCSVException; end
15
+ class MalformedCSVError < SmarterCSVException; end
16
+
17
+ # first parameter: filename or input object which responds to readline method
18
# Reads CSV data from +input+ and turns every data line into a Hash keyed by the headers.
#
# input   - a filename, or an object responding to +readline+ (it is also asked +eof?+
#           and is closed at the end if it responds to +close+)
# options - Hash merged over default_options
# block   - optional; receives an Array of hashes: one chunk when :chunk_size is > 0,
#           otherwise a one-element Array per CSV line
#
# Returns, without a block, an Array of Hashes (or an Array of chunks in chunked mode).
# With a block it returns the number of chunks processed, which is nil when no
# :chunk_size was given because the counter is only initialized in chunked mode.
def SmarterCSV.process(input, options = {}, &block)
  options = default_options.merge(options)
  options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
  puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]

  headerA = []
  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !!defined?(Rails)
  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # Integer#times is a no-op for zero or negative counts, so no extra guard is needed
    options[:skip_lines].to_i.times { readline_with_counts(fh, options) }

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    use_chunks = !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
    if use_chunks
      chunk_size = options[:chunk_size].to_i
      chunk_count = 0
      chunk = []
    end

    # the options do not change while reading lines, so decide once whether lines need re-encoding
    reencode = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

    # now on to processing all the rest of the lines in the CSV file:
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once
      line = readline_with_counts(fh, options)

      # replace invalid byte sequences in UTF-8 to avoid errors
      line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if reencode

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # Quoted CSV data may contain the row separator, in which case one CSV row is split across
      # several file lines. An odd number of quote characters means the row is still incomplete.
      multiline = line.count(options[:quote_char]).odd?
      while line.count(options[:quote_char]).odd?
        next_line = fh.readline(options[:row_sep])
        next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if reencode
        line += next_line
        @file_line_count += 1
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!(&:strip) if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

      # drop the key/value pairs whose header was mapped to nil / '' / :"" (i.e. columns the user wants removed).
      # The empty symbol literal is written directly: this file already requires a Ruby with `&.`.
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
      end

      hash.delete_if{|_k, v| !v.nil? && v =~ /^(\d+|\d+\.\d+)$/ && v.to_f == 0} if options[:remove_zero_values] # values are typically Strings!
      hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

          # convert if it's a numeric value:
          case v
          when /^[+-]?\d+\.\d+$/
            hash[k] = v.to_f
          when /^[+-]?\d+$/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      if use_chunks
        chunk << hash # append temp result to chunk

        # hand the chunk over once it is full, or when EOF is reached;
        # while a chunk is still being filled up nothing else needs to happen here
        if chunk.size >= chunk_size || fh.eof?
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            result << chunk # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = [] # initialize for next chunk of data
        end
      elsif block_given?
        yield [hash] # do something with the hash in the block (better to use chunking here)
      else
        result << hash
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk: lines skipped via `next` at the end of the file can leave a partially
    # filled chunk behind, which also needs to be returned (BUG / ISSUE-18)
    if !chunk.nil? && chunk.size > 0
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        result << chunk # not sure yet, why anybody would want to do this without a block
      end
      chunk_count += 1
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  # when we do processing through a block we only care how many chunks we processed;
  # otherwise return either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  block_given? ? chunk_count : result
end
7
- end
8
184
 
9
- require 'csv'
10
- require "smarter_csv/version"
11
- require "extensions/hash.rb"
12
- require "smarter_csv/smarter_csv.rb"
185
+ class << self
186
# Tells whether the method parse_csv_line_c is defined.
# A positive answer is remembered in @has_acceleration; a negative one is re-checked on every call.
def has_acceleration?
  return @has_acceleration if @has_acceleration

  @has_acceleration = defined?(parse_csv_line_c) ? true : false
end
189
+
190
# Returns @raw_header: the header line as read from the input by process_headers,
# or nil when process_headers did not read a header line from the file.
def raw_header
  @raw_header
end
193
+
194
# Returns @headers: the final list of headers that process_headers stored
# after all of its transformations and validations passed.
def headers
  @headers
end
197
+
198
+ protected
199
+
200
+ # NOTE: this is not called when "parse" methods are tested by themselves
201
# Returns a fresh Hash with the default value of every supported option.
# SmarterCSV.process merges the caller's options over this Hash.
def default_options
  {
    acceleration: true,
    auto_row_sep_chars: 500,
    chunk_size: nil,
    col_sep: ',',
    comment_regexp: nil, # was: /\A#/,
    convert_values_to_numeric: true,
    downcase_header: true,
    duplicate_header_suffix: nil,
    file_encoding: 'utf-8',
    force_simple_split: false,
    force_utf8: false,
    headers_in_file: true,
    invalid_byte_sequence: '',
    keep_original_headers: false,
    key_mapping: nil, # this is the key process_headers actually reads
    key_mapping_hash: nil, # kept for backward compatibility; not read by the code visible in this file
    quote_char: '"',
    remove_empty_hashes: true,
    remove_empty_values: true,
    remove_unmapped_keys: false,
    remove_values_matching: nil,
    remove_zero_values: false,
    required_headers: nil,
    row_sep: $/,
    skip_lines: nil,
    strings_as_keys: false,
    strip_chars_from_headers: nil,
    strip_whitespace: true,
    user_provided_headers: nil,
    value_converters: nil,
    verbose: false,
  }
end
235
+
236
# Reads one line (terminated by options[:row_sep]) from filehandle and returns it.
# Both @file_line_count and @csv_line_count are incremented after a successful read.
def readline_with_counts(filehandle, options)
  filehandle.readline(options[:row_sep]).tap do
    @file_line_count += 1
    @csv_line_count += 1
  end
end
242
+
243
+ ###
244
+ ### Thin wrapper around C-extension
245
+ ###
246
# Splits one CSV line into its fields.
#
# Uses parse_csv_line_c when options[:acceleration] is set and has_acceleration? is true,
# otherwise falls back to parse_csv_line_ruby.
#
# line        - the line to split
# options     - Hash; reads :acceleration, :col_sep and :quote_char
# header_size - optional field limit handed through to the parser
#
# Returns [elements, elements.size].
def parse(line, options, header_size = nil)
  return parse_csv_line_ruby(line, options, header_size) unless options[:acceleration] && has_acceleration?

  quote = options[:quote_char]
  # literal substring test: interpolating the quote character into a regexp
  # breaks for characters such as "*", "+" or "("
  has_quotes = line&.include?(quote)
  elements = parse_csv_line_c(line, options[:col_sep], quote, header_size)
  elements.map! { |x| cleanup_quotes(x, quote) } if has_quotes
  [elements, elements.size]
end
261
+
262
+ # ------------------------------------------------------------------
263
+ # Ruby equivalent of the C-extension for parse_line
264
+ #
265
+ # parses a single line: either a CSV header and body line
266
+ # - quoting rules compared to RFC-4180 are somewhat relaxed
267
+ # - we are not assuming that quotes inside a fields need to be doubled
268
+ # - we are not assuming that all fields need to be quoted (0 is even)
269
+ # - works with multi-char col_sep
270
+ # - if header_size is given, only up to header_size fields are parsed
271
+ #
272
+ # We use header_size for parsing the body lines to make sure we always match the number of headers
273
+ # in case there are trailing col_sep characters in line
274
+ #
275
+ # Our convention is that empty fields are returned as empty strings, not as nil.
276
+ #
277
+ #
278
+ # the purpose of the max_size parameter is to handle a corner case where
279
+ # CSV lines contain more fields than the header.
280
+ # In which case the remaining fields in the line are ignored
281
+ #
282
# Splits +line+ at every options[:col_sep] that is not inside a quoted section and
# passes each field through cleanup_quotes.
#
# line        - the String to split; nil yields an empty result
# options     - Hash; reads :col_sep (may be multi-char) and :quote_char
# header_size - when given, at most this many fields are returned
#
# Returns [elements, elements.size]; for a nil line that is [[], 0], so callers can
# always destructure the result.
def parse_csv_line_ruby(line, options, header_size = nil)
  return [[], 0] if line.nil?

  line_size = line.size
  col_sep = options[:col_sep]
  col_sep_size = col_sep.size
  quote = options[:quote_char]
  quote_count = 0
  elements = []
  start = 0
  i = 0

  while i < line_size
    # a separator only counts while we are outside quotes (even number of quote chars seen)
    if quote_count.even? && line[i, col_sep_size] == col_sep
      break if !header_size.nil? && elements.size >= header_size

      elements << cleanup_quotes(line[start...i], quote)
      i += col_sep_size
      start = i
    else
      quote_count += 1 if line[i] == quote
      i += 1
    end
  end
  # the last field has no trailing separator; skip it when the field limit is already reached
  elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
  [elements, elements.size]
end
309
+
310
# Removes one pair of enclosing quote characters from +field+ (only when it both starts and
# ends with +quote+) and collapses every doubled quote into a single one.
# The String is modified in place and returned; nil is returned unchanged.
def cleanup_quotes(field, quote)
  return field if field.nil?

  enclosed = field.start_with?(quote) && field.end_with?(quote)
  if enclosed
    field.delete_prefix!(quote)
    field.delete_suffix!(quote)
  end
  doubled_quote = "#{quote}#{quote}"
  field.gsub!(doubled_quote, quote)
  field
end
322
+
323
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
324
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
325
+ BLANK_RE = /\A\s*\z/.freeze
326
+
327
# Returns true for nil, for empty or whitespace-only Strings, and for Arrays / Hashes
# that are empty or whose elements / values are all blank according to elem_blank?.
# Every other value is not blank.
def blank?(value)
  case value
  when String then value.empty? || BLANK_RE.match?(value)
  when nil then true
  when Array then value.empty? || value.all? { |element| elem_blank?(element) }
  when Hash then value.empty? || value.values.all? { |element| elem_blank?(element) }
  else false
  end
end
345
+
346
# Blank check for a single element: true for nil and for empty or whitespace-only Strings,
# false for anything else (collections are not inspected).
def elem_blank?(value)
  case value
  when String then value.empty? || BLANK_RE.match?(value)
  when nil then true
  else false
  end
end
358
+
359
+ # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
360
# Decides whether processing of +key+ must be skipped for the option +option_name+.
#
# Only applies when options[option_name] is a Hash:
# - with an :except entry: skip the keys listed there (an :only entry is then ignored)
# - with an :only entry:   skip every key not listed there
#
# Returns true when the key must be skipped, false otherwise.
def only_or_except_limit_execution(options, option_name, key)
  setting = options[option_name]
  return false unless setting.is_a?(Hash)

  if setting.has_key?(:except)
    Array(setting[:except]).include?(key)
  elsif setting.has_key?(:only)
    !Array(setting[:only]).include?(key)
  else
    false
  end
end
370
+
371
+ # raise exception if none is found
372
# Guesses the column separator by counting the candidates , TAB ; : | in up to the
# first five lines of +filehandle+; the filehandle is rewound afterwards.
#
# Returns the candidate with the highest count (the earliest candidate wins a tie).
# Raises SmarterCSV::NoColSepDetected when no candidate occurs at all, which
# includes input that has no lines.
def guess_column_separator(filehandle, options)
  candidates = [',', "\t", ';', ':', '|']
  counts = Hash.new(0)

  5.times do
    line = filehandle.readline(options[:row_sep])
    candidates.each { |sep| counts[sep] += line.scan(sep).count }
  rescue EOFError # short files
    break
  end

  filehandle.rewind
  # counts stays empty when not a single line could be read, so max is nil in that case
  best = counts.values.max
  raise SmarterCSV::NoColSepDetected if best.nil? || best.zero?

  counts.key(best)
end
390
+
391
+ # limitation: this currently reads the whole file in before making a decision
392
# Guesses the row separator by counting "\n", "\r" and "\r\n" outside of quoted sections.
#
# Characters inside quotes are neither examined nor counted. When
# options[:auto_row_sep_chars] is a positive number, scanning stops after that many
# counted characters. The filehandle is rewound afterwards.
#
# Returns the line ending with the highest count.
def guess_line_ending(filehandle, options)
  tally = {"\n" => 0, "\r" => 0, "\r\n" => 0}
  inside_quotes = false
  previous = nil
  examined = 0
  limit = options[:auto_row_sep_chars]

  filehandle.each_char do |char|
    inside_quotes = !inside_quotes if char == options[:quote_char]
    next if inside_quotes

    if previous == "\r"
      # a \r can only be classified once the following character is known
      tally[char == "\n" ? "\r\n" : "\r"] += 1
    elsif char == "\n"
      tally["\n"] += 1
    end
    previous = char
    examined += 1
    break if limit && limit > 0 && examined >= limit
  end
  filehandle.rewind

  # a trailing \r has no follower, so it was not counted inside the loop
  tally["\r"] += 1 if previous == "\r"
  tally.max_by { |_ending, count| count }.first
end
424
+
425
# Determines the headers used as hash keys for all data lines.
#
# With options[:headers_in_file] the next line of +filehandle+ is read and normalized
# (comment prefix, row separator, :strip_chars_from_headers, quote chars, whitespace,
# underscores, downcasing - depending on the options). A non-empty Array given as
# :user_provided_headers replaces the file headers. Afterwards duplicates are
# disambiguated (:duplicate_header_suffix), keys are symbolized and, for file headers
# only, :key_mapping is applied.
#
# Side effects: sets @raw_header and @headers.
#
# Returns [headers, header_size]; header_size is taken before keys are mapped.
# Raises SmarterCSV::IncorrectOption, HeaderSizeMismatch, DuplicateHeaders or MissingHeaders.
def process_headers(filehandle, options)
  @raw_header = nil
  @headers = nil
  if options[:headers_in_file] # extract the header line
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header = readline_with_counts(filehandle, options)
    @raw_header = header

    header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
    header = header.chomp(options[:row_sep])

    header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

    file_headerA, file_header_size = parse(header, options)

    # remove the quote character as a literal string: interpolating it into a regexp
    # breaks for characters like "*" or "("
    quote_char = options[:quote_char].to_s
    file_headerA.map! { |x| x.gsub(quote_char, '') }
    file_headerA.map!(&:strip) if options[:strip_whitespace]
    unless options[:keep_original_headers]
      file_headerA.map! { |x| x.gsub(/\s+|-+/, '_') }
      file_headerA.map!(&:downcase) if options[:downcase_header]
    end
  else
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
  end

  user_headers = options[:user_provided_headers]
  if user_headers.instance_of?(Array) && !user_headers.empty?
    # work on a copy, so the in-place transformations below never modify the caller's Array
    headerA = user_headers.dup
    # file_header_size is nil when no header line was read from the file
    if !file_header_size.nil? && headerA.size != file_header_size
      raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
    end
  else
    headerA = file_headerA
  end

  # detect duplicate headers and disambiguate
  headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
  header_size = headerA.size # used for splitting lines

  headerA.map!(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]

  unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
    key_mappingH = options[:key_mapping]

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    if key_mappingH.instance_of?(Hash) && !key_mappingH.empty?
      # we can't map keys that are not there
      missing_keys = key_mappingH.keys - headerA
      puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?

      headerA.map! do |x|
        if key_mappingH.has_key?(x)
          key_mappingH[x]
        else
          options[:remove_unmapped_keys] ? nil : x
        end
      end
    end
  end

  # header_validations
  # (every occurrence of a duplicated header is listed, so the error message names it repeatedly)
  duplicate_headers = headerA.compact.select { |k| headerA.count(k) > 1 }

  unless options[:user_provided_headers] || duplicate_headers.empty?
    raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
  end

  if options[:required_headers].is_a?(Array)
    missing_headers = options[:required_headers].reject { |k| headerA.include?(k) }
    raise SmarterCSV::MissingHeaders, "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
  end

  @headers = headerA
  [headerA, header_size]
end
506
+
507
# Makes repeated headers unique: the first occurrence of a header is kept as is, every
# later occurrence becomes header + options[:duplicate_header_suffix] + occurrence number.
#
# Returns a new Array; +headers+ itself is not modified.
def process_duplicate_headers(headers, options)
  occurrences = Hash.new(0)
  headers.map do |header|
    nth = (occurrences[header] += 1)
    nth == 1 ? header : [header, options[:duplicate_header_suffix], nth].join
  end
end
520
+ end
521
+ end
data/smarter_csv.gemspec CHANGED
@@ -12,14 +12,29 @@ Gem::Specification.new do |spec|
12
12
  spec.homepage = "https://github.com/tilo/smarter_csv"
13
13
  spec.license = 'MIT'
14
14
 
15
- spec.files = `git ls-files`.split($\)
15
+ spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["source_code_uri"] = spec.homepage
17
+ spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
21
+ spec.files = Dir.chdir(__dir__) do
22
+ `git ls-files -z`.split("\x0").reject do |f|
23
+ (f == __FILE__) ||
24
+ f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}) || f.match(/\.h\z/)
25
+ end
26
+ end
27
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
28
+
16
29
  spec.executables = spec.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
- spec.require_paths = ["lib"]
30
+ spec.require_paths = ["lib"] # add ext here?
31
+ spec.extensions = ["ext/smarter_csv/extconf.rb"]
32
+
33
+
34
+ spec.add_development_dependency "awesome_print"
35
+ spec.add_development_dependency "codecov"
36
+ spec.add_development_dependency "pry"
19
37
  spec.add_development_dependency "rspec"
38
+ spec.add_development_dependency "rubocop"
20
39
  spec.add_development_dependency "simplecov"
21
- spec.add_development_dependency "awesome_print"
22
- # spec.add_development_dependency "guard-rspec"
23
-
24
- spec.metadata["homepage_uri"] = spec.homepage
25
40
  end