smarter_csv 1.6.1 → 1.7.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (101) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +133 -0
  3. data/CHANGELOG.md +22 -1
  4. data/CONTRIBUTORS.md +3 -0
  5. data/Gemfile +7 -4
  6. data/README.md +8 -6
  7. data/Rakefile +15 -13
  8. data/ext/smarter_csv/extconf.rb +14 -0
  9. data/ext/smarter_csv/smarter_csv.c +86 -0
  10. data/lib/extensions/hash.rb +4 -2
  11. data/lib/smarter_csv/version.rb +3 -1
  12. data/lib/smarter_csv.rb +519 -10
  13. data/smarter_csv.gemspec +22 -7
  14. metadata +54 -176
  15. data/.gitignore +0 -10
  16. data/.rspec +0 -2
  17. data/.travis.yml +0 -27
  18. data/lib/smarter_csv/smarter_csv.rb +0 -461
  19. data/spec/fixtures/additional_separator.csv +0 -6
  20. data/spec/fixtures/basic.csv +0 -8
  21. data/spec/fixtures/binary.csv +0 -1
  22. data/spec/fixtures/carriage_returns_n.csv +0 -18
  23. data/spec/fixtures/carriage_returns_quoted.csv +0 -3
  24. data/spec/fixtures/carriage_returns_r.csv +0 -1
  25. data/spec/fixtures/carriage_returns_rn.csv +0 -18
  26. data/spec/fixtures/chunk_cornercase.csv +0 -10
  27. data/spec/fixtures/duplicate_headers.csv +0 -3
  28. data/spec/fixtures/empty.csv +0 -5
  29. data/spec/fixtures/empty_columns_1.csv +0 -2
  30. data/spec/fixtures/empty_columns_2.csv +0 -2
  31. data/spec/fixtures/hard_sample.csv +0 -2
  32. data/spec/fixtures/ignore_comments.csv +0 -11
  33. data/spec/fixtures/ignore_comments2.csv +0 -3
  34. data/spec/fixtures/key_mapping.csv +0 -2
  35. data/spec/fixtures/line_endings_n.csv +0 -4
  36. data/spec/fixtures/line_endings_r.csv +0 -1
  37. data/spec/fixtures/line_endings_rn.csv +0 -4
  38. data/spec/fixtures/lots_of_columns.csv +0 -2
  39. data/spec/fixtures/malformed.csv +0 -3
  40. data/spec/fixtures/malformed_header.csv +0 -3
  41. data/spec/fixtures/money.csv +0 -3
  42. data/spec/fixtures/no_header.csv +0 -7
  43. data/spec/fixtures/numeric.csv +0 -5
  44. data/spec/fixtures/pets.csv +0 -5
  45. data/spec/fixtures/problematic.csv +0 -8
  46. data/spec/fixtures/quote_char.csv +0 -9
  47. data/spec/fixtures/quoted.csv +0 -5
  48. data/spec/fixtures/quoted2.csv +0 -4
  49. data/spec/fixtures/separator_colon.csv +0 -4
  50. data/spec/fixtures/separator_comma.csv +0 -4
  51. data/spec/fixtures/separator_pipe.csv +0 -4
  52. data/spec/fixtures/separator_semi.csv +0 -4
  53. data/spec/fixtures/separator_tab.csv +0 -4
  54. data/spec/fixtures/skip_lines.csv +0 -8
  55. data/spec/fixtures/trading.csv +0 -3
  56. data/spec/fixtures/user_import.csv +0 -3
  57. data/spec/fixtures/valid_unicode.csv +0 -5
  58. data/spec/fixtures/with_dashes.csv +0 -8
  59. data/spec/fixtures/with_dates.csv +0 -4
  60. data/spec/smarter_csv/additional_separator_spec.rb +0 -45
  61. data/spec/smarter_csv/binary_file2_spec.rb +0 -24
  62. data/spec/smarter_csv/binary_file_spec.rb +0 -22
  63. data/spec/smarter_csv/blank_spec.rb +0 -55
  64. data/spec/smarter_csv/carriage_return_spec.rb +0 -190
  65. data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
  66. data/spec/smarter_csv/close_file_spec.rb +0 -15
  67. data/spec/smarter_csv/column_separator_spec.rb +0 -95
  68. data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
  69. data/spec/smarter_csv/duplicate_headers_spec.rb +0 -76
  70. data/spec/smarter_csv/empty_columns_spec.rb +0 -74
  71. data/spec/smarter_csv/extenstions_spec.rb +0 -17
  72. data/spec/smarter_csv/hard_sample_spec.rb +0 -24
  73. data/spec/smarter_csv/header_transformation_spec.rb +0 -21
  74. data/spec/smarter_csv/ignore_comments_spec.rb +0 -45
  75. data/spec/smarter_csv/invalid_headers_spec.rb +0 -38
  76. data/spec/smarter_csv/keep_headers_spec.rb +0 -24
  77. data/spec/smarter_csv/key_mapping_spec.rb +0 -56
  78. data/spec/smarter_csv/line_ending_spec.rb +0 -43
  79. data/spec/smarter_csv/load_basic_spec.rb +0 -20
  80. data/spec/smarter_csv/malformed_spec.rb +0 -25
  81. data/spec/smarter_csv/no_header_spec.rb +0 -29
  82. data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
  83. data/spec/smarter_csv/parse/column_separator_spec.rb +0 -61
  84. data/spec/smarter_csv/parse/old_csv_library_spec.rb +0 -74
  85. data/spec/smarter_csv/parse/rfc4180_and_more_spec.rb +0 -170
  86. data/spec/smarter_csv/problematic.rb +0 -34
  87. data/spec/smarter_csv/quoted_spec.rb +0 -52
  88. data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
  89. data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
  90. data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
  91. data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
  92. data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
  93. data/spec/smarter_csv/skip_lines_spec.rb +0 -29
  94. data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
  95. data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
  96. data/spec/smarter_csv/trading_spec.rb +0 -25
  97. data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
  98. data/spec/smarter_csv/value_converters_spec.rb +0 -52
  99. data/spec/spec/spec_helper.rb +0 -17
  100. data/spec/spec.opts +0 -2
  101. data/spec/spec_helper.rb +0 -21
data/lib/smarter_csv.rb CHANGED
@@ -1,12 +1,521 @@
1
- if ENV['COVERAGE']
2
- require 'simplecov'
3
- SimpleCov.start do
4
- add_filter "/spec/"
5
- add_filter "/pkg/"
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "extensions/hash"
4
+ require_relative "smarter_csv/version"
5
+ require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
6
+
7
+ module SmarterCSV
8
# Common ancestor of all SmarterCSV errors, so callers can rescue the whole family at once.
class SmarterCSVException < StandardError
end

# Raised by process_headers when :user_provided_headers has a different number
# of entries than the header line found in the file.
class HeaderSizeMismatch < SmarterCSVException
end

# Raised by process_headers when :headers_in_file is false but no
# :user_provided_headers were given.
class IncorrectOption < SmarterCSVException
end

# Raised by process_headers when the final header list contains duplicates
# (only checked when the headers come from the file).
class DuplicateHeaders < SmarterCSVException
end

# Raised by process_headers when an entry of :required_headers is absent.
class MissingHeaders < SmarterCSVException
end

# Raised by guess_column_separator when none of the candidate separators occurs.
class NoColSepDetected < SmarterCSVException
end

# NOTE(review): not raised anywhere in the code visible here.
class KeyMappingError < SmarterCSVException
end

# NOTE(review): not raised anywhere in the code visible here.
class MalformedCSVError < SmarterCSVException
end
16
+
17
+ # first parameter: filename or input object which responds to readline method
18
# Reads CSV data and turns every data line into a Hash keyed by the headers.
#
# input   - a filename, or an object responding to #readline (it is used as-is
#           and is closed at the end if it responds to #close).
# options - Hash overriding the values from default_options.
# block   - optional; receives an Array of hashes: one chunk when :chunk_size
#           is a positive number, otherwise a one-element Array per line.
#
# Returns, without a block, an Array of hashes (or an Array of chunks when
# chunking). With a block it returns chunk_count, which is only assigned in
# chunked mode and is therefore nil otherwise.
def SmarterCSV.process(input, options = {}, &block)
  options = default_options.merge(options)
  options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
  puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]

  headerA = []
  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !!defined?(Rails)
  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # Integer#times is a no-op for zero and negative counts, so no extra guard is needed
    options[:skip_lines].to_i.times { readline_with_counts(fh, options) }

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequences with options[:invalid_byte_sequence] to avoid errors
      line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

      print format("processing file line %10d, csv line %10d\r", @file_line_count, @csv_line_count) if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # A quoted field may contain the row separator, in which case the row is
      # spread over several physical lines. An odd number of quote characters
      # signals this; keep appending lines until the quotes balance.
      multiline = line.count(options[:quote_char]).odd?
      while line.count(options[:quote_char]).odd?
        next_line = fh.readline(options[:row_sep])
        next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
        line += next_line
        @file_line_count += 1
      end
      print format("\nline contains uneven number of quote chars so including content through file line %d\n", @file_line_count) if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!(&:strip) if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

      # make sure we delete any key/value pairs from the hash, which the user wanted to delete:
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if { |_k, v| has_rails ? v.blank? : blank?(v) }
      end

      # \A and \z anchor the whole value; ^ and $ would match a single line
      # inside a multi-line quoted value and wrongly treat it as a number.
      hash.delete_if { |_k, v| !v.nil? && v =~ /\A(\d+|\d+\.\d+)\z/ && v.to_f == 0 } if options[:remove_zero_values] # values are typically Strings!
      hash.delete_if { |_k, v| v =~ options[:remove_values_matching] } if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

          # convert only if the whole value is numeric:
          case v
          when /\A[+-]?\d+\.\d+\z/
            hash[k] = v.to_f
          when /\A[+-]?\d+\z/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      if use_chunks
        chunk << hash # append temp result to chunk

        # while a chunk is being filled up we don't need to do anything else here
        if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            result << chunk # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = [] # initialize for next chunk of data
        end
      elsif block_given? # no chunk handling
        yield [hash] # do something with the hash in the block (better to use chunking here)
      else
        result << hash
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk: it may contain partial data, which also needs to be returned (BUG / ISSUE-18)
    if !chunk.nil? && chunk.size > 0
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        result << chunk # not sure yet, why anybody would want to do this without a block
      end
      chunk_count += 1
      chunk = [] # initialize for next chunk of data
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  # when we do processing through a block we only care how many chunks we processed;
  # otherwise return an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  block_given? ? chunk_count : result
end
7
- end
8
184
 
9
- require 'csv'
10
- require "smarter_csv/version"
11
- require "extensions/hash.rb"
12
- require "smarter_csv/smarter_csv.rb"
185
+ class << self
186
# Tells whether parse_csv_line_c (the C-extension line parser) is defined.
#
# The answer is computed on the first call and cached, including a negative
# answer: `||=` would re-run the check on every call whenever it was false.
def has_acceleration?
  return @has_acceleration unless @has_acceleration.nil?

  @has_acceleration = !!defined?(parse_csv_line_c)
end
189
+
190
# The header line as process_headers read it from the input. It is nil
# until a header line has been read, and is reset to nil at the start of
# every process_headers call.
def raw_header
  @raw_header
end
193
+
194
# The final list of headers (hash keys) stored by the most recent
# process_headers call; nil before the first call has finished.
def headers
  @headers
end
197
+
198
+ protected
199
+
200
+ # NOTE: this is not called when "parse" methods are tested by themselves
201
# Returns a fresh Hash of option defaults; SmarterCSV.process merges the
# caller's options over it.
#
# NOTE: this is not called when "parse" methods are tested by themselves
def default_options
  {
    acceleration: true, # use the C extension in parse when it is available
    auto_row_sep_chars: 500, # max characters examined by guess_line_ending
    chunk_size: nil,
    col_sep: ',',
    comment_regexp: nil, # was: /\A#/,
    convert_values_to_numeric: true,
    downcase_header: true,
    duplicate_header_suffix: nil,
    file_encoding: 'utf-8',
    force_simple_split: false, # NOTE(review): not read by the code visible here
    force_utf8: false,
    headers_in_file: true,
    invalid_byte_sequence: '',
    keep_original_headers: false,
    key_mapping: nil, # this is the key process_headers actually reads
    key_mapping_hash: nil, # kept for backward compatibility; not read by the code visible here
    quote_char: '"',
    remove_empty_hashes: true,
    remove_empty_values: true,
    remove_unmapped_keys: false,
    remove_values_matching: nil,
    remove_zero_values: false,
    required_headers: nil,
    row_sep: $/,
    skip_lines: nil,
    strings_as_keys: false,
    strip_chars_from_headers: nil,
    strip_whitespace: true,
    user_provided_headers: nil,
    value_converters: nil,
    verbose: false,
  }
end
235
+
236
# Reads one record (terminated by options[:row_sep]) from filehandle and
# returns it, incrementing both the file-line and csv-line counters.
# If readline raises (e.g. EOFError), the counters are left untouched.
def readline_with_counts(filehandle, options)
  filehandle.readline(options[:row_sep]).tap do
    @file_line_count += 1
    @csv_line_count += 1
  end
end
242
+
243
+ ###
244
+ ### Thin wrapper around C-extension
245
+ ###
246
# Splits one CSV line into fields. It is a thin wrapper around the
# C extension, falling back to parse_csv_line_ruby.
#
# line        - the line to split (row separator already removed by the callers here)
# options     - reads :acceleration, :col_sep and :quote_char
# header_size - optional cap on the number of fields
#
# Returns [fields, fields.size].
def parse(line, options, header_size = nil)
  unless options[:acceleration] && has_acceleration?
    return parse_csv_line_ruby(line, options, header_size)
  end

  quote_char = options[:quote_char]
  # Plain substring test: interpolating quote_char into a Regexp would treat
  # it as a pattern (and raise for characters such as "*" or "(").
  has_quotes = !line.nil? && line.include?(quote_char.to_s)
  elements = parse_csv_line_c(line, options[:col_sep], quote_char, header_size)
  elements.map! { |x| cleanup_quotes(x, quote_char) } if has_quotes
  [elements, elements.size]
end
261
+
262
+ # ------------------------------------------------------------------
263
+ # Ruby equivalent of the C-extension for parse_line
264
+ #
265
+ # parses a single line: either a CSV header or a body line
266
+ # - quoting rules compared to RFC-4180 are somewhat relaxed
267
+ # - we are not assuming that quotes inside a fields need to be doubled
268
+ # - we are not assuming that all fields need to be quoted (0 is even)
269
+ # - works with multi-char col_sep
270
+ # - if header_size is given, only up to header_size fields are parsed
271
+ #
272
+ # We use header_size for parsing the body lines to make sure we always match the number of headers
273
+ # in case there are trailing col_sep characters in line
274
+ #
275
+ # Our convention is that empty fields are returned as empty strings, not as nil.
276
+ #
277
+ #
278
+ # the purpose of the header_size parameter is to handle a corner case where
279
+ # CSV lines contain more fields than the header.
280
+ # In which case the remaining fields in the line are ignored
281
+ #
282
# Pure-Ruby line splitter (counterpart of the C extension).
#
# Walks the line character by character. A col_sep only ends a field when
# the number of quote characters seen so far is even, i.e. we are not inside
# a quoted section. col_sep may be longer than one character. When
# header_size is given, at most that many fields are returned and the rest
# of the line is ignored.
#
# Returns [fields, fields.size]. Note that a nil line returns a bare [].
def parse_csv_line_ruby(line, options, header_size = nil)
  return [] if line.nil?

  separator = options[:col_sep]
  sep_length = separator.size
  quote_char = options[:quote_char]
  capped = !header_size.nil?
  limit = line.size
  fields = []
  quotes_seen = 0
  field_start = 0
  pos = 0

  while pos < limit
    at_separator = quotes_seen.even? && line[pos, sep_length] == separator
    unless at_separator
      quotes_seen += 1 if line[pos] == quote_char
      pos += 1
      next
    end

    # enough fields collected: ignore whatever follows on this line
    break if capped && fields.size >= header_size

    fields << cleanup_quotes(line[field_start...pos], quote_char)
    pos += sep_length
    field_start = pos
  end

  # the text after the last separator is the final field (unless already at the cap)
  fields << cleanup_quotes(line[field_start..-1], quote_char) unless capped && fields.size >= header_size
  [fields, fields.size]
end
309
+
310
# Removes CSV quoting from a single field, modifying the string IN PLACE.
#
# If the field both starts and ends with the quote character, one leading
# and one trailing quote are removed; afterwards every doubled quote is
# collapsed into a single one.
#
# field - the raw field (String or nil)
# quote - the quote character; nil or "" means there is nothing to clean up
#
# Returns the same field object (nil is returned unchanged).
def cleanup_quotes(field, quote)
  # start_with?(nil) would raise a TypeError, and an empty quote changes nothing
  return field if field.nil? || quote.nil? || quote.empty?

  if field.start_with?(quote) && field.end_with?(quote)
    field.delete_prefix!(quote)
    field.delete_suffix!(quote)
  end
  field.gsub!("#{quote}#{quote}", quote)
  field
end
322
+
323
+ # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
324
+ # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
325
# Matches strings that are empty or consist only of whitespace.
BLANK_RE = /\A\s*\z/.freeze

# Rails-free blank check for a value or a flat collection of values.
#
# - String / nil: see elem_blank?
# - Array: blank when it is empty or every element is elem_blank?
# - Hash:  blank when it is empty or every value is elem_blank?
# - anything else: never blank
#
# Collections are only inspected one level deep: a nested Array or Hash
# element counts as non-blank.
def blank?(value)
  case value
  when String, NilClass
    elem_blank?(value)
  when Array
    value.all? { |x| elem_blank?(x) } # all? is true for an empty collection
  when Hash
    value.values.all? { |x| elem_blank?(x) }
  else
    false
  end
end

# Blank check for a single element: nil, "" and whitespace-only strings are
# blank; every other object is not.
def elem_blank?(value)
  case value
  when String
    BLANK_RE.match?(value) # also matches the empty string
  when NilClass
    true
  else
    false
  end
end
358
+
359
+ # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
360
# Decides whether processing for `key` must be skipped because of an
# :only / :except restriction given as options[option_name].
#
# Returns true (skip this key) when
#   - the option is a Hash with :except and key is listed there, or
#   - the option is a Hash with :only (and no :except) and key is NOT listed.
# Returns false in every other case, including when the option is not a Hash.
# When both are present, :except wins and :only is ignored.
def only_or_except_limit_execution(options, option_name, key)
  setting = options[option_name]
  return false unless setting.is_a?(Hash)

  if setting.key?(:except)
    Array(setting[:except]).include?(key)
  elsif setting.key?(:only)
    !Array(setting[:only]).include?(key)
  else
    false
  end
end
370
+
371
+ # raise exception if none is found
372
# Guesses the column separator by counting candidate characters in up to the
# first five lines of the input, then rewinds the input.
#
# filehandle - must respond to #readline and #rewind
# options    - reads :row_sep
#
# Returns the most frequent candidate (the earliest in the candidate list
# wins a tie).
# Raises SmarterCSV::NoColSepDetected when no candidate occurs at all,
# including when the input is empty.
#
# Limitation: candidates inside quoted fields are counted as well.
def guess_column_separator(filehandle, options)
  candidates = [',', "\t", ';', ':', '|']
  counts = Hash.new(0)

  5.times do
    line = filehandle.readline(options[:row_sep])
    candidates.each { |sep| counts[sep] += line.count(sep) }
  rescue EOFError # short files
    break
  end

  filehandle.rewind

  # counts stays empty when not a single line could be read, so max is nil
  best = counts.values.max
  raise SmarterCSV::NoColSepDetected if best.nil? || best.zero?

  counts.key(best)
end
390
+
391
+ # examines at most options[:auto_row_sep_chars] unquoted characters (the whole input if that option is nil or 0) before making a decision
392
# Guesses the row separator ("\n", "\r" or "\r\n") by tallying the line
# endings found outside of quoted sections, then rewinds the input.
#
# filehandle - must respond to #each_char and #rewind
# options    - reads :quote_char and :auto_row_sep_chars; when the latter is
#              a positive number, scanning stops after that many unquoted
#              characters have been examined
#
# Returns the separator with the highest tally ("\n" when nothing was found).
def guess_line_ending(filehandle, options)
  tally = { "\n" => 0, "\r" => 0, "\r\n" => 0 }
  inside_quotes = false
  previous = nil
  examined = 0 # characters looked at so far, not lines
  limit = options[:auto_row_sep_chars]

  filehandle.each_char do |char|
    inside_quotes = !inside_quotes if char == options[:quote_char]
    next if inside_quotes # line endings within quotes belong to the data

    if previous == "\r"
      # a "\r" is only classified once we know what follows it
      tally[char == "\n" ? "\r\n" : "\r"] += 1
    elsif char == "\n"
      tally["\n"] += 1
    end
    previous = char
    examined += 1
    break if limit && limit > 0 && examined >= limit
  end
  filehandle.rewind

  # a "\r" as the very last examined character has not been counted yet
  tally["\r"] += 1 if previous == "\r"

  # most frequent ending; the first entry wins when counts are equal
  tally.max_by { |_ending, hits| hits }.first
end
424
+
425
# Determines the hash keys used for every CSV row.
#
# When options[:headers_in_file] is set, the next line of filehandle is read
# as the header line and normalized (comment prefix, row separator, quote
# characters, whitespace, spaces/dashes to underscores, downcasing, all
# subject to the corresponding options). A non-empty Array in
# options[:user_provided_headers] replaces the file headers; a copy is used,
# so the caller's array is never modified.
#
# Side effects: sets @raw_header (the header line as read) and @headers.
#
# Returns [headers, header_size]; header_size is taken before key mapping.
# Raises SmarterCSV::IncorrectOption    when no headers are available,
#        SmarterCSV::HeaderSizeMismatch when user-provided and file header counts differ,
#        SmarterCSV::DuplicateHeaders   when file-derived headers contain duplicates,
#        SmarterCSV::MissingHeaders     when an entry of :required_headers is absent.
def process_headers(filehandle, options)
  @raw_header = nil
  @headers = nil
  file_headerA = nil
  file_header_size = nil

  if options[:headers_in_file] # extract the header line
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header = readline_with_counts(filehandle, options)
    @raw_header = header

    header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
    header = header.chomp(options[:row_sep])
    header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

    file_headerA, file_header_size = parse(header, options)

    # remove quote characters literally; building a Regexp from quote_char
    # would interpret it as a pattern
    quote_char = options[:quote_char].to_s
    file_headerA.map! { |x| x.gsub(quote_char, '') } unless quote_char.empty?
    file_headerA.map!(&:strip) if options[:strip_whitespace]
    unless options[:keep_original_headers]
      file_headerA.map! { |x| x.gsub(/\s+|-+/, '_') }
      file_headerA.map!(&:downcase) if options[:downcase_header]
    end
  else
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
  end

  user_headers = options[:user_provided_headers]
  if user_headers.is_a?(Array) && !user_headers.empty?
    # work on a copy: the map! calls below must not modify the caller's array
    headerA = user_headers.dup
    if !file_header_size.nil? && headerA.size != file_header_size
      raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
    end
  else
    headerA = file_headerA
  end

  # neither the file nor the user supplied usable headers (e.g. :user_provided_headers => [])
  if headerA.nil?
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers"
  end

  # detect duplicate headers and disambiguate
  headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
  header_size = headerA.size # used for splitting lines

  headerA.map!(&:to_sym) unless options[:strings_as_keys] || options[:keep_original_headers]

  unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
    key_mappingH = options[:key_mapping]

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    if key_mappingH.is_a?(Hash) && !key_mappingH.empty?
      # we can't map keys that are not there
      missing_keys = key_mappingH.keys - headerA
      puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?

      headerA.map! do |x|
        if key_mappingH.key?(x)
          key_mappingH[x]
        else
          options[:remove_unmapped_keys] ? nil : x
        end
      end
    end
  end

  # header_validations
  duplicate_headers = headerA.compact.select { |k| headerA.count(k) > 1 }

  unless options[:user_provided_headers] || duplicate_headers.empty?
    raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
  end

  if options[:required_headers].is_a?(Array)
    missing_headers = options[:required_headers].reject { |k| headerA.include?(k) }
    raise SmarterCSV::MissingHeaders, "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
  end

  @headers = headerA
  [headerA, header_size]
end
506
+
507
# Makes repeated header names unique.
#
# The first occurrence of a name is kept as is; every later occurrence gets
# options[:duplicate_header_suffix] and its running occurrence number
# appended (e.g. "name", "name_2", "name_3" for the suffix "_").
#
# Returns a new Array; the given headers array is not modified.
def process_duplicate_headers(headers, options)
  seen = Hash.new(0)
  headers.map do |name|
    occurrence = (seen[name] += 1)
    occurrence == 1 ? name : [name, options[:duplicate_header_suffix], occurrence].join
  end
end
520
+ end
521
+ end
data/smarter_csv.gemspec CHANGED
@@ -12,14 +12,29 @@ Gem::Specification.new do |spec|
12
12
  spec.homepage = "https://github.com/tilo/smarter_csv"
13
13
  spec.license = 'MIT'
14
14
 
15
- spec.files = `git ls-files`.split($\)
15
+ spec.metadata["homepage_uri"] = spec.homepage
16
+ spec.metadata["source_code_uri"] = spec.homepage
17
+ spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
18
+
19
+ # Specify which files should be added to the gem when it is released.
20
+ # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
21
+ spec.files = Dir.chdir(__dir__) do
22
+ `git ls-files -z`.split("\x0").reject do |f|
23
+ (f == __FILE__) ||
24
+ f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}) || f.match(/\.h\z/)
25
+ end
26
+ end
27
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
28
+
16
29
  spec.executables = spec.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
17
- spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
18
- spec.require_paths = ["lib"]
30
+ spec.require_paths = ["lib"] # add ext here?
31
+ spec.extensions = ["ext/smarter_csv/extconf.rb"]
32
+
33
+
34
+ spec.add_development_dependency "awesome_print"
35
+ spec.add_development_dependency "codecov"
36
+ spec.add_development_dependency "pry"
19
37
  spec.add_development_dependency "rspec"
38
+ spec.add_development_dependency "rubocop"
20
39
  spec.add_development_dependency "simplecov"
21
- spec.add_development_dependency "awesome_print"
22
- # spec.add_development_dependency "guard-rspec"
23
-
24
- spec.metadata["homepage_uri"] = spec.homepage
25
40
  end