smarter_csv 1.9.2 → 1.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +23 -0
- data/README.md +29 -8
- data/lib/smarter_csv/auto_detection.rb +73 -0
- data/lib/smarter_csv/file_io.rb +50 -0
- data/lib/smarter_csv/hash_transformations.rb +91 -0
- data/lib/smarter_csv/header_transformations.rb +63 -0
- data/lib/smarter_csv/header_validations.rb +34 -0
- data/lib/smarter_csv/headers.rb +68 -0
- data/lib/smarter_csv/options_processing.rb +10 -1
- data/lib/smarter_csv/parse.rb +90 -0
- data/lib/smarter_csv/smarter_csv.rb +79 -416
- data/lib/smarter_csv/variables.rb +30 -0
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +16 -3
- metadata +11 -4
- data/lib/core_ext/hash.rb +0 -9
@@ -12,65 +12,81 @@ module SmarterCSV
|
|
12
12
|
|
13
13
|
# first parameter: filename or input object which responds to readline method
|
14
14
|
def SmarterCSV.process(input, given_options = {}, &block) # rubocop:disable Lint/UnusedMethodArgument
|
15
|
+
initialize_variables
|
16
|
+
|
15
17
|
options = process_options(given_options)
|
16
18
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
@csv_line_count = 0
|
21
|
-
has_rails = !!defined?(Rails)
|
19
|
+
@enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
20
|
+
@verbose = options[:verbose]
|
21
|
+
|
22
22
|
begin
|
23
23
|
fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
|
24
24
|
|
25
|
+
if @enforce_utf8 && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
|
26
|
+
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
|
27
|
+
end
|
28
|
+
|
25
29
|
# auto-detect the row separator
|
26
30
|
options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
|
27
31
|
# attempt to auto-detect column separator
|
28
32
|
options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
|
29
33
|
|
30
|
-
if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
|
31
|
-
puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
|
32
|
-
end
|
33
|
-
|
34
34
|
skip_lines(fh, options)
|
35
35
|
|
36
|
-
|
36
|
+
@headers, header_size = process_headers(fh, options)
|
37
|
+
@headerA = @headers # @headerA is deprecated, use @headers
|
38
|
+
|
39
|
+
puts "Effective headers:\n#{pp(@headers)}\n" if @verbose
|
40
|
+
|
41
|
+
header_validations(@headers, options)
|
37
42
|
|
38
43
|
# in case we use chunking.. we'll need to set it up..
|
39
|
-
if
|
44
|
+
if options[:chunk_size].to_i > 0
|
40
45
|
use_chunks = true
|
41
46
|
chunk_size = options[:chunk_size].to_i
|
42
|
-
chunk_count = 0
|
47
|
+
@chunk_count = 0
|
43
48
|
chunk = []
|
44
49
|
else
|
45
50
|
use_chunks = false
|
46
51
|
end
|
47
52
|
|
48
53
|
# now on to processing all the rest of the lines in the CSV file:
|
54
|
+
# fh.each_line |line|
|
49
55
|
until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
|
50
56
|
line = readline_with_counts(fh, options)
|
51
57
|
|
52
58
|
# replace invalid byte sequence in UTF-8 with question mark to avoid errors
|
53
|
-
line = line
|
59
|
+
line = enforce_utf8_encoding(line, options) if @enforce_utf8
|
54
60
|
|
55
|
-
print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if
|
61
|
+
print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if @verbose
|
56
62
|
|
57
63
|
next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any
|
58
64
|
|
59
65
|
# cater for the quoted csv data containing the row separator carriage return character
|
60
66
|
# in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
|
61
67
|
# by detecting the existence of an uneven number of quote characters
|
68
|
+
multiline = count_quote_chars(line, options[:quote_char]).odd?
|
62
69
|
|
63
|
-
multiline
|
64
|
-
while count_quote_chars(line, options[:quote_char]).odd? # should handle quote_char nil
|
70
|
+
while multiline
|
65
71
|
next_line = fh.readline(options[:row_sep])
|
66
|
-
next_line = next_line
|
72
|
+
next_line = enforce_utf8_encoding(next_line, options) if @enforce_utf8
|
67
73
|
line += next_line
|
68
74
|
@file_line_count += 1
|
75
|
+
|
76
|
+
break if fh.eof? # Exit loop if end of file is reached
|
77
|
+
|
78
|
+
multiline = count_quote_chars(line, options[:quote_char]).odd?
|
69
79
|
end
|
70
|
-
|
80
|
+
|
81
|
+
# :nocov:
|
82
|
+
if multiline && @verbose
|
83
|
+
print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count
|
84
|
+
end
|
85
|
+
# :nocov:
|
71
86
|
|
72
87
|
line.chomp!(options[:row_sep])
|
73
88
|
|
89
|
+
# --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
|
74
90
|
dataA, _data_size = parse(line, options, header_size)
|
75
91
|
|
76
92
|
dataA.map!{|x| x.strip} if options[:strip_whitespace]
|
@@ -78,48 +94,25 @@ module SmarterCSV
|
|
78
94
|
# if all values are blank, then ignore this line
|
79
95
|
next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
|
80
96
|
|
81
|
-
|
97
|
+
# --- HASH TRANSFORMATIONS ------------------------------------------------------------
|
98
|
+
hash = @headers.zip(dataA).to_h
|
82
99
|
|
83
|
-
|
84
|
-
hash.delete(nil)
|
85
|
-
hash.delete('')
|
86
|
-
hash.delete(:"")
|
87
|
-
|
88
|
-
if options[:remove_empty_values] == true
|
89
|
-
hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
|
90
|
-
end
|
100
|
+
hash = hash_transformations(hash, options)
|
91
101
|
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)
|
99
|
-
|
100
|
-
# convert if it's a numeric value:
|
101
|
-
case v
|
102
|
-
when /^[+-]?\d+\.\d+$/
|
103
|
-
hash[k] = v.to_f
|
104
|
-
when /^[+-]?\d+$/
|
105
|
-
hash[k] = v.to_i
|
106
|
-
end
|
107
|
-
end
|
108
|
-
end
|
109
|
-
|
110
|
-
if options[:value_converters]
|
111
|
-
hash.each do |k, v|
|
112
|
-
converter = options[:value_converters][k]
|
113
|
-
next unless converter
|
114
|
-
|
115
|
-
hash[k] = converter.convert(v)
|
116
|
-
end
|
117
|
-
end
|
102
|
+
# --- HASH VALIDATIONS ----------------------------------------------------------------
|
103
|
+
# will go here, and be able to:
|
104
|
+
# - validate correct format of the values for fields
|
105
|
+
# - required fields to be non-empty
|
106
|
+
# - ...
|
107
|
+
# -------------------------------------------------------------------------------------
|
118
108
|
|
119
109
|
next if options[:remove_empty_hashes] && hash.empty?
|
120
110
|
|
111
|
+
puts "CSV Line #{@file_line_count}: #{pp(hash)}" if @verbose == '2' # very verbose setting
|
112
|
+
# optional adding of csv_line_number to the hash to help debugging
|
121
113
|
hash[:csv_line_number] = @csv_line_count if options[:with_line_numbers]
|
122
114
|
|
115
|
+
# process the chunks or the resulting hash
|
123
116
|
if use_chunks
|
124
117
|
chunk << hash # append temp result to chunk
|
125
118
|
|
@@ -128,183 +121,74 @@ module SmarterCSV
|
|
128
121
|
if block_given?
|
129
122
|
yield chunk # do something with the hashes in the chunk in the block
|
130
123
|
else
|
131
|
-
result << chunk #
|
124
|
+
@result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
|
132
125
|
end
|
133
|
-
chunk_count += 1
|
134
|
-
chunk
|
126
|
+
@chunk_count += 1
|
127
|
+
chunk.clear # re-initialize for next chunk of data
|
135
128
|
else
|
136
|
-
|
137
|
-
# the last chunk may contain partial data, which also needs to be returned (BUG / ISSUE-18)
|
138
|
-
|
129
|
+
# the last chunk may contain partial data, which is handled below
|
139
130
|
end
|
140
|
-
|
141
131
|
# while a chunk is being filled up we don't need to do anything else here
|
142
132
|
|
143
133
|
else # no chunk handling
|
144
134
|
if block_given?
|
145
135
|
yield [hash] # do something with the hash in the block (better to use chunking here)
|
146
136
|
else
|
147
|
-
result << hash
|
137
|
+
@result << hash
|
148
138
|
end
|
149
139
|
end
|
150
140
|
end
|
151
141
|
|
152
142
|
# print new line to retain last processing line message
|
153
|
-
print "\n" if
|
143
|
+
print "\n" if @verbose
|
154
144
|
|
155
|
-
# last chunk:
|
145
|
+
# handling of last chunk:
|
156
146
|
if !chunk.nil? && chunk.size > 0
|
157
147
|
# do something with the chunk
|
158
148
|
if block_given?
|
159
149
|
yield chunk # do something with the hashes in the chunk in the block
|
160
150
|
else
|
161
|
-
result << chunk #
|
151
|
+
@result << chunk.dup # Append chunk to result (use .dup to keep a copy after we do chunk.clear)
|
162
152
|
end
|
163
|
-
chunk_count += 1
|
153
|
+
@chunk_count += 1
|
164
154
|
# chunk = [] # initialize for next chunk of data
|
165
155
|
end
|
166
156
|
ensure
|
167
157
|
fh.close if fh.respond_to?(:close)
|
168
158
|
end
|
159
|
+
|
169
160
|
if block_given?
|
170
|
-
chunk_count # when we do processing through a block we only care how many chunks we processed
|
161
|
+
@chunk_count # when we do processing through a block we only care how many chunks we processed
|
171
162
|
else
|
172
|
-
result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
163
|
+
@result # returns either an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
|
173
164
|
end
|
174
165
|
end
|
175
166
|
|
176
167
|
class << self
|
177
|
-
def has_acceleration?
|
178
|
-
@has_acceleration ||= !!defined?(parse_csv_line_c)
|
179
|
-
end
|
180
|
-
|
181
|
-
def raw_header
|
182
|
-
@raw_header
|
183
|
-
end
|
184
|
-
|
185
|
-
def headers
|
186
|
-
@headers
|
187
|
-
end
|
188
|
-
|
189
|
-
# * the `scan` method iterates through the string and finds all occurrences of the pattern
|
190
|
-
# * The reqular expression:
|
191
|
-
# - (?<!\\) : Negative lookbehind to ensure the quote character is not preceded by an unescaped backslash.
|
192
|
-
# - (?:\\\\)* : Non-capturing group for an even number of backslashes (escaped backslashes).
|
193
|
-
# This allows for any number of escaped backslashes before the quote character.
|
194
|
-
# - #{Regexp.escape(quote_char)} : Dynamically inserts the quote_char into the regex,
|
195
|
-
# ensuring it's properly escaped for use in the regex.
|
196
|
-
#
|
197
168
|
def count_quote_chars(line, quote_char)
|
198
|
-
line.
|
199
|
-
end
|
200
|
-
|
201
|
-
protected
|
202
|
-
|
203
|
-
def readline_with_counts(filehandle, options)
|
204
|
-
line = filehandle.readline(options[:row_sep])
|
205
|
-
@file_line_count += 1
|
206
|
-
@csv_line_count += 1
|
207
|
-
line = remove_bom(line) if @csv_line_count == 1
|
208
|
-
line
|
209
|
-
end
|
169
|
+
return 0 if line.nil? || quote_char.nil? || quote_char.empty?
|
210
170
|
|
211
|
-
|
212
|
-
|
171
|
+
count = 0
|
172
|
+
escaped = false
|
213
173
|
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
end
|
218
|
-
|
219
|
-
def rewind(filehandle)
|
220
|
-
@file_line_count = 0
|
221
|
-
@csv_line_count = 0
|
222
|
-
filehandle.rewind
|
223
|
-
end
|
224
|
-
|
225
|
-
###
|
226
|
-
### Thin wrapper around C-extension
|
227
|
-
###
|
228
|
-
def parse(line, options, header_size = nil)
|
229
|
-
# puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
|
230
|
-
|
231
|
-
if options[:acceleration] && has_acceleration?
|
232
|
-
# :nocov:
|
233
|
-
has_quotes = line =~ /#{options[:quote_char]}/
|
234
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
|
235
|
-
elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
|
236
|
-
[elements, elements.size]
|
237
|
-
# :nocov:
|
238
|
-
else
|
239
|
-
# puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
|
240
|
-
parse_csv_line_ruby(line, options, header_size)
|
241
|
-
end
|
242
|
-
end
|
243
|
-
|
244
|
-
# ------------------------------------------------------------------
|
245
|
-
# Ruby equivalent of the C-extension for parse_line
|
246
|
-
#
|
247
|
-
# parses a single line: either a CSV header and body line
|
248
|
-
# - quoting rules compared to RFC-4180 are somewhat relaxed
|
249
|
-
# - we are not assuming that quotes inside a fields need to be doubled
|
250
|
-
# - we are not assuming that all fields need to be quoted (0 is even)
|
251
|
-
# - works with multi-char col_sep
|
252
|
-
# - if header_size is given, only up to header_size fields are parsed
|
253
|
-
#
|
254
|
-
# We use header_size for parsing the body lines to make sure we always match the number of headers
|
255
|
-
# in case there are trailing col_sep characters in line
|
256
|
-
#
|
257
|
-
# Our convention is that empty fields are returned as empty strings, not as nil.
|
258
|
-
#
|
259
|
-
#
|
260
|
-
# the purpose of the max_size parameter is to handle a corner case where
|
261
|
-
# CSV lines contain more fields than the header.
|
262
|
-
# In which case the remaining fields in the line are ignored
|
263
|
-
#
|
264
|
-
def parse_csv_line_ruby(line, options, header_size = nil)
|
265
|
-
return [] if line.nil?
|
266
|
-
|
267
|
-
line_size = line.size
|
268
|
-
col_sep = options[:col_sep]
|
269
|
-
col_sep_size = col_sep.size
|
270
|
-
quote = options[:quote_char]
|
271
|
-
quote_count = 0
|
272
|
-
elements = []
|
273
|
-
start = 0
|
274
|
-
i = 0
|
275
|
-
|
276
|
-
previous_char = ''
|
277
|
-
while i < line_size
|
278
|
-
if line[i...i+col_sep_size] == col_sep && quote_count.even?
|
279
|
-
break if !header_size.nil? && elements.size >= header_size
|
280
|
-
|
281
|
-
elements << cleanup_quotes(line[start...i], quote)
|
282
|
-
previous_char = line[i]
|
283
|
-
i += col_sep.size
|
284
|
-
start = i
|
174
|
+
line.each_char do |char|
|
175
|
+
if char == '\\' && !escaped
|
176
|
+
escaped = true
|
285
177
|
else
|
286
|
-
|
287
|
-
|
288
|
-
i += 1
|
178
|
+
count += 1 if char == quote_char && !escaped
|
179
|
+
escaped = false
|
289
180
|
end
|
290
181
|
end
|
291
|
-
elements << cleanup_quotes(line[start..-1], quote) if header_size.nil? || elements.size < header_size
|
292
|
-
[elements, elements.size]
|
293
|
-
end
|
294
|
-
|
295
|
-
def cleanup_quotes(field, quote)
|
296
|
-
return field if field.nil?
|
297
182
|
|
298
|
-
|
183
|
+
count
|
184
|
+
end
|
299
185
|
|
300
|
-
|
301
|
-
|
302
|
-
field.delete_suffix!(quote)
|
303
|
-
end
|
304
|
-
field.gsub!("#{quote}#{quote}", quote)
|
305
|
-
field
|
186
|
+
def has_acceleration?
|
187
|
+
@has_acceleration ||= !!defined?(parse_csv_line_c)
|
306
188
|
end
|
307
189
|
|
190
|
+
protected
|
191
|
+
|
308
192
|
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
|
309
193
|
# and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
|
310
194
|
BLANK_RE = /\A\s*\z/.freeze
|
@@ -312,245 +196,24 @@ module SmarterCSV
|
|
312
196
|
def blank?(value)
|
313
197
|
case value
|
314
198
|
when String
|
315
|
-
|
316
|
-
|
199
|
+
BLANK_RE.match?(value)
|
317
200
|
when NilClass
|
318
201
|
true
|
319
|
-
|
320
202
|
when Array
|
321
|
-
value.
|
322
|
-
|
203
|
+
value.all? { |elem| blank?(elem) }
|
323
204
|
when Hash
|
324
|
-
value.
|
325
|
-
|
205
|
+
value.values.all? { |elem| blank?(elem) } # Focus on values only
|
326
206
|
else
|
327
207
|
false
|
328
208
|
end
|
329
209
|
end
|
330
210
|
|
331
|
-
def elem_blank?(value)
|
332
|
-
case value
|
333
|
-
when String
|
334
|
-
value.empty? || BLANK_RE.match?(value)
|
335
|
-
|
336
|
-
when NilClass
|
337
|
-
true
|
338
|
-
|
339
|
-
else
|
340
|
-
false
|
341
|
-
end
|
342
|
-
end
|
343
|
-
|
344
|
-
# acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
|
345
|
-
def only_or_except_limit_execution(options, option_name, key)
|
346
|
-
if options[option_name].is_a?(Hash)
|
347
|
-
if options[option_name].has_key?(:except)
|
348
|
-
return true if Array(options[option_name][:except]).include?(key)
|
349
|
-
elsif options[option_name].has_key?(:only)
|
350
|
-
return true unless Array(options[option_name][:only]).include?(key)
|
351
|
-
end
|
352
|
-
end
|
353
|
-
false
|
354
|
-
end
|
355
|
-
|
356
|
-
# If file has headers, then guesses column separator from headers.
|
357
|
-
# Otherwise guesses column separator from contents.
|
358
|
-
# Raises exception if none is found.
|
359
|
-
def guess_column_separator(filehandle, options)
|
360
|
-
skip_lines(filehandle, options)
|
361
|
-
|
362
|
-
delimiters = [',', "\t", ';', ':', '|']
|
363
|
-
|
364
|
-
line = nil
|
365
|
-
has_header = options[:headers_in_file]
|
366
|
-
candidates = Hash.new(0)
|
367
|
-
count = has_header ? 1 : 5
|
368
|
-
count.times do
|
369
|
-
line = readline_with_counts(filehandle, options)
|
370
|
-
delimiters.each do |d|
|
371
|
-
candidates[d] += line.scan(d).count
|
372
|
-
end
|
373
|
-
rescue EOFError # short files
|
374
|
-
break
|
375
|
-
end
|
376
|
-
rewind(filehandle)
|
377
|
-
|
378
|
-
if candidates.values.max == 0
|
379
|
-
# if the header only contains
|
380
|
-
return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
|
381
|
-
|
382
|
-
raise SmarterCSV::NoColSepDetected
|
383
|
-
end
|
384
|
-
|
385
|
-
candidates.key(candidates.values.max)
|
386
|
-
end
|
387
|
-
|
388
|
-
# limitation: this currently reads the whole file in before making a decision
|
389
|
-
def guess_line_ending(filehandle, options)
|
390
|
-
counts = {"\n" => 0, "\r" => 0, "\r\n" => 0}
|
391
|
-
quoted_char = false
|
392
|
-
|
393
|
-
# count how many of the pre-defined line-endings we find
|
394
|
-
# ignoring those contained within quote characters
|
395
|
-
last_char = nil
|
396
|
-
lines = 0
|
397
|
-
filehandle.each_char do |c|
|
398
|
-
quoted_char = !quoted_char if c == options[:quote_char]
|
399
|
-
next if quoted_char
|
400
|
-
|
401
|
-
if last_char == "\r"
|
402
|
-
if c == "\n"
|
403
|
-
counts["\r\n"] += 1
|
404
|
-
else
|
405
|
-
counts["\r"] += 1 # \r are counted after they appeared
|
406
|
-
end
|
407
|
-
elsif c == "\n"
|
408
|
-
counts["\n"] += 1
|
409
|
-
end
|
410
|
-
last_char = c
|
411
|
-
lines += 1
|
412
|
-
break if options[:auto_row_sep_chars] && options[:auto_row_sep_chars] > 0 && lines >= options[:auto_row_sep_chars]
|
413
|
-
end
|
414
|
-
rewind(filehandle)
|
415
|
-
|
416
|
-
counts["\r"] += 1 if last_char == "\r"
|
417
|
-
# find the most frequent key/value pair:
|
418
|
-
most_frequent_key, _count = counts.max_by{|_, v| v}
|
419
|
-
most_frequent_key
|
420
|
-
end
|
421
|
-
|
422
|
-
def process_headers(filehandle, options)
|
423
|
-
@raw_header = nil
|
424
|
-
@headers = nil
|
425
|
-
if options[:headers_in_file] # extract the header line
|
426
|
-
# process the header line in the CSV file..
|
427
|
-
# the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
|
428
|
-
header = readline_with_counts(filehandle, options)
|
429
|
-
@raw_header = header
|
430
|
-
|
431
|
-
header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
432
|
-
header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
|
433
|
-
header = header.chomp(options[:row_sep])
|
434
|
-
|
435
|
-
header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]
|
436
|
-
|
437
|
-
file_headerA, file_header_size = parse(header, options)
|
438
|
-
|
439
|
-
file_headerA.map!{|x| x.gsub(%r/#{options[:quote_char]}/, '')}
|
440
|
-
file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
|
441
|
-
|
442
|
-
unless options[:keep_original_headers]
|
443
|
-
file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
|
444
|
-
file_headerA.map!{|x| x.downcase} if options[:downcase_header]
|
445
|
-
end
|
446
|
-
else
|
447
|
-
raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
|
448
|
-
end
|
449
|
-
if options[:user_provided_headers] && options[:user_provided_headers].class == Array && !options[:user_provided_headers].empty?
|
450
|
-
# use user-provided headers
|
451
|
-
headerA = options[:user_provided_headers]
|
452
|
-
if defined?(file_header_size) && !file_header_size.nil?
|
453
|
-
if headerA.size != file_header_size
|
454
|
-
raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
|
455
|
-
else
|
456
|
-
# we could print out the mapping of file_headerA to headerA here
|
457
|
-
end
|
458
|
-
end
|
459
|
-
else
|
460
|
-
headerA = file_headerA
|
461
|
-
end
|
462
|
-
|
463
|
-
# detect duplicate headers and disambiguate
|
464
|
-
headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
|
465
|
-
header_size = headerA.size # used for splitting lines
|
466
|
-
|
467
|
-
headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]
|
468
|
-
|
469
|
-
unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
|
470
|
-
key_mappingH = options[:key_mapping]
|
471
|
-
|
472
|
-
# do some key mapping on the keys in the file header
|
473
|
-
# if you want to completely delete a key, then map it to nil or to ''
|
474
|
-
if !key_mappingH.nil? && key_mappingH.class == Hash && key_mappingH.keys.size > 0
|
475
|
-
# if silence_missing_keys are not set, raise error if missing header
|
476
|
-
missing_keys = key_mappingH.keys - headerA
|
477
|
-
# if the user passes a list of speciffic mapped keys that are optional
|
478
|
-
missing_keys -= options[:silence_missing_keys] if options[:silence_missing_keys].is_a?(Array)
|
479
|
-
|
480
|
-
unless missing_keys.empty? || options[:silence_missing_keys] == true
|
481
|
-
raise SmarterCSV::KeyMappingError, "ERROR: can not map headers: #{missing_keys.join(', ')}"
|
482
|
-
end
|
483
|
-
|
484
|
-
headerA.map!{|x| key_mappingH.has_key?(x) ? (key_mappingH[x].nil? ? nil : key_mappingH[x]) : (options[:remove_unmapped_keys] ? nil : x)}
|
485
|
-
end
|
486
|
-
end
|
487
|
-
|
488
|
-
# header_validations
|
489
|
-
duplicate_headers = []
|
490
|
-
headerA.compact.each do |k|
|
491
|
-
duplicate_headers << k if headerA.select{|x| x == k}.size > 1
|
492
|
-
end
|
493
|
-
|
494
|
-
unless options[:user_provided_headers] || duplicate_headers.empty?
|
495
|
-
raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
|
496
|
-
end
|
497
|
-
|
498
|
-
# deprecate required_headers
|
499
|
-
unless options[:required_headers].nil?
|
500
|
-
puts "DEPRECATION WARNING: please use 'required_keys' instead of 'required_headers'"
|
501
|
-
if options[:required_keys].nil?
|
502
|
-
options[:required_keys] = options[:required_headers]
|
503
|
-
options[:required_headers] = nil
|
504
|
-
end
|
505
|
-
end
|
506
|
-
|
507
|
-
if options[:required_keys] && options[:required_keys].is_a?(Array)
|
508
|
-
missing_keys = []
|
509
|
-
options[:required_keys].each do |k|
|
510
|
-
missing_keys << k unless headerA.include?(k)
|
511
|
-
end
|
512
|
-
raise SmarterCSV::MissingKeys, "ERROR: missing attributes: #{missing_keys.join(',')}" unless missing_keys.empty?
|
513
|
-
end
|
514
|
-
|
515
|
-
@headers = headerA
|
516
|
-
[headerA, header_size]
|
517
|
-
end
|
518
|
-
|
519
|
-
def process_duplicate_headers(headers, options)
|
520
|
-
counts = Hash.new(0)
|
521
|
-
result = []
|
522
|
-
headers.each do |key|
|
523
|
-
counts[key] += 1
|
524
|
-
if counts[key] == 1
|
525
|
-
result << key
|
526
|
-
else
|
527
|
-
result << [key, options[:duplicate_header_suffix], counts[key]].join
|
528
|
-
end
|
529
|
-
end
|
530
|
-
result
|
531
|
-
end
|
532
|
-
|
533
211
|
private
|
534
212
|
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
UTF_16LE_BOM = %w[ff fe].freeze
|
540
|
-
|
541
|
-
def remove_bom(str)
|
542
|
-
str_as_hex = str.bytes.map{|x| x.to_s(16)}
|
543
|
-
# if string does not start with one of the bytes, there is no BOM
|
544
|
-
return str unless %w[ef fe ff 0].include?(str_as_hex[0])
|
545
|
-
|
546
|
-
return str.byteslice(4..-1) if [UTF_32_BOM, UTF_32LE_BOM].include?(str_as_hex[0..3])
|
547
|
-
return str.byteslice(3..-1) if str_as_hex[0..2] == UTF_8_BOM
|
548
|
-
return str.byteslice(2..-1) if [UTF_16_BOM, UTF_16LE_BOM].include?(str_as_hex[0..1])
|
549
|
-
|
550
|
-
# :nocov:
|
551
|
-
puts "SmarterCSV found unhandled BOM! #{str.chars[0..7].inspect}"
|
552
|
-
str
|
553
|
-
# :nocov:
|
213
|
+
def enforce_utf8_encoding(line, options)
|
214
|
+
# return line unless options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
215
|
+
|
216
|
+
line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence])
|
554
217
|
end
|
555
218
|
end
|
556
219
|
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module SmarterCSV
|
4
|
+
class << self
|
5
|
+
attr_reader :has_rails, :csv_line_count, :chunk_count, :errors, :file_line_count, :headers, :raw_header, :result, :warnings
|
6
|
+
|
7
|
+
def initialize_variables
|
8
|
+
@has_rails = !!defined?(Rails)
|
9
|
+
@csv_line_count = 0
|
10
|
+
@chunk_count = 0
|
11
|
+
@errors = {}
|
12
|
+
@file_line_count = 0
|
13
|
+
@headerA = []
|
14
|
+
@headers = nil
|
15
|
+
@raw_header = nil # header as it appears in the file
|
16
|
+
@result = []
|
17
|
+
@warnings = {}
|
18
|
+
@enforce_utf8 = false # only set to true if needed (after options parsing)
|
19
|
+
end
|
20
|
+
|
21
|
+
# :nocov:
|
22
|
+
# rubocop:disable Naming/MethodName
|
23
|
+
def headerA
|
24
|
+
warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
|
25
|
+
@headerA
|
26
|
+
end
|
27
|
+
# rubocop:enable Naming/MethodName
|
28
|
+
# :nocov:
|
29
|
+
end
|
30
|
+
end
|
data/lib/smarter_csv/version.rb
CHANGED