smarter_csv 1.6.1 → 1.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +133 -0
- data/CHANGELOG.md +22 -1
- data/CONTRIBUTORS.md +3 -0
- data/Gemfile +7 -4
- data/README.md +8 -6
- data/Rakefile +15 -13
- data/ext/smarter_csv/extconf.rb +14 -0
- data/ext/smarter_csv/smarter_csv.c +86 -0
- data/lib/extensions/hash.rb +4 -2
- data/lib/smarter_csv/version.rb +3 -1
- data/lib/smarter_csv.rb +519 -10
- data/smarter_csv.gemspec +22 -7
- metadata +54 -176
- data/.gitignore +0 -10
- data/.rspec +0 -2
- data/.travis.yml +0 -27
- data/lib/smarter_csv/smarter_csv.rb +0 -461
- data/spec/fixtures/additional_separator.csv +0 -6
- data/spec/fixtures/basic.csv +0 -8
- data/spec/fixtures/binary.csv +0 -1
- data/spec/fixtures/carriage_returns_n.csv +0 -18
- data/spec/fixtures/carriage_returns_quoted.csv +0 -3
- data/spec/fixtures/carriage_returns_r.csv +0 -1
- data/spec/fixtures/carriage_returns_rn.csv +0 -18
- data/spec/fixtures/chunk_cornercase.csv +0 -10
- data/spec/fixtures/duplicate_headers.csv +0 -3
- data/spec/fixtures/empty.csv +0 -5
- data/spec/fixtures/empty_columns_1.csv +0 -2
- data/spec/fixtures/empty_columns_2.csv +0 -2
- data/spec/fixtures/hard_sample.csv +0 -2
- data/spec/fixtures/ignore_comments.csv +0 -11
- data/spec/fixtures/ignore_comments2.csv +0 -3
- data/spec/fixtures/key_mapping.csv +0 -2
- data/spec/fixtures/line_endings_n.csv +0 -4
- data/spec/fixtures/line_endings_r.csv +0 -1
- data/spec/fixtures/line_endings_rn.csv +0 -4
- data/spec/fixtures/lots_of_columns.csv +0 -2
- data/spec/fixtures/malformed.csv +0 -3
- data/spec/fixtures/malformed_header.csv +0 -3
- data/spec/fixtures/money.csv +0 -3
- data/spec/fixtures/no_header.csv +0 -7
- data/spec/fixtures/numeric.csv +0 -5
- data/spec/fixtures/pets.csv +0 -5
- data/spec/fixtures/problematic.csv +0 -8
- data/spec/fixtures/quote_char.csv +0 -9
- data/spec/fixtures/quoted.csv +0 -5
- data/spec/fixtures/quoted2.csv +0 -4
- data/spec/fixtures/separator_colon.csv +0 -4
- data/spec/fixtures/separator_comma.csv +0 -4
- data/spec/fixtures/separator_pipe.csv +0 -4
- data/spec/fixtures/separator_semi.csv +0 -4
- data/spec/fixtures/separator_tab.csv +0 -4
- data/spec/fixtures/skip_lines.csv +0 -8
- data/spec/fixtures/trading.csv +0 -3
- data/spec/fixtures/user_import.csv +0 -3
- data/spec/fixtures/valid_unicode.csv +0 -5
- data/spec/fixtures/with_dashes.csv +0 -8
- data/spec/fixtures/with_dates.csv +0 -4
- data/spec/smarter_csv/additional_separator_spec.rb +0 -45
- data/spec/smarter_csv/binary_file2_spec.rb +0 -24
- data/spec/smarter_csv/binary_file_spec.rb +0 -22
- data/spec/smarter_csv/blank_spec.rb +0 -55
- data/spec/smarter_csv/carriage_return_spec.rb +0 -190
- data/spec/smarter_csv/chunked_reading_spec.rb +0 -14
- data/spec/smarter_csv/close_file_spec.rb +0 -15
- data/spec/smarter_csv/column_separator_spec.rb +0 -95
- data/spec/smarter_csv/convert_values_to_numeric_spec.rb +0 -48
- data/spec/smarter_csv/duplicate_headers_spec.rb +0 -76
- data/spec/smarter_csv/empty_columns_spec.rb +0 -74
- data/spec/smarter_csv/extenstions_spec.rb +0 -17
- data/spec/smarter_csv/hard_sample_spec.rb +0 -24
- data/spec/smarter_csv/header_transformation_spec.rb +0 -21
- data/spec/smarter_csv/ignore_comments_spec.rb +0 -45
- data/spec/smarter_csv/invalid_headers_spec.rb +0 -38
- data/spec/smarter_csv/keep_headers_spec.rb +0 -24
- data/spec/smarter_csv/key_mapping_spec.rb +0 -56
- data/spec/smarter_csv/line_ending_spec.rb +0 -43
- data/spec/smarter_csv/load_basic_spec.rb +0 -20
- data/spec/smarter_csv/malformed_spec.rb +0 -25
- data/spec/smarter_csv/no_header_spec.rb +0 -29
- data/spec/smarter_csv/not_downcase_header_spec.rb +0 -24
- data/spec/smarter_csv/parse/column_separator_spec.rb +0 -61
- data/spec/smarter_csv/parse/old_csv_library_spec.rb +0 -74
- data/spec/smarter_csv/parse/rfc4180_and_more_spec.rb +0 -170
- data/spec/smarter_csv/problematic.rb +0 -34
- data/spec/smarter_csv/quoted_spec.rb +0 -52
- data/spec/smarter_csv/remove_empty_values_spec.rb +0 -13
- data/spec/smarter_csv/remove_keys_from_hashes_spec.rb +0 -25
- data/spec/smarter_csv/remove_not_mapped_keys_spec.rb +0 -35
- data/spec/smarter_csv/remove_values_matching_spec.rb +0 -26
- data/spec/smarter_csv/remove_zero_values_spec.rb +0 -25
- data/spec/smarter_csv/skip_lines_spec.rb +0 -29
- data/spec/smarter_csv/strings_as_keys_spec.rb +0 -24
- data/spec/smarter_csv/strip_chars_from_headers_spec.rb +0 -24
- data/spec/smarter_csv/trading_spec.rb +0 -25
- data/spec/smarter_csv/valid_unicode_spec.rb +0 -94
- data/spec/smarter_csv/value_converters_spec.rb +0 -52
- data/spec/spec/spec_helper.rb +0 -17
- data/spec/spec.opts +0 -2
- data/spec/spec_helper.rb +0 -21
data/lib/smarter_csv.rb
CHANGED
|
@@ -1,12 +1,521 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "extensions/hash"
|
|
4
|
+
require_relative "smarter_csv/version"
|
|
5
|
+
require_relative "smarter_csv/smarter_csv" unless ENV['CI'] # does not compile/link in CI?
|
|
6
|
+
|
|
7
|
+
module SmarterCSV
|
|
8
|
+
# Base class for all errors raised by SmarterCSV.
class SmarterCSVException < StandardError
end

# Raised by process_headers when :user_provided_headers and the file header differ in size.
class HeaderSizeMismatch < SmarterCSVException
end

# Raised by process_headers when :headers_in_file is false and no :user_provided_headers are given.
class IncorrectOption < SmarterCSVException
end

# Raised by process_headers when the file header contains the same key more than once.
class DuplicateHeaders < SmarterCSVException
end

# Raised by process_headers when one of the :required_headers is absent.
class MissingHeaders < SmarterCSVException
end

# Raised by guess_column_separator when no candidate separator occurs in the sampled lines.
class NoColSepDetected < SmarterCSVException
end

# Declared for key-mapping problems.
class KeyMappingError < SmarterCSVException
end

# Declared for malformed CSV input.
class MalformedCSVError < SmarterCSVException
end
|
|
16
|
+
|
|
17
|
+
# first parameter: filename or input object which responds to readline method
|
|
18
|
+
# Reads CSV data from +input+ and turns every data row into a Hash keyed by the headers.
#
# input   - a filename, or an object that responds to +readline+ (it is used as-is in that case)
# options - Hash of options, merged over default_options
# block   - optional; receives an Array of Hashes: a whole chunk when :chunk_size is set,
#           otherwise a one-element Array per row
#
# Returns, without a block, the collected result: an Array of Hashes, or an Array of
# chunks (Arrays of Hashes) when :chunk_size is set.
# Returns, with a block, the number of chunks processed (nil when :chunk_size is not set).
#
# The input is closed in the ensure clause whenever it responds to +close+, which
# includes IO objects handed in by the caller.
def SmarterCSV.process(input, options = {}, &block)
  options = default_options.merge(options)
  options[:invalid_byte_sequence] = '' if options[:invalid_byte_sequence].nil?
  puts "SmarterCSV OPTIONS: #{options.inspect}" if options[:verbose]

  result = []
  @file_line_count = 0
  @csv_line_count = 0
  has_rails = !!defined?(Rails)
  begin
    fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")

    # auto-detect the row separator
    options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
    # attempt to auto-detect column separator
    options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto

    if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
      puts 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
    end

    # the options are not modified past this point, so this check is loop-invariant
    reencode = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i

    # Integer#times is a no-op for zero or negative counts
    options[:skip_lines].to_i.times { readline_with_counts(fh, options) }

    headerA, header_size = process_headers(fh, options)

    # in case we use chunking.. we'll need to set it up..
    if !options[:chunk_size].nil? && options[:chunk_size].to_i > 0
      use_chunks = true
      chunk_size = options[:chunk_size].to_i
      chunk_count = 0
      chunk = []
    else
      use_chunks = false
    end

    # now on to processing all the rest of the lines in the CSV file:
    until fh.eof? # we can't use fh.readlines() here, because this would read the whole file into memory at once, and eof => true
      line = readline_with_counts(fh, options)

      # replace invalid byte sequence in UTF-8 with question mark to avoid errors
      line = line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if reencode

      print "processing file line %10d, csv line %10d\r" % [@file_line_count, @csv_line_count] if options[:verbose]

      next if options[:comment_regexp] && line =~ options[:comment_regexp] # ignore all comment lines if there are any

      # cater for the quoted csv data containing the row separator carriage return character
      # in which case the row data will be split across multiple lines (see the sample content in spec/fixtures/carriage_returns_rn.csv)
      # by detecting the existence of an uneven number of quote characters
      multiline = line.count(options[:quote_char]).odd? # should handle quote_char nil
      while line.count(options[:quote_char]).odd? # should handle quote_char nil
        next_line = fh.readline(options[:row_sep])
        next_line = next_line.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if reencode
        line += next_line
        @file_line_count += 1
      end
      print "\nline contains uneven number of quote chars so including content through file line %d\n" % @file_line_count if options[:verbose] && multiline

      line.chomp!(options[:row_sep])

      dataA, _data_size = parse(line, options, header_size)

      dataA.map!{|x| x.strip} if options[:strip_whitespace]

      # if all values are blank, then ignore this line
      next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

      hash = Hash.zip(headerA, dataA) # from Facets of Ruby library

      # make sure we delete any key/value pairs from the hash, which the user wanted to delete
      # (headers mapped to nil, '' or the empty symbol):
      hash.delete(nil)
      hash.delete('')
      hash.delete(:"")

      if options[:remove_empty_values] == true
        hash.delete_if{|_k, v| has_rails ? v.blank? : blank?(v)}
      end

      # \A / \z anchor the whole value; ^ / $ would also match a single line inside a multi-line value
      hash.delete_if{|_k, v| !v.nil? && v =~ /\A(\d+|\d+\.\d+)\z/ && v.to_f == 0} if options[:remove_zero_values] # values are typically Strings!
      hash.delete_if{|_k, v| v =~ options[:remove_values_matching]} if options[:remove_values_matching]

      if options[:convert_values_to_numeric]
        hash.each do |k, v|
          # deal with the :only / :except options to :convert_values_to_numeric
          next if only_or_except_limit_execution(options, :convert_values_to_numeric, k)

          # convert only if the whole value is numeric:
          case v
          when /\A[+-]?\d+\.\d+\z/
            hash[k] = v.to_f
          when /\A[+-]?\d+\z/
            hash[k] = v.to_i
          end
        end
      end

      if options[:value_converters]
        hash.each do |k, v|
          converter = options[:value_converters][k]
          next unless converter

          hash[k] = converter.convert(v)
        end
      end

      next if options[:remove_empty_hashes] && hash.empty?

      if use_chunks
        chunk << hash # append temp result to chunk

        # while a chunk is being filled up we don't need to do anything else here
        if chunk.size >= chunk_size || fh.eof? # if chunk is full, or EOF reached
          if block_given?
            yield chunk # do something with the hashes in the chunk in the block
          else
            result << chunk # not sure yet, why anybody would want to do this without a block
          end
          chunk_count += 1
          chunk = [] # initialize for next chunk of data
        end
      elsif block_given? # no chunk handling
        yield [hash] # do something with the hash in the block (better to use chunking here)
      else
        result << hash
      end
    end

    # print new line to retain last processing line message
    print "\n" if options[:verbose]

    # last chunk: it may contain partial data which also needs to be handed out (BUG / ISSUE-18);
    # this happens when the final lines of the input were skipped above
    if !chunk.nil? && chunk.size > 0
      if block_given?
        yield chunk # do something with the hashes in the chunk in the block
      else
        result << chunk # not sure yet, why anybody would want to do this without a block
      end
      chunk_count += 1
    end
  ensure
    fh.close if fh.respond_to?(:close)
  end

  # when we do processing through a block we only care how many chunks we processed;
  # otherwise return an Array of Hashes, or an Array of Arrays of Hashes (if in chunked mode)
  block_given? ? chunk_count : result
end
|
|
7
|
-
end
|
|
8
184
|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
185
|
+
class << self
|
|
186
|
+
# Reports whether the method parse_csv_line_c (provided by the C extension) is
# defined on this object.
#
# The answer is computed on first use and cached in @has_acceleration. A cached
# +false+ is honoured as well (a plain `||=` would re-evaluate it on every call);
# resetting @has_acceleration to nil forces a fresh check.
#
# Returns true or false.
def has_acceleration?
  @has_acceleration = !!defined?(parse_csv_line_c) if @has_acceleration.nil?
  @has_acceleration
end
|
|
189
|
+
|
|
190
|
+
# Returns the header line exactly as process_headers read it from the input,
# or nil when no header line has been read.
def raw_header
  @raw_header
end
|
|
193
|
+
|
|
194
|
+
# Returns the final list of header keys stored by process_headers,
# or nil when no headers have been processed.
def headers
  @headers
end
|
|
197
|
+
|
|
198
|
+
protected
|
|
199
|
+
|
|
200
|
+
# NOTE: this is not called when "parse" methods are tested by themselves
|
|
201
|
+
# Builds a fresh Hash with the default value of every option; SmarterCSV.process
# merges the caller's options over it.
#
# Returns a new Hash on every call, so callers may modify it freely.
def default_options
  {
    acceleration: true,
    auto_row_sep_chars: 500,
    chunk_size: nil,
    col_sep: ',',
    comment_regexp: nil, # was: /\A#/,
    convert_values_to_numeric: true,
    downcase_header: true,
    duplicate_header_suffix: nil,
    file_encoding: 'utf-8',
    force_simple_split: false,
    force_utf8: false,
    headers_in_file: true,
    invalid_byte_sequence: '',
    keep_original_headers: false,
    key_mapping: nil, # this is the key process_headers actually reads
    key_mapping_hash: nil, # kept for backward compatibility; not read by the code in this file
    quote_char: '"',
    remove_empty_hashes: true,
    remove_empty_values: true,
    remove_unmapped_keys: false,
    remove_values_matching: nil,
    remove_zero_values: false,
    required_headers: nil,
    row_sep: $/,
    skip_lines: nil,
    strings_as_keys: false,
    strip_chars_from_headers: nil,
    strip_whitespace: true,
    user_provided_headers: nil,
    value_converters: nil,
    verbose: false,
  }
end
|
|
235
|
+
|
|
236
|
+
# Reads the next line (terminated by options[:row_sep]) from filehandle and
# bumps both @file_line_count and @csv_line_count by one.
#
# The counters are only touched after the read succeeded; an exception raised
# by readline (e.g. EOFError) propagates unchanged.
#
# Returns the line that was read.
def readline_with_counts(filehandle, options)
  filehandle.readline(options[:row_sep]).tap do
    @file_line_count += 1
    @csv_line_count += 1
  end
end
|
|
242
|
+
|
|
243
|
+
###
|
|
244
|
+
### Thin wrapper around C-extension
|
|
245
|
+
###
|
|
246
|
+
# Splits one CSV line into its fields, using the C extension when it is enabled
# via options[:acceleration] and available, and parse_csv_line_ruby otherwise.
#
# line        - the line to split (without the row separator)
# options     - Hash; :acceleration, :col_sep and :quote_char are read here
# header_size - optional Integer, handed through to the parser
#
# Returns what the chosen parser returns; for the accelerated path that is
# [elements, elements.size].
def parse(line, options, header_size = nil)
  return parse_csv_line_ruby(line, options, header_size) unless options[:acceleration] && has_acceleration?

  quote = options[:quote_char]
  # plain substring test: interpolating quote_char into a Regexp breaks for
  # characters such as "*" or "|" that have a meaning in regular expressions
  has_quotes = !line.nil? && !quote.nil? && line.include?(quote)
  elements = parse_csv_line_c(line, options[:col_sep], quote, header_size)
  elements.map!{|x| cleanup_quotes(x, quote)} if has_quotes
  [elements, elements.size]
end
|
|
261
|
+
|
|
262
|
+
# ------------------------------------------------------------------
|
|
263
|
+
# Ruby equivalent of the C-extension for parse_line
|
|
264
|
+
#
|
|
265
|
+
# parses a single line: either a CSV header or a body line
|
|
266
|
+
# - quoting rules compared to RFC-4180 are somewhat relaxed
|
|
267
|
+
# - we are not assuming that quotes inside a field need to be doubled
|
|
268
|
+
# - we are not assuming that all fields need to be quoted (0 is even)
|
|
269
|
+
# - works with multi-char col_sep
|
|
270
|
+
# - if header_size is given, only up to header_size fields are parsed
|
|
271
|
+
#
|
|
272
|
+
# We use header_size for parsing the body lines to make sure we always match the number of headers
|
|
273
|
+
# in case there are trailing col_sep characters in line
|
|
274
|
+
#
|
|
275
|
+
# Our convention is that empty fields are returned as empty strings, not as nil.
|
|
276
|
+
#
|
|
277
|
+
#
|
|
278
|
+
# the purpose of the header_size parameter is to handle a corner case where
|
|
279
|
+
# CSV lines contain more fields than the header.
|
|
280
|
+
# In which case the remaining fields in the line are ignored
|
|
281
|
+
#
|
|
282
|
+
# Pure-Ruby line splitter.
#
# line        - String to split; nil yields an empty Array
# options     - Hash; :col_sep (may be longer than one character) and :quote_char are read
# header_size - optional Integer; when given, at most that many fields are collected
#               and the rest of the line is ignored
#
# A separator only ends a field while the number of quote characters seen so far
# is even, i.e. outside of quotes. Every field is passed through cleanup_quotes.
#
# Returns [fields, fields.size].
# NOTE(review): for a nil line a bare [] is returned instead of that pair - confirm callers expect this.
def parse_csv_line_ruby(line, options, header_size = nil)
  return [] if line.nil?

  separator = options[:col_sep]
  sep_len = separator.size
  quote_char = options[:quote_char]
  capped = !header_size.nil?
  total = line.size

  fields = []
  quotes_seen = 0
  field_start = 0
  pos = 0

  until pos >= total
    at_separator = quotes_seen.even? && line[pos, sep_len] == separator

    unless at_separator
      quotes_seen += 1 if line[pos] == quote_char
      pos += 1
      next
    end

    # enough fields collected: ignore whatever follows
    break if capped && fields.size >= header_size

    fields << cleanup_quotes(line[field_start...pos], quote_char)
    pos += sep_len
    field_start = pos
  end

  # the remainder after the last separator is the final field
  fields << cleanup_quotes(line[field_start..-1], quote_char) unless capped && fields.size >= header_size
  [fields, fields.size]
end
|
|
309
|
+
|
|
310
|
+
# Removes the quoting from a single CSV field.
#
# field - String or nil; an unfrozen String is modified in place
# quote - the quote character; nil or '' means "no quoting", the field is returned untouched
#
# A field that both starts and ends with the quote character loses that
# surrounding pair; afterwards every doubled quote is collapsed to a single one.
#
# Returns the cleaned field (nil for a nil field). A frozen field is duplicated
# first, so the caller's object is never the target of a failing in-place edit.
def cleanup_quotes(field, quote)
  return field if field.nil? || quote.nil? || quote == ''

  field = field.dup if field.frozen?

  if field.start_with?(quote) && field.end_with?(quote)
    field.delete_prefix!(quote)
    field.delete_suffix!(quote)
  end
  field.gsub!("#{quote}#{quote}", quote)
  field
end
|
|
322
|
+
|
|
323
|
+
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
|
|
324
|
+
# and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
|
|
325
|
+
# Matches a String that is empty or consists of whitespace (\s) only,
# anchored to the whole String via \A and \z.
BLANK_RE = /\A\s*\z/.freeze
|
|
326
|
+
|
|
327
|
+
# Tells whether value carries no content.
#
# - nil is blank
# - a String is blank when it is empty or matches BLANK_RE
# - an Array is blank when it is empty or every element is blank per elem_blank?
# - a Hash is blank when it is empty or every value is blank per elem_blank?
# - anything else is not blank
#
# Returns true or false.
def blank?(value)
  case value
  when NilClass then true
  when String then value.empty? || BLANK_RE.match?(value)
  when Array then value.all? { |element| elem_blank?(element) } # all? is true for an empty Array
  when Hash then value.values.all? { |element| elem_blank?(element) }
  else false
  end
end
|
|
345
|
+
|
|
346
|
+
# Blank check for a single element of an Array or Hash: nil and empty or
# whitespace-only Strings (per BLANK_RE) are blank, every other object is not.
#
# Returns true or false.
def elem_blank?(value)
  return true if value.nil?
  return false unless value.is_a?(String)

  value.empty? || BLANK_RE.match?(value)
end
|
|
358
|
+
|
|
359
|
+
# acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
|
|
360
|
+
# Decides whether processing of +key+ must be skipped for the option +option_name+.
#
# options     - the options Hash
# option_name - name of the option to inspect
# key         - the key currently being processed
#
# Only a Hash-valued option can restrict anything:
# - with :except, the listed key(s) are skipped (an :only entry is then not consulted)
# - otherwise, with :only, every key that is not listed is skipped
#
# Returns true when the key must be skipped, false otherwise.
def only_or_except_limit_execution(options, option_name, key)
  setting = options[option_name]
  return false unless setting.is_a?(Hash)

  return Array(setting[:except]).include?(key) if setting.has_key?(:except)
  return !Array(setting[:only]).include?(key) if setting.has_key?(:only)

  false
end
|
|
370
|
+
|
|
371
|
+
# raise exception if none is found
|
|
372
|
+
# Guesses the column separator by counting how often each candidate
# (",", TAB, ";", ":", "|") occurs in up to the first five lines of the input.
#
# filehandle - IO-like object responding to readline and rewind; it is rewound afterwards
# options    - Hash; :row_sep is used to read the lines
#
# Returns the most frequent candidate; on a tie the one listed first wins.
# Raises SmarterCSV::NoColSepDetected when no candidate occurs at all, which
# includes input without any line.
def guess_column_separator(filehandle, options)
  candidates = [',', "\t", ';', ':', '|']
  counts = Hash.new(0)

  5.times do
    line = filehandle.readline(options[:row_sep])
    candidates.each { |sep| counts[sep] += line.scan(sep).size }
  rescue EOFError # short files
    break
  end

  filehandle.rewind

  # counts is empty (max is nil) when not a single line could be read
  best = counts.values.max
  raise SmarterCSV::NoColSepDetected if best.nil? || best == 0

  counts.key(best)
end
|
|
390
|
+
|
|
391
|
+
# limitation: this examines up to options[:auto_row_sep_chars] characters (the whole input when that option is nil or not positive) before making a decision
|
|
392
|
+
# Guesses the row separator by counting "\n", "\r" and "\r\n" in the input.
#
# filehandle - IO-like object responding to each_char and rewind; it is rewound afterwards
# options    - Hash; :quote_char and :auto_row_sep_chars are read
#
# Characters from an opening quote character up to (not including) the closing
# one are ignored. When :auto_row_sep_chars is a positive number, scanning stops
# once that many non-ignored characters have been examined.
#
# Returns the most frequent line ending; on a tie the first of "\n", "\r", "\r\n".
def guess_line_ending(filehandle, options)
  tally = {"\n" => 0, "\r" => 0, "\r\n" => 0}
  inside_quotes = false
  previous = nil
  examined = 0

  filehandle.each_char do |char|
    inside_quotes = !inside_quotes if char == options[:quote_char]
    next if inside_quotes

    if previous == "\r"
      # a "\r" can only be classified once the character after it is known
      tally[char == "\n" ? "\r\n" : "\r"] += 1
    elsif char == "\n"
      tally["\n"] += 1
    end

    previous = char
    examined += 1
    limit = options[:auto_row_sep_chars]
    break if limit && limit > 0 && examined >= limit
  end
  filehandle.rewind

  # a "\r" that was the very last character examined has not been counted yet
  tally["\r"] += 1 if previous == "\r"

  tally.max_by { |_ending, hits| hits }.first
end
|
|
424
|
+
|
|
425
|
+
# Determines the header keys used for all data rows.
#
# filehandle - the input; one line is consumed when options[:headers_in_file] is set
# options    - the merged options Hash
#
# Steps:
# 1. with :headers_in_file, read and parse the header line (kept verbatim in @raw_header),
#    then strip quote characters and, depending on the options, whitespace;
#    unless :keep_original_headers, whitespace and dashes become "_" and names are downcased
# 2. non-empty :user_provided_headers replace the file header (the sizes must match when both exist)
# 3. apply :duplicate_header_suffix, convert to Symbols, apply :key_mapping
# 4. validate duplicates and :required_headers
#
# Returns [headers, header_size]; header_size is taken before key mapping and is
# used for splitting the data lines. The headers are also stored in @headers.
#
# Raises SmarterCSV::IncorrectOption, HeaderSizeMismatch, DuplicateHeaders or MissingHeaders.
def process_headers(filehandle, options)
  @raw_header = nil
  @headers = nil
  if options[:headers_in_file] # extract the header line
    # process the header line in the CSV file..
    # the first line of a CSV file contains the header .. it might be commented out, so we need to read it anyhow
    header = readline_with_counts(filehandle, options)
    @raw_header = header

    header = header.force_encoding('utf-8').encode('utf-8', invalid: :replace, undef: :replace, replace: options[:invalid_byte_sequence]) if options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
    header = header.sub(options[:comment_regexp], '') if options[:comment_regexp]
    header = header.chomp(options[:row_sep])

    header = header.gsub(options[:strip_chars_from_headers], '') if options[:strip_chars_from_headers]

    file_headerA, file_header_size = parse(header, options)

    # a String pattern is matched literally by gsub, so quote characters which are
    # special in regular expressions (e.g. "*" or "|") are handled correctly
    quote = options[:quote_char]
    file_headerA.map!{|x| x.gsub(quote, '')} unless quote.nil? || quote == ''
    file_headerA.map!{|x| x.strip} if options[:strip_whitespace]
    unless options[:keep_original_headers]
      file_headerA.map!{|x| x.gsub(/\s+|-+/, '_')}
      file_headerA.map!{|x| x.downcase} if options[:downcase_header]
    end
  else
    raise SmarterCSV::IncorrectOption, "ERROR: If :headers_in_file is set to false, you have to provide :user_provided_headers" unless options[:user_provided_headers]
  end

  user_headers = options[:user_provided_headers]
  if user_headers.is_a?(Array) && !user_headers.empty?
    # work on a copy: the in-place conversions below must not alter the caller's Array
    headerA = user_headers.dup
    # file_header_size is nil when no header line was read from the file
    if !file_header_size.nil? && headerA.size != file_header_size
      raise SmarterCSV::HeaderSizeMismatch, "ERROR: :user_provided_headers defines #{headerA.size} headers != CSV-file has #{file_header_size} headers"
    end
  else
    headerA = file_headerA
  end

  # detect duplicate headers and disambiguate
  headerA = process_duplicate_headers(headerA, options) if options[:duplicate_header_suffix]
  header_size = headerA.size # used for splitting lines

  headerA.map!{|x| x.to_sym } unless options[:strings_as_keys] || options[:keep_original_headers]

  unless options[:user_provided_headers] # wouldn't make sense to re-map user provided headers
    key_mappingH = options[:key_mapping]

    # do some key mapping on the keys in the file header
    # if you want to completely delete a key, then map it to nil or to ''
    if key_mappingH.is_a?(Hash) && !key_mappingH.empty?
      # we can't map keys that are not there
      missing_keys = key_mappingH.keys - headerA
      puts "WARNING: missing header(s): #{missing_keys.join(",")}" unless missing_keys.empty?

      headerA.map! do |x|
        if key_mappingH.has_key?(x)
          key_mappingH[x]
        else
          options[:remove_unmapped_keys] ? nil : x
        end
      end
    end
  end

  # header_validations
  # count every header once, then keep each occurrence of the repeated ones
  # (nil entries, i.e. removed columns, are not reported)
  occurrences = Hash.new(0)
  headerA.each { |k| occurrences[k] += 1 }
  duplicate_headers = headerA.compact.select { |k| occurrences[k] > 1 }

  unless options[:user_provided_headers] || duplicate_headers.empty?
    raise SmarterCSV::DuplicateHeaders, "ERROR: duplicate headers: #{duplicate_headers.join(',')}"
  end

  if options[:required_headers].is_a?(Array)
    missing_headers = options[:required_headers].reject { |k| headerA.include?(k) }
    raise SmarterCSV::MissingHeaders, "ERROR: missing headers: #{missing_headers.join(',')}" unless missing_headers.empty?
  end

  @headers = headerA
  [headerA, header_size]
end
|
|
506
|
+
|
|
507
|
+
# Makes repeated header names unique.
#
# headers - Array of header names
# options - Hash; :duplicate_header_suffix is placed between a name and its counter
#
# The first occurrence of a name is kept as it is; every later occurrence becomes
# name + suffix + running number, starting at 2.
#
# Returns a new Array; headers itself is not modified.
def process_duplicate_headers(headers, options)
  seen = Hash.new(0)

  headers.map do |name|
    occurrence = (seen[name] += 1)
    next name if occurrence == 1

    [name, options[:duplicate_header_suffix], occurrence].join
  end
end
|
|
520
|
+
end
|
|
521
|
+
end
|
data/smarter_csv.gemspec
CHANGED
|
@@ -12,14 +12,29 @@ Gem::Specification.new do |spec|
|
|
|
12
12
|
spec.homepage = "https://github.com/tilo/smarter_csv"
|
|
13
13
|
spec.license = 'MIT'
|
|
14
14
|
|
|
15
|
-
spec.
|
|
15
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
|
16
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
|
17
|
+
spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
|
|
18
|
+
|
|
19
|
+
# Specify which files should be added to the gem when it is released.
|
|
20
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
21
|
+
spec.files = Dir.chdir(__dir__) do
|
|
22
|
+
`git ls-files -z`.split("\x0").reject do |f|
|
|
23
|
+
(f == __FILE__) ||
|
|
24
|
+
f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)}) || f.match(/\.h\z/)
|
|
25
|
+
end
|
|
26
|
+
end
|
|
27
|
+
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
|
28
|
+
|
|
16
29
|
spec.executables = spec.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
|
17
|
-
spec.
|
|
18
|
-
spec.
|
|
30
|
+
spec.require_paths = ["lib"] # add ext here?
|
|
31
|
+
spec.extensions = ["ext/smarter_csv/extconf.rb"]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
spec.add_development_dependency "awesome_print"
|
|
35
|
+
spec.add_development_dependency "codecov"
|
|
36
|
+
spec.add_development_dependency "pry"
|
|
19
37
|
spec.add_development_dependency "rspec"
|
|
38
|
+
spec.add_development_dependency "rubocop"
|
|
20
39
|
spec.add_development_dependency "simplecov"
|
|
21
|
-
spec.add_development_dependency "awesome_print"
|
|
22
|
-
# spec.add_development_dependency "guard-rspec"
|
|
23
|
-
|
|
24
|
-
spec.metadata["homepage_uri"] = spec.homepage
|
|
25
40
|
end
|