smarter_csv 1.15.2 → 1.16.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +2 -0
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +112 -1
- data/CONTRIBUTORS.md +4 -1
- data/Gemfile +1 -0
- data/README.md +129 -27
- data/docs/_introduction.md +45 -24
- data/docs/bad_row_quarantine.md +342 -0
- data/docs/basic_read_api.md +152 -9
- data/docs/basic_write_api.md +475 -59
- data/docs/batch_processing.md +162 -4
- data/docs/column_selection.md +184 -0
- data/docs/data_transformations.md +163 -29
- data/docs/examples.md +340 -46
- data/docs/header_transformations.md +94 -12
- data/docs/header_validations.md +57 -18
- data/docs/history.md +119 -0
- data/docs/instrumentation.md +166 -0
- data/docs/migrating_from_csv.md +565 -0
- data/docs/options.md +151 -87
- data/docs/parsing_strategy.md +64 -1
- data/docs/real_world_csv.md +263 -0
- data/docs/releases/1.16.0/benchmarks.md +223 -0
- data/docs/releases/1.16.0/changes.md +273 -0
- data/docs/releases/1.16.0/performance_notes.md +114 -0
- data/docs/row_col_sep.md +15 -5
- data/docs/ruby_csv_pitfalls.md +514 -0
- data/docs/value_converters.md +194 -57
- data/ext/smarter_csv/extconf.rb +3 -0
- data/ext/smarter_csv/smarter_csv.c +1017 -82
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
- data/lib/smarter_csv/errors.rb +8 -0
- data/lib/smarter_csv/file_io.rb +1 -1
- data/lib/smarter_csv/hash_transformations.rb +14 -13
- data/lib/smarter_csv/header_transformations.rb +21 -2
- data/lib/smarter_csv/headers.rb +2 -1
- data/lib/smarter_csv/options.rb +124 -7
- data/lib/smarter_csv/parser.rb +358 -74
- data/lib/smarter_csv/reader.rb +494 -46
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv/writer.rb +71 -19
- data/lib/smarter_csv.rb +134 -13
- data/smarter_csv.gemspec +20 -10
- metadata +38 -80
data/lib/smarter_csv/parser.rb
CHANGED
|
@@ -3,6 +3,16 @@
|
|
|
3
3
|
module SmarterCSV
|
|
4
4
|
module Parser
|
|
5
5
|
EMPTY_STRING = '' # already frozen
|
|
6
|
+
# Optimization #13: byteindex (byte-position search) was added in Ruby 3.2.
|
|
7
|
+
# When available, it lets Opt #10/#12 skip-ahead use byte offsets directly —
|
|
8
|
+
# no conversion from byte position to character position needed.
|
|
9
|
+
#
|
|
10
|
+
# Restricted to MRI Ruby (RUBY_ENGINE == 'ruby'): JRuby and TruffleRuby implement
|
|
11
|
+
# byteindex but require the offset to land on a character boundary. Our byte-level
|
|
12
|
+
# loop advances i one byte at a time, so i can point to a UTF-8 continuation byte
|
|
13
|
+
# (0x80–0xBF) when Opt #10/#12 fires — which raises IndexError on those runtimes.
|
|
14
|
+
# The inline getbyte fallback below is correct for all Ruby implementations.
|
|
15
|
+
BYTEINDEX_AVAILABLE = RUBY_ENGINE == 'ruby' && String.method_defined?(:byteindex)
|
|
6
16
|
|
|
7
17
|
protected
|
|
8
18
|
|
|
@@ -20,7 +30,7 @@ module SmarterCSV
|
|
|
20
30
|
|
|
21
31
|
if options[:acceleration] && has_acceleration
|
|
22
32
|
# :nocov:
|
|
23
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash)
|
|
33
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
|
|
24
34
|
[elements, elements.size]
|
|
25
35
|
# :nocov:
|
|
26
36
|
else
|
|
@@ -31,33 +41,65 @@ module SmarterCSV
|
|
|
31
41
|
end
|
|
32
42
|
|
|
33
43
|
def parse_with_auto_fallback(line, options, header_size = nil)
|
|
44
|
+
# Optimization #4: cache merged options hashes for :auto mode
|
|
45
|
+
@quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
|
|
46
|
+
@quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
|
|
47
|
+
|
|
48
|
+
# Optimization #5: if the line contains no backslash, backslash escaping cannot
|
|
49
|
+
# affect parsing (a backslash only matters immediately before a quote char).
|
|
50
|
+
# RFC 4180 and backslash modes give identical results — skip the try-backslash
|
|
51
|
+
# dance and call directly with RFC options (tighter C inner loop + memchr).
|
|
34
52
|
has_quotes = line.include?(options[:quote_char])
|
|
53
|
+
unless line.include?('\\')
|
|
54
|
+
if options[:acceleration] && has_acceleration
|
|
55
|
+
# :nocov:
|
|
56
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
57
|
+
return [elements, elements.size]
|
|
58
|
+
# :nocov:
|
|
59
|
+
else
|
|
60
|
+
return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
61
|
+
end
|
|
62
|
+
end
|
|
63
|
+
|
|
64
|
+
# Line has a backslash — try backslash-escape interpretation first.
|
|
35
65
|
|
|
36
|
-
begin
|
|
66
|
+
result = begin
|
|
37
67
|
# Try backslash-escape interpretation first
|
|
38
68
|
if options[:acceleration] && has_acceleration
|
|
39
69
|
# :nocov:
|
|
40
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true)
|
|
70
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
|
|
41
71
|
[elements, elements.size]
|
|
42
72
|
# :nocov:
|
|
43
73
|
else
|
|
44
|
-
|
|
45
|
-
@backslash_options ||= options.merge(quote_escaping: :backslash)
|
|
46
|
-
parse_csv_line_ruby(line, @backslash_options, header_size, has_quotes)
|
|
74
|
+
parse_csv_line_ruby(line, @quote_escaping_backslash, header_size, has_quotes)
|
|
47
75
|
end
|
|
48
76
|
rescue MalformedCSV
|
|
49
|
-
# Backslash
|
|
77
|
+
# Backslash raised a hard error — fall back to RFC 4180 immediately
|
|
50
78
|
if options[:acceleration] && has_acceleration
|
|
51
79
|
# :nocov:
|
|
52
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false)
|
|
53
|
-
[elements, elements.size]
|
|
80
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
81
|
+
return [elements, elements.size]
|
|
54
82
|
# :nocov:
|
|
55
83
|
else
|
|
56
|
-
|
|
57
|
-
@rfc_options ||= options.merge(quote_escaping: :double_quotes)
|
|
58
|
-
parse_csv_line_ruby(line, @rfc_options, header_size, has_quotes)
|
|
84
|
+
return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
59
85
|
end
|
|
60
86
|
end
|
|
87
|
+
|
|
88
|
+
# Backslash sees unclosed quote (-1): RFC may still close it (e.g. header "val\")
|
|
89
|
+
if result[1] == -1
|
|
90
|
+
rfc_result = if options[:acceleration] && has_acceleration
|
|
91
|
+
# :nocov:
|
|
92
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
93
|
+
[elements, elements.size]
|
|
94
|
+
# :nocov:
|
|
95
|
+
else
|
|
96
|
+
parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
97
|
+
end
|
|
98
|
+
return rfc_result unless rfc_result[1] == -1
|
|
99
|
+
# Both agree line is incomplete → propagate -1
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
result
|
|
61
103
|
end
|
|
62
104
|
|
|
63
105
|
# Parse a CSV line directly into a hash, with support for extra columns.
|
|
@@ -78,35 +120,54 @@ module SmarterCSV
|
|
|
78
120
|
end
|
|
79
121
|
|
|
80
122
|
def parse_line_to_hash_auto(line, headers, options)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
123
|
+
# Optimization #4: cache merged options hashes for :auto mode
|
|
124
|
+
@quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
|
|
125
|
+
@quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
|
|
126
|
+
|
|
127
|
+
if options[:acceleration] && has_acceleration
|
|
128
|
+
# C path: zero Ruby string scanning on the hot path.
|
|
129
|
+
# C handles Opt #5 internally — if backslash mode is requested but the line
|
|
130
|
+
# contains no backslash, C automatically downgrades to RFC mode in Section 5
|
|
131
|
+
# (enabling the memchr-inside-quotes optimisation). For unquoted lines, Section 4
|
|
132
|
+
# fast path is taken and allow_escaped_quotes is irrelevant anyway.
|
|
133
|
+
# :nocov:
|
|
134
|
+
result = parse_line_to_hash_c(line, headers, @quote_escaping_backslash)
|
|
135
|
+
if result[1] == -1 && line.include?('\\')
|
|
136
|
+
# Backslash mode sees unclosed quote on a line that contains a backslash.
|
|
137
|
+
# RFC 4180 may close it differently (e.g. "val\" is open in backslash
|
|
138
|
+
# mode but closed in RFC mode). Only try RFC when a backslash is present —
|
|
139
|
+
# if there is no backslash, both modes give identical results and the extra
|
|
140
|
+
# call is wasted work (common case: embedded-newline partial stitching lines).
|
|
141
|
+
rfc_result = parse_line_to_hash_c(line, headers, @quote_escaping_double)
|
|
142
|
+
return rfc_result unless rfc_result[1] == -1
|
|
143
|
+
# Both agree line is incomplete → propagate [nil, -1]
|
|
94
144
|
end
|
|
145
|
+
# :nocov:
|
|
146
|
+
return result
|
|
147
|
+
end
|
|
148
|
+
|
|
149
|
+
# Ruby fallback path: explicit backslash/quote checks still needed
|
|
150
|
+
has_quotes = line.include?(options[:quote_char])
|
|
151
|
+
unless line.include?('\\')
|
|
152
|
+
return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
|
|
153
|
+
end
|
|
154
|
+
|
|
155
|
+
result = begin
|
|
156
|
+
parse_line_to_hash_ruby(line, headers, @quote_escaping_backslash, has_quotes)
|
|
95
157
|
rescue MalformedCSV
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
@rfc_options ||= options.merge(quote_escaping: :double_quotes)
|
|
107
|
-
parse_line_to_hash_ruby(line, headers, @rfc_options, has_quotes)
|
|
108
|
-
end
|
|
158
|
+
return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
|
|
159
|
+
end
|
|
160
|
+
|
|
161
|
+
# Backslash path sees an unclosed quote ([nil, -1]): RFC 4180 may still close
|
|
162
|
+
# the field — e.g. a field ending with \" is open in backslash mode but closed
|
|
163
|
+
# in RFC mode. Try RFC; if it also returns -1 both agree the line is incomplete.
|
|
164
|
+
if result[1] == -1
|
|
165
|
+
rfc_result = parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
|
|
166
|
+
return rfc_result unless rfc_result[1] == -1
|
|
167
|
+
# Both interpretations agree the line is incomplete → propagate [nil, -1]
|
|
109
168
|
end
|
|
169
|
+
|
|
170
|
+
result
|
|
110
171
|
end
|
|
111
172
|
|
|
112
173
|
# Ruby implementation of parse_line_to_hash
|
|
@@ -116,14 +177,55 @@ module SmarterCSV
|
|
|
116
177
|
# Chomp trailing row separator
|
|
117
178
|
line = line.chomp(options[:row_sep]) if options[:row_sep]
|
|
118
179
|
|
|
119
|
-
|
|
180
|
+
col_sep = options[:col_sep]
|
|
181
|
+
strip = options[:strip_whitespace]
|
|
182
|
+
prefix = options[:missing_header_prefix]
|
|
183
|
+
|
|
184
|
+
# Optimization #11: for unquoted lines, build the hash in one pass directly
|
|
185
|
+
# from String#split — no intermediate array returned from parse_csv_line_ruby
|
|
186
|
+
# and no second iteration to convert array → hash. Saves one Array allocation
|
|
187
|
+
# + one full-row iteration per row (most impactful on wide-column files).
|
|
188
|
+
#
|
|
189
|
+
# Optimization #14: when remove_empty_values is set (default: true), skip
|
|
190
|
+
# empty fields inline during hash building instead of inserting them and
|
|
191
|
+
# deleting later in hash_transformations. With strip_whitespace: true
|
|
192
|
+
# (default), v.empty? after strip catches both empty and whitespace-only
|
|
193
|
+
# fields without a regex. Most impactful on sparse files (many empty fields).
|
|
194
|
+
unless has_quotes || col_sep == ' '
|
|
195
|
+
fields = line.split(col_sep, -1)
|
|
196
|
+
n = fields.size
|
|
197
|
+
|
|
198
|
+
if options[:remove_empty_hashes]
|
|
199
|
+
all_blank = fields.empty? || fields.all? { |v| v.strip.empty? }
|
|
200
|
+
return [nil, n] if all_blank
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
# Batch-strip using C-level map! — faster than per-element strip inside the loop
|
|
204
|
+
fields.map!(&:strip) if strip
|
|
205
|
+
|
|
206
|
+
remove_empty = options[:remove_empty_values]
|
|
207
|
+
hash = {}
|
|
208
|
+
fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
|
|
209
|
+
next if remove_empty && v.empty?
|
|
210
|
+
hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = v
|
|
211
|
+
end
|
|
212
|
+
|
|
213
|
+
unless remove_empty
|
|
214
|
+
(n...headers.size).each { |i| hash[headers[i]] = nil }
|
|
215
|
+
end
|
|
216
|
+
|
|
217
|
+
return [hash, n]
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
# Quoted/complex path: parse into elements array, then build hash.
|
|
120
221
|
elements, data_size = parse_csv_line_ruby(line, options, nil, has_quotes)
|
|
222
|
+
return [nil, -1] if data_size == -1 # unclosed quote at EOL → caller stitches next line
|
|
121
223
|
|
|
122
224
|
# Optimization #6: elements are always String or nil from parse_csv_line_ruby,
|
|
123
225
|
# so .to_s is unnecessary. If strip_whitespace is on, fields are already
|
|
124
226
|
# stripped, so .strip is also redundant — just check .empty?.
|
|
125
227
|
if options[:remove_empty_hashes]
|
|
126
|
-
all_blank = if
|
|
228
|
+
all_blank = if strip
|
|
127
229
|
elements.empty? || elements.all? { |v| v.nil? || v.empty? }
|
|
128
230
|
else
|
|
129
231
|
elements.empty? || elements.all? { |v| v.nil? || v.strip.empty? }
|
|
@@ -131,22 +233,21 @@ module SmarterCSV
|
|
|
131
233
|
return [nil, data_size] if all_blank
|
|
132
234
|
end
|
|
133
235
|
|
|
134
|
-
# Build the hash -
|
|
236
|
+
# Build the hash — integer-index while loop avoids enumerator overhead vs each_with_index
|
|
237
|
+
n = elements.size
|
|
135
238
|
hash = {}
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
"#{options[:missing_header_prefix]}#{i + 1}".to_sym
|
|
141
|
-
end
|
|
142
|
-
hash[key] = value
|
|
239
|
+
i = 0
|
|
240
|
+
while i < n
|
|
241
|
+
hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = elements[i]
|
|
242
|
+
i += 1
|
|
143
243
|
end
|
|
144
244
|
|
|
145
245
|
# Add nil for missing columns only when remove_empty_values is false
|
|
146
246
|
# (when true, nils would be removed anyway by hash_transformations)
|
|
147
247
|
unless options[:remove_empty_values]
|
|
148
|
-
|
|
248
|
+
while i < headers.size
|
|
149
249
|
hash[headers[i]] = nil
|
|
250
|
+
i += 1
|
|
150
251
|
end
|
|
151
252
|
end
|
|
152
253
|
|
|
@@ -182,7 +283,9 @@ module SmarterCSV
|
|
|
182
283
|
|
|
183
284
|
# Ensure has_quotes is set correctly (callers via parse/parse_line_to_hash
|
|
184
285
|
# always pass this, but direct callers may not)
|
|
286
|
+
# rubocop:disable Style/OrAssignment
|
|
185
287
|
has_quotes = line.include?(options[:quote_char]) unless has_quotes
|
|
288
|
+
# rubocop:enable Style/OrAssignment
|
|
186
289
|
|
|
187
290
|
# Optimization #7: when line has no quotes, use String#split (C-implemented)
|
|
188
291
|
# to bypass the entire character-by-character loop.
|
|
@@ -193,6 +296,7 @@ module SmarterCSV
|
|
|
193
296
|
if header_size && header_size <= 0
|
|
194
297
|
return [[], 0]
|
|
195
298
|
end
|
|
299
|
+
|
|
196
300
|
elements = line.split(col_sep, -1) # -1 preserves trailing empty fields
|
|
197
301
|
elements = elements[0, header_size] if header_size
|
|
198
302
|
elements.map!(&:strip) if strip
|
|
@@ -210,74 +314,254 @@ module SmarterCSV
|
|
|
210
314
|
backslash_count = 0
|
|
211
315
|
in_quotes = false
|
|
212
316
|
allow_escaped_quotes = options[:quote_escaping] == :backslash
|
|
317
|
+
quote_boundary_standard = options[:quote_boundary] == :standard
|
|
318
|
+
field_started = false # for boundary tracking (standard mode only)
|
|
319
|
+
row_sep = options[:row_sep]
|
|
320
|
+
row_sep_size = row_sep.is_a?(String) ? row_sep.size : 0
|
|
213
321
|
|
|
214
322
|
# Optimization #1: for the common single-char separator, use direct
|
|
215
323
|
# character comparison instead of allocating a substring via line[i...i+n].
|
|
216
324
|
if col_sep_size == 1
|
|
217
|
-
|
|
218
|
-
|
|
325
|
+
# Optimization #13: byte-level indexing for single-char separator.
|
|
326
|
+
# col_sep and quote_char are both validated to be single-byte at option
|
|
327
|
+
# parsing time. UTF-8 multi-byte continuation bytes (0x80–0xBF) never
|
|
328
|
+
# alias ASCII delimiter bytes (0x00–0x7F), so byte scanning is safe for
|
|
329
|
+
# UTF-8 strings with ASCII delimiters — no String allocation per character.
|
|
330
|
+
col_sep_byte = col_sep.getbyte(0)
|
|
331
|
+
quote_byte = quote.getbyte(0)
|
|
332
|
+
bytesize = line.bytesize
|
|
333
|
+
row_sep_bytesize = row_sep.is_a?(String) ? row_sep.bytesize : 0
|
|
334
|
+
|
|
335
|
+
while i < bytesize
|
|
336
|
+
# Optimization #10: inside a quoted field with no backslash escaping, jump
|
|
337
|
+
# directly to the next quote character using byteindex (C-level scan).
|
|
338
|
+
# Avoids per-character Ruby iteration through long field content.
|
|
339
|
+
if in_quotes && !allow_escaped_quotes
|
|
340
|
+
next_q = if BYTEINDEX_AVAILABLE
|
|
341
|
+
line.byteindex(quote, i)
|
|
342
|
+
else
|
|
343
|
+
j = i
|
|
344
|
+
j += 1 while j < bytesize && line.getbyte(j) != quote_byte
|
|
345
|
+
j < bytesize ? j : nil
|
|
346
|
+
end
|
|
347
|
+
if next_q.nil?
|
|
348
|
+
i = bytesize # no closing quote — exit loop, return [[], -1] below
|
|
349
|
+
break
|
|
350
|
+
end
|
|
351
|
+
i = next_q # land on the quote; fall through to normal quote-handling below
|
|
352
|
+
b = quote_byte
|
|
353
|
+
|
|
354
|
+
# Optimization #12: in :standard mode, once we know the current field is
|
|
355
|
+
# unquoted (field_started && !in_quotes), remaining quotes are literal and
|
|
356
|
+
# cannot affect parser state — jump directly to the next col_sep.
|
|
357
|
+
# Mirrors Opt #10 for the unquoted side of the same trade-off.
|
|
358
|
+
elsif quote_boundary_standard && field_started && !in_quotes
|
|
359
|
+
next_sep = if BYTEINDEX_AVAILABLE
|
|
360
|
+
line.byteindex(col_sep, i)
|
|
361
|
+
else
|
|
362
|
+
j = i
|
|
363
|
+
j += 1 while j < bytesize && line.getbyte(j) != col_sep_byte
|
|
364
|
+
j < bytesize ? j : nil
|
|
365
|
+
end
|
|
366
|
+
if next_sep.nil?
|
|
367
|
+
break
|
|
368
|
+
end
|
|
369
|
+
|
|
370
|
+
i = next_sep
|
|
371
|
+
b = col_sep_byte
|
|
372
|
+
|
|
373
|
+
else
|
|
374
|
+
b = line.getbyte(i)
|
|
375
|
+
end
|
|
376
|
+
|
|
377
|
+
if b == col_sep_byte && !in_quotes
|
|
219
378
|
break if !header_size.nil? && elements.size >= header_size
|
|
220
379
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
380
|
+
# Optimization #15: for quoted fields, extract content directly without
|
|
381
|
+
# surrounding quotes to avoid the double allocation of byteslice + field[1..-2]
|
|
382
|
+
# inside cleanup_quotes. Safe because the line is pre-chomped and the state
|
|
383
|
+
# machine has already found and validated the closing quote.
|
|
384
|
+
field_len = i - start
|
|
385
|
+
if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(i - 1) == quote_byte
|
|
386
|
+
field = line.byteslice(start + 1, field_len - 2)
|
|
387
|
+
field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
|
|
388
|
+
field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
|
|
389
|
+
elements << field
|
|
390
|
+
else
|
|
391
|
+
field = line.byteslice(start, field_len)
|
|
392
|
+
field = cleanup_quotes(field, quote)
|
|
393
|
+
elements << (strip ? field.strip : field) # cleanup_quotes may return frozen EMPTY_STRING
|
|
394
|
+
end
|
|
224
395
|
i += 1
|
|
225
396
|
start = i
|
|
226
397
|
backslash_count = 0
|
|
398
|
+
field_started = false # reset for next field
|
|
227
399
|
else
|
|
228
|
-
if allow_escaped_quotes &&
|
|
400
|
+
if allow_escaped_quotes && b == 92 # backslash '\\'
|
|
229
401
|
backslash_count += 1
|
|
402
|
+
field_started = true if quote_boundary_standard && !in_quotes
|
|
230
403
|
else
|
|
231
|
-
if
|
|
404
|
+
if b == quote_byte
|
|
232
405
|
if !allow_escaped_quotes || backslash_count % 2 == 0
|
|
233
|
-
|
|
406
|
+
if quote_boundary_standard
|
|
407
|
+
if in_quotes
|
|
408
|
+
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
409
|
+
next_i = i + 1
|
|
410
|
+
if next_i >= bytesize ||
|
|
411
|
+
line.getbyte(next_i) == col_sep_byte ||
|
|
412
|
+
(row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
|
|
413
|
+
in_quotes = false
|
|
414
|
+
field_started = true
|
|
415
|
+
end
|
|
416
|
+
# else: quote inside quoted field → literal (handles "" doubling)
|
|
417
|
+
elsif !field_started # at field boundary: open quoted field
|
|
418
|
+
in_quotes = true
|
|
419
|
+
field_started = true
|
|
420
|
+
end
|
|
421
|
+
# else: mid-field quote → literal, no state change
|
|
422
|
+
else
|
|
423
|
+
in_quotes = !in_quotes
|
|
424
|
+
end
|
|
425
|
+
end
|
|
426
|
+
elsif quote_boundary_standard && !in_quotes && !field_started
|
|
427
|
+
# Non-quote, non-separator: mark field as started (only needs to fire once
|
|
428
|
+
# per field — Opt #12 skips the rest once this is set).
|
|
429
|
+
# rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
|
|
430
|
+
if strip && (b == 32 || b == 9) # ' ' == 32, '\t' == 9
|
|
431
|
+
start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
|
|
432
|
+
else
|
|
433
|
+
field_started = true
|
|
234
434
|
end
|
|
435
|
+
# rubocop:enable Style/MultipleComparison
|
|
235
436
|
end
|
|
236
437
|
backslash_count = 0
|
|
237
438
|
end
|
|
238
439
|
i += 1
|
|
239
440
|
end
|
|
240
441
|
end
|
|
442
|
+
|
|
443
|
+
# Unclosed quote at end of line: signal "needs more data" to the caller.
|
|
444
|
+
# The read loop will stitch the next physical line and re-parse rather than raising.
|
|
445
|
+
return [[], -1] if in_quotes
|
|
446
|
+
|
|
447
|
+
# Process the remaining field
|
|
448
|
+
if header_size.nil? || elements.size < header_size
|
|
449
|
+
# Optimization #15 (final field): same direct extraction; safe because line is pre-chomped.
|
|
450
|
+
field_len = bytesize - start
|
|
451
|
+
if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(bytesize - 1) == quote_byte
|
|
452
|
+
field = line.byteslice(start + 1, field_len - 2)
|
|
453
|
+
field.gsub!(doubled_quote(quote), quote)
|
|
454
|
+
field.strip! if strip
|
|
455
|
+
elements << field
|
|
456
|
+
else
|
|
457
|
+
field = line.byteslice(start, field_len)
|
|
458
|
+
field = cleanup_quotes(field, quote)
|
|
459
|
+
elements << (strip ? field.strip : field)
|
|
460
|
+
end
|
|
461
|
+
end
|
|
241
462
|
else
|
|
242
463
|
# Multi-char col_sep: use substring comparison (original path)
|
|
243
464
|
while i < line_size
|
|
465
|
+
# Optimization #10 (multi-char path): same skip-ahead as single-char path above.
|
|
466
|
+
if in_quotes && !allow_escaped_quotes
|
|
467
|
+
next_q = line.index(quote, i)
|
|
468
|
+
if next_q.nil?
|
|
469
|
+
i = line_size
|
|
470
|
+
break
|
|
471
|
+
end
|
|
472
|
+
i = next_q
|
|
473
|
+
end
|
|
474
|
+
|
|
475
|
+
# Optimization #12 (multi-char path): mirror of single-char path above.
|
|
476
|
+
if quote_boundary_standard && field_started && !in_quotes
|
|
477
|
+
next_sep = line.index(col_sep, i)
|
|
478
|
+
if next_sep.nil?
|
|
479
|
+
break
|
|
480
|
+
end
|
|
481
|
+
|
|
482
|
+
i = next_sep
|
|
483
|
+
end
|
|
484
|
+
|
|
244
485
|
if line[i...i+col_sep_size] == col_sep && !in_quotes
|
|
245
486
|
break if !header_size.nil? && elements.size >= header_size
|
|
246
487
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
488
|
+
# Optimization #15 (multi-char path): same direct extraction using character indexing.
|
|
489
|
+
field_len = i - start
|
|
490
|
+
if field_len >= 2 && line[start] == quote && line[i - 1] == quote
|
|
491
|
+
field = line[start + 1...i - 1]
|
|
492
|
+
field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
|
|
493
|
+
field.strip! if strip
|
|
494
|
+
elements << field
|
|
495
|
+
else
|
|
496
|
+
field = line[start...i]
|
|
497
|
+
field = cleanup_quotes(field, quote)
|
|
498
|
+
elements << (strip ? field.strip : field)
|
|
499
|
+
end
|
|
250
500
|
i += col_sep_size
|
|
251
501
|
start = i
|
|
252
502
|
backslash_count = 0
|
|
503
|
+
field_started = false # reset for next field
|
|
253
504
|
else
|
|
254
505
|
if allow_escaped_quotes && line[i] == '\\'
|
|
255
506
|
backslash_count += 1
|
|
507
|
+
field_started = true if quote_boundary_standard && !in_quotes
|
|
256
508
|
else
|
|
257
509
|
if line[i] == quote
|
|
258
510
|
if !allow_escaped_quotes || backslash_count % 2 == 0
|
|
259
|
-
|
|
511
|
+
if quote_boundary_standard
|
|
512
|
+
if in_quotes
|
|
513
|
+
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
514
|
+
next_i = i + 1
|
|
515
|
+
if next_i >= line_size ||
|
|
516
|
+
line[next_i...next_i + col_sep_size] == col_sep ||
|
|
517
|
+
(row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
|
|
518
|
+
in_quotes = false
|
|
519
|
+
field_started = true
|
|
520
|
+
end
|
|
521
|
+
# else: quote inside quoted field → literal (handles "" doubling)
|
|
522
|
+
elsif !field_started # at field boundary: open quoted field
|
|
523
|
+
in_quotes = true
|
|
524
|
+
field_started = true
|
|
525
|
+
end
|
|
526
|
+
# else: mid-field quote → literal, no state change
|
|
527
|
+
else
|
|
528
|
+
in_quotes = !in_quotes
|
|
529
|
+
end
|
|
260
530
|
end
|
|
531
|
+
elsif quote_boundary_standard && !in_quotes && !field_started
|
|
532
|
+
# rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
|
|
533
|
+
if strip && (line[i] == ' ' || line[i] == "\t")
|
|
534
|
+
start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
|
|
535
|
+
else
|
|
536
|
+
field_started = true
|
|
537
|
+
end
|
|
538
|
+
# rubocop:enable Style/MultipleComparison
|
|
261
539
|
end
|
|
262
540
|
backslash_count = 0
|
|
263
541
|
end
|
|
264
542
|
i += 1
|
|
265
543
|
end
|
|
266
544
|
end
|
|
267
|
-
end
|
|
268
|
-
|
|
269
|
-
# Check for unclosed quotes at the end of the line
|
|
270
|
-
if in_quotes
|
|
271
|
-
# :nocov:
|
|
272
|
-
raise MalformedCSV, "Unclosed quoted field detected in line: #{line}"
|
|
273
|
-
# :nocov:
|
|
274
|
-
end
|
|
275
545
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
546
|
+
# Unclosed quote at end of line: signal "needs more data" to the caller.
|
|
547
|
+
# The read loop will stitch the next physical line and re-parse rather than raising.
|
|
548
|
+
return [[], -1] if in_quotes
|
|
549
|
+
|
|
550
|
+
# Process the remaining field
|
|
551
|
+
if header_size.nil? || elements.size < header_size
|
|
552
|
+
# Optimization #15 (multi-char final field): same direct extraction; line is pre-chomped.
|
|
553
|
+
field_len = line_size - start
|
|
554
|
+
if field_len >= 2 && line[start] == quote && line[line_size - 1] == quote
|
|
555
|
+
field = line[start + 1..line_size - 2]
|
|
556
|
+
field.gsub!(doubled_quote(quote), quote)
|
|
557
|
+
field.strip! if strip
|
|
558
|
+
elements << field
|
|
559
|
+
else
|
|
560
|
+
field = line[start..-1]
|
|
561
|
+
field = cleanup_quotes(field, quote)
|
|
562
|
+
elements << (strip ? field.strip : field)
|
|
563
|
+
end
|
|
564
|
+
end
|
|
281
565
|
end
|
|
282
566
|
|
|
283
567
|
[elements, elements.size]
|