smarter_csv 1.15.2 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +68 -1
- data/CONTRIBUTORS.md +3 -1
- data/Gemfile +1 -0
- data/README.md +123 -27
- data/docs/_introduction.md +40 -24
- data/docs/bad_row_quarantine.md +285 -0
- data/docs/basic_read_api.md +151 -9
- data/docs/basic_write_api.md +474 -59
- data/docs/batch_processing.md +161 -4
- data/docs/column_selection.md +183 -0
- data/docs/data_transformations.md +162 -29
- data/docs/examples.md +339 -46
- data/docs/header_transformations.md +93 -12
- data/docs/header_validations.md +56 -18
- data/docs/history.md +117 -0
- data/docs/instrumentation.md +165 -0
- data/docs/migrating_from_csv.md +290 -0
- data/docs/options.md +150 -87
- data/docs/parsing_strategy.md +63 -1
- data/docs/real_world_csv.md +262 -0
- data/docs/releases/1.16.0/benchmarks.md +223 -0
- data/docs/releases/1.16.0/changes.md +272 -0
- data/docs/releases/1.16.0/performance_notes.md +114 -0
- data/docs/row_col_sep.md +14 -5
- data/docs/value_converters.md +193 -57
- data/ext/smarter_csv/extconf.rb +3 -0
- data/ext/smarter_csv/smarter_csv.c +1007 -71
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
- data/lib/smarter_csv/errors.rb +8 -0
- data/lib/smarter_csv/file_io.rb +1 -1
- data/lib/smarter_csv/hash_transformations.rb +14 -13
- data/lib/smarter_csv/header_transformations.rb +21 -2
- data/lib/smarter_csv/headers.rb +2 -1
- data/lib/smarter_csv/options.rb +124 -7
- data/lib/smarter_csv/parser.rb +362 -75
- data/lib/smarter_csv/reader.rb +494 -46
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv/writer.rb +71 -19
- data/lib/smarter_csv.rb +95 -12
- data/smarter_csv.gemspec +20 -10
- metadata +37 -80
data/lib/smarter_csv/parser.rb
CHANGED
|
@@ -3,6 +3,16 @@
|
|
|
3
3
|
module SmarterCSV
|
|
4
4
|
module Parser
|
|
5
5
|
EMPTY_STRING = '' # already frozen
|
|
6
|
+
# Optimization #13: byteindex (byte-position search) was added in Ruby 3.2.
|
|
7
|
+
# When available, it lets Opt #10/#12 skip-ahead use byte offsets directly —
|
|
8
|
+
# no conversion from byte position to character position needed.
|
|
9
|
+
#
|
|
10
|
+
# Restricted to MRI Ruby (RUBY_ENGINE == 'ruby'): JRuby and TruffleRuby implement
|
|
11
|
+
# byteindex but require the offset to land on a character boundary. Our byte-level
|
|
12
|
+
# loop advances i one byte at a time, so i can point to a UTF-8 continuation byte
|
|
13
|
+
# (0x80–0xBF) when Opt #10/#12 fires — which raises IndexError on those runtimes.
|
|
14
|
+
# The inline getbyte fallback below is correct for all Ruby implementations.
|
|
15
|
+
BYTEINDEX_AVAILABLE = RUBY_ENGINE == 'ruby' && String.method_defined?(:byteindex)
|
|
6
16
|
|
|
7
17
|
protected
|
|
8
18
|
|
|
@@ -20,7 +30,7 @@ module SmarterCSV
|
|
|
20
30
|
|
|
21
31
|
if options[:acceleration] && has_acceleration
|
|
22
32
|
# :nocov:
|
|
23
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash)
|
|
33
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
|
|
24
34
|
[elements, elements.size]
|
|
25
35
|
# :nocov:
|
|
26
36
|
else
|
|
@@ -31,33 +41,68 @@ module SmarterCSV
|
|
|
31
41
|
end
|
|
32
42
|
|
|
33
43
|
def parse_with_auto_fallback(line, options, header_size = nil)
|
|
34
|
-
|
|
44
|
+
# Optimization #4: cache merged options hashes for :auto mode
|
|
45
|
+
@quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
|
|
46
|
+
@quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
|
|
47
|
+
|
|
48
|
+
# Optimization #5: if the line contains no backslash, backslash escaping cannot
|
|
49
|
+
# affect parsing (a backslash only matters immediately before a quote char).
|
|
50
|
+
# RFC 4180 and backslash modes give identical results — skip the try-backslash
|
|
51
|
+
# dance and call directly with RFC options (tighter C inner loop + memchr).
|
|
52
|
+
# has_quotes is only needed for the Ruby fallback path — C computes it internally.
|
|
53
|
+
unless line.include?('\\')
|
|
54
|
+
if options[:acceleration] && has_acceleration
|
|
55
|
+
# :nocov:
|
|
56
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
57
|
+
return [elements, elements.size]
|
|
58
|
+
# :nocov:
|
|
59
|
+
else
|
|
60
|
+
has_quotes = line.include?(options[:quote_char])
|
|
61
|
+
return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
62
|
+
end
|
|
63
|
+
end
|
|
35
64
|
|
|
36
|
-
|
|
65
|
+
# Line has a backslash — try backslash-escape interpretation first.
|
|
66
|
+
# has_quotes only needed for Ruby fallback path.
|
|
67
|
+
has_quotes = line.include?(options[:quote_char]) unless options[:acceleration] && has_acceleration
|
|
68
|
+
|
|
69
|
+
result = begin
|
|
37
70
|
# Try backslash-escape interpretation first
|
|
38
71
|
if options[:acceleration] && has_acceleration
|
|
39
72
|
# :nocov:
|
|
40
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size,
|
|
73
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
|
|
41
74
|
[elements, elements.size]
|
|
42
75
|
# :nocov:
|
|
43
76
|
else
|
|
44
|
-
|
|
45
|
-
@backslash_options ||= options.merge(quote_escaping: :backslash)
|
|
46
|
-
parse_csv_line_ruby(line, @backslash_options, header_size, has_quotes)
|
|
77
|
+
parse_csv_line_ruby(line, @quote_escaping_backslash, header_size, has_quotes)
|
|
47
78
|
end
|
|
48
79
|
rescue MalformedCSV
|
|
49
|
-
# Backslash
|
|
80
|
+
# Backslash raised a hard error — fall back to RFC 4180 immediately
|
|
50
81
|
if options[:acceleration] && has_acceleration
|
|
51
82
|
# :nocov:
|
|
52
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size,
|
|
53
|
-
[elements, elements.size]
|
|
83
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
84
|
+
return [elements, elements.size]
|
|
54
85
|
# :nocov:
|
|
55
86
|
else
|
|
56
|
-
|
|
57
|
-
@rfc_options ||= options.merge(quote_escaping: :double_quotes)
|
|
58
|
-
parse_csv_line_ruby(line, @rfc_options, header_size, has_quotes)
|
|
87
|
+
return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
59
88
|
end
|
|
60
89
|
end
|
|
90
|
+
|
|
91
|
+
# Backslash sees unclosed quote (-1): RFC may still close it (e.g. header "val\")
|
|
92
|
+
if result[1] == -1
|
|
93
|
+
rfc_result = if options[:acceleration] && has_acceleration
|
|
94
|
+
# :nocov:
|
|
95
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
96
|
+
[elements, elements.size]
|
|
97
|
+
# :nocov:
|
|
98
|
+
else
|
|
99
|
+
parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
100
|
+
end
|
|
101
|
+
return rfc_result unless rfc_result[1] == -1
|
|
102
|
+
# Both agree line is incomplete → propagate -1
|
|
103
|
+
end
|
|
104
|
+
|
|
105
|
+
result
|
|
61
106
|
end
|
|
62
107
|
|
|
63
108
|
# Parse a CSV line directly into a hash, with support for extra columns.
|
|
@@ -78,35 +123,54 @@ module SmarterCSV
|
|
|
78
123
|
end
|
|
79
124
|
|
|
80
125
|
def parse_line_to_hash_auto(line, headers, options)
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
126
|
+
# Optimization #4: cache merged options hashes for :auto mode
|
|
127
|
+
@quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
|
|
128
|
+
@quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
|
|
129
|
+
|
|
130
|
+
if options[:acceleration] && has_acceleration
|
|
131
|
+
# C path: zero Ruby string scanning on the hot path.
|
|
132
|
+
# C handles Opt #5 internally — if backslash mode is requested but the line
|
|
133
|
+
# contains no backslash, C automatically downgrades to RFC mode in Section 5
|
|
134
|
+
# (enabling the memchr-inside-quotes optimisation). For unquoted lines, Section 4
|
|
135
|
+
# fast path is taken and allow_escaped_quotes is irrelevant anyway.
|
|
136
|
+
# :nocov:
|
|
137
|
+
result = parse_line_to_hash_c(line, headers, @quote_escaping_backslash)
|
|
138
|
+
if result[1] == -1 && line.include?('\\')
|
|
139
|
+
# Backslash mode sees unclosed quote on a line that contains a backslash.
|
|
140
|
+
# RFC 4180 may close it differently (e.g. "val\" is open in backslash
|
|
141
|
+
# mode but closed in RFC mode). Only try RFC when a backslash is present —
|
|
142
|
+
# if there is no backslash, both modes give identical results and the extra
|
|
143
|
+
# call is wasted work (common case: embedded-newline partial stitching lines).
|
|
144
|
+
rfc_result = parse_line_to_hash_c(line, headers, @quote_escaping_double)
|
|
145
|
+
return rfc_result unless rfc_result[1] == -1
|
|
146
|
+
# Both agree line is incomplete → propagate [nil, -1]
|
|
94
147
|
end
|
|
148
|
+
# :nocov:
|
|
149
|
+
return result
|
|
150
|
+
end
|
|
151
|
+
|
|
152
|
+
# Ruby fallback path: explicit backslash/quote checks still needed
|
|
153
|
+
has_quotes = line.include?(options[:quote_char])
|
|
154
|
+
unless line.include?('\\')
|
|
155
|
+
return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
|
|
156
|
+
end
|
|
157
|
+
|
|
158
|
+
result = begin
|
|
159
|
+
parse_line_to_hash_ruby(line, headers, @quote_escaping_backslash, has_quotes)
|
|
95
160
|
rescue MalformedCSV
|
|
96
|
-
|
|
97
|
-
if options[:acceleration] && has_acceleration
|
|
98
|
-
# :nocov:
|
|
99
|
-
# Optimization #4: cache merged options hashes for :auto mode
|
|
100
|
-
@rfc_options ||= options.merge(quote_escaping: :double_quotes)
|
|
101
|
-
parse_line_to_hash_c(line, headers, @rfc_options)
|
|
102
|
-
# :nocov:
|
|
103
|
-
else
|
|
104
|
-
has_quotes = line.include?(options[:quote_char])
|
|
105
|
-
# Optimization #4: cache merged options hashes for :auto mode
|
|
106
|
-
@rfc_options ||= options.merge(quote_escaping: :double_quotes)
|
|
107
|
-
parse_line_to_hash_ruby(line, headers, @rfc_options, has_quotes)
|
|
108
|
-
end
|
|
161
|
+
return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
|
|
109
162
|
end
|
|
163
|
+
|
|
164
|
+
# Backslash path sees an unclosed quote ([nil, -1]): RFC 4180 may still close
|
|
165
|
+
# the field — e.g. a field ending with \" is open in backslash mode but closed
|
|
166
|
+
# in RFC mode. Try RFC; if it also returns -1 both agree the line is incomplete.
|
|
167
|
+
if result[1] == -1
|
|
168
|
+
rfc_result = parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
|
|
169
|
+
return rfc_result unless rfc_result[1] == -1
|
|
170
|
+
# Both interpretations agree the line is incomplete → propagate [nil, -1]
|
|
171
|
+
end
|
|
172
|
+
|
|
173
|
+
result
|
|
110
174
|
end
|
|
111
175
|
|
|
112
176
|
# Ruby implementation of parse_line_to_hash
|
|
@@ -116,14 +180,55 @@ module SmarterCSV
|
|
|
116
180
|
# Chomp trailing row separator
|
|
117
181
|
line = line.chomp(options[:row_sep]) if options[:row_sep]
|
|
118
182
|
|
|
119
|
-
|
|
183
|
+
col_sep = options[:col_sep]
|
|
184
|
+
strip = options[:strip_whitespace]
|
|
185
|
+
prefix = options[:missing_header_prefix]
|
|
186
|
+
|
|
187
|
+
# Optimization #11: for unquoted lines, build the hash in one pass directly
|
|
188
|
+
# from String#split — no intermediate array returned from parse_csv_line_ruby
|
|
189
|
+
# and no second iteration to convert array → hash. Saves one Array allocation
|
|
190
|
+
# + one full-row iteration per row (most impactful on wide-column files).
|
|
191
|
+
#
|
|
192
|
+
# Optimization #14: when remove_empty_values is set (default: true), skip
|
|
193
|
+
# empty fields inline during hash building instead of inserting them and
|
|
194
|
+
# deleting later in hash_transformations. With strip_whitespace: true
|
|
195
|
+
# (default), v.empty? after strip catches both empty and whitespace-only
|
|
196
|
+
# fields without a regex. Most impactful on sparse files (many empty fields).
|
|
197
|
+
unless has_quotes || col_sep == ' '
|
|
198
|
+
fields = line.split(col_sep, -1)
|
|
199
|
+
n = fields.size
|
|
200
|
+
|
|
201
|
+
if options[:remove_empty_hashes]
|
|
202
|
+
all_blank = fields.empty? || fields.all? { |v| v.strip.empty? }
|
|
203
|
+
return [nil, n] if all_blank
|
|
204
|
+
end
|
|
205
|
+
|
|
206
|
+
# Batch-strip using C-level map! — faster than per-element strip inside the loop
|
|
207
|
+
fields.map!(&:strip) if strip
|
|
208
|
+
|
|
209
|
+
remove_empty = options[:remove_empty_values]
|
|
210
|
+
hash = {}
|
|
211
|
+
fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
|
|
212
|
+
next if remove_empty && v.empty?
|
|
213
|
+
hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = v
|
|
214
|
+
end
|
|
215
|
+
|
|
216
|
+
unless remove_empty
|
|
217
|
+
(n...headers.size).each { |i| hash[headers[i]] = nil }
|
|
218
|
+
end
|
|
219
|
+
|
|
220
|
+
return [hash, n]
|
|
221
|
+
end
|
|
222
|
+
|
|
223
|
+
# Quoted/complex path: parse into elements array, then build hash.
|
|
120
224
|
elements, data_size = parse_csv_line_ruby(line, options, nil, has_quotes)
|
|
225
|
+
return [nil, -1] if data_size == -1 # unclosed quote at EOL → caller stitches next line
|
|
121
226
|
|
|
122
227
|
# Optimization #6: elements are always String or nil from parse_csv_line_ruby,
|
|
123
228
|
# so .to_s is unnecessary. If strip_whitespace is on, fields are already
|
|
124
229
|
# stripped, so .strip is also redundant — just check .empty?.
|
|
125
230
|
if options[:remove_empty_hashes]
|
|
126
|
-
all_blank = if
|
|
231
|
+
all_blank = if strip
|
|
127
232
|
elements.empty? || elements.all? { |v| v.nil? || v.empty? }
|
|
128
233
|
else
|
|
129
234
|
elements.empty? || elements.all? { |v| v.nil? || v.strip.empty? }
|
|
@@ -131,22 +236,21 @@ module SmarterCSV
|
|
|
131
236
|
return [nil, data_size] if all_blank
|
|
132
237
|
end
|
|
133
238
|
|
|
134
|
-
# Build the hash -
|
|
239
|
+
# Build the hash — integer-index while loop avoids enumerator overhead vs each_with_index
|
|
240
|
+
n = elements.size
|
|
135
241
|
hash = {}
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
"#{options[:missing_header_prefix]}#{i + 1}".to_sym
|
|
141
|
-
end
|
|
142
|
-
hash[key] = value
|
|
242
|
+
i = 0
|
|
243
|
+
while i < n
|
|
244
|
+
hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = elements[i]
|
|
245
|
+
i += 1
|
|
143
246
|
end
|
|
144
247
|
|
|
145
248
|
# Add nil for missing columns only when remove_empty_values is false
|
|
146
249
|
# (when true, nils would be removed anyway by hash_transformations)
|
|
147
250
|
unless options[:remove_empty_values]
|
|
148
|
-
|
|
251
|
+
while i < headers.size
|
|
149
252
|
hash[headers[i]] = nil
|
|
253
|
+
i += 1
|
|
150
254
|
end
|
|
151
255
|
end
|
|
152
256
|
|
|
@@ -182,7 +286,9 @@ module SmarterCSV
|
|
|
182
286
|
|
|
183
287
|
# Ensure has_quotes is set correctly (callers via parse/parse_line_to_hash
|
|
184
288
|
# always pass this, but direct callers may not)
|
|
289
|
+
# rubocop:disable Style/OrAssignment
|
|
185
290
|
has_quotes = line.include?(options[:quote_char]) unless has_quotes
|
|
291
|
+
# rubocop:enable Style/OrAssignment
|
|
186
292
|
|
|
187
293
|
# Optimization #7: when line has no quotes, use String#split (C-implemented)
|
|
188
294
|
# to bypass the entire character-by-character loop.
|
|
@@ -193,6 +299,7 @@ module SmarterCSV
|
|
|
193
299
|
if header_size && header_size <= 0
|
|
194
300
|
return [[], 0]
|
|
195
301
|
end
|
|
302
|
+
|
|
196
303
|
elements = line.split(col_sep, -1) # -1 preserves trailing empty fields
|
|
197
304
|
elements = elements[0, header_size] if header_size
|
|
198
305
|
elements.map!(&:strip) if strip
|
|
@@ -210,74 +317,254 @@ module SmarterCSV
|
|
|
210
317
|
backslash_count = 0
|
|
211
318
|
in_quotes = false
|
|
212
319
|
allow_escaped_quotes = options[:quote_escaping] == :backslash
|
|
320
|
+
quote_boundary_standard = options[:quote_boundary] == :standard
|
|
321
|
+
field_started = false # for boundary tracking (standard mode only)
|
|
322
|
+
row_sep = options[:row_sep]
|
|
323
|
+
row_sep_size = row_sep.is_a?(String) ? row_sep.size : 0
|
|
213
324
|
|
|
214
325
|
# Optimization #1: for the common single-char separator, use direct
|
|
215
326
|
# character comparison instead of allocating a substring via line[i...i+n].
|
|
216
327
|
if col_sep_size == 1
|
|
217
|
-
|
|
218
|
-
|
|
328
|
+
# Optimization #13: byte-level indexing for single-char separator.
|
|
329
|
+
# col_sep and quote_char are both validated to be single-byte at option
|
|
330
|
+
# parsing time. UTF-8 multi-byte continuation bytes (0x80–0xBF) never
|
|
331
|
+
# alias ASCII delimiter bytes (0x00–0x7F), so byte scanning is safe for
|
|
332
|
+
# UTF-8 strings with ASCII delimiters — no String allocation per character.
|
|
333
|
+
col_sep_byte = col_sep.getbyte(0)
|
|
334
|
+
quote_byte = quote.getbyte(0)
|
|
335
|
+
bytesize = line.bytesize
|
|
336
|
+
row_sep_bytesize = row_sep.is_a?(String) ? row_sep.bytesize : 0
|
|
337
|
+
|
|
338
|
+
while i < bytesize
|
|
339
|
+
# Optimization #10: inside a quoted field with no backslash escaping, jump
|
|
340
|
+
# directly to the next quote character using byteindex (C-level scan).
|
|
341
|
+
# Avoids per-character Ruby iteration through long field content.
|
|
342
|
+
if in_quotes && !allow_escaped_quotes
|
|
343
|
+
next_q = if BYTEINDEX_AVAILABLE
|
|
344
|
+
line.byteindex(quote, i)
|
|
345
|
+
else
|
|
346
|
+
j = i
|
|
347
|
+
j += 1 while j < bytesize && line.getbyte(j) != quote_byte
|
|
348
|
+
j < bytesize ? j : nil
|
|
349
|
+
end
|
|
350
|
+
if next_q.nil?
|
|
351
|
+
i = bytesize # no closing quote — exit loop, return [[], -1] below
|
|
352
|
+
break
|
|
353
|
+
end
|
|
354
|
+
i = next_q # land on the quote; fall through to normal quote-handling below
|
|
355
|
+
b = quote_byte
|
|
356
|
+
|
|
357
|
+
# Optimization #12: in :standard mode, once we know the current field is
|
|
358
|
+
# unquoted (field_started && !in_quotes), remaining quotes are literal and
|
|
359
|
+
# cannot affect parser state — jump directly to the next col_sep.
|
|
360
|
+
# Mirrors Opt #10 for the unquoted side of the same trade-off.
|
|
361
|
+
elsif quote_boundary_standard && field_started && !in_quotes
|
|
362
|
+
next_sep = if BYTEINDEX_AVAILABLE
|
|
363
|
+
line.byteindex(col_sep, i)
|
|
364
|
+
else
|
|
365
|
+
j = i
|
|
366
|
+
j += 1 while j < bytesize && line.getbyte(j) != col_sep_byte
|
|
367
|
+
j < bytesize ? j : nil
|
|
368
|
+
end
|
|
369
|
+
if next_sep.nil?
|
|
370
|
+
break
|
|
371
|
+
end
|
|
372
|
+
|
|
373
|
+
i = next_sep
|
|
374
|
+
b = col_sep_byte
|
|
375
|
+
|
|
376
|
+
else
|
|
377
|
+
b = line.getbyte(i)
|
|
378
|
+
end
|
|
379
|
+
|
|
380
|
+
if b == col_sep_byte && !in_quotes
|
|
219
381
|
break if !header_size.nil? && elements.size >= header_size
|
|
220
382
|
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
383
|
+
# Optimization #15: for quoted fields, extract content directly without
|
|
384
|
+
# surrounding quotes to avoid the double allocation of byteslice + field[1..-2]
|
|
385
|
+
# inside cleanup_quotes. Safe because the line is pre-chomped and the state
|
|
386
|
+
# machine has already found and validated the closing quote.
|
|
387
|
+
field_len = i - start
|
|
388
|
+
if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(i - 1) == quote_byte
|
|
389
|
+
field = line.byteslice(start + 1, field_len - 2)
|
|
390
|
+
field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
|
|
391
|
+
field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
|
|
392
|
+
elements << field
|
|
393
|
+
else
|
|
394
|
+
field = line.byteslice(start, field_len)
|
|
395
|
+
field = cleanup_quotes(field, quote)
|
|
396
|
+
elements << (strip ? field.strip : field) # cleanup_quotes may return frozen EMPTY_STRING
|
|
397
|
+
end
|
|
224
398
|
i += 1
|
|
225
399
|
start = i
|
|
226
400
|
backslash_count = 0
|
|
401
|
+
field_started = false # reset for next field
|
|
227
402
|
else
|
|
228
|
-
if allow_escaped_quotes &&
|
|
403
|
+
if allow_escaped_quotes && b == 92 # backslash '\\'
|
|
229
404
|
backslash_count += 1
|
|
405
|
+
field_started = true if quote_boundary_standard && !in_quotes
|
|
230
406
|
else
|
|
231
|
-
if
|
|
407
|
+
if b == quote_byte
|
|
232
408
|
if !allow_escaped_quotes || backslash_count % 2 == 0
|
|
233
|
-
|
|
409
|
+
if quote_boundary_standard
|
|
410
|
+
if in_quotes
|
|
411
|
+
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
412
|
+
next_i = i + 1
|
|
413
|
+
if next_i >= bytesize ||
|
|
414
|
+
line.getbyte(next_i) == col_sep_byte ||
|
|
415
|
+
(row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
|
|
416
|
+
in_quotes = false
|
|
417
|
+
field_started = true
|
|
418
|
+
end
|
|
419
|
+
# else: quote inside quoted field → literal (handles "" doubling)
|
|
420
|
+
elsif !field_started # at field boundary: open quoted field
|
|
421
|
+
in_quotes = true
|
|
422
|
+
field_started = true
|
|
423
|
+
end
|
|
424
|
+
# else: mid-field quote → literal, no state change
|
|
425
|
+
else
|
|
426
|
+
in_quotes = !in_quotes
|
|
427
|
+
end
|
|
234
428
|
end
|
|
429
|
+
elsif quote_boundary_standard && !in_quotes && !field_started
|
|
430
|
+
# Non-quote, non-separator: mark field as started (only needs to fire once
|
|
431
|
+
# per field — Opt #12 skips the rest once this is set).
|
|
432
|
+
# rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
|
|
433
|
+
if strip && (b == 32 || b == 9) # ' ' == 32, '\t' == 9
|
|
434
|
+
start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
|
|
435
|
+
else
|
|
436
|
+
field_started = true
|
|
437
|
+
end
|
|
438
|
+
# rubocop:enable Style/MultipleComparison
|
|
235
439
|
end
|
|
236
440
|
backslash_count = 0
|
|
237
441
|
end
|
|
238
442
|
i += 1
|
|
239
443
|
end
|
|
240
444
|
end
|
|
445
|
+
|
|
446
|
+
# Unclosed quote at end of line: signal "needs more data" to the caller.
|
|
447
|
+
# The read loop will stitch the next physical line and re-parse rather than raising.
|
|
448
|
+
return [[], -1] if in_quotes
|
|
449
|
+
|
|
450
|
+
# Process the remaining field
|
|
451
|
+
if header_size.nil? || elements.size < header_size
|
|
452
|
+
# Optimization #15 (final field): same direct extraction; safe because line is pre-chomped.
|
|
453
|
+
field_len = bytesize - start
|
|
454
|
+
if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(bytesize - 1) == quote_byte
|
|
455
|
+
field = line.byteslice(start + 1, field_len - 2)
|
|
456
|
+
field.gsub!(doubled_quote(quote), quote)
|
|
457
|
+
field.strip! if strip
|
|
458
|
+
elements << field
|
|
459
|
+
else
|
|
460
|
+
field = line.byteslice(start, field_len)
|
|
461
|
+
field = cleanup_quotes(field, quote)
|
|
462
|
+
elements << (strip ? field.strip : field)
|
|
463
|
+
end
|
|
464
|
+
end
|
|
241
465
|
else
|
|
242
466
|
# Multi-char col_sep: use substring comparison (original path)
|
|
243
467
|
while i < line_size
|
|
468
|
+
# Optimization #10 (multi-char path): same skip-ahead as single-char path above.
|
|
469
|
+
if in_quotes && !allow_escaped_quotes
|
|
470
|
+
next_q = line.index(quote, i)
|
|
471
|
+
if next_q.nil?
|
|
472
|
+
i = line_size
|
|
473
|
+
break
|
|
474
|
+
end
|
|
475
|
+
i = next_q
|
|
476
|
+
end
|
|
477
|
+
|
|
478
|
+
# Optimization #12 (multi-char path): mirror of single-char path above.
|
|
479
|
+
if quote_boundary_standard && field_started && !in_quotes
|
|
480
|
+
next_sep = line.index(col_sep, i)
|
|
481
|
+
if next_sep.nil?
|
|
482
|
+
break
|
|
483
|
+
end
|
|
484
|
+
|
|
485
|
+
i = next_sep
|
|
486
|
+
end
|
|
487
|
+
|
|
244
488
|
if line[i...i+col_sep_size] == col_sep && !in_quotes
|
|
245
489
|
break if !header_size.nil? && elements.size >= header_size
|
|
246
490
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
491
|
+
# Optimization #15 (multi-char path): same direct extraction using character indexing.
|
|
492
|
+
field_len = i - start
|
|
493
|
+
if field_len >= 2 && line[start] == quote && line[i - 1] == quote
|
|
494
|
+
field = line[start + 1...i - 1]
|
|
495
|
+
field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
|
|
496
|
+
field.strip! if strip
|
|
497
|
+
elements << field
|
|
498
|
+
else
|
|
499
|
+
field = line[start...i]
|
|
500
|
+
field = cleanup_quotes(field, quote)
|
|
501
|
+
elements << (strip ? field.strip : field)
|
|
502
|
+
end
|
|
250
503
|
i += col_sep_size
|
|
251
504
|
start = i
|
|
252
505
|
backslash_count = 0
|
|
506
|
+
field_started = false # reset for next field
|
|
253
507
|
else
|
|
254
508
|
if allow_escaped_quotes && line[i] == '\\'
|
|
255
509
|
backslash_count += 1
|
|
510
|
+
field_started = true if quote_boundary_standard && !in_quotes
|
|
256
511
|
else
|
|
257
512
|
if line[i] == quote
|
|
258
513
|
if !allow_escaped_quotes || backslash_count % 2 == 0
|
|
259
|
-
|
|
514
|
+
if quote_boundary_standard
|
|
515
|
+
if in_quotes
|
|
516
|
+
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
517
|
+
next_i = i + 1
|
|
518
|
+
if next_i >= line_size ||
|
|
519
|
+
line[next_i...next_i + col_sep_size] == col_sep ||
|
|
520
|
+
(row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
|
|
521
|
+
in_quotes = false
|
|
522
|
+
field_started = true
|
|
523
|
+
end
|
|
524
|
+
# else: quote inside quoted field → literal (handles "" doubling)
|
|
525
|
+
elsif !field_started # at field boundary: open quoted field
|
|
526
|
+
in_quotes = true
|
|
527
|
+
field_started = true
|
|
528
|
+
end
|
|
529
|
+
# else: mid-field quote → literal, no state change
|
|
530
|
+
else
|
|
531
|
+
in_quotes = !in_quotes
|
|
532
|
+
end
|
|
533
|
+
end
|
|
534
|
+
elsif quote_boundary_standard && !in_quotes && !field_started
|
|
535
|
+
# rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
|
|
536
|
+
if strip && (line[i] == ' ' || line[i] == "\t")
|
|
537
|
+
start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
|
|
538
|
+
else
|
|
539
|
+
field_started = true
|
|
260
540
|
end
|
|
541
|
+
# rubocop:enable Style/MultipleComparison
|
|
261
542
|
end
|
|
262
543
|
backslash_count = 0
|
|
263
544
|
end
|
|
264
545
|
i += 1
|
|
265
546
|
end
|
|
266
547
|
end
|
|
267
|
-
end
|
|
268
|
-
|
|
269
|
-
# Check for unclosed quotes at the end of the line
|
|
270
|
-
if in_quotes
|
|
271
|
-
# :nocov:
|
|
272
|
-
raise MalformedCSV, "Unclosed quoted field detected in line: #{line}"
|
|
273
|
-
# :nocov:
|
|
274
|
-
end
|
|
275
548
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
549
|
+
# Unclosed quote at end of line: signal "needs more data" to the caller.
|
|
550
|
+
# The read loop will stitch the next physical line and re-parse rather than raising.
|
|
551
|
+
return [[], -1] if in_quotes
|
|
552
|
+
|
|
553
|
+
# Process the remaining field
|
|
554
|
+
if header_size.nil? || elements.size < header_size
|
|
555
|
+
# Optimization #15 (multi-char final field): same direct extraction; line is pre-chomped.
|
|
556
|
+
field_len = line_size - start
|
|
557
|
+
if field_len >= 2 && line[start] == quote && line[line_size - 1] == quote
|
|
558
|
+
field = line[start + 1..line_size - 2]
|
|
559
|
+
field.gsub!(doubled_quote(quote), quote)
|
|
560
|
+
field.strip! if strip
|
|
561
|
+
elements << field
|
|
562
|
+
else
|
|
563
|
+
field = line[start..-1]
|
|
564
|
+
field = cleanup_quotes(field, quote)
|
|
565
|
+
elements << (strip ? field.strip : field)
|
|
566
|
+
end
|
|
567
|
+
end
|
|
281
568
|
end
|
|
282
569
|
|
|
283
570
|
[elements, elements.size]
|