smarter_csv 1.15.2 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +68 -1
  4. data/CONTRIBUTORS.md +3 -1
  5. data/Gemfile +1 -0
  6. data/README.md +123 -27
  7. data/docs/_introduction.md +40 -24
  8. data/docs/bad_row_quarantine.md +285 -0
  9. data/docs/basic_read_api.md +151 -9
  10. data/docs/basic_write_api.md +474 -59
  11. data/docs/batch_processing.md +161 -4
  12. data/docs/column_selection.md +183 -0
  13. data/docs/data_transformations.md +162 -29
  14. data/docs/examples.md +339 -46
  15. data/docs/header_transformations.md +93 -12
  16. data/docs/header_validations.md +56 -18
  17. data/docs/history.md +117 -0
  18. data/docs/instrumentation.md +165 -0
  19. data/docs/migrating_from_csv.md +290 -0
  20. data/docs/options.md +150 -87
  21. data/docs/parsing_strategy.md +63 -1
  22. data/docs/real_world_csv.md +262 -0
  23. data/docs/releases/1.16.0/benchmarks.md +223 -0
  24. data/docs/releases/1.16.0/changes.md +272 -0
  25. data/docs/releases/1.16.0/performance_notes.md +114 -0
  26. data/docs/row_col_sep.md +14 -5
  27. data/docs/value_converters.md +193 -57
  28. data/ext/smarter_csv/extconf.rb +3 -0
  29. data/ext/smarter_csv/smarter_csv.c +1007 -71
  30. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
  31. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
  32. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
  33. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
  34. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
  35. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
  36. data/lib/smarter_csv/errors.rb +8 -0
  37. data/lib/smarter_csv/file_io.rb +1 -1
  38. data/lib/smarter_csv/hash_transformations.rb +14 -13
  39. data/lib/smarter_csv/header_transformations.rb +21 -2
  40. data/lib/smarter_csv/headers.rb +2 -1
  41. data/lib/smarter_csv/options.rb +124 -7
  42. data/lib/smarter_csv/parser.rb +362 -75
  43. data/lib/smarter_csv/reader.rb +494 -46
  44. data/lib/smarter_csv/version.rb +1 -1
  45. data/lib/smarter_csv/writer.rb +71 -19
  46. data/lib/smarter_csv.rb +95 -12
  47. data/smarter_csv.gemspec +20 -10
  48. metadata +37 -80
@@ -3,6 +3,16 @@
3
3
  module SmarterCSV
4
4
  module Parser
5
5
  EMPTY_STRING = '' # already frozen
6
+ # Optimization #13: byteindex (byte-position search) was added in Ruby 3.2.
7
+ # When available, it lets Opt #10/#12 skip-ahead use byte offsets directly —
8
+ # no conversion from byte position to character position needed.
9
+ #
10
+ # Restricted to MRI Ruby (RUBY_ENGINE == 'ruby'): JRuby and TruffleRuby implement
11
+ # byteindex but require the offset to land on a character boundary. Our byte-level
12
+ # loop advances i one byte at a time, so i can point to a UTF-8 continuation byte
13
+ # (0x80–0xBF) when Opt #10/#12 fires — which raises IndexError on those runtimes.
14
+ # The inline getbyte fallback below is correct for all Ruby implementations.
15
+ BYTEINDEX_AVAILABLE = RUBY_ENGINE == 'ruby' && String.method_defined?(:byteindex)
6
16
 
7
17
  protected
8
18
 
@@ -20,7 +30,7 @@ module SmarterCSV
20
30
 
21
31
  if options[:acceleration] && has_acceleration
22
32
  # :nocov:
23
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash)
33
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
24
34
  [elements, elements.size]
25
35
  # :nocov:
26
36
  else
@@ -31,33 +41,68 @@ module SmarterCSV
31
41
  end
32
42
 
33
43
  def parse_with_auto_fallback(line, options, header_size = nil)
34
- has_quotes = line.include?(options[:quote_char])
44
+ # Optimization #4: cache merged options hashes for :auto mode
45
+ @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
46
+ @quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
47
+
48
+ # Optimization #5: if the line contains no backslash, backslash escaping cannot
49
+ # affect parsing (a backslash only matters immediately before a quote char).
50
+ # RFC 4180 and backslash modes give identical results — skip the try-backslash
51
+ # dance and call directly with RFC options (tighter C inner loop + memchr).
52
+ # has_quotes is only needed for the Ruby fallback path — C computes it internally.
53
+ unless line.include?('\\')
54
+ if options[:acceleration] && has_acceleration
55
+ # :nocov:
56
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
57
+ return [elements, elements.size]
58
+ # :nocov:
59
+ else
60
+ has_quotes = line.include?(options[:quote_char])
61
+ return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
62
+ end
63
+ end
35
64
 
36
- begin
65
+ # Line has a backslash — try backslash-escape interpretation first.
66
+ # has_quotes only needed for Ruby fallback path.
67
+ has_quotes = line.include?(options[:quote_char]) unless options[:acceleration] && has_acceleration
68
+
69
+ result = begin
37
70
  # Try backslash-escape interpretation first
38
71
  if options[:acceleration] && has_acceleration
39
72
  # :nocov:
40
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true)
73
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
41
74
  [elements, elements.size]
42
75
  # :nocov:
43
76
  else
44
- # Optimization #4: cache merged options hashes for :auto mode
45
- @backslash_options ||= options.merge(quote_escaping: :backslash)
46
- parse_csv_line_ruby(line, @backslash_options, header_size, has_quotes)
77
+ parse_csv_line_ruby(line, @quote_escaping_backslash, header_size, has_quotes)
47
78
  end
48
79
  rescue MalformedCSV
49
- # Backslash interpretation failed — fall back to RFC 4180
80
+ # Backslash raised a hard error — fall back to RFC 4180 immediately
50
81
  if options[:acceleration] && has_acceleration
51
82
  # :nocov:
52
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false)
53
- [elements, elements.size]
83
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
84
+ return [elements, elements.size]
54
85
  # :nocov:
55
86
  else
56
- # Optimization #4: cache merged options hashes for :auto mode
57
- @rfc_options ||= options.merge(quote_escaping: :double_quotes)
58
- parse_csv_line_ruby(line, @rfc_options, header_size, has_quotes)
87
+ return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
59
88
  end
60
89
  end
90
+
91
+ # Backslash sees unclosed quote (-1): RFC may still close it (e.g. header "val\")
92
+ if result[1] == -1
93
+ rfc_result = if options[:acceleration] && has_acceleration
94
+ # :nocov:
95
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, false, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
96
+ [elements, elements.size]
97
+ # :nocov:
98
+ else
99
+ parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
100
+ end
101
+ return rfc_result unless rfc_result[1] == -1
102
+ # Both agree line is incomplete → propagate -1
103
+ end
104
+
105
+ result
61
106
  end
62
107
 
63
108
  # Parse a CSV line directly into a hash, with support for extra columns.
@@ -78,35 +123,54 @@ module SmarterCSV
78
123
  end
79
124
 
80
125
  def parse_line_to_hash_auto(line, headers, options)
81
- begin
82
- # Try backslash-escape interpretation first
83
- if options[:acceleration] && has_acceleration
84
- # :nocov:
85
- # Optimization #4: cache merged options hashes for :auto mode
86
- @backslash_options ||= options.merge(quote_escaping: :backslash)
87
- parse_line_to_hash_c(line, headers, @backslash_options)
88
- # :nocov:
89
- else
90
- has_quotes = line.include?(options[:quote_char])
91
- # Optimization #4: cache merged options hashes for :auto mode
92
- @backslash_options ||= options.merge(quote_escaping: :backslash)
93
- parse_line_to_hash_ruby(line, headers, @backslash_options, has_quotes)
126
+ # Optimization #4: cache merged options hashes for :auto mode
127
+ @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
128
+ @quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
129
+
130
+ if options[:acceleration] && has_acceleration
131
+ # C path: zero Ruby string scanning on the hot path.
132
+ # C handles Opt #5 internally — if backslash mode is requested but the line
133
+ # contains no backslash, C automatically downgrades to RFC mode in Section 5
134
+ # (enabling the memchr-inside-quotes optimisation). For unquoted lines, Section 4
135
+ # fast path is taken and allow_escaped_quotes is irrelevant anyway.
136
+ # :nocov:
137
+ result = parse_line_to_hash_c(line, headers, @quote_escaping_backslash)
138
+ if result[1] == -1 && line.include?('\\')
139
+ # Backslash mode sees unclosed quote on a line that contains a backslash.
140
+ # RFC 4180 may close it differently (e.g. "val\" is open in backslash
141
+ # mode but closed in RFC mode). Only try RFC when a backslash is present —
142
+ # if there is no backslash, both modes give identical results and the extra
143
+ # call is wasted work (common case: embedded-newline partial stitching lines).
144
+ rfc_result = parse_line_to_hash_c(line, headers, @quote_escaping_double)
145
+ return rfc_result unless rfc_result[1] == -1
146
+ # Both agree line is incomplete → propagate [nil, -1]
94
147
  end
148
+ # :nocov:
149
+ return result
150
+ end
151
+
152
+ # Ruby fallback path: explicit backslash/quote checks still needed
153
+ has_quotes = line.include?(options[:quote_char])
154
+ unless line.include?('\\')
155
+ return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
156
+ end
157
+
158
+ result = begin
159
+ parse_line_to_hash_ruby(line, headers, @quote_escaping_backslash, has_quotes)
95
160
  rescue MalformedCSV
96
- # Backslash interpretation failed — fall back to RFC 4180
97
- if options[:acceleration] && has_acceleration
98
- # :nocov:
99
- # Optimization #4: cache merged options hashes for :auto mode
100
- @rfc_options ||= options.merge(quote_escaping: :double_quotes)
101
- parse_line_to_hash_c(line, headers, @rfc_options)
102
- # :nocov:
103
- else
104
- has_quotes = line.include?(options[:quote_char])
105
- # Optimization #4: cache merged options hashes for :auto mode
106
- @rfc_options ||= options.merge(quote_escaping: :double_quotes)
107
- parse_line_to_hash_ruby(line, headers, @rfc_options, has_quotes)
108
- end
161
+ return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
109
162
  end
163
+
164
+ # Backslash path sees an unclosed quote ([nil, -1]): RFC 4180 may still close
165
+ # the field — e.g. a field ending with \" is open in backslash mode but closed
166
+ # in RFC mode. Try RFC; if it also returns -1 both agree the line is incomplete.
167
+ if result[1] == -1
168
+ rfc_result = parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
169
+ return rfc_result unless rfc_result[1] == -1
170
+ # Both interpretations agree the line is incomplete → propagate [nil, -1]
171
+ end
172
+
173
+ result
110
174
  end
111
175
 
112
176
  # Ruby implementation of parse_line_to_hash
@@ -116,14 +180,55 @@ module SmarterCSV
116
180
  # Chomp trailing row separator
117
181
  line = line.chomp(options[:row_sep]) if options[:row_sep]
118
182
 
119
- # Parse the line into values
183
+ col_sep = options[:col_sep]
184
+ strip = options[:strip_whitespace]
185
+ prefix = options[:missing_header_prefix]
186
+
187
+ # Optimization #11: for unquoted lines, build the hash in one pass directly
188
+ # from String#split — no intermediate array returned from parse_csv_line_ruby
189
+ # and no second iteration to convert array → hash. Saves one Array allocation
190
+ # + one full-row iteration per row (most impactful on wide-column files).
191
+ #
192
+ # Optimization #14: when remove_empty_values is set (default: true), skip
193
+ # empty fields inline during hash building instead of inserting them and
194
+ # deleting later in hash_transformations. With strip_whitespace: true
195
+ # (default), v.empty? after strip catches both empty and whitespace-only
196
+ # fields without a regex. Most impactful on sparse files (many empty fields).
197
+ unless has_quotes || col_sep == ' '
198
+ fields = line.split(col_sep, -1)
199
+ n = fields.size
200
+
201
+ if options[:remove_empty_hashes]
202
+ all_blank = fields.empty? || fields.all? { |v| v.strip.empty? }
203
+ return [nil, n] if all_blank
204
+ end
205
+
206
+ # Batch-strip using C-level map! — faster than per-element strip inside the loop
207
+ fields.map!(&:strip) if strip
208
+
209
+ remove_empty = options[:remove_empty_values]
210
+ hash = {}
211
+ fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
212
+ next if remove_empty && v.empty?
213
+ hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = v
214
+ end
215
+
216
+ unless remove_empty
217
+ (n...headers.size).each { |i| hash[headers[i]] = nil }
218
+ end
219
+
220
+ return [hash, n]
221
+ end
222
+
223
+ # Quoted/complex path: parse into elements array, then build hash.
120
224
  elements, data_size = parse_csv_line_ruby(line, options, nil, has_quotes)
225
+ return [nil, -1] if data_size == -1 # unclosed quote at EOL → caller stitches next line
121
226
 
122
227
  # Optimization #6: elements are always String or nil from parse_csv_line_ruby,
123
228
  # so .to_s is unnecessary. If strip_whitespace is on, fields are already
124
229
  # stripped, so .strip is also redundant — just check .empty?.
125
230
  if options[:remove_empty_hashes]
126
- all_blank = if options[:strip_whitespace]
231
+ all_blank = if strip
127
232
  elements.empty? || elements.all? { |v| v.nil? || v.empty? }
128
233
  else
129
234
  elements.empty? || elements.all? { |v| v.nil? || v.strip.empty? }
@@ -131,22 +236,21 @@ module SmarterCSV
131
236
  return [nil, data_size] if all_blank
132
237
  end
133
238
 
134
- # Build the hash - only include keys for values that exist
239
+ # Build the hash — integer-index while loop avoids enumerator overhead vs each_with_index
240
+ n = elements.size
135
241
  hash = {}
136
- elements.each_with_index do |value, i|
137
- key = if i < headers.size
138
- headers[i]
139
- else
140
- "#{options[:missing_header_prefix]}#{i + 1}".to_sym
141
- end
142
- hash[key] = value
242
+ i = 0
243
+ while i < n
244
+ hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = elements[i]
245
+ i += 1
143
246
  end
144
247
 
145
248
  # Add nil for missing columns only when remove_empty_values is false
146
249
  # (when true, nils would be removed anyway by hash_transformations)
147
250
  unless options[:remove_empty_values]
148
- (elements.size...headers.size).each do |i|
251
+ while i < headers.size
149
252
  hash[headers[i]] = nil
253
+ i += 1
150
254
  end
151
255
  end
152
256
 
@@ -182,7 +286,9 @@ module SmarterCSV
182
286
 
183
287
  # Ensure has_quotes is set correctly (callers via parse/parse_line_to_hash
184
288
  # always pass this, but direct callers may not)
289
+ # rubocop:disable Style/OrAssignment
185
290
  has_quotes = line.include?(options[:quote_char]) unless has_quotes
291
+ # rubocop:enable Style/OrAssignment
186
292
 
187
293
  # Optimization #7: when line has no quotes, use String#split (C-implemented)
188
294
  # to bypass the entire character-by-character loop.
@@ -193,6 +299,7 @@ module SmarterCSV
193
299
  if header_size && header_size <= 0
194
300
  return [[], 0]
195
301
  end
302
+
196
303
  elements = line.split(col_sep, -1) # -1 preserves trailing empty fields
197
304
  elements = elements[0, header_size] if header_size
198
305
  elements.map!(&:strip) if strip
@@ -210,74 +317,254 @@ module SmarterCSV
210
317
  backslash_count = 0
211
318
  in_quotes = false
212
319
  allow_escaped_quotes = options[:quote_escaping] == :backslash
320
+ quote_boundary_standard = options[:quote_boundary] == :standard
321
+ field_started = false # for boundary tracking (standard mode only)
322
+ row_sep = options[:row_sep]
323
+ row_sep_size = row_sep.is_a?(String) ? row_sep.size : 0
213
324
 
214
325
  # Optimization #1: for the common single-char separator, use direct
215
326
  # character comparison instead of allocating a substring via line[i...i+n].
216
327
  if col_sep_size == 1
217
- while i < line_size
218
- if line[i] == col_sep && !in_quotes
328
+ # Optimization #13: byte-level indexing for single-char separator.
329
+ # col_sep and quote_char are both validated to be single-byte at option
330
+ # parsing time. UTF-8 multi-byte continuation bytes (0x80–0xBF) never
331
+ # alias ASCII delimiter bytes (0x00–0x7F), so byte scanning is safe for
332
+ # UTF-8 strings with ASCII delimiters — no String allocation per character.
333
+ col_sep_byte = col_sep.getbyte(0)
334
+ quote_byte = quote.getbyte(0)
335
+ bytesize = line.bytesize
336
+ row_sep_bytesize = row_sep.is_a?(String) ? row_sep.bytesize : 0
337
+
338
+ while i < bytesize
339
+ # Optimization #10: inside a quoted field with no backslash escaping, jump
340
+ # directly to the next quote character using byteindex (C-level scan).
341
+ # Avoids per-character Ruby iteration through long field content.
342
+ if in_quotes && !allow_escaped_quotes
343
+ next_q = if BYTEINDEX_AVAILABLE
344
+ line.byteindex(quote, i)
345
+ else
346
+ j = i
347
+ j += 1 while j < bytesize && line.getbyte(j) != quote_byte
348
+ j < bytesize ? j : nil
349
+ end
350
+ if next_q.nil?
351
+ i = bytesize # no closing quote — exit loop, return [[], -1] below
352
+ break
353
+ end
354
+ i = next_q # land on the quote; fall through to normal quote-handling below
355
+ b = quote_byte
356
+
357
+ # Optimization #12: in :standard mode, once we know the current field is
358
+ # unquoted (field_started && !in_quotes), remaining quotes are literal and
359
+ # cannot affect parser state — jump directly to the next col_sep.
360
+ # Mirrors Opt #10 for the unquoted side of the same trade-off.
361
+ elsif quote_boundary_standard && field_started && !in_quotes
362
+ next_sep = if BYTEINDEX_AVAILABLE
363
+ line.byteindex(col_sep, i)
364
+ else
365
+ j = i
366
+ j += 1 while j < bytesize && line.getbyte(j) != col_sep_byte
367
+ j < bytesize ? j : nil
368
+ end
369
+ if next_sep.nil?
370
+ break
371
+ end
372
+
373
+ i = next_sep
374
+ b = col_sep_byte
375
+
376
+ else
377
+ b = line.getbyte(i)
378
+ end
379
+
380
+ if b == col_sep_byte && !in_quotes
219
381
  break if !header_size.nil? && elements.size >= header_size
220
382
 
221
- field = line[start...i]
222
- field = cleanup_quotes(field, quote)
223
- elements << (strip ? field.strip : field)
383
+ # Optimization #15: for quoted fields, extract content directly without
384
+ # surrounding quotes to avoid the double allocation of byteslice + field[1..-2]
385
+ # inside cleanup_quotes. Safe because the line is pre-chomped and the state
386
+ # machine has already found and validated the closing quote.
387
+ field_len = i - start
388
+ if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(i - 1) == quote_byte
389
+ field = line.byteslice(start + 1, field_len - 2)
390
+ field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
391
+ field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
392
+ elements << field
393
+ else
394
+ field = line.byteslice(start, field_len)
395
+ field = cleanup_quotes(field, quote)
396
+ elements << (strip ? field.strip : field) # cleanup_quotes may return frozen EMPTY_STRING
397
+ end
224
398
  i += 1
225
399
  start = i
226
400
  backslash_count = 0
401
+ field_started = false # reset for next field
227
402
  else
228
- if allow_escaped_quotes && line[i] == '\\'
403
+ if allow_escaped_quotes && b == 92 # backslash '\\'
229
404
  backslash_count += 1
405
+ field_started = true if quote_boundary_standard && !in_quotes
230
406
  else
231
- if line[i] == quote
407
+ if b == quote_byte
232
408
  if !allow_escaped_quotes || backslash_count % 2 == 0
233
- in_quotes = !in_quotes
409
+ if quote_boundary_standard
410
+ if in_quotes
411
+ # closing quote: only valid if followed by col_sep, row_sep, or end of line
412
+ next_i = i + 1
413
+ if next_i >= bytesize ||
414
+ line.getbyte(next_i) == col_sep_byte ||
415
+ (row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
416
+ in_quotes = false
417
+ field_started = true
418
+ end
419
+ # else: quote inside quoted field → literal (handles "" doubling)
420
+ elsif !field_started # at field boundary: open quoted field
421
+ in_quotes = true
422
+ field_started = true
423
+ end
424
+ # else: mid-field quote → literal, no state change
425
+ else
426
+ in_quotes = !in_quotes
427
+ end
234
428
  end
429
+ elsif quote_boundary_standard && !in_quotes && !field_started
430
+ # Non-quote, non-separator: mark field as started (only needs to fire once
431
+ # per field — Opt #12 skips the rest once this is set).
432
+ # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
433
+ if strip && (b == 32 || b == 9) # ' ' == 32, '\t' == 9
434
+ start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
435
+ else
436
+ field_started = true
437
+ end
438
+ # rubocop:enable Style/MultipleComparison
235
439
  end
236
440
  backslash_count = 0
237
441
  end
238
442
  i += 1
239
443
  end
240
444
  end
445
+
446
+ # Unclosed quote at end of line: signal "needs more data" to the caller.
447
+ # The read loop will stitch the next physical line and re-parse rather than raising.
448
+ return [[], -1] if in_quotes
449
+
450
+ # Process the remaining field
451
+ if header_size.nil? || elements.size < header_size
452
+ # Optimization #15 (final field): same direct extraction; safe because line is pre-chomped.
453
+ field_len = bytesize - start
454
+ if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(bytesize - 1) == quote_byte
455
+ field = line.byteslice(start + 1, field_len - 2)
456
+ field.gsub!(doubled_quote(quote), quote)
457
+ field.strip! if strip
458
+ elements << field
459
+ else
460
+ field = line.byteslice(start, field_len)
461
+ field = cleanup_quotes(field, quote)
462
+ elements << (strip ? field.strip : field)
463
+ end
464
+ end
241
465
  else
242
466
  # Multi-char col_sep: use substring comparison (original path)
243
467
  while i < line_size
468
+ # Optimization #10 (multi-char path): same skip-ahead as single-char path above.
469
+ if in_quotes && !allow_escaped_quotes
470
+ next_q = line.index(quote, i)
471
+ if next_q.nil?
472
+ i = line_size
473
+ break
474
+ end
475
+ i = next_q
476
+ end
477
+
478
+ # Optimization #12 (multi-char path): mirror of single-char path above.
479
+ if quote_boundary_standard && field_started && !in_quotes
480
+ next_sep = line.index(col_sep, i)
481
+ if next_sep.nil?
482
+ break
483
+ end
484
+
485
+ i = next_sep
486
+ end
487
+
244
488
  if line[i...i+col_sep_size] == col_sep && !in_quotes
245
489
  break if !header_size.nil? && elements.size >= header_size
246
490
 
247
- field = line[start...i]
248
- field = cleanup_quotes(field, quote)
249
- elements << (strip ? field.strip : field)
491
+ # Optimization #15 (multi-char path): same direct extraction using character indexing.
492
+ field_len = i - start
493
+ if field_len >= 2 && line[start] == quote && line[i - 1] == quote
494
+ field = line[start + 1...i - 1]
495
+ field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
496
+ field.strip! if strip
497
+ elements << field
498
+ else
499
+ field = line[start...i]
500
+ field = cleanup_quotes(field, quote)
501
+ elements << (strip ? field.strip : field)
502
+ end
250
503
  i += col_sep_size
251
504
  start = i
252
505
  backslash_count = 0
506
+ field_started = false # reset for next field
253
507
  else
254
508
  if allow_escaped_quotes && line[i] == '\\'
255
509
  backslash_count += 1
510
+ field_started = true if quote_boundary_standard && !in_quotes
256
511
  else
257
512
  if line[i] == quote
258
513
  if !allow_escaped_quotes || backslash_count % 2 == 0
259
- in_quotes = !in_quotes
514
+ if quote_boundary_standard
515
+ if in_quotes
516
+ # closing quote: only valid if followed by col_sep, row_sep, or end of line
517
+ next_i = i + 1
518
+ if next_i >= line_size ||
519
+ line[next_i...next_i + col_sep_size] == col_sep ||
520
+ (row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
521
+ in_quotes = false
522
+ field_started = true
523
+ end
524
+ # else: quote inside quoted field → literal (handles "" doubling)
525
+ elsif !field_started # at field boundary: open quoted field
526
+ in_quotes = true
527
+ field_started = true
528
+ end
529
+ # else: mid-field quote → literal, no state change
530
+ else
531
+ in_quotes = !in_quotes
532
+ end
533
+ end
534
+ elsif quote_boundary_standard && !in_quotes && !field_started
535
+ # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
536
+ if strip && (line[i] == ' ' || line[i] == "\t")
537
+ start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
538
+ else
539
+ field_started = true
260
540
  end
541
+ # rubocop:enable Style/MultipleComparison
261
542
  end
262
543
  backslash_count = 0
263
544
  end
264
545
  i += 1
265
546
  end
266
547
  end
267
- end
268
-
269
- # Check for unclosed quotes at the end of the line
270
- if in_quotes
271
- # :nocov:
272
- raise MalformedCSV, "Unclosed quoted field detected in line: #{line}"
273
- # :nocov:
274
- end
275
548
 
276
- # Process the remaining field
277
- if header_size.nil? || elements.size < header_size
278
- field = line[start..-1]
279
- field = cleanup_quotes(field, quote)
280
- elements << (strip ? field.strip : field)
549
+ # Unclosed quote at end of line: signal "needs more data" to the caller.
550
+ # The read loop will stitch the next physical line and re-parse rather than raising.
551
+ return [[], -1] if in_quotes
552
+
553
+ # Process the remaining field
554
+ if header_size.nil? || elements.size < header_size
555
+ # Optimization #15 (multi-char final field): same direct extraction; line is pre-chomped.
556
+ field_len = line_size - start
557
+ if field_len >= 2 && line[start] == quote && line[line_size - 1] == quote
558
+ field = line[start + 1..line_size - 2]
559
+ field.gsub!(doubled_quote(quote), quote)
560
+ field.strip! if strip
561
+ elements << field
562
+ else
563
+ field = line[start..-1]
564
+ field = cleanup_quotes(field, quote)
565
+ elements << (strip ? field.strip : field)
566
+ end
567
+ end
281
568
  end
282
569
 
283
570
  [elements, elements.size]