smarter_csv 1.15.2 → 1.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. checksums.yaml +4 -4
  2. data/.rspec +2 -0
  3. data/.rubocop.yml +9 -0
  4. data/CHANGELOG.md +112 -1
  5. data/CONTRIBUTORS.md +4 -1
  6. data/Gemfile +1 -0
  7. data/README.md +129 -27
  8. data/docs/_introduction.md +45 -24
  9. data/docs/bad_row_quarantine.md +342 -0
  10. data/docs/basic_read_api.md +152 -9
  11. data/docs/basic_write_api.md +475 -59
  12. data/docs/batch_processing.md +162 -4
  13. data/docs/column_selection.md +184 -0
  14. data/docs/data_transformations.md +163 -29
  15. data/docs/examples.md +340 -46
  16. data/docs/header_transformations.md +94 -12
  17. data/docs/header_validations.md +57 -18
  18. data/docs/history.md +119 -0
  19. data/docs/instrumentation.md +166 -0
  20. data/docs/migrating_from_csv.md +565 -0
  21. data/docs/options.md +151 -87
  22. data/docs/parsing_strategy.md +64 -1
  23. data/docs/real_world_csv.md +263 -0
  24. data/docs/releases/1.16.0/benchmarks.md +223 -0
  25. data/docs/releases/1.16.0/changes.md +273 -0
  26. data/docs/releases/1.16.0/performance_notes.md +114 -0
  27. data/docs/row_col_sep.md +15 -5
  28. data/docs/ruby_csv_pitfalls.md +514 -0
  29. data/docs/value_converters.md +194 -57
  30. data/ext/smarter_csv/extconf.rb +3 -0
  31. data/ext/smarter_csv/smarter_csv.c +1017 -82
  32. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
  33. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
  34. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
  35. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
  36. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
  37. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
  38. data/lib/smarter_csv/errors.rb +8 -0
  39. data/lib/smarter_csv/file_io.rb +1 -1
  40. data/lib/smarter_csv/hash_transformations.rb +14 -13
  41. data/lib/smarter_csv/header_transformations.rb +21 -2
  42. data/lib/smarter_csv/headers.rb +2 -1
  43. data/lib/smarter_csv/options.rb +124 -7
  44. data/lib/smarter_csv/parser.rb +358 -74
  45. data/lib/smarter_csv/reader.rb +494 -46
  46. data/lib/smarter_csv/version.rb +1 -1
  47. data/lib/smarter_csv/writer.rb +71 -19
  48. data/lib/smarter_csv.rb +134 -13
  49. data/smarter_csv.gemspec +20 -10
  50. metadata +38 -80
@@ -3,6 +3,16 @@
3
3
  module SmarterCSV
4
4
  module Parser
5
5
  EMPTY_STRING = '' # already frozen
6
+ # Optimization #13: byteindex (byte-position search) was added in Ruby 3.2.
7
+ # When available, it lets Opt #10/#12 skip-ahead use byte offsets directly —
8
+ # no conversion from byte position to character position needed.
9
+ #
10
+ # Restricted to MRI Ruby (RUBY_ENGINE == 'ruby'): JRuby and TruffleRuby implement
11
+ # byteindex but require the offset to land on a character boundary. Our byte-level
12
+ # loop advances i one byte at a time, so i can point to a UTF-8 continuation byte
13
+ # (0x80–0xBF) when Opt #10/#12 fires — which raises IndexError on those runtimes.
14
+ # The inline getbyte fallback below is correct for all Ruby implementations.
15
+ BYTEINDEX_AVAILABLE = RUBY_ENGINE == 'ruby' && String.method_defined?(:byteindex)
6
16
 
7
17
  protected
8
18
 
@@ -20,7 +30,7 @@ module SmarterCSV
20
30
 
21
31
  if options[:acceleration] && has_acceleration
22
32
  # :nocov:
23
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash)
33
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
24
34
  [elements, elements.size]
25
35
  # :nocov:
26
36
  else
@@ -31,33 +41,65 @@ module SmarterCSV
31
41
  end
32
42
 
33
43
  def parse_with_auto_fallback(line, options, header_size = nil)
44
+ # Optimization #4: cache merged options hashes for :auto mode
45
+ @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
46
+ @quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
47
+
48
+ # Optimization #5: if the line contains no backslash, backslash escaping cannot
49
+ # affect parsing (a backslash only matters immediately before a quote char).
50
+ # RFC 4180 and backslash modes give identical results — skip the try-backslash
51
+ # dance and call directly with RFC options (tighter C inner loop + memchr).
34
52
  has_quotes = line.include?(options[:quote_char])
53
+ unless line.include?('\\')
54
+ if options[:acceleration] && has_acceleration
55
+ # :nocov:
56
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
57
+ return [elements, elements.size]
58
+ # :nocov:
59
+ else
60
+ return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
61
+ end
62
+ end
63
+
64
+ # Line has a backslash — try backslash-escape interpretation first.
35
65
 
36
- begin
66
+ result = begin
37
67
  # Try backslash-escape interpretation first
38
68
  if options[:acceleration] && has_acceleration
39
69
  # :nocov:
40
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true)
70
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
41
71
  [elements, elements.size]
42
72
  # :nocov:
43
73
  else
44
- # Optimization #4: cache merged options hashes for :auto mode
45
- @backslash_options ||= options.merge(quote_escaping: :backslash)
46
- parse_csv_line_ruby(line, @backslash_options, header_size, has_quotes)
74
+ parse_csv_line_ruby(line, @quote_escaping_backslash, header_size, has_quotes)
47
75
  end
48
76
  rescue MalformedCSV
49
- # Backslash interpretation failed — fall back to RFC 4180
77
+ # Backslash raised a hard error — fall back to RFC 4180 immediately
50
78
  if options[:acceleration] && has_acceleration
51
79
  # :nocov:
52
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false)
53
- [elements, elements.size]
80
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
81
+ return [elements, elements.size]
54
82
  # :nocov:
55
83
  else
56
- # Optimization #4: cache merged options hashes for :auto mode
57
- @rfc_options ||= options.merge(quote_escaping: :double_quotes)
58
- parse_csv_line_ruby(line, @rfc_options, header_size, has_quotes)
84
+ return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
59
85
  end
60
86
  end
87
+
88
+ # Backslash sees unclosed quote (-1): RFC may still close it (e.g. header "val\")
89
+ if result[1] == -1
90
+ rfc_result = if options[:acceleration] && has_acceleration
91
+ # :nocov:
92
+ elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
93
+ [elements, elements.size]
94
+ # :nocov:
95
+ else
96
+ parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
97
+ end
98
+ return rfc_result unless rfc_result[1] == -1
99
+ # Both agree line is incomplete → propagate -1
100
+ end
101
+
102
+ result
61
103
  end
62
104
 
63
105
  # Parse a CSV line directly into a hash, with support for extra columns.
@@ -78,35 +120,54 @@ module SmarterCSV
78
120
  end
79
121
 
80
122
  def parse_line_to_hash_auto(line, headers, options)
81
- begin
82
- # Try backslash-escape interpretation first
83
- if options[:acceleration] && has_acceleration
84
- # :nocov:
85
- # Optimization #4: cache merged options hashes for :auto mode
86
- @backslash_options ||= options.merge(quote_escaping: :backslash)
87
- parse_line_to_hash_c(line, headers, @backslash_options)
88
- # :nocov:
89
- else
90
- has_quotes = line.include?(options[:quote_char])
91
- # Optimization #4: cache merged options hashes for :auto mode
92
- @backslash_options ||= options.merge(quote_escaping: :backslash)
93
- parse_line_to_hash_ruby(line, headers, @backslash_options, has_quotes)
123
+ # Optimization #4: cache merged options hashes for :auto mode
124
+ @quote_escaping_backslash ||= options.merge(quote_escaping: :backslash)
125
+ @quote_escaping_double ||= options.merge(quote_escaping: :double_quotes)
126
+
127
+ if options[:acceleration] && has_acceleration
128
+ # C path: zero Ruby string scanning on the hot path.
129
+ # C handles Opt #5 internally — if backslash mode is requested but the line
130
+ # contains no backslash, C automatically downgrades to RFC mode in Section 5
131
+ # (enabling the memchr-inside-quotes optimisation). For unquoted lines, Section 4
132
+ # fast path is taken and allow_escaped_quotes is irrelevant anyway.
133
+ # :nocov:
134
+ result = parse_line_to_hash_c(line, headers, @quote_escaping_backslash)
135
+ if result[1] == -1 && line.include?('\\')
136
+ # Backslash mode sees unclosed quote on a line that contains a backslash.
137
+ # RFC 4180 may close it differently (e.g. "val\" is open in backslash
138
+ # mode but closed in RFC mode). Only try RFC when a backslash is present —
139
+ # if there is no backslash, both modes give identical results and the extra
140
+ # call is wasted work (common case: embedded-newline partial stitching lines).
141
+ rfc_result = parse_line_to_hash_c(line, headers, @quote_escaping_double)
142
+ return rfc_result unless rfc_result[1] == -1
143
+ # Both agree line is incomplete → propagate [nil, -1]
94
144
  end
145
+ # :nocov:
146
+ return result
147
+ end
148
+
149
+ # Ruby fallback path: explicit backslash/quote checks still needed
150
+ has_quotes = line.include?(options[:quote_char])
151
+ unless line.include?('\\')
152
+ return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
153
+ end
154
+
155
+ result = begin
156
+ parse_line_to_hash_ruby(line, headers, @quote_escaping_backslash, has_quotes)
95
157
  rescue MalformedCSV
96
- # Backslash interpretation failed — fall back to RFC 4180
97
- if options[:acceleration] && has_acceleration
98
- # :nocov:
99
- # Optimization #4: cache merged options hashes for :auto mode
100
- @rfc_options ||= options.merge(quote_escaping: :double_quotes)
101
- parse_line_to_hash_c(line, headers, @rfc_options)
102
- # :nocov:
103
- else
104
- has_quotes = line.include?(options[:quote_char])
105
- # Optimization #4: cache merged options hashes for :auto mode
106
- @rfc_options ||= options.merge(quote_escaping: :double_quotes)
107
- parse_line_to_hash_ruby(line, headers, @rfc_options, has_quotes)
108
- end
158
+ return parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
159
+ end
160
+
161
+ # Backslash path sees an unclosed quote ([nil, -1]): RFC 4180 may still close
162
+ # the field — e.g. a field ending with \" is open in backslash mode but closed
163
+ # in RFC mode. Try RFC; if it also returns -1 both agree the line is incomplete.
164
+ if result[1] == -1
165
+ rfc_result = parse_line_to_hash_ruby(line, headers, @quote_escaping_double, has_quotes)
166
+ return rfc_result unless rfc_result[1] == -1
167
+ # Both interpretations agree the line is incomplete → propagate [nil, -1]
109
168
  end
169
+
170
+ result
110
171
  end
111
172
 
112
173
  # Ruby implementation of parse_line_to_hash
@@ -116,14 +177,55 @@ module SmarterCSV
116
177
  # Chomp trailing row separator
117
178
  line = line.chomp(options[:row_sep]) if options[:row_sep]
118
179
 
119
- # Parse the line into values
180
+ col_sep = options[:col_sep]
181
+ strip = options[:strip_whitespace]
182
+ prefix = options[:missing_header_prefix]
183
+
184
+ # Optimization #11: for unquoted lines, build the hash in one pass directly
185
+ # from String#split — no intermediate array returned from parse_csv_line_ruby
186
+ # and no second iteration to convert array → hash. Saves one Array allocation
187
+ # + one full-row iteration per row (most impactful on wide-column files).
188
+ #
189
+ # Optimization #14: when remove_empty_values is set (default: true), skip
190
+ # empty fields inline during hash building instead of inserting them and
191
+ # deleting later in hash_transformations. With strip_whitespace: true
192
+ # (default), v.empty? after strip catches both empty and whitespace-only
193
+ # fields without a regex. Most impactful on sparse files (many empty fields).
194
+ unless has_quotes || col_sep == ' '
195
+ fields = line.split(col_sep, -1)
196
+ n = fields.size
197
+
198
+ if options[:remove_empty_hashes]
199
+ all_blank = fields.empty? || fields.all? { |v| v.strip.empty? }
200
+ return [nil, n] if all_blank
201
+ end
202
+
203
+ # Batch-strip using C-level map! — faster than per-element strip inside the loop
204
+ fields.map!(&:strip) if strip
205
+
206
+ remove_empty = options[:remove_empty_values]
207
+ hash = {}
208
+ fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
209
+ next if remove_empty && v.empty?
210
+ hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = v
211
+ end
212
+
213
+ unless remove_empty
214
+ (n...headers.size).each { |i| hash[headers[i]] = nil }
215
+ end
216
+
217
+ return [hash, n]
218
+ end
219
+
220
+ # Quoted/complex path: parse into elements array, then build hash.
120
221
  elements, data_size = parse_csv_line_ruby(line, options, nil, has_quotes)
222
+ return [nil, -1] if data_size == -1 # unclosed quote at EOL → caller stitches next line
121
223
 
122
224
  # Optimization #6: elements are always String or nil from parse_csv_line_ruby,
123
225
  # so .to_s is unnecessary. If strip_whitespace is on, fields are already
124
226
  # stripped, so .strip is also redundant — just check .empty?.
125
227
  if options[:remove_empty_hashes]
126
- all_blank = if options[:strip_whitespace]
228
+ all_blank = if strip
127
229
  elements.empty? || elements.all? { |v| v.nil? || v.empty? }
128
230
  else
129
231
  elements.empty? || elements.all? { |v| v.nil? || v.strip.empty? }
@@ -131,22 +233,21 @@ module SmarterCSV
131
233
  return [nil, data_size] if all_blank
132
234
  end
133
235
 
134
- # Build the hash - only include keys for values that exist
236
+ # Build the hash — integer-index while loop avoids enumerator overhead vs each_with_index
237
+ n = elements.size
135
238
  hash = {}
136
- elements.each_with_index do |value, i|
137
- key = if i < headers.size
138
- headers[i]
139
- else
140
- "#{options[:missing_header_prefix]}#{i + 1}".to_sym
141
- end
142
- hash[key] = value
239
+ i = 0
240
+ while i < n
241
+ hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = elements[i]
242
+ i += 1
143
243
  end
144
244
 
145
245
  # Add nil for missing columns only when remove_empty_values is false
146
246
  # (when true, nils would be removed anyway by hash_transformations)
147
247
  unless options[:remove_empty_values]
148
- (elements.size...headers.size).each do |i|
248
+ while i < headers.size
149
249
  hash[headers[i]] = nil
250
+ i += 1
150
251
  end
151
252
  end
152
253
 
@@ -182,7 +283,9 @@ module SmarterCSV
182
283
 
183
284
  # Ensure has_quotes is set correctly (callers via parse/parse_line_to_hash
184
285
  # always pass this, but direct callers may not)
286
+ # rubocop:disable Style/OrAssignment
185
287
  has_quotes = line.include?(options[:quote_char]) unless has_quotes
288
+ # rubocop:enable Style/OrAssignment
186
289
 
187
290
  # Optimization #7: when line has no quotes, use String#split (C-implemented)
188
291
  # to bypass the entire character-by-character loop.
@@ -193,6 +296,7 @@ module SmarterCSV
193
296
  if header_size && header_size <= 0
194
297
  return [[], 0]
195
298
  end
299
+
196
300
  elements = line.split(col_sep, -1) # -1 preserves trailing empty fields
197
301
  elements = elements[0, header_size] if header_size
198
302
  elements.map!(&:strip) if strip
@@ -210,74 +314,254 @@ module SmarterCSV
210
314
  backslash_count = 0
211
315
  in_quotes = false
212
316
  allow_escaped_quotes = options[:quote_escaping] == :backslash
317
+ quote_boundary_standard = options[:quote_boundary] == :standard
318
+ field_started = false # for boundary tracking (standard mode only)
319
+ row_sep = options[:row_sep]
320
+ row_sep_size = row_sep.is_a?(String) ? row_sep.size : 0
213
321
 
214
322
  # Optimization #1: for the common single-char separator, use direct
215
323
  # character comparison instead of allocating a substring via line[i...i+n].
216
324
  if col_sep_size == 1
217
- while i < line_size
218
- if line[i] == col_sep && !in_quotes
325
+ # Optimization #13: byte-level indexing for single-char separator.
326
+ # col_sep and quote_char are both validated to be single-byte at option
327
+ # parsing time. UTF-8 multi-byte continuation bytes (0x80–0xBF) never
328
+ # alias ASCII delimiter bytes (0x00–0x7F), so byte scanning is safe for
329
+ # UTF-8 strings with ASCII delimiters — no String allocation per character.
330
+ col_sep_byte = col_sep.getbyte(0)
331
+ quote_byte = quote.getbyte(0)
332
+ bytesize = line.bytesize
333
+ row_sep_bytesize = row_sep.is_a?(String) ? row_sep.bytesize : 0
334
+
335
+ while i < bytesize
336
+ # Optimization #10: inside a quoted field with no backslash escaping, jump
337
+ # directly to the next quote character using byteindex (C-level scan).
338
+ # Avoids per-character Ruby iteration through long field content.
339
+ if in_quotes && !allow_escaped_quotes
340
+ next_q = if BYTEINDEX_AVAILABLE
341
+ line.byteindex(quote, i)
342
+ else
343
+ j = i
344
+ j += 1 while j < bytesize && line.getbyte(j) != quote_byte
345
+ j < bytesize ? j : nil
346
+ end
347
+ if next_q.nil?
348
+ i = bytesize # no closing quote — exit loop, return [[], -1] below
349
+ break
350
+ end
351
+ i = next_q # land on the quote; fall through to normal quote-handling below
352
+ b = quote_byte
353
+
354
+ # Optimization #12: in :standard mode, once we know the current field is
355
+ # unquoted (field_started && !in_quotes), remaining quotes are literal and
356
+ # cannot affect parser state — jump directly to the next col_sep.
357
+ # Mirrors Opt #10 for the unquoted side of the same trade-off.
358
+ elsif quote_boundary_standard && field_started && !in_quotes
359
+ next_sep = if BYTEINDEX_AVAILABLE
360
+ line.byteindex(col_sep, i)
361
+ else
362
+ j = i
363
+ j += 1 while j < bytesize && line.getbyte(j) != col_sep_byte
364
+ j < bytesize ? j : nil
365
+ end
366
+ if next_sep.nil?
367
+ break
368
+ end
369
+
370
+ i = next_sep
371
+ b = col_sep_byte
372
+
373
+ else
374
+ b = line.getbyte(i)
375
+ end
376
+
377
+ if b == col_sep_byte && !in_quotes
219
378
  break if !header_size.nil? && elements.size >= header_size
220
379
 
221
- field = line[start...i]
222
- field = cleanup_quotes(field, quote)
223
- elements << (strip ? field.strip : field)
380
+ # Optimization #15: for quoted fields, extract content directly without
381
+ # surrounding quotes to avoid the double allocation of byteslice + field[1..-2]
382
+ # inside cleanup_quotes. Safe because the line is pre-chomped and the state
383
+ # machine has already found and validated the closing quote.
384
+ field_len = i - start
385
+ if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(i - 1) == quote_byte
386
+ field = line.byteslice(start + 1, field_len - 2)
387
+ field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
388
+ field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
389
+ elements << field
390
+ else
391
+ field = line.byteslice(start, field_len)
392
+ field = cleanup_quotes(field, quote)
393
+ elements << (strip ? field.strip : field) # cleanup_quotes may return frozen EMPTY_STRING
394
+ end
224
395
  i += 1
225
396
  start = i
226
397
  backslash_count = 0
398
+ field_started = false # reset for next field
227
399
  else
228
- if allow_escaped_quotes && line[i] == '\\'
400
+ if allow_escaped_quotes && b == 92 # backslash '\\'
229
401
  backslash_count += 1
402
+ field_started = true if quote_boundary_standard && !in_quotes
230
403
  else
231
- if line[i] == quote
404
+ if b == quote_byte
232
405
  if !allow_escaped_quotes || backslash_count % 2 == 0
233
- in_quotes = !in_quotes
406
+ if quote_boundary_standard
407
+ if in_quotes
408
+ # closing quote: only valid if followed by col_sep, row_sep, or end of line
409
+ next_i = i + 1
410
+ if next_i >= bytesize ||
411
+ line.getbyte(next_i) == col_sep_byte ||
412
+ (row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
413
+ in_quotes = false
414
+ field_started = true
415
+ end
416
+ # else: quote inside quoted field → literal (handles "" doubling)
417
+ elsif !field_started # at field boundary: open quoted field
418
+ in_quotes = true
419
+ field_started = true
420
+ end
421
+ # else: mid-field quote → literal, no state change
422
+ else
423
+ in_quotes = !in_quotes
424
+ end
425
+ end
426
+ elsif quote_boundary_standard && !in_quotes && !field_started
427
+ # Non-quote, non-separator: mark field as started (only needs to fire once
428
+ # per field — Opt #12 skips the rest once this is set).
429
+ # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
430
+ if strip && (b == 32 || b == 9) # ' ' == 32, '\t' == 9
431
+ start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
432
+ else
433
+ field_started = true
234
434
  end
435
+ # rubocop:enable Style/MultipleComparison
235
436
  end
236
437
  backslash_count = 0
237
438
  end
238
439
  i += 1
239
440
  end
240
441
  end
442
+
443
+ # Unclosed quote at end of line: signal "needs more data" to the caller.
444
+ # The read loop will stitch the next physical line and re-parse rather than raising.
445
+ return [[], -1] if in_quotes
446
+
447
+ # Process the remaining field
448
+ if header_size.nil? || elements.size < header_size
449
+ # Optimization #15 (final field): same direct extraction; safe because line is pre-chomped.
450
+ field_len = bytesize - start
451
+ if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(bytesize - 1) == quote_byte
452
+ field = line.byteslice(start + 1, field_len - 2)
453
+ field.gsub!(doubled_quote(quote), quote)
454
+ field.strip! if strip
455
+ elements << field
456
+ else
457
+ field = line.byteslice(start, field_len)
458
+ field = cleanup_quotes(field, quote)
459
+ elements << (strip ? field.strip : field)
460
+ end
461
+ end
241
462
  else
242
463
  # Multi-char col_sep: use substring comparison (original path)
243
464
  while i < line_size
465
+ # Optimization #10 (multi-char path): same skip-ahead as single-char path above.
466
+ if in_quotes && !allow_escaped_quotes
467
+ next_q = line.index(quote, i)
468
+ if next_q.nil?
469
+ i = line_size
470
+ break
471
+ end
472
+ i = next_q
473
+ end
474
+
475
+ # Optimization #12 (multi-char path): mirror of single-char path above.
476
+ if quote_boundary_standard && field_started && !in_quotes
477
+ next_sep = line.index(col_sep, i)
478
+ if next_sep.nil?
479
+ break
480
+ end
481
+
482
+ i = next_sep
483
+ end
484
+
244
485
  if line[i...i+col_sep_size] == col_sep && !in_quotes
245
486
  break if !header_size.nil? && elements.size >= header_size
246
487
 
247
- field = line[start...i]
248
- field = cleanup_quotes(field, quote)
249
- elements << (strip ? field.strip : field)
488
+ # Optimization #15 (multi-char path): same direct extraction using character indexing.
489
+ field_len = i - start
490
+ if field_len >= 2 && line[start] == quote && line[i - 1] == quote
491
+ field = line[start + 1...i - 1]
492
+ field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
493
+ field.strip! if strip
494
+ elements << field
495
+ else
496
+ field = line[start...i]
497
+ field = cleanup_quotes(field, quote)
498
+ elements << (strip ? field.strip : field)
499
+ end
250
500
  i += col_sep_size
251
501
  start = i
252
502
  backslash_count = 0
503
+ field_started = false # reset for next field
253
504
  else
254
505
  if allow_escaped_quotes && line[i] == '\\'
255
506
  backslash_count += 1
507
+ field_started = true if quote_boundary_standard && !in_quotes
256
508
  else
257
509
  if line[i] == quote
258
510
  if !allow_escaped_quotes || backslash_count % 2 == 0
259
- in_quotes = !in_quotes
511
+ if quote_boundary_standard
512
+ if in_quotes
513
+ # closing quote: only valid if followed by col_sep, row_sep, or end of line
514
+ next_i = i + 1
515
+ if next_i >= line_size ||
516
+ line[next_i...next_i + col_sep_size] == col_sep ||
517
+ (row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
518
+ in_quotes = false
519
+ field_started = true
520
+ end
521
+ # else: quote inside quoted field → literal (handles "" doubling)
522
+ elsif !field_started # at field boundary: open quoted field
523
+ in_quotes = true
524
+ field_started = true
525
+ end
526
+ # else: mid-field quote → literal, no state change
527
+ else
528
+ in_quotes = !in_quotes
529
+ end
260
530
  end
531
+ elsif quote_boundary_standard && !in_quotes && !field_started
532
+ # rubocop:disable Style/MultipleComparison -- two direct == comparisons are faster than Array#include? in this hot loop
533
+ if strip && (line[i] == ' ' || line[i] == "\t")
534
+ start = i + 1 # advance past leading whitespace so the quote check at extraction sees the quote
535
+ else
536
+ field_started = true
537
+ end
538
+ # rubocop:enable Style/MultipleComparison
261
539
  end
262
540
  backslash_count = 0
263
541
  end
264
542
  i += 1
265
543
  end
266
544
  end
267
- end
268
-
269
- # Check for unclosed quotes at the end of the line
270
- if in_quotes
271
- # :nocov:
272
- raise MalformedCSV, "Unclosed quoted field detected in line: #{line}"
273
- # :nocov:
274
- end
275
545
 
276
- # Process the remaining field
277
- if header_size.nil? || elements.size < header_size
278
- field = line[start..-1]
279
- field = cleanup_quotes(field, quote)
280
- elements << (strip ? field.strip : field)
546
+ # Unclosed quote at end of line: signal "needs more data" to the caller.
547
+ # The read loop will stitch the next physical line and re-parse rather than raising.
548
+ return [[], -1] if in_quotes
549
+
550
+ # Process the remaining field
551
+ if header_size.nil? || elements.size < header_size
552
+ # Optimization #15 (multi-char final field): same direct extraction; line is pre-chomped.
553
+ field_len = line_size - start
554
+ if field_len >= 2 && line[start] == quote && line[line_size - 1] == quote
555
+ field = line[start + 1..line_size - 2]
556
+ field.gsub!(doubled_quote(quote), quote)
557
+ field.strip! if strip
558
+ elements << field
559
+ else
560
+ field = line[start..-1]
561
+ field = cleanup_quotes(field, quote)
562
+ elements << (strip ? field.strip : field)
563
+ end
564
+ end
281
565
  end
282
566
 
283
567
  [elements, elements.size]