smarter_csv 1.17.1 → 1.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -169,16 +169,107 @@ needs_unescape:
169
169
  /* Helper: build the 2-element [elements, data_size] Ruby array returned by every parser entry point
170
170
  * here (rb_parse_csv_line, rb_parse_line_to_hash, rb_parse_line_to_hash_ctx); same shape as parse_csv_line_ruby.
171
171
  * elements is the parsed array or hash, or Qnil for "no row"; data_size = -1 signals "unclosed quoted field — needs more data". */
172
- static inline VALUE make_parse_result(VALUE elements, long data_size) {
172
+ static inline __attribute__((always_inline))
173
+ VALUE return_parser_result(VALUE elements, long data_size) {
173
174
  VALUE result = rb_ary_new_capa(2);
174
175
  rb_ary_push(result, elements);
175
176
  rb_ary_push(result, LONG2FIX(data_size));
176
177
  return result;
177
178
  }
178
179
 
180
/* Helper: trim leading/trailing spaces and tabs from a field when strip_ws is set.
 *
 *   field      first byte of the raw field
 *   field_len  number of bytes in the field (expected >= 0)
 *   strip_ws   when false the field is returned untouched
 *   out_start  receives the first kept byte (== field when nothing is trimmed;
 *              one past the last byte when the field is all whitespace)
 *
 * Returns the trimmed length, 0 when the field is empty or all whitespace.
 *
 * The field is tracked as a half-open range [first, limit) so that an empty field
 * never forms a pointer before `field` (which is undefined behaviour when the field
 * sits at the very start of the line buffer). Kept always_inline so each call site
 * still compiles down to two tight loops. */
static inline __attribute__((always_inline))
long trim_field(char *field, long field_len, bool strip_ws, char **out_start) {
    char *first = field;
    char *limit = field + field_len;   /* one past the last byte of the field */

    if (strip_ws) {
        while (first < limit && (*first == ' ' || *first == '\t')) first++;
        while (limit > first && (limit[-1] == ' ' || limit[-1] == '\t')) limit--;
    }

    *out_start = first;
    return (limit > first) ? (long)(limit - first) : 0;
}
196
+
197
+ /* A field after quote-stripping and trimming, as produced by extract_field: where its content
198
+ * starts, how long it is, and whether the caller must run it through quote unescaping. */
199
+ typedef struct {
200
+ char *start; /* first byte of the kept content (points into the line buffer, not a copy) */
201
+ long len; /* number of kept bytes; 0 for an empty or all-whitespace field */
202
+ bool has_quotes; /* true if the field was quote-wrapped or its content contains the quote char */
203
+ } extracted_field;
204
+
205
+ /* Helper: turn a raw field slice into the values every extraction site needs.
206
+ * Strips a surrounding pair of quote chars (if present), trims whitespace via
207
+ * trim_field, and reports whether the result still has embedded quotes (true for a
208
+ * quoted field, or any field containing the quote char). This is the common prefix
209
+ * before each field is pushed/inserted, in all three parsers' slow paths.
210
+ * always_inline + return-by-value so the struct is dissolved into registers and each
211
+ * call site compiles to the same code as the old inline block (no performance cost). */
212
+ static inline __attribute__((always_inline))
213
+ extracted_field extract_field(char *raw_field, long field_len, bool strip_ws, char quote_char_val) {
214
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
215
+ if (quoted) {
216
+ raw_field++; // Skip opening quote
217
+ field_len -= 2; // Exclude both quotes from length
218
+ }
219
+ char *trim_start;
220
+ long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
221
+ extracted_field result = {
222
+ trim_start,
223
+ trimmed_len,
224
+ quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len))
225
+ };
226
+ return result;
227
+ }
228
+
229
/* Helper: is the quote character at p a genuine field close?
 *
 *   p           position of the candidate closing quote
 *   endP        one past the last byte of the (possibly chomped) line
 *   col_sepP    column separator bytes, col_sep_len of them
 *   row_sepP    row separator bytes, row_sep_len of them (row_sep_len <= 0 disables the check)
 *
 * Returns true when the quote is the last byte of the line, or is immediately followed
 * by the complete column separator, or by the complete row separator.
 *
 * A separator is only compared when all of its bytes lie before endP. Without that
 * bound a multi-byte separator could be matched against bytes past the logical end of
 * the line (e.g. an already-chomped row separator, or memory beyond the string).
 *
 * Pure read: touches none of the quote loop's state (in_quotes/field_started/etc).
 * Byte loops are kept (rather than memcmp) because separators are typically one or two
 * bytes and this sits in the per-quote hot path; always_inline for the same reason. */
static inline __attribute__((always_inline))
bool is_valid_close(const char *p, const char *endP,
                    const char *col_sepP, long col_sep_len,
                    const char *row_sepP, long row_sep_len) {
    const char *after = p + 1;
    if (after >= endP) return true;              /* quote is the final byte of the line */

    const long avail = (long)(endP - after);     /* lookahead bytes available, > 0 here */

    /* Column separator directly after the quote? (An empty separator matches trivially,
     * as it always has.) */
    if (col_sep_len <= avail) {
        bool matches = true;
        for (long j = 0; j < col_sep_len; j++) {
            if (after[j] != col_sepP[j]) { matches = false; break; }
        }
        if (matches) return true;
    }

    /* Row separator directly after the quote? */
    if (row_sep_len > 0 && row_sep_len <= avail) {
        bool matches = true;
        for (long j = 0; j < row_sep_len; j++) {
            if (after[j] != row_sepP[j]) { matches = false; break; }
        }
        if (matches) return true;
    }

    return false;
}
253
+
254
/* Helper: drop one trailing row separator from a line by moving its end pointer.
 *
 *   endP         one past the last byte of the line
 *   line_len     length of the line in bytes
 *   row_sepP     row separator bytes (may be NULL when row_sep_len is 0)
 *   row_sep_len  separator length; <= 0 means "no separator known", nothing is chomped
 *
 * Returns endP moved back by row_sep_len when the line ends with the separator,
 * otherwise endP unchanged. The line itself is never written to. Shared by
 * rb_parse_line_to_hash and rb_parse_line_to_hash_ctx; always_inline keeps the chomp
 * as cheap as an open-coded check. */
static inline __attribute__((always_inline))
char *chomp_row_sep(char *endP, long line_len, const char *row_sepP, long row_sep_len) {
    if (row_sep_len <= 0) return endP;          /* nothing to look for */
    if (line_len < row_sep_len) return endP;    /* line too short to end with it */

    char *tail = endP - row_sep_len;
    return (memcmp(tail, row_sepP, (size_t)row_sep_len) == 0) ? tail : endP;
}
269
+
179
270
  static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val, VALUE quote_boundary_standard_val, VALUE row_sep_val) {
180
271
  if (RB_TYPE_P(line, T_NIL) == 1) {
181
- return make_parse_result(rb_ary_new(), 0);
272
+ return return_parser_result(rb_ary_new(), 0);
182
273
  }
183
274
 
184
275
  if (RB_TYPE_P(line, T_STRING) != 1) {
@@ -205,7 +296,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
205
296
  if (max_size != Qnil) {
206
297
  max_fields = NUM2INT(max_size);
207
298
  if (max_fields < 0) {
208
- return make_parse_result(rb_ary_new(), 0);
299
+ return return_parser_result(rb_ary_new(), 0);
209
300
  }
210
301
  }
211
302
 
@@ -229,15 +320,8 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
229
320
 
230
321
  long field_len = sep_pos - startP;
231
322
  char *raw_field = startP;
232
- char *trim_start = raw_field;
233
- char *trim_end = raw_field + field_len - 1;
234
-
235
- if (strip_ws) {
236
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
237
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
238
- }
239
-
240
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
323
+ char *trim_start;
324
+ long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
241
325
 
242
326
  field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
243
327
  rb_ary_push(elements, field);
@@ -250,21 +334,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
250
334
  if ((max_fields < 0) || (element_count < max_fields)) {
251
335
  long field_len = endP - startP;
252
336
  char *raw_field = startP;
253
- char *trim_start = raw_field;
254
- char *trim_end = raw_field + field_len - 1;
255
-
256
- if (strip_ws) {
257
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
258
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
259
- }
260
-
261
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
337
+ char *trim_start;
338
+ long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
262
339
 
263
340
  field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
264
341
  rb_ary_push(elements, field);
265
342
  }
266
343
 
267
- return make_parse_result(elements, RARRAY_LEN(elements));
344
+ return return_parser_result(elements, RARRAY_LEN(elements));
268
345
  }
269
346
 
270
347
  // === SLOW PATH: Quoted fields or multi-char separator ===
@@ -291,28 +368,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
291
368
  long field_len = p - startP;
292
369
  char *raw_field = startP;
293
370
 
294
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
295
- if (quoted) {
296
- raw_field++;
297
- field_len -= 2;
298
- }
299
-
300
- char *trim_start = raw_field;
301
- char *trim_end = raw_field + field_len - 1;
371
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
302
372
 
303
- if (strip_ws) {
304
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
305
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
306
- }
307
-
308
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
309
-
310
- if (trimmed_len == 0) {
373
+ if (f.len == 0) {
311
374
  field = Qempty_string;
312
- } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
313
- field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
375
+ } else if (f.has_quotes) {
376
+ field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
314
377
  } else {
315
- field = rb_enc_str_new(trim_start, trimmed_len, encoding);
378
+ field = rb_enc_str_new(f.start, f.len, encoding);
316
379
  }
317
380
 
318
381
  rb_ary_push(elements, field);
@@ -331,25 +394,27 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
331
394
  if (!allow_escaped_quotes || backslash_count % 2 == 0) {
332
395
  if (__builtin_expect(quote_boundary_standard, 1)) {
333
396
  if (in_quotes) {
334
- // closing quote: only valid if followed by col_sep, row_sep, or end of line
335
- bool valid_close = (p + 1 >= endP);
336
- if (!valid_close) {
337
- valid_close = true;
338
- for (long j = 0; j < col_sep_len; j++) {
339
- if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
340
- }
341
- }
342
- if (!valid_close && row_sep_len > 0) {
343
- valid_close = true;
344
- for (long j = 0; j < row_sep_len; j++) {
345
- if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
397
+ if (p + 2 < endP && *(p + 1) == quote_char_val) {
398
+ /* RFC doubled quote inside a quoted field ("" → ").
399
+ * Give this precedence over the closing-quote check, but only
400
+ * when another byte follows the doubled pair.
401
+ *
402
+ * Compatibility note: we intentionally do NOT force terminal
403
+ * "" to be consumed here. SmarterCSV has a long-standing lenient
404
+ * behavior for malformed tails like ...\"" in :double_quotes mode:
405
+ * the final quote may still close the field instead of turning the
406
+ * row into an unclosed-quote error. Issue #334 needs doubled-quote
407
+ * precedence for ..."",... (more content follows), but we keep the
408
+ * historical leniency for terminal ..."". */
409
+ p++;
410
+ } else {
411
+ // closing quote: only valid if followed by col_sep, row_sep, or end of line
412
+ if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP, row_sep_len)) {
413
+ in_quotes = false;
414
+ field_started = true;
346
415
  }
416
+ // else: quote inside quoted field → literal
347
417
  }
348
- if (valid_close) {
349
- in_quotes = false;
350
- field_started = true;
351
- }
352
- // else: quote inside quoted field → literal (handles "" doubling)
353
418
  } else if (!field_started) {
354
419
  in_quotes = true; // opening quote at field boundary
355
420
  field_started = true;
@@ -383,41 +448,27 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
383
448
  * the signal: append the next physical line and re-parse, or raise MalformedCSV
384
449
  * at EOF if the field never closes. The parser does not decide "ultimately
385
450
  * malformed"; the caller does. */
386
- return make_parse_result(rb_ary_new(), -1);
451
+ return return_parser_result(rb_ary_new(), -1);
387
452
  }
388
453
 
389
454
  if ((max_fields < 0) || (element_count < max_fields)) {
390
455
  long field_len = endP - startP;
391
456
  char *raw_field = startP;
392
457
 
393
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
394
- if (quoted) {
395
- raw_field++;
396
- field_len -= 2;
397
- }
458
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
398
459
 
399
- char *trim_start = raw_field;
400
- char *trim_end = raw_field + field_len - 1;
401
-
402
- if (strip_ws) {
403
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
404
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
405
- }
406
-
407
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
408
-
409
- if (trimmed_len == 0) {
460
+ if (f.len == 0) {
410
461
  field = Qempty_string;
411
- } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
412
- field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
462
+ } else if (f.has_quotes) {
463
+ field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
413
464
  } else {
414
- field = rb_enc_str_new(trim_start, trimmed_len, encoding);
465
+ field = rb_enc_str_new(f.start, f.len, encoding);
415
466
  }
416
467
 
417
468
  rb_ary_push(elements, field);
418
469
  }
419
470
 
420
- return make_parse_result(elements, RARRAY_LEN(elements));
471
+ return return_parser_result(elements, RARRAY_LEN(elements));
421
472
  }
422
473
 
423
474
  // Efficiently combine two arrays into a hash (replaces headers.zip(values).to_h)
@@ -675,6 +726,32 @@ static inline __attribute__((always_inline)) bool insert_field_into_hash(
675
726
  return true;
676
727
  }
677
728
 
729
+ /* Helper: parse the convert_values_to_numeric option into a mode + key list.
730
+ * mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys.
731
+ * Writes through the out-params only when the option is set, so callers must
732
+ * pre-initialize *out_mode = 0 and *out_keys = Qnil. Shared by rb_parse_line_to_hash
733
+ * and rb_new_parse_context — identical logic, different storage (locals vs ctx fields).
734
+ * always_inline so each call site compiles to the same code as the old inline block. */
735
+ static inline __attribute__((always_inline))
736
+ void parse_numeric_option(VALUE options_hash, int *out_mode, VALUE *out_keys) {
737
+ VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
738
+ if (RTEST(convert_opt)) {
739
+ if (RB_TYPE_P(convert_opt, T_HASH)) {
740
+ VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
741
+ VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
742
+ if (RTEST(only_keys)) {
743
+ *out_mode = 2;
744
+ *out_keys = rb_Array(only_keys); // wrap single value in array if needed
745
+ } else if (RTEST(except_keys)) {
746
+ *out_mode = 3;
747
+ *out_keys = rb_Array(except_keys); // wrap single value in array if needed
748
+ }
749
+ } else {
750
+ *out_mode = 1; // convert all
751
+ }
752
+ }
753
+ }
754
+
678
755
  /*
679
756
  * ================================================================================
680
757
  * rb_parse_line_to_hash - Parse CSV line directly into a Ruby Hash
@@ -723,10 +800,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
723
800
  * SECTION 1: Handle nil/invalid input
724
801
  * ---------------------------------------- */
725
802
  if (NIL_P(line)) {
726
- VALUE result = rb_ary_new_capa(2);
727
- rb_ary_push(result, Qnil);
728
- rb_ary_push(result, INT2FIX(0));
729
- return result;
803
+ return return_parser_result(Qnil, 0);
730
804
  }
731
805
 
732
806
  if (RB_TYPE_P(line, T_STRING) != 1) {
@@ -751,22 +825,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
751
825
  // numeric_mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys
752
826
  int numeric_mode = 0;
753
827
  VALUE numeric_keys = Qnil;
754
- VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
755
- if (RTEST(convert_opt)) {
756
- if (RB_TYPE_P(convert_opt, T_HASH)) {
757
- VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
758
- VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
759
- if (RTEST(only_keys)) {
760
- numeric_mode = 2;
761
- numeric_keys = rb_Array(only_keys); // wrap single value in array if needed
762
- } else if (RTEST(except_keys)) {
763
- numeric_mode = 3;
764
- numeric_keys = rb_Array(except_keys); // wrap single value in array if needed
765
- }
766
- } else {
767
- numeric_mode = 1; // convert all
768
- }
769
- }
828
+ parse_numeric_option(options_hash, &numeric_mode, &numeric_keys);
770
829
 
771
830
  // quote_escaping and quote_boundary are only needed in Section 5 (quoted/slow path).
772
831
  // They are declared here as forward declarations so Section 5 can set them lazily.
@@ -783,11 +842,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
783
842
  // row_sep is also reused in Section 5 for the closing-quote boundary check.
784
843
  VALUE row_sep = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
785
844
  if (!NIL_P(row_sep) && RB_TYPE_P(row_sep, T_STRING)) {
786
- char *row_sepP = RSTRING_PTR(row_sep);
787
- long row_sep_len = RSTRING_LEN(row_sep);
788
- if (line_len >= row_sep_len && memcmp(endP - row_sep_len, row_sepP, row_sep_len) == 0) {
789
- endP -= row_sep_len;
790
- }
845
+ endP = chomp_row_sep(endP, line_len, RSTRING_PTR(row_sep), RSTRING_LEN(row_sep));
791
846
  }
792
847
 
793
848
  char *col_sepP = RSTRING_PTR(col_sep);
@@ -829,6 +884,11 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
829
884
  * the frame stays well below 4 KB and ___chkstk_darwin never fires on ARM64 macOS.
830
885
  */
831
886
  bool *keep_bitmap = NULL;
887
+ /* In THIS (non-ctx) function the bitmap is alloca'd to headers_len on every call (see the alloca
888
+ * sites below), so keep_bitmap[] is exactly headers_len long and headers_len is the correct bound
889
+ * at all access sites. Do NOT mirror rb_parse_line_to_hash_ctx's keep_bitmap_len here: that variant
890
+ * caches its bitmap across rows (where @headers can grow), so it must use the captured length; this
891
+ * one rebuilds per call and does not. */
832
892
  bool keep_extra_columns = true; /* extra cols (> headers_len): keep by default */
833
893
  bool has_only = false; /* true when only_headers: filtering is active */
834
894
  long early_exit_after = -1; /* column index after which we stop; -1 = no early exit */
@@ -955,13 +1015,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
955
1015
  /* --- (a) Common path: no column filter, no early exit --- */
956
1016
  while ((sep_pos = memchr(p, sep, endP - p))) {
957
1017
  long field_len = sep_pos - startP;
958
- char *trim_start = startP;
959
- char *trim_end = startP + field_len - 1;
960
- if (strip_ws) {
961
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
962
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
963
- }
964
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1018
+ char *trim_start;
1019
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
965
1020
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
966
1021
  all_blank = false;
967
1022
  element_count++;
@@ -970,13 +1025,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
970
1025
  /* Process last field */
971
1026
  {
972
1027
  long field_len = endP - startP;
973
- char *trim_start = startP;
974
- char *trim_end = startP + field_len - 1;
975
- if (strip_ws) {
976
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
977
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
978
- }
979
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1028
+ char *trim_start;
1029
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
980
1030
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
981
1031
  all_blank = false;
982
1032
  element_count++;
@@ -985,13 +1035,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
985
1035
  /* --- (b) Filter path: column bitmap and/or early exit active --- */
986
1036
  while ((sep_pos = memchr(p, sep, endP - p))) {
987
1037
  long field_len = sep_pos - startP;
988
- char *trim_start = startP;
989
- char *trim_end = startP + field_len - 1;
990
- if (strip_ws) {
991
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
992
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
993
- }
994
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1038
+ char *trim_start;
1039
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
995
1040
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
996
1041
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
997
1042
  all_blank = false;
@@ -1006,13 +1051,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1006
1051
  /* Process last field — skip on early exit */
1007
1052
  if (!did_early_exit) {
1008
1053
  long field_len = endP - startP;
1009
- char *trim_start = startP;
1010
- char *trim_end = startP + field_len - 1;
1011
- if (strip_ws) {
1012
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1013
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1014
- }
1015
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1054
+ char *trim_start;
1055
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1016
1056
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1017
1057
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1018
1058
  all_blank = false;
@@ -1087,28 +1127,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1087
1127
  long field_len = p - startP;
1088
1128
  char *raw_field = startP;
1089
1129
 
1090
- // Check if field is wrapped in quotes: "value"
1091
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1092
- if (quoted) {
1093
- raw_field++; // Skip opening quote
1094
- field_len -= 2; // Exclude both quotes from length
1095
- }
1096
-
1097
- char *trim_start = raw_field;
1098
- char *trim_end = raw_field + field_len - 1;
1099
-
1100
- if (strip_ws) {
1101
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1102
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1103
- }
1104
-
1105
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1106
-
1107
- // Determine if field contains embedded quotes (need unescape)
1108
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1130
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1109
1131
 
1110
1132
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1111
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1133
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1112
1134
  all_blank = false;
1113
1135
  }
1114
1136
  element_count++;
@@ -1147,25 +1169,27 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1147
1169
  if (!allow_escaped_quotes || backslash_count % 2 == 0) {
1148
1170
  if (__builtin_expect(quote_boundary_standard, 1)) {
1149
1171
  if (in_quotes) {
1150
- // closing quote: only valid if followed by col_sep, row_sep, or end of line
1151
- bool valid_close = (p + 1 >= endP);
1152
- if (!valid_close) {
1153
- valid_close = true;
1154
- for (long j = 0; j < col_sep_len; j++) {
1155
- if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
1156
- }
1157
- }
1158
- if (!valid_close && row_sep_len2 > 0) {
1159
- valid_close = true;
1160
- for (long j = 0; j < row_sep_len2; j++) {
1161
- if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
1172
+ if (p + 2 < endP && *(p + 1) == quote_char_val) {
1173
+ /* RFC doubled quote inside a quoted field ("" → ").
1174
+ * Give this precedence over the closing-quote check, but only
1175
+ * when another byte follows the doubled pair.
1176
+ *
1177
+ * Compatibility note: we intentionally do NOT force terminal
1178
+ * "" to be consumed here. SmarterCSV has a long-standing lenient
1179
+ * behavior for malformed tails like ...\"" in :double_quotes mode:
1180
+ * the final quote may still close the field instead of turning the
1181
+ * row into an unclosed-quote error. Issue #334 needs doubled-quote
1182
+ * precedence for ..."",... (more content follows), but we keep the
1183
+ * historical leniency for terminal ..."". */
1184
+ p++;
1185
+ } else {
1186
+ // closing quote: only valid if followed by col_sep, row_sep, or end of line
1187
+ if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
1188
+ in_quotes = false;
1189
+ field_started = true;
1162
1190
  }
1191
+ // else: quote inside quoted field → literal
1163
1192
  }
1164
- if (valid_close) {
1165
- in_quotes = false;
1166
- field_started = true;
1167
- }
1168
- // else: quote inside quoted field → literal (handles "" doubling)
1169
1193
  } else if (!field_started) {
1170
1194
  in_quotes = true; // opening quote at field boundary
1171
1195
  field_started = true;
@@ -1198,10 +1222,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1198
1222
  * We return [nil, -1] rather than raising so the read loop can handle multiline fields
1199
1223
  * without a separate pre-scan pass (detect_multiline). */
1200
1224
  if (!did_early_exit && in_quotes) {
1201
- VALUE result = rb_ary_new_capa(2);
1202
- rb_ary_push(result, Qnil);
1203
- rb_ary_push(result, LONG2FIX(-1));
1204
- return result;
1225
+ return return_parser_result(Qnil, -1);
1205
1226
  }
1206
1227
 
1207
1228
  /* Process the last field (same logic as above) — skip on early exit */
@@ -1209,26 +1230,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1209
1230
  long field_len = endP - startP;
1210
1231
  char *raw_field = startP;
1211
1232
 
1212
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1213
- if (quoted) {
1214
- raw_field++;
1215
- field_len -= 2;
1216
- }
1217
-
1218
- char *trim_start = raw_field;
1219
- char *trim_end = raw_field + field_len - 1;
1220
-
1221
- if (strip_ws) {
1222
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1223
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1224
- }
1225
-
1226
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1227
-
1228
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1233
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1229
1234
 
1230
1235
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1231
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1236
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1232
1237
  all_blank = false;
1233
1238
  }
1234
1239
  element_count++;
@@ -1249,10 +1254,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1249
1254
  */
1250
1255
  if (all_blank) {
1251
1256
  if (remove_empty) {
1252
- VALUE result = rb_ary_new_capa(2);
1253
- rb_ary_push(result, Qnil);
1254
- rb_ary_push(result, LONG2FIX(element_count));
1255
- return result;
1257
+ return return_parser_result(Qnil, element_count);
1256
1258
  }
1257
1259
 
1258
1260
  ensure_hash_allocated(&xform);
@@ -1280,10 +1282,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1280
1282
  * Return [hash, element_count] so caller can detect extra columns
1281
1283
  * (when element_count > headers_len) and extend headers if needed.
1282
1284
  */
1283
- VALUE result = rb_ary_new_capa(2);
1284
- rb_ary_push(result, xform.hash);
1285
- rb_ary_push(result, LONG2FIX(element_count));
1286
- return result;
1285
+ return return_parser_result(xform.hash, element_count);
1287
1286
  }
1288
1287
 
1289
1288
  /* ================================================================================
@@ -1354,22 +1353,7 @@ __attribute__((cold)) static VALUE rb_new_parse_context(VALUE self, VALUE header
1354
1353
  ctx->remove_zero_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_zero_values)));
1355
1354
 
1356
1355
  /* Numeric conversion */
1357
- VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
1358
- if (RTEST(convert_opt)) {
1359
- if (RB_TYPE_P(convert_opt, T_HASH)) {
1360
- VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
1361
- VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
1362
- if (RTEST(only_keys)) {
1363
- ctx->numeric_mode = 2;
1364
- ctx->numeric_keys = rb_Array(only_keys);
1365
- } else if (RTEST(except_keys)) {
1366
- ctx->numeric_mode = 3;
1367
- ctx->numeric_keys = rb_Array(except_keys);
1368
- }
1369
- } else {
1370
- ctx->numeric_mode = 1;
1371
- }
1372
- }
1356
+ parse_numeric_option(options_hash, &ctx->numeric_mode, &ctx->numeric_keys);
1373
1357
 
1374
1358
  /* quote_escaping → allow_escaped_quotes */
1375
1359
  VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
@@ -1468,10 +1452,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1468
1452
  * SECTION 1: Handle nil/invalid input
1469
1453
  * ---------------------------------------- */
1470
1454
  if (NIL_P(line)) {
1471
- VALUE result = rb_ary_new_capa(2);
1472
- rb_ary_push(result, Qnil);
1473
- rb_ary_push(result, INT2FIX(0));
1474
- return result;
1455
+ return return_parser_result(Qnil, 0);
1475
1456
  }
1476
1457
 
1477
1458
  if (RB_TYPE_P(line, T_STRING) != 1) {
@@ -1495,6 +1476,14 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1495
1476
  int numeric_mode = ctx->numeric_mode;
1496
1477
  VALUE numeric_keys = ctx->numeric_keys;
1497
1478
  bool *keep_bitmap = ctx->keep_bitmap;
1479
+ /* keep_bitmap is cached in the context (xmalloc'd once at construction, sized to the header count
1480
+ * THEN). @headers can grow in place as undeclared extra columns appear, so the live headers_len
1481
+ * (re-read each call below) may exceed the bitmap's length. Every keep_bitmap[] access in this
1482
+ * function MUST be bounded by keep_bitmap_len, never headers_len — indices past the bitmap are
1483
+ * extra columns and follow keep_extra_columns. Bounding by the grown headers_len was an
1484
+ * out-of-bounds heap read (the bug). The sibling rb_parse_line_to_hash safely uses headers_len
1485
+ * because it re-allocs its bitmap to headers_len on every call. */
1486
+ long keep_bitmap_len = ctx->keep_bitmap_len;
1498
1487
  bool keep_extra_columns = ctx->keep_extra_columns;
1499
1488
  long early_exit_after = ctx->early_exit_after;
1500
1489
 
@@ -1509,12 +1498,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1509
1498
  char *p = startP;
1510
1499
 
1511
1500
  /* Chomp: strip trailing row separator (pointer adjustment, no string mutation) */
1512
- if (ctx->row_sep_len > 0) {
1513
- long rsl = (long)ctx->row_sep_len;
1514
- if (line_len >= rsl && memcmp(endP - rsl, ctx->row_sep_buf, (size_t)rsl) == 0) {
1515
- endP -= rsl;
1516
- }
1517
- }
1501
+ endP = chomp_row_sep(endP, line_len, ctx->row_sep_buf, (long)ctx->row_sep_len);
1518
1502
 
1519
1503
  /* Re-read headers_len each call to handle extra-column growth */
1520
1504
  long headers_len = NIL_P(ctx->headers) ? 0 : RARRAY_LEN(ctx->headers);
@@ -1559,13 +1543,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1559
1543
  /* --- (a) Common path: no column filter, no early exit --- */
1560
1544
  while ((sep_pos = memchr(p, sep, endP - p))) {
1561
1545
  long field_len = sep_pos - startP;
1562
- char *trim_start = startP;
1563
- char *trim_end = startP + field_len - 1;
1564
- if (strip_ws) {
1565
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1566
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1567
- }
1568
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1546
+ char *trim_start;
1547
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1569
1548
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1570
1549
  all_blank = false;
1571
1550
  element_count++;
@@ -1574,13 +1553,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1574
1553
  /* Process last field */
1575
1554
  {
1576
1555
  long field_len = endP - startP;
1577
- char *trim_start = startP;
1578
- char *trim_end = startP + field_len - 1;
1579
- if (strip_ws) {
1580
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1581
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1582
- }
1583
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1556
+ char *trim_start;
1557
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1584
1558
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1585
1559
  all_blank = false;
1586
1560
  element_count++;
@@ -1589,14 +1563,9 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1589
1563
  /* --- (b) Filter path: column bitmap and/or early exit active --- */
1590
1564
  while ((sep_pos = memchr(p, sep, endP - p))) {
1591
1565
  long field_len = sep_pos - startP;
1592
- char *trim_start = startP;
1593
- char *trim_end = startP + field_len - 1;
1594
- if (strip_ws) {
1595
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1596
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1597
- }
1598
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1599
- if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1566
+ char *trim_start;
1567
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1568
+ if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1600
1569
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1601
1570
  all_blank = false;
1602
1571
  }
@@ -1610,14 +1579,9 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1610
1579
  /* Process last field — skip on early exit */
1611
1580
  if (!did_early_exit) {
1612
1581
  long field_len = endP - startP;
1613
- char *trim_start = startP;
1614
- char *trim_end = startP + field_len - 1;
1615
- if (strip_ws) {
1616
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1617
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1618
- }
1619
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1620
- if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1582
+ char *trim_start;
1583
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1584
+ if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1621
1585
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1622
1586
  all_blank = false;
1623
1587
  }
@@ -1662,26 +1626,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1662
1626
  long field_len = p - startP;
1663
1627
  char *raw_field = startP;
1664
1628
 
1665
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1666
- if (quoted) {
1667
- raw_field++;
1668
- field_len -= 2;
1669
- }
1670
-
1671
- char *trim_start = raw_field;
1672
- char *trim_end = raw_field + field_len - 1;
1673
-
1674
- if (strip_ws) {
1675
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1676
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1677
- }
1678
-
1679
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1629
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1680
1630
 
1681
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1682
-
1683
- if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1684
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1631
+ if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1632
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1685
1633
  all_blank = false;
1686
1634
  }
1687
1635
  element_count++;
@@ -1714,25 +1662,27 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1714
1662
  if (!allow_escaped_quotes || backslash_count % 2 == 0) {
1715
1663
  if (__builtin_expect(quote_boundary_standard, 1)) {
1716
1664
  if (in_quotes) {
1717
- /* closing quote: only valid if followed by col_sep, row_sep, or end */
1718
- bool valid_close = (p + 1 >= endP);
1719
- if (!valid_close) {
1720
- valid_close = true;
1721
- for (long j = 0; j < col_sep_len; j++) {
1722
- if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
1723
- }
1724
- }
1725
- if (!valid_close && row_sep_len2 > 0) {
1726
- valid_close = true;
1727
- for (long j = 0; j < row_sep_len2; j++) {
1728
- if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
1665
+ if (p + 2 < endP && *(p + 1) == quote_char_val) {
1666
+ /* RFC doubled quote inside a quoted field ("" → ").
1667
+ * Give this precedence over the closing-quote check, but only
1668
+ * when another byte follows the doubled pair.
1669
+ *
1670
+ * Compatibility note: we intentionally do NOT force terminal
1671
+ * "" to be consumed here. SmarterCSV has a long-standing lenient
1672
+ * behavior for malformed tails like ...\"" in :double_quotes mode:
1673
+ * the final quote may still close the field instead of turning the
1674
+ * row into an unclosed-quote error. Issue #334 needs doubled-quote
1675
+ * precedence for ..."",... (more content follows), but we keep the
1676
+ * historical leniency for terminal ..."". */
1677
+ p++;
1678
+ } else {
1679
+ /* closing quote: only valid if followed by col_sep, row_sep, or end */
1680
+ if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
1681
+ in_quotes = false;
1682
+ field_started = true;
1729
1683
  }
1684
+ /* else: quote inside quoted field → literal */
1730
1685
  }
1731
- if (valid_close) {
1732
- in_quotes = false;
1733
- field_started = true;
1734
- }
1735
- /* else: quote inside quoted field → literal (handles "" doubling) */
1736
1686
  } else if (!field_started) {
1737
1687
  in_quotes = true; /* opening quote at field boundary */
1738
1688
  field_started = true;
@@ -1762,10 +1712,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1762
1712
  section5_done_ctx:;
1763
1713
  /* Unclosed quote at end of line — signal multiline continuation */
1764
1714
  if (!did_early_exit && in_quotes) {
1765
- VALUE result = rb_ary_new_capa(2);
1766
- rb_ary_push(result, Qnil);
1767
- rb_ary_push(result, LONG2FIX(-1));
1768
- return result;
1715
+ return return_parser_result(Qnil, -1);
1769
1716
  }
1770
1717
 
1771
1718
  /* Process the last field — skip on early exit */
@@ -1773,26 +1720,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1773
1720
  long field_len = endP - startP;
1774
1721
  char *raw_field = startP;
1775
1722
 
1776
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1777
- if (quoted) {
1778
- raw_field++;
1779
- field_len -= 2;
1780
- }
1781
-
1782
- char *trim_start = raw_field;
1783
- char *trim_end = raw_field + field_len - 1;
1723
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1784
1724
 
1785
- if (strip_ws) {
1786
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1787
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1788
- }
1789
-
1790
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1791
-
1792
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1793
-
1794
- if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1795
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1725
+ if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1726
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1796
1727
  all_blank = false;
1797
1728
  }
1798
1729
  element_count++;
@@ -1804,10 +1735,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1804
1735
  * ---------------------------------------- */
1805
1736
  if (all_blank) {
1806
1737
  if (remove_empty) {
1807
- VALUE result = rb_ary_new_capa(2);
1808
- rb_ary_push(result, Qnil);
1809
- rb_ary_push(result, LONG2FIX(element_count));
1810
- return result;
1738
+ return return_parser_result(Qnil, element_count);
1811
1739
  }
1812
1740
 
1813
1741
  ensure_hash_allocated(&xform);
@@ -1819,7 +1747,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1819
1747
  if (!remove_empty_values) {
1820
1748
  ensure_hash_allocated(&xform);
1821
1749
  for (long i = element_count; i < headers_len; i++) {
1822
- if (!keep_bitmap || keep_bitmap[i]) {
1750
+ if (!keep_bitmap || (i < keep_bitmap_len ? keep_bitmap[i] : keep_extra_columns)) {
1823
1751
  rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
1824
1752
  }
1825
1753
  }
@@ -1828,10 +1756,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1828
1756
  /* ----------------------------------------
1829
1757
  * SECTION 8: Return result
1830
1758
  * ---------------------------------------- */
1831
- VALUE result = rb_ary_new_capa(2);
1832
- rb_ary_push(result, xform.hash);
1833
- rb_ary_push(result, LONG2FIX(element_count));
1834
- return result;
1759
+ return return_parser_result(xform.hash, element_count);
1835
1760
  }
1836
1761
 
1837
1762
  // Count quote characters in a line, optionally respecting backslash escapes.