smarter_csv 1.17.2 → 1.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -169,16 +169,107 @@ needs_unescape:
169
169
  /* Helper: build the 2-element [elements, data_size] tuple returned by rb_parse_csv_line.
170
170
  * Aligns this function's return shape with parse_csv_line_ruby and rb_parse_line_to_hash_ctx:
171
171
  * data_size = -1 signals "unclosed quoted field — needs more data". */
172
- static inline VALUE make_parse_result(VALUE elements, long data_size) {
172
+ static inline __attribute__((always_inline))
173
+ VALUE return_parser_result(VALUE elements, long data_size) {
173
174
  VALUE result = rb_ary_new_capa(2);
174
175
  rb_ary_push(result, elements);
175
176
  rb_ary_push(result, LONG2FIX(data_size));
176
177
  return result;
177
178
  }
178
179
 
/* Helper: trim leading/trailing spaces and tabs from a field when strip_ws is set.
 *
 * field      - first byte of the raw field; only field_len bytes from here are examined
 * field_len  - number of bytes in the field; zero or negative is treated as empty
 * strip_ws   - when false, nothing is trimmed and the field is reported as-is
 * out_start  - receives the first kept byte (for an all-whitespace field this is
 *              one past the last byte of the field)
 *
 * Returns the trimmed length (0 when the field is empty or all whitespace).
 *
 * The kept range is tracked as a half-open interval [first, limit) so that no
 * pointer before `field` is ever formed, even for a zero-length field.
 * Kept always_inline so each call site compiles down to a plain pair of loops. */
static inline __attribute__((always_inline))
long trim_field(char *field, long field_len, bool strip_ws, char **out_start) {
  if (field_len <= 0) {
    /* Empty field: nothing to scan, and field + field_len - 1 must not be computed. */
    *out_start = field;
    return 0;
  }

  char *first = field;
  char *limit = field + field_len;   /* one past the last byte of the field */

  if (strip_ws) {
    while (first < limit && (*first == ' ' || *first == '\t')) first++;
    while (limit > first && (limit[-1] == ' ' || limit[-1] == '\t')) limit--;
  }

  *out_start = first;
  return (long)(limit - first);
}
196
+
/* Result of preparing one raw field for storage: the slice that remains after the
 * surrounding quotes (if any) were dropped and whitespace was trimmed, plus a flag
 * telling the caller whether the slice still needs quote unescaping. */
typedef struct {
  char *start;       /* first byte of the kept content */
  long  len;         /* number of kept bytes; 0 for an empty field */
  bool  has_quotes;  /* true if the field was quoted or still contains quote chars */
} extracted_field;
204
+
205
+ /* Helper: turn a raw field slice into the values every extraction site needs.
206
+ * Strips a surrounding pair of quote chars (if present), trims whitespace via
207
+ * trim_field, and reports whether the result still has embedded quotes (true for a
208
+ * quoted field, or any field containing the quote char). This is the common prefix
209
+ * before each field is pushed/inserted, in all three parsers' slow paths.
210
+ * always_inline + return-by-value so the struct is dissolved into registers and each
211
+ * call site compiles to the same code as the old inline block (no performance cost). */
212
+ static inline __attribute__((always_inline))
213
+ extracted_field extract_field(char *raw_field, long field_len, bool strip_ws, char quote_char_val) {
214
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
215
+ if (quoted) {
216
+ raw_field++; // Skip opening quote
217
+ field_len -= 2; // Exclude both quotes from length
218
+ }
219
+ char *trim_start;
220
+ long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
221
+ extracted_field result = {
222
+ trim_start,
223
+ trimmed_len,
224
+ quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len))
225
+ };
226
+ return result;
227
+ }
228
+
/* Helper: is a closing quote at p actually a field close?
 *
 * p            - position of the candidate closing quote
 * endP         - one past the last byte of the line being parsed
 * col_sepP/len - column separator bytes and their count
 * row_sepP/len - row separator bytes and their count (len 0 = none known)
 *
 * Returns true when the quote is the last byte of the line, or is immediately
 * followed by the column separator, or (if row_sep_len > 0) by the row separator.
 *
 * A separator only counts when it fits entirely before endP: bytes at or beyond
 * endP are not part of the line and are never read. An empty column separator
 * (col_sep_len <= 0) matches at any position.
 *
 * Pure read - touches none of the quote loop's state. always_inline so it costs
 * no more than a hand-written lookahead at each call site. */
static inline __attribute__((always_inline))
bool is_valid_close(const char *p, const char *endP,
                    const char *col_sepP, long col_sep_len,
                    const char *row_sepP, long row_sep_len) {
  const char *next = p + 1;
  if (next >= endP) return true;            /* quote is the last byte of the line */

  const long remaining = (long)(endP - next);

  if (col_sep_len <= 0) return true;        /* nothing to compare: vacuous match */
  if (col_sep_len <= remaining
      && memcmp(next, col_sepP, (size_t)col_sep_len) == 0) {
    return true;
  }

  return row_sep_len > 0
      && row_sep_len <= remaining
      && memcmp(next, row_sepP, (size_t)row_sep_len) == 0;
}
253
+
/* Helper: drop a trailing row separator by moving the end pointer (the string
 * itself is never modified).
 *
 * endP         - one past the last byte of the line
 * line_len     - length of the line in bytes
 * row_sepP/len - row separator bytes and their count
 *
 * Returns endP moved back by row_sep_len when the last row_sep_len bytes of the
 * line equal the separator; otherwise returns endP unchanged. Because a
 * non-positive row_sep_len returns immediately, callers may pass (NULL, 0) when
 * no separator is known. always_inline keeps the chomp site cheap. */
static inline __attribute__((always_inline))
char *chomp_row_sep(char *endP, long line_len, const char *row_sepP, long row_sep_len) {
  if (row_sep_len <= 0) return endP;          /* no separator to strip */
  if (line_len < row_sep_len) return endP;    /* line too short to end with it */

  char *tail = endP - row_sep_len;
  if (memcmp(tail, row_sepP, (size_t)row_sep_len) != 0) return endP;

  return tail;
}
269
+
179
270
  static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val, VALUE quote_boundary_standard_val, VALUE row_sep_val) {
180
271
  if (RB_TYPE_P(line, T_NIL) == 1) {
181
- return make_parse_result(rb_ary_new(), 0);
272
+ return return_parser_result(rb_ary_new(), 0);
182
273
  }
183
274
 
184
275
  if (RB_TYPE_P(line, T_STRING) != 1) {
@@ -205,7 +296,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
205
296
  if (max_size != Qnil) {
206
297
  max_fields = NUM2INT(max_size);
207
298
  if (max_fields < 0) {
208
- return make_parse_result(rb_ary_new(), 0);
299
+ return return_parser_result(rb_ary_new(), 0);
209
300
  }
210
301
  }
211
302
 
@@ -229,15 +320,8 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
229
320
 
230
321
  long field_len = sep_pos - startP;
231
322
  char *raw_field = startP;
232
- char *trim_start = raw_field;
233
- char *trim_end = raw_field + field_len - 1;
234
-
235
- if (strip_ws) {
236
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
237
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
238
- }
239
-
240
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
323
+ char *trim_start;
324
+ long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
241
325
 
242
326
  field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
243
327
  rb_ary_push(elements, field);
@@ -250,21 +334,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
250
334
  if ((max_fields < 0) || (element_count < max_fields)) {
251
335
  long field_len = endP - startP;
252
336
  char *raw_field = startP;
253
- char *trim_start = raw_field;
254
- char *trim_end = raw_field + field_len - 1;
255
-
256
- if (strip_ws) {
257
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
258
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
259
- }
260
-
261
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
337
+ char *trim_start;
338
+ long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
262
339
 
263
340
  field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
264
341
  rb_ary_push(elements, field);
265
342
  }
266
343
 
267
- return make_parse_result(elements, RARRAY_LEN(elements));
344
+ return return_parser_result(elements, RARRAY_LEN(elements));
268
345
  }
269
346
 
270
347
  // === SLOW PATH: Quoted fields or multi-char separator ===
@@ -291,28 +368,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
291
368
  long field_len = p - startP;
292
369
  char *raw_field = startP;
293
370
 
294
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
295
- if (quoted) {
296
- raw_field++;
297
- field_len -= 2;
298
- }
299
-
300
- char *trim_start = raw_field;
301
- char *trim_end = raw_field + field_len - 1;
302
-
303
- if (strip_ws) {
304
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
305
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
306
- }
371
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
307
372
 
308
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
309
-
310
- if (trimmed_len == 0) {
373
+ if (f.len == 0) {
311
374
  field = Qempty_string;
312
- } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
313
- field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
375
+ } else if (f.has_quotes) {
376
+ field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
314
377
  } else {
315
- field = rb_enc_str_new(trim_start, trimmed_len, encoding);
378
+ field = rb_enc_str_new(f.start, f.len, encoding);
316
379
  }
317
380
 
318
381
  rb_ary_push(elements, field);
@@ -346,20 +409,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
346
409
  p++;
347
410
  } else {
348
411
  // closing quote: only valid if followed by col_sep, row_sep, or end of line
349
- bool valid_close = (p + 1 >= endP);
350
- if (!valid_close) {
351
- valid_close = true;
352
- for (long j = 0; j < col_sep_len; j++) {
353
- if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
354
- }
355
- }
356
- if (!valid_close && row_sep_len > 0) {
357
- valid_close = true;
358
- for (long j = 0; j < row_sep_len; j++) {
359
- if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
360
- }
361
- }
362
- if (valid_close) {
412
+ if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP, row_sep_len)) {
363
413
  in_quotes = false;
364
414
  field_started = true;
365
415
  }
@@ -398,41 +448,27 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
398
448
  * the signal: append the next physical line and re-parse, or raise MalformedCSV
399
449
  * at EOF if the field never closes. The parser does not decide "ultimately
400
450
  * malformed"; the caller does. */
401
- return make_parse_result(rb_ary_new(), -1);
451
+ return return_parser_result(rb_ary_new(), -1);
402
452
  }
403
453
 
404
454
  if ((max_fields < 0) || (element_count < max_fields)) {
405
455
  long field_len = endP - startP;
406
456
  char *raw_field = startP;
407
457
 
408
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
409
- if (quoted) {
410
- raw_field++;
411
- field_len -= 2;
412
- }
413
-
414
- char *trim_start = raw_field;
415
- char *trim_end = raw_field + field_len - 1;
458
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
416
459
 
417
- if (strip_ws) {
418
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
419
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
420
- }
421
-
422
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
423
-
424
- if (trimmed_len == 0) {
460
+ if (f.len == 0) {
425
461
  field = Qempty_string;
426
- } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
427
- field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
462
+ } else if (f.has_quotes) {
463
+ field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
428
464
  } else {
429
- field = rb_enc_str_new(trim_start, trimmed_len, encoding);
465
+ field = rb_enc_str_new(f.start, f.len, encoding);
430
466
  }
431
467
 
432
468
  rb_ary_push(elements, field);
433
469
  }
434
470
 
435
- return make_parse_result(elements, RARRAY_LEN(elements));
471
+ return return_parser_result(elements, RARRAY_LEN(elements));
436
472
  }
437
473
 
438
474
  // Efficiently combine two arrays into a hash (replaces headers.zip(values).to_h)
@@ -690,6 +726,32 @@ static inline __attribute__((always_inline)) bool insert_field_into_hash(
690
726
  return true;
691
727
  }
692
728
 
729
+ /* Helper: parse the convert_values_to_numeric option into a mode + key list.
730
+ * mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys.
731
+ * Writes through the out-params only when the option is set, so callers must
732
+ * pre-initialize *out_mode = 0 and *out_keys = Qnil. Shared by rb_parse_line_to_hash
733
+ * and rb_new_parse_context — identical logic, different storage (locals vs ctx fields).
734
+ * always_inline so each call site compiles to the same code as the old inline block. */
735
+ static inline __attribute__((always_inline))
736
+ void parse_numeric_option(VALUE options_hash, int *out_mode, VALUE *out_keys) {
737
+ VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
738
+ if (RTEST(convert_opt)) {
739
+ if (RB_TYPE_P(convert_opt, T_HASH)) {
740
+ VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
741
+ VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
742
+ if (RTEST(only_keys)) {
743
+ *out_mode = 2;
744
+ *out_keys = rb_Array(only_keys); // wrap single value in array if needed
745
+ } else if (RTEST(except_keys)) {
746
+ *out_mode = 3;
747
+ *out_keys = rb_Array(except_keys); // wrap single value in array if needed
748
+ }
749
+ } else {
750
+ *out_mode = 1; // convert all
751
+ }
752
+ }
753
+ }
754
+
693
755
  /*
694
756
  * ================================================================================
695
757
  * rb_parse_line_to_hash - Parse CSV line directly into a Ruby Hash
@@ -738,10 +800,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
738
800
  * SECTION 1: Handle nil/invalid input
739
801
  * ---------------------------------------- */
740
802
  if (NIL_P(line)) {
741
- VALUE result = rb_ary_new_capa(2);
742
- rb_ary_push(result, Qnil);
743
- rb_ary_push(result, INT2FIX(0));
744
- return result;
803
+ return return_parser_result(Qnil, 0);
745
804
  }
746
805
 
747
806
  if (RB_TYPE_P(line, T_STRING) != 1) {
@@ -766,22 +825,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
766
825
  // numeric_mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys
767
826
  int numeric_mode = 0;
768
827
  VALUE numeric_keys = Qnil;
769
- VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
770
- if (RTEST(convert_opt)) {
771
- if (RB_TYPE_P(convert_opt, T_HASH)) {
772
- VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
773
- VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
774
- if (RTEST(only_keys)) {
775
- numeric_mode = 2;
776
- numeric_keys = rb_Array(only_keys); // wrap single value in array if needed
777
- } else if (RTEST(except_keys)) {
778
- numeric_mode = 3;
779
- numeric_keys = rb_Array(except_keys); // wrap single value in array if needed
780
- }
781
- } else {
782
- numeric_mode = 1; // convert all
783
- }
784
- }
828
+ parse_numeric_option(options_hash, &numeric_mode, &numeric_keys);
785
829
 
786
830
  // quote_escaping and quote_boundary are only needed in Section 5 (quoted/slow path).
787
831
  // They are declared here as forward declarations so Section 5 can set them lazily.
@@ -798,11 +842,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
798
842
  // row_sep is also reused in Section 5 for the closing-quote boundary check.
799
843
  VALUE row_sep = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
800
844
  if (!NIL_P(row_sep) && RB_TYPE_P(row_sep, T_STRING)) {
801
- char *row_sepP = RSTRING_PTR(row_sep);
802
- long row_sep_len = RSTRING_LEN(row_sep);
803
- if (line_len >= row_sep_len && memcmp(endP - row_sep_len, row_sepP, row_sep_len) == 0) {
804
- endP -= row_sep_len;
805
- }
845
+ endP = chomp_row_sep(endP, line_len, RSTRING_PTR(row_sep), RSTRING_LEN(row_sep));
806
846
  }
807
847
 
808
848
  char *col_sepP = RSTRING_PTR(col_sep);
@@ -975,13 +1015,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
975
1015
  /* --- (a) Common path: no column filter, no early exit --- */
976
1016
  while ((sep_pos = memchr(p, sep, endP - p))) {
977
1017
  long field_len = sep_pos - startP;
978
- char *trim_start = startP;
979
- char *trim_end = startP + field_len - 1;
980
- if (strip_ws) {
981
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
982
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
983
- }
984
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1018
+ char *trim_start;
1019
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
985
1020
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
986
1021
  all_blank = false;
987
1022
  element_count++;
@@ -990,13 +1025,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
990
1025
  /* Process last field */
991
1026
  {
992
1027
  long field_len = endP - startP;
993
- char *trim_start = startP;
994
- char *trim_end = startP + field_len - 1;
995
- if (strip_ws) {
996
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
997
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
998
- }
999
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1028
+ char *trim_start;
1029
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1000
1030
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1001
1031
  all_blank = false;
1002
1032
  element_count++;
@@ -1005,13 +1035,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1005
1035
  /* --- (b) Filter path: column bitmap and/or early exit active --- */
1006
1036
  while ((sep_pos = memchr(p, sep, endP - p))) {
1007
1037
  long field_len = sep_pos - startP;
1008
- char *trim_start = startP;
1009
- char *trim_end = startP + field_len - 1;
1010
- if (strip_ws) {
1011
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1012
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1013
- }
1014
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1038
+ char *trim_start;
1039
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1015
1040
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1016
1041
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1017
1042
  all_blank = false;
@@ -1026,13 +1051,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1026
1051
  /* Process last field — skip on early exit */
1027
1052
  if (!did_early_exit) {
1028
1053
  long field_len = endP - startP;
1029
- char *trim_start = startP;
1030
- char *trim_end = startP + field_len - 1;
1031
- if (strip_ws) {
1032
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1033
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1034
- }
1035
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1054
+ char *trim_start;
1055
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1036
1056
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1037
1057
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1038
1058
  all_blank = false;
@@ -1107,28 +1127,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1107
1127
  long field_len = p - startP;
1108
1128
  char *raw_field = startP;
1109
1129
 
1110
- // Check if field is wrapped in quotes: "value"
1111
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1112
- if (quoted) {
1113
- raw_field++; // Skip opening quote
1114
- field_len -= 2; // Exclude both quotes from length
1115
- }
1116
-
1117
- char *trim_start = raw_field;
1118
- char *trim_end = raw_field + field_len - 1;
1119
-
1120
- if (strip_ws) {
1121
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1122
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1123
- }
1124
-
1125
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1126
-
1127
- // Determine if field contains embedded quotes (need unescape)
1128
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1130
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1129
1131
 
1130
1132
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1131
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1133
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1132
1134
  all_blank = false;
1133
1135
  }
1134
1136
  element_count++;
@@ -1182,20 +1184,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1182
1184
  p++;
1183
1185
  } else {
1184
1186
  // closing quote: only valid if followed by col_sep, row_sep, or end of line
1185
- bool valid_close = (p + 1 >= endP);
1186
- if (!valid_close) {
1187
- valid_close = true;
1188
- for (long j = 0; j < col_sep_len; j++) {
1189
- if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
1190
- }
1191
- }
1192
- if (!valid_close && row_sep_len2 > 0) {
1193
- valid_close = true;
1194
- for (long j = 0; j < row_sep_len2; j++) {
1195
- if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
1196
- }
1197
- }
1198
- if (valid_close) {
1187
+ if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
1199
1188
  in_quotes = false;
1200
1189
  field_started = true;
1201
1190
  }
@@ -1233,10 +1222,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1233
1222
  * We return [nil, -1] rather than raising so the read loop can handle multiline fields
1234
1223
  * without a separate pre-scan pass (detect_multiline). */
1235
1224
  if (!did_early_exit && in_quotes) {
1236
- VALUE result = rb_ary_new_capa(2);
1237
- rb_ary_push(result, Qnil);
1238
- rb_ary_push(result, LONG2FIX(-1));
1239
- return result;
1225
+ return return_parser_result(Qnil, -1);
1240
1226
  }
1241
1227
 
1242
1228
  /* Process the last field (same logic as above) — skip on early exit */
@@ -1244,26 +1230,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1244
1230
  long field_len = endP - startP;
1245
1231
  char *raw_field = startP;
1246
1232
 
1247
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1248
- if (quoted) {
1249
- raw_field++;
1250
- field_len -= 2;
1251
- }
1252
-
1253
- char *trim_start = raw_field;
1254
- char *trim_end = raw_field + field_len - 1;
1255
-
1256
- if (strip_ws) {
1257
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1258
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1259
- }
1260
-
1261
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1262
-
1263
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1233
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1264
1234
 
1265
1235
  if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1266
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1236
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1267
1237
  all_blank = false;
1268
1238
  }
1269
1239
  element_count++;
@@ -1284,10 +1254,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1284
1254
  */
1285
1255
  if (all_blank) {
1286
1256
  if (remove_empty) {
1287
- VALUE result = rb_ary_new_capa(2);
1288
- rb_ary_push(result, Qnil);
1289
- rb_ary_push(result, LONG2FIX(element_count));
1290
- return result;
1257
+ return return_parser_result(Qnil, element_count);
1291
1258
  }
1292
1259
 
1293
1260
  ensure_hash_allocated(&xform);
@@ -1315,10 +1282,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
1315
1282
  * Return [hash, element_count] so caller can detect extra columns
1316
1283
  * (when element_count > headers_len) and extend headers if needed.
1317
1284
  */
1318
- VALUE result = rb_ary_new_capa(2);
1319
- rb_ary_push(result, xform.hash);
1320
- rb_ary_push(result, LONG2FIX(element_count));
1321
- return result;
1285
+ return return_parser_result(xform.hash, element_count);
1322
1286
  }
1323
1287
 
1324
1288
  /* ================================================================================
@@ -1389,22 +1353,7 @@ __attribute__((cold)) static VALUE rb_new_parse_context(VALUE self, VALUE header
1389
1353
  ctx->remove_zero_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_zero_values)));
1390
1354
 
1391
1355
  /* Numeric conversion */
1392
- VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
1393
- if (RTEST(convert_opt)) {
1394
- if (RB_TYPE_P(convert_opt, T_HASH)) {
1395
- VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
1396
- VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
1397
- if (RTEST(only_keys)) {
1398
- ctx->numeric_mode = 2;
1399
- ctx->numeric_keys = rb_Array(only_keys);
1400
- } else if (RTEST(except_keys)) {
1401
- ctx->numeric_mode = 3;
1402
- ctx->numeric_keys = rb_Array(except_keys);
1403
- }
1404
- } else {
1405
- ctx->numeric_mode = 1;
1406
- }
1407
- }
1356
+ parse_numeric_option(options_hash, &ctx->numeric_mode, &ctx->numeric_keys);
1408
1357
 
1409
1358
  /* quote_escaping → allow_escaped_quotes */
1410
1359
  VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
@@ -1503,10 +1452,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1503
1452
  * SECTION 1: Handle nil/invalid input
1504
1453
  * ---------------------------------------- */
1505
1454
  if (NIL_P(line)) {
1506
- VALUE result = rb_ary_new_capa(2);
1507
- rb_ary_push(result, Qnil);
1508
- rb_ary_push(result, INT2FIX(0));
1509
- return result;
1455
+ return return_parser_result(Qnil, 0);
1510
1456
  }
1511
1457
 
1512
1458
  if (RB_TYPE_P(line, T_STRING) != 1) {
@@ -1552,12 +1498,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1552
1498
  char *p = startP;
1553
1499
 
1554
1500
  /* Chomp: strip trailing row separator (pointer adjustment, no string mutation) */
1555
- if (ctx->row_sep_len > 0) {
1556
- long rsl = (long)ctx->row_sep_len;
1557
- if (line_len >= rsl && memcmp(endP - rsl, ctx->row_sep_buf, (size_t)rsl) == 0) {
1558
- endP -= rsl;
1559
- }
1560
- }
1501
+ endP = chomp_row_sep(endP, line_len, ctx->row_sep_buf, (long)ctx->row_sep_len);
1561
1502
 
1562
1503
  /* Re-read headers_len each call to handle extra-column growth */
1563
1504
  long headers_len = NIL_P(ctx->headers) ? 0 : RARRAY_LEN(ctx->headers);
@@ -1602,13 +1543,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1602
1543
  /* --- (a) Common path: no column filter, no early exit --- */
1603
1544
  while ((sep_pos = memchr(p, sep, endP - p))) {
1604
1545
  long field_len = sep_pos - startP;
1605
- char *trim_start = startP;
1606
- char *trim_end = startP + field_len - 1;
1607
- if (strip_ws) {
1608
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1609
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1610
- }
1611
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1546
+ char *trim_start;
1547
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1612
1548
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1613
1549
  all_blank = false;
1614
1550
  element_count++;
@@ -1617,13 +1553,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1617
1553
  /* Process last field */
1618
1554
  {
1619
1555
  long field_len = endP - startP;
1620
- char *trim_start = startP;
1621
- char *trim_end = startP + field_len - 1;
1622
- if (strip_ws) {
1623
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1624
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1625
- }
1626
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1556
+ char *trim_start;
1557
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1627
1558
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1628
1559
  all_blank = false;
1629
1560
  element_count++;
@@ -1632,13 +1563,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1632
1563
  /* --- (b) Filter path: column bitmap and/or early exit active --- */
1633
1564
  while ((sep_pos = memchr(p, sep, endP - p))) {
1634
1565
  long field_len = sep_pos - startP;
1635
- char *trim_start = startP;
1636
- char *trim_end = startP + field_len - 1;
1637
- if (strip_ws) {
1638
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1639
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1640
- }
1641
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1566
+ char *trim_start;
1567
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1642
1568
  if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1643
1569
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1644
1570
  all_blank = false;
@@ -1653,13 +1579,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1653
1579
  /* Process last field — skip on early exit */
1654
1580
  if (!did_early_exit) {
1655
1581
  long field_len = endP - startP;
1656
- char *trim_start = startP;
1657
- char *trim_end = startP + field_len - 1;
1658
- if (strip_ws) {
1659
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1660
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1661
- }
1662
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1582
+ char *trim_start;
1583
+ long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
1663
1584
  if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1664
1585
  if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1665
1586
  all_blank = false;
@@ -1705,26 +1626,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1705
1626
  long field_len = p - startP;
1706
1627
  char *raw_field = startP;
1707
1628
 
1708
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1709
- if (quoted) {
1710
- raw_field++;
1711
- field_len -= 2;
1712
- }
1713
-
1714
- char *trim_start = raw_field;
1715
- char *trim_end = raw_field + field_len - 1;
1716
-
1717
- if (strip_ws) {
1718
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1719
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1720
- }
1721
-
1722
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1723
-
1724
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1629
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1725
1630
 
1726
1631
  if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1727
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1632
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1728
1633
  all_blank = false;
1729
1634
  }
1730
1635
  element_count++;
@@ -1772,20 +1677,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1772
1677
  p++;
1773
1678
  } else {
1774
1679
  /* closing quote: only valid if followed by col_sep, row_sep, or end */
1775
- bool valid_close = (p + 1 >= endP);
1776
- if (!valid_close) {
1777
- valid_close = true;
1778
- for (long j = 0; j < col_sep_len; j++) {
1779
- if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
1780
- }
1781
- }
1782
- if (!valid_close && row_sep_len2 > 0) {
1783
- valid_close = true;
1784
- for (long j = 0; j < row_sep_len2; j++) {
1785
- if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
1786
- }
1787
- }
1788
- if (valid_close) {
1680
+ if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
1789
1681
  in_quotes = false;
1790
1682
  field_started = true;
1791
1683
  }
@@ -1820,10 +1712,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1820
1712
  section5_done_ctx:;
1821
1713
  /* Unclosed quote at end of line — signal multiline continuation */
1822
1714
  if (!did_early_exit && in_quotes) {
1823
- VALUE result = rb_ary_new_capa(2);
1824
- rb_ary_push(result, Qnil);
1825
- rb_ary_push(result, LONG2FIX(-1));
1826
- return result;
1715
+ return return_parser_result(Qnil, -1);
1827
1716
  }
1828
1717
 
1829
1718
  /* Process the last field — skip on early exit */
@@ -1831,26 +1720,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1831
1720
  long field_len = endP - startP;
1832
1721
  char *raw_field = startP;
1833
1722
 
1834
- bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1835
- if (quoted) {
1836
- raw_field++;
1837
- field_len -= 2;
1838
- }
1839
-
1840
- char *trim_start = raw_field;
1841
- char *trim_end = raw_field + field_len - 1;
1842
-
1843
- if (strip_ws) {
1844
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1845
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1846
- }
1847
-
1848
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1849
-
1850
- bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1723
+ extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
1851
1724
 
1852
1725
  if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1853
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1726
+ if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
1854
1727
  all_blank = false;
1855
1728
  }
1856
1729
  element_count++;
@@ -1862,10 +1735,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1862
1735
  * ---------------------------------------- */
1863
1736
  if (all_blank) {
1864
1737
  if (remove_empty) {
1865
- VALUE result = rb_ary_new_capa(2);
1866
- rb_ary_push(result, Qnil);
1867
- rb_ary_push(result, LONG2FIX(element_count));
1868
- return result;
1738
+ return return_parser_result(Qnil, element_count);
1869
1739
  }
1870
1740
 
1871
1741
  ensure_hash_allocated(&xform);
@@ -1886,10 +1756,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
1886
1756
  /* ----------------------------------------
1887
1757
  * SECTION 8: Return result
1888
1758
  * ---------------------------------------- */
1889
- VALUE result = rb_ary_new_capa(2);
1890
- rb_ary_push(result, xform.hash);
1891
- rb_ary_push(result, LONG2FIX(element_count));
1892
- return result;
1759
+ return return_parser_result(xform.hash, element_count);
1893
1760
  }
1894
1761
 
1895
1762
  // Count quote characters in a line, optionally respecting backslash escapes.