smarter_csv 1.15.2 → 1.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +9 -0
  3. data/CHANGELOG.md +68 -1
  4. data/CONTRIBUTORS.md +3 -1
  5. data/Gemfile +1 -0
  6. data/README.md +123 -27
  7. data/docs/_introduction.md +40 -24
  8. data/docs/bad_row_quarantine.md +285 -0
  9. data/docs/basic_read_api.md +151 -9
  10. data/docs/basic_write_api.md +474 -59
  11. data/docs/batch_processing.md +161 -4
  12. data/docs/column_selection.md +183 -0
  13. data/docs/data_transformations.md +162 -29
  14. data/docs/examples.md +339 -46
  15. data/docs/header_transformations.md +93 -12
  16. data/docs/header_validations.md +56 -18
  17. data/docs/history.md +117 -0
  18. data/docs/instrumentation.md +165 -0
  19. data/docs/migrating_from_csv.md +290 -0
  20. data/docs/options.md +150 -87
  21. data/docs/parsing_strategy.md +63 -1
  22. data/docs/real_world_csv.md +262 -0
  23. data/docs/releases/1.16.0/benchmarks.md +223 -0
  24. data/docs/releases/1.16.0/changes.md +272 -0
  25. data/docs/releases/1.16.0/performance_notes.md +114 -0
  26. data/docs/row_col_sep.md +14 -5
  27. data/docs/value_converters.md +193 -57
  28. data/ext/smarter_csv/extconf.rb +3 -0
  29. data/ext/smarter_csv/smarter_csv.c +1007 -71
  30. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
  31. data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
  32. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
  33. data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
  34. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
  35. data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
  36. data/lib/smarter_csv/errors.rb +8 -0
  37. data/lib/smarter_csv/file_io.rb +1 -1
  38. data/lib/smarter_csv/hash_transformations.rb +14 -13
  39. data/lib/smarter_csv/header_transformations.rb +21 -2
  40. data/lib/smarter_csv/headers.rb +2 -1
  41. data/lib/smarter_csv/options.rb +124 -7
  42. data/lib/smarter_csv/parser.rb +362 -75
  43. data/lib/smarter_csv/reader.rb +494 -46
  44. data/lib/smarter_csv/version.rb +1 -1
  45. data/lib/smarter_csv/writer.rb +71 -19
  46. data/lib/smarter_csv.rb +95 -12
  47. data/smarter_csv.gemspec +20 -10
  48. metadata +37 -80
@@ -37,7 +37,101 @@ VALUE Qempty_string = Qnil;
37
37
  static ID id_col_sep, id_quote_char, id_row_sep, id_missing_header_prefix;
38
38
  static ID id_strip_whitespace, id_remove_empty_hashes, id_remove_empty_values;
39
39
  static ID id_quote_escaping, id_convert_values_to_numeric, id_remove_zero_values;
40
- static ID id_only, id_except;
40
+ static ID id_only, id_except, id_quote_boundary;
41
+ static ID id_only_headers, id_except_headers, id_keep_cols, id_strict;
42
+ static ID id_keep_bitmap, id_keep_extra_cols, id_early_exit_after_sym;
43
+ static ID id_backslash, id_standard;
44
+
45
+ /* ================================================================================
46
+ * ParseContext — wraps all per-file parse options as a GC-managed TypedData object.
47
+ *
48
+ * Building a context once after headers are loaded eliminates the ~10 rb_hash_aref
49
+ * calls that rb_parse_line_to_hash performs on every row. The hot path calls
50
+ * parse_line_to_hash_ctx_c(line, ctx) instead of parse_line_to_hash_c(line, headers, opts).
51
+ * ================================================================================ */
52
+ typedef struct {
53
+ /* Separator and quoting config — copied into C buffers, no Ruby GC tracking needed */
54
+ char col_sep_buf[8];
55
+ int col_sep_len;
56
+ char quote_char_val;
57
+ char row_sep_buf[16];
58
+ int row_sep_len;
59
+ char prefix_buf[64];
60
+ const char *prefix_str; /* "column_" literal or points into prefix_buf */
61
+
62
+ /* Boolean parse flags */
63
+ bool strip_ws;
64
+ bool remove_empty;
65
+ bool remove_empty_values;
66
+ bool remove_zero_values;
67
+ bool allow_escaped_quotes; /* quote_escaping == :backslash */
68
+ bool quote_boundary_standard;
69
+
70
+ /* Numeric conversion: 0=off, 1=all, 2=only listed keys, 3=except listed keys */
71
+ int numeric_mode;
72
+
73
+ /* Column filter bitmap (xmalloc'd; NULL when no filtering active) */
74
+ bool *keep_bitmap;
75
+ long keep_bitmap_len;
76
+ bool keep_extra_columns;
77
+ bool has_only;
78
+ long early_exit_after; /* column index after which we stop; -1 = no early exit */
79
+
80
+ /* Hash allocation hint (set once at context creation) */
81
+ long hash_capa;
82
+
83
+ /* GC-tracked Ruby values — must be marked in the mark callback */
84
+ VALUE headers;
85
+ VALUE numeric_keys; /* Qnil when not used */
86
+ } parse_context_t;
87
+
88
+ __attribute__((cold)) static void parse_context_mark(void *ptr) {
89
+ parse_context_t *ctx = (parse_context_t *)ptr;
90
+ #if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7))
91
+ rb_gc_mark_movable(ctx->headers);
92
+ rb_gc_mark_movable(ctx->numeric_keys);
93
+ #else
94
+ rb_gc_mark(ctx->headers);
95
+ if (!NIL_P(ctx->numeric_keys)) rb_gc_mark(ctx->numeric_keys);
96
+ #endif
97
+ }
98
+
99
+ #if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7))
100
+ __attribute__((cold)) static void parse_context_compact(void *ptr) {
101
+ parse_context_t *ctx = (parse_context_t *)ptr;
102
+ ctx->headers = rb_gc_location(ctx->headers);
103
+ ctx->numeric_keys = rb_gc_location(ctx->numeric_keys);
104
+ }
105
+ #endif
106
+
107
+ __attribute__((cold)) static void parse_context_free(void *ptr) {
108
+ parse_context_t *ctx = (parse_context_t *)ptr;
109
+ if (ctx->keep_bitmap) xfree(ctx->keep_bitmap);
110
+ xfree(ctx);
111
+ }
112
+
113
+ __attribute__((cold)) static size_t parse_context_memsize(const void *ptr) {
114
+ const parse_context_t *ctx = (const parse_context_t *)ptr;
115
+ size_t sz = sizeof(parse_context_t);
116
+ if (ctx->keep_bitmap) sz += (size_t)ctx->keep_bitmap_len * sizeof(bool);
117
+ return sz;
118
+ }
119
+
120
+ static const rb_data_type_t parse_context_type = {
121
+ "SmarterCSV::ParseContext",
122
+ {
123
+ parse_context_mark,
124
+ parse_context_free,
125
+ parse_context_memsize,
126
+ #if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7))
127
+ parse_context_compact,
128
+ #else
129
+ 0,
130
+ #endif
131
+ },
132
+ 0, 0,
133
+ RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
134
+ };
41
135
 
42
136
  static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
43
137
  char *buf = ALLOC_N(char, len);
@@ -55,7 +149,7 @@ static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *
55
149
  return out;
56
150
  }
57
151
 
58
- static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val) {
152
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val, VALUE quote_boundary_standard_val, VALUE row_sep_val) {
59
153
  if (RB_TYPE_P(line, T_NIL) == 1) {
60
154
  return rb_ary_new();
61
155
  }
@@ -91,6 +185,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
91
185
  bool has_quotes = RTEST(has_quotes_val);
92
186
  bool strip_ws = RTEST(strip_ws_val);
93
187
  bool allow_escaped_quotes = RTEST(allow_escaped_quotes_val);
188
+ bool quote_boundary_standard = RTEST(quote_boundary_standard_val);
189
+
190
+ char *row_sepP = (RB_TYPE_P(row_sep_val, T_STRING)) ? RSTRING_PTR(row_sep_val) : NULL;
191
+ long row_sep_len = (row_sepP) ? RSTRING_LEN(row_sep_val) : 0;
94
192
 
95
193
  // === FAST PATH: No quotes and single-character separator ===
96
194
  if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
@@ -147,6 +245,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
147
245
  long backslash_count = 0;
148
246
  bool in_quotes = false;
149
247
  bool col_sep_found = true;
248
+ bool field_started = false; // for quote_boundary_standard: true once field has non-boundary content
150
249
 
151
250
  while (p < endP) {
152
251
  col_sep_found = true;
@@ -195,13 +294,53 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
195
294
  p += col_sep_len;
196
295
  startP = p;
197
296
  backslash_count = 0;
297
+ field_started = false; // reset for next field
198
298
  } else {
199
299
  if (allow_escaped_quotes && *p == '\\') {
200
300
  backslash_count++;
301
+ if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) field_started = true;
201
302
  } else {
202
303
  if (*p == quote_char_val) {
203
304
  if (!allow_escaped_quotes || backslash_count % 2 == 0) {
204
- in_quotes = !in_quotes;
305
+ if (__builtin_expect(quote_boundary_standard, 1)) {
306
+ if (in_quotes) {
307
+ // closing quote: only valid if followed by col_sep, row_sep, or end of line
308
+ bool valid_close = (p + 1 >= endP);
309
+ if (!valid_close) {
310
+ valid_close = true;
311
+ for (long j = 0; j < col_sep_len; j++) {
312
+ if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
313
+ }
314
+ }
315
+ if (!valid_close && row_sep_len > 0) {
316
+ valid_close = true;
317
+ for (long j = 0; j < row_sep_len; j++) {
318
+ if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
319
+ }
320
+ }
321
+ if (valid_close) {
322
+ in_quotes = false;
323
+ field_started = true;
324
+ }
325
+ // else: quote inside quoted field → literal (handles "" doubling)
326
+ } else if (!field_started) {
327
+ in_quotes = true; // opening quote at field boundary
328
+ field_started = true;
329
+ }
330
+ // else: mid-field quote → treat as literal
331
+ } else {
332
+ in_quotes = !in_quotes;
333
+ }
334
+ }
335
+ } else if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) {
336
+ if (strip_ws) {
337
+ if (*p != ' ' && *p != '\t') {
338
+ field_started = true;
339
+ } else if (!field_started) {
340
+ startP = p + 1; /* advance past leading whitespace so quote-detection at extraction sees the quote */
341
+ }
342
+ } else {
343
+ field_started = true;
205
344
  }
206
345
  }
207
346
  backslash_count = 0;
@@ -393,7 +532,7 @@ static inline void ensure_hash_allocated(field_transform_opts *opts) {
393
532
  * Returns: true if a non-blank value was inserted, false otherwise.
394
533
  * (Used to track all_blank for remove_empty_hashes.)
395
534
  */
396
- static inline bool insert_field_into_hash(
535
+ static inline __attribute__((always_inline)) bool insert_field_into_hash(
397
536
  field_transform_opts *opts,
398
537
  char *trim_start, long trimmed_len,
399
538
  long element_count, bool is_quoted,
@@ -513,7 +652,7 @@ static inline bool insert_field_into_hash(
513
652
  * Input: line = "john,25,boston,extra" (more fields than headers)
514
653
  * Output: [{name: "john", age: "25", city: "boston", column_4: "extra"}, 4]
515
654
  */
516
- static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE options_hash) {
655
+ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE options_hash) {
517
656
 
518
657
  /* ----------------------------------------
519
658
  * SECTION 1: Handle nil/invalid input
@@ -538,7 +677,6 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
538
677
  VALUE col_sep = rb_hash_aref(options_hash, ID2SYM(id_col_sep));
539
678
  VALUE quote_char = rb_hash_aref(options_hash, ID2SYM(id_quote_char));
540
679
  VALUE header_prefix = rb_hash_aref(options_hash, ID2SYM(id_missing_header_prefix));
541
- VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
542
680
  bool strip_ws = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strip_whitespace)));
543
681
  bool remove_empty = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_hashes)));
544
682
  bool remove_empty_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_values)));
@@ -565,11 +703,10 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
565
703
  }
566
704
  }
567
705
 
568
- // Determine if backslash-escaped quotes are allowed
569
- bool allow_escaped_quotes = false;
570
- if (RB_TYPE_P(quote_escaping_val, T_SYMBOL)) {
571
- allow_escaped_quotes = (SYM2ID(quote_escaping_val) == rb_intern("backslash"));
572
- }
706
+ // quote_escaping and quote_boundary are only needed in Section 5 (quoted/slow path).
707
+ // They are declared here as forward declarations so Section 5 can set them lazily.
708
+ bool allow_escaped_quotes = false; // set in Section 5 on first entry
709
+ bool quote_boundary_standard = false; // set in Section 5 on first entry
573
710
 
574
711
  rb_encoding *encoding = rb_enc_get(line); // Preserve string encoding
575
712
  char *startP = RSTRING_PTR(line); // Pointer to start of current field
@@ -577,7 +714,8 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
577
714
  char *endP = startP + line_len; // End of line marker
578
715
  char *p = startP; // Current parsing position
579
716
 
580
- // Chomp: strip trailing row separator (pointer adjustment, no string mutation)
717
+ // Chomp: strip trailing row separator (pointer adjustment, no string mutation).
718
+ // row_sep is also reused in Section 5 for the closing-quote boundary check.
581
719
  VALUE row_sep = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
582
720
  if (!NIL_P(row_sep) && RB_TYPE_P(row_sep, T_STRING)) {
583
721
  char *row_sepP = RSTRING_PTR(row_sep);
@@ -600,6 +738,112 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
600
738
  // Optimization hint: check if line contains quote characters
601
739
  bool has_quotes = (memchr(startP, quote_char_val, line_len) != NULL);
602
740
 
741
+ /* ----------------------------------------
742
+ * Column-filter bitmap for only_headers: / except_headers:
743
+ * ----------------------------------------
744
+ * keep_bitmap[i] = true → include column i in the output hash
745
+ * keep_bitmap[i] = false → skip column i (no Ruby allocation at all)
746
+ * NULL when no filter is active — zero overhead on common path.
747
+ *
748
+ * Preferred source: options[:_keep_cols] — a Ruby Array of true/false values
749
+ * precomputed once in reader.rb after headers are loaded (O(1) Set lookups).
750
+ * Copying it here is O(headers_len) with O(1) per element — no rb_ary_includes.
751
+ *
752
+ * Fallback: build from only_headers/except_headers via rb_ary_includes (O(k)
753
+ * per column, k = filter list length). Used only when _keep_cols is absent.
754
+ *
755
+ * Capped at 4096 columns; wider CSVs fall back to the Ruby-side
756
+ * hash.select!/hash.reject! filter applied after return.
757
+ *
758
+ * The bitmap is a loop invariant: headers and filter settings never change between rows.
759
+ * reader.rb precomputes it once as a packed binary String (_keep_bitmap) and also
760
+ * pre-stores keep_extra_cols and early_exit_after, so C just does 3 hash lookups +
761
+ * one memcpy instead of N rb_ary_entry calls on every row.
762
+ *
763
+ * alloca() keeps the allocation conditional: no-filter path never calls alloca(), so
764
+ * the frame stays well below 4 KB and ___chkstk_darwin never fires on ARM64 macOS.
765
+ */
766
+ bool *keep_bitmap = NULL;
767
+ bool keep_extra_columns = true; /* extra cols (> headers_len): keep by default */
768
+ bool has_only = false; /* true when only_headers: filtering is active */
769
+ long early_exit_after = -1; /* column index after which we stop; -1 = no early exit */
770
+
771
+ /* Column-filter bitmap setup.
772
+ *
773
+ * _keep_cols is the gate key — checked with a single rb_hash_aref on every row:
774
+ * false (default) → no filtering; skip everything instantly. ← COMMON CASE, zero overhead
775
+ * nil → filter active (reader.rb path): check _keep_bitmap for the fast bitmap.
776
+ * Array → backward-compat: direct C API callers passing _keep_cols as an Array.
777
+ *
778
+ * When _keep_cols is absent from the hash (nil from rb_hash_aref), it falls through to
779
+ * deriving the bitmap from only_headers/except_headers directly (manual options hashes).
780
+ *
781
+ * only_headers: / except_headers: are RARELY used options. The common path (no filtering)
782
+ * pays exactly one rb_hash_aref and nothing else.
783
+ */
784
+ VALUE keep_cols_val = rb_hash_aref(options_hash, ID2SYM(id_keep_cols));
785
+ if (keep_cols_val != Qfalse) {
786
+ /* Not false: either nil (filter active / absent) or Array (backward-compat). */
787
+ if (NIL_P(keep_cols_val)) {
788
+ /* nil: reader.rb filter path — check _keep_bitmap, or fall back to deriving it. */
789
+ VALUE prebuilt_bitmap = rb_hash_aref(options_hash, ID2SYM(id_keep_bitmap));
790
+ if (RB_TYPE_P(prebuilt_bitmap, T_STRING)
791
+ && headers_len > 0 && RSTRING_LEN(prebuilt_bitmap) >= headers_len) {
792
+ /* Precomputed binary bitmap from reader.rb — one memcpy replaces N rb_ary_entry calls.
793
+ * Copy before any Ruby API calls that could trigger GC compaction. */
794
+ keep_bitmap = (bool *)alloca((size_t)headers_len * sizeof(bool));
795
+ memcpy(keep_bitmap, RSTRING_PTR(prebuilt_bitmap), (size_t)headers_len * sizeof(bool));
796
+ VALUE kec = rb_hash_aref(options_hash, ID2SYM(id_keep_extra_cols));
797
+ keep_extra_columns = NIL_P(kec) ? true : RTEST(kec);
798
+ VALUE exa = rb_hash_aref(options_hash, ID2SYM(id_early_exit_after_sym));
799
+ early_exit_after = RB_INTEGER_TYPE_P(exa) ? NUM2LONG(exa) : -1;
800
+ has_only = !keep_extra_columns;
801
+ } else if (headers_len > 0 && headers_len <= 4096) {
802
+ /* Last resort: derive from only_headers/except_headers directly.
803
+ * Only reached when options hash is built manually without any _keep_* keys. */
804
+ VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
805
+ VALUE except_hdrs = rb_hash_aref(options_hash, ID2SYM(id_except_headers));
806
+ bool has_except = RB_TYPE_P(except_hdrs, T_ARRAY) && RARRAY_LEN(except_hdrs) > 0;
807
+ has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
808
+ if (has_only || has_except) {
809
+ keep_bitmap = (bool *)alloca((size_t)headers_len * sizeof(bool));
810
+ for (long bi = 0; bi < headers_len; bi++) {
811
+ VALUE hdr = rb_ary_entry(headers, bi);
812
+ keep_bitmap[bi] = has_only
813
+ ? (rb_ary_includes(only_hdrs, hdr) == Qtrue)
814
+ : (rb_ary_includes(except_hdrs, hdr) != Qtrue);
815
+ }
816
+ keep_extra_columns = !has_only;
817
+ bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
818
+ if (has_only && !strict) {
819
+ for (long bi = headers_len - 1; bi >= 0; bi--) {
820
+ if (keep_bitmap[bi]) { early_exit_after = bi; break; }
821
+ }
822
+ }
823
+ }
824
+ }
825
+ } else if (RB_TYPE_P(keep_cols_val, T_ARRAY) && headers_len > 0 && headers_len <= 4096) {
826
+ /* Backward-compat: _keep_cols Array from direct C API callers — O(headers_len) Ruby calls */
827
+ keep_bitmap = (bool *)alloca((size_t)headers_len * sizeof(bool));
828
+ long prebuilt_len = RARRAY_LEN(keep_cols_val);
829
+ for (long bi = 0; bi < headers_len; bi++) {
830
+ keep_bitmap[bi] = bi < prebuilt_len ? RTEST(rb_ary_entry(keep_cols_val, bi)) : false;
831
+ }
832
+ VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
833
+ has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
834
+ keep_extra_columns = !has_only;
835
+ bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
836
+ if (has_only && !strict) {
837
+ for (long bi = headers_len - 1; bi >= 0; bi--) {
838
+ if (keep_bitmap[bi]) { early_exit_after = bi; break; }
839
+ }
840
+ }
841
+ }
842
+ }
843
+ /* else: _keep_cols is false — no filtering, keep_bitmap stays NULL. COMMON CASE. */
844
+
845
+ bool did_early_exit = false; /* set to true when early exit fires */
846
+
603
847
  /* ----------------------------------------
604
848
  * SECTION 3: Initialize hash and tracking variables
605
849
  * ----------------------------------------
@@ -637,46 +881,79 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
637
881
  char sep = *col_sepP;
638
882
  char *sep_pos = NULL;
639
883
 
640
- /* Loop through each field by finding separator positions */
641
- while ((sep_pos = memchr(p, sep, endP - p))) {
642
- // Extract field boundaries
643
- long field_len = sep_pos - startP;
644
- char *trim_start = startP;
645
- char *trim_end = startP + field_len - 1;
646
-
647
- // Optional whitespace trimming (spaces and tabs only)
648
- if (strip_ws) {
649
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
650
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
884
+ /* Loop through each field by finding separator positions.
885
+ * Two sub-paths to avoid per-field overhead in the common case:
886
+ * (a) no filter + no early exit → pure memchr loop, zero extra branches
887
+ * (b) filter active → bitmap/early-exit checks per field
888
+ */
889
+ if (__builtin_expect(keep_bitmap == NULL && early_exit_after < 0, 1)) {
890
+ /* --- (a) Common path: no column filter, no early exit --- */
891
+ while ((sep_pos = memchr(p, sep, endP - p))) {
892
+ long field_len = sep_pos - startP;
893
+ char *trim_start = startP;
894
+ char *trim_end = startP + field_len - 1;
895
+ if (strip_ws) {
896
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
897
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
898
+ }
899
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
900
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
901
+ all_blank = false;
902
+ element_count++;
903
+ p = sep_pos + 1; startP = p;
651
904
  }
652
-
653
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
654
-
655
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
656
- all_blank = false;
657
- element_count++;
658
-
659
- // Move to next field
660
- p = sep_pos + 1;
661
- startP = p;
662
- }
663
-
664
- /* Process the last field (no separator after it) */
665
- {
666
- long field_len = endP - startP;
667
- char *trim_start = startP;
668
- char *trim_end = startP + field_len - 1;
669
-
670
- if (strip_ws) {
671
- while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
672
- while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
905
+ /* Process last field */
906
+ {
907
+ long field_len = endP - startP;
908
+ char *trim_start = startP;
909
+ char *trim_end = startP + field_len - 1;
910
+ if (strip_ws) {
911
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
912
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
913
+ }
914
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
915
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
916
+ all_blank = false;
917
+ element_count++;
918
+ }
919
+ } else {
920
+ /* --- (b) Filter path: column bitmap and/or early exit active --- */
921
+ while ((sep_pos = memchr(p, sep, endP - p))) {
922
+ long field_len = sep_pos - startP;
923
+ char *trim_start = startP;
924
+ char *trim_end = startP + field_len - 1;
925
+ if (strip_ws) {
926
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
927
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
928
+ }
929
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
930
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
931
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
932
+ all_blank = false;
933
+ }
934
+ element_count++;
935
+ if (early_exit_after >= 0 && element_count > early_exit_after) {
936
+ did_early_exit = true;
937
+ break;
938
+ }
939
+ p = sep_pos + 1; startP = p;
940
+ }
941
+ /* Process last field — skip on early exit */
942
+ if (!did_early_exit) {
943
+ long field_len = endP - startP;
944
+ char *trim_start = startP;
945
+ char *trim_end = startP + field_len - 1;
946
+ if (strip_ws) {
947
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
948
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
949
+ }
950
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
951
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
952
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
953
+ all_blank = false;
954
+ }
955
+ element_count++;
673
956
  }
674
-
675
- long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
676
-
677
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
678
- all_blank = false;
679
- element_count++;
680
957
  }
681
958
 
682
959
  } else {
@@ -689,25 +966,59 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
689
966
  * - Escaped quotes using backslash: \"
690
967
  *
691
968
  * We must scan character-by-character to track quote state.
969
+ *
970
+ * quote_escaping and quote_boundary options are only needed here (Section 4
971
+ * fast path never touches them), so we extract them lazily on first Section 5 entry.
692
972
  */
973
+ VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
974
+ if (RB_TYPE_P(quote_escaping_val, T_SYMBOL)) {
975
+ allow_escaped_quotes = (SYM2ID(quote_escaping_val) == id_backslash);
976
+ }
977
+ VALUE quote_boundary_val = rb_hash_aref(options_hash, ID2SYM(id_quote_boundary));
978
+ quote_boundary_standard = (RB_TYPE_P(quote_boundary_val, T_SYMBOL) &&
979
+ SYM2ID(quote_boundary_val) == id_standard);
980
+ /* row_sep reused from chomp above for the closing-quote boundary check */
981
+ char *row_sepP2 = (RB_TYPE_P(row_sep, T_STRING)) ? RSTRING_PTR(row_sep) : NULL;
982
+ long row_sep_len2 = (row_sepP2) ? RSTRING_LEN(row_sep) : 0;
983
+
984
+ /* Opt #5 (C-side): if backslash mode is requested but the (chomped) line contains
985
+ * no backslash character, backslash escaping cannot possibly affect parsing — a
986
+ * backslash only matters immediately before a quote char. Downgrade to RFC mode
987
+ * so the memchr-inside-quotes optimisation fires unconditionally for such lines.
988
+ * This replaces the Ruby-side line.include?('\\') pre-scan that was on the hot
989
+ * path: now the check happens here in C (one fast memchr), and only for lines
990
+ * that actually reach Section 5 (i.e. lines that contain quote characters).
991
+ * Unquoted lines never enter Section 5, so they pay zero cost for this check. */
992
+ if (allow_escaped_quotes && !memchr(startP, '\\', endP - startP)) {
993
+ allow_escaped_quotes = false;
994
+ }
995
+
693
996
  long i;
694
997
  long backslash_count = 0; // Track consecutive backslashes for escape detection
695
998
  bool in_quotes = false; // Are we inside a quoted field?
696
999
  bool col_sep_found = true;
1000
+ bool field_started = false; // for quote_boundary_standard: true once field has non-boundary content
1001
+
1002
+ /* Cache first separator byte for fast pre-filtering */
1003
+ char sep_char_slow = *col_sepP;
697
1004
 
698
1005
  /* Scan through the line character by character */
699
1006
  while (p < endP) {
700
- // Check if current position matches the column separator
701
- col_sep_found = true;
702
- for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
703
- if (*(p + i) != *(col_sepP + i)) {
704
- col_sep_found = false;
705
- break;
1007
+ // Separator check: when in_quotes we can never be at a field boundary,
1008
+ // so skip the comparison entirely.
1009
+ // For single-char separator: direct byte compare.
1010
+ // For multi-char separator: pre-filter on first byte, then check the rest.
1011
+ if (!in_quotes && *p == sep_char_slow) {
1012
+ col_sep_found = true;
1013
+ for (i = 1; (i < col_sep_len) && (p + i < endP); i++) {
1014
+ if (*(p + i) != *(col_sepP + i)) { col_sep_found = false; break; }
706
1015
  }
1016
+ } else {
1017
+ col_sep_found = false;
707
1018
  }
708
1019
 
709
- // Found separator and not inside quotes = end of field
710
- if (col_sep_found && !in_quotes) {
1020
+ // Found separator — !in_quotes is guaranteed by the block above
1021
+ if (col_sep_found) {
711
1022
  long field_len = p - startP;
712
1023
  char *raw_field = startP;
713
1024
 
@@ -731,25 +1042,83 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
731
1042
  // Determine if field contains embedded quotes (need unescape)
732
1043
  bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
733
1044
 
734
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
735
- all_blank = false;
1045
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1046
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1047
+ all_blank = false;
1048
+ }
736
1049
  element_count++;
737
1050
 
1051
+ /* Early exit: all required columns already collected — stop scanning */
1052
+ if (early_exit_after >= 0 && element_count > early_exit_after) {
1053
+ did_early_exit = true;
1054
+ goto section5_done;
1055
+ }
1056
+
738
1057
  // Move past the separator to start of next field
739
1058
  p += col_sep_len;
740
1059
  startP = p;
741
1060
  backslash_count = 0;
1061
+ field_started = false; // reset for next field
742
1062
 
743
1063
  } else {
744
1064
  /* Not at a separator (or inside quotes) - track quote state */
745
1065
 
1066
+ /* RFC mode: inside quoted field, skip ahead to the next quote char.
1067
+ * Everything between here and the next quote is plain field content — no
1068
+ * separators or backslashes can appear (allow_escaped_quotes is false).
1069
+ * memchr() is SIMD-accelerated and handles typical field lengths in 1 call. */
1070
+ if (!allow_escaped_quotes && in_quotes) {
1071
+ char *next_quote = (char *)memchr(p, quote_char_val, endP - p);
1072
+ if (!next_quote) { p = endP; continue; } /* no closing quote → unclosed */
1073
+ p = next_quote; /* jump to quote char; fall through to quote-handling code */
1074
+ }
1075
+
746
1076
  if (allow_escaped_quotes && *p == '\\') {
747
1077
  // Count consecutive backslashes for escape sequence detection
748
1078
  backslash_count++;
1079
+ if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) field_started = true;
749
1080
  } else {
750
1081
  if (*p == quote_char_val) {
751
1082
  if (!allow_escaped_quotes || backslash_count % 2 == 0) {
752
- in_quotes = !in_quotes;
1083
+ if (__builtin_expect(quote_boundary_standard, 1)) {
1084
+ if (in_quotes) {
1085
+ // closing quote: only valid if followed by col_sep, row_sep, or end of line
1086
+ bool valid_close = (p + 1 >= endP);
1087
+ if (!valid_close) {
1088
+ valid_close = true;
1089
+ for (long j = 0; j < col_sep_len; j++) {
1090
+ if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
1091
+ }
1092
+ }
1093
+ if (!valid_close && row_sep_len2 > 0) {
1094
+ valid_close = true;
1095
+ for (long j = 0; j < row_sep_len2; j++) {
1096
+ if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
1097
+ }
1098
+ }
1099
+ if (valid_close) {
1100
+ in_quotes = false;
1101
+ field_started = true;
1102
+ }
1103
+ // else: quote inside quoted field → literal (handles "" doubling)
1104
+ } else if (!field_started) {
1105
+ in_quotes = true; // opening quote at field boundary
1106
+ field_started = true;
1107
+ }
1108
+ // else: mid-field quote → treat as literal
1109
+ } else {
1110
+ in_quotes = !in_quotes;
1111
+ }
1112
+ }
1113
+ } else if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) {
1114
+ if (strip_ws) {
1115
+ if (*p != ' ' && *p != '\t') {
1116
+ field_started = true;
1117
+ } else if (!field_started) {
1118
+ startP = p + 1; /* advance past leading whitespace so quote-detection at extraction sees the quote */
1119
+ }
1120
+ } else {
1121
+ field_started = true;
753
1122
  }
754
1123
  }
755
1124
  backslash_count = 0;
@@ -758,13 +1127,20 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
758
1127
  }
759
1128
  }
760
1129
 
761
- // Error: unclosed quote at end of line
762
- if (in_quotes) {
763
- rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
1130
+ section5_done:;
1131
+ /* Unclosed quote at end of line (skip check on early exit):
1132
+ * Signal "needs more data" the caller stitches the next physical line and re-parses.
1133
+ * We return [nil, -1] rather than raising so the read loop can handle multiline fields
1134
+ * without a separate pre-scan pass (detect_multiline). */
1135
+ if (!did_early_exit && in_quotes) {
1136
+ VALUE result = rb_ary_new_capa(2);
1137
+ rb_ary_push(result, Qnil);
1138
+ rb_ary_push(result, LONG2FIX(-1));
1139
+ return result;
764
1140
  }
765
1141
 
766
- /* Process the last field (same logic as above) */
767
- {
1142
+ /* Process the last field (same logic as above) — skip on early exit */
1143
+ if (!did_early_exit) {
768
1144
  long field_len = endP - startP;
769
1145
  char *raw_field = startP;
770
1146
 
@@ -786,8 +1162,10 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
786
1162
 
787
1163
  bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
788
1164
 
789
- if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
790
- all_blank = false;
1165
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1166
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1167
+ all_blank = false;
1168
+ }
791
1169
  element_count++;
792
1170
  }
793
1171
  }
@@ -817,7 +1195,9 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
817
1195
  if (!remove_empty_values) {
818
1196
  ensure_hash_allocated(&xform);
819
1197
  for (long i = element_count; i < headers_len; i++) {
820
- rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
1198
+ if (!keep_bitmap || keep_bitmap[i]) {
1199
+ rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
1200
+ }
821
1201
  }
822
1202
  }
823
1203
 
@@ -833,6 +1213,550 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
833
1213
  return result;
834
1214
  }
835
1215
 
1216
+ /* ================================================================================
1217
+ * new_parse_context_c(headers, options_hash) → ParseContext
1218
+ *
1219
+ * Extracts all loop-invariant options from the options_hash once and stores them
1220
+ * in a C struct wrapped as a TypedData Ruby object. Called once per file after
1221
+ * headers are known. The returned context is passed to parse_line_to_hash_ctx_c
1222
+ * on every row, eliminating ~10 rb_hash_aref calls per row.
1223
+ * ================================================================================ */
1224
+ __attribute__((cold)) static VALUE rb_new_parse_context(VALUE self, VALUE headers, VALUE options_hash) {
1225
+ parse_context_t *ctx;
1226
+ VALUE ctx_obj = TypedData_Make_Struct(rb_cObject, parse_context_t, &parse_context_type, ctx);
1227
+
1228
+ /* Initialize all fields to safe defaults */
1229
+ memset(ctx, 0, sizeof(parse_context_t));
1230
+ ctx->headers = headers;
1231
+ ctx->numeric_keys = Qnil;
1232
+ ctx->keep_bitmap = NULL;
1233
+ ctx->early_exit_after = -1;
1234
+ ctx->keep_extra_columns = true;
1235
+
1236
+ /* col_sep */
1237
+ VALUE col_sep_val = rb_hash_aref(options_hash, ID2SYM(id_col_sep));
1238
+ if (RB_TYPE_P(col_sep_val, T_STRING)) {
1239
+ long len = RSTRING_LEN(col_sep_val);
1240
+ if (len > (long)(sizeof(ctx->col_sep_buf) - 1)) len = (long)(sizeof(ctx->col_sep_buf) - 1);
1241
+ memcpy(ctx->col_sep_buf, RSTRING_PTR(col_sep_val), (size_t)len);
1242
+ ctx->col_sep_buf[len] = '\0';
1243
+ ctx->col_sep_len = (int)len;
1244
+ } else {
1245
+ ctx->col_sep_buf[0] = ',';
1246
+ ctx->col_sep_buf[1] = '\0';
1247
+ ctx->col_sep_len = 1;
1248
+ }
1249
+
1250
+ /* quote_char */
1251
+ VALUE quote_char_v = rb_hash_aref(options_hash, ID2SYM(id_quote_char));
1252
+ ctx->quote_char_val = (RB_TYPE_P(quote_char_v, T_STRING) && RSTRING_LEN(quote_char_v) > 0)
1253
+ ? RSTRING_PTR(quote_char_v)[0] : '"';
1254
+
1255
+ /* row_sep */
1256
+ VALUE row_sep_v = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
1257
+ if (RB_TYPE_P(row_sep_v, T_STRING)) {
1258
+ long len = RSTRING_LEN(row_sep_v);
1259
+ if (len > (long)(sizeof(ctx->row_sep_buf) - 1)) len = (long)(sizeof(ctx->row_sep_buf) - 1);
1260
+ memcpy(ctx->row_sep_buf, RSTRING_PTR(row_sep_v), (size_t)len);
1261
+ ctx->row_sep_buf[len] = '\0';
1262
+ ctx->row_sep_len = (int)len;
1263
+ }
1264
+
1265
+ /* missing_header_prefix */
1266
+ VALUE header_prefix = rb_hash_aref(options_hash, ID2SYM(id_missing_header_prefix));
1267
+ if (NIL_P(header_prefix)) {
1268
+ ctx->prefix_str = "column_";
1269
+ } else {
1270
+ long len = RSTRING_LEN(header_prefix);
1271
+ if (len > (long)(sizeof(ctx->prefix_buf) - 1)) len = (long)(sizeof(ctx->prefix_buf) - 1);
1272
+ memcpy(ctx->prefix_buf, RSTRING_PTR(header_prefix), (size_t)len);
1273
+ ctx->prefix_buf[len] = '\0';
1274
+ ctx->prefix_str = ctx->prefix_buf;
1275
+ }
1276
+
1277
+ /* Boolean flags */
1278
+ ctx->strip_ws = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strip_whitespace)));
1279
+ ctx->remove_empty = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_hashes)));
1280
+ ctx->remove_empty_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_values)));
1281
+ ctx->remove_zero_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_zero_values)));
1282
+
1283
+ /* Numeric conversion */
1284
+ VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
1285
+ if (RTEST(convert_opt)) {
1286
+ if (RB_TYPE_P(convert_opt, T_HASH)) {
1287
+ VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
1288
+ VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
1289
+ if (RTEST(only_keys)) {
1290
+ ctx->numeric_mode = 2;
1291
+ ctx->numeric_keys = rb_Array(only_keys);
1292
+ } else if (RTEST(except_keys)) {
1293
+ ctx->numeric_mode = 3;
1294
+ ctx->numeric_keys = rb_Array(except_keys);
1295
+ }
1296
+ } else {
1297
+ ctx->numeric_mode = 1;
1298
+ }
1299
+ }
1300
+
1301
+ /* quote_escaping → allow_escaped_quotes */
1302
+ VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
1303
+ if (RB_TYPE_P(quote_escaping_val, T_SYMBOL)) {
1304
+ ctx->allow_escaped_quotes = (SYM2ID(quote_escaping_val) == id_backslash);
1305
+ }
1306
+
1307
+ /* quote_boundary */
1308
+ VALUE quote_boundary_val = rb_hash_aref(options_hash, ID2SYM(id_quote_boundary));
1309
+ ctx->quote_boundary_standard = (RB_TYPE_P(quote_boundary_val, T_SYMBOL) &&
1310
+ SYM2ID(quote_boundary_val) == id_standard);
1311
+
1312
+ /* Column filter bitmap */
1313
+ long headers_len = NIL_P(headers) ? 0 : RARRAY_LEN(headers);
1314
+ ctx->hash_capa = headers_len > 0 ? headers_len : 16;
1315
+
1316
+ VALUE keep_cols_val = rb_hash_aref(options_hash, ID2SYM(id_keep_cols));
1317
+ if (keep_cols_val != Qfalse) {
1318
+ if (NIL_P(keep_cols_val)) {
1319
+ /* nil: reader.rb filter path — check _keep_bitmap, or fall back to deriving it. */
1320
+ VALUE prebuilt_bitmap = rb_hash_aref(options_hash, ID2SYM(id_keep_bitmap));
1321
+ if (RB_TYPE_P(prebuilt_bitmap, T_STRING)
1322
+ && headers_len > 0 && RSTRING_LEN(prebuilt_bitmap) >= headers_len) {
1323
+ ctx->keep_bitmap = (bool *)xmalloc((size_t)headers_len * sizeof(bool));
1324
+ ctx->keep_bitmap_len = headers_len;
1325
+ memcpy(ctx->keep_bitmap, RSTRING_PTR(prebuilt_bitmap), (size_t)headers_len * sizeof(bool));
1326
+ VALUE kec = rb_hash_aref(options_hash, ID2SYM(id_keep_extra_cols));
1327
+ ctx->keep_extra_columns = NIL_P(kec) ? true : RTEST(kec);
1328
+ VALUE exa = rb_hash_aref(options_hash, ID2SYM(id_early_exit_after_sym));
1329
+ ctx->early_exit_after = RB_INTEGER_TYPE_P(exa) ? NUM2LONG(exa) : -1;
1330
+ ctx->has_only = !ctx->keep_extra_columns;
1331
+ } else if (headers_len > 0 && headers_len <= 4096) {
1332
+ /* Last resort: derive from only_headers/except_headers directly. */
1333
+ VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
1334
+ VALUE except_hdrs = rb_hash_aref(options_hash, ID2SYM(id_except_headers));
1335
+ bool has_except = RB_TYPE_P(except_hdrs, T_ARRAY) && RARRAY_LEN(except_hdrs) > 0;
1336
+ ctx->has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
1337
+ if (ctx->has_only || has_except) {
1338
+ ctx->keep_bitmap = (bool *)xmalloc((size_t)headers_len * sizeof(bool));
1339
+ ctx->keep_bitmap_len = headers_len;
1340
+ for (long bi = 0; bi < headers_len; bi++) {
1341
+ VALUE hdr = rb_ary_entry(headers, bi);
1342
+ ctx->keep_bitmap[bi] = ctx->has_only
1343
+ ? (rb_ary_includes(only_hdrs, hdr) == Qtrue)
1344
+ : (rb_ary_includes(except_hdrs, hdr) != Qtrue);
1345
+ }
1346
+ ctx->keep_extra_columns = !ctx->has_only;
1347
+ bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
1348
+ if (ctx->has_only && !strict) {
1349
+ for (long bi = headers_len - 1; bi >= 0; bi--) {
1350
+ if (ctx->keep_bitmap[bi]) { ctx->early_exit_after = bi; break; }
1351
+ }
1352
+ }
1353
+ }
1354
+ }
1355
+ } else if (RB_TYPE_P(keep_cols_val, T_ARRAY) && headers_len > 0 && headers_len <= 4096) {
1356
+ /* Backward-compat: _keep_cols Array from direct C API callers */
1357
+ ctx->keep_bitmap = (bool *)xmalloc((size_t)headers_len * sizeof(bool));
1358
+ ctx->keep_bitmap_len = headers_len;
1359
+ long prebuilt_len = RARRAY_LEN(keep_cols_val);
1360
+ for (long bi = 0; bi < headers_len; bi++) {
1361
+ ctx->keep_bitmap[bi] = bi < prebuilt_len ? RTEST(rb_ary_entry(keep_cols_val, bi)) : false;
1362
+ }
1363
+ VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
1364
+ ctx->has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
1365
+ ctx->keep_extra_columns = !ctx->has_only;
1366
+ bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
1367
+ if (ctx->has_only && !strict) {
1368
+ for (long bi = headers_len - 1; bi >= 0; bi--) {
1369
+ if (ctx->keep_bitmap[bi]) { ctx->early_exit_after = bi; break; }
1370
+ }
1371
+ }
1372
+ }
1373
+ }
1374
+ /* else: _keep_cols == false — no filtering; keep_bitmap stays NULL */
1375
+
1376
+ return ctx_obj;
1377
+ }
1378
+
1379
+ /* ================================================================================
1380
+ * parse_line_to_hash_ctx_c(line, ctx) → [hash, data_size]
1381
+ *
1382
+ * High-performance variant of parse_line_to_hash_c that reads all loop-invariant
1383
+ * options from a pre-built ParseContext object instead of calling rb_hash_aref on
1384
+ * every row. Eliminates ~10 rb_hash_aref calls per row from the critical path.
1385
+ *
1386
+ * ctx must be a ParseContext built by new_parse_context_c(headers, options_hash).
1387
+ * headers_len is re-read each call from RARRAY_LEN(ctx->headers) to handle extra
1388
+ * column growth without requiring a context rebuild.
1389
+ * ================================================================================ */
1390
+ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE line, VALUE ctx_obj) {
1391
+ parse_context_t *ctx;
1392
+ TypedData_Get_Struct(ctx_obj, parse_context_t, &parse_context_type, ctx);
1393
+
1394
+ /* ----------------------------------------
1395
+ * SECTION 1: Handle nil/invalid input
1396
+ * ---------------------------------------- */
1397
+ if (NIL_P(line)) {
1398
+ VALUE result = rb_ary_new_capa(2);
1399
+ rb_ary_push(result, Qnil);
1400
+ rb_ary_push(result, INT2FIX(0));
1401
+ return result;
1402
+ }
1403
+
1404
+ if (RB_TYPE_P(line, T_STRING) != 1) {
1405
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line_to_hash: line has to be a string or nil");
1406
+ }
1407
+
1408
+ /* ----------------------------------------
1409
+ * SECTION 2: Read options from context (zero rb_hash_aref calls)
1410
+ * ----------------------------------------
1411
+ * All loop-invariant options are read directly from the pre-built struct.
1412
+ * No Hash lookups. No Ruby object allocation. Pure C struct field reads.
1413
+ */
1414
+ char *col_sepP = ctx->col_sep_buf;
1415
+ long col_sep_len = (long)ctx->col_sep_len;
1416
+ char quote_char_val = ctx->quote_char_val;
1417
+ const char *prefix_str = ctx->prefix_str;
1418
+ bool strip_ws = ctx->strip_ws;
1419
+ bool remove_empty = ctx->remove_empty;
1420
+ bool remove_empty_values = ctx->remove_empty_values;
1421
+ bool remove_zero_values = ctx->remove_zero_values;
1422
+ int numeric_mode = ctx->numeric_mode;
1423
+ VALUE numeric_keys = ctx->numeric_keys;
1424
+ bool *keep_bitmap = ctx->keep_bitmap;
1425
+ bool keep_extra_columns = ctx->keep_extra_columns;
1426
+ long early_exit_after = ctx->early_exit_after;
1427
+
1428
+ /* allow_escaped_quotes starts from context; per-line Opt #5 may downgrade it */
1429
+ bool allow_escaped_quotes = ctx->allow_escaped_quotes;
1430
+ bool quote_boundary_standard = ctx->quote_boundary_standard;
1431
+
1432
+ rb_encoding *encoding = rb_enc_get(line);
1433
+ char *startP = RSTRING_PTR(line);
1434
+ long line_len = RSTRING_LEN(line);
1435
+ char *endP = startP + line_len;
1436
+ char *p = startP;
1437
+
1438
+ /* Chomp: strip trailing row separator (pointer adjustment, no string mutation) */
1439
+ if (ctx->row_sep_len > 0) {
1440
+ long rsl = (long)ctx->row_sep_len;
1441
+ if (line_len >= rsl && memcmp(endP - rsl, ctx->row_sep_buf, (size_t)rsl) == 0) {
1442
+ endP -= rsl;
1443
+ }
1444
+ }
1445
+
1446
+ /* Re-read headers_len each call to handle extra-column growth */
1447
+ long headers_len = NIL_P(ctx->headers) ? 0 : RARRAY_LEN(ctx->headers);
1448
+ VALUE headers = ctx->headers;
1449
+
1450
+ /* Check if line contains quote characters (per-line; cannot be precomputed) */
1451
+ bool has_quotes = (memchr(startP, quote_char_val, line_len) != NULL);
1452
+
1453
+ bool did_early_exit = false;
1454
+
1455
+ /* ----------------------------------------
1456
+ * SECTION 3: Initialize hash and tracking variables
1457
+ * ---------------------------------------- */
1458
+ long hash_size = headers_len > 0 ? headers_len : 16;
1459
+ long element_count = 0;
1460
+ bool all_blank = true;
1461
+
1462
+ field_transform_opts xform = {
1463
+ .hash = Qnil,
1464
+ .headers = headers,
1465
+ .numeric_keys = numeric_keys,
1466
+ .encoding = encoding,
1467
+ .prefix_str = prefix_str,
1468
+ .headers_len = headers_len,
1469
+ .hash_capa = hash_size,
1470
+ .numeric_mode = numeric_mode,
1471
+ .remove_empty_values = remove_empty_values,
1472
+ .remove_zero_values = remove_zero_values,
1473
+ };
1474
+
1475
+ /* ========================================
1476
+ * SECTION 4: FAST PATH - No quotes, single-char separator
1477
+ * Two sub-paths to avoid per-field overhead in the common case:
1478
+ * (a) no filter + no early exit → pure memchr loop, zero extra branches
1479
+ * (b) filter active → bitmap/early-exit checks per field
1480
+ * ======================================== */
1481
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
1482
+ char sep = *col_sepP;
1483
+ char *sep_pos = NULL;
1484
+
1485
+ if (__builtin_expect(keep_bitmap == NULL && early_exit_after < 0, 1)) {
1486
+ /* --- (a) Common path: no column filter, no early exit --- */
1487
+ while ((sep_pos = memchr(p, sep, endP - p))) {
1488
+ long field_len = sep_pos - startP;
1489
+ char *trim_start = startP;
1490
+ char *trim_end = startP + field_len - 1;
1491
+ if (strip_ws) {
1492
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1493
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1494
+ }
1495
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1496
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1497
+ all_blank = false;
1498
+ element_count++;
1499
+ p = sep_pos + 1; startP = p;
1500
+ }
1501
+ /* Process last field */
1502
+ {
1503
+ long field_len = endP - startP;
1504
+ char *trim_start = startP;
1505
+ char *trim_end = startP + field_len - 1;
1506
+ if (strip_ws) {
1507
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1508
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1509
+ }
1510
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1511
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1512
+ all_blank = false;
1513
+ element_count++;
1514
+ }
1515
+ } else {
1516
+ /* --- (b) Filter path: column bitmap and/or early exit active --- */
1517
+ while ((sep_pos = memchr(p, sep, endP - p))) {
1518
+ long field_len = sep_pos - startP;
1519
+ char *trim_start = startP;
1520
+ char *trim_end = startP + field_len - 1;
1521
+ if (strip_ws) {
1522
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1523
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1524
+ }
1525
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1526
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1527
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1528
+ all_blank = false;
1529
+ }
1530
+ element_count++;
1531
+ if (early_exit_after >= 0 && element_count > early_exit_after) {
1532
+ did_early_exit = true;
1533
+ break;
1534
+ }
1535
+ p = sep_pos + 1; startP = p;
1536
+ }
1537
+ /* Process last field — skip on early exit */
1538
+ if (!did_early_exit) {
1539
+ long field_len = endP - startP;
1540
+ char *trim_start = startP;
1541
+ char *trim_end = startP + field_len - 1;
1542
+ if (strip_ws) {
1543
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1544
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1545
+ }
1546
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1547
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1548
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
1549
+ all_blank = false;
1550
+ }
1551
+ element_count++;
1552
+ }
1553
+ }
1554
+
1555
+ } else {
1556
+ /* ========================================
1557
+ * SECTION 5: SLOW PATH - Quoted fields or multi-char separator
1558
+ * ========================================
1559
+ * Quote escaping options are read from the context (no rb_hash_aref).
1560
+ * Opt #5: downgrade to RFC mode if backslash mode is requested but this
1561
+ * specific line contains no backslash — allows memchr skip-ahead inside quotes.
1562
+ */
1563
+ if (allow_escaped_quotes && !memchr(startP, '\\', endP - startP)) {
1564
+ allow_escaped_quotes = false;
1565
+ }
1566
+
1567
+ char *row_sepP2 = (ctx->row_sep_len > 0) ? ctx->row_sep_buf : NULL;
1568
+ long row_sep_len2 = (long)ctx->row_sep_len;
1569
+
1570
+ long i;
1571
+ long backslash_count = 0;
1572
+ bool in_quotes = false;
1573
+ bool col_sep_found = true;
1574
+ bool field_started = false;
1575
+
1576
+ char sep_char_slow = *col_sepP;
1577
+
1578
+ while (p < endP) {
1579
+ if (!in_quotes && *p == sep_char_slow) {
1580
+ col_sep_found = true;
1581
+ for (i = 1; (i < col_sep_len) && (p + i < endP); i++) {
1582
+ if (*(p + i) != *(col_sepP + i)) { col_sep_found = false; break; }
1583
+ }
1584
+ } else {
1585
+ col_sep_found = false;
1586
+ }
1587
+
1588
+ if (col_sep_found) {
1589
+ long field_len = p - startP;
1590
+ char *raw_field = startP;
1591
+
1592
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1593
+ if (quoted) {
1594
+ raw_field++;
1595
+ field_len -= 2;
1596
+ }
1597
+
1598
+ char *trim_start = raw_field;
1599
+ char *trim_end = raw_field + field_len - 1;
1600
+
1601
+ if (strip_ws) {
1602
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1603
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1604
+ }
1605
+
1606
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1607
+
1608
+ bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1609
+
1610
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1611
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1612
+ all_blank = false;
1613
+ }
1614
+ element_count++;
1615
+
1616
+ if (early_exit_after >= 0 && element_count > early_exit_after) {
1617
+ did_early_exit = true;
1618
+ goto section5_done_ctx;
1619
+ }
1620
+
1621
+ p += col_sep_len;
1622
+ startP = p;
1623
+ backslash_count = 0;
1624
+ field_started = false;
1625
+
1626
+ } else {
1627
+ /* Not at a separator (or inside quotes) — track quote state */
1628
+
1629
+ /* RFC mode: memchr skip-ahead inside quoted fields (Opt #6) */
1630
+ if (!allow_escaped_quotes && in_quotes) {
1631
+ char *next_quote = (char *)memchr(p, quote_char_val, endP - p);
1632
+ if (!next_quote) { p = endP; continue; }
1633
+ p = next_quote; /* fall through to quote-handling code */
1634
+ }
1635
+
1636
+ if (allow_escaped_quotes && *p == '\\') {
1637
+ backslash_count++;
1638
+ if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) field_started = true;
1639
+ } else {
1640
+ if (*p == quote_char_val) {
1641
+ if (!allow_escaped_quotes || backslash_count % 2 == 0) {
1642
+ if (__builtin_expect(quote_boundary_standard, 1)) {
1643
+ if (in_quotes) {
1644
+ /* closing quote: only valid if followed by col_sep, row_sep, or end */
1645
+ bool valid_close = (p + 1 >= endP);
1646
+ if (!valid_close) {
1647
+ valid_close = true;
1648
+ for (long j = 0; j < col_sep_len; j++) {
1649
+ if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
1650
+ }
1651
+ }
1652
+ if (!valid_close && row_sep_len2 > 0) {
1653
+ valid_close = true;
1654
+ for (long j = 0; j < row_sep_len2; j++) {
1655
+ if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
1656
+ }
1657
+ }
1658
+ if (valid_close) {
1659
+ in_quotes = false;
1660
+ field_started = true;
1661
+ }
1662
+ /* else: quote inside quoted field → literal (handles "" doubling) */
1663
+ } else if (!field_started) {
1664
+ in_quotes = true; /* opening quote at field boundary */
1665
+ field_started = true;
1666
+ }
1667
+ /* else: mid-field quote → treat as literal */
1668
+ } else {
1669
+ in_quotes = !in_quotes;
1670
+ }
1671
+ }
1672
+ } else if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) {
1673
+ if (strip_ws) {
1674
+ if (*p != ' ' && *p != '\t') {
1675
+ field_started = true;
1676
+ } else if (!field_started) {
1677
+ startP = p + 1;
1678
+ }
1679
+ } else {
1680
+ field_started = true;
1681
+ }
1682
+ }
1683
+ backslash_count = 0;
1684
+ }
1685
+ p++;
1686
+ }
1687
+ }
1688
+
1689
+ section5_done_ctx:;
1690
+ /* Unclosed quote at end of line — signal multiline continuation */
1691
+ if (!did_early_exit && in_quotes) {
1692
+ VALUE result = rb_ary_new_capa(2);
1693
+ rb_ary_push(result, Qnil);
1694
+ rb_ary_push(result, LONG2FIX(-1));
1695
+ return result;
1696
+ }
1697
+
1698
+ /* Process the last field — skip on early exit */
1699
+ if (!did_early_exit) {
1700
+ long field_len = endP - startP;
1701
+ char *raw_field = startP;
1702
+
1703
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
1704
+ if (quoted) {
1705
+ raw_field++;
1706
+ field_len -= 2;
1707
+ }
1708
+
1709
+ char *trim_start = raw_field;
1710
+ char *trim_end = raw_field + field_len - 1;
1711
+
1712
+ if (strip_ws) {
1713
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
1714
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
1715
+ }
1716
+
1717
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
1718
+
1719
+ bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
1720
+
1721
+ if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
1722
+ if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
1723
+ all_blank = false;
1724
+ }
1725
+ element_count++;
1726
+ }
1727
+ }
1728
+
1729
+ /* ----------------------------------------
1730
+ * SECTION 6: Handle blank rows
1731
+ * ---------------------------------------- */
1732
+ if (remove_empty && all_blank) {
1733
+ VALUE result = rb_ary_new_capa(2);
1734
+ rb_ary_push(result, Qnil);
1735
+ rb_ary_push(result, LONG2FIX(element_count));
1736
+ return result;
1737
+ }
1738
+
1739
+ /* ----------------------------------------
1740
+ * SECTION 7: Pad hash with nil for missing columns (conditional)
1741
+ * ---------------------------------------- */
1742
+ if (!remove_empty_values) {
1743
+ ensure_hash_allocated(&xform);
1744
+ for (long i = element_count; i < headers_len; i++) {
1745
+ if (!keep_bitmap || keep_bitmap[i]) {
1746
+ rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
1747
+ }
1748
+ }
1749
+ }
1750
+
1751
+ /* ----------------------------------------
1752
+ * SECTION 8: Return result
1753
+ * ---------------------------------------- */
1754
+ VALUE result = rb_ary_new_capa(2);
1755
+ rb_ary_push(result, xform.hash);
1756
+ rb_ary_push(result, LONG2FIX(element_count));
1757
+ return result;
1758
+ }
1759
+
836
1760
  // Count quote characters in a line, optionally respecting backslash escapes.
837
1761
  // This is a performance optimization that replaces the Ruby each_char implementation
838
1762
  // which creates a new String object for every character in the line.
@@ -942,10 +1866,22 @@ void Init_smarter_csv(void) {
942
1866
  id_remove_zero_values = rb_intern("remove_zero_values");
943
1867
  id_only = rb_intern("only");
944
1868
  id_except = rb_intern("except");
945
-
946
- rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 7);
1869
+ id_quote_boundary = rb_intern("quote_boundary");
1870
+ id_only_headers = rb_intern("only_headers");
1871
+ id_except_headers = rb_intern("except_headers");
1872
+ id_keep_cols = rb_intern("_keep_cols");
1873
+ id_keep_bitmap = rb_intern("_keep_bitmap");
1874
+ id_keep_extra_cols = rb_intern("_keep_extra_cols");
1875
+ id_early_exit_after_sym = rb_intern("_early_exit_after");
1876
+ id_strict = rb_intern("strict");
1877
+ id_backslash = rb_intern("backslash");
1878
+ id_standard = rb_intern("standard");
1879
+
1880
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 9);
947
1881
  rb_define_module_function(Parser, "count_quote_chars_c", rb_count_quote_chars, 4);
948
1882
  rb_define_module_function(Parser, "count_quote_chars_auto_c", rb_count_quote_chars_auto, 3);
949
1883
  rb_define_module_function(Parser, "zip_to_hash_c", rb_zip_to_hash, 2);
950
1884
  rb_define_module_function(Parser, "parse_line_to_hash_c", rb_parse_line_to_hash, 3);
1885
+ rb_define_module_function(Parser, "new_parse_context_c", rb_new_parse_context, 2);
1886
+ rb_define_module_function(Parser, "parse_line_to_hash_ctx_c", rb_parse_line_to_hash_ctx, 2);
951
1887
  }