smarter_csv 1.15.2 → 1.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +9 -0
- data/CHANGELOG.md +68 -1
- data/CONTRIBUTORS.md +3 -1
- data/Gemfile +1 -0
- data/README.md +123 -27
- data/docs/_introduction.md +40 -24
- data/docs/bad_row_quarantine.md +285 -0
- data/docs/basic_read_api.md +151 -9
- data/docs/basic_write_api.md +474 -59
- data/docs/batch_processing.md +161 -4
- data/docs/column_selection.md +183 -0
- data/docs/data_transformations.md +162 -29
- data/docs/examples.md +339 -46
- data/docs/header_transformations.md +93 -12
- data/docs/header_validations.md +56 -18
- data/docs/history.md +117 -0
- data/docs/instrumentation.md +165 -0
- data/docs/migrating_from_csv.md +290 -0
- data/docs/options.md +150 -87
- data/docs/parsing_strategy.md +63 -1
- data/docs/real_world_csv.md +262 -0
- data/docs/releases/1.16.0/benchmarks.md +223 -0
- data/docs/releases/1.16.0/changes.md +272 -0
- data/docs/releases/1.16.0/performance_notes.md +114 -0
- data/docs/row_col_sep.md +14 -5
- data/docs/value_converters.md +193 -57
- data/ext/smarter_csv/extconf.rb +3 -0
- data/ext/smarter_csv/smarter_csv.c +1007 -71
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg +108 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg +141 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png +0 -0
- data/images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg +139 -0
- data/lib/smarter_csv/errors.rb +8 -0
- data/lib/smarter_csv/file_io.rb +1 -1
- data/lib/smarter_csv/hash_transformations.rb +14 -13
- data/lib/smarter_csv/header_transformations.rb +21 -2
- data/lib/smarter_csv/headers.rb +2 -1
- data/lib/smarter_csv/options.rb +124 -7
- data/lib/smarter_csv/parser.rb +362 -75
- data/lib/smarter_csv/reader.rb +494 -46
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv/writer.rb +71 -19
- data/lib/smarter_csv.rb +95 -12
- data/smarter_csv.gemspec +20 -10
- metadata +37 -80
|
@@ -37,7 +37,101 @@ VALUE Qempty_string = Qnil;
|
|
|
37
37
|
static ID id_col_sep, id_quote_char, id_row_sep, id_missing_header_prefix;
|
|
38
38
|
static ID id_strip_whitespace, id_remove_empty_hashes, id_remove_empty_values;
|
|
39
39
|
static ID id_quote_escaping, id_convert_values_to_numeric, id_remove_zero_values;
|
|
40
|
-
static ID id_only, id_except;
|
|
40
|
+
static ID id_only, id_except, id_quote_boundary;
|
|
41
|
+
static ID id_only_headers, id_except_headers, id_keep_cols, id_strict;
|
|
42
|
+
static ID id_keep_bitmap, id_keep_extra_cols, id_early_exit_after_sym;
|
|
43
|
+
static ID id_backslash, id_standard;
|
|
44
|
+
|
|
45
|
+
/* ================================================================================
|
|
46
|
+
* ParseContext — wraps all per-file parse options as a GC-managed TypedData object.
|
|
47
|
+
*
|
|
48
|
+
* Building a context once after headers are loaded eliminates the ~10 rb_hash_aref
|
|
49
|
+
* calls that rb_parse_line_to_hash performs on every row. The hot path calls
|
|
50
|
+
* parse_line_to_hash_ctx_c(line, ctx) instead of parse_line_to_hash_c(line, headers, opts).
|
|
51
|
+
* ================================================================================ */
|
|
52
|
+
typedef struct {
|
|
53
|
+
/* Separator and quoting config — copied into C buffers, no Ruby GC tracking needed */
|
|
54
|
+
char col_sep_buf[8];
|
|
55
|
+
int col_sep_len;
|
|
56
|
+
char quote_char_val;
|
|
57
|
+
char row_sep_buf[16];
|
|
58
|
+
int row_sep_len;
|
|
59
|
+
char prefix_buf[64];
|
|
60
|
+
const char *prefix_str; /* "column_" literal or points into prefix_buf */
|
|
61
|
+
|
|
62
|
+
/* Boolean parse flags */
|
|
63
|
+
bool strip_ws;
|
|
64
|
+
bool remove_empty;
|
|
65
|
+
bool remove_empty_values;
|
|
66
|
+
bool remove_zero_values;
|
|
67
|
+
bool allow_escaped_quotes; /* quote_escaping == :backslash */
|
|
68
|
+
bool quote_boundary_standard;
|
|
69
|
+
|
|
70
|
+
/* Numeric conversion: 0=off, 1=all, 2=only listed keys, 3=except listed keys */
|
|
71
|
+
int numeric_mode;
|
|
72
|
+
|
|
73
|
+
/* Column filter bitmap (xmalloc'd; NULL when no filtering active) */
|
|
74
|
+
bool *keep_bitmap;
|
|
75
|
+
long keep_bitmap_len;
|
|
76
|
+
bool keep_extra_columns;
|
|
77
|
+
bool has_only;
|
|
78
|
+
long early_exit_after; /* column index after which we stop; -1 = no early exit */
|
|
79
|
+
|
|
80
|
+
/* Hash allocation hint (set once at context creation) */
|
|
81
|
+
long hash_capa;
|
|
82
|
+
|
|
83
|
+
/* GC-tracked Ruby values — must be marked in the mark callback */
|
|
84
|
+
VALUE headers;
|
|
85
|
+
VALUE numeric_keys; /* Qnil when not used */
|
|
86
|
+
} parse_context_t;
|
|
87
|
+
|
|
88
|
+
__attribute__((cold)) static void parse_context_mark(void *ptr) {
|
|
89
|
+
parse_context_t *ctx = (parse_context_t *)ptr;
|
|
90
|
+
#if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7))
|
|
91
|
+
rb_gc_mark_movable(ctx->headers);
|
|
92
|
+
rb_gc_mark_movable(ctx->numeric_keys);
|
|
93
|
+
#else
|
|
94
|
+
rb_gc_mark(ctx->headers);
|
|
95
|
+
if (!NIL_P(ctx->numeric_keys)) rb_gc_mark(ctx->numeric_keys);
|
|
96
|
+
#endif
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
#if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7))
|
|
100
|
+
__attribute__((cold)) static void parse_context_compact(void *ptr) {
|
|
101
|
+
parse_context_t *ctx = (parse_context_t *)ptr;
|
|
102
|
+
ctx->headers = rb_gc_location(ctx->headers);
|
|
103
|
+
ctx->numeric_keys = rb_gc_location(ctx->numeric_keys);
|
|
104
|
+
}
|
|
105
|
+
#endif
|
|
106
|
+
|
|
107
|
+
__attribute__((cold)) static void parse_context_free(void *ptr) {
|
|
108
|
+
parse_context_t *ctx = (parse_context_t *)ptr;
|
|
109
|
+
if (ctx->keep_bitmap) xfree(ctx->keep_bitmap);
|
|
110
|
+
xfree(ctx);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
__attribute__((cold)) static size_t parse_context_memsize(const void *ptr) {
|
|
114
|
+
const parse_context_t *ctx = (const parse_context_t *)ptr;
|
|
115
|
+
size_t sz = sizeof(parse_context_t);
|
|
116
|
+
if (ctx->keep_bitmap) sz += (size_t)ctx->keep_bitmap_len * sizeof(bool);
|
|
117
|
+
return sz;
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
static const rb_data_type_t parse_context_type = {
|
|
121
|
+
"SmarterCSV::ParseContext",
|
|
122
|
+
{
|
|
123
|
+
parse_context_mark,
|
|
124
|
+
parse_context_free,
|
|
125
|
+
parse_context_memsize,
|
|
126
|
+
#if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 2 || (RUBY_API_VERSION_MAJOR == 2 && RUBY_API_VERSION_MINOR >= 7))
|
|
127
|
+
parse_context_compact,
|
|
128
|
+
#else
|
|
129
|
+
0,
|
|
130
|
+
#endif
|
|
131
|
+
},
|
|
132
|
+
0, 0,
|
|
133
|
+
RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
|
|
134
|
+
};
|
|
41
135
|
|
|
42
136
|
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
|
|
43
137
|
char *buf = ALLOC_N(char, len);
|
|
@@ -55,7 +149,7 @@ static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *
|
|
|
55
149
|
return out;
|
|
56
150
|
}
|
|
57
151
|
|
|
58
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val) {
|
|
152
|
+
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val, VALUE quote_boundary_standard_val, VALUE row_sep_val) {
|
|
59
153
|
if (RB_TYPE_P(line, T_NIL) == 1) {
|
|
60
154
|
return rb_ary_new();
|
|
61
155
|
}
|
|
@@ -91,6 +185,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
91
185
|
bool has_quotes = RTEST(has_quotes_val);
|
|
92
186
|
bool strip_ws = RTEST(strip_ws_val);
|
|
93
187
|
bool allow_escaped_quotes = RTEST(allow_escaped_quotes_val);
|
|
188
|
+
bool quote_boundary_standard = RTEST(quote_boundary_standard_val);
|
|
189
|
+
|
|
190
|
+
char *row_sepP = (RB_TYPE_P(row_sep_val, T_STRING)) ? RSTRING_PTR(row_sep_val) : NULL;
|
|
191
|
+
long row_sep_len = (row_sepP) ? RSTRING_LEN(row_sep_val) : 0;
|
|
94
192
|
|
|
95
193
|
// === FAST PATH: No quotes and single-character separator ===
|
|
96
194
|
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
|
@@ -147,6 +245,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
147
245
|
long backslash_count = 0;
|
|
148
246
|
bool in_quotes = false;
|
|
149
247
|
bool col_sep_found = true;
|
|
248
|
+
bool field_started = false; // for quote_boundary_standard: true once field has non-boundary content
|
|
150
249
|
|
|
151
250
|
while (p < endP) {
|
|
152
251
|
col_sep_found = true;
|
|
@@ -195,13 +294,53 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
195
294
|
p += col_sep_len;
|
|
196
295
|
startP = p;
|
|
197
296
|
backslash_count = 0;
|
|
297
|
+
field_started = false; // reset for next field
|
|
198
298
|
} else {
|
|
199
299
|
if (allow_escaped_quotes && *p == '\\') {
|
|
200
300
|
backslash_count++;
|
|
301
|
+
if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) field_started = true;
|
|
201
302
|
} else {
|
|
202
303
|
if (*p == quote_char_val) {
|
|
203
304
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
204
|
-
|
|
305
|
+
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
306
|
+
if (in_quotes) {
|
|
307
|
+
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
308
|
+
bool valid_close = (p + 1 >= endP);
|
|
309
|
+
if (!valid_close) {
|
|
310
|
+
valid_close = true;
|
|
311
|
+
for (long j = 0; j < col_sep_len; j++) {
|
|
312
|
+
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
313
|
+
}
|
|
314
|
+
}
|
|
315
|
+
if (!valid_close && row_sep_len > 0) {
|
|
316
|
+
valid_close = true;
|
|
317
|
+
for (long j = 0; j < row_sep_len; j++) {
|
|
318
|
+
if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
if (valid_close) {
|
|
322
|
+
in_quotes = false;
|
|
323
|
+
field_started = true;
|
|
324
|
+
}
|
|
325
|
+
// else: quote inside quoted field → literal (handles "" doubling)
|
|
326
|
+
} else if (!field_started) {
|
|
327
|
+
in_quotes = true; // opening quote at field boundary
|
|
328
|
+
field_started = true;
|
|
329
|
+
}
|
|
330
|
+
// else: mid-field quote → treat as literal
|
|
331
|
+
} else {
|
|
332
|
+
in_quotes = !in_quotes;
|
|
333
|
+
}
|
|
334
|
+
}
|
|
335
|
+
} else if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) {
|
|
336
|
+
if (strip_ws) {
|
|
337
|
+
if (*p != ' ' && *p != '\t') {
|
|
338
|
+
field_started = true;
|
|
339
|
+
} else if (!field_started) {
|
|
340
|
+
startP = p + 1; /* advance past leading whitespace so quote-detection at extraction sees the quote */
|
|
341
|
+
}
|
|
342
|
+
} else {
|
|
343
|
+
field_started = true;
|
|
205
344
|
}
|
|
206
345
|
}
|
|
207
346
|
backslash_count = 0;
|
|
@@ -393,7 +532,7 @@ static inline void ensure_hash_allocated(field_transform_opts *opts) {
|
|
|
393
532
|
* Returns: true if a non-blank value was inserted, false otherwise.
|
|
394
533
|
* (Used to track all_blank for remove_empty_hashes.)
|
|
395
534
|
*/
|
|
396
|
-
static inline bool insert_field_into_hash(
|
|
535
|
+
static inline __attribute__((always_inline)) bool insert_field_into_hash(
|
|
397
536
|
field_transform_opts *opts,
|
|
398
537
|
char *trim_start, long trimmed_len,
|
|
399
538
|
long element_count, bool is_quoted,
|
|
@@ -513,7 +652,7 @@ static inline bool insert_field_into_hash(
|
|
|
513
652
|
* Input: line = "john,25,boston,extra" (more fields than headers)
|
|
514
653
|
* Output: [{name: "john", age: "25", city: "boston", column_4: "extra"}, 4]
|
|
515
654
|
*/
|
|
516
|
-
static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE options_hash) {
|
|
655
|
+
__attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE options_hash) {
|
|
517
656
|
|
|
518
657
|
/* ----------------------------------------
|
|
519
658
|
* SECTION 1: Handle nil/invalid input
|
|
@@ -538,7 +677,6 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
538
677
|
VALUE col_sep = rb_hash_aref(options_hash, ID2SYM(id_col_sep));
|
|
539
678
|
VALUE quote_char = rb_hash_aref(options_hash, ID2SYM(id_quote_char));
|
|
540
679
|
VALUE header_prefix = rb_hash_aref(options_hash, ID2SYM(id_missing_header_prefix));
|
|
541
|
-
VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
|
|
542
680
|
bool strip_ws = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strip_whitespace)));
|
|
543
681
|
bool remove_empty = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_hashes)));
|
|
544
682
|
bool remove_empty_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_values)));
|
|
@@ -565,11 +703,10 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
565
703
|
}
|
|
566
704
|
}
|
|
567
705
|
|
|
568
|
-
//
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
}
|
|
706
|
+
// quote_escaping and quote_boundary are only needed in Section 5 (quoted/slow path).
|
|
707
|
+
// They are declared here as forward declarations so Section 5 can set them lazily.
|
|
708
|
+
bool allow_escaped_quotes = false; // set in Section 5 on first entry
|
|
709
|
+
bool quote_boundary_standard = false; // set in Section 5 on first entry
|
|
573
710
|
|
|
574
711
|
rb_encoding *encoding = rb_enc_get(line); // Preserve string encoding
|
|
575
712
|
char *startP = RSTRING_PTR(line); // Pointer to start of current field
|
|
@@ -577,7 +714,8 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
577
714
|
char *endP = startP + line_len; // End of line marker
|
|
578
715
|
char *p = startP; // Current parsing position
|
|
579
716
|
|
|
580
|
-
// Chomp: strip trailing row separator (pointer adjustment, no string mutation)
|
|
717
|
+
// Chomp: strip trailing row separator (pointer adjustment, no string mutation).
|
|
718
|
+
// row_sep is also reused in Section 5 for the closing-quote boundary check.
|
|
581
719
|
VALUE row_sep = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
|
|
582
720
|
if (!NIL_P(row_sep) && RB_TYPE_P(row_sep, T_STRING)) {
|
|
583
721
|
char *row_sepP = RSTRING_PTR(row_sep);
|
|
@@ -600,6 +738,112 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
600
738
|
// Optimization hint: check if line contains quote characters
|
|
601
739
|
bool has_quotes = (memchr(startP, quote_char_val, line_len) != NULL);
|
|
602
740
|
|
|
741
|
+
/* ----------------------------------------
|
|
742
|
+
* Column-filter bitmap for only_headers: / except_headers:
|
|
743
|
+
* ----------------------------------------
|
|
744
|
+
* keep_bitmap[i] = true → include column i in the output hash
|
|
745
|
+
* keep_bitmap[i] = false → skip column i (no Ruby allocation at all)
|
|
746
|
+
* NULL when no filter is active — zero overhead on common path.
|
|
747
|
+
*
|
|
748
|
+
* Preferred source: options[:_keep_cols] — a Ruby Array of true/false values
|
|
749
|
+
* precomputed once in reader.rb after headers are loaded (O(1) Set lookups).
|
|
750
|
+
* Copying it here is O(headers_len) with O(1) per element — no rb_ary_includes.
|
|
751
|
+
*
|
|
752
|
+
* Fallback: build from only_headers/except_headers via rb_ary_includes (O(k)
|
|
753
|
+
* per column, k = filter list length). Used only when _keep_cols is absent.
|
|
754
|
+
*
|
|
755
|
+
* Capped at 4096 columns; wider CSVs fall back to the Ruby-side
|
|
756
|
+
* hash.select!/hash.reject! filter applied after return.
|
|
757
|
+
*
|
|
758
|
+
* The bitmap is a loop invariant: headers and filter settings never change between rows.
|
|
759
|
+
* reader.rb precomputes it once as a packed binary String (_keep_bitmap) and also
|
|
760
|
+
* pre-stores keep_extra_cols and early_exit_after, so C just does 3 hash lookups +
|
|
761
|
+
* one memcpy instead of N rb_ary_entry calls on every row.
|
|
762
|
+
*
|
|
763
|
+
* alloca() keeps the allocation conditional: no-filter path never calls alloca(), so
|
|
764
|
+
* the frame stays well below 4 KB and ___chkstk_darwin never fires on ARM64 macOS.
|
|
765
|
+
*/
|
|
766
|
+
bool *keep_bitmap = NULL;
|
|
767
|
+
bool keep_extra_columns = true; /* extra cols (> headers_len): keep by default */
|
|
768
|
+
bool has_only = false; /* true when only_headers: filtering is active */
|
|
769
|
+
long early_exit_after = -1; /* column index after which we stop; -1 = no early exit */
|
|
770
|
+
|
|
771
|
+
/* Column-filter bitmap setup.
|
|
772
|
+
*
|
|
773
|
+
* _keep_cols is the gate key — checked with a single rb_hash_aref on every row:
|
|
774
|
+
* false (default) → no filtering; skip everything instantly. ← COMMON CASE, zero overhead
|
|
775
|
+
* nil → filter active (reader.rb path): check _keep_bitmap for the fast bitmap.
|
|
776
|
+
* Array → backward-compat: direct C API callers passing _keep_cols as an Array.
|
|
777
|
+
*
|
|
778
|
+
* When _keep_cols is absent from the hash (nil from rb_hash_aref), it falls through to
|
|
779
|
+
* deriving the bitmap from only_headers/except_headers directly (manual options hashes).
|
|
780
|
+
*
|
|
781
|
+
* only_headers: / except_headers: are RARELY used options. The common path (no filtering)
|
|
782
|
+
* pays exactly one rb_hash_aref and nothing else.
|
|
783
|
+
*/
|
|
784
|
+
VALUE keep_cols_val = rb_hash_aref(options_hash, ID2SYM(id_keep_cols));
|
|
785
|
+
if (keep_cols_val != Qfalse) {
|
|
786
|
+
/* Not false: either nil (filter active / absent) or Array (backward-compat). */
|
|
787
|
+
if (NIL_P(keep_cols_val)) {
|
|
788
|
+
/* nil: reader.rb filter path — check _keep_bitmap, or fall back to deriving it. */
|
|
789
|
+
VALUE prebuilt_bitmap = rb_hash_aref(options_hash, ID2SYM(id_keep_bitmap));
|
|
790
|
+
if (RB_TYPE_P(prebuilt_bitmap, T_STRING)
|
|
791
|
+
&& headers_len > 0 && RSTRING_LEN(prebuilt_bitmap) >= headers_len) {
|
|
792
|
+
/* Precomputed binary bitmap from reader.rb — one memcpy replaces N rb_ary_entry calls.
|
|
793
|
+
* Copy before any Ruby API calls that could trigger GC compaction. */
|
|
794
|
+
keep_bitmap = (bool *)alloca((size_t)headers_len * sizeof(bool));
|
|
795
|
+
memcpy(keep_bitmap, RSTRING_PTR(prebuilt_bitmap), (size_t)headers_len * sizeof(bool));
|
|
796
|
+
VALUE kec = rb_hash_aref(options_hash, ID2SYM(id_keep_extra_cols));
|
|
797
|
+
keep_extra_columns = NIL_P(kec) ? true : RTEST(kec);
|
|
798
|
+
VALUE exa = rb_hash_aref(options_hash, ID2SYM(id_early_exit_after_sym));
|
|
799
|
+
early_exit_after = RB_INTEGER_TYPE_P(exa) ? NUM2LONG(exa) : -1;
|
|
800
|
+
has_only = !keep_extra_columns;
|
|
801
|
+
} else if (headers_len > 0 && headers_len <= 4096) {
|
|
802
|
+
/* Last resort: derive from only_headers/except_headers directly.
|
|
803
|
+
* Only reached when options hash is built manually without any _keep_* keys. */
|
|
804
|
+
VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
|
|
805
|
+
VALUE except_hdrs = rb_hash_aref(options_hash, ID2SYM(id_except_headers));
|
|
806
|
+
bool has_except = RB_TYPE_P(except_hdrs, T_ARRAY) && RARRAY_LEN(except_hdrs) > 0;
|
|
807
|
+
has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
|
|
808
|
+
if (has_only || has_except) {
|
|
809
|
+
keep_bitmap = (bool *)alloca((size_t)headers_len * sizeof(bool));
|
|
810
|
+
for (long bi = 0; bi < headers_len; bi++) {
|
|
811
|
+
VALUE hdr = rb_ary_entry(headers, bi);
|
|
812
|
+
keep_bitmap[bi] = has_only
|
|
813
|
+
? (rb_ary_includes(only_hdrs, hdr) == Qtrue)
|
|
814
|
+
: (rb_ary_includes(except_hdrs, hdr) != Qtrue);
|
|
815
|
+
}
|
|
816
|
+
keep_extra_columns = !has_only;
|
|
817
|
+
bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
|
|
818
|
+
if (has_only && !strict) {
|
|
819
|
+
for (long bi = headers_len - 1; bi >= 0; bi--) {
|
|
820
|
+
if (keep_bitmap[bi]) { early_exit_after = bi; break; }
|
|
821
|
+
}
|
|
822
|
+
}
|
|
823
|
+
}
|
|
824
|
+
}
|
|
825
|
+
} else if (RB_TYPE_P(keep_cols_val, T_ARRAY) && headers_len > 0 && headers_len <= 4096) {
|
|
826
|
+
/* Backward-compat: _keep_cols Array from direct C API callers — O(headers_len) Ruby calls */
|
|
827
|
+
keep_bitmap = (bool *)alloca((size_t)headers_len * sizeof(bool));
|
|
828
|
+
long prebuilt_len = RARRAY_LEN(keep_cols_val);
|
|
829
|
+
for (long bi = 0; bi < headers_len; bi++) {
|
|
830
|
+
keep_bitmap[bi] = bi < prebuilt_len ? RTEST(rb_ary_entry(keep_cols_val, bi)) : false;
|
|
831
|
+
}
|
|
832
|
+
VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
|
|
833
|
+
has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
|
|
834
|
+
keep_extra_columns = !has_only;
|
|
835
|
+
bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
|
|
836
|
+
if (has_only && !strict) {
|
|
837
|
+
for (long bi = headers_len - 1; bi >= 0; bi--) {
|
|
838
|
+
if (keep_bitmap[bi]) { early_exit_after = bi; break; }
|
|
839
|
+
}
|
|
840
|
+
}
|
|
841
|
+
}
|
|
842
|
+
}
|
|
843
|
+
/* else: _keep_cols is false — no filtering, keep_bitmap stays NULL. COMMON CASE. */
|
|
844
|
+
|
|
845
|
+
bool did_early_exit = false; /* set to true when early exit fires */
|
|
846
|
+
|
|
603
847
|
/* ----------------------------------------
|
|
604
848
|
* SECTION 3: Initialize hash and tracking variables
|
|
605
849
|
* ----------------------------------------
|
|
@@ -637,46 +881,79 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
637
881
|
char sep = *col_sepP;
|
|
638
882
|
char *sep_pos = NULL;
|
|
639
883
|
|
|
640
|
-
/* Loop through each field by finding separator positions
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
884
|
+
/* Loop through each field by finding separator positions.
|
|
885
|
+
* Two sub-paths to avoid per-field overhead in the common case:
|
|
886
|
+
* (a) no filter + no early exit → pure memchr loop, zero extra branches
|
|
887
|
+
* (b) filter active → bitmap/early-exit checks per field
|
|
888
|
+
*/
|
|
889
|
+
if (__builtin_expect(keep_bitmap == NULL && early_exit_after < 0, 1)) {
|
|
890
|
+
/* --- (a) Common path: no column filter, no early exit --- */
|
|
891
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
892
|
+
long field_len = sep_pos - startP;
|
|
893
|
+
char *trim_start = startP;
|
|
894
|
+
char *trim_end = startP + field_len - 1;
|
|
895
|
+
if (strip_ws) {
|
|
896
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
897
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
898
|
+
}
|
|
899
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
900
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
901
|
+
all_blank = false;
|
|
902
|
+
element_count++;
|
|
903
|
+
p = sep_pos + 1; startP = p;
|
|
651
904
|
}
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
905
|
+
/* Process last field */
|
|
906
|
+
{
|
|
907
|
+
long field_len = endP - startP;
|
|
908
|
+
char *trim_start = startP;
|
|
909
|
+
char *trim_end = startP + field_len - 1;
|
|
910
|
+
if (strip_ws) {
|
|
911
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
912
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
913
|
+
}
|
|
914
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
915
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
916
|
+
all_blank = false;
|
|
917
|
+
element_count++;
|
|
918
|
+
}
|
|
919
|
+
} else {
|
|
920
|
+
/* --- (b) Filter path: column bitmap and/or early exit active --- */
|
|
921
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
922
|
+
long field_len = sep_pos - startP;
|
|
923
|
+
char *trim_start = startP;
|
|
924
|
+
char *trim_end = startP + field_len - 1;
|
|
925
|
+
if (strip_ws) {
|
|
926
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
927
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
928
|
+
}
|
|
929
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
930
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
931
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
932
|
+
all_blank = false;
|
|
933
|
+
}
|
|
934
|
+
element_count++;
|
|
935
|
+
if (early_exit_after >= 0 && element_count > early_exit_after) {
|
|
936
|
+
did_early_exit = true;
|
|
937
|
+
break;
|
|
938
|
+
}
|
|
939
|
+
p = sep_pos + 1; startP = p;
|
|
940
|
+
}
|
|
941
|
+
/* Process last field — skip on early exit */
|
|
942
|
+
if (!did_early_exit) {
|
|
943
|
+
long field_len = endP - startP;
|
|
944
|
+
char *trim_start = startP;
|
|
945
|
+
char *trim_end = startP + field_len - 1;
|
|
946
|
+
if (strip_ws) {
|
|
947
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
948
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
949
|
+
}
|
|
950
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
951
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
952
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
953
|
+
all_blank = false;
|
|
954
|
+
}
|
|
955
|
+
element_count++;
|
|
673
956
|
}
|
|
674
|
-
|
|
675
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
676
|
-
|
|
677
|
-
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
678
|
-
all_blank = false;
|
|
679
|
-
element_count++;
|
|
680
957
|
}
|
|
681
958
|
|
|
682
959
|
} else {
|
|
@@ -689,25 +966,59 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
689
966
|
* - Escaped quotes using backslash: \"
|
|
690
967
|
*
|
|
691
968
|
* We must scan character-by-character to track quote state.
|
|
969
|
+
*
|
|
970
|
+
* quote_escaping and quote_boundary options are only needed here (Section 4
|
|
971
|
+
* fast path never touches them), so we extract them lazily on first Section 5 entry.
|
|
692
972
|
*/
|
|
973
|
+
VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
|
|
974
|
+
if (RB_TYPE_P(quote_escaping_val, T_SYMBOL)) {
|
|
975
|
+
allow_escaped_quotes = (SYM2ID(quote_escaping_val) == id_backslash);
|
|
976
|
+
}
|
|
977
|
+
VALUE quote_boundary_val = rb_hash_aref(options_hash, ID2SYM(id_quote_boundary));
|
|
978
|
+
quote_boundary_standard = (RB_TYPE_P(quote_boundary_val, T_SYMBOL) &&
|
|
979
|
+
SYM2ID(quote_boundary_val) == id_standard);
|
|
980
|
+
/* row_sep reused from chomp above for the closing-quote boundary check */
|
|
981
|
+
char *row_sepP2 = (RB_TYPE_P(row_sep, T_STRING)) ? RSTRING_PTR(row_sep) : NULL;
|
|
982
|
+
long row_sep_len2 = (row_sepP2) ? RSTRING_LEN(row_sep) : 0;
|
|
983
|
+
|
|
984
|
+
/* Opt #5 (C-side): if backslash mode is requested but the (chomped) line contains
|
|
985
|
+
* no backslash character, backslash escaping cannot possibly affect parsing — a
|
|
986
|
+
* backslash only matters immediately before a quote char. Downgrade to RFC mode
|
|
987
|
+
* so the memchr-inside-quotes optimisation fires unconditionally for such lines.
|
|
988
|
+
* This replaces the Ruby-side line.include?('\\') pre-scan that was on the hot
|
|
989
|
+
* path: now the check happens here in C (one fast memchr), and only for lines
|
|
990
|
+
* that actually reach Section 5 (i.e. lines that contain quote characters).
|
|
991
|
+
* Unquoted lines never enter Section 5, so they pay zero cost for this check. */
|
|
992
|
+
if (allow_escaped_quotes && !memchr(startP, '\\', endP - startP)) {
|
|
993
|
+
allow_escaped_quotes = false;
|
|
994
|
+
}
|
|
995
|
+
|
|
693
996
|
long i;
|
|
694
997
|
long backslash_count = 0; // Track consecutive backslashes for escape detection
|
|
695
998
|
bool in_quotes = false; // Are we inside a quoted field?
|
|
696
999
|
bool col_sep_found = true;
|
|
1000
|
+
bool field_started = false; // for quote_boundary_standard: true once field has non-boundary content
|
|
1001
|
+
|
|
1002
|
+
/* Cache first separator byte for fast pre-filtering */
|
|
1003
|
+
char sep_char_slow = *col_sepP;
|
|
697
1004
|
|
|
698
1005
|
/* Scan through the line character by character */
|
|
699
1006
|
while (p < endP) {
|
|
700
|
-
//
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
1007
|
+
// Separator check: when in_quotes we can never be at a field boundary,
|
|
1008
|
+
// so skip the comparison entirely.
|
|
1009
|
+
// For single-char separator: direct byte compare.
|
|
1010
|
+
// For multi-char separator: pre-filter on first byte, then check the rest.
|
|
1011
|
+
if (!in_quotes && *p == sep_char_slow) {
|
|
1012
|
+
col_sep_found = true;
|
|
1013
|
+
for (i = 1; (i < col_sep_len) && (p + i < endP); i++) {
|
|
1014
|
+
if (*(p + i) != *(col_sepP + i)) { col_sep_found = false; break; }
|
|
706
1015
|
}
|
|
1016
|
+
} else {
|
|
1017
|
+
col_sep_found = false;
|
|
707
1018
|
}
|
|
708
1019
|
|
|
709
|
-
// Found separator
|
|
710
|
-
if (col_sep_found
|
|
1020
|
+
// Found separator — !in_quotes is guaranteed by the block above
|
|
1021
|
+
if (col_sep_found) {
|
|
711
1022
|
long field_len = p - startP;
|
|
712
1023
|
char *raw_field = startP;
|
|
713
1024
|
|
|
@@ -731,25 +1042,83 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
731
1042
|
// Determine if field contains embedded quotes (need unescape)
|
|
732
1043
|
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
733
1044
|
|
|
734
|
-
if (
|
|
735
|
-
|
|
1045
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1046
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1047
|
+
all_blank = false;
|
|
1048
|
+
}
|
|
736
1049
|
element_count++;
|
|
737
1050
|
|
|
1051
|
+
/* Early exit: all required columns already collected — stop scanning */
|
|
1052
|
+
if (early_exit_after >= 0 && element_count > early_exit_after) {
|
|
1053
|
+
did_early_exit = true;
|
|
1054
|
+
goto section5_done;
|
|
1055
|
+
}
|
|
1056
|
+
|
|
738
1057
|
// Move past the separator to start of next field
|
|
739
1058
|
p += col_sep_len;
|
|
740
1059
|
startP = p;
|
|
741
1060
|
backslash_count = 0;
|
|
1061
|
+
field_started = false; // reset for next field
|
|
742
1062
|
|
|
743
1063
|
} else {
|
|
744
1064
|
/* Not at a separator (or inside quotes) - track quote state */
|
|
745
1065
|
|
|
1066
|
+
/* RFC mode: inside quoted field, skip ahead to the next quote char.
|
|
1067
|
+
* Everything between here and the next quote is plain field content — no
|
|
1068
|
+
* separators or backslashes can appear (allow_escaped_quotes is false).
|
|
1069
|
+
* memchr() is SIMD-accelerated and handles typical field lengths in 1 call. */
|
|
1070
|
+
if (!allow_escaped_quotes && in_quotes) {
|
|
1071
|
+
char *next_quote = (char *)memchr(p, quote_char_val, endP - p);
|
|
1072
|
+
if (!next_quote) { p = endP; continue; } /* no closing quote → unclosed */
|
|
1073
|
+
p = next_quote; /* jump to quote char; fall through to quote-handling code */
|
|
1074
|
+
}
|
|
1075
|
+
|
|
746
1076
|
if (allow_escaped_quotes && *p == '\\') {
|
|
747
1077
|
// Count consecutive backslashes for escape sequence detection
|
|
748
1078
|
backslash_count++;
|
|
1079
|
+
if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) field_started = true;
|
|
749
1080
|
} else {
|
|
750
1081
|
if (*p == quote_char_val) {
|
|
751
1082
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
752
|
-
|
|
1083
|
+
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
1084
|
+
if (in_quotes) {
|
|
1085
|
+
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
1086
|
+
bool valid_close = (p + 1 >= endP);
|
|
1087
|
+
if (!valid_close) {
|
|
1088
|
+
valid_close = true;
|
|
1089
|
+
for (long j = 0; j < col_sep_len; j++) {
|
|
1090
|
+
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1091
|
+
}
|
|
1092
|
+
}
|
|
1093
|
+
if (!valid_close && row_sep_len2 > 0) {
|
|
1094
|
+
valid_close = true;
|
|
1095
|
+
for (long j = 0; j < row_sep_len2; j++) {
|
|
1096
|
+
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1097
|
+
}
|
|
1098
|
+
}
|
|
1099
|
+
if (valid_close) {
|
|
1100
|
+
in_quotes = false;
|
|
1101
|
+
field_started = true;
|
|
1102
|
+
}
|
|
1103
|
+
// else: quote inside quoted field → literal (handles "" doubling)
|
|
1104
|
+
} else if (!field_started) {
|
|
1105
|
+
in_quotes = true; // opening quote at field boundary
|
|
1106
|
+
field_started = true;
|
|
1107
|
+
}
|
|
1108
|
+
// else: mid-field quote → treat as literal
|
|
1109
|
+
} else {
|
|
1110
|
+
in_quotes = !in_quotes;
|
|
1111
|
+
}
|
|
1112
|
+
}
|
|
1113
|
+
} else if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) {
|
|
1114
|
+
if (strip_ws) {
|
|
1115
|
+
if (*p != ' ' && *p != '\t') {
|
|
1116
|
+
field_started = true;
|
|
1117
|
+
} else if (!field_started) {
|
|
1118
|
+
startP = p + 1; /* advance past leading whitespace so quote-detection at extraction sees the quote */
|
|
1119
|
+
}
|
|
1120
|
+
} else {
|
|
1121
|
+
field_started = true;
|
|
753
1122
|
}
|
|
754
1123
|
}
|
|
755
1124
|
backslash_count = 0;
|
|
@@ -758,13 +1127,20 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
758
1127
|
}
|
|
759
1128
|
}
|
|
760
1129
|
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
1130
|
+
section5_done:;
|
|
1131
|
+
/* Unclosed quote at end of line (skip check on early exit):
|
|
1132
|
+
* Signal "needs more data" — the caller stitches the next physical line and re-parses.
|
|
1133
|
+
* We return [nil, -1] rather than raising so the read loop can handle multiline fields
|
|
1134
|
+
* without a separate pre-scan pass (detect_multiline). */
|
|
1135
|
+
if (!did_early_exit && in_quotes) {
|
|
1136
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1137
|
+
rb_ary_push(result, Qnil);
|
|
1138
|
+
rb_ary_push(result, LONG2FIX(-1));
|
|
1139
|
+
return result;
|
|
764
1140
|
}
|
|
765
1141
|
|
|
766
|
-
/* Process the last field (same logic as above) */
|
|
767
|
-
{
|
|
1142
|
+
/* Process the last field (same logic as above) — skip on early exit */
|
|
1143
|
+
if (!did_early_exit) {
|
|
768
1144
|
long field_len = endP - startP;
|
|
769
1145
|
char *raw_field = startP;
|
|
770
1146
|
|
|
@@ -786,8 +1162,10 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
786
1162
|
|
|
787
1163
|
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
788
1164
|
|
|
789
|
-
if (
|
|
790
|
-
|
|
1165
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1166
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1167
|
+
all_blank = false;
|
|
1168
|
+
}
|
|
791
1169
|
element_count++;
|
|
792
1170
|
}
|
|
793
1171
|
}
|
|
@@ -817,7 +1195,9 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
817
1195
|
if (!remove_empty_values) {
|
|
818
1196
|
ensure_hash_allocated(&xform);
|
|
819
1197
|
for (long i = element_count; i < headers_len; i++) {
|
|
820
|
-
|
|
1198
|
+
if (!keep_bitmap || keep_bitmap[i]) {
|
|
1199
|
+
rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
|
|
1200
|
+
}
|
|
821
1201
|
}
|
|
822
1202
|
}
|
|
823
1203
|
|
|
@@ -833,6 +1213,550 @@ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE
|
|
|
833
1213
|
return result;
|
|
834
1214
|
}
|
|
835
1215
|
|
|
1216
|
+
/* ================================================================================
|
|
1217
|
+
* new_parse_context_c(headers, options_hash) → ParseContext
|
|
1218
|
+
*
|
|
1219
|
+
* Extracts all loop-invariant options from the options_hash once and stores them
|
|
1220
|
+
* in a C struct wrapped as a TypedData Ruby object. Called once per file after
|
|
1221
|
+
* headers are known. The returned context is passed to parse_line_to_hash_ctx_c
|
|
1222
|
+
* on every row, eliminating ~10 rb_hash_aref calls per row.
|
|
1223
|
+
* ================================================================================ */
|
|
1224
|
+
__attribute__((cold)) static VALUE rb_new_parse_context(VALUE self, VALUE headers, VALUE options_hash) {
|
|
1225
|
+
parse_context_t *ctx;
|
|
1226
|
+
VALUE ctx_obj = TypedData_Make_Struct(rb_cObject, parse_context_t, &parse_context_type, ctx);
|
|
1227
|
+
|
|
1228
|
+
/* Initialize all fields to safe defaults */
|
|
1229
|
+
memset(ctx, 0, sizeof(parse_context_t));
|
|
1230
|
+
ctx->headers = headers;
|
|
1231
|
+
ctx->numeric_keys = Qnil;
|
|
1232
|
+
ctx->keep_bitmap = NULL;
|
|
1233
|
+
ctx->early_exit_after = -1;
|
|
1234
|
+
ctx->keep_extra_columns = true;
|
|
1235
|
+
|
|
1236
|
+
/* col_sep */
|
|
1237
|
+
VALUE col_sep_val = rb_hash_aref(options_hash, ID2SYM(id_col_sep));
|
|
1238
|
+
if (RB_TYPE_P(col_sep_val, T_STRING)) {
|
|
1239
|
+
long len = RSTRING_LEN(col_sep_val);
|
|
1240
|
+
if (len > (long)(sizeof(ctx->col_sep_buf) - 1)) len = (long)(sizeof(ctx->col_sep_buf) - 1);
|
|
1241
|
+
memcpy(ctx->col_sep_buf, RSTRING_PTR(col_sep_val), (size_t)len);
|
|
1242
|
+
ctx->col_sep_buf[len] = '\0';
|
|
1243
|
+
ctx->col_sep_len = (int)len;
|
|
1244
|
+
} else {
|
|
1245
|
+
ctx->col_sep_buf[0] = ',';
|
|
1246
|
+
ctx->col_sep_buf[1] = '\0';
|
|
1247
|
+
ctx->col_sep_len = 1;
|
|
1248
|
+
}
|
|
1249
|
+
|
|
1250
|
+
/* quote_char */
|
|
1251
|
+
VALUE quote_char_v = rb_hash_aref(options_hash, ID2SYM(id_quote_char));
|
|
1252
|
+
ctx->quote_char_val = (RB_TYPE_P(quote_char_v, T_STRING) && RSTRING_LEN(quote_char_v) > 0)
|
|
1253
|
+
? RSTRING_PTR(quote_char_v)[0] : '"';
|
|
1254
|
+
|
|
1255
|
+
/* row_sep */
|
|
1256
|
+
VALUE row_sep_v = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
|
|
1257
|
+
if (RB_TYPE_P(row_sep_v, T_STRING)) {
|
|
1258
|
+
long len = RSTRING_LEN(row_sep_v);
|
|
1259
|
+
if (len > (long)(sizeof(ctx->row_sep_buf) - 1)) len = (long)(sizeof(ctx->row_sep_buf) - 1);
|
|
1260
|
+
memcpy(ctx->row_sep_buf, RSTRING_PTR(row_sep_v), (size_t)len);
|
|
1261
|
+
ctx->row_sep_buf[len] = '\0';
|
|
1262
|
+
ctx->row_sep_len = (int)len;
|
|
1263
|
+
}
|
|
1264
|
+
|
|
1265
|
+
/* missing_header_prefix */
|
|
1266
|
+
VALUE header_prefix = rb_hash_aref(options_hash, ID2SYM(id_missing_header_prefix));
|
|
1267
|
+
if (NIL_P(header_prefix)) {
|
|
1268
|
+
ctx->prefix_str = "column_";
|
|
1269
|
+
} else {
|
|
1270
|
+
long len = RSTRING_LEN(header_prefix);
|
|
1271
|
+
if (len > (long)(sizeof(ctx->prefix_buf) - 1)) len = (long)(sizeof(ctx->prefix_buf) - 1);
|
|
1272
|
+
memcpy(ctx->prefix_buf, RSTRING_PTR(header_prefix), (size_t)len);
|
|
1273
|
+
ctx->prefix_buf[len] = '\0';
|
|
1274
|
+
ctx->prefix_str = ctx->prefix_buf;
|
|
1275
|
+
}
|
|
1276
|
+
|
|
1277
|
+
/* Boolean flags */
|
|
1278
|
+
ctx->strip_ws = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strip_whitespace)));
|
|
1279
|
+
ctx->remove_empty = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_hashes)));
|
|
1280
|
+
ctx->remove_empty_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_empty_values)));
|
|
1281
|
+
ctx->remove_zero_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_zero_values)));
|
|
1282
|
+
|
|
1283
|
+
/* Numeric conversion */
|
|
1284
|
+
VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
|
|
1285
|
+
if (RTEST(convert_opt)) {
|
|
1286
|
+
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
1287
|
+
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
1288
|
+
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
1289
|
+
if (RTEST(only_keys)) {
|
|
1290
|
+
ctx->numeric_mode = 2;
|
|
1291
|
+
ctx->numeric_keys = rb_Array(only_keys);
|
|
1292
|
+
} else if (RTEST(except_keys)) {
|
|
1293
|
+
ctx->numeric_mode = 3;
|
|
1294
|
+
ctx->numeric_keys = rb_Array(except_keys);
|
|
1295
|
+
}
|
|
1296
|
+
} else {
|
|
1297
|
+
ctx->numeric_mode = 1;
|
|
1298
|
+
}
|
|
1299
|
+
}
|
|
1300
|
+
|
|
1301
|
+
/* quote_escaping → allow_escaped_quotes */
|
|
1302
|
+
VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
|
|
1303
|
+
if (RB_TYPE_P(quote_escaping_val, T_SYMBOL)) {
|
|
1304
|
+
ctx->allow_escaped_quotes = (SYM2ID(quote_escaping_val) == id_backslash);
|
|
1305
|
+
}
|
|
1306
|
+
|
|
1307
|
+
/* quote_boundary */
|
|
1308
|
+
VALUE quote_boundary_val = rb_hash_aref(options_hash, ID2SYM(id_quote_boundary));
|
|
1309
|
+
ctx->quote_boundary_standard = (RB_TYPE_P(quote_boundary_val, T_SYMBOL) &&
|
|
1310
|
+
SYM2ID(quote_boundary_val) == id_standard);
|
|
1311
|
+
|
|
1312
|
+
/* Column filter bitmap */
|
|
1313
|
+
long headers_len = NIL_P(headers) ? 0 : RARRAY_LEN(headers);
|
|
1314
|
+
ctx->hash_capa = headers_len > 0 ? headers_len : 16;
|
|
1315
|
+
|
|
1316
|
+
VALUE keep_cols_val = rb_hash_aref(options_hash, ID2SYM(id_keep_cols));
|
|
1317
|
+
if (keep_cols_val != Qfalse) {
|
|
1318
|
+
if (NIL_P(keep_cols_val)) {
|
|
1319
|
+
/* nil: reader.rb filter path — check _keep_bitmap, or fall back to deriving it. */
|
|
1320
|
+
VALUE prebuilt_bitmap = rb_hash_aref(options_hash, ID2SYM(id_keep_bitmap));
|
|
1321
|
+
if (RB_TYPE_P(prebuilt_bitmap, T_STRING)
|
|
1322
|
+
&& headers_len > 0 && RSTRING_LEN(prebuilt_bitmap) >= headers_len) {
|
|
1323
|
+
ctx->keep_bitmap = (bool *)xmalloc((size_t)headers_len * sizeof(bool));
|
|
1324
|
+
ctx->keep_bitmap_len = headers_len;
|
|
1325
|
+
memcpy(ctx->keep_bitmap, RSTRING_PTR(prebuilt_bitmap), (size_t)headers_len * sizeof(bool));
|
|
1326
|
+
VALUE kec = rb_hash_aref(options_hash, ID2SYM(id_keep_extra_cols));
|
|
1327
|
+
ctx->keep_extra_columns = NIL_P(kec) ? true : RTEST(kec);
|
|
1328
|
+
VALUE exa = rb_hash_aref(options_hash, ID2SYM(id_early_exit_after_sym));
|
|
1329
|
+
ctx->early_exit_after = RB_INTEGER_TYPE_P(exa) ? NUM2LONG(exa) : -1;
|
|
1330
|
+
ctx->has_only = !ctx->keep_extra_columns;
|
|
1331
|
+
} else if (headers_len > 0 && headers_len <= 4096) {
|
|
1332
|
+
/* Last resort: derive from only_headers/except_headers directly. */
|
|
1333
|
+
VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
|
|
1334
|
+
VALUE except_hdrs = rb_hash_aref(options_hash, ID2SYM(id_except_headers));
|
|
1335
|
+
bool has_except = RB_TYPE_P(except_hdrs, T_ARRAY) && RARRAY_LEN(except_hdrs) > 0;
|
|
1336
|
+
ctx->has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
|
|
1337
|
+
if (ctx->has_only || has_except) {
|
|
1338
|
+
ctx->keep_bitmap = (bool *)xmalloc((size_t)headers_len * sizeof(bool));
|
|
1339
|
+
ctx->keep_bitmap_len = headers_len;
|
|
1340
|
+
for (long bi = 0; bi < headers_len; bi++) {
|
|
1341
|
+
VALUE hdr = rb_ary_entry(headers, bi);
|
|
1342
|
+
ctx->keep_bitmap[bi] = ctx->has_only
|
|
1343
|
+
? (rb_ary_includes(only_hdrs, hdr) == Qtrue)
|
|
1344
|
+
: (rb_ary_includes(except_hdrs, hdr) != Qtrue);
|
|
1345
|
+
}
|
|
1346
|
+
ctx->keep_extra_columns = !ctx->has_only;
|
|
1347
|
+
bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
|
|
1348
|
+
if (ctx->has_only && !strict) {
|
|
1349
|
+
for (long bi = headers_len - 1; bi >= 0; bi--) {
|
|
1350
|
+
if (ctx->keep_bitmap[bi]) { ctx->early_exit_after = bi; break; }
|
|
1351
|
+
}
|
|
1352
|
+
}
|
|
1353
|
+
}
|
|
1354
|
+
}
|
|
1355
|
+
} else if (RB_TYPE_P(keep_cols_val, T_ARRAY) && headers_len > 0 && headers_len <= 4096) {
|
|
1356
|
+
/* Backward-compat: _keep_cols Array from direct C API callers */
|
|
1357
|
+
ctx->keep_bitmap = (bool *)xmalloc((size_t)headers_len * sizeof(bool));
|
|
1358
|
+
ctx->keep_bitmap_len = headers_len;
|
|
1359
|
+
long prebuilt_len = RARRAY_LEN(keep_cols_val);
|
|
1360
|
+
for (long bi = 0; bi < headers_len; bi++) {
|
|
1361
|
+
ctx->keep_bitmap[bi] = bi < prebuilt_len ? RTEST(rb_ary_entry(keep_cols_val, bi)) : false;
|
|
1362
|
+
}
|
|
1363
|
+
VALUE only_hdrs = rb_hash_aref(options_hash, ID2SYM(id_only_headers));
|
|
1364
|
+
ctx->has_only = RB_TYPE_P(only_hdrs, T_ARRAY) && RARRAY_LEN(only_hdrs) > 0;
|
|
1365
|
+
ctx->keep_extra_columns = !ctx->has_only;
|
|
1366
|
+
bool strict = RTEST(rb_hash_aref(options_hash, ID2SYM(id_strict)));
|
|
1367
|
+
if (ctx->has_only && !strict) {
|
|
1368
|
+
for (long bi = headers_len - 1; bi >= 0; bi--) {
|
|
1369
|
+
if (ctx->keep_bitmap[bi]) { ctx->early_exit_after = bi; break; }
|
|
1370
|
+
}
|
|
1371
|
+
}
|
|
1372
|
+
}
|
|
1373
|
+
}
|
|
1374
|
+
/* else: _keep_cols == false — no filtering; keep_bitmap stays NULL */
|
|
1375
|
+
|
|
1376
|
+
return ctx_obj;
|
|
1377
|
+
}
|
|
1378
|
+
|
|
1379
|
+
/* ================================================================================
|
|
1380
|
+
* parse_line_to_hash_ctx_c(line, ctx) → [hash, data_size]
|
|
1381
|
+
*
|
|
1382
|
+
* High-performance variant of parse_line_to_hash_c that reads all loop-invariant
|
|
1383
|
+
* options from a pre-built ParseContext object instead of calling rb_hash_aref on
|
|
1384
|
+
* every row. Eliminates ~10 rb_hash_aref calls per row from the critical path.
|
|
1385
|
+
*
|
|
1386
|
+
* ctx must be a ParseContext built by new_parse_context_c(headers, options_hash).
|
|
1387
|
+
* headers_len is re-read each call from RARRAY_LEN(ctx->headers) to handle extra
|
|
1388
|
+
* column growth without requiring a context rebuild.
|
|
1389
|
+
* ================================================================================ */
|
|
1390
|
+
__attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE line, VALUE ctx_obj) {
|
|
1391
|
+
parse_context_t *ctx;
|
|
1392
|
+
TypedData_Get_Struct(ctx_obj, parse_context_t, &parse_context_type, ctx);
|
|
1393
|
+
|
|
1394
|
+
/* ----------------------------------------
|
|
1395
|
+
* SECTION 1: Handle nil/invalid input
|
|
1396
|
+
* ---------------------------------------- */
|
|
1397
|
+
if (NIL_P(line)) {
|
|
1398
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1399
|
+
rb_ary_push(result, Qnil);
|
|
1400
|
+
rb_ary_push(result, INT2FIX(0));
|
|
1401
|
+
return result;
|
|
1402
|
+
}
|
|
1403
|
+
|
|
1404
|
+
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
1405
|
+
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line_to_hash: line has to be a string or nil");
|
|
1406
|
+
}
|
|
1407
|
+
|
|
1408
|
+
/* ----------------------------------------
|
|
1409
|
+
* SECTION 2: Read options from context (zero rb_hash_aref calls)
|
|
1410
|
+
* ----------------------------------------
|
|
1411
|
+
* All loop-invariant options are read directly from the pre-built struct.
|
|
1412
|
+
* No Hash lookups. No Ruby object allocation. Pure C struct field reads.
|
|
1413
|
+
*/
|
|
1414
|
+
char *col_sepP = ctx->col_sep_buf;
|
|
1415
|
+
long col_sep_len = (long)ctx->col_sep_len;
|
|
1416
|
+
char quote_char_val = ctx->quote_char_val;
|
|
1417
|
+
const char *prefix_str = ctx->prefix_str;
|
|
1418
|
+
bool strip_ws = ctx->strip_ws;
|
|
1419
|
+
bool remove_empty = ctx->remove_empty;
|
|
1420
|
+
bool remove_empty_values = ctx->remove_empty_values;
|
|
1421
|
+
bool remove_zero_values = ctx->remove_zero_values;
|
|
1422
|
+
int numeric_mode = ctx->numeric_mode;
|
|
1423
|
+
VALUE numeric_keys = ctx->numeric_keys;
|
|
1424
|
+
bool *keep_bitmap = ctx->keep_bitmap;
|
|
1425
|
+
bool keep_extra_columns = ctx->keep_extra_columns;
|
|
1426
|
+
long early_exit_after = ctx->early_exit_after;
|
|
1427
|
+
|
|
1428
|
+
/* allow_escaped_quotes starts from context; per-line Opt #5 may downgrade it */
|
|
1429
|
+
bool allow_escaped_quotes = ctx->allow_escaped_quotes;
|
|
1430
|
+
bool quote_boundary_standard = ctx->quote_boundary_standard;
|
|
1431
|
+
|
|
1432
|
+
rb_encoding *encoding = rb_enc_get(line);
|
|
1433
|
+
char *startP = RSTRING_PTR(line);
|
|
1434
|
+
long line_len = RSTRING_LEN(line);
|
|
1435
|
+
char *endP = startP + line_len;
|
|
1436
|
+
char *p = startP;
|
|
1437
|
+
|
|
1438
|
+
/* Chomp: strip trailing row separator (pointer adjustment, no string mutation) */
|
|
1439
|
+
if (ctx->row_sep_len > 0) {
|
|
1440
|
+
long rsl = (long)ctx->row_sep_len;
|
|
1441
|
+
if (line_len >= rsl && memcmp(endP - rsl, ctx->row_sep_buf, (size_t)rsl) == 0) {
|
|
1442
|
+
endP -= rsl;
|
|
1443
|
+
}
|
|
1444
|
+
}
|
|
1445
|
+
|
|
1446
|
+
/* Re-read headers_len each call to handle extra-column growth */
|
|
1447
|
+
long headers_len = NIL_P(ctx->headers) ? 0 : RARRAY_LEN(ctx->headers);
|
|
1448
|
+
VALUE headers = ctx->headers;
|
|
1449
|
+
|
|
1450
|
+
/* Check if line contains quote characters (per-line; cannot be precomputed) */
|
|
1451
|
+
bool has_quotes = (memchr(startP, quote_char_val, line_len) != NULL);
|
|
1452
|
+
|
|
1453
|
+
bool did_early_exit = false;
|
|
1454
|
+
|
|
1455
|
+
/* ----------------------------------------
|
|
1456
|
+
* SECTION 3: Initialize hash and tracking variables
|
|
1457
|
+
* ---------------------------------------- */
|
|
1458
|
+
long hash_size = headers_len > 0 ? headers_len : 16;
|
|
1459
|
+
long element_count = 0;
|
|
1460
|
+
bool all_blank = true;
|
|
1461
|
+
|
|
1462
|
+
field_transform_opts xform = {
|
|
1463
|
+
.hash = Qnil,
|
|
1464
|
+
.headers = headers,
|
|
1465
|
+
.numeric_keys = numeric_keys,
|
|
1466
|
+
.encoding = encoding,
|
|
1467
|
+
.prefix_str = prefix_str,
|
|
1468
|
+
.headers_len = headers_len,
|
|
1469
|
+
.hash_capa = hash_size,
|
|
1470
|
+
.numeric_mode = numeric_mode,
|
|
1471
|
+
.remove_empty_values = remove_empty_values,
|
|
1472
|
+
.remove_zero_values = remove_zero_values,
|
|
1473
|
+
};
|
|
1474
|
+
|
|
1475
|
+
/* ========================================
|
|
1476
|
+
* SECTION 4: FAST PATH - No quotes, single-char separator
|
|
1477
|
+
* Two sub-paths to avoid per-field overhead in the common case:
|
|
1478
|
+
* (a) no filter + no early exit → pure memchr loop, zero extra branches
|
|
1479
|
+
* (b) filter active → bitmap/early-exit checks per field
|
|
1480
|
+
* ======================================== */
|
|
1481
|
+
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
|
1482
|
+
char sep = *col_sepP;
|
|
1483
|
+
char *sep_pos = NULL;
|
|
1484
|
+
|
|
1485
|
+
if (__builtin_expect(keep_bitmap == NULL && early_exit_after < 0, 1)) {
|
|
1486
|
+
/* --- (a) Common path: no column filter, no early exit --- */
|
|
1487
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1488
|
+
long field_len = sep_pos - startP;
|
|
1489
|
+
char *trim_start = startP;
|
|
1490
|
+
char *trim_end = startP + field_len - 1;
|
|
1491
|
+
if (strip_ws) {
|
|
1492
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1493
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1494
|
+
}
|
|
1495
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1496
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1497
|
+
all_blank = false;
|
|
1498
|
+
element_count++;
|
|
1499
|
+
p = sep_pos + 1; startP = p;
|
|
1500
|
+
}
|
|
1501
|
+
/* Process last field */
|
|
1502
|
+
{
|
|
1503
|
+
long field_len = endP - startP;
|
|
1504
|
+
char *trim_start = startP;
|
|
1505
|
+
char *trim_end = startP + field_len - 1;
|
|
1506
|
+
if (strip_ws) {
|
|
1507
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1508
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1509
|
+
}
|
|
1510
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1511
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1512
|
+
all_blank = false;
|
|
1513
|
+
element_count++;
|
|
1514
|
+
}
|
|
1515
|
+
} else {
|
|
1516
|
+
/* --- (b) Filter path: column bitmap and/or early exit active --- */
|
|
1517
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1518
|
+
long field_len = sep_pos - startP;
|
|
1519
|
+
char *trim_start = startP;
|
|
1520
|
+
char *trim_end = startP + field_len - 1;
|
|
1521
|
+
if (strip_ws) {
|
|
1522
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1523
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1524
|
+
}
|
|
1525
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1526
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1527
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1528
|
+
all_blank = false;
|
|
1529
|
+
}
|
|
1530
|
+
element_count++;
|
|
1531
|
+
if (early_exit_after >= 0 && element_count > early_exit_after) {
|
|
1532
|
+
did_early_exit = true;
|
|
1533
|
+
break;
|
|
1534
|
+
}
|
|
1535
|
+
p = sep_pos + 1; startP = p;
|
|
1536
|
+
}
|
|
1537
|
+
/* Process last field — skip on early exit */
|
|
1538
|
+
if (!did_early_exit) {
|
|
1539
|
+
long field_len = endP - startP;
|
|
1540
|
+
char *trim_start = startP;
|
|
1541
|
+
char *trim_end = startP + field_len - 1;
|
|
1542
|
+
if (strip_ws) {
|
|
1543
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1544
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1545
|
+
}
|
|
1546
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1547
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1548
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1549
|
+
all_blank = false;
|
|
1550
|
+
}
|
|
1551
|
+
element_count++;
|
|
1552
|
+
}
|
|
1553
|
+
}
|
|
1554
|
+
|
|
1555
|
+
} else {
|
|
1556
|
+
/* ========================================
|
|
1557
|
+
* SECTION 5: SLOW PATH - Quoted fields or multi-char separator
|
|
1558
|
+
* ========================================
|
|
1559
|
+
* Quote escaping options are read from the context (no rb_hash_aref).
|
|
1560
|
+
* Opt #5: downgrade to RFC mode if backslash mode is requested but this
|
|
1561
|
+
* specific line contains no backslash — allows memchr skip-ahead inside quotes.
|
|
1562
|
+
*/
|
|
1563
|
+
if (allow_escaped_quotes && !memchr(startP, '\\', endP - startP)) {
|
|
1564
|
+
allow_escaped_quotes = false;
|
|
1565
|
+
}
|
|
1566
|
+
|
|
1567
|
+
char *row_sepP2 = (ctx->row_sep_len > 0) ? ctx->row_sep_buf : NULL;
|
|
1568
|
+
long row_sep_len2 = (long)ctx->row_sep_len;
|
|
1569
|
+
|
|
1570
|
+
long i;
|
|
1571
|
+
long backslash_count = 0;
|
|
1572
|
+
bool in_quotes = false;
|
|
1573
|
+
bool col_sep_found = true;
|
|
1574
|
+
bool field_started = false;
|
|
1575
|
+
|
|
1576
|
+
char sep_char_slow = *col_sepP;
|
|
1577
|
+
|
|
1578
|
+
while (p < endP) {
|
|
1579
|
+
if (!in_quotes && *p == sep_char_slow) {
|
|
1580
|
+
col_sep_found = true;
|
|
1581
|
+
for (i = 1; (i < col_sep_len) && (p + i < endP); i++) {
|
|
1582
|
+
if (*(p + i) != *(col_sepP + i)) { col_sep_found = false; break; }
|
|
1583
|
+
}
|
|
1584
|
+
} else {
|
|
1585
|
+
col_sep_found = false;
|
|
1586
|
+
}
|
|
1587
|
+
|
|
1588
|
+
if (col_sep_found) {
|
|
1589
|
+
long field_len = p - startP;
|
|
1590
|
+
char *raw_field = startP;
|
|
1591
|
+
|
|
1592
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
1593
|
+
if (quoted) {
|
|
1594
|
+
raw_field++;
|
|
1595
|
+
field_len -= 2;
|
|
1596
|
+
}
|
|
1597
|
+
|
|
1598
|
+
char *trim_start = raw_field;
|
|
1599
|
+
char *trim_end = raw_field + field_len - 1;
|
|
1600
|
+
|
|
1601
|
+
if (strip_ws) {
|
|
1602
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1603
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1604
|
+
}
|
|
1605
|
+
|
|
1606
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1607
|
+
|
|
1608
|
+
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1609
|
+
|
|
1610
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1611
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1612
|
+
all_blank = false;
|
|
1613
|
+
}
|
|
1614
|
+
element_count++;
|
|
1615
|
+
|
|
1616
|
+
if (early_exit_after >= 0 && element_count > early_exit_after) {
|
|
1617
|
+
did_early_exit = true;
|
|
1618
|
+
goto section5_done_ctx;
|
|
1619
|
+
}
|
|
1620
|
+
|
|
1621
|
+
p += col_sep_len;
|
|
1622
|
+
startP = p;
|
|
1623
|
+
backslash_count = 0;
|
|
1624
|
+
field_started = false;
|
|
1625
|
+
|
|
1626
|
+
} else {
|
|
1627
|
+
/* Not at a separator (or inside quotes) — track quote state */
|
|
1628
|
+
|
|
1629
|
+
/* RFC mode: memchr skip-ahead inside quoted fields (Opt #6) */
|
|
1630
|
+
if (!allow_escaped_quotes && in_quotes) {
|
|
1631
|
+
char *next_quote = (char *)memchr(p, quote_char_val, endP - p);
|
|
1632
|
+
if (!next_quote) { p = endP; continue; }
|
|
1633
|
+
p = next_quote; /* fall through to quote-handling code */
|
|
1634
|
+
}
|
|
1635
|
+
|
|
1636
|
+
if (allow_escaped_quotes && *p == '\\') {
|
|
1637
|
+
backslash_count++;
|
|
1638
|
+
if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) field_started = true;
|
|
1639
|
+
} else {
|
|
1640
|
+
if (*p == quote_char_val) {
|
|
1641
|
+
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
1642
|
+
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
1643
|
+
if (in_quotes) {
|
|
1644
|
+
/* closing quote: only valid if followed by col_sep, row_sep, or end */
|
|
1645
|
+
bool valid_close = (p + 1 >= endP);
|
|
1646
|
+
if (!valid_close) {
|
|
1647
|
+
valid_close = true;
|
|
1648
|
+
for (long j = 0; j < col_sep_len; j++) {
|
|
1649
|
+
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1650
|
+
}
|
|
1651
|
+
}
|
|
1652
|
+
if (!valid_close && row_sep_len2 > 0) {
|
|
1653
|
+
valid_close = true;
|
|
1654
|
+
for (long j = 0; j < row_sep_len2; j++) {
|
|
1655
|
+
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1656
|
+
}
|
|
1657
|
+
}
|
|
1658
|
+
if (valid_close) {
|
|
1659
|
+
in_quotes = false;
|
|
1660
|
+
field_started = true;
|
|
1661
|
+
}
|
|
1662
|
+
/* else: quote inside quoted field → literal (handles "" doubling) */
|
|
1663
|
+
} else if (!field_started) {
|
|
1664
|
+
in_quotes = true; /* opening quote at field boundary */
|
|
1665
|
+
field_started = true;
|
|
1666
|
+
}
|
|
1667
|
+
/* else: mid-field quote → treat as literal */
|
|
1668
|
+
} else {
|
|
1669
|
+
in_quotes = !in_quotes;
|
|
1670
|
+
}
|
|
1671
|
+
}
|
|
1672
|
+
} else if (__builtin_expect(quote_boundary_standard, 1) && !in_quotes) {
|
|
1673
|
+
if (strip_ws) {
|
|
1674
|
+
if (*p != ' ' && *p != '\t') {
|
|
1675
|
+
field_started = true;
|
|
1676
|
+
} else if (!field_started) {
|
|
1677
|
+
startP = p + 1;
|
|
1678
|
+
}
|
|
1679
|
+
} else {
|
|
1680
|
+
field_started = true;
|
|
1681
|
+
}
|
|
1682
|
+
}
|
|
1683
|
+
backslash_count = 0;
|
|
1684
|
+
}
|
|
1685
|
+
p++;
|
|
1686
|
+
}
|
|
1687
|
+
}
|
|
1688
|
+
|
|
1689
|
+
section5_done_ctx:;
|
|
1690
|
+
/* Unclosed quote at end of line — signal multiline continuation */
|
|
1691
|
+
if (!did_early_exit && in_quotes) {
|
|
1692
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1693
|
+
rb_ary_push(result, Qnil);
|
|
1694
|
+
rb_ary_push(result, LONG2FIX(-1));
|
|
1695
|
+
return result;
|
|
1696
|
+
}
|
|
1697
|
+
|
|
1698
|
+
/* Process the last field — skip on early exit */
|
|
1699
|
+
if (!did_early_exit) {
|
|
1700
|
+
long field_len = endP - startP;
|
|
1701
|
+
char *raw_field = startP;
|
|
1702
|
+
|
|
1703
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
1704
|
+
if (quoted) {
|
|
1705
|
+
raw_field++;
|
|
1706
|
+
field_len -= 2;
|
|
1707
|
+
}
|
|
1708
|
+
|
|
1709
|
+
char *trim_start = raw_field;
|
|
1710
|
+
char *trim_end = raw_field + field_len - 1;
|
|
1711
|
+
|
|
1712
|
+
if (strip_ws) {
|
|
1713
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1714
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1715
|
+
}
|
|
1716
|
+
|
|
1717
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1718
|
+
|
|
1719
|
+
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1720
|
+
|
|
1721
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1722
|
+
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1723
|
+
all_blank = false;
|
|
1724
|
+
}
|
|
1725
|
+
element_count++;
|
|
1726
|
+
}
|
|
1727
|
+
}
|
|
1728
|
+
|
|
1729
|
+
/* ----------------------------------------
|
|
1730
|
+
* SECTION 6: Handle blank rows
|
|
1731
|
+
* ---------------------------------------- */
|
|
1732
|
+
if (remove_empty && all_blank) {
|
|
1733
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1734
|
+
rb_ary_push(result, Qnil);
|
|
1735
|
+
rb_ary_push(result, LONG2FIX(element_count));
|
|
1736
|
+
return result;
|
|
1737
|
+
}
|
|
1738
|
+
|
|
1739
|
+
/* ----------------------------------------
|
|
1740
|
+
* SECTION 7: Pad hash with nil for missing columns (conditional)
|
|
1741
|
+
* ---------------------------------------- */
|
|
1742
|
+
if (!remove_empty_values) {
|
|
1743
|
+
ensure_hash_allocated(&xform);
|
|
1744
|
+
for (long i = element_count; i < headers_len; i++) {
|
|
1745
|
+
if (!keep_bitmap || keep_bitmap[i]) {
|
|
1746
|
+
rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
|
|
1747
|
+
}
|
|
1748
|
+
}
|
|
1749
|
+
}
|
|
1750
|
+
|
|
1751
|
+
/* ----------------------------------------
|
|
1752
|
+
* SECTION 8: Return result
|
|
1753
|
+
* ---------------------------------------- */
|
|
1754
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1755
|
+
rb_ary_push(result, xform.hash);
|
|
1756
|
+
rb_ary_push(result, LONG2FIX(element_count));
|
|
1757
|
+
return result;
|
|
1758
|
+
}
|
|
1759
|
+
|
|
836
1760
|
// Count quote characters in a line, optionally respecting backslash escapes.
|
|
837
1761
|
// This is a performance optimization that replaces the Ruby each_char implementation
|
|
838
1762
|
// which creates a new String object for every character in the line.
|
|
@@ -942,10 +1866,22 @@ void Init_smarter_csv(void) {
|
|
|
942
1866
|
id_remove_zero_values = rb_intern("remove_zero_values");
|
|
943
1867
|
id_only = rb_intern("only");
|
|
944
1868
|
id_except = rb_intern("except");
|
|
945
|
-
|
|
946
|
-
|
|
1869
|
+
id_quote_boundary = rb_intern("quote_boundary");
|
|
1870
|
+
id_only_headers = rb_intern("only_headers");
|
|
1871
|
+
id_except_headers = rb_intern("except_headers");
|
|
1872
|
+
id_keep_cols = rb_intern("_keep_cols");
|
|
1873
|
+
id_keep_bitmap = rb_intern("_keep_bitmap");
|
|
1874
|
+
id_keep_extra_cols = rb_intern("_keep_extra_cols");
|
|
1875
|
+
id_early_exit_after_sym = rb_intern("_early_exit_after");
|
|
1876
|
+
id_strict = rb_intern("strict");
|
|
1877
|
+
id_backslash = rb_intern("backslash");
|
|
1878
|
+
id_standard = rb_intern("standard");
|
|
1879
|
+
|
|
1880
|
+
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 9);
|
|
947
1881
|
rb_define_module_function(Parser, "count_quote_chars_c", rb_count_quote_chars, 4);
|
|
948
1882
|
rb_define_module_function(Parser, "count_quote_chars_auto_c", rb_count_quote_chars_auto, 3);
|
|
949
1883
|
rb_define_module_function(Parser, "zip_to_hash_c", rb_zip_to_hash, 2);
|
|
950
1884
|
rb_define_module_function(Parser, "parse_line_to_hash_c", rb_parse_line_to_hash, 3);
|
|
1885
|
+
rb_define_module_function(Parser, "new_parse_context_c", rb_new_parse_context, 2);
|
|
1886
|
+
rb_define_module_function(Parser, "parse_line_to_hash_ctx_c", rb_parse_line_to_hash_ctx, 2);
|
|
951
1887
|
}
|