smarter_csv 1.17.1 → 1.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +246 -63
- data/CONTRIBUTORS.md +2 -1
- data/README.md +6 -3
- data/UPGRADING.md +251 -0
- data/docs/.nojekyll +0 -0
- data/docs/upgrade_path.json +175 -0
- data/docs/upgrade_wizard.html +498 -0
- data/ext/smarter_csv/smarter_csv.c +248 -323
- data/lib/smarter_csv/parser.rb +40 -12
- data/lib/smarter_csv/version.rb +1 -1
- data/smarter_csv.gemspec +7 -5
- metadata +8 -3
- data/TO_DO.md +0 -109
|
@@ -169,16 +169,107 @@ needs_unescape:
|
|
|
169
169
|
/* Helper: build the 2-element [elements, data_size] tuple returned by rb_parse_csv_line.
|
|
170
170
|
* Aligns this function's return shape with parse_csv_line_ruby and rb_parse_line_to_hash_ctx:
|
|
171
171
|
* data_size = -1 signals "unclosed quoted field — needs more data". */
|
|
172
|
-
static inline
|
|
172
|
+
static inline __attribute__((always_inline))
|
|
173
|
+
VALUE return_parser_result(VALUE elements, long data_size) {
|
|
173
174
|
VALUE result = rb_ary_new_capa(2);
|
|
174
175
|
rb_ary_push(result, elements);
|
|
175
176
|
rb_ary_push(result, LONG2FIX(data_size));
|
|
176
177
|
return result;
|
|
177
178
|
}
|
|
178
179
|
|
|
180
|
+
/* Helper: trim leading/trailing spaces and tabs from a field when strip_ws is set.
 *
 * field     - first byte of the field (not NUL-terminated; bounded by field_len).
 * field_len - number of bytes in the field; values <= 0 are treated as an empty field.
 * strip_ws  - when false the field is returned untouched.
 * out_start - receives the first kept byte. For an empty field this is `field`; for an
 *             all-whitespace field it is one past the field's last byte.
 *
 * Returns the trimmed length (0 when the field is empty or all whitespace).
 *
 * The scan works on the half-open range [head, tail) so that no pointer is ever formed
 * before `field`. A closed-range scan would need field + field_len - 1, which for an
 * empty field at the very start of the line buffer points one byte before the buffer —
 * undefined behaviour in C even if the pointer is never dereferenced.
 * always_inline so each call site still compiles down to two tight loops. */
static inline __attribute__((always_inline))
long trim_field(char *field, long field_len, bool strip_ws, char **out_start) {
  if (field_len <= 0) {
    *out_start = field;
    return 0;
  }

  char *head = field;
  char *tail = field + field_len;  /* one past the last byte of the field */

  if (strip_ws) {
    while (head < tail && (*head == ' ' || *head == '\t')) {
      head++;
    }
    while (tail > head && (tail[-1] == ' ' || tail[-1] == '\t')) {
      tail--;
    }
  }

  *out_start = head;
  return (long)(tail - head);
}
|
|
196
|
+
|
|
197
|
+
/* Result of extract_field: a view into the line buffer describing one field after
 * quote-stripping and trimming. */
typedef struct {
  char *start;      /* first byte of the field's content */
  long len;         /* content length in bytes; 0 for an empty field */
  bool has_quotes;  /* true when the content may still need quote unescaping */
} extracted_field;

/* Helper: turn a raw field slice into the values every extraction site needs.
 *
 * raw_field      - first byte of the raw field, including any surrounding quotes.
 * field_len      - length of the raw field in bytes.
 * strip_ws       - forwarded to trim_field; when set, spaces and tabs are trimmed.
 * quote_char_val - the quote character.
 *
 * Steps:
 *   1. If the slice is at least 2 bytes long and both its first and last byte are the
 *      quote char, drop that surrounding pair.
 *   2. Trim the remainder through trim_field.
 *   3. Report has_quotes = true for a quoted field, or for any non-empty result that
 *      still contains the quote char.
 *
 * Returned by value and always_inline, so the intent is that the struct never has to
 * live in memory at the call site. */
static inline __attribute__((always_inline))
extracted_field extract_field(char *raw_field, long field_len, bool strip_ws, char quote_char_val) {
  bool is_quoted = false;
  if (field_len >= 2) {
    is_quoted = raw_field[0] == quote_char_val
             && raw_field[field_len - 1] == quote_char_val;
  }

  /* Step past the opening quote and drop both quotes from the length. */
  char *body = is_quoted ? raw_field + 1 : raw_field;
  const long body_len = is_quoted ? field_len - 2 : field_len;

  char *content = NULL;
  const long content_len = trim_field(body, body_len, strip_ws, &content);

  bool needs_unescape = is_quoted;
  if (!needs_unescape && content_len > 0) {
    needs_unescape = memchr(content, quote_char_val, content_len) != NULL;
  }

  extracted_field out;
  out.start = content;
  out.len = content_len;
  out.has_quotes = needs_unescape;
  return out;
}
|
|
228
|
+
|
|
229
|
+
/* Helper: is a closing quote at p actually a field close?
 *
 * p           - position of the candidate closing quote.
 * endP        - one past the last byte of the logical line.
 * col_sepP    - column separator bytes, col_sep_len of them.
 * row_sepP    - row separator bytes, row_sep_len of them (may be NULL when the length is 0).
 *
 * Returns true when the quote is followed by end of line, the column separator, or the
 * row separator. An empty column separator (col_sep_len <= 0) matches trivially.
 *
 * Pure read — touches none of the quote loop's state (in_quotes/field_started/etc).
 *
 * The lookahead is bounded by the bytes remaining before endP: a separator that does not
 * fit in [p + 1, endP) cannot match. An unbounded byte-by-byte compare would read past
 * the logical end of the line, and could report a match against bytes that are not part
 * of it (for example a multi-byte separator cut off by endP). */
static inline __attribute__((always_inline))
bool is_valid_close(const char *p, const char *endP,
                    const char *col_sepP, long col_sep_len,
                    const char *row_sepP, long row_sep_len) {
  const char *next = p + 1;
  if (next >= endP) {
    return true;  /* quote is the last byte of the line */
  }

  const long remaining = (long)(endP - next);

  if (col_sep_len <= 0) {
    return true;  /* nothing to compare: an empty separator always matches */
  }
  if (col_sep_len <= remaining && memcmp(next, col_sepP, (size_t)col_sep_len) == 0) {
    return true;
  }
  if (row_sep_len > 0 && row_sep_len <= remaining
      && memcmp(next, row_sepP, (size_t)row_sep_len) == 0) {
    return true;
  }
  return false;
}
|
|
253
|
+
|
|
254
|
+
/* Helper: strip a trailing row separator from the line (pointer adjustment, no string
 * mutation).
 *
 * endP        - one past the last byte of the line.
 * line_len    - length of the line in bytes; used to make sure the separator fits.
 * row_sepP    - row separator bytes; only read when row_sep_len > 0.
 * row_sep_len - separator length; callers can pass (NULL, 0) for "no separator known
 *               yet" without an outer if.
 *
 * Returns endP moved back by row_sep_len when the last row_sep_len bytes before endP
 * equal the separator; otherwise returns endP unchanged. */
static inline __attribute__((always_inline))
char *chomp_row_sep(char *endP, long line_len, const char *row_sepP, long row_sep_len) {
  /* Nothing to strip: no separator supplied, or the line is shorter than it. */
  if (row_sep_len <= 0 || line_len < row_sep_len) {
    return endP;
  }

  char *tail = endP - row_sep_len;
  const bool ends_with_sep = (memcmp(tail, row_sepP, (size_t)row_sep_len) == 0);
  return ends_with_sep ? tail : endP;
}
|
|
269
|
+
|
|
179
270
|
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val, VALUE quote_boundary_standard_val, VALUE row_sep_val) {
|
|
180
271
|
if (RB_TYPE_P(line, T_NIL) == 1) {
|
|
181
|
-
return
|
|
272
|
+
return return_parser_result(rb_ary_new(), 0);
|
|
182
273
|
}
|
|
183
274
|
|
|
184
275
|
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
@@ -205,7 +296,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
205
296
|
if (max_size != Qnil) {
|
|
206
297
|
max_fields = NUM2INT(max_size);
|
|
207
298
|
if (max_fields < 0) {
|
|
208
|
-
return
|
|
299
|
+
return return_parser_result(rb_ary_new(), 0);
|
|
209
300
|
}
|
|
210
301
|
}
|
|
211
302
|
|
|
@@ -229,15 +320,8 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
229
320
|
|
|
230
321
|
long field_len = sep_pos - startP;
|
|
231
322
|
char *raw_field = startP;
|
|
232
|
-
char *trim_start
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
if (strip_ws) {
|
|
236
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
237
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
323
|
+
char *trim_start;
|
|
324
|
+
long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
|
|
241
325
|
|
|
242
326
|
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
243
327
|
rb_ary_push(elements, field);
|
|
@@ -250,21 +334,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
250
334
|
if ((max_fields < 0) || (element_count < max_fields)) {
|
|
251
335
|
long field_len = endP - startP;
|
|
252
336
|
char *raw_field = startP;
|
|
253
|
-
char *trim_start
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
if (strip_ws) {
|
|
257
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
258
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
337
|
+
char *trim_start;
|
|
338
|
+
long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
|
|
262
339
|
|
|
263
340
|
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
264
341
|
rb_ary_push(elements, field);
|
|
265
342
|
}
|
|
266
343
|
|
|
267
|
-
return
|
|
344
|
+
return return_parser_result(elements, RARRAY_LEN(elements));
|
|
268
345
|
}
|
|
269
346
|
|
|
270
347
|
// === SLOW PATH: Quoted fields or multi-char separator ===
|
|
@@ -291,28 +368,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
291
368
|
long field_len = p - startP;
|
|
292
369
|
char *raw_field = startP;
|
|
293
370
|
|
|
294
|
-
|
|
295
|
-
if (quoted) {
|
|
296
|
-
raw_field++;
|
|
297
|
-
field_len -= 2;
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
char *trim_start = raw_field;
|
|
301
|
-
char *trim_end = raw_field + field_len - 1;
|
|
371
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
302
372
|
|
|
303
|
-
if (
|
|
304
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
305
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
306
|
-
}
|
|
307
|
-
|
|
308
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
309
|
-
|
|
310
|
-
if (trimmed_len == 0) {
|
|
373
|
+
if (f.len == 0) {
|
|
311
374
|
field = Qempty_string;
|
|
312
|
-
} else if (
|
|
313
|
-
field = unescape_quotes(
|
|
375
|
+
} else if (f.has_quotes) {
|
|
376
|
+
field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
|
|
314
377
|
} else {
|
|
315
|
-
field = rb_enc_str_new(
|
|
378
|
+
field = rb_enc_str_new(f.start, f.len, encoding);
|
|
316
379
|
}
|
|
317
380
|
|
|
318
381
|
rb_ary_push(elements, field);
|
|
@@ -331,25 +394,27 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
331
394
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
332
395
|
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
333
396
|
if (in_quotes) {
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
397
|
+
if (p + 2 < endP && *(p + 1) == quote_char_val) {
|
|
398
|
+
/* RFC doubled quote inside a quoted field ("" → ").
|
|
399
|
+
* Give this precedence over the closing-quote check, but only
|
|
400
|
+
* when another byte follows the doubled pair.
|
|
401
|
+
*
|
|
402
|
+
* Compatibility note: we intentionally do NOT force terminal
|
|
403
|
+
* "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
404
|
+
* behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
405
|
+
* the final quote may still close the field instead of turning the
|
|
406
|
+
* row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
407
|
+
* precedence for ..."",... (more content follows), but we keep the
|
|
408
|
+
* historical leniency for terminal ..."". */
|
|
409
|
+
p++;
|
|
410
|
+
} else {
|
|
411
|
+
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
412
|
+
if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP, row_sep_len)) {
|
|
413
|
+
in_quotes = false;
|
|
414
|
+
field_started = true;
|
|
346
415
|
}
|
|
416
|
+
// else: quote inside quoted field → literal
|
|
347
417
|
}
|
|
348
|
-
if (valid_close) {
|
|
349
|
-
in_quotes = false;
|
|
350
|
-
field_started = true;
|
|
351
|
-
}
|
|
352
|
-
// else: quote inside quoted field → literal (handles "" doubling)
|
|
353
418
|
} else if (!field_started) {
|
|
354
419
|
in_quotes = true; // opening quote at field boundary
|
|
355
420
|
field_started = true;
|
|
@@ -383,41 +448,27 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
383
448
|
* the signal: append the next physical line and re-parse, or raise MalformedCSV
|
|
384
449
|
* at EOF if the field never closes. The parser does not decide "ultimately
|
|
385
450
|
* malformed"; the caller does. */
|
|
386
|
-
return
|
|
451
|
+
return return_parser_result(rb_ary_new(), -1);
|
|
387
452
|
}
|
|
388
453
|
|
|
389
454
|
if ((max_fields < 0) || (element_count < max_fields)) {
|
|
390
455
|
long field_len = endP - startP;
|
|
391
456
|
char *raw_field = startP;
|
|
392
457
|
|
|
393
|
-
|
|
394
|
-
if (quoted) {
|
|
395
|
-
raw_field++;
|
|
396
|
-
field_len -= 2;
|
|
397
|
-
}
|
|
458
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
398
459
|
|
|
399
|
-
|
|
400
|
-
char *trim_end = raw_field + field_len - 1;
|
|
401
|
-
|
|
402
|
-
if (strip_ws) {
|
|
403
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
404
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
408
|
-
|
|
409
|
-
if (trimmed_len == 0) {
|
|
460
|
+
if (f.len == 0) {
|
|
410
461
|
field = Qempty_string;
|
|
411
|
-
} else if (
|
|
412
|
-
field = unescape_quotes(
|
|
462
|
+
} else if (f.has_quotes) {
|
|
463
|
+
field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
|
|
413
464
|
} else {
|
|
414
|
-
field = rb_enc_str_new(
|
|
465
|
+
field = rb_enc_str_new(f.start, f.len, encoding);
|
|
415
466
|
}
|
|
416
467
|
|
|
417
468
|
rb_ary_push(elements, field);
|
|
418
469
|
}
|
|
419
470
|
|
|
420
|
-
return
|
|
471
|
+
return return_parser_result(elements, RARRAY_LEN(elements));
|
|
421
472
|
}
|
|
422
473
|
|
|
423
474
|
// Efficiently combine two arrays into a hash (replaces headers.zip(values).to_h)
|
|
@@ -675,6 +726,32 @@ static inline __attribute__((always_inline)) bool insert_field_into_hash(
|
|
|
675
726
|
return true;
|
|
676
727
|
}
|
|
677
728
|
|
|
729
|
+
/* Helper: parse the convert_values_to_numeric option into a mode + key list.
|
|
730
|
+
* mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys.
|
|
731
|
+
* Writes through the out-params only when the option is set, so callers must
|
|
732
|
+
* pre-initialize *out_mode = 0 and *out_keys = Qnil. Shared by rb_parse_line_to_hash
|
|
733
|
+
* and rb_new_parse_context — identical logic, different storage (locals vs ctx fields).
|
|
734
|
+
* always_inline so each call site compiles to the same code as the old inline block. */
|
|
735
|
+
static inline __attribute__((always_inline))
|
|
736
|
+
void parse_numeric_option(VALUE options_hash, int *out_mode, VALUE *out_keys) {
|
|
737
|
+
VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
|
|
738
|
+
if (RTEST(convert_opt)) {
|
|
739
|
+
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
740
|
+
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
741
|
+
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
742
|
+
if (RTEST(only_keys)) {
|
|
743
|
+
*out_mode = 2;
|
|
744
|
+
*out_keys = rb_Array(only_keys); // wrap single value in array if needed
|
|
745
|
+
} else if (RTEST(except_keys)) {
|
|
746
|
+
*out_mode = 3;
|
|
747
|
+
*out_keys = rb_Array(except_keys); // wrap single value in array if needed
|
|
748
|
+
}
|
|
749
|
+
} else {
|
|
750
|
+
*out_mode = 1; // convert all
|
|
751
|
+
}
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
|
|
678
755
|
/*
|
|
679
756
|
* ================================================================================
|
|
680
757
|
* rb_parse_line_to_hash - Parse CSV line directly into a Ruby Hash
|
|
@@ -723,10 +800,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
723
800
|
* SECTION 1: Handle nil/invalid input
|
|
724
801
|
* ---------------------------------------- */
|
|
725
802
|
if (NIL_P(line)) {
|
|
726
|
-
|
|
727
|
-
rb_ary_push(result, Qnil);
|
|
728
|
-
rb_ary_push(result, INT2FIX(0));
|
|
729
|
-
return result;
|
|
803
|
+
return return_parser_result(Qnil, 0);
|
|
730
804
|
}
|
|
731
805
|
|
|
732
806
|
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
@@ -751,22 +825,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
751
825
|
// numeric_mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys
|
|
752
826
|
int numeric_mode = 0;
|
|
753
827
|
VALUE numeric_keys = Qnil;
|
|
754
|
-
|
|
755
|
-
if (RTEST(convert_opt)) {
|
|
756
|
-
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
757
|
-
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
758
|
-
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
759
|
-
if (RTEST(only_keys)) {
|
|
760
|
-
numeric_mode = 2;
|
|
761
|
-
numeric_keys = rb_Array(only_keys); // wrap single value in array if needed
|
|
762
|
-
} else if (RTEST(except_keys)) {
|
|
763
|
-
numeric_mode = 3;
|
|
764
|
-
numeric_keys = rb_Array(except_keys); // wrap single value in array if needed
|
|
765
|
-
}
|
|
766
|
-
} else {
|
|
767
|
-
numeric_mode = 1; // convert all
|
|
768
|
-
}
|
|
769
|
-
}
|
|
828
|
+
parse_numeric_option(options_hash, &numeric_mode, &numeric_keys);
|
|
770
829
|
|
|
771
830
|
// quote_escaping and quote_boundary are only needed in Section 5 (quoted/slow path).
|
|
772
831
|
// They are declared here as forward declarations so Section 5 can set them lazily.
|
|
@@ -783,11 +842,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
783
842
|
// row_sep is also reused in Section 5 for the closing-quote boundary check.
|
|
784
843
|
VALUE row_sep = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
|
|
785
844
|
if (!NIL_P(row_sep) && RB_TYPE_P(row_sep, T_STRING)) {
|
|
786
|
-
|
|
787
|
-
long row_sep_len = RSTRING_LEN(row_sep);
|
|
788
|
-
if (line_len >= row_sep_len && memcmp(endP - row_sep_len, row_sepP, row_sep_len) == 0) {
|
|
789
|
-
endP -= row_sep_len;
|
|
790
|
-
}
|
|
845
|
+
endP = chomp_row_sep(endP, line_len, RSTRING_PTR(row_sep), RSTRING_LEN(row_sep));
|
|
791
846
|
}
|
|
792
847
|
|
|
793
848
|
char *col_sepP = RSTRING_PTR(col_sep);
|
|
@@ -829,6 +884,11 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
829
884
|
* the frame stays well below 4 KB and ___chkstk_darwin never fires on ARM64 macOS.
|
|
830
885
|
*/
|
|
831
886
|
bool *keep_bitmap = NULL;
|
|
887
|
+
/* In THIS (non-ctx) function the bitmap is alloca'd to headers_len on every call (see the alloca
|
|
888
|
+
* sites below), so keep_bitmap[] is exactly headers_len long and headers_len is the correct bound
|
|
889
|
+
* at all access sites. Do NOT mirror rb_parse_line_to_hash_ctx's keep_bitmap_len here: that variant
|
|
890
|
+
* caches its bitmap across rows (where @headers can grow), so it must use the captured length; this
|
|
891
|
+
* one rebuilds per call and does not. */
|
|
832
892
|
bool keep_extra_columns = true; /* extra cols (> headers_len): keep by default */
|
|
833
893
|
bool has_only = false; /* true when only_headers: filtering is active */
|
|
834
894
|
long early_exit_after = -1; /* column index after which we stop; -1 = no early exit */
|
|
@@ -955,13 +1015,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
955
1015
|
/* --- (a) Common path: no column filter, no early exit --- */
|
|
956
1016
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
957
1017
|
long field_len = sep_pos - startP;
|
|
958
|
-
char *trim_start
|
|
959
|
-
|
|
960
|
-
if (strip_ws) {
|
|
961
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
962
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
963
|
-
}
|
|
964
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1018
|
+
char *trim_start;
|
|
1019
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
965
1020
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
966
1021
|
all_blank = false;
|
|
967
1022
|
element_count++;
|
|
@@ -970,13 +1025,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
970
1025
|
/* Process last field */
|
|
971
1026
|
{
|
|
972
1027
|
long field_len = endP - startP;
|
|
973
|
-
char *trim_start
|
|
974
|
-
|
|
975
|
-
if (strip_ws) {
|
|
976
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
977
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
978
|
-
}
|
|
979
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1028
|
+
char *trim_start;
|
|
1029
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
980
1030
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
981
1031
|
all_blank = false;
|
|
982
1032
|
element_count++;
|
|
@@ -985,13 +1035,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
985
1035
|
/* --- (b) Filter path: column bitmap and/or early exit active --- */
|
|
986
1036
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
987
1037
|
long field_len = sep_pos - startP;
|
|
988
|
-
char *trim_start
|
|
989
|
-
|
|
990
|
-
if (strip_ws) {
|
|
991
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
992
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
993
|
-
}
|
|
994
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1038
|
+
char *trim_start;
|
|
1039
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
995
1040
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
996
1041
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
997
1042
|
all_blank = false;
|
|
@@ -1006,13 +1051,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1006
1051
|
/* Process last field — skip on early exit */
|
|
1007
1052
|
if (!did_early_exit) {
|
|
1008
1053
|
long field_len = endP - startP;
|
|
1009
|
-
char *trim_start
|
|
1010
|
-
|
|
1011
|
-
if (strip_ws) {
|
|
1012
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1013
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1014
|
-
}
|
|
1015
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1054
|
+
char *trim_start;
|
|
1055
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1016
1056
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1017
1057
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1018
1058
|
all_blank = false;
|
|
@@ -1087,28 +1127,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1087
1127
|
long field_len = p - startP;
|
|
1088
1128
|
char *raw_field = startP;
|
|
1089
1129
|
|
|
1090
|
-
|
|
1091
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
1092
|
-
if (quoted) {
|
|
1093
|
-
raw_field++; // Skip opening quote
|
|
1094
|
-
field_len -= 2; // Exclude both quotes from length
|
|
1095
|
-
}
|
|
1096
|
-
|
|
1097
|
-
char *trim_start = raw_field;
|
|
1098
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1099
|
-
|
|
1100
|
-
if (strip_ws) {
|
|
1101
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1102
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1103
|
-
}
|
|
1104
|
-
|
|
1105
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1106
|
-
|
|
1107
|
-
// Determine if field contains embedded quotes (need unescape)
|
|
1108
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1130
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1109
1131
|
|
|
1110
1132
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1111
|
-
if (insert_field_into_hash(&xform,
|
|
1133
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1112
1134
|
all_blank = false;
|
|
1113
1135
|
}
|
|
1114
1136
|
element_count++;
|
|
@@ -1147,25 +1169,27 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1147
1169
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
1148
1170
|
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
1149
1171
|
if (in_quotes) {
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1172
|
+
if (p + 2 < endP && *(p + 1) == quote_char_val) {
|
|
1173
|
+
/* RFC doubled quote inside a quoted field ("" → ").
|
|
1174
|
+
* Give this precedence over the closing-quote check, but only
|
|
1175
|
+
* when another byte follows the doubled pair.
|
|
1176
|
+
*
|
|
1177
|
+
* Compatibility note: we intentionally do NOT force terminal
|
|
1178
|
+
* "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
1179
|
+
* behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
1180
|
+
* the final quote may still close the field instead of turning the
|
|
1181
|
+
* row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
1182
|
+
* precedence for ..."",... (more content follows), but we keep the
|
|
1183
|
+
* historical leniency for terminal ..."". */
|
|
1184
|
+
p++;
|
|
1185
|
+
} else {
|
|
1186
|
+
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
1187
|
+
if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
|
|
1188
|
+
in_quotes = false;
|
|
1189
|
+
field_started = true;
|
|
1162
1190
|
}
|
|
1191
|
+
// else: quote inside quoted field → literal
|
|
1163
1192
|
}
|
|
1164
|
-
if (valid_close) {
|
|
1165
|
-
in_quotes = false;
|
|
1166
|
-
field_started = true;
|
|
1167
|
-
}
|
|
1168
|
-
// else: quote inside quoted field → literal (handles "" doubling)
|
|
1169
1193
|
} else if (!field_started) {
|
|
1170
1194
|
in_quotes = true; // opening quote at field boundary
|
|
1171
1195
|
field_started = true;
|
|
@@ -1198,10 +1222,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1198
1222
|
* We return [nil, -1] rather than raising so the read loop can handle multiline fields
|
|
1199
1223
|
* without a separate pre-scan pass (detect_multiline). */
|
|
1200
1224
|
if (!did_early_exit && in_quotes) {
|
|
1201
|
-
|
|
1202
|
-
rb_ary_push(result, Qnil);
|
|
1203
|
-
rb_ary_push(result, LONG2FIX(-1));
|
|
1204
|
-
return result;
|
|
1225
|
+
return return_parser_result(Qnil, -1);
|
|
1205
1226
|
}
|
|
1206
1227
|
|
|
1207
1228
|
/* Process the last field (same logic as above) — skip on early exit */
|
|
@@ -1209,26 +1230,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1209
1230
|
long field_len = endP - startP;
|
|
1210
1231
|
char *raw_field = startP;
|
|
1211
1232
|
|
|
1212
|
-
|
|
1213
|
-
if (quoted) {
|
|
1214
|
-
raw_field++;
|
|
1215
|
-
field_len -= 2;
|
|
1216
|
-
}
|
|
1217
|
-
|
|
1218
|
-
char *trim_start = raw_field;
|
|
1219
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1220
|
-
|
|
1221
|
-
if (strip_ws) {
|
|
1222
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1223
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1224
|
-
}
|
|
1225
|
-
|
|
1226
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1227
|
-
|
|
1228
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1233
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1229
1234
|
|
|
1230
1235
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1231
|
-
if (insert_field_into_hash(&xform,
|
|
1236
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1232
1237
|
all_blank = false;
|
|
1233
1238
|
}
|
|
1234
1239
|
element_count++;
|
|
@@ -1249,10 +1254,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1249
1254
|
*/
|
|
1250
1255
|
if (all_blank) {
|
|
1251
1256
|
if (remove_empty) {
|
|
1252
|
-
|
|
1253
|
-
rb_ary_push(result, Qnil);
|
|
1254
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1255
|
-
return result;
|
|
1257
|
+
return return_parser_result(Qnil, element_count);
|
|
1256
1258
|
}
|
|
1257
1259
|
|
|
1258
1260
|
ensure_hash_allocated(&xform);
|
|
@@ -1280,10 +1282,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1280
1282
|
* Return [hash, element_count] so caller can detect extra columns
|
|
1281
1283
|
* (when element_count > headers_len) and extend headers if needed.
|
|
1282
1284
|
*/
|
|
1283
|
-
|
|
1284
|
-
rb_ary_push(result, xform.hash);
|
|
1285
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1286
|
-
return result;
|
|
1285
|
+
return return_parser_result(xform.hash, element_count);
|
|
1287
1286
|
}
|
|
1288
1287
|
|
|
1289
1288
|
/* ================================================================================
|
|
@@ -1354,22 +1353,7 @@ __attribute__((cold)) static VALUE rb_new_parse_context(VALUE self, VALUE header
|
|
|
1354
1353
|
ctx->remove_zero_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_zero_values)));
|
|
1355
1354
|
|
|
1356
1355
|
/* Numeric conversion */
|
|
1357
|
-
|
|
1358
|
-
if (RTEST(convert_opt)) {
|
|
1359
|
-
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
1360
|
-
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
1361
|
-
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
1362
|
-
if (RTEST(only_keys)) {
|
|
1363
|
-
ctx->numeric_mode = 2;
|
|
1364
|
-
ctx->numeric_keys = rb_Array(only_keys);
|
|
1365
|
-
} else if (RTEST(except_keys)) {
|
|
1366
|
-
ctx->numeric_mode = 3;
|
|
1367
|
-
ctx->numeric_keys = rb_Array(except_keys);
|
|
1368
|
-
}
|
|
1369
|
-
} else {
|
|
1370
|
-
ctx->numeric_mode = 1;
|
|
1371
|
-
}
|
|
1372
|
-
}
|
|
1356
|
+
parse_numeric_option(options_hash, &ctx->numeric_mode, &ctx->numeric_keys);
|
|
1373
1357
|
|
|
1374
1358
|
/* quote_escaping → allow_escaped_quotes */
|
|
1375
1359
|
VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
|
|
@@ -1468,10 +1452,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1468
1452
|
* SECTION 1: Handle nil/invalid input
|
|
1469
1453
|
* ---------------------------------------- */
|
|
1470
1454
|
if (NIL_P(line)) {
|
|
1471
|
-
|
|
1472
|
-
rb_ary_push(result, Qnil);
|
|
1473
|
-
rb_ary_push(result, INT2FIX(0));
|
|
1474
|
-
return result;
|
|
1455
|
+
return return_parser_result(Qnil, 0);
|
|
1475
1456
|
}
|
|
1476
1457
|
|
|
1477
1458
|
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
@@ -1495,6 +1476,14 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1495
1476
|
int numeric_mode = ctx->numeric_mode;
|
|
1496
1477
|
VALUE numeric_keys = ctx->numeric_keys;
|
|
1497
1478
|
bool *keep_bitmap = ctx->keep_bitmap;
|
|
1479
|
+
/* keep_bitmap is cached in the context (xmalloc'd once at construction, sized to the header count
|
|
1480
|
+
* THEN). @headers can grow in place as undeclared extra columns appear, so the live headers_len
|
|
1481
|
+
* (re-read each call below) may exceed the bitmap's length. Every keep_bitmap[] access in this
|
|
1482
|
+
* function MUST be bounded by keep_bitmap_len, never headers_len — indices past the bitmap are
|
|
1483
|
+
* extra columns and follow keep_extra_columns. Bounding by the grown headers_len was an
|
|
1484
|
+
* out-of-bounds heap read (the bug). The sibling rb_parse_line_to_hash safely uses headers_len
|
|
1485
|
+
* because it re-allocs its bitmap to headers_len on every call. */
|
|
1486
|
+
long keep_bitmap_len = ctx->keep_bitmap_len;
|
|
1498
1487
|
bool keep_extra_columns = ctx->keep_extra_columns;
|
|
1499
1488
|
long early_exit_after = ctx->early_exit_after;
|
|
1500
1489
|
|
|
@@ -1509,12 +1498,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1509
1498
|
char *p = startP;
|
|
1510
1499
|
|
|
1511
1500
|
/* Chomp: strip trailing row separator (pointer adjustment, no string mutation) */
|
|
1512
|
-
|
|
1513
|
-
long rsl = (long)ctx->row_sep_len;
|
|
1514
|
-
if (line_len >= rsl && memcmp(endP - rsl, ctx->row_sep_buf, (size_t)rsl) == 0) {
|
|
1515
|
-
endP -= rsl;
|
|
1516
|
-
}
|
|
1517
|
-
}
|
|
1501
|
+
endP = chomp_row_sep(endP, line_len, ctx->row_sep_buf, (long)ctx->row_sep_len);
|
|
1518
1502
|
|
|
1519
1503
|
/* Re-read headers_len each call to handle extra-column growth */
|
|
1520
1504
|
long headers_len = NIL_P(ctx->headers) ? 0 : RARRAY_LEN(ctx->headers);
|
|
@@ -1559,13 +1543,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1559
1543
|
/* --- (a) Common path: no column filter, no early exit --- */
|
|
1560
1544
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1561
1545
|
long field_len = sep_pos - startP;
|
|
1562
|
-
char *trim_start
|
|
1563
|
-
|
|
1564
|
-
if (strip_ws) {
|
|
1565
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1566
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1567
|
-
}
|
|
1568
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1546
|
+
char *trim_start;
|
|
1547
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1569
1548
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1570
1549
|
all_blank = false;
|
|
1571
1550
|
element_count++;
|
|
@@ -1574,13 +1553,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1574
1553
|
/* Process last field */
|
|
1575
1554
|
{
|
|
1576
1555
|
long field_len = endP - startP;
|
|
1577
|
-
char *trim_start
|
|
1578
|
-
|
|
1579
|
-
if (strip_ws) {
|
|
1580
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1581
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1582
|
-
}
|
|
1583
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1556
|
+
char *trim_start;
|
|
1557
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1584
1558
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1585
1559
|
all_blank = false;
|
|
1586
1560
|
element_count++;
|
|
@@ -1589,14 +1563,9 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1589
1563
|
/* --- (b) Filter path: column bitmap and/or early exit active --- */
|
|
1590
1564
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1591
1565
|
long field_len = sep_pos - startP;
|
|
1592
|
-
char *trim_start
|
|
1593
|
-
|
|
1594
|
-
if (
|
|
1595
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1596
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1597
|
-
}
|
|
1598
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1599
|
-
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1566
|
+
char *trim_start;
|
|
1567
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1568
|
+
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1600
1569
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1601
1570
|
all_blank = false;
|
|
1602
1571
|
}
|
|
@@ -1610,14 +1579,9 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1610
1579
|
/* Process last field — skip on early exit */
|
|
1611
1580
|
if (!did_early_exit) {
|
|
1612
1581
|
long field_len = endP - startP;
|
|
1613
|
-
char *trim_start
|
|
1614
|
-
|
|
1615
|
-
if (
|
|
1616
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1617
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1618
|
-
}
|
|
1619
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1620
|
-
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1582
|
+
char *trim_start;
|
|
1583
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1584
|
+
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1621
1585
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1622
1586
|
all_blank = false;
|
|
1623
1587
|
}
|
|
@@ -1662,26 +1626,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1662
1626
|
long field_len = p - startP;
|
|
1663
1627
|
char *raw_field = startP;
|
|
1664
1628
|
|
|
1665
|
-
|
|
1666
|
-
if (quoted) {
|
|
1667
|
-
raw_field++;
|
|
1668
|
-
field_len -= 2;
|
|
1669
|
-
}
|
|
1670
|
-
|
|
1671
|
-
char *trim_start = raw_field;
|
|
1672
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1673
|
-
|
|
1674
|
-
if (strip_ws) {
|
|
1675
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1676
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1677
|
-
}
|
|
1678
|
-
|
|
1679
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1629
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1680
1630
|
|
|
1681
|
-
|
|
1682
|
-
|
|
1683
|
-
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1684
|
-
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1631
|
+
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1632
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1685
1633
|
all_blank = false;
|
|
1686
1634
|
}
|
|
1687
1635
|
element_count++;
|
|
@@ -1714,25 +1662,27 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1714
1662
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
1715
1663
|
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
1716
1664
|
if (in_quotes) {
|
|
1717
|
-
|
|
1718
|
-
|
|
1719
|
-
|
|
1720
|
-
|
|
1721
|
-
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1725
|
-
|
|
1726
|
-
|
|
1727
|
-
|
|
1728
|
-
|
|
1665
|
+
if (p + 2 < endP && *(p + 1) == quote_char_val) {
|
|
1666
|
+
/* RFC doubled quote inside a quoted field ("" → ").
|
|
1667
|
+
* Give this precedence over the closing-quote check, but only
|
|
1668
|
+
* when another byte follows the doubled pair.
|
|
1669
|
+
*
|
|
1670
|
+
* Compatibility note: we intentionally do NOT force terminal
|
|
1671
|
+
* "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
1672
|
+
* behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
1673
|
+
* the final quote may still close the field instead of turning the
|
|
1674
|
+
* row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
1675
|
+
* precedence for ..."",... (more content follows), but we keep the
|
|
1676
|
+
* historical leniency for terminal ..."". */
|
|
1677
|
+
p++;
|
|
1678
|
+
} else {
|
|
1679
|
+
/* closing quote: only valid if followed by col_sep, row_sep, or end */
|
|
1680
|
+
if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
|
|
1681
|
+
in_quotes = false;
|
|
1682
|
+
field_started = true;
|
|
1729
1683
|
}
|
|
1684
|
+
/* else: quote inside quoted field → literal */
|
|
1730
1685
|
}
|
|
1731
|
-
if (valid_close) {
|
|
1732
|
-
in_quotes = false;
|
|
1733
|
-
field_started = true;
|
|
1734
|
-
}
|
|
1735
|
-
/* else: quote inside quoted field → literal (handles "" doubling) */
|
|
1736
1686
|
} else if (!field_started) {
|
|
1737
1687
|
in_quotes = true; /* opening quote at field boundary */
|
|
1738
1688
|
field_started = true;
|
|
@@ -1762,10 +1712,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1762
1712
|
section5_done_ctx:;
|
|
1763
1713
|
/* Unclosed quote at end of line — signal multiline continuation */
|
|
1764
1714
|
if (!did_early_exit && in_quotes) {
|
|
1765
|
-
|
|
1766
|
-
rb_ary_push(result, Qnil);
|
|
1767
|
-
rb_ary_push(result, LONG2FIX(-1));
|
|
1768
|
-
return result;
|
|
1715
|
+
return return_parser_result(Qnil, -1);
|
|
1769
1716
|
}
|
|
1770
1717
|
|
|
1771
1718
|
/* Process the last field — skip on early exit */
|
|
@@ -1773,26 +1720,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1773
1720
|
long field_len = endP - startP;
|
|
1774
1721
|
char *raw_field = startP;
|
|
1775
1722
|
|
|
1776
|
-
|
|
1777
|
-
if (quoted) {
|
|
1778
|
-
raw_field++;
|
|
1779
|
-
field_len -= 2;
|
|
1780
|
-
}
|
|
1781
|
-
|
|
1782
|
-
char *trim_start = raw_field;
|
|
1783
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1723
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1784
1724
|
|
|
1785
|
-
if (
|
|
1786
|
-
|
|
1787
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1788
|
-
}
|
|
1789
|
-
|
|
1790
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1791
|
-
|
|
1792
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1793
|
-
|
|
1794
|
-
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1795
|
-
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1725
|
+
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1726
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1796
1727
|
all_blank = false;
|
|
1797
1728
|
}
|
|
1798
1729
|
element_count++;
|
|
@@ -1804,10 +1735,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1804
1735
|
* ---------------------------------------- */
|
|
1805
1736
|
if (all_blank) {
|
|
1806
1737
|
if (remove_empty) {
|
|
1807
|
-
|
|
1808
|
-
rb_ary_push(result, Qnil);
|
|
1809
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1810
|
-
return result;
|
|
1738
|
+
return return_parser_result(Qnil, element_count);
|
|
1811
1739
|
}
|
|
1812
1740
|
|
|
1813
1741
|
ensure_hash_allocated(&xform);
|
|
@@ -1819,7 +1747,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1819
1747
|
if (!remove_empty_values) {
|
|
1820
1748
|
ensure_hash_allocated(&xform);
|
|
1821
1749
|
for (long i = element_count; i < headers_len; i++) {
|
|
1822
|
-
if (!keep_bitmap || keep_bitmap[i]) {
|
|
1750
|
+
if (!keep_bitmap || (i < keep_bitmap_len ? keep_bitmap[i] : keep_extra_columns)) {
|
|
1823
1751
|
rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
|
|
1824
1752
|
}
|
|
1825
1753
|
}
|
|
@@ -1828,10 +1756,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1828
1756
|
/* ----------------------------------------
|
|
1829
1757
|
* SECTION 8: Return result
|
|
1830
1758
|
* ---------------------------------------- */
|
|
1831
|
-
|
|
1832
|
-
rb_ary_push(result, xform.hash);
|
|
1833
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1834
|
-
return result;
|
|
1759
|
+
return return_parser_result(xform.hash, element_count);
|
|
1835
1760
|
}
|
|
1836
1761
|
|
|
1837
1762
|
// Count quote characters in a line, optionally respecting backslash escapes.
|