smarter_csv 1.17.2 → 1.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +235 -61
- data/README.md +4 -1
- data/UPGRADING.md +251 -0
- data/docs/.nojekyll +0 -0
- data/docs/upgrade_path.json +175 -0
- data/docs/upgrade_wizard.html +498 -0
- data/ext/smarter_csv/smarter_csv.c +176 -309
- data/lib/smarter_csv/parser.rb +4 -2
- data/lib/smarter_csv/version.rb +1 -1
- data/smarter_csv.gemspec +7 -5
- metadata +8 -2
|
@@ -169,16 +169,107 @@ needs_unescape:
|
|
|
169
169
|
/* Helper: build the 2-element [elements, data_size] tuple returned by rb_parse_csv_line.
|
|
170
170
|
* Aligns this function's return shape with parse_csv_line_ruby and rb_parse_line_to_hash_ctx:
|
|
171
171
|
* data_size = -1 signals "unclosed quoted field — needs more data". */
|
|
172
|
-
static inline
|
|
172
|
+
static inline __attribute__((always_inline))
|
|
173
|
+
VALUE return_parser_result(VALUE elements, long data_size) {
|
|
173
174
|
VALUE result = rb_ary_new_capa(2);
|
|
174
175
|
rb_ary_push(result, elements);
|
|
175
176
|
rb_ary_push(result, LONG2FIX(data_size));
|
|
176
177
|
return result;
|
|
177
178
|
}
|
|
178
179
|
|
|
180
|
+
/* Helper: trim leading/trailing spaces and tabs from a field when strip_ws is set.
 *
 *   field      - first byte of the raw field.
 *   field_len  - number of bytes in the field; values <= 0 are treated as empty.
 *   strip_ws   - when false the field is returned untouched.
 *   out_start  - receives the address of the first kept byte. For an empty or
 *                all-whitespace field this is the position where scanning stopped,
 *                and the returned length is 0.
 *
 * Returns the trimmed length in bytes (0 when nothing is left).
 *
 * The range is tracked as a half-open interval [first, limit) so that an empty
 * field never forms a pointer before `field`; computing `field + field_len - 1`
 * with field_len == 0 is undefined pointer arithmetic when `field` is the start
 * of the buffer. always_inline keeps each call site as cheap as a hand-written loop. */
static inline __attribute__((always_inline))
long trim_field(char *field, long field_len, bool strip_ws, char **out_start) {
  char *first = field;
  char *limit = field + (field_len > 0 ? field_len : 0); /* one past the last byte */

  if (strip_ws) {
    /* Advance past leading blanks, then pull the end back over trailing blanks. */
    while (first < limit && (*first == ' ' || *first == '\t')) first++;
    while (limit > first && (limit[-1] == ' ' || limit[-1] == '\t')) limit--;
  }

  *out_start = first;
  return (long)(limit - first);
}
|
|
196
|
+
|
|
197
|
+
/* Result of preparing one raw field: the slice that remains after the surrounding
 * quotes (if any) were removed and whitespace was trimmed, plus a flag telling the
 * caller whether that slice still needs quote unescaping.
 * Member order matters: extract_field fills this struct positionally. */
typedef struct extracted_field {
  char *start;      /* first byte of the prepared content */
  long  len;        /* length of the prepared content in bytes */
  bool  has_quotes; /* true when the content still needs unescaping */
} extracted_field;
|
|
204
|
+
|
|
205
|
+
/* Helper: turn a raw field slice into the values every extraction site needs.
|
|
206
|
+
* Strips a surrounding pair of quote chars (if present), trims whitespace via
|
|
207
|
+
* trim_field, and reports whether the result still has embedded quotes (true for a
|
|
208
|
+
* quoted field, or any field containing the quote char). This is the common prefix
|
|
209
|
+
* before each field is pushed/inserted, in all three parsers' slow paths.
|
|
210
|
+
* always_inline + return-by-value so the struct is dissolved into registers and each
|
|
211
|
+
* call site compiles to the same code as the old inline block (no performance cost). */
|
|
212
|
+
static inline __attribute__((always_inline))
|
|
213
|
+
extracted_field extract_field(char *raw_field, long field_len, bool strip_ws, char quote_char_val) {
|
|
214
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
215
|
+
if (quoted) {
|
|
216
|
+
raw_field++; // Skip opening quote
|
|
217
|
+
field_len -= 2; // Exclude both quotes from length
|
|
218
|
+
}
|
|
219
|
+
char *trim_start;
|
|
220
|
+
long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
|
|
221
|
+
extracted_field result = {
|
|
222
|
+
trim_start,
|
|
223
|
+
trimmed_len,
|
|
224
|
+
quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len))
|
|
225
|
+
};
|
|
226
|
+
return result;
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
/* Helper: decide whether the quote character at p really closes a quoted field.
 *
 * A close is valid only when the quote is followed by
 *   - end of line (p + 1 >= endP), or
 *   - the column separator, or
 *   - the row separator (checked only when row_sep_len > 0).
 *
 *   p, endP            - position of the quote and one-past-the-end of the line.
 *   col_sepP/_len      - column separator bytes. A length <= 0 matches anywhere,
 *                        which is what the original loop-based check did.
 *   row_sepP/_len      - row separator bytes; (NULL, 0) means "none".
 *
 * Pure read: no parser state is touched. Each separator comparison is bounded by
 * the bytes remaining before endP, so a separator longer than the rest of the line
 * can neither read past endP nor match using bytes that are not part of the line. */
static inline __attribute__((always_inline))
bool is_valid_close(const char *p, const char *endP,
                    const char *col_sepP, long col_sep_len,
                    const char *row_sepP, long row_sep_len) {
  const char *next = p + 1;
  if (next >= endP) return true; /* quote is the last byte of the line */

  long avail = (long)(endP - next); /* bytes left after the quote, >= 1 here */

  if (col_sep_len <= avail &&
      (col_sep_len <= 0 || memcmp(next, col_sepP, (size_t)col_sep_len) == 0)) {
    return true;
  }

  if (row_sep_len > 0 && row_sep_len <= avail &&
      memcmp(next, row_sepP, (size_t)row_sep_len) == 0) {
    return true;
  }

  return false;
}
|
|
253
|
+
|
|
254
|
+
/* Helper: drop a trailing row separator by moving the end pointer; the line's bytes
 * are never modified.
 *
 *   endP         - one-past-the-end of the line.
 *   line_len     - length of the line in bytes.
 *   row_sepP/_len - separator to look for; (NULL, 0) means "none known yet" and
 *                   leaves endP as it is, so callers need no outer check.
 *
 * Returns endP moved back by row_sep_len when the line ends with the separator,
 * otherwise endP unchanged. Used by rb_parse_line_to_hash and
 * rb_parse_line_to_hash_ctx. */
static inline __attribute__((always_inline))
char *chomp_row_sep(char *endP, long line_len, const char *row_sepP, long row_sep_len) {
  if (row_sep_len <= 0) return endP;        /* nothing to look for */
  if (line_len < row_sep_len) return endP;  /* line too short to contain it */

  char *tail = endP - row_sep_len;
  return (memcmp(tail, row_sepP, (size_t)row_sep_len) == 0) ? tail : endP;
}
|
|
269
|
+
|
|
179
270
|
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val, VALUE allow_escaped_quotes_val, VALUE quote_boundary_standard_val, VALUE row_sep_val) {
|
|
180
271
|
if (RB_TYPE_P(line, T_NIL) == 1) {
|
|
181
|
-
return
|
|
272
|
+
return return_parser_result(rb_ary_new(), 0);
|
|
182
273
|
}
|
|
183
274
|
|
|
184
275
|
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
@@ -205,7 +296,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
205
296
|
if (max_size != Qnil) {
|
|
206
297
|
max_fields = NUM2INT(max_size);
|
|
207
298
|
if (max_fields < 0) {
|
|
208
|
-
return
|
|
299
|
+
return return_parser_result(rb_ary_new(), 0);
|
|
209
300
|
}
|
|
210
301
|
}
|
|
211
302
|
|
|
@@ -229,15 +320,8 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
229
320
|
|
|
230
321
|
long field_len = sep_pos - startP;
|
|
231
322
|
char *raw_field = startP;
|
|
232
|
-
char *trim_start
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
if (strip_ws) {
|
|
236
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
237
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
238
|
-
}
|
|
239
|
-
|
|
240
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
323
|
+
char *trim_start;
|
|
324
|
+
long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
|
|
241
325
|
|
|
242
326
|
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
243
327
|
rb_ary_push(elements, field);
|
|
@@ -250,21 +334,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
250
334
|
if ((max_fields < 0) || (element_count < max_fields)) {
|
|
251
335
|
long field_len = endP - startP;
|
|
252
336
|
char *raw_field = startP;
|
|
253
|
-
char *trim_start
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
if (strip_ws) {
|
|
257
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
258
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
337
|
+
char *trim_start;
|
|
338
|
+
long trimmed_len = trim_field(raw_field, field_len, strip_ws, &trim_start);
|
|
262
339
|
|
|
263
340
|
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
264
341
|
rb_ary_push(elements, field);
|
|
265
342
|
}
|
|
266
343
|
|
|
267
|
-
return
|
|
344
|
+
return return_parser_result(elements, RARRAY_LEN(elements));
|
|
268
345
|
}
|
|
269
346
|
|
|
270
347
|
// === SLOW PATH: Quoted fields or multi-char separator ===
|
|
@@ -291,28 +368,14 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
291
368
|
long field_len = p - startP;
|
|
292
369
|
char *raw_field = startP;
|
|
293
370
|
|
|
294
|
-
|
|
295
|
-
if (quoted) {
|
|
296
|
-
raw_field++;
|
|
297
|
-
field_len -= 2;
|
|
298
|
-
}
|
|
299
|
-
|
|
300
|
-
char *trim_start = raw_field;
|
|
301
|
-
char *trim_end = raw_field + field_len - 1;
|
|
302
|
-
|
|
303
|
-
if (strip_ws) {
|
|
304
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
305
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
306
|
-
}
|
|
371
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
307
372
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
if (trimmed_len == 0) {
|
|
373
|
+
if (f.len == 0) {
|
|
311
374
|
field = Qempty_string;
|
|
312
|
-
} else if (
|
|
313
|
-
field = unescape_quotes(
|
|
375
|
+
} else if (f.has_quotes) {
|
|
376
|
+
field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
|
|
314
377
|
} else {
|
|
315
|
-
field = rb_enc_str_new(
|
|
378
|
+
field = rb_enc_str_new(f.start, f.len, encoding);
|
|
316
379
|
}
|
|
317
380
|
|
|
318
381
|
rb_ary_push(elements, field);
|
|
@@ -346,20 +409,7 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
346
409
|
p++;
|
|
347
410
|
} else {
|
|
348
411
|
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
349
|
-
|
|
350
|
-
if (!valid_close) {
|
|
351
|
-
valid_close = true;
|
|
352
|
-
for (long j = 0; j < col_sep_len; j++) {
|
|
353
|
-
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
354
|
-
}
|
|
355
|
-
}
|
|
356
|
-
if (!valid_close && row_sep_len > 0) {
|
|
357
|
-
valid_close = true;
|
|
358
|
-
for (long j = 0; j < row_sep_len; j++) {
|
|
359
|
-
if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
|
|
360
|
-
}
|
|
361
|
-
}
|
|
362
|
-
if (valid_close) {
|
|
412
|
+
if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP, row_sep_len)) {
|
|
363
413
|
in_quotes = false;
|
|
364
414
|
field_started = true;
|
|
365
415
|
}
|
|
@@ -398,41 +448,27 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
398
448
|
* the signal: append the next physical line and re-parse, or raise MalformedCSV
|
|
399
449
|
* at EOF if the field never closes. The parser does not decide "ultimately
|
|
400
450
|
* malformed"; the caller does. */
|
|
401
|
-
return
|
|
451
|
+
return return_parser_result(rb_ary_new(), -1);
|
|
402
452
|
}
|
|
403
453
|
|
|
404
454
|
if ((max_fields < 0) || (element_count < max_fields)) {
|
|
405
455
|
long field_len = endP - startP;
|
|
406
456
|
char *raw_field = startP;
|
|
407
457
|
|
|
408
|
-
|
|
409
|
-
if (quoted) {
|
|
410
|
-
raw_field++;
|
|
411
|
-
field_len -= 2;
|
|
412
|
-
}
|
|
413
|
-
|
|
414
|
-
char *trim_start = raw_field;
|
|
415
|
-
char *trim_end = raw_field + field_len - 1;
|
|
458
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
416
459
|
|
|
417
|
-
if (
|
|
418
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
419
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
423
|
-
|
|
424
|
-
if (trimmed_len == 0) {
|
|
460
|
+
if (f.len == 0) {
|
|
425
461
|
field = Qempty_string;
|
|
426
|
-
} else if (
|
|
427
|
-
field = unescape_quotes(
|
|
462
|
+
} else if (f.has_quotes) {
|
|
463
|
+
field = unescape_quotes(f.start, f.len, quote_char_val, encoding);
|
|
428
464
|
} else {
|
|
429
|
-
field = rb_enc_str_new(
|
|
465
|
+
field = rb_enc_str_new(f.start, f.len, encoding);
|
|
430
466
|
}
|
|
431
467
|
|
|
432
468
|
rb_ary_push(elements, field);
|
|
433
469
|
}
|
|
434
470
|
|
|
435
|
-
return
|
|
471
|
+
return return_parser_result(elements, RARRAY_LEN(elements));
|
|
436
472
|
}
|
|
437
473
|
|
|
438
474
|
// Efficiently combine two arrays into a hash (replaces headers.zip(values).to_h)
|
|
@@ -690,6 +726,32 @@ static inline __attribute__((always_inline)) bool insert_field_into_hash(
|
|
|
690
726
|
return true;
|
|
691
727
|
}
|
|
692
728
|
|
|
729
|
+
/* Helper: parse the convert_values_to_numeric option into a mode + key list.
|
|
730
|
+
* mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys.
|
|
731
|
+
* Writes through the out-params only when the option is set, so callers must
|
|
732
|
+
* pre-initialize *out_mode = 0 and *out_keys = Qnil. Shared by rb_parse_line_to_hash
|
|
733
|
+
* and rb_new_parse_context — identical logic, different storage (locals vs ctx fields).
|
|
734
|
+
* always_inline so each call site compiles to the same code as the old inline block. */
|
|
735
|
+
static inline __attribute__((always_inline))
|
|
736
|
+
void parse_numeric_option(VALUE options_hash, int *out_mode, VALUE *out_keys) {
|
|
737
|
+
VALUE convert_opt = rb_hash_aref(options_hash, ID2SYM(id_convert_values_to_numeric));
|
|
738
|
+
if (RTEST(convert_opt)) {
|
|
739
|
+
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
740
|
+
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
741
|
+
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
742
|
+
if (RTEST(only_keys)) {
|
|
743
|
+
*out_mode = 2;
|
|
744
|
+
*out_keys = rb_Array(only_keys); // wrap single value in array if needed
|
|
745
|
+
} else if (RTEST(except_keys)) {
|
|
746
|
+
*out_mode = 3;
|
|
747
|
+
*out_keys = rb_Array(except_keys); // wrap single value in array if needed
|
|
748
|
+
}
|
|
749
|
+
} else {
|
|
750
|
+
*out_mode = 1; // convert all
|
|
751
|
+
}
|
|
752
|
+
}
|
|
753
|
+
}
|
|
754
|
+
|
|
693
755
|
/*
|
|
694
756
|
* ================================================================================
|
|
695
757
|
* rb_parse_line_to_hash - Parse CSV line directly into a Ruby Hash
|
|
@@ -738,10 +800,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
738
800
|
* SECTION 1: Handle nil/invalid input
|
|
739
801
|
* ---------------------------------------- */
|
|
740
802
|
if (NIL_P(line)) {
|
|
741
|
-
|
|
742
|
-
rb_ary_push(result, Qnil);
|
|
743
|
-
rb_ary_push(result, INT2FIX(0));
|
|
744
|
-
return result;
|
|
803
|
+
return return_parser_result(Qnil, 0);
|
|
745
804
|
}
|
|
746
805
|
|
|
747
806
|
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
@@ -766,22 +825,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
766
825
|
// numeric_mode: 0=off, 1=all, 2=only listed keys, 3=except listed keys
|
|
767
826
|
int numeric_mode = 0;
|
|
768
827
|
VALUE numeric_keys = Qnil;
|
|
769
|
-
|
|
770
|
-
if (RTEST(convert_opt)) {
|
|
771
|
-
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
772
|
-
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
773
|
-
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
774
|
-
if (RTEST(only_keys)) {
|
|
775
|
-
numeric_mode = 2;
|
|
776
|
-
numeric_keys = rb_Array(only_keys); // wrap single value in array if needed
|
|
777
|
-
} else if (RTEST(except_keys)) {
|
|
778
|
-
numeric_mode = 3;
|
|
779
|
-
numeric_keys = rb_Array(except_keys); // wrap single value in array if needed
|
|
780
|
-
}
|
|
781
|
-
} else {
|
|
782
|
-
numeric_mode = 1; // convert all
|
|
783
|
-
}
|
|
784
|
-
}
|
|
828
|
+
parse_numeric_option(options_hash, &numeric_mode, &numeric_keys);
|
|
785
829
|
|
|
786
830
|
// quote_escaping and quote_boundary are only needed in Section 5 (quoted/slow path).
|
|
787
831
|
// They are declared here as forward declarations so Section 5 can set them lazily.
|
|
@@ -798,11 +842,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
798
842
|
// row_sep is also reused in Section 5 for the closing-quote boundary check.
|
|
799
843
|
VALUE row_sep = rb_hash_aref(options_hash, ID2SYM(id_row_sep));
|
|
800
844
|
if (!NIL_P(row_sep) && RB_TYPE_P(row_sep, T_STRING)) {
|
|
801
|
-
|
|
802
|
-
long row_sep_len = RSTRING_LEN(row_sep);
|
|
803
|
-
if (line_len >= row_sep_len && memcmp(endP - row_sep_len, row_sepP, row_sep_len) == 0) {
|
|
804
|
-
endP -= row_sep_len;
|
|
805
|
-
}
|
|
845
|
+
endP = chomp_row_sep(endP, line_len, RSTRING_PTR(row_sep), RSTRING_LEN(row_sep));
|
|
806
846
|
}
|
|
807
847
|
|
|
808
848
|
char *col_sepP = RSTRING_PTR(col_sep);
|
|
@@ -975,13 +1015,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
975
1015
|
/* --- (a) Common path: no column filter, no early exit --- */
|
|
976
1016
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
977
1017
|
long field_len = sep_pos - startP;
|
|
978
|
-
char *trim_start
|
|
979
|
-
|
|
980
|
-
if (strip_ws) {
|
|
981
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
982
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
983
|
-
}
|
|
984
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1018
|
+
char *trim_start;
|
|
1019
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
985
1020
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
986
1021
|
all_blank = false;
|
|
987
1022
|
element_count++;
|
|
@@ -990,13 +1025,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
990
1025
|
/* Process last field */
|
|
991
1026
|
{
|
|
992
1027
|
long field_len = endP - startP;
|
|
993
|
-
char *trim_start
|
|
994
|
-
|
|
995
|
-
if (strip_ws) {
|
|
996
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
997
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
998
|
-
}
|
|
999
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1028
|
+
char *trim_start;
|
|
1029
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1000
1030
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1001
1031
|
all_blank = false;
|
|
1002
1032
|
element_count++;
|
|
@@ -1005,13 +1035,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1005
1035
|
/* --- (b) Filter path: column bitmap and/or early exit active --- */
|
|
1006
1036
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1007
1037
|
long field_len = sep_pos - startP;
|
|
1008
|
-
char *trim_start
|
|
1009
|
-
|
|
1010
|
-
if (strip_ws) {
|
|
1011
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1012
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1013
|
-
}
|
|
1014
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1038
|
+
char *trim_start;
|
|
1039
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1015
1040
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1016
1041
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1017
1042
|
all_blank = false;
|
|
@@ -1026,13 +1051,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1026
1051
|
/* Process last field — skip on early exit */
|
|
1027
1052
|
if (!did_early_exit) {
|
|
1028
1053
|
long field_len = endP - startP;
|
|
1029
|
-
char *trim_start
|
|
1030
|
-
|
|
1031
|
-
if (strip_ws) {
|
|
1032
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1033
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1034
|
-
}
|
|
1035
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1054
|
+
char *trim_start;
|
|
1055
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1036
1056
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1037
1057
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1038
1058
|
all_blank = false;
|
|
@@ -1107,28 +1127,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1107
1127
|
long field_len = p - startP;
|
|
1108
1128
|
char *raw_field = startP;
|
|
1109
1129
|
|
|
1110
|
-
|
|
1111
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
1112
|
-
if (quoted) {
|
|
1113
|
-
raw_field++; // Skip opening quote
|
|
1114
|
-
field_len -= 2; // Exclude both quotes from length
|
|
1115
|
-
}
|
|
1116
|
-
|
|
1117
|
-
char *trim_start = raw_field;
|
|
1118
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1119
|
-
|
|
1120
|
-
if (strip_ws) {
|
|
1121
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1122
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1123
|
-
}
|
|
1124
|
-
|
|
1125
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1126
|
-
|
|
1127
|
-
// Determine if field contains embedded quotes (need unescape)
|
|
1128
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1130
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1129
1131
|
|
|
1130
1132
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1131
|
-
if (insert_field_into_hash(&xform,
|
|
1133
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1132
1134
|
all_blank = false;
|
|
1133
1135
|
}
|
|
1134
1136
|
element_count++;
|
|
@@ -1182,20 +1184,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1182
1184
|
p++;
|
|
1183
1185
|
} else {
|
|
1184
1186
|
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
1185
|
-
|
|
1186
|
-
if (!valid_close) {
|
|
1187
|
-
valid_close = true;
|
|
1188
|
-
for (long j = 0; j < col_sep_len; j++) {
|
|
1189
|
-
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1190
|
-
}
|
|
1191
|
-
}
|
|
1192
|
-
if (!valid_close && row_sep_len2 > 0) {
|
|
1193
|
-
valid_close = true;
|
|
1194
|
-
for (long j = 0; j < row_sep_len2; j++) {
|
|
1195
|
-
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1196
|
-
}
|
|
1197
|
-
}
|
|
1198
|
-
if (valid_close) {
|
|
1187
|
+
if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
|
|
1199
1188
|
in_quotes = false;
|
|
1200
1189
|
field_started = true;
|
|
1201
1190
|
}
|
|
@@ -1233,10 +1222,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1233
1222
|
* We return [nil, -1] rather than raising so the read loop can handle multiline fields
|
|
1234
1223
|
* without a separate pre-scan pass (detect_multiline). */
|
|
1235
1224
|
if (!did_early_exit && in_quotes) {
|
|
1236
|
-
|
|
1237
|
-
rb_ary_push(result, Qnil);
|
|
1238
|
-
rb_ary_push(result, LONG2FIX(-1));
|
|
1239
|
-
return result;
|
|
1225
|
+
return return_parser_result(Qnil, -1);
|
|
1240
1226
|
}
|
|
1241
1227
|
|
|
1242
1228
|
/* Process the last field (same logic as above) — skip on early exit */
|
|
@@ -1244,26 +1230,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1244
1230
|
long field_len = endP - startP;
|
|
1245
1231
|
char *raw_field = startP;
|
|
1246
1232
|
|
|
1247
|
-
|
|
1248
|
-
if (quoted) {
|
|
1249
|
-
raw_field++;
|
|
1250
|
-
field_len -= 2;
|
|
1251
|
-
}
|
|
1252
|
-
|
|
1253
|
-
char *trim_start = raw_field;
|
|
1254
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1255
|
-
|
|
1256
|
-
if (strip_ws) {
|
|
1257
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1258
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1259
|
-
}
|
|
1260
|
-
|
|
1261
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1262
|
-
|
|
1263
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1233
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1264
1234
|
|
|
1265
1235
|
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1266
|
-
if (insert_field_into_hash(&xform,
|
|
1236
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1267
1237
|
all_blank = false;
|
|
1268
1238
|
}
|
|
1269
1239
|
element_count++;
|
|
@@ -1284,10 +1254,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1284
1254
|
*/
|
|
1285
1255
|
if (all_blank) {
|
|
1286
1256
|
if (remove_empty) {
|
|
1287
|
-
|
|
1288
|
-
rb_ary_push(result, Qnil);
|
|
1289
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1290
|
-
return result;
|
|
1257
|
+
return return_parser_result(Qnil, element_count);
|
|
1291
1258
|
}
|
|
1292
1259
|
|
|
1293
1260
|
ensure_hash_allocated(&xform);
|
|
@@ -1315,10 +1282,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1315
1282
|
* Return [hash, element_count] so caller can detect extra columns
|
|
1316
1283
|
* (when element_count > headers_len) and extend headers if needed.
|
|
1317
1284
|
*/
|
|
1318
|
-
|
|
1319
|
-
rb_ary_push(result, xform.hash);
|
|
1320
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1321
|
-
return result;
|
|
1285
|
+
return return_parser_result(xform.hash, element_count);
|
|
1322
1286
|
}
|
|
1323
1287
|
|
|
1324
1288
|
/* ================================================================================
|
|
@@ -1389,22 +1353,7 @@ __attribute__((cold)) static VALUE rb_new_parse_context(VALUE self, VALUE header
|
|
|
1389
1353
|
ctx->remove_zero_values = RTEST(rb_hash_aref(options_hash, ID2SYM(id_remove_zero_values)));
|
|
1390
1354
|
|
|
1391
1355
|
/* Numeric conversion */
|
|
1392
|
-
|
|
1393
|
-
if (RTEST(convert_opt)) {
|
|
1394
|
-
if (RB_TYPE_P(convert_opt, T_HASH)) {
|
|
1395
|
-
VALUE only_keys = rb_hash_aref(convert_opt, ID2SYM(id_only));
|
|
1396
|
-
VALUE except_keys = rb_hash_aref(convert_opt, ID2SYM(id_except));
|
|
1397
|
-
if (RTEST(only_keys)) {
|
|
1398
|
-
ctx->numeric_mode = 2;
|
|
1399
|
-
ctx->numeric_keys = rb_Array(only_keys);
|
|
1400
|
-
} else if (RTEST(except_keys)) {
|
|
1401
|
-
ctx->numeric_mode = 3;
|
|
1402
|
-
ctx->numeric_keys = rb_Array(except_keys);
|
|
1403
|
-
}
|
|
1404
|
-
} else {
|
|
1405
|
-
ctx->numeric_mode = 1;
|
|
1406
|
-
}
|
|
1407
|
-
}
|
|
1356
|
+
parse_numeric_option(options_hash, &ctx->numeric_mode, &ctx->numeric_keys);
|
|
1408
1357
|
|
|
1409
1358
|
/* quote_escaping → allow_escaped_quotes */
|
|
1410
1359
|
VALUE quote_escaping_val = rb_hash_aref(options_hash, ID2SYM(id_quote_escaping));
|
|
@@ -1503,10 +1452,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1503
1452
|
* SECTION 1: Handle nil/invalid input
|
|
1504
1453
|
* ---------------------------------------- */
|
|
1505
1454
|
if (NIL_P(line)) {
|
|
1506
|
-
|
|
1507
|
-
rb_ary_push(result, Qnil);
|
|
1508
|
-
rb_ary_push(result, INT2FIX(0));
|
|
1509
|
-
return result;
|
|
1455
|
+
return return_parser_result(Qnil, 0);
|
|
1510
1456
|
}
|
|
1511
1457
|
|
|
1512
1458
|
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
@@ -1552,12 +1498,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1552
1498
|
char *p = startP;
|
|
1553
1499
|
|
|
1554
1500
|
/* Chomp: strip trailing row separator (pointer adjustment, no string mutation) */
|
|
1555
|
-
|
|
1556
|
-
long rsl = (long)ctx->row_sep_len;
|
|
1557
|
-
if (line_len >= rsl && memcmp(endP - rsl, ctx->row_sep_buf, (size_t)rsl) == 0) {
|
|
1558
|
-
endP -= rsl;
|
|
1559
|
-
}
|
|
1560
|
-
}
|
|
1501
|
+
endP = chomp_row_sep(endP, line_len, ctx->row_sep_buf, (long)ctx->row_sep_len);
|
|
1561
1502
|
|
|
1562
1503
|
/* Re-read headers_len each call to handle extra-column growth */
|
|
1563
1504
|
long headers_len = NIL_P(ctx->headers) ? 0 : RARRAY_LEN(ctx->headers);
|
|
@@ -1602,13 +1543,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1602
1543
|
/* --- (a) Common path: no column filter, no early exit --- */
|
|
1603
1544
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1604
1545
|
long field_len = sep_pos - startP;
|
|
1605
|
-
char *trim_start
|
|
1606
|
-
|
|
1607
|
-
if (strip_ws) {
|
|
1608
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1609
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1610
|
-
}
|
|
1611
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1546
|
+
char *trim_start;
|
|
1547
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1612
1548
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1613
1549
|
all_blank = false;
|
|
1614
1550
|
element_count++;
|
|
@@ -1617,13 +1553,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1617
1553
|
/* Process last field */
|
|
1618
1554
|
{
|
|
1619
1555
|
long field_len = endP - startP;
|
|
1620
|
-
char *trim_start
|
|
1621
|
-
|
|
1622
|
-
if (strip_ws) {
|
|
1623
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1624
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1625
|
-
}
|
|
1626
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1556
|
+
char *trim_start;
|
|
1557
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1627
1558
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1628
1559
|
all_blank = false;
|
|
1629
1560
|
element_count++;
|
|
@@ -1632,13 +1563,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1632
1563
|
/* --- (b) Filter path: column bitmap and/or early exit active --- */
|
|
1633
1564
|
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
1634
1565
|
long field_len = sep_pos - startP;
|
|
1635
|
-
char *trim_start
|
|
1636
|
-
|
|
1637
|
-
if (strip_ws) {
|
|
1638
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1639
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1640
|
-
}
|
|
1641
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1566
|
+
char *trim_start;
|
|
1567
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1642
1568
|
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1643
1569
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1644
1570
|
all_blank = false;
|
|
@@ -1653,13 +1579,8 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1653
1579
|
/* Process last field — skip on early exit */
|
|
1654
1580
|
if (!did_early_exit) {
|
|
1655
1581
|
long field_len = endP - startP;
|
|
1656
|
-
char *trim_start
|
|
1657
|
-
|
|
1658
|
-
if (strip_ws) {
|
|
1659
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1660
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1661
|
-
}
|
|
1662
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1582
|
+
char *trim_start;
|
|
1583
|
+
long trimmed_len = trim_field(startP, field_len, strip_ws, &trim_start);
|
|
1663
1584
|
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1664
1585
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1665
1586
|
all_blank = false;
|
|
@@ -1705,26 +1626,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1705
1626
|
long field_len = p - startP;
|
|
1706
1627
|
char *raw_field = startP;
|
|
1707
1628
|
|
|
1708
|
-
|
|
1709
|
-
if (quoted) {
|
|
1710
|
-
raw_field++;
|
|
1711
|
-
field_len -= 2;
|
|
1712
|
-
}
|
|
1713
|
-
|
|
1714
|
-
char *trim_start = raw_field;
|
|
1715
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1716
|
-
|
|
1717
|
-
if (strip_ws) {
|
|
1718
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1719
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1720
|
-
}
|
|
1721
|
-
|
|
1722
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1723
|
-
|
|
1724
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1629
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1725
1630
|
|
|
1726
1631
|
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1727
|
-
if (insert_field_into_hash(&xform,
|
|
1632
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1728
1633
|
all_blank = false;
|
|
1729
1634
|
}
|
|
1730
1635
|
element_count++;
|
|
@@ -1772,20 +1677,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1772
1677
|
p++;
|
|
1773
1678
|
} else {
|
|
1774
1679
|
/* closing quote: only valid if followed by col_sep, row_sep, or end */
|
|
1775
|
-
|
|
1776
|
-
if (!valid_close) {
|
|
1777
|
-
valid_close = true;
|
|
1778
|
-
for (long j = 0; j < col_sep_len; j++) {
|
|
1779
|
-
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1780
|
-
}
|
|
1781
|
-
}
|
|
1782
|
-
if (!valid_close && row_sep_len2 > 0) {
|
|
1783
|
-
valid_close = true;
|
|
1784
|
-
for (long j = 0; j < row_sep_len2; j++) {
|
|
1785
|
-
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1786
|
-
}
|
|
1787
|
-
}
|
|
1788
|
-
if (valid_close) {
|
|
1680
|
+
if (is_valid_close(p, endP, col_sepP, col_sep_len, row_sepP2, row_sep_len2)) {
|
|
1789
1681
|
in_quotes = false;
|
|
1790
1682
|
field_started = true;
|
|
1791
1683
|
}
|
|
@@ -1820,10 +1712,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1820
1712
|
section5_done_ctx:;
|
|
1821
1713
|
/* Unclosed quote at end of line — signal multiline continuation */
|
|
1822
1714
|
if (!did_early_exit && in_quotes) {
|
|
1823
|
-
|
|
1824
|
-
rb_ary_push(result, Qnil);
|
|
1825
|
-
rb_ary_push(result, LONG2FIX(-1));
|
|
1826
|
-
return result;
|
|
1715
|
+
return return_parser_result(Qnil, -1);
|
|
1827
1716
|
}
|
|
1828
1717
|
|
|
1829
1718
|
/* Process the last field — skip on early exit */
|
|
@@ -1831,26 +1720,10 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1831
1720
|
long field_len = endP - startP;
|
|
1832
1721
|
char *raw_field = startP;
|
|
1833
1722
|
|
|
1834
|
-
|
|
1835
|
-
if (quoted) {
|
|
1836
|
-
raw_field++;
|
|
1837
|
-
field_len -= 2;
|
|
1838
|
-
}
|
|
1839
|
-
|
|
1840
|
-
char *trim_start = raw_field;
|
|
1841
|
-
char *trim_end = raw_field + field_len - 1;
|
|
1842
|
-
|
|
1843
|
-
if (strip_ws) {
|
|
1844
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
1845
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1846
|
-
}
|
|
1847
|
-
|
|
1848
|
-
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1849
|
-
|
|
1850
|
-
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1723
|
+
extracted_field f = extract_field(raw_field, field_len, strip_ws, quote_char_val);
|
|
1851
1724
|
|
|
1852
1725
|
if (!keep_bitmap || (element_count < keep_bitmap_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1853
|
-
if (insert_field_into_hash(&xform,
|
|
1726
|
+
if (insert_field_into_hash(&xform, f.start, f.len, element_count, f.has_quotes, quote_char_val, encoding))
|
|
1854
1727
|
all_blank = false;
|
|
1855
1728
|
}
|
|
1856
1729
|
element_count++;
|
|
@@ -1862,10 +1735,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1862
1735
|
* ---------------------------------------- */
|
|
1863
1736
|
if (all_blank) {
|
|
1864
1737
|
if (remove_empty) {
|
|
1865
|
-
|
|
1866
|
-
rb_ary_push(result, Qnil);
|
|
1867
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1868
|
-
return result;
|
|
1738
|
+
return return_parser_result(Qnil, element_count);
|
|
1869
1739
|
}
|
|
1870
1740
|
|
|
1871
1741
|
ensure_hash_allocated(&xform);
|
|
@@ -1886,10 +1756,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1886
1756
|
/* ----------------------------------------
|
|
1887
1757
|
* SECTION 8: Return result
|
|
1888
1758
|
* ---------------------------------------- */
|
|
1889
|
-
|
|
1890
|
-
rb_ary_push(result, xform.hash);
|
|
1891
|
-
rb_ary_push(result, LONG2FIX(element_count));
|
|
1892
|
-
return result;
|
|
1759
|
+
return return_parser_result(xform.hash, element_count);
|
|
1893
1760
|
}
|
|
1894
1761
|
|
|
1895
1762
|
// Count quote characters in a line, optionally respecting backslash escapes.
|