smarter_csv 1.14.4 → 1.15.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rvmrc +1 -1
- data/CHANGELOG.md +113 -0
- data/CONTRIBUTORS.md +2 -0
- data/README.md +16 -0
- data/docs/basic_read_api.md +3 -2
- data/docs/batch_processing.md +15 -3
- data/docs/examples.md +4 -2
- data/docs/header_transformations.md +11 -0
- data/docs/header_validations.md +14 -0
- data/ext/smarter_csv/Makefile +273 -0
- data/ext/smarter_csv/extconf.rb +4 -2
- data/ext/smarter_csv/smarter_csv.bundle +0 -0
- data/ext/smarter_csv/smarter_csv.bundle.dSYM/Contents/Info.plist +20 -0
- data/ext/smarter_csv/smarter_csv.bundle.dSYM/Contents/Resources/DWARF/smarter_csv.bundle +0 -0
- data/ext/smarter_csv/smarter_csv.bundle.dSYM/Contents/Resources/Relocations/aarch64/smarter_csv.bundle.yml +5 -0
- data/ext/smarter_csv/smarter_csv.c +427 -10
- data/ext/smarter_csv/smarter_csv.o +0 -0
- data/lib/smarter_csv/auto_detection.rb +6 -5
- data/lib/smarter_csv/errors.rb +18 -2
- data/lib/smarter_csv/file_io.rb +5 -3
- data/lib/smarter_csv/hash_transformations.rb +75 -53
- data/lib/smarter_csv/header_validations.rb +2 -2
- data/lib/smarter_csv/headers.rb +1 -1
- data/lib/smarter_csv/parser.rb +60 -2
- data/lib/smarter_csv/reader.rb +31 -28
- data/lib/smarter_csv/version.rb +1 -1
- data/smarter_csv.gemspec +0 -1
- metadata +9 -20
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
#include "ruby.h"
|
|
2
2
|
#include "ruby/encoding.h"
|
|
3
|
+
#include "ruby/version.h"
|
|
3
4
|
#include <stdio.h>
|
|
4
5
|
#include <stdbool.h>
|
|
5
6
|
#include <string.h>
|
|
@@ -10,10 +11,25 @@
|
|
|
10
11
|
#define true ((bool)1)
|
|
11
12
|
#endif
|
|
12
13
|
|
|
14
|
+
/*
|
|
15
|
+
* rb_hash_new_capa() was added in Ruby 3.2. For older Ruby versions,
|
|
16
|
+
* we fall back to rb_hash_new() which doesn't pre-allocate capacity.
|
|
17
|
+
*/
|
|
18
|
+
#if defined(RUBY_API_VERSION_MAJOR) && (RUBY_API_VERSION_MAJOR > 3 || (RUBY_API_VERSION_MAJOR == 3 && RUBY_API_VERSION_MINOR >= 2))
|
|
19
|
+
/* Ruby 3.2+ has rb_hash_new_capa */
|
|
20
|
+
#else
|
|
21
|
+
#define rb_hash_new_capa(capa) rb_hash_new()
|
|
22
|
+
#endif
|
|
23
|
+
|
|
13
24
|
VALUE SmarterCSV = Qnil;
|
|
14
25
|
VALUE eMalformedCSVError = Qnil;
|
|
15
26
|
VALUE Parser = Qnil;
|
|
16
|
-
|
|
27
|
+
|
|
28
|
+
// Shared empty string to avoid allocating new empty strings for each empty CSV field.
|
|
29
|
+
// Empty fields are common in CSV files, and with strip_whitespace enabled (default),
|
|
30
|
+
// whitespace-only fields also become empty. Reusing a single frozen empty string
|
|
31
|
+
// significantly reduces object allocations and GC pressure.
|
|
32
|
+
VALUE Qempty_string = Qnil;
|
|
17
33
|
|
|
18
34
|
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
|
|
19
35
|
char *buf = ALLOC_N(char, len);
|
|
@@ -51,7 +67,6 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
51
67
|
|
|
52
68
|
char *quoteP = RSTRING_PTR(quote_char);
|
|
53
69
|
char quote_char_val = quoteP[0];
|
|
54
|
-
size_t quote_len = strlen(quoteP);
|
|
55
70
|
|
|
56
71
|
VALUE elements = rb_ary_new();
|
|
57
72
|
VALUE field;
|
|
@@ -88,9 +103,9 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
88
103
|
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
89
104
|
}
|
|
90
105
|
|
|
91
|
-
long trimmed_len = trim_end - trim_start + 1;
|
|
106
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
92
107
|
|
|
93
|
-
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
|
108
|
+
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
94
109
|
rb_ary_push(elements, field);
|
|
95
110
|
element_count++;
|
|
96
111
|
|
|
@@ -109,9 +124,9 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
109
124
|
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
110
125
|
}
|
|
111
126
|
|
|
112
|
-
long trimmed_len = trim_end - trim_start + 1;
|
|
127
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
113
128
|
|
|
114
|
-
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
|
129
|
+
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
115
130
|
rb_ary_push(elements, field);
|
|
116
131
|
}
|
|
117
132
|
|
|
@@ -155,9 +170,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
155
170
|
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
156
171
|
}
|
|
157
172
|
|
|
158
|
-
long trimmed_len = trim_end - trim_start + 1;
|
|
173
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
159
174
|
|
|
160
|
-
if (
|
|
175
|
+
if (trimmed_len == 0) {
|
|
176
|
+
field = Qempty_string;
|
|
177
|
+
} else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
|
161
178
|
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
|
162
179
|
} else {
|
|
163
180
|
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
|
@@ -206,9 +223,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
206
223
|
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
207
224
|
}
|
|
208
225
|
|
|
209
|
-
long trimmed_len = trim_end - trim_start + 1;
|
|
226
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
210
227
|
|
|
211
|
-
if (
|
|
228
|
+
if (trimmed_len == 0) {
|
|
229
|
+
field = Qempty_string;
|
|
230
|
+
} else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
|
212
231
|
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
|
213
232
|
} else {
|
|
214
233
|
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
|
@@ -220,6 +239,401 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
220
239
|
return elements;
|
|
221
240
|
}
|
|
222
241
|
|
|
242
|
+
// Efficiently combine two arrays into a hash (replaces headers.zip(values).to_h)
|
|
243
|
+
// This eliminates the intermediate array allocation from zip and the to_h conversion.
|
|
244
|
+
// For CSV files with many columns, this significantly reduces object allocations.
|
|
245
|
+
// Matches Ruby's zip behavior: pads with nil when values array is shorter than keys.
|
|
246
|
+
static VALUE rb_zip_to_hash(VALUE self, VALUE keys, VALUE values) {
|
|
247
|
+
if (NIL_P(keys) || NIL_P(values)) return rb_hash_new();
|
|
248
|
+
|
|
249
|
+
long keys_len = RARRAY_LEN(keys);
|
|
250
|
+
long vals_len = RARRAY_LEN(values);
|
|
251
|
+
|
|
252
|
+
VALUE hash = rb_hash_new_capa(keys_len);
|
|
253
|
+
for (long i = 0; i < keys_len; i++) {
|
|
254
|
+
VALUE val = (i < vals_len) ? rb_ary_entry(values, i) : Qnil;
|
|
255
|
+
rb_hash_aset(hash, rb_ary_entry(keys, i), val);
|
|
256
|
+
}
|
|
257
|
+
return hash;
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/*
|
|
261
|
+
* ================================================================================
|
|
262
|
+
* get_key_for_index - Helper to get the hash key for a given column index
|
|
263
|
+
* ================================================================================
|
|
264
|
+
*
|
|
265
|
+
* For columns within the headers array, returns the corresponding header symbol.
|
|
266
|
+
* For extra columns (beyond headers), generates a symbol like :column_7, :column_8, etc.
|
|
267
|
+
*
|
|
268
|
+
* This supports CSV files where some rows have more columns than the header row.
|
|
269
|
+
*/
|
|
270
|
+
static inline VALUE get_key_for_index(long index, VALUE headers, long headers_len, const char *prefix_str) {
|
|
271
|
+
if (index < headers_len) {
|
|
272
|
+
// Use existing header from the headers array
|
|
273
|
+
return rb_ary_entry(headers, index);
|
|
274
|
+
} else {
|
|
275
|
+
// Generate a new key for extra columns: "column_7" -> :column_7
|
|
276
|
+
char key_buf[64];
|
|
277
|
+
snprintf(key_buf, sizeof(key_buf), "%s%ld", prefix_str, index + 1);
|
|
278
|
+
return ID2SYM(rb_intern(key_buf));
|
|
279
|
+
}
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
/*
|
|
283
|
+
* ================================================================================
|
|
284
|
+
* rb_parse_line_to_hash - Parse CSV line directly into a Ruby Hash
|
|
285
|
+
* ================================================================================
|
|
286
|
+
*
|
|
287
|
+
* This is the main parsing function that converts a CSV line directly into a hash.
|
|
288
|
+
* It builds the hash during parsing to avoid intermediate array allocations.
|
|
289
|
+
*
|
|
290
|
+
* PERFORMANCE NOTES:
|
|
291
|
+
* -----------------
|
|
292
|
+
* - Builds hash directly during parsing (no intermediate values array)
|
|
293
|
+
* - Uses a fast path for the common case (no quotes, single-char separator)
|
|
294
|
+
* - Reuses a shared empty string (Qempty_string) to reduce allocations
|
|
295
|
+
* - Tracks blank fields to support remove_empty_hashes option
|
|
296
|
+
*
|
|
297
|
+
* PARAMETERS:
|
|
298
|
+
* -----------
|
|
299
|
+
* @param line - The CSV line to parse (Ruby String)
|
|
300
|
+
* @param headers - Array of header symbols for hash keys
|
|
301
|
+
* @param col_sep - Column separator string (e.g., ",")
|
|
302
|
+
* @param quote_char - Quote character string (e.g., "\"")
|
|
303
|
+
* @param header_prefix - Prefix for auto-generated column names (e.g., "column_")
|
|
304
|
+
* @param has_quotes_val - Boolean: whether line contains quote characters (optimization hint)
|
|
305
|
+
* @param strip_ws_val - Boolean: whether to strip whitespace from field values
|
|
306
|
+
* @param remove_empty_val - Boolean: if true, return nil for rows where all values are blank
|
|
307
|
+
* @param remove_empty_values_val - Boolean: if true, don't add nil for missing columns
|
|
308
|
+
* (they'd be removed anyway by hash_transformations)
|
|
309
|
+
*
|
|
310
|
+
* RETURNS:
|
|
311
|
+
* --------
|
|
312
|
+
* A Ruby Array [hash, data_size] where:
|
|
313
|
+
* - hash is the parsed row as a Hash, or nil if all values were blank (and remove_empty_val is true)
|
|
314
|
+
* - data_size is the number of fields parsed (used to detect extra columns)
|
|
315
|
+
*
|
|
316
|
+
* EXAMPLE:
|
|
317
|
+
* --------
|
|
318
|
+
* Input: line = "john,25,boston", headers = [:name, :age, :city]
|
|
319
|
+
* Output: [{name: "john", age: "25", city: "boston"}, 3]
|
|
320
|
+
*
|
|
321
|
+
* Input: line = "john,25,boston,extra" (more fields than headers)
|
|
322
|
+
* Output: [{name: "john", age: "25", city: "boston", column_4: "extra"}, 4]
|
|
323
|
+
*/
|
|
324
|
+
static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE col_sep,
|
|
325
|
+
VALUE quote_char, VALUE header_prefix, VALUE has_quotes_val,
|
|
326
|
+
VALUE strip_ws_val, VALUE remove_empty_val, VALUE remove_empty_values_val) {
|
|
327
|
+
|
|
328
|
+
/* ----------------------------------------
|
|
329
|
+
* SECTION 1: Handle nil/invalid input
|
|
330
|
+
* ---------------------------------------- */
|
|
331
|
+
if (NIL_P(line)) {
|
|
332
|
+
VALUE result = rb_ary_new_capa(2);
|
|
333
|
+
rb_ary_push(result, Qnil);
|
|
334
|
+
rb_ary_push(result, INT2FIX(0));
|
|
335
|
+
return result;
|
|
336
|
+
}
|
|
337
|
+
|
|
338
|
+
if (RB_TYPE_P(line, T_STRING) != 1) {
|
|
339
|
+
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line_to_hash: line has to be a string or nil");
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/* ----------------------------------------
|
|
343
|
+
* SECTION 2: Extract parameters from Ruby objects
|
|
344
|
+
* ----------------------------------------
|
|
345
|
+
* Convert Ruby objects to C types for efficient access during parsing.
|
|
346
|
+
*/
|
|
347
|
+
rb_encoding *encoding = rb_enc_get(line); // Preserve string encoding
|
|
348
|
+
char *startP = RSTRING_PTR(line); // Pointer to start of current field
|
|
349
|
+
long line_len = RSTRING_LEN(line);
|
|
350
|
+
char *endP = startP + line_len; // End of line marker
|
|
351
|
+
char *p = startP; // Current parsing position
|
|
352
|
+
|
|
353
|
+
char *col_sepP = RSTRING_PTR(col_sep);
|
|
354
|
+
long col_sep_len = RSTRING_LEN(col_sep);
|
|
355
|
+
|
|
356
|
+
char *quoteP = RSTRING_PTR(quote_char);
|
|
357
|
+
char quote_char_val = quoteP[0]; // First char of quote string
|
|
358
|
+
|
|
359
|
+
// Default prefix for extra columns is "column_" (e.g., :column_7)
|
|
360
|
+
const char *prefix_str = NIL_P(header_prefix) ? "column_" : RSTRING_PTR(header_prefix);
|
|
361
|
+
|
|
362
|
+
long headers_len = NIL_P(headers) ? 0 : RARRAY_LEN(headers);
|
|
363
|
+
bool has_quotes = RTEST(has_quotes_val); // Hint: does line contain quotes?
|
|
364
|
+
bool strip_ws = RTEST(strip_ws_val); // Strip whitespace from fields?
|
|
365
|
+
bool remove_empty = RTEST(remove_empty_val); // Skip rows with all blank values?
|
|
366
|
+
bool remove_empty_values = RTEST(remove_empty_values_val); // If true, don't add nil for missing cols
|
|
367
|
+
|
|
368
|
+
/* ----------------------------------------
|
|
369
|
+
* SECTION 3: Initialize hash and tracking variables
|
|
370
|
+
* ----------------------------------------
|
|
371
|
+
* Pre-allocate hash with expected capacity for better performance.
|
|
372
|
+
*/
|
|
373
|
+
long hash_size = headers_len > 0 ? headers_len : 16;
|
|
374
|
+
VALUE hash = rb_hash_new_capa(hash_size); // Pre-sized hash for efficiency
|
|
375
|
+
VALUE field; // Current field value
|
|
376
|
+
long element_count = 0; // Number of fields parsed
|
|
377
|
+
bool all_blank = true; // Track if all fields are blank
|
|
378
|
+
|
|
379
|
+
/* ========================================
|
|
380
|
+
* SECTION 4: FAST PATH - No quotes, single-char separator
|
|
381
|
+
* ========================================
|
|
382
|
+
* This is the common case for most CSV files. We use memchr() for fast
|
|
383
|
+
* separator scanning, avoiding character-by-character iteration.
|
|
384
|
+
*
|
|
385
|
+
* __builtin_expect hints to the compiler that this branch is likely taken.
|
|
386
|
+
*/
|
|
387
|
+
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
|
388
|
+
char sep = *col_sepP;
|
|
389
|
+
char *sep_pos = NULL;
|
|
390
|
+
|
|
391
|
+
/* Loop through each field by finding separator positions */
|
|
392
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
|
393
|
+
// Extract field boundaries
|
|
394
|
+
long field_len = sep_pos - startP;
|
|
395
|
+
char *raw_field = startP;
|
|
396
|
+
char *trim_start = raw_field;
|
|
397
|
+
char *trim_end = raw_field + field_len - 1;
|
|
398
|
+
|
|
399
|
+
// Optional whitespace trimming (spaces and tabs only)
|
|
400
|
+
if (strip_ws) {
|
|
401
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
402
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
406
|
+
|
|
407
|
+
// Create field value: use shared empty string for empty fields to reduce allocations
|
|
408
|
+
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
409
|
+
if (all_blank && trimmed_len > 0) all_blank = false;
|
|
410
|
+
|
|
411
|
+
// Insert field directly into hash with appropriate key
|
|
412
|
+
VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
|
|
413
|
+
rb_hash_aset(hash, key, field);
|
|
414
|
+
element_count++;
|
|
415
|
+
|
|
416
|
+
// Move to next field
|
|
417
|
+
p = sep_pos + 1;
|
|
418
|
+
startP = p;
|
|
419
|
+
}
|
|
420
|
+
|
|
421
|
+
/* Process the last field (no separator after it) */
|
|
422
|
+
long field_len = endP - startP;
|
|
423
|
+
char *raw_field = startP;
|
|
424
|
+
char *trim_start = raw_field;
|
|
425
|
+
char *trim_end = raw_field + field_len - 1;
|
|
426
|
+
|
|
427
|
+
if (strip_ws) {
|
|
428
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
429
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
430
|
+
}
|
|
431
|
+
|
|
432
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
433
|
+
|
|
434
|
+
field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
|
|
435
|
+
if (all_blank && trimmed_len > 0) all_blank = false;
|
|
436
|
+
|
|
437
|
+
VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
|
|
438
|
+
rb_hash_aset(hash, key, field);
|
|
439
|
+
element_count++;
|
|
440
|
+
|
|
441
|
+
} else {
|
|
442
|
+
/* ========================================
|
|
443
|
+
* SECTION 5: SLOW PATH - Quoted fields or multi-char separator
|
|
444
|
+
* ========================================
|
|
445
|
+
* This handles complex cases:
|
|
446
|
+
* - Fields containing the separator inside quotes: "hello,world"
|
|
447
|
+
* - Multi-character separators like "::" or "\t\t"
|
|
448
|
+
* - Escaped quotes using backslash: \"
|
|
449
|
+
*
|
|
450
|
+
* We must scan character-by-character to track quote state.
|
|
451
|
+
*/
|
|
452
|
+
long i;
|
|
453
|
+
long backslash_count = 0; // Track consecutive backslashes for escape detection
|
|
454
|
+
bool in_quotes = false; // Are we inside a quoted field?
|
|
455
|
+
bool col_sep_found = true;
|
|
456
|
+
|
|
457
|
+
/* Scan through the line character by character */
|
|
458
|
+
while (p < endP) {
|
|
459
|
+
// Check if current position matches the column separator
|
|
460
|
+
col_sep_found = true;
|
|
461
|
+
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
|
462
|
+
if (*(p + i) != *(col_sepP + i)) {
|
|
463
|
+
col_sep_found = false;
|
|
464
|
+
break;
|
|
465
|
+
}
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
// Found separator and not inside quotes = end of field
|
|
469
|
+
if (col_sep_found && !in_quotes) {
|
|
470
|
+
long field_len = p - startP;
|
|
471
|
+
char *raw_field = startP;
|
|
472
|
+
|
|
473
|
+
// Check if field is wrapped in quotes: "value"
|
|
474
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
475
|
+
if (quoted) {
|
|
476
|
+
raw_field++; // Skip opening quote
|
|
477
|
+
field_len -= 2; // Exclude both quotes from length
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
char *trim_start = raw_field;
|
|
481
|
+
char *trim_end = raw_field + field_len - 1;
|
|
482
|
+
|
|
483
|
+
if (strip_ws) {
|
|
484
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
485
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
486
|
+
}
|
|
487
|
+
|
|
488
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
489
|
+
|
|
490
|
+
// Create field value, handling escaped quotes if present
|
|
491
|
+
if (trimmed_len == 0) {
|
|
492
|
+
field = Qempty_string;
|
|
493
|
+
} else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
|
494
|
+
// Field contains quotes - need to unescape doubled quotes ("" -> ")
|
|
495
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
|
496
|
+
all_blank = false;
|
|
497
|
+
} else {
|
|
498
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
|
499
|
+
all_blank = false;
|
|
500
|
+
}
|
|
501
|
+
|
|
502
|
+
// Insert field directly into hash
|
|
503
|
+
VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
|
|
504
|
+
rb_hash_aset(hash, key, field);
|
|
505
|
+
element_count++;
|
|
506
|
+
|
|
507
|
+
// Move past the separator to start of next field
|
|
508
|
+
p += col_sep_len;
|
|
509
|
+
startP = p;
|
|
510
|
+
backslash_count = 0;
|
|
511
|
+
|
|
512
|
+
} else {
|
|
513
|
+
/* Not at a separator (or inside quotes) - track quote state */
|
|
514
|
+
|
|
515
|
+
if (*p == '\\') {
|
|
516
|
+
// Count consecutive backslashes for escape sequence detection
|
|
517
|
+
backslash_count++;
|
|
518
|
+
} else {
|
|
519
|
+
if (*p == quote_char_val) {
|
|
520
|
+
// Quote char toggles in_quotes state only if not escaped
|
|
521
|
+
// (even number of preceding backslashes = not escaped)
|
|
522
|
+
if (backslash_count % 2 == 0) {
|
|
523
|
+
in_quotes = !in_quotes;
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
backslash_count = 0;
|
|
527
|
+
}
|
|
528
|
+
p++;
|
|
529
|
+
}
|
|
530
|
+
}
|
|
531
|
+
|
|
532
|
+
// Error: unclosed quote at end of line
|
|
533
|
+
if (in_quotes) {
|
|
534
|
+
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
/* Process the last field (same logic as above) */
|
|
538
|
+
long field_len = endP - startP;
|
|
539
|
+
char *raw_field = startP;
|
|
540
|
+
|
|
541
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
|
542
|
+
if (quoted) {
|
|
543
|
+
raw_field++;
|
|
544
|
+
field_len -= 2;
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
char *trim_start = raw_field;
|
|
548
|
+
char *trim_end = raw_field + field_len - 1;
|
|
549
|
+
|
|
550
|
+
if (strip_ws) {
|
|
551
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
|
552
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
553
|
+
}
|
|
554
|
+
|
|
555
|
+
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
556
|
+
|
|
557
|
+
if (trimmed_len == 0) {
|
|
558
|
+
field = Qempty_string;
|
|
559
|
+
} else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
|
560
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
|
561
|
+
all_blank = false;
|
|
562
|
+
} else {
|
|
563
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
|
564
|
+
all_blank = false;
|
|
565
|
+
}
|
|
566
|
+
|
|
567
|
+
VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
|
|
568
|
+
rb_hash_aset(hash, key, field);
|
|
569
|
+
element_count++;
|
|
570
|
+
}
|
|
571
|
+
|
|
572
|
+
/* ----------------------------------------
|
|
573
|
+
* SECTION 6: Handle blank rows
|
|
574
|
+
* ----------------------------------------
|
|
575
|
+
* If remove_empty_hashes is enabled and all fields were blank,
|
|
576
|
+
* return nil instead of the hash so the row can be skipped.
|
|
577
|
+
*/
|
|
578
|
+
if (remove_empty && all_blank) {
|
|
579
|
+
VALUE result = rb_ary_new_capa(2);
|
|
580
|
+
rb_ary_push(result, Qnil);
|
|
581
|
+
rb_ary_push(result, LONG2FIX(element_count));
|
|
582
|
+
return result;
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
/* ----------------------------------------
|
|
586
|
+
* SECTION 7: Pad hash with nil for missing columns (conditional)
|
|
587
|
+
* ----------------------------------------
|
|
588
|
+
* Only add nil for missing columns when remove_empty_values is false.
|
|
589
|
+
* When remove_empty_values is true, nils would be removed anyway by
|
|
590
|
+
* hash_transformations, so we skip this for efficiency.
|
|
591
|
+
*/
|
|
592
|
+
if (!remove_empty_values) {
|
|
593
|
+
for (long i = element_count; i < headers_len; i++) {
|
|
594
|
+
rb_hash_aset(hash, rb_ary_entry(headers, i), Qnil);
|
|
595
|
+
}
|
|
596
|
+
}
|
|
597
|
+
|
|
598
|
+
/* ----------------------------------------
|
|
599
|
+
* SECTION 8: Return result
|
|
600
|
+
* ----------------------------------------
|
|
601
|
+
* Return [hash, element_count] so caller can detect extra columns
|
|
602
|
+
* (when element_count > headers_len) and extend headers if needed.
|
|
603
|
+
*/
|
|
604
|
+
VALUE result = rb_ary_new_capa(2);
|
|
605
|
+
rb_ary_push(result, hash);
|
|
606
|
+
rb_ary_push(result, LONG2FIX(element_count));
|
|
607
|
+
return result;
|
|
608
|
+
}
|
|
609
|
+
|
|
610
|
+
// Count quote characters in a line, respecting backslash escapes.
|
|
611
|
+
// This is a performance optimization that replaces the Ruby each_char implementation
|
|
612
|
+
// which creates a new String object for every character in the line.
|
|
613
|
+
// For a 1000-char line, this eliminates ~1000 object allocations per line.
|
|
614
|
+
static VALUE rb_count_quote_chars(VALUE self, VALUE line, VALUE quote_char) {
|
|
615
|
+
if (NIL_P(line) || NIL_P(quote_char)) return INT2FIX(0);
|
|
616
|
+
if (RSTRING_LEN(quote_char) == 0) return INT2FIX(0);
|
|
617
|
+
|
|
618
|
+
char *str = RSTRING_PTR(line);
|
|
619
|
+
long len = RSTRING_LEN(line);
|
|
620
|
+
char qc = RSTRING_PTR(quote_char)[0];
|
|
621
|
+
|
|
622
|
+
long count = 0;
|
|
623
|
+
bool escaped = false;
|
|
624
|
+
|
|
625
|
+
for (long i = 0; i < len; i++) {
|
|
626
|
+
if (str[i] == '\\' && !escaped) {
|
|
627
|
+
escaped = true;
|
|
628
|
+
} else {
|
|
629
|
+
if (str[i] == qc && !escaped) count++;
|
|
630
|
+
escaped = false;
|
|
631
|
+
}
|
|
632
|
+
}
|
|
633
|
+
|
|
634
|
+
return LONG2FIX(count);
|
|
635
|
+
}
|
|
636
|
+
|
|
223
637
|
void Init_smarter_csv(void) {
|
|
224
638
|
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
|
225
639
|
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
|
@@ -227,4 +641,7 @@ void Init_smarter_csv(void) {
|
|
|
227
641
|
Qempty_string = rb_str_new_literal("");
|
|
228
642
|
rb_gc_register_address(&Qempty_string);
|
|
229
643
|
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
|
|
644
|
+
rb_define_module_function(Parser, "count_quote_chars_c", rb_count_quote_chars, 2);
|
|
645
|
+
rb_define_module_function(Parser, "zip_to_hash_c", rb_zip_to_hash, 2);
|
|
646
|
+
rb_define_module_function(Parser, "parse_line_to_hash_c", rb_parse_line_to_hash, 9);
|
|
230
647
|
}
|
|
Binary file
|
|
@@ -18,21 +18,22 @@ module SmarterCSV
|
|
|
18
18
|
candidates = Hash.new(0)
|
|
19
19
|
count = has_header ? 1 : 5
|
|
20
20
|
count.times do
|
|
21
|
-
|
|
21
|
+
next_line = next_line_with_counts(filehandle, options)
|
|
22
|
+
break if next_line.nil? # EOF reached (short files)
|
|
23
|
+
|
|
24
|
+
line = next_line
|
|
22
25
|
delimiters.each do |d|
|
|
23
26
|
# Count only non-quoted occurrences of the delimiter
|
|
24
27
|
non_quoted_text = line.split(/#{escaped_quote}[^#{escaped_quote}]*#{escaped_quote}/).join
|
|
25
28
|
|
|
26
29
|
candidates[d] += non_quoted_text.scan(d).count
|
|
27
30
|
end
|
|
28
|
-
rescue EOFError # short files
|
|
29
|
-
break
|
|
30
31
|
end
|
|
31
32
|
rewind(filehandle)
|
|
32
33
|
|
|
33
34
|
if candidates.values.max == 0
|
|
34
|
-
# if the header only contains
|
|
35
|
-
return ',' if line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
|
|
35
|
+
# if the header only contains word characters and whitespace, assume comma separator
|
|
36
|
+
return ',' if line && line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
|
|
36
37
|
|
|
37
38
|
raise SmarterCSV::NoColSepDetected
|
|
38
39
|
end
|
data/lib/smarter_csv/errors.rb
CHANGED
|
@@ -7,8 +7,24 @@ module SmarterCSV
|
|
|
7
7
|
class HeaderSizeMismatch < SmarterCSVException; end
|
|
8
8
|
class IncorrectOption < SmarterCSVException; end
|
|
9
9
|
class ValidationError < SmarterCSVException; end
|
|
10
|
-
class DuplicateHeaders < SmarterCSVException
|
|
11
|
-
|
|
10
|
+
# Error that additionally carries a +headers+ value supplied by the raiser.
class DuplicateHeaders < SmarterCSVException
  # The value passed as +headers+ at construction time.
  attr_reader :headers

  # message - error message, forwarded to the superclass
  # headers - stored as-is and exposed via #headers (defaults to an empty Array)
  def initialize(message, headers = [])
    @headers = headers
    super(message)
  end
end
|
|
18
|
+
|
|
19
|
+
# Error that additionally carries a +keys+ value supplied by the raiser.
class MissingKeys < SmarterCSVException # previously known as MissingHeaders
  # The value passed as +keys+ at construction time.
  attr_reader :keys

  # message - error message, forwarded to the superclass
  # keys    - stored as-is and exposed via #keys (defaults to an empty Array)
  def initialize(message, keys = [])
    @keys = keys
    super(message)
  end
end
|
|
27
|
+
|
|
12
28
|
class NoColSepDetected < SmarterCSVException; end
|
|
13
29
|
class KeyMappingError < SmarterCSVException; end
|
|
14
30
|
class MalformedCSV < SmarterCSVException; end
|
data/lib/smarter_csv/file_io.rb
CHANGED
|
@@ -4,8 +4,10 @@ module SmarterCSV
|
|
|
4
4
|
module FileIO
|
|
5
5
|
protected
|
|
6
6
|
|
|
7
|
-
def
|
|
8
|
-
line = filehandle.
|
|
7
|
+
def next_line_with_counts(filehandle, options)
|
|
8
|
+
line = filehandle.gets(options[:row_sep])
|
|
9
|
+
return nil if line.nil? # EOF reached
|
|
10
|
+
|
|
9
11
|
@file_line_count += 1
|
|
10
12
|
@csv_line_count += 1
|
|
11
13
|
line = remove_bom(line) if @csv_line_count == 1
|
|
@@ -14,7 +16,7 @@ module SmarterCSV
|
|
|
14
16
|
|
|
15
17
|
# Reads and discards the number of lines given by options[:skip_lines]
# (coerced with to_i, so nil skips nothing). Each line is consumed through
# next_line_with_counts.
def skip_lines(filehandle, options)
  options[:skip_lines].to_i.times { next_line_with_counts(filehandle, options) }
end
|
|
20
22
|
|