smarter_csv 1.14.4 → 1.15.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,6 @@
1
1
  #include "ruby.h"
2
2
  #include "ruby/encoding.h"
3
+ #include "ruby/version.h"
3
4
  #include <stdio.h>
4
5
  #include <stdbool.h>
5
6
  #include <string.h>
@@ -10,10 +11,25 @@
10
11
  #define true ((bool)1)
11
12
  #endif
12
13
 
14
/*
 * Compatibility shim for rb_hash_new_capa(), which was added in Ruby 3.2.
 * When building against an older Ruby (or when the API version macros are
 * not available), map it to rb_hash_new(), which creates the hash without
 * pre-allocating capacity. On Ruby 3.2+ nothing is defined here and the
 * real function is used.
 */
#if !defined(RUBY_API_VERSION_MAJOR) || RUBY_API_VERSION_MAJOR < 3 || \
    (RUBY_API_VERSION_MAJOR == 3 && RUBY_API_VERSION_MINOR < 2)
#define rb_hash_new_capa(capa) rb_hash_new()
#endif
23
+
13
24
  VALUE SmarterCSV = Qnil;
14
25
  VALUE eMalformedCSVError = Qnil;
15
26
  VALUE Parser = Qnil;
16
- VALUE Qempty_string = Qnil; // shared frozen empty string
27
+
28
+ // Shared empty string to avoid allocating new empty strings for each empty CSV field.
29
+ // Empty fields are common in CSV files, and with strip_whitespace enabled (default),
30
+ // whitespace-only fields also become empty. Reusing a single frozen empty string
31
+ // significantly reduces object allocations and GC pressure.
32
+ VALUE Qempty_string = Qnil;
17
33
 
18
34
  static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
19
35
  char *buf = ALLOC_N(char, len);
@@ -51,7 +67,6 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
51
67
 
52
68
  char *quoteP = RSTRING_PTR(quote_char);
53
69
  char quote_char_val = quoteP[0];
54
- size_t quote_len = strlen(quoteP);
55
70
 
56
71
  VALUE elements = rb_ary_new();
57
72
  VALUE field;
@@ -88,9 +103,9 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
88
103
  while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
89
104
  }
90
105
 
91
- long trimmed_len = trim_end - trim_start + 1;
106
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
92
107
 
93
- field = rb_enc_str_new(trim_start, trimmed_len, encoding);
108
+ field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
94
109
  rb_ary_push(elements, field);
95
110
  element_count++;
96
111
 
@@ -109,9 +124,9 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
109
124
  while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
110
125
  }
111
126
 
112
- long trimmed_len = trim_end - trim_start + 1;
127
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
113
128
 
114
- field = rb_enc_str_new(trim_start, trimmed_len, encoding);
129
+ field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
115
130
  rb_ary_push(elements, field);
116
131
  }
117
132
 
@@ -155,9 +170,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
155
170
  while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
156
171
  }
157
172
 
158
- long trimmed_len = trim_end - trim_start + 1;
173
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
159
174
 
160
- if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
175
+ if (trimmed_len == 0) {
176
+ field = Qempty_string;
177
+ } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
161
178
  field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
162
179
  } else {
163
180
  field = rb_enc_str_new(trim_start, trimmed_len, encoding);
@@ -206,9 +223,11 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
206
223
  while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
207
224
  }
208
225
 
209
- long trimmed_len = trim_end - trim_start + 1;
226
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
210
227
 
211
- if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
228
+ if (trimmed_len == 0) {
229
+ field = Qempty_string;
230
+ } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
212
231
  field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
213
232
  } else {
214
233
  field = rb_enc_str_new(trim_start, trimmed_len, encoding);
@@ -220,6 +239,401 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
220
239
  return elements;
221
240
  }
222
241
 
242
+ // Efficiently combine two arrays into a hash (replaces headers.zip(values).to_h)
243
+ // This eliminates the intermediate array allocation from zip and the to_h conversion.
244
+ // For CSV files with many columns, this significantly reduces object allocations.
245
+ // Matches Ruby's zip behavior: pads with nil when values array is shorter than keys.
246
+ static VALUE rb_zip_to_hash(VALUE self, VALUE keys, VALUE values) {
247
+ if (NIL_P(keys) || NIL_P(values)) return rb_hash_new();
248
+
249
+ long keys_len = RARRAY_LEN(keys);
250
+ long vals_len = RARRAY_LEN(values);
251
+
252
+ VALUE hash = rb_hash_new_capa(keys_len);
253
+ for (long i = 0; i < keys_len; i++) {
254
+ VALUE val = (i < vals_len) ? rb_ary_entry(values, i) : Qnil;
255
+ rb_hash_aset(hash, rb_ary_entry(keys, i), val);
256
+ }
257
+ return hash;
258
+ }
259
+
260
+ /*
261
+ * ================================================================================
262
+ * get_key_for_index - Helper to get the hash key for a given column index
263
+ * ================================================================================
264
+ *
265
+ * For columns within the headers array, returns the corresponding header symbol.
266
+ * For extra columns (beyond headers), generates a symbol like :column_7, :column_8, etc.
267
+ *
268
+ * This supports CSV files where some rows have more columns than the header row.
269
+ */
270
+ static inline VALUE get_key_for_index(long index, VALUE headers, long headers_len, const char *prefix_str) {
271
+ if (index < headers_len) {
272
+ // Use existing header from the headers array
273
+ return rb_ary_entry(headers, index);
274
+ } else {
275
+ // Generate a new key for extra columns: "column_7" -> :column_7
276
+ char key_buf[64];
277
+ snprintf(key_buf, sizeof(key_buf), "%s%ld", prefix_str, index + 1);
278
+ return ID2SYM(rb_intern(key_buf));
279
+ }
280
+ }
281
+
282
+ /*
283
+ * ================================================================================
284
+ * rb_parse_line_to_hash - Parse CSV line directly into a Ruby Hash
285
+ * ================================================================================
286
+ *
287
+ * This is the main parsing function that converts a CSV line directly into a hash.
288
+ * It builds the hash during parsing to avoid intermediate array allocations.
289
+ *
290
+ * PERFORMANCE NOTES:
291
+ * -----------------
292
+ * - Builds hash directly during parsing (no intermediate values array)
293
+ * - Uses a fast path for the common case (no quotes, single-char separator)
294
+ * - Reuses a shared empty string (Qempty_string) to reduce allocations
295
+ * - Tracks blank fields to support remove_empty_hashes option
296
+ *
297
+ * PARAMETERS:
298
+ * -----------
299
+ * @param line - The CSV line to parse (Ruby String)
300
+ * @param headers - Array of header symbols for hash keys
301
+ * @param col_sep - Column separator string (e.g., ",")
302
+ * @param quote_char - Quote character string (e.g., "\"")
303
+ * @param header_prefix - Prefix for auto-generated column names (e.g., "column_")
304
+ * @param has_quotes_val - Boolean: whether line contains quote characters (optimization hint)
305
+ * @param strip_ws_val - Boolean: whether to strip whitespace from field values
306
+ * @param remove_empty_val - Boolean: if true, return nil for rows where all values are blank
307
+ * @param remove_empty_values_val - Boolean: if true, don't add nil for missing columns
308
+ * (they'd be removed anyway by hash_transformations)
309
+ *
310
+ * RETURNS:
311
+ * --------
312
+ * A Ruby Array [hash, data_size] where:
313
+ * - hash is the parsed row as a Hash, or nil if all values were blank (and remove_empty_val is true)
314
+ * - data_size is the number of fields parsed (used to detect extra columns)
315
+ *
316
+ * EXAMPLE:
317
+ * --------
318
+ * Input: line = "john,25,boston", headers = [:name, :age, :city]
319
+ * Output: [{name: "john", age: "25", city: "boston"}, 3]
320
+ *
321
+ * Input: line = "john,25,boston,extra" (more fields than headers)
322
+ * Output: [{name: "john", age: "25", city: "boston", column_4: "extra"}, 4]
323
+ */
324
+ static VALUE rb_parse_line_to_hash(VALUE self, VALUE line, VALUE headers, VALUE col_sep,
325
+ VALUE quote_char, VALUE header_prefix, VALUE has_quotes_val,
326
+ VALUE strip_ws_val, VALUE remove_empty_val, VALUE remove_empty_values_val) {
327
+
328
+ /* ----------------------------------------
329
+ * SECTION 1: Handle nil/invalid input
330
+ * ---------------------------------------- */
331
+ if (NIL_P(line)) {
332
+ VALUE result = rb_ary_new_capa(2);
333
+ rb_ary_push(result, Qnil);
334
+ rb_ary_push(result, INT2FIX(0));
335
+ return result;
336
+ }
337
+
338
+ if (RB_TYPE_P(line, T_STRING) != 1) {
339
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line_to_hash: line has to be a string or nil");
340
+ }
341
+
342
+ /* ----------------------------------------
343
+ * SECTION 2: Extract parameters from Ruby objects
344
+ * ----------------------------------------
345
+ * Convert Ruby objects to C types for efficient access during parsing.
346
+ */
347
+ rb_encoding *encoding = rb_enc_get(line); // Preserve string encoding
348
+ char *startP = RSTRING_PTR(line); // Pointer to start of current field
349
+ long line_len = RSTRING_LEN(line);
350
+ char *endP = startP + line_len; // End of line marker
351
+ char *p = startP; // Current parsing position
352
+
353
+ char *col_sepP = RSTRING_PTR(col_sep);
354
+ long col_sep_len = RSTRING_LEN(col_sep);
355
+
356
+ char *quoteP = RSTRING_PTR(quote_char);
357
+ char quote_char_val = quoteP[0]; // First char of quote string
358
+
359
+ // Default prefix for extra columns is "column_" (e.g., :column_7)
360
+ const char *prefix_str = NIL_P(header_prefix) ? "column_" : RSTRING_PTR(header_prefix);
361
+
362
+ long headers_len = NIL_P(headers) ? 0 : RARRAY_LEN(headers);
363
+ bool has_quotes = RTEST(has_quotes_val); // Hint: does line contain quotes?
364
+ bool strip_ws = RTEST(strip_ws_val); // Strip whitespace from fields?
365
+ bool remove_empty = RTEST(remove_empty_val); // Skip rows with all blank values?
366
+ bool remove_empty_values = RTEST(remove_empty_values_val); // If true, don't add nil for missing cols
367
+
368
+ /* ----------------------------------------
369
+ * SECTION 3: Initialize hash and tracking variables
370
+ * ----------------------------------------
371
+ * Pre-allocate hash with expected capacity for better performance.
372
+ */
373
+ long hash_size = headers_len > 0 ? headers_len : 16;
374
+ VALUE hash = rb_hash_new_capa(hash_size); // Pre-sized hash for efficiency
375
+ VALUE field; // Current field value
376
+ long element_count = 0; // Number of fields parsed
377
+ bool all_blank = true; // Track if all fields are blank
378
+
379
+ /* ========================================
380
+ * SECTION 4: FAST PATH - No quotes, single-char separator
381
+ * ========================================
382
+ * This is the common case for most CSV files. We use memchr() for fast
383
+ * separator scanning, avoiding character-by-character iteration.
384
+ *
385
+ * __builtin_expect hints to the compiler that this branch is likely taken.
386
+ */
387
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
388
+ char sep = *col_sepP;
389
+ char *sep_pos = NULL;
390
+
391
+ /* Loop through each field by finding separator positions */
392
+ while ((sep_pos = memchr(p, sep, endP - p))) {
393
+ // Extract field boundaries
394
+ long field_len = sep_pos - startP;
395
+ char *raw_field = startP;
396
+ char *trim_start = raw_field;
397
+ char *trim_end = raw_field + field_len - 1;
398
+
399
+ // Optional whitespace trimming (spaces and tabs only)
400
+ if (strip_ws) {
401
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
402
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
403
+ }
404
+
405
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
406
+
407
+ // Create field value: use shared empty string for empty fields to reduce allocations
408
+ field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
409
+ if (all_blank && trimmed_len > 0) all_blank = false;
410
+
411
+ // Insert field directly into hash with appropriate key
412
+ VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
413
+ rb_hash_aset(hash, key, field);
414
+ element_count++;
415
+
416
+ // Move to next field
417
+ p = sep_pos + 1;
418
+ startP = p;
419
+ }
420
+
421
+ /* Process the last field (no separator after it) */
422
+ long field_len = endP - startP;
423
+ char *raw_field = startP;
424
+ char *trim_start = raw_field;
425
+ char *trim_end = raw_field + field_len - 1;
426
+
427
+ if (strip_ws) {
428
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
429
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
430
+ }
431
+
432
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
433
+
434
+ field = (trimmed_len > 0) ? rb_enc_str_new(trim_start, trimmed_len, encoding) : Qempty_string;
435
+ if (all_blank && trimmed_len > 0) all_blank = false;
436
+
437
+ VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
438
+ rb_hash_aset(hash, key, field);
439
+ element_count++;
440
+
441
+ } else {
442
+ /* ========================================
443
+ * SECTION 5: SLOW PATH - Quoted fields or multi-char separator
444
+ * ========================================
445
+ * This handles complex cases:
446
+ * - Fields containing the separator inside quotes: "hello,world"
447
+ * - Multi-character separators like "::" or "\t\t"
448
+ * - Escaped quotes using backslash: \"
449
+ *
450
+ * We must scan character-by-character to track quote state.
451
+ */
452
+ long i;
453
+ long backslash_count = 0; // Track consecutive backslashes for escape detection
454
+ bool in_quotes = false; // Are we inside a quoted field?
455
+ bool col_sep_found = true;
456
+
457
+ /* Scan through the line character by character */
458
+ while (p < endP) {
459
+ // Check if current position matches the column separator
460
+ col_sep_found = true;
461
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
462
+ if (*(p + i) != *(col_sepP + i)) {
463
+ col_sep_found = false;
464
+ break;
465
+ }
466
+ }
467
+
468
+ // Found separator and not inside quotes = end of field
469
+ if (col_sep_found && !in_quotes) {
470
+ long field_len = p - startP;
471
+ char *raw_field = startP;
472
+
473
+ // Check if field is wrapped in quotes: "value"
474
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
475
+ if (quoted) {
476
+ raw_field++; // Skip opening quote
477
+ field_len -= 2; // Exclude both quotes from length
478
+ }
479
+
480
+ char *trim_start = raw_field;
481
+ char *trim_end = raw_field + field_len - 1;
482
+
483
+ if (strip_ws) {
484
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
485
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
486
+ }
487
+
488
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
489
+
490
+ // Create field value, handling escaped quotes if present
491
+ if (trimmed_len == 0) {
492
+ field = Qempty_string;
493
+ } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
494
+ // Field contains quotes - need to unescape doubled quotes ("" -> ")
495
+ field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
496
+ all_blank = false;
497
+ } else {
498
+ field = rb_enc_str_new(trim_start, trimmed_len, encoding);
499
+ all_blank = false;
500
+ }
501
+
502
+ // Insert field directly into hash
503
+ VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
504
+ rb_hash_aset(hash, key, field);
505
+ element_count++;
506
+
507
+ // Move past the separator to start of next field
508
+ p += col_sep_len;
509
+ startP = p;
510
+ backslash_count = 0;
511
+
512
+ } else {
513
+ /* Not at a separator (or inside quotes) - track quote state */
514
+
515
+ if (*p == '\\') {
516
+ // Count consecutive backslashes for escape sequence detection
517
+ backslash_count++;
518
+ } else {
519
+ if (*p == quote_char_val) {
520
+ // Quote char toggles in_quotes state only if not escaped
521
+ // (even number of preceding backslashes = not escaped)
522
+ if (backslash_count % 2 == 0) {
523
+ in_quotes = !in_quotes;
524
+ }
525
+ }
526
+ backslash_count = 0;
527
+ }
528
+ p++;
529
+ }
530
+ }
531
+
532
+ // Error: unclosed quote at end of line
533
+ if (in_quotes) {
534
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
535
+ }
536
+
537
+ /* Process the last field (same logic as above) */
538
+ long field_len = endP - startP;
539
+ char *raw_field = startP;
540
+
541
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
542
+ if (quoted) {
543
+ raw_field++;
544
+ field_len -= 2;
545
+ }
546
+
547
+ char *trim_start = raw_field;
548
+ char *trim_end = raw_field + field_len - 1;
549
+
550
+ if (strip_ws) {
551
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
552
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
553
+ }
554
+
555
+ long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
556
+
557
+ if (trimmed_len == 0) {
558
+ field = Qempty_string;
559
+ } else if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
560
+ field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
561
+ all_blank = false;
562
+ } else {
563
+ field = rb_enc_str_new(trim_start, trimmed_len, encoding);
564
+ all_blank = false;
565
+ }
566
+
567
+ VALUE key = get_key_for_index(element_count, headers, headers_len, prefix_str);
568
+ rb_hash_aset(hash, key, field);
569
+ element_count++;
570
+ }
571
+
572
+ /* ----------------------------------------
573
+ * SECTION 6: Handle blank rows
574
+ * ----------------------------------------
575
+ * If remove_empty_hashes is enabled and all fields were blank,
576
+ * return nil instead of the hash so the row can be skipped.
577
+ */
578
+ if (remove_empty && all_blank) {
579
+ VALUE result = rb_ary_new_capa(2);
580
+ rb_ary_push(result, Qnil);
581
+ rb_ary_push(result, LONG2FIX(element_count));
582
+ return result;
583
+ }
584
+
585
+ /* ----------------------------------------
586
+ * SECTION 7: Pad hash with nil for missing columns (conditional)
587
+ * ----------------------------------------
588
+ * Only add nil for missing columns when remove_empty_values is false.
589
+ * When remove_empty_values is true, nils would be removed anyway by
590
+ * hash_transformations, so we skip this for efficiency.
591
+ */
592
+ if (!remove_empty_values) {
593
+ for (long i = element_count; i < headers_len; i++) {
594
+ rb_hash_aset(hash, rb_ary_entry(headers, i), Qnil);
595
+ }
596
+ }
597
+
598
+ /* ----------------------------------------
599
+ * SECTION 8: Return result
600
+ * ----------------------------------------
601
+ * Return [hash, element_count] so caller can detect extra columns
602
+ * (when element_count > headers_len) and extend headers if needed.
603
+ */
604
+ VALUE result = rb_ary_new_capa(2);
605
+ rb_ary_push(result, hash);
606
+ rb_ary_push(result, LONG2FIX(element_count));
607
+ return result;
608
+ }
609
+
610
+ // Count quote characters in a line, respecting backslash escapes.
611
+ // This is a performance optimization that replaces the Ruby each_char implementation
612
+ // which creates a new String object for every character in the line.
613
+ // For a 1000-char line, this eliminates ~1000 object allocations per line.
614
+ static VALUE rb_count_quote_chars(VALUE self, VALUE line, VALUE quote_char) {
615
+ if (NIL_P(line) || NIL_P(quote_char)) return INT2FIX(0);
616
+ if (RSTRING_LEN(quote_char) == 0) return INT2FIX(0);
617
+
618
+ char *str = RSTRING_PTR(line);
619
+ long len = RSTRING_LEN(line);
620
+ char qc = RSTRING_PTR(quote_char)[0];
621
+
622
+ long count = 0;
623
+ bool escaped = false;
624
+
625
+ for (long i = 0; i < len; i++) {
626
+ if (str[i] == '\\' && !escaped) {
627
+ escaped = true;
628
+ } else {
629
+ if (str[i] == qc && !escaped) count++;
630
+ escaped = false;
631
+ }
632
+ }
633
+
634
+ return LONG2FIX(count);
635
+ }
636
+
223
637
  void Init_smarter_csv(void) {
224
638
  SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
225
639
  Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
@@ -227,4 +641,7 @@ void Init_smarter_csv(void) {
227
641
  Qempty_string = rb_str_new_literal("");
228
642
  rb_gc_register_address(&Qempty_string);
229
643
  rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
644
+ rb_define_module_function(Parser, "count_quote_chars_c", rb_count_quote_chars, 2);
645
+ rb_define_module_function(Parser, "zip_to_hash_c", rb_zip_to_hash, 2);
646
+ rb_define_module_function(Parser, "parse_line_to_hash_c", rb_parse_line_to_hash, 9);
230
647
  }
Binary file
@@ -18,21 +18,22 @@ module SmarterCSV
18
18
  candidates = Hash.new(0)
19
19
  count = has_header ? 1 : 5
20
20
  count.times do
21
- line = readline_with_counts(filehandle, options)
21
+ next_line = next_line_with_counts(filehandle, options)
22
+ break if next_line.nil? # EOF reached (short files)
23
+
24
+ line = next_line
22
25
  delimiters.each do |d|
23
26
  # Count only non-quoted occurrences of the delimiter
24
27
  non_quoted_text = line.split(/#{escaped_quote}[^#{escaped_quote}]*#{escaped_quote}/).join
25
28
 
26
29
  candidates[d] += non_quoted_text.scan(d).count
27
30
  end
28
- rescue EOFError # short files
29
- break
30
31
  end
31
32
  rewind(filehandle)
32
33
 
33
34
  if candidates.values.max == 0
34
- # if the header only contains
35
- return ',' if line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
35
+ # if the header only contains word characters and whitespace, assume comma separator
36
+ return ',' if line && line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
36
37
 
37
38
  raise SmarterCSV::NoColSepDetected
38
39
  end
@@ -7,8 +7,24 @@ module SmarterCSV
7
7
  class HeaderSizeMismatch < SmarterCSVException; end
8
8
  class IncorrectOption < SmarterCSVException; end
9
9
  class ValidationError < SmarterCSVException; end
10
- class DuplicateHeaders < SmarterCSVException; end
11
- class MissingKeys < SmarterCSVException; end # previously known as MissingHeaders
10
# Error for duplicate headers. Besides the message it carries the headers that
# were passed in when the error was raised (presumably the duplicated ones --
# confirm at the raise site).
class DuplicateHeaders < SmarterCSVException
  # @return [Array] headers given to the constructor (empty when none were given)
  attr_reader :headers

  # Both arguments are optional so that a bare `raise DuplicateHeaders`, which
  # instantiates the class without arguments, keeps working.
  #
  # @param message [String, nil] error message; nil uses Ruby's default message
  # @param headers [Array] headers to expose through #headers
  def initialize(message = nil, headers = [])
    super(message)
    @headers = headers
  end
end
18
+
19
# Error for missing keys (previously known as MissingHeaders). Besides the
# message it carries the keys that were passed in when the error was raised
# (presumably the missing ones -- confirm at the raise site).
class MissingKeys < SmarterCSVException
  # @return [Array] keys given to the constructor (empty when none were given)
  attr_reader :keys

  # Both arguments are optional so that a bare `raise MissingKeys`, which
  # instantiates the class without arguments, keeps working.
  #
  # @param message [String, nil] error message; nil uses Ruby's default message
  # @param keys [Array] keys to expose through #keys
  def initialize(message = nil, keys = [])
    super(message)
    @keys = keys
  end
end
27
+
12
28
  class NoColSepDetected < SmarterCSVException; end
13
29
  class KeyMappingError < SmarterCSVException; end
14
30
  class MalformedCSV < SmarterCSVException; end
@@ -4,8 +4,10 @@ module SmarterCSV
4
4
  module FileIO
5
5
  protected
6
6
 
7
- def readline_with_counts(filehandle, options)
8
- line = filehandle.readline(options[:row_sep])
7
+ def next_line_with_counts(filehandle, options)
8
+ line = filehandle.gets(options[:row_sep])
9
+ return nil if line.nil? # EOF reached
10
+
9
11
  @file_line_count += 1
10
12
  @csv_line_count += 1
11
13
  line = remove_bom(line) if @csv_line_count == 1
@@ -14,7 +16,7 @@ module SmarterCSV
14
16
 
15
17
  def skip_lines(filehandle, options)
16
18
  options[:skip_lines].to_i.times do
17
- readline_with_counts(filehandle, options)
19
+ next_line_with_counts(filehandle, options)
18
20
  end
19
21
  end
20
22