RubyGems - smarter_csv - Versions diffs - 1.14.2 → 1.14.4 - Mend

smarter_csv 1.14.2 → 1.14.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +12 -1
data/CONTRIBUTORS.md +1 -0
data/ext/smarter_csv/extconf.rb +3 -1
data/ext/smarter_csv/smarter_csv.c +159 -35
data/lib/smarter_csv/auto_detection.rb +1 -1
data/lib/smarter_csv/parser.rb +14 -7
data/lib/smarter_csv/reader.rb +1 -2
data/lib/smarter_csv/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4b6c0ad6a61721dac33f4ef31cf34da3cdd221804aaa45ff8e49cf7b5894b539
-  data.tar.gz: 8c1307aa7a74fc4f434eec362519dcb4392774aa35f563fc2674d542f6b4ea62
+  metadata.gz: a84cfa57008a6f9f05ee82eeef9edb4dab5993874a38c456ce78d8da999280aa
+  data.tar.gz: 7e8569670615a6ff7fb63c152d8e849069bcde318edf885d95cef52b5513f52c
 SHA512:
-  metadata.gz: 8b7cef2ec65c990d3f6c8b05acaefa328952e70571c5868293531ac51c44fd13306dfad5e976f786eb61d206214bc7eae91cf8c2e00023a26c75d689769ed684
-  data.tar.gz: e204946071d76d264b0c8206b1d9170eb7b5c89bb25f22975322fbdc0c4d52869f4b7cf06245d5143298c6d3a89b8c2bc6aa6529fc32f0895e57e1d2ddef97d1
+  metadata.gz: 3d1baa73a0120824390f062e38d3acab8dfc7a09d48fbbdd647466605ccd543ca839e61cbefca9929d2dafce9a61b12e50b62d9e034aadd047a46e1886728980
+  data.tar.gz: 9756f2fdd15e619ba98011370a3410781b7771edd0656dd1bcb1226ebf899726ec00f43627dcc16756d986a962e9eb3e84c7e901ef5aecf12ede9f077f7a8423

data/CHANGELOG.md CHANGED Viewed

@@ -1,8 +1,19 @@
 # SmarterCSV 1.x Change Log
+## 1.14.4 (2025-05-26)
+ * Bugfix: SmarterCSV::Reader fixing issue with header containing spaces ([PR 305](https://github.com/tilo/smarter_csv/pull/305) thanks to Felipe Cabezudo)
+## 1.14.3 (2025-05-04)
+ * Improved C-extension parsing logic:
+   - Added fast path for unquoted fields to avoid unnecessary quote checks.
+   - Added inline whitespace stripping inside the C parser
+ * Performance
+   -  Significantly reduced per-line overhead in non-quoted, wide CSVs (e.g. fixed-width data exports).
+   - Benchmarks show ~10–40% speedup over v1.14.2 depending on structure and quoting.
 ## 1.14.2 (2025-04-10)
- * bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true`
+ * bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true` ([issue 301](https://github.com/tilo/smarter_csv/issues/301))
  * new option: `header_converter` allows you to programmatically modify the headers
 ## 1.14.1 (2025-04-09)

data/CONTRIBUTORS.md CHANGED Viewed

@@ -59,3 +59,4 @@ A Big Thank you to everyone who filed issues, sent comments, and who contributed
  * [Randall B](https://github.com/randall-coding)
  * [Matthew Kennedy](https://github.com/MattKitmanLabs)
  * [Robert Reiz](https://github.com/reiz)
+ * [Felipe Cabezudo](https://github.com/felipekb)

data/ext/smarter_csv/extconf.rb CHANGED Viewed

@@ -9,6 +9,8 @@ if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
   RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
 end
-CONFIG["optflags"] = "-O3"
+# CONFIG["optflags"] = "-O3 -march=native -flto"
+CONFIG["optflags"] = "-O3 -march=native -flto -fomit-frame-pointer -DNDEBUG"
+CONFIG["debugflags"] = ""
 create_makefile('smarter_csv/smarter_csv')

data/ext/smarter_csv/smarter_csv.c CHANGED Viewed

@@ -2,6 +2,7 @@
 #include "ruby/encoding.h"
 #include <stdio.h>
 #include <stdbool.h>
+#include <string.h>
 #ifndef bool
   #define bool int
@@ -12,8 +13,25 @@
 VALUE SmarterCSV = Qnil;
 VALUE eMalformedCSVError = Qnil;
 VALUE Parser = Qnil;
+VALUE Qempty_string = Qnil; // shared frozen empty string
+static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
+  char *buf = ALLOC_N(char, len);
+  long j = 0;
+  for (long i = 0; i < len; i++) {
+    if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
+      buf[j++] = quote_char;
+      i++; // skip second quote
+    } else {
+      buf[j++] = str[i];
+    }
+  }
+  VALUE out = rb_enc_str_new(buf, j, encoding);
+  xfree(buf);
+  return out;
+}
-static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
+static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
   if (RB_TYPE_P(line, T_NIL) == 1) {
     return rb_ary_new();
   }
@@ -22,74 +40,180 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
     rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
   }
-  rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
-  char *startP = RSTRING_PTR(line); /* may not be null terminated */
+  rb_encoding *encoding = rb_enc_get(line);
+  char *startP = RSTRING_PTR(line);
   long line_len = RSTRING_LEN(line);
-  char *endP = startP + line_len; /* points behind the string */
+  char *endP = startP + line_len;
   char *p = startP;
   char *col_sepP = RSTRING_PTR(col_sep);
   long col_sep_len = RSTRING_LEN(col_sep);
   char *quoteP = RSTRING_PTR(quote_char);
-  long quote_count = 0;
-  bool col_sep_found = true;
+  char quote_char_val = quoteP[0];
+  size_t quote_len = strlen(quoteP);
   VALUE elements = rb_ary_new();
   VALUE field;
-  long i;
-  /* Variables for escaped quote handling */
+  long element_count = 0;
+  int max_fields = -1;
+  if (max_size != Qnil) {
+    max_fields = NUM2INT(max_size);
+    if (max_fields < 0) {
+      return rb_ary_new();
+    }
+  }
+  bool has_quotes = RTEST(has_quotes_val);
+  bool strip_ws = RTEST(strip_ws_val);
+  // === FAST PATH: No quotes and single-character separator ===
+  if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
+    char sep = *col_sepP;
+    char *sep_pos = NULL;
+    while ((sep_pos = memchr(p, sep, endP - p))) {
+      if ((max_fields >= 0) && (element_count >= max_fields)) {
+        break;
+      }
+      long field_len = sep_pos - startP;
+      char *raw_field = startP;
+      char *trim_start = raw_field;
+      char *trim_end = raw_field + field_len - 1;
+      if (strip_ws) {
+        while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
+        while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
+      }
+      long trimmed_len = trim_end - trim_start + 1;
+      field = rb_enc_str_new(trim_start, trimmed_len, encoding);
+      rb_ary_push(elements, field);
+      element_count++;
+      p = sep_pos + 1;
+      startP = p;
+    }
+    if ((max_fields < 0) || (element_count < max_fields)) {
+      long field_len = endP - startP;
+      char *raw_field = startP;
+      char *trim_start = raw_field;
+      char *trim_end = raw_field + field_len - 1;
+      if (strip_ws) {
+        while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
+        while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
+      }
+      long trimmed_len = trim_end - trim_start + 1;
+      field = rb_enc_str_new(trim_start, trimmed_len, encoding);
+      rb_ary_push(elements, field);
+    }
+    return elements;
+  }
+  // === SLOW PATH: Quoted fields or multi-char separator ===
+  long i;
   long backslash_count = 0;
   bool in_quotes = false;
+  bool col_sep_found = true;
   while (p < endP) {
-    /* does the remaining string start with col_sep ? */
     col_sep_found = true;
-    for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
-      col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
+    for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
+      if (*(p + i) != *(col_sepP + i)) {
+        col_sep_found = false;
+        break;
+      }
     }
-    /* if col_sep was found and we're not inside quotes */
     if (col_sep_found && !in_quotes) {
-      /* if max_size != nil && elements.size >= header_size */
-      if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
+      if ((max_fields >= 0) && (element_count >= max_fields)) {
         break;
-      } else {
-        /* push that field with original encoding onto the results */
-        field = rb_enc_str_new(startP, p - startP, encoding);
-        rb_ary_push(elements, field);
+      }
+      long field_len = p - startP;
+      char *raw_field = startP;
-        p += col_sep_len;
-        startP = p;
-        backslash_count = 0; // Reset backslash count at the start of a new field
+      bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
+      if (quoted) {
+        raw_field++;
+        field_len -= 2;
+      }
+      char *trim_start = raw_field;
+      char *trim_end = raw_field + field_len - 1;
+      if (strip_ws) {
+        while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
+        while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
+      }
+      long trimmed_len = trim_end - trim_start + 1;
+      if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
+        field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
+      } else {
+        field = rb_enc_str_new(trim_start, trimmed_len, encoding);
       }
+      rb_ary_push(elements, field);
+      element_count++;
+      p += col_sep_len;
+      startP = p;
+      backslash_count = 0;
     } else {
       if (*p == '\\') {
         backslash_count++;
       } else {
-        if (*p == *quoteP) {
+        if (*p == quote_char_val) {
           if (backslash_count % 2 == 0) {
-            /* Even number of backslashes means quote is not escaped */
             in_quotes = !in_quotes;
           }
-          /* Else, quote is escaped; do nothing */
         }
-        backslash_count = 0; // Reset after any character other than backslash
+        backslash_count = 0;
       }
       p++;
     }
-  } /* while */
+  }
-  /* Check for unclosed quotes at the end of the line */
   if (in_quotes) {
     rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
   }
-  /* check if the last part of the line needs to be processed */
-  if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
-    /* copy the remaining line as a field with original encoding onto the results */
-    field = rb_enc_str_new(startP, endP - startP, encoding);
+  if ((max_fields < 0) || (element_count < max_fields)) {
+    long field_len = endP - startP;
+    char *raw_field = startP;
+    bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
+    if (quoted) {
+      raw_field++;
+      field_len -= 2;
+    }
+    char *trim_start = raw_field;
+    char *trim_end = raw_field + field_len - 1;
+    if (strip_ws) {
+      while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
+      while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
+    }
+    long trimmed_len = trim_end - trim_start + 1;
+    if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
+      field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
+    } else {
+      field = rb_enc_str_new(trim_start, trimmed_len, encoding);
+    }
     rb_ary_push(elements, field);
   }
@@ -97,10 +221,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
 }
 void Init_smarter_csv(void) {
-  // these modules and the error class are already defined in Ruby code, make them accessible:
   SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
   Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
   eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
-  rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
+  Qempty_string = rb_str_new_literal("");
+  rb_gc_register_address(&Qempty_string);
+  rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
 }

data/lib/smarter_csv/auto_detection.rb CHANGED Viewed

@@ -32,7 +32,7 @@ module SmarterCSV
       if candidates.values.max == 0
         # if the header only contains a single column name (no separator candidate was found), fall back to ','
-        return ',' if line.chomp(options[:row_sep]) =~ /^\w+$/
+        return ',' if line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
         raise SmarterCSV::NoColSepDetected
       end

data/lib/smarter_csv/parser.rb CHANGED Viewed

@@ -2,6 +2,8 @@
 module SmarterCSV
   module Parser
+    EMPTY_STRING = ''.freeze
     protected
     ###
@@ -11,17 +13,16 @@ module SmarterCSV
     ###
     def parse(line, options, header_size = nil)
       # puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
+      has_quotes = line.include?(options[:quote_char])
       if options[:acceleration] && has_acceleration
         # :nocov:
-        has_quotes = line =~ /#{options[:quote_char]}/
-        elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
-        elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
+        elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace])
         [elements, elements.size]
         # :nocov:
       else
         # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
-        parse_csv_line_ruby(line, options, header_size)
+        parse_csv_line_ruby(line, options, header_size, has_quotes)
       end
     end
@@ -46,7 +47,7 @@ module SmarterCSV
     #
     # Our convention is that empty fields are returned as empty strings, not as nil.
-    def parse_csv_line_ruby(line, options, header_size = nil)
+    def parse_csv_line_ruby(line, options, header_size = nil, has_quotes = false)
       return [[], 0] if line.nil?
       line_size = line.size
@@ -98,11 +99,13 @@ module SmarterCSV
         elements << cleanup_quotes(line[start..-1], quote)
       end
+      elements.map!(&:strip) if options[:strip_whitespace]
       [elements, elements.size]
     end
     def cleanup_quotes(field, quote)
-      return field if field.nil?
+      return nil if field.nil?
+      return EMPTY_STRING if field.empty?
       # Remove surrounding quotes if present
       if field.start_with?(quote) && field.end_with?(quote)
@@ -110,9 +113,13 @@ module SmarterCSV
       end
       # Replace double quotes with a single quote
-      field.gsub!((quote * 2).to_s, quote)
+      field.gsub!(doubled_quote(quote), quote)
       field
     end
+    def doubled_quote(quote)
+      @doubled_quote ||= (quote * 2).to_s.freeze
+    end
   end
 end

data/lib/smarter_csv/reader.rb CHANGED Viewed

@@ -128,6 +128,7 @@ module SmarterCSV
           line.chomp!(options[:row_sep])
           # --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
+          # we are now stripping whitespace inside the parse() methods
           dataA, data_size = parse(line, options) # we parse the extra columns
           if options[:strict]
@@ -141,8 +142,6 @@ module SmarterCSV
             end
           end
-          dataA.map!{|x| x.strip} if options[:strip_whitespace]
           # if all values are blank, then ignore this line
           next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))

data/lib/smarter_csv/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module SmarterCSV
-  VERSION = "1.14.2"
+  VERSION = "1.14.4"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: smarter_csv
 version: !ruby/object:Gem::Version
-  version: 1.14.2
+  version: 1.14.4
 platform: ruby
 authors:
 - Tilo Sloboda
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-04-10 00:00:00.000000000 Z
+date: 2025-05-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: awesome_print