smarter_csv 1.14.2 → 1.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4b6c0ad6a61721dac33f4ef31cf34da3cdd221804aaa45ff8e49cf7b5894b539
4
- data.tar.gz: 8c1307aa7a74fc4f434eec362519dcb4392774aa35f563fc2674d542f6b4ea62
3
+ metadata.gz: 79d1b26219dfb35f385c30eea361bc90d4b1b6cfc1030b6abb9339cc99eb39de
4
+ data.tar.gz: ab0915f4193b657c7e8380ea3bc924a4efb40f9171fb3f120aef462d6f464370
5
5
  SHA512:
6
- metadata.gz: 8b7cef2ec65c990d3f6c8b05acaefa328952e70571c5868293531ac51c44fd13306dfad5e976f786eb61d206214bc7eae91cf8c2e00023a26c75d689769ed684
7
- data.tar.gz: e204946071d76d264b0c8206b1d9170eb7b5c89bb25f22975322fbdc0c4d52869f4b7cf06245d5143298c6d3a89b8c2bc6aa6529fc32f0895e57e1d2ddef97d1
6
+ metadata.gz: 722a598f1581423d4295eb3ce6f46bf556fbf0206dbd4229eceff68625f60ad83d7e3f2dda3a5b6d61c18be2ff6d3d0fb495c46427bb2cc20fcf5f72fa58e317
7
+ data.tar.gz: f61d37aa5a64e37601a7b12ce57a528bc26fec8ee643e4171040f0e9021a2fe4fdeec71698638e98e29da27248e51e77792ed5053bb252716298219f4b095e85
data/CHANGELOG.md CHANGED
@@ -1,8 +1,16 @@
1
1
 
2
2
  # SmarterCSV 1.x Change Log
3
3
 
4
+ ## 1.14.3 (2025-05-04)
5
+ * Improved C-extension parsing logic:
6
+ - Added fast path for unquoted fields to avoid unnecessary quote checks.
7
+ - Added inline whitespace stripping inside the C parser
8
+ * Performance
9
+ - Significantly reduced per-line overhead in non-quoted, wide CSVs (e.g. fixed-width data exports).
10
+ - Benchmarks show ~10–40% speedup over v1.14.2 depending on structure and quoting.
11
+
4
12
  ## 1.14.2 (2025-04-10)
5
- * bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true`
13
+ * bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true` ([issue 301](https://github.com/tilo/smarter_csv/issues/301))
6
14
  * new option: `header_converter` allows you to programmatically modify the headers
7
15
 
8
16
  ## 1.14.1 (2025-04-09)
@@ -0,0 +1,270 @@
1
+
2
+ SHELL = /bin/sh
3
+
4
+ # V=0 quiet, V=1 verbose. other values don't work.
5
+ V = 0
6
+ V0 = $(V:0=)
7
+ Q1 = $(V:1=)
8
+ Q = $(Q1:0=@)
9
+ ECHO1 = $(V:1=@ :)
10
+ ECHO = $(ECHO1:0=@ echo)
11
+ NULLCMD = :
12
+
13
+ #### Start of system configuration section. ####
14
+
15
+ srcdir = .
16
+ topdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0
17
+ hdrdir = $(topdir)
18
+ arch_hdrdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0/arm64-darwin23
19
+ PATH_SEPARATOR = :
20
+ VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
21
+ prefix = $(DESTDIR)/Users/tilo/.rvm/rubies/ruby-3.2.2
22
+ rubysitearchprefix = $(rubylibprefix)/$(sitearch)
23
+ rubyarchprefix = $(rubylibprefix)/$(arch)
24
+ rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
25
+ exec_prefix = $(prefix)
26
+ vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
27
+ sitearchhdrdir = $(sitehdrdir)/$(sitearch)
28
+ rubyarchhdrdir = $(rubyhdrdir)/$(arch)
29
+ vendorhdrdir = $(rubyhdrdir)/vendor_ruby
30
+ sitehdrdir = $(rubyhdrdir)/site_ruby
31
+ rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
32
+ vendorarchdir = $(vendorlibdir)/$(sitearch)
33
+ vendorlibdir = $(vendordir)/$(ruby_version)
34
+ vendordir = $(rubylibprefix)/vendor_ruby
35
+ sitearchdir = $(sitelibdir)/$(sitearch)
36
+ sitelibdir = $(sitedir)/$(ruby_version)
37
+ sitedir = $(rubylibprefix)/site_ruby
38
+ rubyarchdir = $(rubylibdir)/$(arch)
39
+ rubylibdir = $(rubylibprefix)/$(ruby_version)
40
+ sitearchincludedir = $(includedir)/$(sitearch)
41
+ archincludedir = $(includedir)/$(arch)
42
+ sitearchlibdir = $(libdir)/$(sitearch)
43
+ archlibdir = $(libdir)/$(arch)
44
+ ridir = $(datarootdir)/$(RI_BASE_NAME)
45
+ mandir = $(datarootdir)/man
46
+ localedir = $(datarootdir)/locale
47
+ libdir = $(exec_prefix)/lib
48
+ psdir = $(docdir)
49
+ pdfdir = $(docdir)
50
+ dvidir = $(docdir)
51
+ htmldir = $(docdir)
52
+ infodir = $(datarootdir)/info
53
+ docdir = $(datarootdir)/doc/$(PACKAGE)
54
+ oldincludedir = $(DESTDIR)/usr/include
55
+ includedir = $(SDKROOT)$(prefix)/include
56
+ runstatedir = $(localstatedir)/run
57
+ localstatedir = $(prefix)/var
58
+ sharedstatedir = $(prefix)/com
59
+ sysconfdir = $(prefix)/etc
60
+ datadir = $(datarootdir)
61
+ datarootdir = $(prefix)/share
62
+ libexecdir = $(exec_prefix)/libexec
63
+ sbindir = $(exec_prefix)/sbin
64
+ bindir = $(exec_prefix)/bin
65
+ archdir = $(rubyarchdir)
66
+
67
+
68
+ CC_WRAPPER =
69
+ CC = gcc
70
+ CXX = g++
71
+ LIBRUBY = $(LIBRUBY_SO)
72
+ LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
73
+ LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
74
+ LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation $(MAINLIBS)
75
+ empty =
76
+ OUTFLAG = -o $(empty)
77
+ COUTFLAG = -o $(empty)
78
+ CSRCFLAG = $(empty)
79
+
80
+ RUBY_EXTCONF_H =
81
+ cflags = -fdeclspec $(optflags) $(debugflags) $(warnflags)
82
+ cxxflags =
83
+ optflags = -O3
84
+ debugflags = -ggdb3
85
+ warnflags = -Wall -Wextra -Wextra-tokens -Wdeprecated-declarations -Wdivision-by-zero -Wdiv-by-zero -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wshorten-64-to-32 -Wwrite-strings -Wold-style-definition -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wunused-variable -Wundef
86
+ cppflags =
87
+ CCDLFLAGS = -fno-common
88
+ CFLAGS = $(CCDLFLAGS) -O3 -I/opt/homebrew/opt/libyaml/include -I/opt/homebrew/opt/libksba/include -I/opt/homebrew/opt/readline/include -I/opt/homebrew/opt/zlib/include -I/opt/homebrew/opt/openssl@1.1/include $(cflags) -fno-common -pipe $(ARCH_FLAG)
89
+ INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
90
+ DEFS =
91
+ CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
92
+ CXXFLAGS = $(CCDLFLAGS) -fdeclspec $(ARCH_FLAG)
93
+ ldflags = -L. -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -fstack-protector-strong
94
+ dldflags = -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -Wl,-undefined,dynamic_lookup $(LIBRUBYARG_SHARED)
95
+ ARCH_FLAG =
96
+ DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
97
+ LDSHARED = $(CC) -dynamic -bundle
98
+ LDSHAREDXX = $(CXX) -dynamic -bundle
99
+ AR = ar
100
+ EXEEXT =
101
+
102
+ RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
103
+ RUBY_SO_NAME = ruby.3.2
104
+ RUBYW_INSTALL_NAME =
105
+ RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
106
+ RUBYW_BASE_NAME = rubyw
107
+ RUBY_BASE_NAME = ruby
108
+
109
+ arch = arm64-darwin23
110
+ sitearch = $(arch)
111
+ ruby_version = 3.2.0
112
+ ruby = $(bindir)/$(RUBY_BASE_NAME)
113
+ RUBY = $(ruby)
114
+ BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
115
+ ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
116
+
117
+ RM = rm -f
118
+ RM_RF = rm -fr
119
+ RMDIRS = rmdir -p
120
+ MAKEDIRS = /opt/homebrew/opt/coreutils/bin/gmkdir -p
121
+ INSTALL = /opt/homebrew/opt/coreutils/bin/ginstall -c
122
+ INSTALL_PROG = $(INSTALL) -m 0755
123
+ INSTALL_DATA = $(INSTALL) -m 644
124
+ COPY = cp
125
+ TOUCH = exit >
126
+
127
+ #### End of system configuration section. ####
128
+
129
+ preload =
130
+ libpath = . $(libdir)
131
+ LIBPATH = -L. -L$(libdir)
132
+ DEFFILE =
133
+
134
+ CLEANFILES = mkmf.log
135
+ DISTCLEANFILES =
136
+ DISTCLEANDIRS =
137
+
138
+ extout =
139
+ extout_prefix =
140
+ target_prefix = /smarter_csv
141
+ LOCAL_LIBS =
142
+ LIBS = $(LIBRUBYARG_SHARED) -lpthread
143
+ ORIG_SRCS = smarter_csv.c
144
+ SRCS = $(ORIG_SRCS)
145
+ OBJS = smarter_csv.o
146
+ HDRS =
147
+ LOCAL_HDRS =
148
+ TARGET = smarter_csv
149
+ TARGET_NAME = smarter_csv
150
+ TARGET_ENTRY = Init_$(TARGET_NAME)
151
+ DLLIB = $(TARGET).bundle
152
+ EXTSTATIC =
153
+ STATIC_LIB =
154
+
155
+ TIMESTAMP_DIR = .
156
+ BINDIR = $(bindir)
157
+ RUBYCOMMONDIR = $(sitedir)$(target_prefix)
158
+ RUBYLIBDIR = $(sitelibdir)$(target_prefix)
159
+ RUBYARCHDIR = $(sitearchdir)$(target_prefix)
160
+ HDRDIR = $(sitehdrdir)$(target_prefix)
161
+ ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
162
+ TARGET_SO_DIR =
163
+ TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
164
+ CLEANLIBS = $(TARGET_SO) $(TARGET_SO).dSYM
165
+ CLEANOBJS = $(OBJS) *.bak
166
+ TARGET_SO_DIR_TIMESTAMP = $(TIMESTAMP_DIR)/.sitearchdir.-.smarter_csv.time
167
+
168
+ all: $(DLLIB)
169
+ static: $(STATIC_LIB)
170
+ .PHONY: all install static install-so install-rb
171
+ .PHONY: clean clean-so clean-static clean-rb
172
+
173
+ clean-static::
174
+ clean-rb-default::
175
+ clean-rb::
176
+ clean-so::
177
+ clean: clean-so clean-static clean-rb-default clean-rb
178
+ -$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
179
+
180
+ distclean-rb-default::
181
+ distclean-rb::
182
+ distclean-so::
183
+ distclean-static::
184
+ distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
185
+ -$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
186
+ -$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
187
+ -$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
188
+
189
+ realclean: distclean
190
+ install: install-so install-rb
191
+
192
+ install-so: $(DLLIB) $(TARGET_SO_DIR_TIMESTAMP)
193
+ $(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
194
+ clean-static::
195
+ -$(Q)$(RM) $(STATIC_LIB)
196
+ install-rb: pre-install-rb do-install-rb install-rb-default
197
+ install-rb-default: pre-install-rb-default do-install-rb-default
198
+ pre-install-rb: Makefile
199
+ pre-install-rb-default: Makefile
200
+ do-install-rb:
201
+ do-install-rb-default:
202
+ pre-install-rb-default:
203
+ @$(NULLCMD)
204
+ $(TARGET_SO_DIR_TIMESTAMP):
205
+ $(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
206
+ $(Q) $(TOUCH) $@
207
+
208
+ site-install: site-install-so site-install-rb
209
+ site-install-so: install-so
210
+ site-install-rb: install-rb
211
+
212
+ .SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
213
+
214
+ .cc.o:
215
+ $(ECHO) compiling $(<)
216
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
217
+
218
+ .cc.S:
219
+ $(ECHO) translating $(<)
220
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
221
+
222
+ .mm.o:
223
+ $(ECHO) compiling $(<)
224
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
225
+
226
+ .mm.S:
227
+ $(ECHO) translating $(<)
228
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
229
+
230
+ .cxx.o:
231
+ $(ECHO) compiling $(<)
232
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
233
+
234
+ .cxx.S:
235
+ $(ECHO) translating $(<)
236
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
237
+
238
+ .cpp.o:
239
+ $(ECHO) compiling $(<)
240
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
241
+
242
+ .cpp.S:
243
+ $(ECHO) translating $(<)
244
+ $(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
245
+
246
+ .c.o:
247
+ $(ECHO) compiling $(<)
248
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
249
+
250
+ .c.S:
251
+ $(ECHO) translating $(<)
252
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
253
+
254
+ .m.o:
255
+ $(ECHO) compiling $(<)
256
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
257
+
258
+ .m.S:
259
+ $(ECHO) translating $(<)
260
+ $(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
261
+
262
+ $(TARGET_SO): $(OBJS) Makefile
263
+ $(ECHO) linking shared-object smarter_csv/$(DLLIB)
264
+ -$(Q)$(RM) $(@)
265
+ $(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
266
+ $(Q) $(POSTLINK)
267
+
268
+
269
+
270
+ $(OBJS): $(HDRS) $(ruby_headers)
@@ -9,6 +9,8 @@ if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
9
9
  RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
10
10
  end
11
11
 
12
- CONFIG["optflags"] = "-O3"
12
+ # CONFIG["optflags"] = "-O3 -march=native -flto"
13
+ CONFIG["optflags"] = "-O3 -march=native -flto -fomit-frame-pointer -DNDEBUG"
14
+ CONFIG["debugflags"] = ""
13
15
 
14
16
  create_makefile('smarter_csv/smarter_csv')
@@ -2,6 +2,7 @@
2
2
  #include "ruby/encoding.h"
3
3
  #include <stdio.h>
4
4
  #include <stdbool.h>
5
+ #include <string.h>
5
6
 
6
7
  #ifndef bool
7
8
  #define bool int
@@ -12,8 +13,25 @@
12
13
  VALUE SmarterCSV = Qnil;
13
14
  VALUE eMalformedCSVError = Qnil;
14
15
  VALUE Parser = Qnil;
16
+ VALUE Qempty_string = Qnil; // shared frozen empty string
17
+
18
+ static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
19
+ char *buf = ALLOC_N(char, len);
20
+ long j = 0;
21
+ for (long i = 0; i < len; i++) {
22
+ if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
23
+ buf[j++] = quote_char;
24
+ i++; // skip second quote
25
+ } else {
26
+ buf[j++] = str[i];
27
+ }
28
+ }
29
+ VALUE out = rb_enc_str_new(buf, j, encoding);
30
+ xfree(buf);
31
+ return out;
32
+ }
15
33
 
16
- static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
34
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
17
35
  if (RB_TYPE_P(line, T_NIL) == 1) {
18
36
  return rb_ary_new();
19
37
  }
@@ -22,74 +40,180 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
22
40
  rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
23
41
  }
24
42
 
25
- rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
26
- char *startP = RSTRING_PTR(line); /* may not be null terminated */
43
+ rb_encoding *encoding = rb_enc_get(line);
44
+ char *startP = RSTRING_PTR(line);
27
45
  long line_len = RSTRING_LEN(line);
28
- char *endP = startP + line_len; /* points behind the string */
46
+ char *endP = startP + line_len;
29
47
  char *p = startP;
30
48
 
31
49
  char *col_sepP = RSTRING_PTR(col_sep);
32
50
  long col_sep_len = RSTRING_LEN(col_sep);
33
51
 
34
52
  char *quoteP = RSTRING_PTR(quote_char);
35
- long quote_count = 0;
36
-
37
- bool col_sep_found = true;
53
+ char quote_char_val = quoteP[0];
54
+ size_t quote_len = strlen(quoteP);
38
55
 
39
56
  VALUE elements = rb_ary_new();
40
57
  VALUE field;
41
- long i;
42
58
 
43
- /* Variables for escaped quote handling */
59
+ long element_count = 0;
60
+ int max_fields = -1;
61
+ if (max_size != Qnil) {
62
+ max_fields = NUM2INT(max_size);
63
+ if (max_fields < 0) {
64
+ return rb_ary_new();
65
+ }
66
+ }
67
+
68
+ bool has_quotes = RTEST(has_quotes_val);
69
+ bool strip_ws = RTEST(strip_ws_val);
70
+
71
+ // === FAST PATH: No quotes and single-character separator ===
72
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
73
+ char sep = *col_sepP;
74
+ char *sep_pos = NULL;
75
+
76
+ while ((sep_pos = memchr(p, sep, endP - p))) {
77
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
78
+ break;
79
+ }
80
+
81
+ long field_len = sep_pos - startP;
82
+ char *raw_field = startP;
83
+ char *trim_start = raw_field;
84
+ char *trim_end = raw_field + field_len - 1;
85
+
86
+ if (strip_ws) {
87
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
88
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
89
+ }
90
+
91
+ long trimmed_len = trim_end - trim_start + 1;
92
+
93
+ field = rb_enc_str_new(trim_start, trimmed_len, encoding);
94
+ rb_ary_push(elements, field);
95
+ element_count++;
96
+
97
+ p = sep_pos + 1;
98
+ startP = p;
99
+ }
100
+
101
+ if ((max_fields < 0) || (element_count < max_fields)) {
102
+ long field_len = endP - startP;
103
+ char *raw_field = startP;
104
+ char *trim_start = raw_field;
105
+ char *trim_end = raw_field + field_len - 1;
106
+
107
+ if (strip_ws) {
108
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
109
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
110
+ }
111
+
112
+ long trimmed_len = trim_end - trim_start + 1;
113
+
114
+ field = rb_enc_str_new(trim_start, trimmed_len, encoding);
115
+ rb_ary_push(elements, field);
116
+ }
117
+
118
+ return elements;
119
+ }
120
+
121
+ // === SLOW PATH: Quoted fields or multi-char separator ===
122
+ long i;
44
123
  long backslash_count = 0;
45
124
  bool in_quotes = false;
125
+ bool col_sep_found = true;
46
126
 
47
127
  while (p < endP) {
48
- /* does the remaining string start with col_sep ? */
49
128
  col_sep_found = true;
50
- for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
51
- col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
129
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
130
+ if (*(p + i) != *(col_sepP + i)) {
131
+ col_sep_found = false;
132
+ break;
133
+ }
52
134
  }
53
- /* if col_sep was found and we're not inside quotes */
135
+
54
136
  if (col_sep_found && !in_quotes) {
55
- /* if max_size != nil && elements.size >= header_size */
56
- if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
137
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
57
138
  break;
58
- } else {
59
- /* push that field with original encoding onto the results */
60
- field = rb_enc_str_new(startP, p - startP, encoding);
61
- rb_ary_push(elements, field);
139
+ }
140
+
141
+ long field_len = p - startP;
142
+ char *raw_field = startP;
62
143
 
63
- p += col_sep_len;
64
- startP = p;
65
- backslash_count = 0; // Reset backslash count at the start of a new field
144
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
145
+ if (quoted) {
146
+ raw_field++;
147
+ field_len -= 2;
148
+ }
149
+
150
+ char *trim_start = raw_field;
151
+ char *trim_end = raw_field + field_len - 1;
152
+
153
+ if (strip_ws) {
154
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
155
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
156
+ }
157
+
158
+ long trimmed_len = trim_end - trim_start + 1;
159
+
160
+ if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
161
+ field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
162
+ } else {
163
+ field = rb_enc_str_new(trim_start, trimmed_len, encoding);
66
164
  }
165
+
166
+ rb_ary_push(elements, field);
167
+ element_count++;
168
+
169
+ p += col_sep_len;
170
+ startP = p;
171
+ backslash_count = 0;
67
172
  } else {
68
173
  if (*p == '\\') {
69
174
  backslash_count++;
70
175
  } else {
71
- if (*p == *quoteP) {
176
+ if (*p == quote_char_val) {
72
177
  if (backslash_count % 2 == 0) {
73
- /* Even number of backslashes means quote is not escaped */
74
178
  in_quotes = !in_quotes;
75
179
  }
76
- /* Else, quote is escaped; do nothing */
77
180
  }
78
- backslash_count = 0; // Reset after any character other than backslash
181
+ backslash_count = 0;
79
182
  }
80
183
  p++;
81
184
  }
82
- } /* while */
185
+ }
83
186
 
84
- /* Check for unclosed quotes at the end of the line */
85
187
  if (in_quotes) {
86
188
  rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
87
189
  }
88
190
 
89
- /* check if the last part of the line needs to be processed */
90
- if ((max_size == Qnil) || RARRAY_LEN(elements) < NUM2INT(max_size)) {
91
- /* copy the remaining line as a field with original encoding onto the results */
92
- field = rb_enc_str_new(startP, endP - startP, encoding);
191
+ if ((max_fields < 0) || (element_count < max_fields)) {
192
+ long field_len = endP - startP;
193
+ char *raw_field = startP;
194
+
195
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
196
+ if (quoted) {
197
+ raw_field++;
198
+ field_len -= 2;
199
+ }
200
+
201
+ char *trim_start = raw_field;
202
+ char *trim_end = raw_field + field_len - 1;
203
+
204
+ if (strip_ws) {
205
+ while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
206
+ while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
207
+ }
208
+
209
+ long trimmed_len = trim_end - trim_start + 1;
210
+
211
+ if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
212
+ field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
213
+ } else {
214
+ field = rb_enc_str_new(trim_start, trimmed_len, encoding);
215
+ }
216
+
93
217
  rb_ary_push(elements, field);
94
218
  }
95
219
 
@@ -97,10 +221,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
97
221
  }
98
222
 
99
223
  void Init_smarter_csv(void) {
100
- // these modules and the error class are already defined in Ruby code, make them accessible:
101
224
  SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
102
225
  Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
103
226
  eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
104
-
105
- rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
227
+ Qempty_string = rb_str_new_literal("");
228
+ rb_gc_register_address(&Qempty_string);
229
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
106
230
  }