smarter_csv 1.14.2 → 1.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -1
- data/ext/smarter_csv/Makefile +270 -0
- data/ext/smarter_csv/extconf.rb +3 -1
- data/ext/smarter_csv/smarter_csv.c +159 -35
- data/ext/smarter_csv/smarter_csv.c.works +185 -0
- data/ext/smarter_csv/smarter_csv.c.works10 +199 -0
- data/ext/smarter_csv/smarter_csv.c.works11 +189 -0
- data/ext/smarter_csv/smarter_csv.c.works14 +230 -0
- data/ext/smarter_csv/smarter_csv.c.works15 +230 -0
- data/ext/smarter_csv/smarter_csv.c.works2 +192 -0
- data/ext/smarter_csv/smarter_csv.c.works4 +190 -0
- data/ext/smarter_csv/smarter_csv.c.works5 +203 -0
- data/ext/smarter_csv/smarter_csv.c.works7 +217 -0
- data/ext/smarter_csv/smarter_csv.c.works8 +193 -0
- data/ext/smarter_csv/smarter_csv.c.works9 +196 -0
- data/lib/smarter_csv/parser.rb +14 -7
- data/lib/smarter_csv/reader.rb +1 -2
- data/lib/smarter_csv/version.rb +1 -1
- metadata +14 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79d1b26219dfb35f385c30eea361bc90d4b1b6cfc1030b6abb9339cc99eb39de
|
4
|
+
data.tar.gz: ab0915f4193b657c7e8380ea3bc924a4efb40f9171fb3f120aef462d6f464370
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 722a598f1581423d4295eb3ce6f46bf556fbf0206dbd4229eceff68625f60ad83d7e3f2dda3a5b6d61c18be2ff6d3d0fb495c46427bb2cc20fcf5f72fa58e317
|
7
|
+
data.tar.gz: f61d37aa5a64e37601a7b12ce57a528bc26fec8ee643e4171040f0e9021a2fe4fdeec71698638e98e29da27248e51e77792ed5053bb252716298219f4b095e85
|
data/CHANGELOG.md
CHANGED
@@ -1,8 +1,16 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.14.3 (2025-05-04)
|
5
|
+
* Improved C-extension parsing logic:
|
6
|
+
- Added fast path for unquoted fields to avoid unnecessary quote checks.
|
7
|
+
- Added inline whitespace stripping inside the C parser.
|
8
|
+
* Performance
|
9
|
+
- Significantly reduced per-line overhead in non-quoted, wide CSVs (e.g. fixed-width data exports).
|
10
|
+
- Benchmarks show ~10–40% speedup over v1.14.2 depending on structure and quoting.
|
11
|
+
|
4
12
|
## 1.14.2 (2025-04-10)
|
5
|
-
* bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true`
|
13
|
+
* bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true` ([issue 301](https://github.com/tilo/smarter_csv/issues/301))
|
6
14
|
* new option: `header_converter` allows you to programmatically modify the headers
|
7
15
|
|
8
16
|
## 1.14.1 (2025-04-09)
|
@@ -0,0 +1,270 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
V0 = $(V:0=)
|
7
|
+
Q1 = $(V:1=)
|
8
|
+
Q = $(Q1:0=@)
|
9
|
+
ECHO1 = $(V:1=@ :)
|
10
|
+
ECHO = $(ECHO1:0=@ echo)
|
11
|
+
NULLCMD = :
|
12
|
+
|
13
|
+
#### Start of system configuration section. ####
|
14
|
+
|
15
|
+
srcdir = .
|
16
|
+
topdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0
|
17
|
+
hdrdir = $(topdir)
|
18
|
+
arch_hdrdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0/arm64-darwin23
|
19
|
+
PATH_SEPARATOR = :
|
20
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
21
|
+
prefix = $(DESTDIR)/Users/tilo/.rvm/rubies/ruby-3.2.2
|
22
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
23
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
24
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
25
|
+
exec_prefix = $(prefix)
|
26
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
27
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
28
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
29
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
30
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
31
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
32
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
33
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
34
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
35
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
36
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
37
|
+
sitedir = $(rubylibprefix)/site_ruby
|
38
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
39
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
43
|
+
archlibdir = $(libdir)/$(arch)
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
45
|
+
mandir = $(datarootdir)/man
|
46
|
+
localedir = $(datarootdir)/locale
|
47
|
+
libdir = $(exec_prefix)/lib
|
48
|
+
psdir = $(docdir)
|
49
|
+
pdfdir = $(docdir)
|
50
|
+
dvidir = $(docdir)
|
51
|
+
htmldir = $(docdir)
|
52
|
+
infodir = $(datarootdir)/info
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
55
|
+
includedir = $(SDKROOT)$(prefix)/include
|
56
|
+
runstatedir = $(localstatedir)/run
|
57
|
+
localstatedir = $(prefix)/var
|
58
|
+
sharedstatedir = $(prefix)/com
|
59
|
+
sysconfdir = $(prefix)/etc
|
60
|
+
datadir = $(datarootdir)
|
61
|
+
datarootdir = $(prefix)/share
|
62
|
+
libexecdir = $(exec_prefix)/libexec
|
63
|
+
sbindir = $(exec_prefix)/sbin
|
64
|
+
bindir = $(exec_prefix)/bin
|
65
|
+
archdir = $(rubyarchdir)
|
66
|
+
|
67
|
+
|
68
|
+
CC_WRAPPER =
|
69
|
+
CC = gcc
|
70
|
+
CXX = g++
|
71
|
+
LIBRUBY = $(LIBRUBY_SO)
|
72
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
73
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
74
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation $(MAINLIBS)
|
75
|
+
empty =
|
76
|
+
OUTFLAG = -o $(empty)
|
77
|
+
COUTFLAG = -o $(empty)
|
78
|
+
CSRCFLAG = $(empty)
|
79
|
+
|
80
|
+
RUBY_EXTCONF_H =
|
81
|
+
cflags = -fdeclspec $(optflags) $(debugflags) $(warnflags)
|
82
|
+
cxxflags =
|
83
|
+
optflags = -O3
|
84
|
+
debugflags = -ggdb3
|
85
|
+
warnflags = -Wall -Wextra -Wextra-tokens -Wdeprecated-declarations -Wdivision-by-zero -Wdiv-by-zero -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wshorten-64-to-32 -Wwrite-strings -Wold-style-definition -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wunused-variable -Wundef
|
86
|
+
cppflags =
|
87
|
+
CCDLFLAGS = -fno-common
|
88
|
+
CFLAGS = $(CCDLFLAGS) -O3 -I/opt/homebrew/opt/libyaml/include -I/opt/homebrew/opt/libksba/include -I/opt/homebrew/opt/readline/include -I/opt/homebrew/opt/zlib/include -I/opt/homebrew/opt/openssl@1.1/include $(cflags) -fno-common -pipe $(ARCH_FLAG)
|
89
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
|
+
DEFS =
|
91
|
+
CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) -fdeclspec $(ARCH_FLAG)
|
93
|
+
ldflags = -L. -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -fstack-protector-strong
|
94
|
+
dldflags = -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -Wl,-undefined,dynamic_lookup $(LIBRUBYARG_SHARED)
|
95
|
+
ARCH_FLAG =
|
96
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
|
+
LDSHARED = $(CC) -dynamic -bundle
|
98
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
99
|
+
AR = ar
|
100
|
+
EXEEXT =
|
101
|
+
|
102
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
103
|
+
RUBY_SO_NAME = ruby.3.2
|
104
|
+
RUBYW_INSTALL_NAME =
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
106
|
+
RUBYW_BASE_NAME = rubyw
|
107
|
+
RUBY_BASE_NAME = ruby
|
108
|
+
|
109
|
+
arch = arm64-darwin23
|
110
|
+
sitearch = $(arch)
|
111
|
+
ruby_version = 3.2.0
|
112
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
|
+
RUBY = $(ruby)
|
114
|
+
BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
|
115
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
116
|
+
|
117
|
+
RM = rm -f
|
118
|
+
RM_RF = rm -fr
|
119
|
+
RMDIRS = rmdir -p
|
120
|
+
MAKEDIRS = /opt/homebrew/opt/coreutils/bin/gmkdir -p
|
121
|
+
INSTALL = /opt/homebrew/opt/coreutils/bin/ginstall -c
|
122
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
123
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
124
|
+
COPY = cp
|
125
|
+
TOUCH = exit >
|
126
|
+
|
127
|
+
#### End of system configuration section. ####
|
128
|
+
|
129
|
+
preload =
|
130
|
+
libpath = . $(libdir)
|
131
|
+
LIBPATH = -L. -L$(libdir)
|
132
|
+
DEFFILE =
|
133
|
+
|
134
|
+
CLEANFILES = mkmf.log
|
135
|
+
DISTCLEANFILES =
|
136
|
+
DISTCLEANDIRS =
|
137
|
+
|
138
|
+
extout =
|
139
|
+
extout_prefix =
|
140
|
+
target_prefix = /smarter_csv
|
141
|
+
LOCAL_LIBS =
|
142
|
+
LIBS = $(LIBRUBYARG_SHARED) -lpthread
|
143
|
+
ORIG_SRCS = smarter_csv.c
|
144
|
+
SRCS = $(ORIG_SRCS)
|
145
|
+
OBJS = smarter_csv.o
|
146
|
+
HDRS =
|
147
|
+
LOCAL_HDRS =
|
148
|
+
TARGET = smarter_csv
|
149
|
+
TARGET_NAME = smarter_csv
|
150
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
151
|
+
DLLIB = $(TARGET).bundle
|
152
|
+
EXTSTATIC =
|
153
|
+
STATIC_LIB =
|
154
|
+
|
155
|
+
TIMESTAMP_DIR = .
|
156
|
+
BINDIR = $(bindir)
|
157
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
158
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
159
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
160
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
161
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
162
|
+
TARGET_SO_DIR =
|
163
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
164
|
+
CLEANLIBS = $(TARGET_SO) $(TARGET_SO).dSYM
|
165
|
+
CLEANOBJS = $(OBJS) *.bak
|
166
|
+
TARGET_SO_DIR_TIMESTAMP = $(TIMESTAMP_DIR)/.sitearchdir.-.smarter_csv.time
|
167
|
+
|
168
|
+
all: $(DLLIB)
|
169
|
+
static: $(STATIC_LIB)
|
170
|
+
.PHONY: all install static install-so install-rb
|
171
|
+
.PHONY: clean clean-so clean-static clean-rb
|
172
|
+
|
173
|
+
clean-static::
|
174
|
+
clean-rb-default::
|
175
|
+
clean-rb::
|
176
|
+
clean-so::
|
177
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
178
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
179
|
+
|
180
|
+
distclean-rb-default::
|
181
|
+
distclean-rb::
|
182
|
+
distclean-so::
|
183
|
+
distclean-static::
|
184
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
185
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
186
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
187
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
188
|
+
|
189
|
+
realclean: distclean
|
190
|
+
install: install-so install-rb
|
191
|
+
|
192
|
+
install-so: $(DLLIB) $(TARGET_SO_DIR_TIMESTAMP)
|
193
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
194
|
+
clean-static::
|
195
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
196
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
197
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
198
|
+
pre-install-rb: Makefile
|
199
|
+
pre-install-rb-default: Makefile
|
200
|
+
do-install-rb:
|
201
|
+
do-install-rb-default:
|
202
|
+
pre-install-rb-default:
|
203
|
+
@$(NULLCMD)
|
204
|
+
$(TARGET_SO_DIR_TIMESTAMP):
|
205
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
206
|
+
$(Q) $(TOUCH) $@
|
207
|
+
|
208
|
+
site-install: site-install-so site-install-rb
|
209
|
+
site-install-so: install-so
|
210
|
+
site-install-rb: install-rb
|
211
|
+
|
212
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
213
|
+
|
214
|
+
.cc.o:
|
215
|
+
$(ECHO) compiling $(<)
|
216
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
217
|
+
|
218
|
+
.cc.S:
|
219
|
+
$(ECHO) translating $(<)
|
220
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
221
|
+
|
222
|
+
.mm.o:
|
223
|
+
$(ECHO) compiling $(<)
|
224
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
225
|
+
|
226
|
+
.mm.S:
|
227
|
+
$(ECHO) translating $(<)
|
228
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
229
|
+
|
230
|
+
.cxx.o:
|
231
|
+
$(ECHO) compiling $(<)
|
232
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
233
|
+
|
234
|
+
.cxx.S:
|
235
|
+
$(ECHO) translating $(<)
|
236
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
237
|
+
|
238
|
+
.cpp.o:
|
239
|
+
$(ECHO) compiling $(<)
|
240
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
241
|
+
|
242
|
+
.cpp.S:
|
243
|
+
$(ECHO) translating $(<)
|
244
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
245
|
+
|
246
|
+
.c.o:
|
247
|
+
$(ECHO) compiling $(<)
|
248
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
249
|
+
|
250
|
+
.c.S:
|
251
|
+
$(ECHO) translating $(<)
|
252
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
253
|
+
|
254
|
+
.m.o:
|
255
|
+
$(ECHO) compiling $(<)
|
256
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
257
|
+
|
258
|
+
.m.S:
|
259
|
+
$(ECHO) translating $(<)
|
260
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
261
|
+
|
262
|
+
$(TARGET_SO): $(OBJS) Makefile
|
263
|
+
$(ECHO) linking shared-object smarter_csv/$(DLLIB)
|
264
|
+
-$(Q)$(RM) $(@)
|
265
|
+
$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
266
|
+
$(Q) $(POSTLINK)
|
267
|
+
|
268
|
+
|
269
|
+
|
270
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/smarter_csv/extconf.rb
CHANGED
@@ -9,6 +9,8 @@ if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
|
|
9
9
|
RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
|
10
10
|
end
|
11
11
|
|
12
|
-
CONFIG["optflags"] = "-O3"
|
12
|
+
# CONFIG["optflags"] = "-O3 -march=native -flto"
|
13
|
+
CONFIG["optflags"] = "-O3 -march=native -flto -fomit-frame-pointer -DNDEBUG"
|
14
|
+
CONFIG["debugflags"] = ""
|
13
15
|
|
14
16
|
create_makefile('smarter_csv/smarter_csv')
|
@@ -2,6 +2,7 @@
|
|
2
2
|
#include "ruby/encoding.h"
|
3
3
|
#include <stdio.h>
|
4
4
|
#include <stdbool.h>
|
5
|
+
#include <string.h>
|
5
6
|
|
6
7
|
#ifndef bool
|
7
8
|
#define bool int
|
@@ -12,8 +13,25 @@
|
|
12
13
|
VALUE SmarterCSV = Qnil;
|
13
14
|
VALUE eMalformedCSVError = Qnil;
|
14
15
|
VALUE Parser = Qnil;
|
16
|
+
VALUE Qempty_string = Qnil; // shared frozen empty string
|
17
|
+
|
18
|
+
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
|
19
|
+
char *buf = ALLOC_N(char, len);
|
20
|
+
long j = 0;
|
21
|
+
for (long i = 0; i < len; i++) {
|
22
|
+
if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
|
23
|
+
buf[j++] = quote_char;
|
24
|
+
i++; // skip second quote
|
25
|
+
} else {
|
26
|
+
buf[j++] = str[i];
|
27
|
+
}
|
28
|
+
}
|
29
|
+
VALUE out = rb_enc_str_new(buf, j, encoding);
|
30
|
+
xfree(buf);
|
31
|
+
return out;
|
32
|
+
}
|
15
33
|
|
16
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
|
34
|
+
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
|
17
35
|
if (RB_TYPE_P(line, T_NIL) == 1) {
|
18
36
|
return rb_ary_new();
|
19
37
|
}
|
@@ -22,74 +40,180 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
22
40
|
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
23
41
|
}
|
24
42
|
|
25
|
-
rb_encoding *encoding = rb_enc_get(line);
|
26
|
-
char *startP = RSTRING_PTR(line);
|
43
|
+
rb_encoding *encoding = rb_enc_get(line);
|
44
|
+
char *startP = RSTRING_PTR(line);
|
27
45
|
long line_len = RSTRING_LEN(line);
|
28
|
-
char *endP = startP + line_len;
|
46
|
+
char *endP = startP + line_len;
|
29
47
|
char *p = startP;
|
30
48
|
|
31
49
|
char *col_sepP = RSTRING_PTR(col_sep);
|
32
50
|
long col_sep_len = RSTRING_LEN(col_sep);
|
33
51
|
|
34
52
|
char *quoteP = RSTRING_PTR(quote_char);
|
35
|
-
|
36
|
-
|
37
|
-
bool col_sep_found = true;
|
53
|
+
char quote_char_val = quoteP[0];
|
54
|
+
size_t quote_len = strlen(quoteP);
|
38
55
|
|
39
56
|
VALUE elements = rb_ary_new();
|
40
57
|
VALUE field;
|
41
|
-
long i;
|
42
58
|
|
43
|
-
|
59
|
+
long element_count = 0;
|
60
|
+
int max_fields = -1;
|
61
|
+
if (max_size != Qnil) {
|
62
|
+
max_fields = NUM2INT(max_size);
|
63
|
+
if (max_fields < 0) {
|
64
|
+
return rb_ary_new();
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
bool has_quotes = RTEST(has_quotes_val);
|
69
|
+
bool strip_ws = RTEST(strip_ws_val);
|
70
|
+
|
71
|
+
// === FAST PATH: No quotes and single-character separator ===
|
72
|
+
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
73
|
+
char sep = *col_sepP;
|
74
|
+
char *sep_pos = NULL;
|
75
|
+
|
76
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
77
|
+
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
78
|
+
break;
|
79
|
+
}
|
80
|
+
|
81
|
+
long field_len = sep_pos - startP;
|
82
|
+
char *raw_field = startP;
|
83
|
+
char *trim_start = raw_field;
|
84
|
+
char *trim_end = raw_field + field_len - 1;
|
85
|
+
|
86
|
+
if (strip_ws) {
|
87
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
88
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
89
|
+
}
|
90
|
+
|
91
|
+
long trimmed_len = trim_end - trim_start + 1;
|
92
|
+
|
93
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
94
|
+
rb_ary_push(elements, field);
|
95
|
+
element_count++;
|
96
|
+
|
97
|
+
p = sep_pos + 1;
|
98
|
+
startP = p;
|
99
|
+
}
|
100
|
+
|
101
|
+
if ((max_fields < 0) || (element_count < max_fields)) {
|
102
|
+
long field_len = endP - startP;
|
103
|
+
char *raw_field = startP;
|
104
|
+
char *trim_start = raw_field;
|
105
|
+
char *trim_end = raw_field + field_len - 1;
|
106
|
+
|
107
|
+
if (strip_ws) {
|
108
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
109
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
110
|
+
}
|
111
|
+
|
112
|
+
long trimmed_len = trim_end - trim_start + 1;
|
113
|
+
|
114
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
115
|
+
rb_ary_push(elements, field);
|
116
|
+
}
|
117
|
+
|
118
|
+
return elements;
|
119
|
+
}
|
120
|
+
|
121
|
+
// === SLOW PATH: Quoted fields or multi-char separator ===
|
122
|
+
long i;
|
44
123
|
long backslash_count = 0;
|
45
124
|
bool in_quotes = false;
|
125
|
+
bool col_sep_found = true;
|
46
126
|
|
47
127
|
while (p < endP) {
|
48
|
-
/* does the remaining string start with col_sep ? */
|
49
128
|
col_sep_found = true;
|
50
|
-
for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
|
51
|
-
|
129
|
+
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
130
|
+
if (*(p + i) != *(col_sepP + i)) {
|
131
|
+
col_sep_found = false;
|
132
|
+
break;
|
133
|
+
}
|
52
134
|
}
|
53
|
-
|
135
|
+
|
54
136
|
if (col_sep_found && !in_quotes) {
|
55
|
-
|
56
|
-
if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
|
137
|
+
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
57
138
|
break;
|
58
|
-
}
|
59
|
-
|
60
|
-
|
61
|
-
|
139
|
+
}
|
140
|
+
|
141
|
+
long field_len = p - startP;
|
142
|
+
char *raw_field = startP;
|
62
143
|
|
63
|
-
|
64
|
-
|
65
|
-
|
144
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
145
|
+
if (quoted) {
|
146
|
+
raw_field++;
|
147
|
+
field_len -= 2;
|
148
|
+
}
|
149
|
+
|
150
|
+
char *trim_start = raw_field;
|
151
|
+
char *trim_end = raw_field + field_len - 1;
|
152
|
+
|
153
|
+
if (strip_ws) {
|
154
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
155
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
156
|
+
}
|
157
|
+
|
158
|
+
long trimmed_len = trim_end - trim_start + 1;
|
159
|
+
|
160
|
+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
161
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
162
|
+
} else {
|
163
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
66
164
|
}
|
165
|
+
|
166
|
+
rb_ary_push(elements, field);
|
167
|
+
element_count++;
|
168
|
+
|
169
|
+
p += col_sep_len;
|
170
|
+
startP = p;
|
171
|
+
backslash_count = 0;
|
67
172
|
} else {
|
68
173
|
if (*p == '\\') {
|
69
174
|
backslash_count++;
|
70
175
|
} else {
|
71
|
-
if (*p ==
|
176
|
+
if (*p == quote_char_val) {
|
72
177
|
if (backslash_count % 2 == 0) {
|
73
|
-
/* Even number of backslashes means quote is not escaped */
|
74
178
|
in_quotes = !in_quotes;
|
75
179
|
}
|
76
|
-
/* Else, quote is escaped; do nothing */
|
77
180
|
}
|
78
|
-
backslash_count = 0;
|
181
|
+
backslash_count = 0;
|
79
182
|
}
|
80
183
|
p++;
|
81
184
|
}
|
82
|
-
}
|
185
|
+
}
|
83
186
|
|
84
|
-
/* Check for unclosed quotes at the end of the line */
|
85
187
|
if (in_quotes) {
|
86
188
|
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
87
189
|
}
|
88
190
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
191
|
+
if ((max_fields < 0) || (element_count < max_fields)) {
|
192
|
+
long field_len = endP - startP;
|
193
|
+
char *raw_field = startP;
|
194
|
+
|
195
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
196
|
+
if (quoted) {
|
197
|
+
raw_field++;
|
198
|
+
field_len -= 2;
|
199
|
+
}
|
200
|
+
|
201
|
+
char *trim_start = raw_field;
|
202
|
+
char *trim_end = raw_field + field_len - 1;
|
203
|
+
|
204
|
+
if (strip_ws) {
|
205
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
206
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
207
|
+
}
|
208
|
+
|
209
|
+
long trimmed_len = trim_end - trim_start + 1;
|
210
|
+
|
211
|
+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
212
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
213
|
+
} else {
|
214
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
215
|
+
}
|
216
|
+
|
93
217
|
rb_ary_push(elements, field);
|
94
218
|
}
|
95
219
|
|
@@ -97,10 +221,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
97
221
|
}
|
98
222
|
|
99
223
|
void Init_smarter_csv(void) {
|
100
|
-
// these modules and the error class are already defined in Ruby code, make them accessible:
|
101
224
|
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
102
225
|
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
103
226
|
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
|
104
|
-
|
105
|
-
|
227
|
+
Qempty_string = rb_str_new_literal("");
|
228
|
+
rb_gc_register_address(&Qempty_string);
|
229
|
+
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
|
106
230
|
}
|