smarter_csv 1.14.1 → 1.14.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/README.md +1 -0
- data/docs/options.md +5 -0
- data/ext/smarter_csv/Makefile +270 -0
- data/ext/smarter_csv/extconf.rb +3 -1
- data/ext/smarter_csv/smarter_csv.c +159 -35
- data/ext/smarter_csv/smarter_csv.c.works +185 -0
- data/ext/smarter_csv/smarter_csv.c.works10 +199 -0
- data/ext/smarter_csv/smarter_csv.c.works11 +189 -0
- data/ext/smarter_csv/smarter_csv.c.works14 +230 -0
- data/ext/smarter_csv/smarter_csv.c.works15 +230 -0
- data/ext/smarter_csv/smarter_csv.c.works2 +192 -0
- data/ext/smarter_csv/smarter_csv.c.works4 +190 -0
- data/ext/smarter_csv/smarter_csv.c.works5 +203 -0
- data/ext/smarter_csv/smarter_csv.c.works7 +217 -0
- data/ext/smarter_csv/smarter_csv.c.works8 +193 -0
- data/ext/smarter_csv/smarter_csv.c.works9 +196 -0
- data/lib/smarter_csv/parser.rb +14 -7
- data/lib/smarter_csv/reader.rb +1 -2
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv/writer.rb +5 -1
- metadata +14 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 79d1b26219dfb35f385c30eea361bc90d4b1b6cfc1030b6abb9339cc99eb39de
|
4
|
+
data.tar.gz: ab0915f4193b657c7e8380ea3bc924a4efb40f9171fb3f120aef462d6f464370
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 722a598f1581423d4295eb3ce6f46bf556fbf0206dbd4229eceff68625f60ad83d7e3f2dda3a5b6d61c18be2ff6d3d0fb495c46427bb2cc20fcf5f72fa58e317
|
7
|
+
data.tar.gz: f61d37aa5a64e37601a7b12ce57a528bc26fec8ee643e4171040f0e9021a2fe4fdeec71698638e98e29da27248e51e77792ed5053bb252716298219f4b095e85
|
data/CHANGELOG.md
CHANGED
@@ -1,9 +1,21 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.14.3 (2025-05-04)
|
5
|
+
* Improved C-extension parsing logic:
|
6
|
+
- Added fast path for unquoted fields to avoid unnecessary quote checks.
|
7
|
+
- Added inline whitespace stripping inside the C parser
|
8
|
+
* Performance
|
9
|
+
- Significantly reduced per-line overhead in non-quoted, wide CSVs (e.g. fixed-width data exports).
|
10
|
+
- Benchmarks show ~10–40% speedup over v1.14.2 depending on structure and quoting.
|
11
|
+
|
12
|
+
## 1.14.2 (2025-04-10)
|
13
|
+
* bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true` ([issue 301](https://github.com/tilo/smarter_csv/issues/301))
|
14
|
+
* new option: `header_converter` allows to programmatically modify the headers
|
15
|
+
|
4
16
|
## 1.14.1 (2025-04-09)
|
5
|
-
* bugfix: empty hash results in a blank line ([issue 299](https://github.com/tilo/smarter_csv/issues/299))
|
6
|
-
* bugfix: automatically quote problematic headers ([issue #300](https://github.com/tilo/smarter_csv/issues/300))
|
17
|
+
* bugfix: SmarterCSV::Writer empty hash results in a blank line ([issue 299](https://github.com/tilo/smarter_csv/issues/299))
|
18
|
+
* bugfix: SmarterCSV::Writer need to automatically quote problematic headers ([issue #300](https://github.com/tilo/smarter_csv/issues/300))
|
7
19
|
* new option: `quote_headers` allows to explicitly quote all headers
|
8
20
|
|
9
21
|
## 1.14.0 (2025-04-07)
|
data/README.md
CHANGED
@@ -47,6 +47,7 @@ Or install it yourself as:
|
|
47
47
|
|
48
48
|
# Articles
|
49
49
|
* [Parsing CSV Files in Ruby with SmarterCSV](https://tilo-sloboda.medium.com/parsing-csv-files-in-ruby-with-smartercsv-6ce66fb6cf38)
|
50
|
+
* [CSV Writing with SmarterCSV](https://tilo-sloboda.medium.com/csv-writing-with-smartercsv-26136d47ad0c)
|
50
51
|
* [Processing 1.4 Million CSV Records in Ruby, fast ](https://lcx.wien/blog/processing-14-million-csv-records-in-ruby/)
|
51
52
|
* [Faster Parsing CSV with Parallel Processing](http://xjlin0.github.io/tech/2015/05/25/faster-parsing-csv-with-parallel-processing) by [Jack lin](https://github.com/xjlin0/)
|
52
53
|
* The original [Stackoverflow Question](https://stackoverflow.com/questions/7788618/update-mongodb-with-array-from-csv-join-table/7788746#7788746) that inspired SmarterCSV
|
data/docs/options.md
CHANGED
@@ -28,10 +28,15 @@
|
|
28
28
|
| | | ⚠️ This disables automatic header detection! |
|
29
29
|
| :map_headers | {} | Similar to `headers`, but also maps each desired key to a user-specified value that is used as the header. |
|
30
30
|
| | | ⚠️ This disables automatic header detection! |
|
31
|
+
| :value_converters | nil | allows to define lambdas to programmatically modify values |
|
32
|
+
| | | * either for specific `key` names |
|
33
|
+
| | | * or using `_all` for all fields |
|
34
|
+
| :header_converter | nil | allows to define one lambda to programmatically modify the headers |
|
31
35
|
| :discover_headers | true | Automatically detects all keys in the input before writing the header |
|
32
36
|
| | | Do not manually set this to `false`. ⚠️ |
|
33
37
|
| | | But you can set this to `true` when using `map_headers` option. |
|
34
38
|
| :disable_auto_quoting | false | To manually disable auto-quoting of special characters. ⚠️ Be careful with this! |
|
39
|
+
| :quote_headers | false | To force quoting all headers (only needed in rare cases) |
|
35
40
|
|
36
41
|
|
37
42
|
## CSV Reading
|
@@ -0,0 +1,270 @@
|
|
1
|
+
|
2
|
+
SHELL = /bin/sh
|
3
|
+
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
5
|
+
V = 0
|
6
|
+
V0 = $(V:0=)
|
7
|
+
Q1 = $(V:1=)
|
8
|
+
Q = $(Q1:0=@)
|
9
|
+
ECHO1 = $(V:1=@ :)
|
10
|
+
ECHO = $(ECHO1:0=@ echo)
|
11
|
+
NULLCMD = :
|
12
|
+
|
13
|
+
#### Start of system configuration section. ####
|
14
|
+
|
15
|
+
srcdir = .
|
16
|
+
topdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0
|
17
|
+
hdrdir = $(topdir)
|
18
|
+
arch_hdrdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0/arm64-darwin23
|
19
|
+
PATH_SEPARATOR = :
|
20
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
21
|
+
prefix = $(DESTDIR)/Users/tilo/.rvm/rubies/ruby-3.2.2
|
22
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
23
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
24
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
25
|
+
exec_prefix = $(prefix)
|
26
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
27
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
28
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
29
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
30
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
31
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
32
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
33
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
34
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
35
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
36
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
37
|
+
sitedir = $(rubylibprefix)/site_ruby
|
38
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
39
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
43
|
+
archlibdir = $(libdir)/$(arch)
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
45
|
+
mandir = $(datarootdir)/man
|
46
|
+
localedir = $(datarootdir)/locale
|
47
|
+
libdir = $(exec_prefix)/lib
|
48
|
+
psdir = $(docdir)
|
49
|
+
pdfdir = $(docdir)
|
50
|
+
dvidir = $(docdir)
|
51
|
+
htmldir = $(docdir)
|
52
|
+
infodir = $(datarootdir)/info
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
55
|
+
includedir = $(SDKROOT)$(prefix)/include
|
56
|
+
runstatedir = $(localstatedir)/run
|
57
|
+
localstatedir = $(prefix)/var
|
58
|
+
sharedstatedir = $(prefix)/com
|
59
|
+
sysconfdir = $(prefix)/etc
|
60
|
+
datadir = $(datarootdir)
|
61
|
+
datarootdir = $(prefix)/share
|
62
|
+
libexecdir = $(exec_prefix)/libexec
|
63
|
+
sbindir = $(exec_prefix)/sbin
|
64
|
+
bindir = $(exec_prefix)/bin
|
65
|
+
archdir = $(rubyarchdir)
|
66
|
+
|
67
|
+
|
68
|
+
CC_WRAPPER =
|
69
|
+
CC = gcc
|
70
|
+
CXX = g++
|
71
|
+
LIBRUBY = $(LIBRUBY_SO)
|
72
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
73
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
74
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation $(MAINLIBS)
|
75
|
+
empty =
|
76
|
+
OUTFLAG = -o $(empty)
|
77
|
+
COUTFLAG = -o $(empty)
|
78
|
+
CSRCFLAG = $(empty)
|
79
|
+
|
80
|
+
RUBY_EXTCONF_H =
|
81
|
+
cflags = -fdeclspec $(optflags) $(debugflags) $(warnflags)
|
82
|
+
cxxflags =
|
83
|
+
optflags = -O3
|
84
|
+
debugflags = -ggdb3
|
85
|
+
warnflags = -Wall -Wextra -Wextra-tokens -Wdeprecated-declarations -Wdivision-by-zero -Wdiv-by-zero -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wshorten-64-to-32 -Wwrite-strings -Wold-style-definition -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wunused-variable -Wundef
|
86
|
+
cppflags =
|
87
|
+
CCDLFLAGS = -fno-common
|
88
|
+
CFLAGS = $(CCDLFLAGS) -O3 -I/opt/homebrew/opt/libyaml/include -I/opt/homebrew/opt/libksba/include -I/opt/homebrew/opt/readline/include -I/opt/homebrew/opt/zlib/include -I/opt/homebrew/opt/openssl@1.1/include $(cflags) -fno-common -pipe $(ARCH_FLAG)
|
89
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
90
|
+
DEFS =
|
91
|
+
CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) -fdeclspec $(ARCH_FLAG)
|
93
|
+
ldflags = -L. -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -fstack-protector-strong
|
94
|
+
dldflags = -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -Wl,-undefined,dynamic_lookup $(LIBRUBYARG_SHARED)
|
95
|
+
ARCH_FLAG =
|
96
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
97
|
+
LDSHARED = $(CC) -dynamic -bundle
|
98
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
99
|
+
AR = ar
|
100
|
+
EXEEXT =
|
101
|
+
|
102
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
103
|
+
RUBY_SO_NAME = ruby.3.2
|
104
|
+
RUBYW_INSTALL_NAME =
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
106
|
+
RUBYW_BASE_NAME = rubyw
|
107
|
+
RUBY_BASE_NAME = ruby
|
108
|
+
|
109
|
+
arch = arm64-darwin23
|
110
|
+
sitearch = $(arch)
|
111
|
+
ruby_version = 3.2.0
|
112
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
113
|
+
RUBY = $(ruby)
|
114
|
+
BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
|
115
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
116
|
+
|
117
|
+
RM = rm -f
|
118
|
+
RM_RF = rm -fr
|
119
|
+
RMDIRS = rmdir -p
|
120
|
+
MAKEDIRS = /opt/homebrew/opt/coreutils/bin/gmkdir -p
|
121
|
+
INSTALL = /opt/homebrew/opt/coreutils/bin/ginstall -c
|
122
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
123
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
124
|
+
COPY = cp
|
125
|
+
TOUCH = exit >
|
126
|
+
|
127
|
+
#### End of system configuration section. ####
|
128
|
+
|
129
|
+
preload =
|
130
|
+
libpath = . $(libdir)
|
131
|
+
LIBPATH = -L. -L$(libdir)
|
132
|
+
DEFFILE =
|
133
|
+
|
134
|
+
CLEANFILES = mkmf.log
|
135
|
+
DISTCLEANFILES =
|
136
|
+
DISTCLEANDIRS =
|
137
|
+
|
138
|
+
extout =
|
139
|
+
extout_prefix =
|
140
|
+
target_prefix = /smarter_csv
|
141
|
+
LOCAL_LIBS =
|
142
|
+
LIBS = $(LIBRUBYARG_SHARED) -lpthread
|
143
|
+
ORIG_SRCS = smarter_csv.c
|
144
|
+
SRCS = $(ORIG_SRCS)
|
145
|
+
OBJS = smarter_csv.o
|
146
|
+
HDRS =
|
147
|
+
LOCAL_HDRS =
|
148
|
+
TARGET = smarter_csv
|
149
|
+
TARGET_NAME = smarter_csv
|
150
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
151
|
+
DLLIB = $(TARGET).bundle
|
152
|
+
EXTSTATIC =
|
153
|
+
STATIC_LIB =
|
154
|
+
|
155
|
+
TIMESTAMP_DIR = .
|
156
|
+
BINDIR = $(bindir)
|
157
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
158
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
159
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
160
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
161
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
162
|
+
TARGET_SO_DIR =
|
163
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
164
|
+
CLEANLIBS = $(TARGET_SO) $(TARGET_SO).dSYM
|
165
|
+
CLEANOBJS = $(OBJS) *.bak
|
166
|
+
TARGET_SO_DIR_TIMESTAMP = $(TIMESTAMP_DIR)/.sitearchdir.-.smarter_csv.time
|
167
|
+
|
168
|
+
all: $(DLLIB)
|
169
|
+
static: $(STATIC_LIB)
|
170
|
+
.PHONY: all install static install-so install-rb
|
171
|
+
.PHONY: clean clean-so clean-static clean-rb
|
172
|
+
|
173
|
+
clean-static::
|
174
|
+
clean-rb-default::
|
175
|
+
clean-rb::
|
176
|
+
clean-so::
|
177
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
178
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
179
|
+
|
180
|
+
distclean-rb-default::
|
181
|
+
distclean-rb::
|
182
|
+
distclean-so::
|
183
|
+
distclean-static::
|
184
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
185
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
186
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
187
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
188
|
+
|
189
|
+
realclean: distclean
|
190
|
+
install: install-so install-rb
|
191
|
+
|
192
|
+
install-so: $(DLLIB) $(TARGET_SO_DIR_TIMESTAMP)
|
193
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
194
|
+
clean-static::
|
195
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
196
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
197
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
198
|
+
pre-install-rb: Makefile
|
199
|
+
pre-install-rb-default: Makefile
|
200
|
+
do-install-rb:
|
201
|
+
do-install-rb-default:
|
202
|
+
pre-install-rb-default:
|
203
|
+
@$(NULLCMD)
|
204
|
+
$(TARGET_SO_DIR_TIMESTAMP):
|
205
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
206
|
+
$(Q) $(TOUCH) $@
|
207
|
+
|
208
|
+
site-install: site-install-so site-install-rb
|
209
|
+
site-install-so: install-so
|
210
|
+
site-install-rb: install-rb
|
211
|
+
|
212
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
213
|
+
|
214
|
+
.cc.o:
|
215
|
+
$(ECHO) compiling $(<)
|
216
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
217
|
+
|
218
|
+
.cc.S:
|
219
|
+
$(ECHO) translating $(<)
|
220
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
221
|
+
|
222
|
+
.mm.o:
|
223
|
+
$(ECHO) compiling $(<)
|
224
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
225
|
+
|
226
|
+
.mm.S:
|
227
|
+
$(ECHO) translating $(<)
|
228
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
229
|
+
|
230
|
+
.cxx.o:
|
231
|
+
$(ECHO) compiling $(<)
|
232
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
233
|
+
|
234
|
+
.cxx.S:
|
235
|
+
$(ECHO) translating $(<)
|
236
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
237
|
+
|
238
|
+
.cpp.o:
|
239
|
+
$(ECHO) compiling $(<)
|
240
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
241
|
+
|
242
|
+
.cpp.S:
|
243
|
+
$(ECHO) translating $(<)
|
244
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
245
|
+
|
246
|
+
.c.o:
|
247
|
+
$(ECHO) compiling $(<)
|
248
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
249
|
+
|
250
|
+
.c.S:
|
251
|
+
$(ECHO) translating $(<)
|
252
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
253
|
+
|
254
|
+
.m.o:
|
255
|
+
$(ECHO) compiling $(<)
|
256
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
257
|
+
|
258
|
+
.m.S:
|
259
|
+
$(ECHO) translating $(<)
|
260
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
261
|
+
|
262
|
+
$(TARGET_SO): $(OBJS) Makefile
|
263
|
+
$(ECHO) linking shared-object smarter_csv/$(DLLIB)
|
264
|
+
-$(Q)$(RM) $(@)
|
265
|
+
$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
266
|
+
$(Q) $(POSTLINK)
|
267
|
+
|
268
|
+
|
269
|
+
|
270
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/smarter_csv/extconf.rb
CHANGED
@@ -9,6 +9,8 @@ if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
|
|
9
9
|
RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
|
10
10
|
end
|
11
11
|
|
12
|
-
CONFIG["optflags"] = "-O3"
|
12
|
+
# CONFIG["optflags"] = "-O3 -march=native -flto"
|
13
|
+
CONFIG["optflags"] = "-O3 -march=native -flto -fomit-frame-pointer -DNDEBUG"
|
14
|
+
CONFIG["debugflags"] = ""
|
13
15
|
|
14
16
|
create_makefile('smarter_csv/smarter_csv')
|
@@ -2,6 +2,7 @@
|
|
2
2
|
#include "ruby/encoding.h"
|
3
3
|
#include <stdio.h>
|
4
4
|
#include <stdbool.h>
|
5
|
+
#include <string.h>
|
5
6
|
|
6
7
|
#ifndef bool
|
7
8
|
#define bool int
|
@@ -12,8 +13,25 @@
|
|
12
13
|
VALUE SmarterCSV = Qnil;
|
13
14
|
VALUE eMalformedCSVError = Qnil;
|
14
15
|
VALUE Parser = Qnil;
|
16
|
+
VALUE Qempty_string = Qnil; // shared frozen empty string
|
17
|
+
|
18
|
+
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) { // Builds a new Ruby string from the len bytes at str: each pair of adjacent quote_char bytes becomes one quote_char, every other byte is copied unchanged.
|
19
|
+
char *buf = ALLOC_N(char, len); // scratch buffer; the output is never longer than the input, so len bytes suffice
|
20
|
+
long j = 0; // number of bytes written to buf so far
|
21
|
+
for (long i = 0; i < len; i++) {
|
22
|
+
if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) { // doubled quote (bounds-checked look-ahead)
|
23
|
+
buf[j++] = quote_char; // emit a single quote for the pair
|
24
|
+
i++; // skip second quote
|
25
|
+
} else {
|
26
|
+
buf[j++] = str[i]; // ordinary byte, or a lone quote_char: copied as-is
|
27
|
+
}
|
28
|
+
}
|
29
|
+
VALUE out = rb_enc_str_new(buf, j, encoding); // Ruby string built from the j bytes written, tagged with the given encoding
|
30
|
+
xfree(buf); // release the scratch buffer -- NOTE(review): buf would not be freed if rb_enc_str_new raised; confirm whether that matters
|
31
|
+
return out;
|
32
|
+
}
|
15
33
|
|
16
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
|
34
|
+
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
|
17
35
|
if (RB_TYPE_P(line, T_NIL) == 1) {
|
18
36
|
return rb_ary_new();
|
19
37
|
}
|
@@ -22,74 +40,180 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
22
40
|
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
23
41
|
}
|
24
42
|
|
25
|
-
rb_encoding *encoding = rb_enc_get(line);
|
26
|
-
char *startP = RSTRING_PTR(line);
|
43
|
+
rb_encoding *encoding = rb_enc_get(line);
|
44
|
+
char *startP = RSTRING_PTR(line);
|
27
45
|
long line_len = RSTRING_LEN(line);
|
28
|
-
char *endP = startP + line_len;
|
46
|
+
char *endP = startP + line_len;
|
29
47
|
char *p = startP;
|
30
48
|
|
31
49
|
char *col_sepP = RSTRING_PTR(col_sep);
|
32
50
|
long col_sep_len = RSTRING_LEN(col_sep);
|
33
51
|
|
34
52
|
char *quoteP = RSTRING_PTR(quote_char);
|
35
|
-
|
36
|
-
|
37
|
-
bool col_sep_found = true;
|
53
|
+
char quote_char_val = quoteP[0];
|
54
|
+
size_t quote_len = strlen(quoteP);
|
38
55
|
|
39
56
|
VALUE elements = rb_ary_new();
|
40
57
|
VALUE field;
|
41
|
-
long i;
|
42
58
|
|
43
|
-
|
59
|
+
long element_count = 0;
|
60
|
+
int max_fields = -1;
|
61
|
+
if (max_size != Qnil) {
|
62
|
+
max_fields = NUM2INT(max_size);
|
63
|
+
if (max_fields < 0) {
|
64
|
+
return rb_ary_new();
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
bool has_quotes = RTEST(has_quotes_val);
|
69
|
+
bool strip_ws = RTEST(strip_ws_val);
|
70
|
+
|
71
|
+
// === FAST PATH: No quotes and single-character separator ===
|
72
|
+
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
73
|
+
char sep = *col_sepP;
|
74
|
+
char *sep_pos = NULL;
|
75
|
+
|
76
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
77
|
+
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
78
|
+
break;
|
79
|
+
}
|
80
|
+
|
81
|
+
long field_len = sep_pos - startP;
|
82
|
+
char *raw_field = startP;
|
83
|
+
char *trim_start = raw_field;
|
84
|
+
char *trim_end = raw_field + field_len - 1;
|
85
|
+
|
86
|
+
if (strip_ws) {
|
87
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
88
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
89
|
+
}
|
90
|
+
|
91
|
+
long trimmed_len = trim_end - trim_start + 1;
|
92
|
+
|
93
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
94
|
+
rb_ary_push(elements, field);
|
95
|
+
element_count++;
|
96
|
+
|
97
|
+
p = sep_pos + 1;
|
98
|
+
startP = p;
|
99
|
+
}
|
100
|
+
|
101
|
+
if ((max_fields < 0) || (element_count < max_fields)) {
|
102
|
+
long field_len = endP - startP;
|
103
|
+
char *raw_field = startP;
|
104
|
+
char *trim_start = raw_field;
|
105
|
+
char *trim_end = raw_field + field_len - 1;
|
106
|
+
|
107
|
+
if (strip_ws) {
|
108
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
109
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
110
|
+
}
|
111
|
+
|
112
|
+
long trimmed_len = trim_end - trim_start + 1;
|
113
|
+
|
114
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
115
|
+
rb_ary_push(elements, field);
|
116
|
+
}
|
117
|
+
|
118
|
+
return elements;
|
119
|
+
}
|
120
|
+
|
121
|
+
// === SLOW PATH: Quoted fields or multi-char separator ===
|
122
|
+
long i;
|
44
123
|
long backslash_count = 0;
|
45
124
|
bool in_quotes = false;
|
125
|
+
bool col_sep_found = true;
|
46
126
|
|
47
127
|
while (p < endP) {
|
48
|
-
/* does the remaining string start with col_sep ? */
|
49
128
|
col_sep_found = true;
|
50
|
-
for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
|
51
|
-
|
129
|
+
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
130
|
+
if (*(p + i) != *(col_sepP + i)) {
|
131
|
+
col_sep_found = false;
|
132
|
+
break;
|
133
|
+
}
|
52
134
|
}
|
53
|
-
|
135
|
+
|
54
136
|
if (col_sep_found && !in_quotes) {
|
55
|
-
|
56
|
-
if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
|
137
|
+
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
57
138
|
break;
|
58
|
-
}
|
59
|
-
|
60
|
-
|
61
|
-
|
139
|
+
}
|
140
|
+
|
141
|
+
long field_len = p - startP;
|
142
|
+
char *raw_field = startP;
|
62
143
|
|
63
|
-
|
64
|
-
|
65
|
-
|
144
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
145
|
+
if (quoted) {
|
146
|
+
raw_field++;
|
147
|
+
field_len -= 2;
|
148
|
+
}
|
149
|
+
|
150
|
+
char *trim_start = raw_field;
|
151
|
+
char *trim_end = raw_field + field_len - 1;
|
152
|
+
|
153
|
+
if (strip_ws) {
|
154
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
155
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
156
|
+
}
|
157
|
+
|
158
|
+
long trimmed_len = trim_end - trim_start + 1;
|
159
|
+
|
160
|
+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
161
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
162
|
+
} else {
|
163
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
66
164
|
}
|
165
|
+
|
166
|
+
rb_ary_push(elements, field);
|
167
|
+
element_count++;
|
168
|
+
|
169
|
+
p += col_sep_len;
|
170
|
+
startP = p;
|
171
|
+
backslash_count = 0;
|
67
172
|
} else {
|
68
173
|
if (*p == '\\') {
|
69
174
|
backslash_count++;
|
70
175
|
} else {
|
71
|
-
if (*p ==
|
176
|
+
if (*p == quote_char_val) {
|
72
177
|
if (backslash_count % 2 == 0) {
|
73
|
-
/* Even number of backslashes means quote is not escaped */
|
74
178
|
in_quotes = !in_quotes;
|
75
179
|
}
|
76
|
-
/* Else, quote is escaped; do nothing */
|
77
180
|
}
|
78
|
-
backslash_count = 0;
|
181
|
+
backslash_count = 0;
|
79
182
|
}
|
80
183
|
p++;
|
81
184
|
}
|
82
|
-
}
|
185
|
+
}
|
83
186
|
|
84
|
-
/* Check for unclosed quotes at the end of the line */
|
85
187
|
if (in_quotes) {
|
86
188
|
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
87
189
|
}
|
88
190
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
191
|
+
if ((max_fields < 0) || (element_count < max_fields)) {
|
192
|
+
long field_len = endP - startP;
|
193
|
+
char *raw_field = startP;
|
194
|
+
|
195
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
196
|
+
if (quoted) {
|
197
|
+
raw_field++;
|
198
|
+
field_len -= 2;
|
199
|
+
}
|
200
|
+
|
201
|
+
char *trim_start = raw_field;
|
202
|
+
char *trim_end = raw_field + field_len - 1;
|
203
|
+
|
204
|
+
if (strip_ws) {
|
205
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
206
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
207
|
+
}
|
208
|
+
|
209
|
+
long trimmed_len = trim_end - trim_start + 1;
|
210
|
+
|
211
|
+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
212
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
213
|
+
} else {
|
214
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
215
|
+
}
|
216
|
+
|
93
217
|
rb_ary_push(elements, field);
|
94
218
|
}
|
95
219
|
|
@@ -97,10 +221,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
97
221
|
}
|
98
222
|
|
99
223
|
void Init_smarter_csv(void) { // Looks up the Ruby-side SmarterCSV constants and registers the C line parser on Parser as parse_csv_line_c.
|
100
|
-
// these modules and the error class are already defined in Ruby code, make them accessible:
|
101
224
|
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV")); // top-level SmarterCSV constant
|
102
225
|
Parser = rb_const_get(SmarterCSV, rb_intern("Parser")); // SmarterCSV::Parser, receiver of the module function below
|
103
226
|
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV")); // SmarterCSV::MalformedCSV, raised by rb_parse_csv_line for an unclosed quoted field
|
104
|
-
|
105
|
-
|
227
|
+
Qempty_string = rb_str_new_literal(""); // empty string kept in a global -- NOTE(review): declared as "frozen" but no freeze call is visible here; confirm
|
228
|
+
rb_gc_register_address(&Qempty_string); // register the global's address with the GC (presumably so the string is not collected)
|
229
|
+
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6); // arity 6: line, col_sep, quote_char, max_size, has_quotes, strip_ws (self is implicit)
|
106
230
|
}
|