smarter_csv 1.16.6 → 1.17.0.pre5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +7 -1
- data/CHANGELOG.md +26 -9
- data/CONTRIBUTORS.md +1 -2
- data/README.md +5 -2
- data/Rakefile +2 -7
- data/TO_DO_v2.md +17 -11
- data/docs/_introduction.md +1 -0
- data/docs/bad_row_quarantine.md +2 -1
- data/docs/basic_read_api.md +2 -1
- data/docs/basic_write_api.md +1 -0
- data/docs/batch_processing.md +1 -0
- data/docs/column_selection.md +1 -0
- data/docs/data_transformations.md +1 -0
- data/docs/examples.md +1 -0
- data/docs/header_transformations.md +1 -0
- data/docs/header_validations.md +1 -0
- data/docs/history.md +1 -0
- data/docs/instrumentation.md +2 -1
- data/docs/migrating_from_csv.md +1 -0
- data/docs/options.md +4 -3
- data/docs/parsing_strategy.md +1 -0
- data/docs/real_world_csv.md +6 -1
- data/docs/row_col_sep.md +2 -1
- data/docs/ruby_csv_pitfalls.md +1 -0
- data/docs/value_converters.md +24 -0
- data/docs/warnings.md +119 -0
- data/ext/smarter_csv/Makefile +270 -0
- data/ext/smarter_csv/extconf.rb +0 -9
- data/ext/smarter_csv/smarter_csv.c +63 -133
- data/lib/smarter_csv/auto_detection.rb +73 -32
- data/lib/smarter_csv/file_io.rb +2 -2
- data/lib/smarter_csv/parser.rb +10 -36
- data/lib/smarter_csv/peekable_io.rb +432 -0
- data/lib/smarter_csv/reader.rb +121 -19
- data/lib/smarter_csv/reader_options.rb +14 -1
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +39 -11
- metadata +6 -3
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
|
|
2
|
+
SHELL = /bin/sh
|
|
3
|
+
|
|
4
|
+
# V=0 quiet, V=1 verbose. other values don't work.
|
|
5
|
+
V = 0
|
|
6
|
+
V0 = $(V:0=)
|
|
7
|
+
Q1 = $(V:1=)
|
|
8
|
+
Q = $(Q1:0=@)
|
|
9
|
+
ECHO1 = $(V:1=@ :)
|
|
10
|
+
ECHO = $(ECHO1:0=@ echo)
|
|
11
|
+
NULLCMD = :
|
|
12
|
+
|
|
13
|
+
#### Start of system configuration section. ####
|
|
14
|
+
|
|
15
|
+
srcdir = .
|
|
16
|
+
topdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0
|
|
17
|
+
hdrdir = $(topdir)
|
|
18
|
+
arch_hdrdir = /Users/tilo/.rvm/rubies/ruby-3.2.2/include/ruby-3.2.0/arm64-darwin23
|
|
19
|
+
PATH_SEPARATOR = :
|
|
20
|
+
VPATH = $(srcdir):$(arch_hdrdir)/ruby:$(hdrdir)/ruby
|
|
21
|
+
prefix = $(DESTDIR)/Users/tilo/.rvm/rubies/ruby-3.2.2
|
|
22
|
+
rubysitearchprefix = $(rubylibprefix)/$(sitearch)
|
|
23
|
+
rubyarchprefix = $(rubylibprefix)/$(arch)
|
|
24
|
+
rubylibprefix = $(libdir)/$(RUBY_BASE_NAME)
|
|
25
|
+
exec_prefix = $(prefix)
|
|
26
|
+
vendorarchhdrdir = $(vendorhdrdir)/$(sitearch)
|
|
27
|
+
sitearchhdrdir = $(sitehdrdir)/$(sitearch)
|
|
28
|
+
rubyarchhdrdir = $(rubyhdrdir)/$(arch)
|
|
29
|
+
vendorhdrdir = $(rubyhdrdir)/vendor_ruby
|
|
30
|
+
sitehdrdir = $(rubyhdrdir)/site_ruby
|
|
31
|
+
rubyhdrdir = $(includedir)/$(RUBY_VERSION_NAME)
|
|
32
|
+
vendorarchdir = $(vendorlibdir)/$(sitearch)
|
|
33
|
+
vendorlibdir = $(vendordir)/$(ruby_version)
|
|
34
|
+
vendordir = $(rubylibprefix)/vendor_ruby
|
|
35
|
+
sitearchdir = $(sitelibdir)/$(sitearch)
|
|
36
|
+
sitelibdir = $(sitedir)/$(ruby_version)
|
|
37
|
+
sitedir = $(rubylibprefix)/site_ruby
|
|
38
|
+
rubyarchdir = $(rubylibdir)/$(arch)
|
|
39
|
+
rubylibdir = $(rubylibprefix)/$(ruby_version)
|
|
40
|
+
sitearchincludedir = $(includedir)/$(sitearch)
|
|
41
|
+
archincludedir = $(includedir)/$(arch)
|
|
42
|
+
sitearchlibdir = $(libdir)/$(sitearch)
|
|
43
|
+
archlibdir = $(libdir)/$(arch)
|
|
44
|
+
ridir = $(datarootdir)/$(RI_BASE_NAME)
|
|
45
|
+
mandir = $(datarootdir)/man
|
|
46
|
+
localedir = $(datarootdir)/locale
|
|
47
|
+
libdir = $(exec_prefix)/lib
|
|
48
|
+
psdir = $(docdir)
|
|
49
|
+
pdfdir = $(docdir)
|
|
50
|
+
dvidir = $(docdir)
|
|
51
|
+
htmldir = $(docdir)
|
|
52
|
+
infodir = $(datarootdir)/info
|
|
53
|
+
docdir = $(datarootdir)/doc/$(PACKAGE)
|
|
54
|
+
oldincludedir = $(DESTDIR)/usr/include
|
|
55
|
+
includedir = $(SDKROOT)$(prefix)/include
|
|
56
|
+
runstatedir = $(localstatedir)/run
|
|
57
|
+
localstatedir = $(prefix)/var
|
|
58
|
+
sharedstatedir = $(prefix)/com
|
|
59
|
+
sysconfdir = $(prefix)/etc
|
|
60
|
+
datadir = $(datarootdir)
|
|
61
|
+
datarootdir = $(prefix)/share
|
|
62
|
+
libexecdir = $(exec_prefix)/libexec
|
|
63
|
+
sbindir = $(exec_prefix)/sbin
|
|
64
|
+
bindir = $(exec_prefix)/bin
|
|
65
|
+
archdir = $(rubyarchdir)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
CC_WRAPPER =
|
|
69
|
+
CC = gcc
|
|
70
|
+
CXX = g++
|
|
71
|
+
LIBRUBY = $(LIBRUBY_SO)
|
|
72
|
+
LIBRUBY_A = lib$(RUBY_SO_NAME)-static.a
|
|
73
|
+
LIBRUBYARG_SHARED = -l$(RUBY_SO_NAME)
|
|
74
|
+
LIBRUBYARG_STATIC = -l$(RUBY_SO_NAME)-static -framework CoreFoundation $(MAINLIBS)
|
|
75
|
+
empty =
|
|
76
|
+
OUTFLAG = -o $(empty)
|
|
77
|
+
COUTFLAG = -o $(empty)
|
|
78
|
+
CSRCFLAG = $(empty)
|
|
79
|
+
|
|
80
|
+
RUBY_EXTCONF_H =
|
|
81
|
+
cflags = -fdeclspec $(optflags) $(debugflags) $(warnflags)
|
|
82
|
+
cxxflags =
|
|
83
|
+
optflags = -O3
|
|
84
|
+
debugflags = -ggdb3
|
|
85
|
+
warnflags = -Wall -Wextra -Wextra-tokens -Wdeprecated-declarations -Wdivision-by-zero -Wdiv-by-zero -Wimplicit-function-declaration -Wimplicit-int -Wmisleading-indentation -Wpointer-arith -Wshorten-64-to-32 -Wwrite-strings -Wold-style-definition -Wmissing-noreturn -Wno-cast-function-type -Wno-constant-logical-operand -Wno-long-long -Wno-missing-field-initializers -Wno-overlength-strings -Wno-parentheses-equality -Wno-self-assign -Wno-tautological-compare -Wno-unused-parameter -Wno-unused-value -Wunused-variable -Wundef
|
|
86
|
+
cppflags =
|
|
87
|
+
CCDLFLAGS = -fno-common
|
|
88
|
+
CFLAGS = $(CCDLFLAGS) -O3 -I/opt/homebrew/opt/libyaml/include -I/opt/homebrew/opt/libksba/include -I/opt/homebrew/opt/readline/include -I/opt/homebrew/opt/zlib/include -I/opt/homebrew/opt/openssl@1.1/include $(cflags) -fno-common -pipe $(ARCH_FLAG)
|
|
89
|
+
INCFLAGS = -I. -I$(arch_hdrdir) -I$(hdrdir)/ruby/backward -I$(hdrdir) -I$(srcdir)
|
|
90
|
+
DEFS =
|
|
91
|
+
CPPFLAGS = -D_XOPEN_SOURCE -D_DARWIN_C_SOURCE -D_DARWIN_UNLIMITED_SELECT -D_REENTRANT $(DEFS) $(cppflags)
|
|
92
|
+
CXXFLAGS = $(CCDLFLAGS) -fdeclspec $(ARCH_FLAG)
|
|
93
|
+
ldflags = -L. -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -fstack-protector-strong
|
|
94
|
+
dldflags = -L/opt/homebrew/opt/libyaml/lib -L/opt/homebrew/opt/libksba/lib -L/opt/homebrew/opt/readline/lib -L/opt/homebrew/opt/zlib/lib -L/opt/homebrew/opt/openssl@1.1/lib -Wl,-undefined,dynamic_lookup $(LIBRUBYARG_SHARED)
|
|
95
|
+
ARCH_FLAG =
|
|
96
|
+
DLDFLAGS = $(ldflags) $(dldflags) $(ARCH_FLAG)
|
|
97
|
+
LDSHARED = $(CC) -dynamic -bundle
|
|
98
|
+
LDSHAREDXX = $(CXX) -dynamic -bundle
|
|
99
|
+
AR = ar
|
|
100
|
+
EXEEXT =
|
|
101
|
+
|
|
102
|
+
RUBY_INSTALL_NAME = $(RUBY_BASE_NAME)
|
|
103
|
+
RUBY_SO_NAME = ruby.3.2
|
|
104
|
+
RUBYW_INSTALL_NAME =
|
|
105
|
+
RUBY_VERSION_NAME = $(RUBY_BASE_NAME)-$(ruby_version)
|
|
106
|
+
RUBYW_BASE_NAME = rubyw
|
|
107
|
+
RUBY_BASE_NAME = ruby
|
|
108
|
+
|
|
109
|
+
arch = arm64-darwin23
|
|
110
|
+
sitearch = $(arch)
|
|
111
|
+
ruby_version = 3.2.0
|
|
112
|
+
ruby = $(bindir)/$(RUBY_BASE_NAME)
|
|
113
|
+
RUBY = $(ruby)
|
|
114
|
+
BUILTRUBY = $(bindir)/$(RUBY_BASE_NAME)
|
|
115
|
+
ruby_headers = $(hdrdir)/ruby.h $(hdrdir)/ruby/backward.h $(hdrdir)/ruby/ruby.h $(hdrdir)/ruby/defines.h $(hdrdir)/ruby/missing.h $(hdrdir)/ruby/intern.h $(hdrdir)/ruby/st.h $(hdrdir)/ruby/subst.h $(arch_hdrdir)/ruby/config.h
|
|
116
|
+
|
|
117
|
+
RM = rm -f
|
|
118
|
+
RM_RF = rm -fr
|
|
119
|
+
RMDIRS = rmdir -p
|
|
120
|
+
MAKEDIRS = /opt/homebrew/opt/coreutils/bin/gmkdir -p
|
|
121
|
+
INSTALL = /opt/homebrew/opt/coreutils/bin/ginstall -c
|
|
122
|
+
INSTALL_PROG = $(INSTALL) -m 0755
|
|
123
|
+
INSTALL_DATA = $(INSTALL) -m 644
|
|
124
|
+
COPY = cp
|
|
125
|
+
TOUCH = exit >
|
|
126
|
+
|
|
127
|
+
#### End of system configuration section. ####
|
|
128
|
+
|
|
129
|
+
preload =
|
|
130
|
+
libpath = . $(libdir)
|
|
131
|
+
LIBPATH = -L. -L$(libdir)
|
|
132
|
+
DEFFILE =
|
|
133
|
+
|
|
134
|
+
CLEANFILES = mkmf.log
|
|
135
|
+
DISTCLEANFILES =
|
|
136
|
+
DISTCLEANDIRS =
|
|
137
|
+
|
|
138
|
+
extout =
|
|
139
|
+
extout_prefix =
|
|
140
|
+
target_prefix = /smarter_csv
|
|
141
|
+
LOCAL_LIBS =
|
|
142
|
+
LIBS = $(LIBRUBYARG_SHARED) -lpthread
|
|
143
|
+
ORIG_SRCS = smarter_csv.c
|
|
144
|
+
SRCS = $(ORIG_SRCS)
|
|
145
|
+
OBJS = smarter_csv.o
|
|
146
|
+
HDRS =
|
|
147
|
+
LOCAL_HDRS =
|
|
148
|
+
TARGET = smarter_csv
|
|
149
|
+
TARGET_NAME = smarter_csv
|
|
150
|
+
TARGET_ENTRY = Init_$(TARGET_NAME)
|
|
151
|
+
DLLIB = $(TARGET).bundle
|
|
152
|
+
EXTSTATIC =
|
|
153
|
+
STATIC_LIB =
|
|
154
|
+
|
|
155
|
+
TIMESTAMP_DIR = .
|
|
156
|
+
BINDIR = $(bindir)
|
|
157
|
+
RUBYCOMMONDIR = $(sitedir)$(target_prefix)
|
|
158
|
+
RUBYLIBDIR = $(sitelibdir)$(target_prefix)
|
|
159
|
+
RUBYARCHDIR = $(sitearchdir)$(target_prefix)
|
|
160
|
+
HDRDIR = $(sitehdrdir)$(target_prefix)
|
|
161
|
+
ARCHHDRDIR = $(sitearchhdrdir)$(target_prefix)
|
|
162
|
+
TARGET_SO_DIR =
|
|
163
|
+
TARGET_SO = $(TARGET_SO_DIR)$(DLLIB)
|
|
164
|
+
CLEANLIBS = $(TARGET_SO) $(TARGET_SO).dSYM
|
|
165
|
+
CLEANOBJS = $(OBJS) *.bak
|
|
166
|
+
TARGET_SO_DIR_TIMESTAMP = $(TIMESTAMP_DIR)/.sitearchdir.-.smarter_csv.time
|
|
167
|
+
|
|
168
|
+
all: $(DLLIB)
|
|
169
|
+
static: $(STATIC_LIB)
|
|
170
|
+
.PHONY: all install static install-so install-rb
|
|
171
|
+
.PHONY: clean clean-so clean-static clean-rb
|
|
172
|
+
|
|
173
|
+
clean-static::
|
|
174
|
+
clean-rb-default::
|
|
175
|
+
clean-rb::
|
|
176
|
+
clean-so::
|
|
177
|
+
clean: clean-so clean-static clean-rb-default clean-rb
|
|
178
|
+
-$(Q)$(RM_RF) $(CLEANLIBS) $(CLEANOBJS) $(CLEANFILES) .*.time
|
|
179
|
+
|
|
180
|
+
distclean-rb-default::
|
|
181
|
+
distclean-rb::
|
|
182
|
+
distclean-so::
|
|
183
|
+
distclean-static::
|
|
184
|
+
distclean: clean distclean-so distclean-static distclean-rb-default distclean-rb
|
|
185
|
+
-$(Q)$(RM) Makefile $(RUBY_EXTCONF_H) conftest.* mkmf.log
|
|
186
|
+
-$(Q)$(RM) core ruby$(EXEEXT) *~ $(DISTCLEANFILES)
|
|
187
|
+
-$(Q)$(RMDIRS) $(DISTCLEANDIRS) 2> /dev/null || true
|
|
188
|
+
|
|
189
|
+
realclean: distclean
|
|
190
|
+
install: install-so install-rb
|
|
191
|
+
|
|
192
|
+
install-so: $(DLLIB) $(TARGET_SO_DIR_TIMESTAMP)
|
|
193
|
+
$(INSTALL_PROG) $(DLLIB) $(RUBYARCHDIR)
|
|
194
|
+
clean-static::
|
|
195
|
+
-$(Q)$(RM) $(STATIC_LIB)
|
|
196
|
+
install-rb: pre-install-rb do-install-rb install-rb-default
|
|
197
|
+
install-rb-default: pre-install-rb-default do-install-rb-default
|
|
198
|
+
pre-install-rb: Makefile
|
|
199
|
+
pre-install-rb-default: Makefile
|
|
200
|
+
do-install-rb:
|
|
201
|
+
do-install-rb-default:
|
|
202
|
+
pre-install-rb-default:
|
|
203
|
+
@$(NULLCMD)
|
|
204
|
+
$(TARGET_SO_DIR_TIMESTAMP):
|
|
205
|
+
$(Q) $(MAKEDIRS) $(@D) $(RUBYARCHDIR)
|
|
206
|
+
$(Q) $(TOUCH) $@
|
|
207
|
+
|
|
208
|
+
site-install: site-install-so site-install-rb
|
|
209
|
+
site-install-so: install-so
|
|
210
|
+
site-install-rb: install-rb
|
|
211
|
+
|
|
212
|
+
.SUFFIXES: .c .m .cc .mm .cxx .cpp .o .S
|
|
213
|
+
|
|
214
|
+
.cc.o:
|
|
215
|
+
$(ECHO) compiling $(<)
|
|
216
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
|
217
|
+
|
|
218
|
+
.cc.S:
|
|
219
|
+
$(ECHO) translating $(<)
|
|
220
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
|
221
|
+
|
|
222
|
+
.mm.o:
|
|
223
|
+
$(ECHO) compiling $(<)
|
|
224
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
|
225
|
+
|
|
226
|
+
.mm.S:
|
|
227
|
+
$(ECHO) translating $(<)
|
|
228
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
|
229
|
+
|
|
230
|
+
.cxx.o:
|
|
231
|
+
$(ECHO) compiling $(<)
|
|
232
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
|
233
|
+
|
|
234
|
+
.cxx.S:
|
|
235
|
+
$(ECHO) translating $(<)
|
|
236
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
|
237
|
+
|
|
238
|
+
.cpp.o:
|
|
239
|
+
$(ECHO) compiling $(<)
|
|
240
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
|
241
|
+
|
|
242
|
+
.cpp.S:
|
|
243
|
+
$(ECHO) translating $(<)
|
|
244
|
+
$(Q) $(CXX) $(INCFLAGS) $(CPPFLAGS) $(CXXFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
|
245
|
+
|
|
246
|
+
.c.o:
|
|
247
|
+
$(ECHO) compiling $(<)
|
|
248
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
|
249
|
+
|
|
250
|
+
.c.S:
|
|
251
|
+
$(ECHO) translating $(<)
|
|
252
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
|
253
|
+
|
|
254
|
+
.m.o:
|
|
255
|
+
$(ECHO) compiling $(<)
|
|
256
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -c $(CSRCFLAG)$<
|
|
257
|
+
|
|
258
|
+
.m.S:
|
|
259
|
+
$(ECHO) translating $(<)
|
|
260
|
+
$(Q) $(CC) $(INCFLAGS) $(CPPFLAGS) $(CFLAGS) $(COUTFLAG)$@ -S $(CSRCFLAG)$<
|
|
261
|
+
|
|
262
|
+
$(TARGET_SO): $(OBJS) Makefile
|
|
263
|
+
$(ECHO) linking shared-object smarter_csv/$(DLLIB)
|
|
264
|
+
-$(Q)$(RM) $(@)
|
|
265
|
+
$(Q) $(LDSHARED) -o $@ $(OBJS) $(LIBPATH) $(DLDFLAGS) $(LOCAL_LIBS) $(LIBS)
|
|
266
|
+
$(Q) $(POSTLINK)
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
$(OBJS): $(HDRS) $(ruby_headers)
|
data/ext/smarter_csv/extconf.rb
CHANGED
|
@@ -3,15 +3,6 @@
|
|
|
3
3
|
require 'mkmf'
|
|
4
4
|
require "rbconfig"
|
|
5
5
|
|
|
6
|
-
# On non-MRI Rubies (JRuby, TruffleRuby, ...) there is no C extension to build, and trying to build
|
|
7
|
-
# it breaks `gem install` for anything that depends on smarter_csv. Write a no-op Makefile so install
|
|
8
|
-
# succeeds, then stop. At runtime SmarterCSV falls back to its pure-Ruby parser (it checks whether the
|
|
9
|
-
# C functions actually loaded via respond_to?(:parse_csv_line_c)).
|
|
10
|
-
if RUBY_ENGINE != 'ruby'
|
|
11
|
-
File.write('Makefile', dummy_makefile($srcdir).join)
|
|
12
|
-
exit 0
|
|
13
|
-
end
|
|
14
|
-
|
|
15
6
|
if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
|
|
16
7
|
fixed_CFLAGS = RbConfig::MAKEFILE_CONFIG["CFLAGS"].sub("-g -O3", "-O3 $(cflags)")
|
|
17
8
|
puts("Fix CFLAGS: #{RbConfig::MAKEFILE_CONFIG["CFLAGS"]} -> #{fixed_CFLAGS}")
|
|
@@ -304,40 +304,25 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
|
304
304
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
305
305
|
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
306
306
|
if (in_quotes) {
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
* "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
314
|
-
* behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
315
|
-
* the final quote may still close the field instead of turning the
|
|
316
|
-
* row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
317
|
-
* precedence for ..."",... (more content follows), but we keep the
|
|
318
|
-
* historical leniency for terminal ..."". */
|
|
319
|
-
p++;
|
|
320
|
-
} else {
|
|
321
|
-
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
322
|
-
bool valid_close = (p + 1 >= endP);
|
|
323
|
-
if (!valid_close) {
|
|
324
|
-
valid_close = true;
|
|
325
|
-
for (long j = 0; j < col_sep_len; j++) {
|
|
326
|
-
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
327
|
-
}
|
|
328
|
-
}
|
|
329
|
-
if (!valid_close && row_sep_len > 0) {
|
|
330
|
-
valid_close = true;
|
|
331
|
-
for (long j = 0; j < row_sep_len; j++) {
|
|
332
|
-
if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
|
|
333
|
-
}
|
|
307
|
+
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
308
|
+
bool valid_close = (p + 1 >= endP);
|
|
309
|
+
if (!valid_close) {
|
|
310
|
+
valid_close = true;
|
|
311
|
+
for (long j = 0; j < col_sep_len; j++) {
|
|
312
|
+
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
334
313
|
}
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
314
|
+
}
|
|
315
|
+
if (!valid_close && row_sep_len > 0) {
|
|
316
|
+
valid_close = true;
|
|
317
|
+
for (long j = 0; j < row_sep_len; j++) {
|
|
318
|
+
if (*(p + 1 + j) != *(row_sepP + j)) { valid_close = false; break; }
|
|
338
319
|
}
|
|
339
|
-
// else: quote inside quoted field → literal
|
|
340
320
|
}
|
|
321
|
+
if (valid_close) {
|
|
322
|
+
in_quotes = false;
|
|
323
|
+
field_started = true;
|
|
324
|
+
}
|
|
325
|
+
// else: quote inside quoted field → literal (handles "" doubling)
|
|
341
326
|
} else if (!field_started) {
|
|
342
327
|
in_quotes = true; // opening quote at field boundary
|
|
343
328
|
field_started = true;
|
|
@@ -778,11 +763,6 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
778
763
|
* the frame stays well below 4 KB and ___chkstk_darwin never fires on ARM64 macOS.
|
|
779
764
|
*/
|
|
780
765
|
bool *keep_bitmap = NULL;
|
|
781
|
-
/* In THIS (non-ctx) function the bitmap is alloca'd to headers_len on every call (see the alloca
|
|
782
|
-
* sites below), so keep_bitmap[] is exactly headers_len long and headers_len is the correct bound
|
|
783
|
-
* at all access sites. Do NOT mirror rb_parse_line_to_hash_ctx's keep_bitmap_len here: that variant
|
|
784
|
-
* caches its bitmap across rows (where @headers can grow), so it must use the captured length; this
|
|
785
|
-
* one rebuilds per call and does not. */
|
|
786
766
|
bool keep_extra_columns = true; /* extra cols (> headers_len): keep by default */
|
|
787
767
|
bool has_only = false; /* true when only_headers: filtering is active */
|
|
788
768
|
long early_exit_after = -1; /* column index after which we stop; -1 = no early exit */
|
|
@@ -1101,40 +1081,25 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1101
1081
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
1102
1082
|
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
1103
1083
|
if (in_quotes) {
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
* "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
1111
|
-
* behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
1112
|
-
* the final quote may still close the field instead of turning the
|
|
1113
|
-
* row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
1114
|
-
* precedence for ..."",... (more content follows), but we keep the
|
|
1115
|
-
* historical leniency for terminal ..."". */
|
|
1116
|
-
p++;
|
|
1117
|
-
} else {
|
|
1118
|
-
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
1119
|
-
bool valid_close = (p + 1 >= endP);
|
|
1120
|
-
if (!valid_close) {
|
|
1121
|
-
valid_close = true;
|
|
1122
|
-
for (long j = 0; j < col_sep_len; j++) {
|
|
1123
|
-
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1124
|
-
}
|
|
1125
|
-
}
|
|
1126
|
-
if (!valid_close && row_sep_len2 > 0) {
|
|
1127
|
-
valid_close = true;
|
|
1128
|
-
for (long j = 0; j < row_sep_len2; j++) {
|
|
1129
|
-
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1130
|
-
}
|
|
1084
|
+
// closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
1085
|
+
bool valid_close = (p + 1 >= endP);
|
|
1086
|
+
if (!valid_close) {
|
|
1087
|
+
valid_close = true;
|
|
1088
|
+
for (long j = 0; j < col_sep_len; j++) {
|
|
1089
|
+
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1131
1090
|
}
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1091
|
+
}
|
|
1092
|
+
if (!valid_close && row_sep_len2 > 0) {
|
|
1093
|
+
valid_close = true;
|
|
1094
|
+
for (long j = 0; j < row_sep_len2; j++) {
|
|
1095
|
+
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1135
1096
|
}
|
|
1136
|
-
// else: quote inside quoted field → literal
|
|
1137
1097
|
}
|
|
1098
|
+
if (valid_close) {
|
|
1099
|
+
in_quotes = false;
|
|
1100
|
+
field_started = true;
|
|
1101
|
+
}
|
|
1102
|
+
// else: quote inside quoted field → literal (handles "" doubling)
|
|
1138
1103
|
} else if (!field_started) {
|
|
1139
1104
|
in_quotes = true; // opening quote at field boundary
|
|
1140
1105
|
field_started = true;
|
|
@@ -1211,20 +1176,12 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash(VALUE self, VALUE line,
|
|
|
1211
1176
|
* return nil instead of the hash so the row can be skipped.
|
|
1212
1177
|
* With lazy allocation, if all_blank is true, xform.hash is still Qnil —
|
|
1213
1178
|
* no hash was ever allocated.
|
|
1214
|
-
*
|
|
1215
|
-
* If remove_empty_hashes is disabled, preserve the row as an empty hash.
|
|
1216
|
-
* This keeps parity with the Ruby path without adding any cost to the
|
|
1217
|
-
* normal non-blank hot path.
|
|
1218
1179
|
*/
|
|
1219
|
-
if (all_blank) {
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
return result;
|
|
1225
|
-
}
|
|
1226
|
-
|
|
1227
|
-
ensure_hash_allocated(&xform);
|
|
1180
|
+
if (remove_empty && all_blank) {
|
|
1181
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1182
|
+
rb_ary_push(result, Qnil);
|
|
1183
|
+
rb_ary_push(result, LONG2FIX(element_count));
|
|
1184
|
+
return result;
|
|
1228
1185
|
}
|
|
1229
1186
|
|
|
1230
1187
|
/* ----------------------------------------
|
|
@@ -1464,14 +1421,6 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1464
1421
|
int numeric_mode = ctx->numeric_mode;
|
|
1465
1422
|
VALUE numeric_keys = ctx->numeric_keys;
|
|
1466
1423
|
bool *keep_bitmap = ctx->keep_bitmap;
|
|
1467
|
-
/* keep_bitmap is cached in the context (xmalloc'd once at construction, sized to the header count
|
|
1468
|
-
* THEN). @headers can grow in place as undeclared extra columns appear, so the live headers_len
|
|
1469
|
-
* (re-read each call below) may exceed the bitmap's length. Every keep_bitmap[] access in this
|
|
1470
|
-
* function MUST be bounded by keep_bitmap_len, never headers_len — indices past the bitmap are
|
|
1471
|
-
* extra columns and follow keep_extra_columns. Bounding by the grown headers_len was an
|
|
1472
|
-
* out-of-bounds heap read (the bug). The sibling rb_parse_line_to_hash safely uses headers_len
|
|
1473
|
-
* because it re-allocs its bitmap to headers_len on every call. */
|
|
1474
|
-
long keep_bitmap_len = ctx->keep_bitmap_len;
|
|
1475
1424
|
bool keep_extra_columns = ctx->keep_extra_columns;
|
|
1476
1425
|
long early_exit_after = ctx->early_exit_after;
|
|
1477
1426
|
|
|
@@ -1573,7 +1522,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1573
1522
|
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1574
1523
|
}
|
|
1575
1524
|
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1576
|
-
if (!keep_bitmap || (element_count <
|
|
1525
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1577
1526
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1578
1527
|
all_blank = false;
|
|
1579
1528
|
}
|
|
@@ -1594,7 +1543,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1594
1543
|
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
|
1595
1544
|
}
|
|
1596
1545
|
long trimmed_len = (trim_end >= trim_start) ? (trim_end - trim_start + 1) : 0;
|
|
1597
|
-
if (!keep_bitmap || (element_count <
|
|
1546
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1598
1547
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, false, quote_char_val, encoding))
|
|
1599
1548
|
all_blank = false;
|
|
1600
1549
|
}
|
|
@@ -1657,7 +1606,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1657
1606
|
|
|
1658
1607
|
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1659
1608
|
|
|
1660
|
-
if (!keep_bitmap || (element_count <
|
|
1609
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1661
1610
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1662
1611
|
all_blank = false;
|
|
1663
1612
|
}
|
|
@@ -1691,40 +1640,25 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1691
1640
|
if (!allow_escaped_quotes || backslash_count % 2 == 0) {
|
|
1692
1641
|
if (__builtin_expect(quote_boundary_standard, 1)) {
|
|
1693
1642
|
if (in_quotes) {
|
|
1694
|
-
|
|
1695
|
-
|
|
1696
|
-
|
|
1697
|
-
|
|
1698
|
-
|
|
1699
|
-
|
|
1700
|
-
* "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
1701
|
-
* behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
1702
|
-
* the final quote may still close the field instead of turning the
|
|
1703
|
-
* row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
1704
|
-
* precedence for ..."",... (more content follows), but we keep the
|
|
1705
|
-
* historical leniency for terminal ..."". */
|
|
1706
|
-
p++;
|
|
1707
|
-
} else {
|
|
1708
|
-
/* closing quote: only valid if followed by col_sep, row_sep, or end */
|
|
1709
|
-
bool valid_close = (p + 1 >= endP);
|
|
1710
|
-
if (!valid_close) {
|
|
1711
|
-
valid_close = true;
|
|
1712
|
-
for (long j = 0; j < col_sep_len; j++) {
|
|
1713
|
-
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1714
|
-
}
|
|
1715
|
-
}
|
|
1716
|
-
if (!valid_close && row_sep_len2 > 0) {
|
|
1717
|
-
valid_close = true;
|
|
1718
|
-
for (long j = 0; j < row_sep_len2; j++) {
|
|
1719
|
-
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1720
|
-
}
|
|
1643
|
+
/* closing quote: only valid if followed by col_sep, row_sep, or end */
|
|
1644
|
+
bool valid_close = (p + 1 >= endP);
|
|
1645
|
+
if (!valid_close) {
|
|
1646
|
+
valid_close = true;
|
|
1647
|
+
for (long j = 0; j < col_sep_len; j++) {
|
|
1648
|
+
if (*(p + 1 + j) != *(col_sepP + j)) { valid_close = false; break; }
|
|
1721
1649
|
}
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1650
|
+
}
|
|
1651
|
+
if (!valid_close && row_sep_len2 > 0) {
|
|
1652
|
+
valid_close = true;
|
|
1653
|
+
for (long j = 0; j < row_sep_len2; j++) {
|
|
1654
|
+
if (*(p + 1 + j) != *(row_sepP2 + j)) { valid_close = false; break; }
|
|
1725
1655
|
}
|
|
1726
|
-
/* else: quote inside quoted field → literal */
|
|
1727
1656
|
}
|
|
1657
|
+
if (valid_close) {
|
|
1658
|
+
in_quotes = false;
|
|
1659
|
+
field_started = true;
|
|
1660
|
+
}
|
|
1661
|
+
/* else: quote inside quoted field → literal (handles "" doubling) */
|
|
1728
1662
|
} else if (!field_started) {
|
|
1729
1663
|
in_quotes = true; /* opening quote at field boundary */
|
|
1730
1664
|
field_started = true;
|
|
@@ -1783,7 +1717,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1783
1717
|
|
|
1784
1718
|
bool has_embedded_quotes = quoted || (trimmed_len > 0 && memchr(trim_start, quote_char_val, trimmed_len));
|
|
1785
1719
|
|
|
1786
|
-
if (!keep_bitmap || (element_count <
|
|
1720
|
+
if (!keep_bitmap || (element_count < headers_len ? keep_bitmap[element_count] : keep_extra_columns)) {
|
|
1787
1721
|
if (insert_field_into_hash(&xform, trim_start, trimmed_len, element_count, has_embedded_quotes, quote_char_val, encoding))
|
|
1788
1722
|
all_blank = false;
|
|
1789
1723
|
}
|
|
@@ -1794,15 +1728,11 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1794
1728
|
/* ----------------------------------------
|
|
1795
1729
|
* SECTION 6: Handle blank rows
|
|
1796
1730
|
* ---------------------------------------- */
|
|
1797
|
-
if (all_blank) {
|
|
1798
|
-
|
|
1799
|
-
|
|
1800
|
-
|
|
1801
|
-
|
|
1802
|
-
return result;
|
|
1803
|
-
}
|
|
1804
|
-
|
|
1805
|
-
ensure_hash_allocated(&xform);
|
|
1731
|
+
if (remove_empty && all_blank) {
|
|
1732
|
+
VALUE result = rb_ary_new_capa(2);
|
|
1733
|
+
rb_ary_push(result, Qnil);
|
|
1734
|
+
rb_ary_push(result, LONG2FIX(element_count));
|
|
1735
|
+
return result;
|
|
1806
1736
|
}
|
|
1807
1737
|
|
|
1808
1738
|
/* ----------------------------------------
|
|
@@ -1811,7 +1741,7 @@ __attribute__((hot)) static VALUE rb_parse_line_to_hash_ctx(VALUE self, VALUE li
|
|
|
1811
1741
|
if (!remove_empty_values) {
|
|
1812
1742
|
ensure_hash_allocated(&xform);
|
|
1813
1743
|
for (long i = element_count; i < headers_len; i++) {
|
|
1814
|
-
if (!keep_bitmap ||
|
|
1744
|
+
if (!keep_bitmap || keep_bitmap[i]) {
|
|
1815
1745
|
rb_hash_aset(xform.hash, rb_ary_entry(headers, i), Qnil);
|
|
1816
1746
|
}
|
|
1817
1747
|
}
|