smarter_csv 1.14.2 → 1.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -1
- data/CONTRIBUTORS.md +1 -0
- data/ext/smarter_csv/extconf.rb +3 -1
- data/ext/smarter_csv/smarter_csv.c +159 -35
- data/lib/smarter_csv/auto_detection.rb +1 -1
- data/lib/smarter_csv/parser.rb +14 -7
- data/lib/smarter_csv/reader.rb +1 -2
- data/lib/smarter_csv/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a84cfa57008a6f9f05ee82eeef9edb4dab5993874a38c456ce78d8da999280aa
|
4
|
+
data.tar.gz: 7e8569670615a6ff7fb63c152d8e849069bcde318edf885d95cef52b5513f52c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d1baa73a0120824390f062e38d3acab8dfc7a09d48fbbdd647466605ccd543ca839e61cbefca9929d2dafce9a61b12e50b62d9e034aadd047a46e1886728980
|
7
|
+
data.tar.gz: 9756f2fdd15e619ba98011370a3410781b7771edd0656dd1bcb1226ebf899726ec00f43627dcc16756d986a962e9eb3e84c7e901ef5aecf12ede9f077f7a8423
|
data/CHANGELOG.md
CHANGED
@@ -1,8 +1,19 @@
|
|
1
1
|
|
2
2
|
# SmarterCSV 1.x Change Log
|
3
3
|
|
4
|
+
## 1.14.4 (2025-05-26)
|
5
|
+
* Bugfix: SmarterCSV::Reader fixing issue with header containing spaces ([PR 305](https://github.com/tilo/smarter_csv/pull/305) thanks to Felipe Cabezudo)
|
6
|
+
|
7
|
+
## 1.14.3 (2025-05-04)
|
8
|
+
* Improved C-extension parsing logic:
|
9
|
+
- Added fast path for unquoted fields to avoid unnecessary quote checks.
|
10
|
+
- Added inline whitespace stripping inside the C parser
|
11
|
+
* Performance
|
12
|
+
- Significantly reduced per-line overhead in non-quoted, wide CSVs (e.g. fixed-width data exports).
|
13
|
+
- Benchmarks show ~10–40% speedup over v1.14.2 depending on structure and quoting.
|
14
|
+
|
4
15
|
## 1.14.2 (2025-04-10)
|
5
|
-
* bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true`
|
16
|
+
* bugfix: SmarterCSV::Writer fixing corner case with `quote_headers: true` ([issue 301](https://github.com/tilo/smarter_csv/issues/301))
|
6
17
|
* new option: `header_converter` allows to programmatically modify the headers
|
7
18
|
|
8
19
|
## 1.14.1 (2025-04-09)
|
data/CONTRIBUTORS.md
CHANGED
@@ -59,3 +59,4 @@ A Big Thank you to everyone who filed issues, sent comments, and who contributed
|
|
59
59
|
* [Randall B](https://github.com/randall-coding)
|
60
60
|
* [Matthew Kennedy](https://github.com/MattKitmanLabs)
|
61
61
|
* [Robert Reiz](https://github.com/reiz)
|
62
|
+
* [Felipe Cabezudo](https://github.com/felipekb)
|
data/ext/smarter_csv/extconf.rb
CHANGED
@@ -9,6 +9,8 @@ if RbConfig::MAKEFILE_CONFIG["CFLAGS"].include?("-g -O3")
|
|
9
9
|
RbConfig::MAKEFILE_CONFIG["CFLAGS"] = fixed_CFLAGS
|
10
10
|
end
|
11
11
|
|
12
|
-
CONFIG["optflags"] = "-O3"
|
12
|
+
# CONFIG["optflags"] = "-O3 -march=native -flto"
|
13
|
+
CONFIG["optflags"] = "-O3 -march=native -flto -fomit-frame-pointer -DNDEBUG"
|
14
|
+
CONFIG["debugflags"] = ""
|
13
15
|
|
14
16
|
create_makefile('smarter_csv/smarter_csv')
|
@@ -2,6 +2,7 @@
|
|
2
2
|
#include "ruby/encoding.h"
|
3
3
|
#include <stdio.h>
|
4
4
|
#include <stdbool.h>
|
5
|
+
#include <string.h>
|
5
6
|
|
6
7
|
#ifndef bool
|
7
8
|
#define bool int
|
@@ -12,8 +13,25 @@
|
|
12
13
|
VALUE SmarterCSV = Qnil;
|
13
14
|
VALUE eMalformedCSVError = Qnil;
|
14
15
|
VALUE Parser = Qnil;
|
16
|
+
VALUE Qempty_string = Qnil; // shared frozen empty string
|
17
|
+
|
18
|
+
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
|
19
|
+
char *buf = ALLOC_N(char, len);
|
20
|
+
long j = 0;
|
21
|
+
for (long i = 0; i < len; i++) {
|
22
|
+
if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
|
23
|
+
buf[j++] = quote_char;
|
24
|
+
i++; // skip second quote
|
25
|
+
} else {
|
26
|
+
buf[j++] = str[i];
|
27
|
+
}
|
28
|
+
}
|
29
|
+
VALUE out = rb_enc_str_new(buf, j, encoding);
|
30
|
+
xfree(buf);
|
31
|
+
return out;
|
32
|
+
}
|
15
33
|
|
16
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
|
34
|
+
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
|
17
35
|
if (RB_TYPE_P(line, T_NIL) == 1) {
|
18
36
|
return rb_ary_new();
|
19
37
|
}
|
@@ -22,74 +40,180 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
22
40
|
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
23
41
|
}
|
24
42
|
|
25
|
-
rb_encoding *encoding = rb_enc_get(line);
|
26
|
-
char *startP = RSTRING_PTR(line);
|
43
|
+
rb_encoding *encoding = rb_enc_get(line);
|
44
|
+
char *startP = RSTRING_PTR(line);
|
27
45
|
long line_len = RSTRING_LEN(line);
|
28
|
-
char *endP = startP + line_len;
|
46
|
+
char *endP = startP + line_len;
|
29
47
|
char *p = startP;
|
30
48
|
|
31
49
|
char *col_sepP = RSTRING_PTR(col_sep);
|
32
50
|
long col_sep_len = RSTRING_LEN(col_sep);
|
33
51
|
|
34
52
|
char *quoteP = RSTRING_PTR(quote_char);
|
35
|
-
|
36
|
-
|
37
|
-
bool col_sep_found = true;
|
53
|
+
char quote_char_val = quoteP[0];
|
54
|
+
size_t quote_len = strlen(quoteP);
|
38
55
|
|
39
56
|
VALUE elements = rb_ary_new();
|
40
57
|
VALUE field;
|
41
|
-
long i;
|
42
58
|
|
43
|
-
|
59
|
+
long element_count = 0;
|
60
|
+
int max_fields = -1;
|
61
|
+
if (max_size != Qnil) {
|
62
|
+
max_fields = NUM2INT(max_size);
|
63
|
+
if (max_fields < 0) {
|
64
|
+
return rb_ary_new();
|
65
|
+
}
|
66
|
+
}
|
67
|
+
|
68
|
+
bool has_quotes = RTEST(has_quotes_val);
|
69
|
+
bool strip_ws = RTEST(strip_ws_val);
|
70
|
+
|
71
|
+
// === FAST PATH: No quotes and single-character separator ===
|
72
|
+
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
73
|
+
char sep = *col_sepP;
|
74
|
+
char *sep_pos = NULL;
|
75
|
+
|
76
|
+
while ((sep_pos = memchr(p, sep, endP - p))) {
|
77
|
+
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
78
|
+
break;
|
79
|
+
}
|
80
|
+
|
81
|
+
long field_len = sep_pos - startP;
|
82
|
+
char *raw_field = startP;
|
83
|
+
char *trim_start = raw_field;
|
84
|
+
char *trim_end = raw_field + field_len - 1;
|
85
|
+
|
86
|
+
if (strip_ws) {
|
87
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
88
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
89
|
+
}
|
90
|
+
|
91
|
+
long trimmed_len = trim_end - trim_start + 1;
|
92
|
+
|
93
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
94
|
+
rb_ary_push(elements, field);
|
95
|
+
element_count++;
|
96
|
+
|
97
|
+
p = sep_pos + 1;
|
98
|
+
startP = p;
|
99
|
+
}
|
100
|
+
|
101
|
+
if ((max_fields < 0) || (element_count < max_fields)) {
|
102
|
+
long field_len = endP - startP;
|
103
|
+
char *raw_field = startP;
|
104
|
+
char *trim_start = raw_field;
|
105
|
+
char *trim_end = raw_field + field_len - 1;
|
106
|
+
|
107
|
+
if (strip_ws) {
|
108
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
109
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
110
|
+
}
|
111
|
+
|
112
|
+
long trimmed_len = trim_end - trim_start + 1;
|
113
|
+
|
114
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
115
|
+
rb_ary_push(elements, field);
|
116
|
+
}
|
117
|
+
|
118
|
+
return elements;
|
119
|
+
}
|
120
|
+
|
121
|
+
// === SLOW PATH: Quoted fields or multi-char separator ===
|
122
|
+
long i;
|
44
123
|
long backslash_count = 0;
|
45
124
|
bool in_quotes = false;
|
125
|
+
bool col_sep_found = true;
|
46
126
|
|
47
127
|
while (p < endP) {
|
48
|
-
/* does the remaining string start with col_sep ? */
|
49
128
|
col_sep_found = true;
|
50
|
-
for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
|
51
|
-
|
129
|
+
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
130
|
+
if (*(p + i) != *(col_sepP + i)) {
|
131
|
+
col_sep_found = false;
|
132
|
+
break;
|
133
|
+
}
|
52
134
|
}
|
53
|
-
|
135
|
+
|
54
136
|
if (col_sep_found && !in_quotes) {
|
55
|
-
|
56
|
-
if ((max_size != Qnil) && RARRAY_LEN(elements) >= NUM2INT(max_size)) {
|
137
|
+
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
57
138
|
break;
|
58
|
-
}
|
59
|
-
|
60
|
-
|
61
|
-
|
139
|
+
}
|
140
|
+
|
141
|
+
long field_len = p - startP;
|
142
|
+
char *raw_field = startP;
|
62
143
|
|
63
|
-
|
64
|
-
|
65
|
-
|
144
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
145
|
+
if (quoted) {
|
146
|
+
raw_field++;
|
147
|
+
field_len -= 2;
|
148
|
+
}
|
149
|
+
|
150
|
+
char *trim_start = raw_field;
|
151
|
+
char *trim_end = raw_field + field_len - 1;
|
152
|
+
|
153
|
+
if (strip_ws) {
|
154
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
155
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
156
|
+
}
|
157
|
+
|
158
|
+
long trimmed_len = trim_end - trim_start + 1;
|
159
|
+
|
160
|
+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
161
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
162
|
+
} else {
|
163
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
66
164
|
}
|
165
|
+
|
166
|
+
rb_ary_push(elements, field);
|
167
|
+
element_count++;
|
168
|
+
|
169
|
+
p += col_sep_len;
|
170
|
+
startP = p;
|
171
|
+
backslash_count = 0;
|
67
172
|
} else {
|
68
173
|
if (*p == '\\') {
|
69
174
|
backslash_count++;
|
70
175
|
} else {
|
71
|
-
if (*p ==
|
176
|
+
if (*p == quote_char_val) {
|
72
177
|
if (backslash_count % 2 == 0) {
|
73
|
-
/* Even number of backslashes means quote is not escaped */
|
74
178
|
in_quotes = !in_quotes;
|
75
179
|
}
|
76
|
-
/* Else, quote is escaped; do nothing */
|
77
180
|
}
|
78
|
-
backslash_count = 0;
|
181
|
+
backslash_count = 0;
|
79
182
|
}
|
80
183
|
p++;
|
81
184
|
}
|
82
|
-
}
|
185
|
+
}
|
83
186
|
|
84
|
-
/* Check for unclosed quotes at the end of the line */
|
85
187
|
if (in_quotes) {
|
86
188
|
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
87
189
|
}
|
88
190
|
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
191
|
+
if ((max_fields < 0) || (element_count < max_fields)) {
|
192
|
+
long field_len = endP - startP;
|
193
|
+
char *raw_field = startP;
|
194
|
+
|
195
|
+
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
196
|
+
if (quoted) {
|
197
|
+
raw_field++;
|
198
|
+
field_len -= 2;
|
199
|
+
}
|
200
|
+
|
201
|
+
char *trim_start = raw_field;
|
202
|
+
char *trim_end = raw_field + field_len - 1;
|
203
|
+
|
204
|
+
if (strip_ws) {
|
205
|
+
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
206
|
+
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
207
|
+
}
|
208
|
+
|
209
|
+
long trimmed_len = trim_end - trim_start + 1;
|
210
|
+
|
211
|
+
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
212
|
+
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
213
|
+
} else {
|
214
|
+
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
215
|
+
}
|
216
|
+
|
93
217
|
rb_ary_push(elements, field);
|
94
218
|
}
|
95
219
|
|
@@ -97,10 +221,10 @@ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quot
|
|
97
221
|
}
|
98
222
|
|
99
223
|
void Init_smarter_csv(void) {
|
100
|
-
// these modules and the error class are already defined in Ruby code, make them accessible:
|
101
224
|
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
102
225
|
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
103
226
|
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
|
104
|
-
|
105
|
-
|
227
|
+
Qempty_string = rb_str_new_literal("");
|
228
|
+
rb_gc_register_address(&Qempty_string);
|
229
|
+
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
|
106
230
|
}
|
data/lib/smarter_csv/parser.rb
CHANGED
@@ -2,6 +2,8 @@
|
|
2
2
|
|
3
3
|
module SmarterCSV
|
4
4
|
module Parser
|
5
|
+
EMPTY_STRING = ''.freeze
|
6
|
+
|
5
7
|
protected
|
6
8
|
|
7
9
|
###
|
@@ -11,17 +13,16 @@ module SmarterCSV
|
|
11
13
|
###
|
12
14
|
def parse(line, options, header_size = nil)
|
13
15
|
# puts "SmarterCSV.parse OPTIONS: #{options[:acceleration]}" if options[:verbose]
|
16
|
+
has_quotes = line.include?(options[:quote_char])
|
14
17
|
|
15
18
|
if options[:acceleration] && has_acceleration
|
16
19
|
# :nocov:
|
17
|
-
|
18
|
-
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size)
|
19
|
-
elements.map!{|x| cleanup_quotes(x, options[:quote_char])} if has_quotes
|
20
|
+
elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace])
|
20
21
|
[elements, elements.size]
|
21
22
|
# :nocov:
|
22
23
|
else
|
23
24
|
# puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
|
24
|
-
parse_csv_line_ruby(line, options, header_size)
|
25
|
+
parse_csv_line_ruby(line, options, header_size, has_quotes)
|
25
26
|
end
|
26
27
|
end
|
27
28
|
|
@@ -46,7 +47,7 @@ module SmarterCSV
|
|
46
47
|
#
|
47
48
|
# Our convention is that empty fields are returned as empty strings, not as nil.
|
48
49
|
|
49
|
-
def parse_csv_line_ruby(line, options, header_size = nil)
|
50
|
+
def parse_csv_line_ruby(line, options, header_size = nil, has_quotes = false)
|
50
51
|
return [[], 0] if line.nil?
|
51
52
|
|
52
53
|
line_size = line.size
|
@@ -98,11 +99,13 @@ module SmarterCSV
|
|
98
99
|
elements << cleanup_quotes(line[start..-1], quote)
|
99
100
|
end
|
100
101
|
|
102
|
+
elements.map!(&:strip) if options[:strip_whitespace]
|
101
103
|
[elements, elements.size]
|
102
104
|
end
|
103
105
|
|
104
106
|
def cleanup_quotes(field, quote)
|
105
|
-
return
|
107
|
+
return nil if field.nil?
|
108
|
+
return EMPTY_STRING if field.empty?
|
106
109
|
|
107
110
|
# Remove surrounding quotes if present
|
108
111
|
if field.start_with?(quote) && field.end_with?(quote)
|
@@ -110,9 +113,13 @@ module SmarterCSV
|
|
110
113
|
end
|
111
114
|
|
112
115
|
# Replace double quotes with a single quote
|
113
|
-
field.gsub!((quote
|
116
|
+
field.gsub!(doubled_quote(quote), quote)
|
114
117
|
|
115
118
|
field
|
116
119
|
end
|
120
|
+
|
121
|
+
def doubled_quote(quote)
|
122
|
+
@doubled_quote ||= (quote * 2).to_s.freeze
|
123
|
+
end
|
117
124
|
end
|
118
125
|
end
|
data/lib/smarter_csv/reader.rb
CHANGED
@@ -128,6 +128,7 @@ module SmarterCSV
|
|
128
128
|
line.chomp!(options[:row_sep])
|
129
129
|
|
130
130
|
# --- SPLIT LINE & DATA TRANSFORMATIONS ------------------------------------------------------------
|
131
|
+
# we are now stripping whitespace inside the parse() methods
|
131
132
|
dataA, data_size = parse(line, options) # we parse the extra columns
|
132
133
|
|
133
134
|
if options[:strict]
|
@@ -141,8 +142,6 @@ module SmarterCSV
|
|
141
142
|
end
|
142
143
|
end
|
143
144
|
|
144
|
-
dataA.map!{|x| x.strip} if options[:strip_whitespace]
|
145
|
-
|
146
145
|
# if all values are blank, then ignore this line
|
147
146
|
next if options[:remove_empty_hashes] && (dataA.empty? || blank?(dataA))
|
148
147
|
|
data/lib/smarter_csv/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: smarter_csv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.14.2
|
4
|
+
version: 1.14.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Tilo Sloboda
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-05-29 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: awesome_print
|