smarter_csv 1.14.3 → 1.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/CONTRIBUTORS.md +1 -0
- data/lib/smarter_csv/auto_detection.rb +1 -1
- data/lib/smarter_csv/version.rb +1 -1
- metadata +2 -14
- data/ext/smarter_csv/Makefile +0 -270
- data/ext/smarter_csv/smarter_csv.c.works +0 -185
- data/ext/smarter_csv/smarter_csv.c.works10 +0 -199
- data/ext/smarter_csv/smarter_csv.c.works11 +0 -189
- data/ext/smarter_csv/smarter_csv.c.works14 +0 -230
- data/ext/smarter_csv/smarter_csv.c.works15 +0 -230
- data/ext/smarter_csv/smarter_csv.c.works2 +0 -192
- data/ext/smarter_csv/smarter_csv.c.works4 +0 -190
- data/ext/smarter_csv/smarter_csv.c.works5 +0 -203
- data/ext/smarter_csv/smarter_csv.c.works7 +0 -217
- data/ext/smarter_csv/smarter_csv.c.works8 +0 -193
- data/ext/smarter_csv/smarter_csv.c.works9 +0 -196
@@ -1,199 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "ruby/encoding.h"
|
3
|
-
#include <stdio.h>
|
4
|
-
#include <stdbool.h>
|
5
|
-
#include <string.h>
|
6
|
-
|
7
|
-
#ifndef bool
|
8
|
-
#define bool int
|
9
|
-
#define false ((bool)0)
|
10
|
-
#define true ((bool)1)
|
11
|
-
#endif
|
12
|
-
|
13
|
-
/* Handles to Ruby-side objects. All start out as nil and are assigned in
 * Init_smarter_csv(). */
VALUE SmarterCSV = Qnil;         /* looked up as the top-level constant "SmarterCSV" */
VALUE eMalformedCSVError = Qnil; /* looked up as SmarterCSV::MalformedCSV; raised on unclosed quotes */
VALUE Parser = Qnil;             /* looked up as SmarterCSV::Parser; receives parse_csv_line_c */
VALUE Qempty_string = Qnil;      /* meant to be a shared frozen empty string */
|
17
|
-
|
18
|
-
/*
 * Copies the `len` bytes starting at `str` into a new Ruby string tagged with
 * `encoding`, collapsing every pair of adjacent `quote_char` bytes into a
 * single one. A lone quote byte is copied unchanged.
 *
 * The output is written directly into the Ruby string's own buffer, so there
 * is no temporary heap buffer that could leak if string allocation raises.
 *
 * Returns the new Ruby string.
 */
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
  /* The result is never longer than the input, so `len` bytes are enough. */
  VALUE out = rb_enc_str_new(NULL, len, encoding);
  char *dst = RSTRING_PTR(out);
  long j = 0;

  for (long i = 0; i < len; i++) {
    if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
      i++; /* doubled quote: skip the first one, the second is copied below */
    }
    dst[j++] = str[i];
  }

  /* Shrink to the number of bytes actually written. */
  rb_str_set_len(out, j);
  return out;
}
|
33
|
-
|
34
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
|
35
|
-
if (RB_TYPE_P(line, T_NIL) == 1) {
|
36
|
-
return rb_ary_new();
|
37
|
-
}
|
38
|
-
|
39
|
-
if (RB_TYPE_P(line, T_STRING) != 1) {
|
40
|
-
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
41
|
-
}
|
42
|
-
|
43
|
-
rb_encoding *encoding = rb_enc_get(line);
|
44
|
-
char *startP = RSTRING_PTR(line);
|
45
|
-
long line_len = RSTRING_LEN(line);
|
46
|
-
char *endP = startP + line_len;
|
47
|
-
char *p = startP;
|
48
|
-
|
49
|
-
char *col_sepP = RSTRING_PTR(col_sep);
|
50
|
-
long col_sep_len = RSTRING_LEN(col_sep);
|
51
|
-
|
52
|
-
char *quoteP = RSTRING_PTR(quote_char);
|
53
|
-
char quote_char_val = quoteP[0];
|
54
|
-
size_t quote_len = strlen(quoteP);
|
55
|
-
|
56
|
-
VALUE elements = rb_ary_new();
|
57
|
-
VALUE field;
|
58
|
-
|
59
|
-
long element_count = 0;
|
60
|
-
int max_fields = -1;
|
61
|
-
if (max_size != Qnil) {
|
62
|
-
max_fields = NUM2INT(max_size);
|
63
|
-
if (max_fields < 0) {
|
64
|
-
return rb_ary_new();
|
65
|
-
}
|
66
|
-
}
|
67
|
-
|
68
|
-
bool has_quotes = RTEST(has_quotes_val);
|
69
|
-
|
70
|
-
// === FAST PATH: No quotes and single-character separator ===
|
71
|
-
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
72
|
-
char sep = *col_sepP;
|
73
|
-
char *sep_pos = NULL;
|
74
|
-
|
75
|
-
while ((sep_pos = memchr(p, sep, endP - p))) {
|
76
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
77
|
-
break;
|
78
|
-
}
|
79
|
-
|
80
|
-
long field_len = sep_pos - startP;
|
81
|
-
char *raw_field = startP;
|
82
|
-
|
83
|
-
if (memchr(raw_field, quote_char_val, field_len)) {
|
84
|
-
field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
|
85
|
-
} else {
|
86
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
87
|
-
}
|
88
|
-
rb_ary_push(elements, field);
|
89
|
-
element_count++;
|
90
|
-
|
91
|
-
p = sep_pos + 1;
|
92
|
-
startP = p;
|
93
|
-
}
|
94
|
-
|
95
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
96
|
-
long field_len = endP - startP;
|
97
|
-
char *raw_field = startP;
|
98
|
-
|
99
|
-
if (memchr(raw_field, quote_char_val, field_len)) {
|
100
|
-
field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
|
101
|
-
} else {
|
102
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
103
|
-
}
|
104
|
-
rb_ary_push(elements, field);
|
105
|
-
}
|
106
|
-
|
107
|
-
return elements;
|
108
|
-
}
|
109
|
-
|
110
|
-
// === SLOW PATH: Quoted fields or multi-char separator ===
|
111
|
-
long i;
|
112
|
-
long backslash_count = 0;
|
113
|
-
bool in_quotes = false;
|
114
|
-
bool col_sep_found = true;
|
115
|
-
|
116
|
-
while (p < endP) {
|
117
|
-
col_sep_found = true;
|
118
|
-
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
119
|
-
if (*(p + i) != *(col_sepP + i)) {
|
120
|
-
col_sep_found = false;
|
121
|
-
break;
|
122
|
-
}
|
123
|
-
}
|
124
|
-
|
125
|
-
if (col_sep_found && !in_quotes) {
|
126
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
127
|
-
break;
|
128
|
-
}
|
129
|
-
|
130
|
-
long field_len = p - startP;
|
131
|
-
char *raw_field = startP;
|
132
|
-
|
133
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
134
|
-
if (quoted) {
|
135
|
-
raw_field++;
|
136
|
-
field_len -= 2;
|
137
|
-
}
|
138
|
-
|
139
|
-
if (quoted || memchr(raw_field, quote_char_val, field_len)) {
|
140
|
-
field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
|
141
|
-
} else {
|
142
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
143
|
-
}
|
144
|
-
|
145
|
-
rb_ary_push(elements, field);
|
146
|
-
element_count++;
|
147
|
-
|
148
|
-
p += col_sep_len;
|
149
|
-
startP = p;
|
150
|
-
backslash_count = 0;
|
151
|
-
} else {
|
152
|
-
if (*p == '\\') {
|
153
|
-
backslash_count++;
|
154
|
-
} else {
|
155
|
-
if (*p == quote_char_val) {
|
156
|
-
if (backslash_count % 2 == 0) {
|
157
|
-
in_quotes = !in_quotes;
|
158
|
-
}
|
159
|
-
}
|
160
|
-
backslash_count = 0;
|
161
|
-
}
|
162
|
-
p++;
|
163
|
-
}
|
164
|
-
}
|
165
|
-
|
166
|
-
if (in_quotes) {
|
167
|
-
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
168
|
-
}
|
169
|
-
|
170
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
171
|
-
long field_len = endP - startP;
|
172
|
-
char *raw_field = startP;
|
173
|
-
|
174
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
175
|
-
if (quoted) {
|
176
|
-
raw_field++;
|
177
|
-
field_len -= 2;
|
178
|
-
}
|
179
|
-
|
180
|
-
if (quoted || memchr(raw_field, quote_char_val, field_len)) {
|
181
|
-
field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
|
182
|
-
} else {
|
183
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
184
|
-
}
|
185
|
-
|
186
|
-
rb_ary_push(elements, field);
|
187
|
-
}
|
188
|
-
|
189
|
-
return elements;
|
190
|
-
}
|
191
|
-
|
192
|
-
void Init_smarter_csv(void) {
|
193
|
-
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
194
|
-
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
195
|
-
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
|
196
|
-
Qempty_string = rb_str_new_literal("");
|
197
|
-
rb_gc_register_address(&Qempty_string);
|
198
|
-
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
|
199
|
-
}
|
@@ -1,189 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "ruby/encoding.h"
|
3
|
-
#include <stdio.h>
|
4
|
-
#include <stdbool.h>
|
5
|
-
#include <string.h>
|
6
|
-
|
7
|
-
#ifndef bool
|
8
|
-
#define bool int
|
9
|
-
#define false ((bool)0)
|
10
|
-
#define true ((bool)1)
|
11
|
-
#endif
|
12
|
-
|
13
|
-
/* Handles to Ruby-side objects. All start out as nil and are assigned in
 * Init_smarter_csv(). */
VALUE SmarterCSV = Qnil;         /* looked up as the top-level constant "SmarterCSV" */
VALUE eMalformedCSVError = Qnil; /* looked up as SmarterCSV::MalformedCSV; raised on unclosed quotes */
VALUE Parser = Qnil;             /* looked up as SmarterCSV::Parser; receives parse_csv_line_c */
VALUE Qempty_string = Qnil;      /* meant to be a shared frozen empty string */
|
17
|
-
|
18
|
-
/*
 * Copies the `len` bytes starting at `str` into a new Ruby string tagged with
 * `encoding`, collapsing every pair of adjacent `quote_char` bytes into a
 * single one. A lone quote byte is copied unchanged.
 *
 * The output is written directly into the Ruby string's own buffer, so there
 * is no temporary heap buffer that could leak if string allocation raises.
 *
 * Returns the new Ruby string.
 */
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
  /* The result is never longer than the input, so `len` bytes are enough. */
  VALUE out = rb_enc_str_new(NULL, len, encoding);
  char *dst = RSTRING_PTR(out);
  long j = 0;

  for (long i = 0; i < len; i++) {
    if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
      i++; /* doubled quote: skip the first one, the second is copied below */
    }
    dst[j++] = str[i];
  }

  /* Shrink to the number of bytes actually written. */
  rb_str_set_len(out, j);
  return out;
}
|
33
|
-
|
34
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
|
35
|
-
if (RB_TYPE_P(line, T_NIL) == 1) {
|
36
|
-
return rb_ary_new();
|
37
|
-
}
|
38
|
-
|
39
|
-
if (RB_TYPE_P(line, T_STRING) != 1) {
|
40
|
-
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
41
|
-
}
|
42
|
-
|
43
|
-
rb_encoding *encoding = rb_enc_get(line);
|
44
|
-
char *startP = RSTRING_PTR(line);
|
45
|
-
long line_len = RSTRING_LEN(line);
|
46
|
-
char *endP = startP + line_len;
|
47
|
-
char *p = startP;
|
48
|
-
|
49
|
-
char *col_sepP = RSTRING_PTR(col_sep);
|
50
|
-
long col_sep_len = RSTRING_LEN(col_sep);
|
51
|
-
|
52
|
-
char *quoteP = RSTRING_PTR(quote_char);
|
53
|
-
char quote_char_val = quoteP[0];
|
54
|
-
size_t quote_len = strlen(quoteP);
|
55
|
-
|
56
|
-
VALUE elements = rb_ary_new();
|
57
|
-
VALUE field;
|
58
|
-
|
59
|
-
long element_count = 0;
|
60
|
-
int max_fields = -1;
|
61
|
-
if (max_size != Qnil) {
|
62
|
-
max_fields = NUM2INT(max_size);
|
63
|
-
if (max_fields < 0) {
|
64
|
-
return rb_ary_new();
|
65
|
-
}
|
66
|
-
}
|
67
|
-
|
68
|
-
bool has_quotes = RTEST(has_quotes_val);
|
69
|
-
|
70
|
-
// === FAST PATH: No quotes and single-character separator ===
|
71
|
-
if (!has_quotes && col_sep_len == 1) {
|
72
|
-
char sep = *col_sepP;
|
73
|
-
char *sep_pos = NULL;
|
74
|
-
|
75
|
-
while ((sep_pos = memchr(p, sep, endP - p))) {
|
76
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
77
|
-
break;
|
78
|
-
}
|
79
|
-
|
80
|
-
long field_len = sep_pos - startP;
|
81
|
-
char *raw_field = startP;
|
82
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
83
|
-
rb_ary_push(elements, field);
|
84
|
-
element_count++;
|
85
|
-
|
86
|
-
p = sep_pos + 1;
|
87
|
-
startP = p;
|
88
|
-
}
|
89
|
-
|
90
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
91
|
-
long field_len = endP - startP;
|
92
|
-
char *raw_field = startP;
|
93
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
94
|
-
rb_ary_push(elements, field);
|
95
|
-
}
|
96
|
-
|
97
|
-
return elements;
|
98
|
-
}
|
99
|
-
|
100
|
-
// === SLOW PATH: Quoted fields or multi-char separator ===
|
101
|
-
long i;
|
102
|
-
long backslash_count = 0;
|
103
|
-
bool in_quotes = false;
|
104
|
-
bool col_sep_found = true;
|
105
|
-
|
106
|
-
while (p < endP) {
|
107
|
-
col_sep_found = true;
|
108
|
-
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
109
|
-
if (*(p + i) != *(col_sepP + i)) {
|
110
|
-
col_sep_found = false;
|
111
|
-
break;
|
112
|
-
}
|
113
|
-
}
|
114
|
-
|
115
|
-
if (col_sep_found && !in_quotes) {
|
116
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
117
|
-
break;
|
118
|
-
}
|
119
|
-
|
120
|
-
long field_len = p - startP;
|
121
|
-
char *raw_field = startP;
|
122
|
-
|
123
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
124
|
-
if (quoted) {
|
125
|
-
raw_field++;
|
126
|
-
field_len -= 2;
|
127
|
-
}
|
128
|
-
|
129
|
-
if (quoted || memchr(raw_field, quote_char_val, field_len)) {
|
130
|
-
field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
|
131
|
-
} else {
|
132
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
133
|
-
}
|
134
|
-
|
135
|
-
rb_ary_push(elements, field);
|
136
|
-
element_count++;
|
137
|
-
|
138
|
-
p += col_sep_len;
|
139
|
-
startP = p;
|
140
|
-
backslash_count = 0;
|
141
|
-
} else {
|
142
|
-
if (*p == '\\') {
|
143
|
-
backslash_count++;
|
144
|
-
} else {
|
145
|
-
if (*p == quote_char_val) {
|
146
|
-
if (backslash_count % 2 == 0) {
|
147
|
-
in_quotes = !in_quotes;
|
148
|
-
}
|
149
|
-
}
|
150
|
-
backslash_count = 0;
|
151
|
-
}
|
152
|
-
p++;
|
153
|
-
}
|
154
|
-
}
|
155
|
-
|
156
|
-
if (in_quotes) {
|
157
|
-
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
158
|
-
}
|
159
|
-
|
160
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
161
|
-
long field_len = endP - startP;
|
162
|
-
char *raw_field = startP;
|
163
|
-
|
164
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
165
|
-
if (quoted) {
|
166
|
-
raw_field++;
|
167
|
-
field_len -= 2;
|
168
|
-
}
|
169
|
-
|
170
|
-
if (quoted || memchr(raw_field, quote_char_val, field_len)) {
|
171
|
-
field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
|
172
|
-
} else {
|
173
|
-
field = rb_enc_str_new(raw_field, field_len, encoding);
|
174
|
-
}
|
175
|
-
|
176
|
-
rb_ary_push(elements, field);
|
177
|
-
}
|
178
|
-
|
179
|
-
return elements;
|
180
|
-
}
|
181
|
-
|
182
|
-
void Init_smarter_csv(void) {
|
183
|
-
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
184
|
-
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
185
|
-
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
|
186
|
-
Qempty_string = rb_str_new_literal("");
|
187
|
-
rb_gc_register_address(&Qempty_string);
|
188
|
-
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
|
189
|
-
}
|
@@ -1,230 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "ruby/encoding.h"
|
3
|
-
#include <stdio.h>
|
4
|
-
#include <stdbool.h>
|
5
|
-
#include <string.h>
|
6
|
-
|
7
|
-
#ifndef bool
|
8
|
-
#define bool int
|
9
|
-
#define false ((bool)0)
|
10
|
-
#define true ((bool)1)
|
11
|
-
#endif
|
12
|
-
|
13
|
-
/* Handles to Ruby-side objects. All start out as nil and are assigned in
 * Init_smarter_csv(). */
VALUE SmarterCSV = Qnil;         /* looked up as the top-level constant "SmarterCSV" */
VALUE eMalformedCSVError = Qnil; /* looked up as SmarterCSV::MalformedCSV; raised on unclosed quotes */
VALUE Parser = Qnil;             /* looked up as SmarterCSV::Parser; receives parse_csv_line_c */
VALUE Qempty_string = Qnil;      /* meant to be a shared frozen empty string */
|
17
|
-
|
18
|
-
/*
 * Copies the `len` bytes starting at `str` into a new Ruby string tagged with
 * `encoding`, collapsing every pair of adjacent `quote_char` bytes into a
 * single one. A lone quote byte is copied unchanged.
 *
 * The output is written directly into the Ruby string's own buffer, so there
 * is no temporary heap buffer that could leak if string allocation raises.
 *
 * Returns the new Ruby string.
 */
static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
  /* The result is never longer than the input, so `len` bytes are enough. */
  VALUE out = rb_enc_str_new(NULL, len, encoding);
  char *dst = RSTRING_PTR(out);
  long j = 0;

  for (long i = 0; i < len; i++) {
    if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
      i++; /* doubled quote: skip the first one, the second is copied below */
    }
    dst[j++] = str[i];
  }

  /* Shrink to the number of bytes actually written. */
  rb_str_set_len(out, j);
  return out;
}
|
33
|
-
|
34
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val, VALUE strip_ws_val) {
|
35
|
-
if (RB_TYPE_P(line, T_NIL) == 1) {
|
36
|
-
return rb_ary_new();
|
37
|
-
}
|
38
|
-
|
39
|
-
if (RB_TYPE_P(line, T_STRING) != 1) {
|
40
|
-
rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
41
|
-
}
|
42
|
-
|
43
|
-
rb_encoding *encoding = rb_enc_get(line);
|
44
|
-
char *startP = RSTRING_PTR(line);
|
45
|
-
long line_len = RSTRING_LEN(line);
|
46
|
-
char *endP = startP + line_len;
|
47
|
-
char *p = startP;
|
48
|
-
|
49
|
-
char *col_sepP = RSTRING_PTR(col_sep);
|
50
|
-
long col_sep_len = RSTRING_LEN(col_sep);
|
51
|
-
|
52
|
-
char *quoteP = RSTRING_PTR(quote_char);
|
53
|
-
char quote_char_val = quoteP[0];
|
54
|
-
size_t quote_len = strlen(quoteP);
|
55
|
-
|
56
|
-
VALUE elements = rb_ary_new();
|
57
|
-
VALUE field;
|
58
|
-
|
59
|
-
long element_count = 0;
|
60
|
-
int max_fields = -1;
|
61
|
-
if (max_size != Qnil) {
|
62
|
-
max_fields = NUM2INT(max_size);
|
63
|
-
if (max_fields < 0) {
|
64
|
-
return rb_ary_new();
|
65
|
-
}
|
66
|
-
}
|
67
|
-
|
68
|
-
bool has_quotes = RTEST(has_quotes_val);
|
69
|
-
bool strip_ws = RTEST(strip_ws_val);
|
70
|
-
|
71
|
-
// === FAST PATH: No quotes and single-character separator ===
|
72
|
-
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
73
|
-
char sep = *col_sepP;
|
74
|
-
char *sep_pos = NULL;
|
75
|
-
|
76
|
-
while ((sep_pos = memchr(p, sep, endP - p))) {
|
77
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
78
|
-
break;
|
79
|
-
}
|
80
|
-
|
81
|
-
long field_len = sep_pos - startP;
|
82
|
-
char *raw_field = startP;
|
83
|
-
char *trim_start = raw_field;
|
84
|
-
char *trim_end = raw_field + field_len - 1;
|
85
|
-
|
86
|
-
if (strip_ws) {
|
87
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
88
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
89
|
-
}
|
90
|
-
|
91
|
-
long trimmed_len = trim_end - trim_start + 1;
|
92
|
-
|
93
|
-
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
94
|
-
rb_ary_push(elements, field);
|
95
|
-
element_count++;
|
96
|
-
|
97
|
-
p = sep_pos + 1;
|
98
|
-
startP = p;
|
99
|
-
}
|
100
|
-
|
101
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
102
|
-
long field_len = endP - startP;
|
103
|
-
char *raw_field = startP;
|
104
|
-
char *trim_start = raw_field;
|
105
|
-
char *trim_end = raw_field + field_len - 1;
|
106
|
-
|
107
|
-
if (strip_ws) {
|
108
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
109
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
110
|
-
}
|
111
|
-
|
112
|
-
long trimmed_len = trim_end - trim_start + 1;
|
113
|
-
|
114
|
-
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
115
|
-
rb_ary_push(elements, field);
|
116
|
-
}
|
117
|
-
|
118
|
-
return elements;
|
119
|
-
}
|
120
|
-
|
121
|
-
// === SLOW PATH: Quoted fields or multi-char separator ===
|
122
|
-
long i;
|
123
|
-
long backslash_count = 0;
|
124
|
-
bool in_quotes = false;
|
125
|
-
bool col_sep_found = true;
|
126
|
-
|
127
|
-
while (p < endP) {
|
128
|
-
col_sep_found = true;
|
129
|
-
for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
|
130
|
-
if (*(p + i) != *(col_sepP + i)) {
|
131
|
-
col_sep_found = false;
|
132
|
-
break;
|
133
|
-
}
|
134
|
-
}
|
135
|
-
|
136
|
-
if (col_sep_found && !in_quotes) {
|
137
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) {
|
138
|
-
break;
|
139
|
-
}
|
140
|
-
|
141
|
-
long field_len = p - startP;
|
142
|
-
char *raw_field = startP;
|
143
|
-
|
144
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
145
|
-
if (quoted) {
|
146
|
-
raw_field++;
|
147
|
-
field_len -= 2;
|
148
|
-
}
|
149
|
-
|
150
|
-
char *trim_start = raw_field;
|
151
|
-
char *trim_end = raw_field + field_len - 1;
|
152
|
-
|
153
|
-
if (strip_ws) {
|
154
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
155
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
156
|
-
}
|
157
|
-
|
158
|
-
long trimmed_len = trim_end - trim_start + 1;
|
159
|
-
|
160
|
-
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
161
|
-
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
162
|
-
} else {
|
163
|
-
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
164
|
-
}
|
165
|
-
|
166
|
-
rb_ary_push(elements, field);
|
167
|
-
element_count++;
|
168
|
-
|
169
|
-
p += col_sep_len;
|
170
|
-
startP = p;
|
171
|
-
backslash_count = 0;
|
172
|
-
} else {
|
173
|
-
if (*p == '\\') {
|
174
|
-
backslash_count++;
|
175
|
-
} else {
|
176
|
-
if (*p == quote_char_val) {
|
177
|
-
if (backslash_count % 2 == 0) {
|
178
|
-
in_quotes = !in_quotes;
|
179
|
-
}
|
180
|
-
}
|
181
|
-
backslash_count = 0;
|
182
|
-
}
|
183
|
-
p++;
|
184
|
-
}
|
185
|
-
}
|
186
|
-
|
187
|
-
if (in_quotes) {
|
188
|
-
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
189
|
-
}
|
190
|
-
|
191
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
192
|
-
long field_len = endP - startP;
|
193
|
-
char *raw_field = startP;
|
194
|
-
|
195
|
-
bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
|
196
|
-
if (quoted) {
|
197
|
-
raw_field++;
|
198
|
-
field_len -= 2;
|
199
|
-
}
|
200
|
-
|
201
|
-
char *trim_start = raw_field;
|
202
|
-
char *trim_end = raw_field + field_len - 1;
|
203
|
-
|
204
|
-
if (strip_ws) {
|
205
|
-
while (trim_start <= trim_end && (*trim_start == ' ' || *trim_start == '\t')) trim_start++;
|
206
|
-
while (trim_end >= trim_start && (*trim_end == ' ' || *trim_end == '\t')) trim_end--;
|
207
|
-
}
|
208
|
-
|
209
|
-
long trimmed_len = trim_end - trim_start + 1;
|
210
|
-
|
211
|
-
if (quoted || memchr(trim_start, quote_char_val, trimmed_len)) {
|
212
|
-
field = unescape_quotes(trim_start, trimmed_len, quote_char_val, encoding);
|
213
|
-
} else {
|
214
|
-
field = rb_enc_str_new(trim_start, trimmed_len, encoding);
|
215
|
-
}
|
216
|
-
|
217
|
-
rb_ary_push(elements, field);
|
218
|
-
}
|
219
|
-
|
220
|
-
return elements;
|
221
|
-
}
|
222
|
-
|
223
|
-
void Init_smarter_csv(void) {
|
224
|
-
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
225
|
-
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
226
|
-
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
|
227
|
-
Qempty_string = rb_str_new_literal("");
|
228
|
-
rb_gc_register_address(&Qempty_string);
|
229
|
-
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 6);
|
230
|
-
}
|