smarter_csv 1.14.2 → 1.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,190 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+
6
+ #ifndef bool
7
+ #define bool int
8
+ #define false ((bool)0)
9
+ #define true ((bool)1)
10
+ #endif
11
+
12
+ VALUE SmarterCSV = Qnil; // the SmarterCSV constant, looked up in Init_smarter_csv
13
+ VALUE eMalformedCSVError = Qnil; // SmarterCSV::MalformedCSV, raised when a line ends inside quotes
14
+ VALUE Parser = Qnil; // SmarterCSV::Parser, receives the parse_csv_line_c module function
15
+
16
+ static VALUE Qempty_string; // shared frozen empty string, created and GC-rooted in Init_smarter_csv
17
+
18
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_flag) {
19
+ if (RB_TYPE_P(line, T_NIL) == 1) {
20
+ return rb_ary_new();
21
+ }
22
+
23
+ if (RB_TYPE_P(line, T_STRING) != 1) {
24
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
25
+ }
26
+
27
+ rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
28
+ char *startP = RSTRING_PTR(line); /* may not be null terminated */
29
+ long line_len = RSTRING_LEN(line);
30
+ char *endP = startP + line_len; /* points behind the string */
31
+ char *p = startP;
32
+
33
+ char *col_sepP = RSTRING_PTR(col_sep);
34
+ long col_sep_len = RSTRING_LEN(col_sep);
35
+
36
+ char *quoteP = RSTRING_PTR(quote_char);
37
+ char quote = *quoteP;
38
+ bool has_quotes = RTEST(has_quotes_flag);
39
+
40
+ bool col_sep_found = true;
41
+
42
+ VALUE elements = rb_ary_new();
43
+ VALUE field;
44
+ long i;
45
+
46
+ /* Variables for escaped quote handling */
47
+ long backslash_count = 0;
48
+ bool in_quotes = false;
49
+
50
+ /* Optimization 1: maintain count instead of calling RARRAY_LEN repeatedly */
51
+ long element_count = 0;
52
+
53
+ /* Optimization 2: cache max_size value if not nil */
54
+ int max_fields = -1;
55
+ if (max_size != Qnil) {
56
+ max_fields = NUM2INT(max_size);
57
+ if (max_fields < 0) {
58
+ return rb_ary_new(); // Return empty array early
59
+ }
60
+ }
61
+
62
+ while (p < endP) {
63
+ /* does the remaining string start with col_sep ? */
64
+ col_sep_found = true;
65
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
66
+ col_sep_found = col_sep_found && (*(p + i) == *(col_sepP + i));
67
+ }
68
+
69
+ /* if col_sep was found and we're not inside quotes */
70
+ if (col_sep_found && !in_quotes) {
71
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
72
+ break;
73
+ } else {
74
+ bool only_spaces = true;
75
+ for (char *s = startP; s < p; s++) {
76
+ if (*s != ' ') {
77
+ only_spaces = false;
78
+ break;
79
+ }
80
+ }
81
+
82
+ if (only_spaces) {
83
+ field = Qempty_string;
84
+ } else {
85
+ char *src = startP;
86
+ long src_len = p - startP;
87
+ bool is_quoted = has_quotes && src_len >= 2 && src[0] == quote && src[src_len - 1] == quote;
88
+
89
+ if (is_quoted) {
90
+ src += 1;
91
+ src_len -= 2;
92
+ }
93
+
94
+ char *clean_buf = ALLOC_N(char, src_len);
95
+ long clean_len = 0;
96
+
97
+ for (i = 0; i < src_len; i++) {
98
+ if (src[i] == quote && (i + 1 < src_len) && src[i + 1] == quote) {
99
+ clean_buf[clean_len++] = quote;
100
+ i++;
101
+ } else {
102
+ clean_buf[clean_len++] = src[i];
103
+ }
104
+ }
105
+
106
+ field = rb_enc_str_new(clean_buf, clean_len, encoding);
107
+ xfree(clean_buf);
108
+ }
109
+
110
+ rb_ary_push(elements, field);
111
+ element_count++;
112
+
113
+ p += col_sep_len;
114
+ startP = p;
115
+ backslash_count = 0; // Reset backslash count at the start of a new field
116
+ }
117
+ } else {
118
+ if (*p == '\\') {
119
+ backslash_count++;
120
+ } else {
121
+ if (*p == quote) {
122
+ if (backslash_count % 2 == 0) {
123
+ in_quotes = !in_quotes;
124
+ }
125
+ }
126
+ backslash_count = 0;
127
+ }
128
+ p++;
129
+ }
130
+ }
131
+
132
+ if (in_quotes) {
133
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
134
+ }
135
+
136
+ if ((max_fields < 0) || (element_count < max_fields)) {
137
+ bool only_spaces = true;
138
+ for (char *s = startP; s < endP; s++) {
139
+ if (*s != ' ') {
140
+ only_spaces = false;
141
+ break;
142
+ }
143
+ }
144
+
145
+ if (only_spaces) {
146
+ field = Qempty_string;
147
+ } else {
148
+ char *src = startP;
149
+ long src_len = endP - startP;
150
+ bool is_quoted = has_quotes && src_len >= 2 && src[0] == quote && src[src_len - 1] == quote;
151
+
152
+ if (is_quoted) {
153
+ src += 1;
154
+ src_len -= 2;
155
+ }
156
+
157
+ char *clean_buf = ALLOC_N(char, src_len);
158
+ long clean_len = 0;
159
+
160
+ for (i = 0; i < src_len; i++) {
161
+ if (src[i] == quote && (i + 1 < src_len) && src[i + 1] == quote) {
162
+ clean_buf[clean_len++] = quote;
163
+ i++;
164
+ } else {
165
+ clean_buf[clean_len++] = src[i];
166
+ }
167
+ }
168
+
169
+ field = rb_enc_str_new(clean_buf, clean_len, encoding);
170
+ xfree(clean_buf);
171
+ }
172
+
173
+ rb_ary_push(elements, field);
174
+ }
175
+
176
+ return elements;
177
+ }
178
+
179
+ void Init_smarter_csv(void) {
180
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
181
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
182
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
183
+
184
+ Qempty_string = rb_str_new_literal("");
185
+ rb_obj_freeze(Qempty_string);
186
+ rb_global_variable(&Qempty_string);
187
+
188
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
189
+ }
190
+
@@ -0,0 +1,203 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #ifndef bool
8
+ #define bool int
9
+ #define false ((bool)0)
10
+ #define true ((bool)1)
11
+ #endif
12
+
13
+ #define STACK_BUF_SIZE 4096
14
+
15
+ VALUE SmarterCSV = Qnil; // the SmarterCSV constant, looked up in Init_smarter_csv
16
+ VALUE eMalformedCSVError = Qnil; // SmarterCSV::MalformedCSV, raised when a line ends inside quotes
17
+ VALUE Parser = Qnil; // SmarterCSV::Parser, receives the parse_csv_line_c module function
18
+
19
+ static VALUE Qempty_string; // shared frozen empty string, created and GC-rooted in Init_smarter_csv
20
+
21
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_flag) {
22
+ if (RB_TYPE_P(line, T_NIL) == 1) {
23
+ return rb_ary_new();
24
+ }
25
+
26
+ if (RB_TYPE_P(line, T_STRING) != 1) {
27
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
28
+ }
29
+
30
+ rb_encoding *encoding = rb_enc_get(line);
31
+ char *startP = RSTRING_PTR(line);
32
+ long line_len = RSTRING_LEN(line);
33
+ char *endP = startP + line_len;
34
+ char *p = startP;
35
+
36
+ char *col_sepP = RSTRING_PTR(col_sep);
37
+ long col_sep_len = RSTRING_LEN(col_sep);
38
+
39
+ char *quoteP = RSTRING_PTR(quote_char);
40
+ char quote = *quoteP;
41
+ bool has_quotes = RTEST(has_quotes_flag);
42
+
43
+ bool col_sep_found = true;
44
+
45
+ VALUE elements = rb_ary_new();
46
+ VALUE field;
47
+ long i;
48
+
49
+ long backslash_count = 0;
50
+ bool in_quotes = false;
51
+ long element_count = 0;
52
+
53
+ int max_fields = -1;
54
+ if (max_size != Qnil) {
55
+ max_fields = NUM2INT(max_size);
56
+ if (max_fields < 0) {
57
+ return rb_ary_new(); // early return
58
+ }
59
+ }
60
+
61
+ char stack_buf[STACK_BUF_SIZE];
62
+
63
+ while (p < endP) {
64
+ col_sep_found = true;
65
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
66
+ col_sep_found = col_sep_found && (*(p + i) == *(col_sepP + i));
67
+ }
68
+
69
+ if (__builtin_expect(col_sep_found && !in_quotes, 1)) {
70
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
71
+ break;
72
+ }
73
+
74
+ bool only_spaces = true;
75
+ for (char *s = startP; s < p; s++) {
76
+ if (*s != ' ') {
77
+ only_spaces = false;
78
+ break;
79
+ }
80
+ }
81
+
82
+ if (only_spaces) {
83
+ field = Qempty_string;
84
+ } else {
85
+ char *src = startP;
86
+ long src_len = p - startP;
87
+ bool is_quoted = false;
88
+
89
+ if (__builtin_expect(has_quotes, 0)) {
90
+ is_quoted = src_len >= 2 && src[0] == quote && src[src_len - 1] == quote;
91
+ if (is_quoted) {
92
+ src += 1;
93
+ src_len -= 2;
94
+ }
95
+ }
96
+
97
+ if (!has_quotes || memchr(src, quote, src_len) == NULL) {
98
+ field = rb_enc_str_new(src, src_len, encoding);
99
+ } else {
100
+ char *clean_buf = src_len < STACK_BUF_SIZE ? stack_buf : ALLOC_N(char, src_len);
101
+ long clean_len = 0;
102
+
103
+ for (i = 0; i < src_len; i++) {
104
+ if (src[i] == quote && (i + 1 < src_len) && src[i + 1] == quote) {
105
+ clean_buf[clean_len++] = quote;
106
+ i++;
107
+ } else {
108
+ clean_buf[clean_len++] = src[i];
109
+ }
110
+ }
111
+
112
+ field = rb_enc_str_new(clean_buf, clean_len, encoding);
113
+ if (clean_buf != stack_buf) xfree(clean_buf);
114
+ }
115
+ }
116
+
117
+ rb_ary_push(elements, field);
118
+ element_count++;
119
+
120
+ p += col_sep_len;
121
+ startP = p;
122
+ backslash_count = 0;
123
+ } else {
124
+ if (*p == '\\') {
125
+ backslash_count++;
126
+ } else {
127
+ if (*p == quote) {
128
+ if (backslash_count % 2 == 0) {
129
+ in_quotes = !in_quotes;
130
+ }
131
+ }
132
+ backslash_count = 0;
133
+ }
134
+ p++;
135
+ }
136
+ }
137
+
138
+ if (in_quotes) {
139
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
140
+ }
141
+
142
+ if ((max_fields < 0) || (element_count < max_fields)) {
143
+ bool only_spaces = true;
144
+ for (char *s = startP; s < endP; s++) {
145
+ if (*s != ' ') {
146
+ only_spaces = false;
147
+ break;
148
+ }
149
+ }
150
+
151
+ if (only_spaces) {
152
+ field = Qempty_string;
153
+ } else {
154
+ char *src = startP;
155
+ long src_len = endP - startP;
156
+ bool is_quoted = false;
157
+
158
+ if (__builtin_expect(has_quotes, 0)) {
159
+ is_quoted = src_len >= 2 && src[0] == quote && src[src_len - 1] == quote;
160
+ if (is_quoted) {
161
+ src += 1;
162
+ src_len -= 2;
163
+ }
164
+ }
165
+
166
+ if (!has_quotes || memchr(src, quote, src_len) == NULL) {
167
+ field = rb_enc_str_new(src, src_len, encoding);
168
+ } else {
169
+ char *clean_buf = src_len < STACK_BUF_SIZE ? stack_buf : ALLOC_N(char, src_len);
170
+ long clean_len = 0;
171
+
172
+ for (i = 0; i < src_len; i++) {
173
+ if (src[i] == quote && (i + 1 < src_len) && src[i + 1] == quote) {
174
+ clean_buf[clean_len++] = quote;
175
+ i++;
176
+ } else {
177
+ clean_buf[clean_len++] = src[i];
178
+ }
179
+ }
180
+
181
+ field = rb_enc_str_new(clean_buf, clean_len, encoding);
182
+ if (clean_buf != stack_buf) xfree(clean_buf);
183
+ }
184
+ }
185
+
186
+ rb_ary_push(elements, field);
187
+ }
188
+
189
+ return elements;
190
+ }
191
+
192
+ void Init_smarter_csv(void) {
193
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
194
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
195
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
196
+
197
+ Qempty_string = rb_str_new_literal("");
198
+ rb_obj_freeze(Qempty_string);
199
+ rb_global_variable(&Qempty_string);
200
+
201
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
202
+ }
203
+
@@ -0,0 +1,217 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #ifndef bool
8
+ #define bool int
9
+ #define false ((bool)0)
10
+ #define true ((bool)1)
11
+ #endif
12
+
13
+ VALUE SmarterCSV = Qnil; // the SmarterCSV constant, looked up in Init_smarter_csv
14
+ VALUE eMalformedCSVError = Qnil; // SmarterCSV::MalformedCSV, raised when a line ends inside quotes
15
+ VALUE Parser = Qnil; // SmarterCSV::Parser, receives the parse_csv_line_c module function
16
+ VALUE Qempty_string = Qnil; // shared empty string returned for blank fields; created and GC-registered in Init_smarter_csv
17
+
18
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
19
+ if (RB_TYPE_P(line, T_NIL) == 1) {
20
+ return rb_ary_new();
21
+ }
22
+
23
+ if (RB_TYPE_P(line, T_STRING) != 1) {
24
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
25
+ }
26
+
27
+ rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
28
+ char *startP = RSTRING_PTR(line); /* may not be null terminated */
29
+ long line_len = RSTRING_LEN(line);
30
+ char *endP = startP + line_len; /* points behind the string */
31
+ char *p = startP;
32
+
33
+ char *col_sepP = RSTRING_PTR(col_sep);
34
+ long col_sep_len = RSTRING_LEN(col_sep);
35
+
36
+ char *quoteP = RSTRING_PTR(quote_char);
37
+ char quote_char_val = quoteP[0];
38
+
39
+ bool col_sep_found = true;
40
+
41
+ VALUE elements = rb_ary_new();
42
+ VALUE field;
43
+ long i;
44
+
45
+ /* Variables for escaped quote handling */
46
+ long backslash_count = 0;
47
+ bool in_quotes = false;
48
+
49
+ /* Optimization 1: maintain count instead of calling RARRAY_LEN repeatedly */
50
+ long element_count = 0;
51
+
52
+ /* Optimization 2: cache max_size value if not nil */
53
+ int max_fields = -1;
54
+ if (max_size != Qnil) {
55
+ max_fields = NUM2INT(max_size);
56
+ if (max_fields < 0) {
57
+ return rb_ary_new(); // Return empty array early
58
+ }
59
+ }
60
+
61
+ bool has_quotes = RTEST(has_quotes_val);
62
+
63
+ /* Fast-path for clean CSV lines: no quotes, 1-character separator */
64
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
65
+ char sep = *col_sepP;
66
+ char *sep_pos = NULL;
67
+
68
+ while ((sep_pos = memchr(p, sep, endP - p))) {
69
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
70
+ break;
71
+ }
72
+
73
+ /* check if only spaces */
74
+ bool only_spaces = true;
75
+ for (char *s = startP; s < sep_pos; s++) {
76
+ if (*s != ' ') {
77
+ only_spaces = false;
78
+ break;
79
+ }
80
+ }
81
+
82
+ field = only_spaces ? Qempty_string : rb_enc_str_new(startP, sep_pos - startP, encoding);
83
+ rb_ary_push(elements, field);
84
+ element_count++;
85
+
86
+ p = sep_pos + 1;
87
+ startP = p;
88
+ }
89
+
90
+ /* check if the last part of the line needs to be processed */
91
+ if ((max_fields < 0) || (element_count < max_fields)) {
92
+ bool only_spaces = true;
93
+ for (char *s = startP; s < endP; s++) {
94
+ if (*s != ' ') {
95
+ only_spaces = false;
96
+ break;
97
+ }
98
+ }
99
+ field = only_spaces ? Qempty_string : rb_enc_str_new(startP, endP - startP, encoding);
100
+ rb_ary_push(elements, field);
101
+ }
102
+
103
+ return elements;
104
+ }
105
+
106
+ /* default path for quoted/multi-separator lines */
107
+ while (p < endP) {
108
+ /* does the remaining string start with col_sep ? */
109
+ col_sep_found = true;
110
+ for(i=0; (i < col_sep_len) && (p+i < endP); i++) {
111
+ col_sep_found = col_sep_found && (*(p+i) == *(col_sepP+i));
112
+ }
113
+ /* if col_sep was found and we're not inside quotes */
114
+ if (col_sep_found && !in_quotes) {
115
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
116
+ break;
117
+ } else {
118
+ long field_len = p - startP;
119
+ char *raw_field = startP;
120
+
121
+ /* Remove surrounding quotes if present */
122
+ bool is_wrapped = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
123
+ if (is_wrapped) {
124
+ raw_field++;
125
+ field_len -= 2;
126
+ }
127
+
128
+ /* Always unescape doubled quotes */
129
+ char *src = raw_field;
130
+ char *limit = raw_field + field_len;
131
+ char *buffer = ALLOC_N(char, field_len);
132
+ char *dst = buffer;
133
+
134
+ while (src < limit) {
135
+ if (*src == quote_char_val && (src + 1 < limit) && *(src + 1) == quote_char_val) {
136
+ *dst++ = quote_char_val;
137
+ src += 2;
138
+ } else {
139
+ *dst++ = *src++;
140
+ }
141
+ }
142
+
143
+ field = rb_enc_str_new(buffer, dst - buffer, encoding);
144
+ xfree(buffer);
145
+
146
+ rb_ary_push(elements, field);
147
+ element_count++;
148
+
149
+ p += col_sep_len;
150
+ startP = p;
151
+ backslash_count = 0; // Reset backslash count at the start of a new field
152
+ }
153
+ } else {
154
+ if (*p == '\\') {
155
+ backslash_count++;
156
+ } else {
157
+ if (*p == quote_char_val) {
158
+ if (backslash_count % 2 == 0) {
159
+ /* Even number of backslashes means quote is not escaped */
160
+ in_quotes = !in_quotes;
161
+ }
162
+ }
163
+ backslash_count = 0; // Reset after any character other than backslash
164
+ }
165
+ p++;
166
+ }
167
+ } /* while */
168
+
169
+ /* Check for unclosed quotes at the end of the line */
170
+ if (in_quotes) {
171
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
172
+ }
173
+
174
+ /* check if the last part of the line needs to be processed */
175
+ if ((max_fields < 0) || (element_count < max_fields)) {
176
+ long field_len = endP - startP;
177
+ char *raw_field = startP;
178
+
179
+ bool is_wrapped = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
180
+ if (is_wrapped) {
181
+ raw_field++;
182
+ field_len -= 2;
183
+ }
184
+
185
+ /* Always unescape doubled quotes */
186
+ char *src = raw_field;
187
+ char *limit = raw_field + field_len;
188
+ char *buffer = ALLOC_N(char, field_len);
189
+ char *dst = buffer;
190
+
191
+ while (src < limit) {
192
+ if (*src == quote_char_val && (src + 1 < limit) && *(src + 1) == quote_char_val) {
193
+ *dst++ = quote_char_val;
194
+ src += 2;
195
+ } else {
196
+ *dst++ = *src++;
197
+ }
198
+ }
199
+
200
+ field = rb_enc_str_new(buffer, dst - buffer, encoding);
201
+ xfree(buffer);
202
+
203
+ rb_ary_push(elements, field);
204
+ }
205
+
206
+ return elements;
207
+ }
208
+
209
+ void Init_smarter_csv(void) {
210
+ // these modules and the error class are already defined in Ruby code, make them accessible:
211
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
212
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
213
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
214
+ Qempty_string = rb_str_new_literal("");
215
+ rb_gc_register_address(&Qempty_string);
216
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
217
+ }