smarter_csv 1.14.2 → 1.14.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,185 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+
6
+ #ifndef bool
7
+ #define bool int
8
+ #define false ((bool)0)
9
+ #define true ((bool)1)
10
+ #endif
11
+
12
+ VALUE SmarterCSV = Qnil;
13
+ VALUE eMalformedCSVError = Qnil;
14
+ VALUE Parser = Qnil;
15
+
16
+ static VALUE Qempty_string; // shared frozen empty string
17
+
18
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size) {
19
+ if (RB_TYPE_P(line, T_NIL) == 1) {
20
+ return rb_ary_new();
21
+ }
22
+
23
+ if (RB_TYPE_P(line, T_STRING) != 1) {
24
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
25
+ }
26
+
27
+ rb_encoding *encoding = rb_enc_get(line); /* get the encoding from the input line */
28
+ char *startP = RSTRING_PTR(line); /* may not be null terminated */
29
+ long line_len = RSTRING_LEN(line);
30
+ char *endP = startP + line_len; /* points behind the string */
31
+ char *p = startP;
32
+
33
+ char *col_sepP = RSTRING_PTR(col_sep);
34
+ long col_sep_len = RSTRING_LEN(col_sep);
35
+
36
+ char *quoteP = RSTRING_PTR(quote_char);
37
+ char quote = *quoteP;
38
+ long quote_count = 0;
39
+
40
+ bool col_sep_found = true;
41
+
42
+ VALUE elements = rb_ary_new();
43
+ VALUE field;
44
+ long i;
45
+
46
+ /* Variables for escaped quote handling */
47
+ long backslash_count = 0;
48
+ bool in_quotes = false;
49
+
50
+ /* Optimization 1: maintain count instead of calling RARRAY_LEN repeatedly */
51
+ long element_count = 0;
52
+
53
+ /* Optimization 2: cache max_size value if not nil */
54
+ int max_fields = -1;
55
+ if (max_size != Qnil) {
56
+ max_fields = NUM2INT(max_size);
57
+ if (max_fields < 0) {
58
+ return rb_ary_new(); // Return empty array early
59
+ }
60
+ }
61
+
62
+ while (p < endP) {
63
+ /* does the remaining string start with col_sep ? */
64
+ col_sep_found = true;
65
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
66
+ col_sep_found = col_sep_found && (*(p + i) == *(col_sepP + i));
67
+ }
68
+
69
+ /* if col_sep was found and we're not inside quotes */
70
+ if (col_sep_found && !in_quotes) {
71
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
72
+ break;
73
+ } else {
74
+ bool only_spaces = true;
75
+ for (char *s = startP; s < p; s++) {
76
+ if (*s != ' ') {
77
+ only_spaces = false;
78
+ break;
79
+ }
80
+ }
81
+
82
+ if (only_spaces) {
83
+ field = Qempty_string;
84
+ } else {
85
+ long field_len = p - startP;
86
+
87
+ // fast-path quote cleanup: if field starts and ends with quote, and no doubled quotes inside
88
+ if (field_len >= 2 && startP[0] == quote && startP[field_len - 1] == quote) {
89
+ char *inner_start = startP + 1;
90
+ long inner_len = field_len - 2;
91
+ bool has_double_quote = false;
92
+ for (i = 0; i < inner_len - 1; i++) {
93
+ if (inner_start[i] == quote && inner_start[i + 1] == quote) {
94
+ has_double_quote = true;
95
+ break;
96
+ }
97
+ }
98
+ if (!has_double_quote) {
99
+ field = rb_enc_str_new(inner_start, inner_len, encoding);
100
+ } else {
101
+ field = rb_enc_str_new(startP, field_len, encoding);
102
+ }
103
+ } else {
104
+ field = rb_enc_str_new(startP, field_len, encoding);
105
+ }
106
+ }
107
+
108
+ rb_ary_push(elements, field);
109
+ element_count++;
110
+
111
+ p += col_sep_len;
112
+ startP = p;
113
+ backslash_count = 0; // Reset backslash count at the start of a new field
114
+ }
115
+ } else {
116
+ if (*p == '\\') {
117
+ backslash_count++;
118
+ } else {
119
+ if (*p == quote) {
120
+ if (backslash_count % 2 == 0) {
121
+ in_quotes = !in_quotes;
122
+ }
123
+ }
124
+ backslash_count = 0;
125
+ }
126
+ p++;
127
+ }
128
+ }
129
+
130
+ if (in_quotes) {
131
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
132
+ }
133
+
134
+ if ((max_fields < 0) || (element_count < max_fields)) {
135
+ bool only_spaces = true;
136
+ for (char *s = startP; s < endP; s++) {
137
+ if (*s != ' ') {
138
+ only_spaces = false;
139
+ break;
140
+ }
141
+ }
142
+
143
+ if (only_spaces) {
144
+ field = Qempty_string;
145
+ } else {
146
+ long field_len = endP - startP;
147
+
148
+ // fast-path quote cleanup on final field
149
+ if (field_len >= 2 && startP[0] == quote && startP[field_len - 1] == quote) {
150
+ char *inner_start = startP + 1;
151
+ long inner_len = field_len - 2;
152
+ bool has_double_quote = false;
153
+ for (i = 0; i < inner_len - 1; i++) {
154
+ if (inner_start[i] == quote && inner_start[i + 1] == quote) {
155
+ has_double_quote = true;
156
+ break;
157
+ }
158
+ }
159
+ if (!has_double_quote) {
160
+ field = rb_enc_str_new(inner_start, inner_len, encoding);
161
+ } else {
162
+ field = rb_enc_str_new(startP, field_len, encoding);
163
+ }
164
+ } else {
165
+ field = rb_enc_str_new(startP, field_len, encoding);
166
+ }
167
+ }
168
+
169
+ rb_ary_push(elements, field);
170
+ }
171
+
172
+ return elements;
173
+ }
174
+
175
+ void Init_smarter_csv(void) {
176
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
177
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
178
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
179
+
180
+ Qempty_string = rb_str_new_literal("");
181
+ rb_obj_freeze(Qempty_string);
182
+ rb_global_variable(&Qempty_string);
183
+
184
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 4);
185
+ }
@@ -0,0 +1,199 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #ifndef bool
8
+ #define bool int
9
+ #define false ((bool)0)
10
+ #define true ((bool)1)
11
+ #endif
12
+
13
+ VALUE SmarterCSV = Qnil;
14
+ VALUE eMalformedCSVError = Qnil;
15
+ VALUE Parser = Qnil;
16
+ VALUE Qempty_string = Qnil; // shared frozen empty string
17
+
18
+ static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
19
+ char *buf = ALLOC_N(char, len);
20
+ long j = 0;
21
+ for (long i = 0; i < len; i++) {
22
+ if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
23
+ buf[j++] = quote_char;
24
+ i++; // skip second quote
25
+ } else {
26
+ buf[j++] = str[i];
27
+ }
28
+ }
29
+ VALUE out = rb_enc_str_new(buf, j, encoding);
30
+ xfree(buf);
31
+ return out;
32
+ }
33
+
34
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
35
+ if (RB_TYPE_P(line, T_NIL) == 1) {
36
+ return rb_ary_new();
37
+ }
38
+
39
+ if (RB_TYPE_P(line, T_STRING) != 1) {
40
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
41
+ }
42
+
43
+ rb_encoding *encoding = rb_enc_get(line);
44
+ char *startP = RSTRING_PTR(line);
45
+ long line_len = RSTRING_LEN(line);
46
+ char *endP = startP + line_len;
47
+ char *p = startP;
48
+
49
+ char *col_sepP = RSTRING_PTR(col_sep);
50
+ long col_sep_len = RSTRING_LEN(col_sep);
51
+
52
+ char *quoteP = RSTRING_PTR(quote_char);
53
+ char quote_char_val = quoteP[0];
54
+ size_t quote_len = strlen(quoteP);
55
+
56
+ VALUE elements = rb_ary_new();
57
+ VALUE field;
58
+
59
+ long element_count = 0;
60
+ int max_fields = -1;
61
+ if (max_size != Qnil) {
62
+ max_fields = NUM2INT(max_size);
63
+ if (max_fields < 0) {
64
+ return rb_ary_new();
65
+ }
66
+ }
67
+
68
+ bool has_quotes = RTEST(has_quotes_val);
69
+
70
+ // === FAST PATH: No quotes and single-character separator ===
71
+ if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
72
+ char sep = *col_sepP;
73
+ char *sep_pos = NULL;
74
+
75
+ while ((sep_pos = memchr(p, sep, endP - p))) {
76
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
77
+ break;
78
+ }
79
+
80
+ long field_len = sep_pos - startP;
81
+ char *raw_field = startP;
82
+
83
+ if (memchr(raw_field, quote_char_val, field_len)) {
84
+ field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
85
+ } else {
86
+ field = rb_enc_str_new(raw_field, field_len, encoding);
87
+ }
88
+ rb_ary_push(elements, field);
89
+ element_count++;
90
+
91
+ p = sep_pos + 1;
92
+ startP = p;
93
+ }
94
+
95
+ if ((max_fields < 0) || (element_count < max_fields)) {
96
+ long field_len = endP - startP;
97
+ char *raw_field = startP;
98
+
99
+ if (memchr(raw_field, quote_char_val, field_len)) {
100
+ field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
101
+ } else {
102
+ field = rb_enc_str_new(raw_field, field_len, encoding);
103
+ }
104
+ rb_ary_push(elements, field);
105
+ }
106
+
107
+ return elements;
108
+ }
109
+
110
+ // === SLOW PATH: Quoted fields or multi-char separator ===
111
+ long i;
112
+ long backslash_count = 0;
113
+ bool in_quotes = false;
114
+ bool col_sep_found = true;
115
+
116
+ while (p < endP) {
117
+ col_sep_found = true;
118
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
119
+ if (*(p + i) != *(col_sepP + i)) {
120
+ col_sep_found = false;
121
+ break;
122
+ }
123
+ }
124
+
125
+ if (col_sep_found && !in_quotes) {
126
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
127
+ break;
128
+ }
129
+
130
+ long field_len = p - startP;
131
+ char *raw_field = startP;
132
+
133
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
134
+ if (quoted) {
135
+ raw_field++;
136
+ field_len -= 2;
137
+ }
138
+
139
+ if (quoted || memchr(raw_field, quote_char_val, field_len)) {
140
+ field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
141
+ } else {
142
+ field = rb_enc_str_new(raw_field, field_len, encoding);
143
+ }
144
+
145
+ rb_ary_push(elements, field);
146
+ element_count++;
147
+
148
+ p += col_sep_len;
149
+ startP = p;
150
+ backslash_count = 0;
151
+ } else {
152
+ if (*p == '\\') {
153
+ backslash_count++;
154
+ } else {
155
+ if (*p == quote_char_val) {
156
+ if (backslash_count % 2 == 0) {
157
+ in_quotes = !in_quotes;
158
+ }
159
+ }
160
+ backslash_count = 0;
161
+ }
162
+ p++;
163
+ }
164
+ }
165
+
166
+ if (in_quotes) {
167
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
168
+ }
169
+
170
+ if ((max_fields < 0) || (element_count < max_fields)) {
171
+ long field_len = endP - startP;
172
+ char *raw_field = startP;
173
+
174
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
175
+ if (quoted) {
176
+ raw_field++;
177
+ field_len -= 2;
178
+ }
179
+
180
+ if (quoted || memchr(raw_field, quote_char_val, field_len)) {
181
+ field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
182
+ } else {
183
+ field = rb_enc_str_new(raw_field, field_len, encoding);
184
+ }
185
+
186
+ rb_ary_push(elements, field);
187
+ }
188
+
189
+ return elements;
190
+ }
191
+
192
+ void Init_smarter_csv(void) {
193
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
194
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
195
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
196
+ Qempty_string = rb_str_new_literal("");
197
+ rb_gc_register_address(&Qempty_string);
198
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
199
+ }
@@ -0,0 +1,189 @@
1
+ #include "ruby.h"
2
+ #include "ruby/encoding.h"
3
+ #include <stdio.h>
4
+ #include <stdbool.h>
5
+ #include <string.h>
6
+
7
+ #ifndef bool
8
+ #define bool int
9
+ #define false ((bool)0)
10
+ #define true ((bool)1)
11
+ #endif
12
+
13
+ VALUE SmarterCSV = Qnil;
14
+ VALUE eMalformedCSVError = Qnil;
15
+ VALUE Parser = Qnil;
16
+ VALUE Qempty_string = Qnil; // shared frozen empty string
17
+
18
+ static VALUE unescape_quotes(char *str, long len, char quote_char, rb_encoding *encoding) {
19
+ char *buf = ALLOC_N(char, len);
20
+ long j = 0;
21
+ for (long i = 0; i < len; i++) {
22
+ if (str[i] == quote_char && i + 1 < len && str[i + 1] == quote_char) {
23
+ buf[j++] = quote_char;
24
+ i++; // skip second quote
25
+ } else {
26
+ buf[j++] = str[i];
27
+ }
28
+ }
29
+ VALUE out = rb_enc_str_new(buf, j, encoding);
30
+ xfree(buf);
31
+ return out;
32
+ }
33
+
34
+ static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
35
+ if (RB_TYPE_P(line, T_NIL) == 1) {
36
+ return rb_ary_new();
37
+ }
38
+
39
+ if (RB_TYPE_P(line, T_STRING) != 1) {
40
+ rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
41
+ }
42
+
43
+ rb_encoding *encoding = rb_enc_get(line);
44
+ char *startP = RSTRING_PTR(line);
45
+ long line_len = RSTRING_LEN(line);
46
+ char *endP = startP + line_len;
47
+ char *p = startP;
48
+
49
+ char *col_sepP = RSTRING_PTR(col_sep);
50
+ long col_sep_len = RSTRING_LEN(col_sep);
51
+
52
+ char *quoteP = RSTRING_PTR(quote_char);
53
+ char quote_char_val = quoteP[0];
54
+ size_t quote_len = strlen(quoteP);
55
+
56
+ VALUE elements = rb_ary_new();
57
+ VALUE field;
58
+
59
+ long element_count = 0;
60
+ int max_fields = -1;
61
+ if (max_size != Qnil) {
62
+ max_fields = NUM2INT(max_size);
63
+ if (max_fields < 0) {
64
+ return rb_ary_new();
65
+ }
66
+ }
67
+
68
+ bool has_quotes = RTEST(has_quotes_val);
69
+
70
+ // === FAST PATH: No quotes and single-character separator ===
71
+ if (!has_quotes && col_sep_len == 1) {
72
+ char sep = *col_sepP;
73
+ char *sep_pos = NULL;
74
+
75
+ while ((sep_pos = memchr(p, sep, endP - p))) {
76
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
77
+ break;
78
+ }
79
+
80
+ long field_len = sep_pos - startP;
81
+ char *raw_field = startP;
82
+ field = rb_enc_str_new(raw_field, field_len, encoding);
83
+ rb_ary_push(elements, field);
84
+ element_count++;
85
+
86
+ p = sep_pos + 1;
87
+ startP = p;
88
+ }
89
+
90
+ if ((max_fields < 0) || (element_count < max_fields)) {
91
+ long field_len = endP - startP;
92
+ char *raw_field = startP;
93
+ field = rb_enc_str_new(raw_field, field_len, encoding);
94
+ rb_ary_push(elements, field);
95
+ }
96
+
97
+ return elements;
98
+ }
99
+
100
+ // === SLOW PATH: Quoted fields or multi-char separator ===
101
+ long i;
102
+ long backslash_count = 0;
103
+ bool in_quotes = false;
104
+ bool col_sep_found = true;
105
+
106
+ while (p < endP) {
107
+ col_sep_found = true;
108
+ for (i = 0; (i < col_sep_len) && (p + i < endP); i++) {
109
+ if (*(p + i) != *(col_sepP + i)) {
110
+ col_sep_found = false;
111
+ break;
112
+ }
113
+ }
114
+
115
+ if (col_sep_found && !in_quotes) {
116
+ if ((max_fields >= 0) && (element_count >= max_fields)) {
117
+ break;
118
+ }
119
+
120
+ long field_len = p - startP;
121
+ char *raw_field = startP;
122
+
123
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
124
+ if (quoted) {
125
+ raw_field++;
126
+ field_len -= 2;
127
+ }
128
+
129
+ if (quoted || memchr(raw_field, quote_char_val, field_len)) {
130
+ field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
131
+ } else {
132
+ field = rb_enc_str_new(raw_field, field_len, encoding);
133
+ }
134
+
135
+ rb_ary_push(elements, field);
136
+ element_count++;
137
+
138
+ p += col_sep_len;
139
+ startP = p;
140
+ backslash_count = 0;
141
+ } else {
142
+ if (*p == '\\') {
143
+ backslash_count++;
144
+ } else {
145
+ if (*p == quote_char_val) {
146
+ if (backslash_count % 2 == 0) {
147
+ in_quotes = !in_quotes;
148
+ }
149
+ }
150
+ backslash_count = 0;
151
+ }
152
+ p++;
153
+ }
154
+ }
155
+
156
+ if (in_quotes) {
157
+ rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
158
+ }
159
+
160
+ if ((max_fields < 0) || (element_count < max_fields)) {
161
+ long field_len = endP - startP;
162
+ char *raw_field = startP;
163
+
164
+ bool quoted = (field_len >= 2 && raw_field[0] == quote_char_val && raw_field[field_len - 1] == quote_char_val);
165
+ if (quoted) {
166
+ raw_field++;
167
+ field_len -= 2;
168
+ }
169
+
170
+ if (quoted || memchr(raw_field, quote_char_val, field_len)) {
171
+ field = unescape_quotes(raw_field, field_len, quote_char_val, encoding);
172
+ } else {
173
+ field = rb_enc_str_new(raw_field, field_len, encoding);
174
+ }
175
+
176
+ rb_ary_push(elements, field);
177
+ }
178
+
179
+ return elements;
180
+ }
181
+
182
+ void Init_smarter_csv(void) {
183
+ SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
184
+ Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
185
+ eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
186
+ Qempty_string = rb_str_new_literal("");
187
+ rb_gc_register_address(&Qempty_string);
188
+ rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
189
+ }