smarter_csv 1.14.3 → 1.14.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +3 -0
- data/CONTRIBUTORS.md +1 -0
- data/lib/smarter_csv/auto_detection.rb +1 -1
- data/lib/smarter_csv/version.rb +1 -1
- metadata +2 -14
- data/ext/smarter_csv/Makefile +0 -270
- data/ext/smarter_csv/smarter_csv.c.works +0 -185
- data/ext/smarter_csv/smarter_csv.c.works10 +0 -199
- data/ext/smarter_csv/smarter_csv.c.works11 +0 -189
- data/ext/smarter_csv/smarter_csv.c.works14 +0 -230
- data/ext/smarter_csv/smarter_csv.c.works15 +0 -230
- data/ext/smarter_csv/smarter_csv.c.works2 +0 -192
- data/ext/smarter_csv/smarter_csv.c.works4 +0 -190
- data/ext/smarter_csv/smarter_csv.c.works5 +0 -203
- data/ext/smarter_csv/smarter_csv.c.works7 +0 -217
- data/ext/smarter_csv/smarter_csv.c.works8 +0 -193
- data/ext/smarter_csv/smarter_csv.c.works9 +0 -196
@@ -1,196 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "ruby/encoding.h"
|
3
|
-
#include <stdio.h>
|
4
|
-
#include <stdbool.h>
|
5
|
-
#include <string.h>
|
6
|
-
|
7
|
-
/*
 * Fallback boolean type for toolchains that provide no usable <stdbool.h>.
 *
 * Testing only `#ifndef bool` is not enough: when `bool` is a language
 * keyword rather than a macro (C23), that test succeeds and the keyword
 * would be redefined as `int`.  <stdbool.h> always defines
 * __bool_true_false_are_defined, so check that first.
 */
#if !defined(__bool_true_false_are_defined) && !defined(bool)
#define bool int
#define false ((bool)0)
#define true ((bool)1)
#endif
|
12
|
-
|
13
|
-
/*
 * Ruby objects shared by the whole extension.  All of them start out as
 * Qnil and are filled in by Init_smarter_csv().
 */
VALUE SmarterCSV = Qnil;          /* the ::SmarterCSV constant */
VALUE Parser = Qnil;              /* SmarterCSV::Parser, receives parse_csv_line_c */
VALUE eMalformedCSVError = Qnil;  /* SmarterCSV::MalformedCSV, raised on unclosed quotes */
/*
 * One shared empty String handed out for every empty field.
 * NOTE(review): Init_smarter_csv() does not freeze this object, so a caller
 * that mutates a returned empty field mutates all of them -- confirm intent.
 */
VALUE Qempty_string = Qnil;
|
17
|
-
|
18
|
-
/*
 * A field located inside the input line, described by position rather than
 * by copy.  Member order matters: the parser builds values of this type with
 * positional initialisers.
 */
struct ParsedField {
  const char *start;    /* first byte of the field content (enclosing quotes already skipped) */
  long length;          /* number of content bytes starting at `start` */
  bool needs_unescape;  /* true when the content holds a doubled quote that must be collapsed */
};
typedef struct ParsedField ParsedField;
|
23
|
-
|
24
|
-
/*
 * Strip one pair of enclosing quote characters from a field slice.
 *
 * start        in/out: pointer to the first byte of the slice
 * len          in/out: number of bytes in the slice
 * quote_char   the quote character to look for
 * found_double out: true when the stripped content contains two adjacent
 *              quote characters; false otherwise.  It is assigned on every
 *              path, so callers do not have to pre-initialise it.
 *
 * Returns true when the slice began and ended with quote_char (and was at
 * least two bytes long); *start / *len then describe the content between the
 * quotes.  Returns false and leaves *start / *len untouched otherwise.
 */
static bool detect_and_strip_quotes(const char **start, long *len, char quote_char, bool *found_double) {
  const char *slice = *start;
  const long slice_len = *len;

  *found_double = false;

  if (slice_len < 2 || slice[0] != quote_char || slice[slice_len - 1] != quote_char) {
    return false;
  }

  const char *inner = slice + 1;
  const long inner_len = slice_len - 2;
  *start = inner;
  *len = inner_len;

  /* look for "" inside the stripped content */
  for (long i = 0; i + 1 < inner_len; i++) {
    if (inner[i] == quote_char && inner[i + 1] == quote_char) {
      *found_double = true;
      break;
    }
  }
  return true;
}
|
39
|
-
|
40
|
-
/*
 * Build a Ruby String from `len` bytes at `start`, collapsing every pair of
 * adjacent quote characters into a single one.
 *
 * start       first byte of the field content
 * len         number of bytes to read
 * quote_char  the quote character whose doubled form is collapsed
 * enc         encoding given to the resulting String
 *
 * The bytes are copied once into a Ruby-owned String and compacted there, so
 * there is no separate scratch buffer that could leak if the String
 * allocation raises.
 */
static VALUE unescape_quotes(const char *start, long len, char quote_char, rb_encoding *enc) {
  VALUE str = rb_enc_str_new(start, len, enc);
  char *dst = RSTRING_PTR(str);
  long out = 0;

  for (long i = 0; i < len; i++) {
    const char c = start[i];
    dst[out++] = c;
    /* a doubled quote contributes one byte: skip its second half */
    if (c == quote_char && i + 1 < len && start[i + 1] == quote_char) {
      i++;
    }
  }

  /* out <= len, so this only ever shrinks the String */
  rb_str_set_len(str, out);
  return str;
}
|
57
|
-
|
58
|
-
static VALUE rb_parse_csv_line(VALUE self, VALUE line, VALUE col_sep, VALUE quote_char, VALUE max_size, VALUE has_quotes_val) {
|
59
|
-
if (RB_TYPE_P(line, T_NIL)) return rb_ary_new();
|
60
|
-
if (!RB_TYPE_P(line, T_STRING)) rb_raise(rb_eTypeError, "ERROR in SmarterCSV.parse_line: line has to be a string or nil");
|
61
|
-
|
62
|
-
rb_encoding *encoding = rb_enc_get(line);
|
63
|
-
char *startP = RSTRING_PTR(line);
|
64
|
-
long line_len = RSTRING_LEN(line);
|
65
|
-
char *endP = startP + line_len;
|
66
|
-
char *p = startP;
|
67
|
-
|
68
|
-
char *col_sepP = RSTRING_PTR(col_sep);
|
69
|
-
long col_sep_len = RSTRING_LEN(col_sep);
|
70
|
-
|
71
|
-
char *quoteP = RSTRING_PTR(quote_char);
|
72
|
-
char quote_char_val = quoteP[0];
|
73
|
-
size_t quote_len = strlen(quoteP);
|
74
|
-
|
75
|
-
VALUE elements = rb_ary_new();
|
76
|
-
|
77
|
-
long backslash_count = 0;
|
78
|
-
bool in_quotes = false;
|
79
|
-
long element_count = 0;
|
80
|
-
|
81
|
-
int max_fields = -1;
|
82
|
-
if (max_size != Qnil) {
|
83
|
-
max_fields = NUM2INT(max_size);
|
84
|
-
if (max_fields < 0) return rb_ary_new();
|
85
|
-
}
|
86
|
-
|
87
|
-
bool has_quotes = RTEST(has_quotes_val);
|
88
|
-
|
89
|
-
if (__builtin_expect(!has_quotes && col_sep_len == 1, 1)) {
|
90
|
-
char sep = *col_sepP;
|
91
|
-
char *sep_pos = NULL;
|
92
|
-
|
93
|
-
while ((sep_pos = memchr(p, sep, endP - p))) {
|
94
|
-
if ((max_fields >= 0) && (element_count >= max_fields)) break;
|
95
|
-
|
96
|
-
bool only_spaces = true;
|
97
|
-
for (char *s = startP; s < sep_pos; s++) {
|
98
|
-
if (*s != ' ') { only_spaces = false; break; }
|
99
|
-
}
|
100
|
-
|
101
|
-
VALUE field = only_spaces ? Qempty_string : rb_enc_str_new(startP, sep_pos - startP, encoding);
|
102
|
-
rb_ary_push(elements, field);
|
103
|
-
element_count++;
|
104
|
-
|
105
|
-
p = sep_pos + 1;
|
106
|
-
startP = p;
|
107
|
-
}
|
108
|
-
|
109
|
-
if ((max_fields < 0) || (element_count < max_fields)) {
|
110
|
-
bool only_spaces = true;
|
111
|
-
for (char *s = startP; s < endP; s++) {
|
112
|
-
if (*s != ' ') { only_spaces = false; break; }
|
113
|
-
}
|
114
|
-
VALUE field = only_spaces ? Qempty_string : rb_enc_str_new(startP, endP - startP, encoding);
|
115
|
-
rb_ary_push(elements, field);
|
116
|
-
}
|
117
|
-
|
118
|
-
return elements;
|
119
|
-
}
|
120
|
-
|
121
|
-
ParsedField *fields = ALLOC_N(ParsedField, 128);
|
122
|
-
long field_count = 0;
|
123
|
-
char *field_start = startP;
|
124
|
-
|
125
|
-
while (p < endP) {
|
126
|
-
bool col_sep_found = (p + col_sep_len <= endP) && (memcmp(p, col_sepP, col_sep_len) == 0);
|
127
|
-
|
128
|
-
if (col_sep_found && !in_quotes) {
|
129
|
-
if ((max_fields >= 0) && (field_count >= max_fields)) break;
|
130
|
-
|
131
|
-
long field_len = p - field_start;
|
132
|
-
const char *actual_start = field_start;
|
133
|
-
long actual_len = field_len;
|
134
|
-
bool needs_unescape = false;
|
135
|
-
bool quoted = detect_and_strip_quotes(&actual_start, &actual_len, quote_char_val, &needs_unescape);
|
136
|
-
if (!quoted) {
|
137
|
-
for (long i = 0; i < actual_len - 1; i++) {
|
138
|
-
if (actual_start[i] == quote_char_val && actual_start[i + 1] == quote_char_val) {
|
139
|
-
needs_unescape = true;
|
140
|
-
break;
|
141
|
-
}
|
142
|
-
}
|
143
|
-
}
|
144
|
-
|
145
|
-
fields[field_count++] = (ParsedField){ actual_start, actual_len, needs_unescape };
|
146
|
-
p += col_sep_len;
|
147
|
-
field_start = p;
|
148
|
-
backslash_count = 0;
|
149
|
-
} else {
|
150
|
-
if (*p == '\\') {
|
151
|
-
backslash_count++;
|
152
|
-
} else {
|
153
|
-
if (*p == quote_char_val && (backslash_count % 2 == 0)) {
|
154
|
-
in_quotes = !in_quotes;
|
155
|
-
}
|
156
|
-
backslash_count = 0;
|
157
|
-
}
|
158
|
-
p++;
|
159
|
-
}
|
160
|
-
}
|
161
|
-
|
162
|
-
if (in_quotes) {
|
163
|
-
xfree(fields);
|
164
|
-
rb_raise(eMalformedCSVError, "Unclosed quoted field detected in line: %s", StringValueCStr(line));
|
165
|
-
}
|
166
|
-
|
167
|
-
if ((max_fields < 0) || (field_count < max_fields)) {
|
168
|
-
long field_len = endP - field_start;
|
169
|
-
const char *actual_start = field_start;
|
170
|
-
long actual_len = field_len;
|
171
|
-
bool needs_unescape = false;
|
172
|
-
detect_and_strip_quotes(&actual_start, &actual_len, quote_char_val, &needs_unescape);
|
173
|
-
|
174
|
-
fields[field_count++] = (ParsedField){ actual_start, actual_len, needs_unescape };
|
175
|
-
}
|
176
|
-
|
177
|
-
for (long i = 0; i < field_count; i++) {
|
178
|
-
ParsedField f = fields[i];
|
179
|
-
VALUE field = f.length == 0 ? Qempty_string :
|
180
|
-
(f.needs_unescape ? unescape_quotes(f.start, f.length, quote_char_val, encoding)
|
181
|
-
: rb_enc_str_new(f.start, f.length, encoding));
|
182
|
-
rb_ary_push(elements, field);
|
183
|
-
}
|
184
|
-
|
185
|
-
xfree(fields);
|
186
|
-
return elements;
|
187
|
-
}
|
188
|
-
|
189
|
-
void Init_smarter_csv(void) {
|
190
|
-
SmarterCSV = rb_const_get(rb_cObject, rb_intern("SmarterCSV"));
|
191
|
-
Parser = rb_const_get(SmarterCSV, rb_intern("Parser"));
|
192
|
-
eMalformedCSVError = rb_const_get(SmarterCSV, rb_intern("MalformedCSV"));
|
193
|
-
Qempty_string = rb_str_new_literal("");
|
194
|
-
rb_gc_register_address(&Qempty_string);
|
195
|
-
rb_define_module_function(Parser, "parse_csv_line_c", rb_parse_csv_line, 5);
|
196
|
-
}
|