fastcsv 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +11 -0
- data/README.md +37 -2
- data/TESTS.md +42 -0
- data/ext/fastcsv/fastcsv.c +281 -223
- data/ext/fastcsv/fastcsv.rl +149 -72
- data/fastcsv.gemspec +1 -1
- data/lib/fastcsv.rb +130 -0
- data/spec/fastcsv_spec.rb +189 -57
- data/spec/fixtures/csv.csv +3 -0
- data/spec/fixtures/iso-8859-1-quoted.csv +1 -0
- data/spec/fixtures/utf-8-quoted.csv +1 -0
- data/spec/spec_helper.rb +5 -0
- data/test/csv/base.rb +8 -0
- data/test/csv/line_endings.gz +0 -0
- data/test/csv/test_csv_parsing.rb +221 -0
- data/test/csv/test_csv_writing.rb +97 -0
- data/test/csv/test_data_converters.rb +263 -0
- data/test/csv/test_encodings.rb +339 -0
- data/test/csv/test_features.rb +317 -0
- data/test/csv/test_headers.rb +289 -0
- data/test/csv/test_interface.rb +362 -0
- data/test/csv/test_row.rb +349 -0
- data/test/csv/test_table.rb +420 -0
- data/test/csv/ts_all.rb +20 -0
- data/test/runner.rb +36 -0
- data/test/with_different_ofs.rb +17 -0
- metadata +38 -2
data/ext/fastcsv/fastcsv.rl
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
// http://w3c.github.io/csvw/syntax/#ebnf
|
6
6
|
|
7
7
|
// CSV implementation.
|
8
|
-
// https://github.com/ruby/ruby/blob/
|
8
|
+
// https://github.com/ruby/ruby/blob/trunk/lib/csv.rb
|
9
9
|
|
10
10
|
// Ruby C extensions help.
|
11
11
|
// https://github.com/ruby/ruby/blob/trunk/README.EXT
|
@@ -19,15 +19,24 @@ if (enc2 != NULL) { \
|
|
19
19
|
field = rb_str_encode(field, rb_enc_from_encoding(enc), 0, Qnil); \
|
20
20
|
}
|
21
21
|
|
22
|
-
|
23
|
-
|
22
|
+
#define FREE \
|
23
|
+
if (buf != NULL) { \
|
24
|
+
free(buf); \
|
25
|
+
} \
|
26
|
+
if (row_sep != NULL) { \
|
27
|
+
free(row_sep); \
|
28
|
+
}
|
24
29
|
|
25
|
-
|
26
|
-
|
30
|
+
static VALUE cClass, cParser, eError;
|
31
|
+
static ID s_read, s_row;
|
27
32
|
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
// @see https://github.com/nofxx/georuby_c/blob/b3b91fd90980d7c295ac8f6012d89878ea7cd569/ext/types.h#L22
|
34
|
+
typedef struct {
|
35
|
+
char *start;
|
36
|
+
} Data;
|
37
|
+
|
38
|
+
%%{
|
39
|
+
machine raw_parse;
|
31
40
|
|
32
41
|
action open_quote {
|
33
42
|
unclosed_line = curline;
|
@@ -74,7 +83,7 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
74
83
|
reader++;
|
75
84
|
}
|
76
85
|
|
77
|
-
field = rb_enc_str_new(copy, writer - copy,
|
86
|
+
field = rb_enc_str_new(copy, writer - copy, encoding);
|
78
87
|
ENCODE;
|
79
88
|
|
80
89
|
if (copy != NULL) {
|
@@ -88,7 +97,35 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
88
97
|
field = Qnil;
|
89
98
|
}
|
90
99
|
|
100
|
+
action mark_row {
|
101
|
+
d->start = p;
|
102
|
+
|
103
|
+
if (len_row_sep) {
|
104
|
+
if (p - mark_row_sep != len_row_sep || row_sep[0] != *mark_row_sep || len_row_sep == 2 && row_sep[1] != *(mark_row_sep + 1)) {
|
105
|
+
FREE;
|
106
|
+
|
107
|
+
rb_raise(eError, "Unquoted fields do not allow \\r or \\n (line %d).", curline - 1);
|
108
|
+
}
|
109
|
+
}
|
110
|
+
else {
|
111
|
+
len_row_sep = p - mark_row_sep;
|
112
|
+
row_sep = ALLOC_N(char, p - mark_row_sep);
|
113
|
+
memcpy(row_sep, mark_row_sep, p - mark_row_sep);
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
91
117
|
action new_row {
|
118
|
+
mark_row_sep = p;
|
119
|
+
|
120
|
+
curline++;
|
121
|
+
|
122
|
+
if (d->start == 0 || p == d->start) {
|
123
|
+
rb_ivar_set(self, s_row, rb_str_new2(""));
|
124
|
+
}
|
125
|
+
else if (p > d->start) {
|
126
|
+
rb_ivar_set(self, s_row, rb_str_new(d->start, p - d->start));
|
127
|
+
}
|
128
|
+
|
92
129
|
if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
|
93
130
|
rb_ary_push(row, field);
|
94
131
|
field = Qnil;
|
@@ -99,18 +136,26 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
99
136
|
}
|
100
137
|
|
101
138
|
action last_row {
|
139
|
+
if (d->start == 0 || p == d->start) {
|
140
|
+
rb_ivar_set(self, s_row, rb_str_new2(""));
|
141
|
+
}
|
142
|
+
else if (p > d->start) {
|
143
|
+
rb_ivar_set(self, s_row, rb_str_new(d->start, p - d->start));
|
144
|
+
}
|
145
|
+
|
102
146
|
if (!NIL_P(field) || RARRAY_LEN(row)) {
|
103
147
|
rb_ary_push(row, field);
|
104
148
|
}
|
149
|
+
|
105
150
|
if (RARRAY_LEN(row)) {
|
106
151
|
rb_yield(row);
|
107
152
|
}
|
108
153
|
}
|
109
154
|
|
110
|
-
EOF = 0
|
155
|
+
EOF = 0;
|
111
156
|
quote_char = '"';
|
112
157
|
col_sep = ',' >new_field;
|
113
|
-
row_sep = ('\r' '\n'? | '\n')
|
158
|
+
row_sep = ('\r' '\n'? | '\n');
|
114
159
|
unquoted = (any* -- quote_char -- col_sep -- row_sep - EOF) %read_unquoted;
|
115
160
|
quoted = quote_char >open_quote (any - quote_char - EOF | quote_char quote_char | row_sep)* %read_quoted quote_char >close_quote;
|
116
161
|
field = unquoted | quoted;
|
@@ -118,9 +163,9 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
118
163
|
# @see Ragel Guide: 6.3 Scanners
|
119
164
|
# An unquoted field can be zero-length.
|
120
165
|
main := |*
|
121
|
-
field col_sep
|
122
|
-
field row_sep >new_row
|
123
|
-
field EOF;
|
166
|
+
field col_sep;
|
167
|
+
field row_sep >new_row %mark_row;
|
168
|
+
field EOF >last_row;
|
124
169
|
*|;
|
125
170
|
}%%
|
126
171
|
|
@@ -130,9 +175,7 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
130
175
|
#define BUFSIZE 16384
|
131
176
|
|
132
177
|
// @see http://rxr.whitequark.org/mri/source/io.c#4845
|
133
|
-
static void
|
134
|
-
rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode)
|
135
|
-
{
|
178
|
+
static void rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode) {
|
136
179
|
int default_ext = 0;
|
137
180
|
|
138
181
|
if (ext == NULL) {
|
@@ -157,15 +200,17 @@ rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc,
|
|
157
200
|
}
|
158
201
|
}
|
159
202
|
|
160
|
-
VALUE
|
203
|
+
static VALUE raw_parse(int argc, VALUE *argv, VALUE self) {
|
161
204
|
int cs, act, have = 0, curline = 1, io = 0;
|
162
|
-
char *ts = 0, *te = 0, *buf = 0, *eof = 0;
|
205
|
+
char *ts = 0, *te = 0, *buf = 0, *eof = 0, *mark_row_sep = 0, *row_sep = NULL;
|
163
206
|
|
164
|
-
VALUE port, opts;
|
207
|
+
VALUE port, opts, r_encoding;
|
165
208
|
VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil;
|
166
|
-
int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0;
|
209
|
+
int done = 0, unclosed_line = 0, len_row_sep = 0, buffer_size = 0, taint = 0;
|
167
210
|
rb_encoding *enc = NULL, *enc2 = NULL, *encoding = NULL;
|
168
|
-
|
211
|
+
|
212
|
+
Data *d;
|
213
|
+
Data_Get_Struct(self, Data, d);
|
169
214
|
|
170
215
|
VALUE option;
|
171
216
|
char quote_char = '"';
|
@@ -174,8 +219,8 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
174
219
|
taint = OBJ_TAINTED(port);
|
175
220
|
io = rb_respond_to(port, s_read);
|
176
221
|
if (!io) {
|
177
|
-
if (rb_respond_to(port,
|
178
|
-
port = rb_funcall(port,
|
222
|
+
if (rb_respond_to(port, rb_intern("to_str"))) {
|
223
|
+
port = rb_funcall(port, rb_intern("to_str"), 0);
|
179
224
|
StringValue(port);
|
180
225
|
}
|
181
226
|
else {
|
@@ -199,7 +244,7 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
199
244
|
// @see http://ruby-doc.org/core-2.1.1/IO.html#method-c-new-label-Open+Mode
|
200
245
|
option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
|
201
246
|
if (TYPE(option) == T_STRING) {
|
202
|
-
// parse_mode_enc is not in header file.
|
247
|
+
// `parse_mode_enc` is not in header file.
|
203
248
|
const char *estr = StringValueCStr(option), *ptr;
|
204
249
|
char encname[ENCODING_MAXNAMELEN+1];
|
205
250
|
int idx, idx2;
|
@@ -210,17 +255,17 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
210
255
|
ptr = strrchr(estr, ':');
|
211
256
|
if (ptr) {
|
212
257
|
long len = (ptr++) - estr;
|
213
|
-
if (len == 0 || len > ENCODING_MAXNAMELEN) {
|
258
|
+
if (len == 0 || len > ENCODING_MAXNAMELEN) { // ":enc"
|
214
259
|
idx = -1;
|
215
260
|
}
|
216
|
-
else {
|
261
|
+
else { // "enc2:enc" or "enc:-"
|
217
262
|
memcpy(encname, estr, len);
|
218
263
|
encname[len] = '\0';
|
219
264
|
estr = encname;
|
220
265
|
idx = rb_enc_find_index(encname);
|
221
266
|
}
|
222
267
|
}
|
223
|
-
else {
|
268
|
+
else { // "enc"
|
224
269
|
idx = rb_enc_find_index(estr);
|
225
270
|
}
|
226
271
|
|
@@ -228,7 +273,7 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
228
273
|
ext_enc = rb_enc_from_index(idx);
|
229
274
|
}
|
230
275
|
else {
|
231
|
-
if (idx != -2) {
|
276
|
+
if (idx != -2) { // ":enc"
|
232
277
|
// `unsupported_encoding` is not in header file.
|
233
278
|
rb_warn("Unsupported encoding %s ignored", estr);
|
234
279
|
}
|
@@ -237,11 +282,11 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
237
282
|
|
238
283
|
int_enc = NULL;
|
239
284
|
if (ptr) {
|
240
|
-
if (*ptr == '-' && *(ptr+1) == '\0') {
|
285
|
+
if (*ptr == '-' && *(ptr+1) == '\0') { // "enc:-"
|
241
286
|
/* Special case - "-" => no transcoding */
|
242
287
|
int_enc = (rb_encoding *)Qnil;
|
243
288
|
}
|
244
|
-
else {
|
289
|
+
else { // "enc2:enc"
|
245
290
|
idx2 = rb_enc_find_index(ptr);
|
246
291
|
if (idx2 < 0) {
|
247
292
|
// `unsupported_encoding` is not in header file.
|
@@ -262,29 +307,33 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
262
307
|
rb_raise(rb_eArgError, ":encoding has to be a String");
|
263
308
|
}
|
264
309
|
|
265
|
-
// @see
|
266
|
-
// @see https://github.com/ruby/ruby/blob/
|
267
|
-
if (rb_respond_to(port,
|
268
|
-
r_encoding = rb_funcall(port,
|
310
|
+
// @see CSV#raw_encoding
|
311
|
+
// @see https://github.com/ruby/ruby/blob/ab337e61ecb5f42384ba7d710c36faf96a454e5c/lib/csv.rb#L2290
|
312
|
+
if (rb_respond_to(port, rb_intern("internal_encoding"))) {
|
313
|
+
r_encoding = rb_funcall(port, rb_intern("internal_encoding"), 0);
|
269
314
|
if (NIL_P(r_encoding)) {
|
270
|
-
r_encoding = rb_funcall(port,
|
315
|
+
r_encoding = rb_funcall(port, rb_intern("external_encoding"), 0);
|
271
316
|
}
|
272
317
|
}
|
273
|
-
else if (rb_respond_to(port,
|
274
|
-
r_encoding = rb_funcall(rb_funcall(port,
|
318
|
+
else if (rb_respond_to(port, rb_intern("string"))) {
|
319
|
+
r_encoding = rb_funcall(rb_funcall(port, rb_intern("string"), 0), rb_intern("encoding"), 0);
|
275
320
|
}
|
276
|
-
else if (rb_respond_to(port,
|
277
|
-
r_encoding = rb_funcall(port,
|
321
|
+
else if (rb_respond_to(port, rb_intern("encoding"))) {
|
322
|
+
r_encoding = rb_funcall(port, rb_intern("encoding"), 0);
|
278
323
|
}
|
279
324
|
else {
|
280
325
|
r_encoding = rb_enc_from_encoding(rb_ascii8bit_encoding());
|
281
326
|
}
|
327
|
+
|
328
|
+
// @see CSV#initialize
|
329
|
+
// @see https://github.com/ruby/ruby/blob/ab337e61ecb5f42384ba7d710c36faf96a454e5c/lib/csv.rb#L1510
|
282
330
|
if (NIL_P(r_encoding)) {
|
283
331
|
r_encoding = rb_enc_from_encoding(rb_default_internal_encoding());
|
284
332
|
}
|
285
333
|
if (NIL_P(r_encoding)) {
|
286
334
|
r_encoding = rb_enc_from_encoding(rb_default_external_encoding());
|
287
335
|
}
|
336
|
+
|
288
337
|
if (enc2 != NULL) {
|
289
338
|
encoding = enc2;
|
290
339
|
}
|
@@ -295,11 +344,19 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
295
344
|
encoding = rb_enc_get(r_encoding);
|
296
345
|
}
|
297
346
|
|
347
|
+
// Reset the row in case #raw_parse is called multiple times on the same parser. Note that
|
348
|
+
// using IO methods on a re-used parser can cause segmentation faults.
|
349
|
+
rb_ivar_set(self, s_row, Qnil);
|
350
|
+
|
298
351
|
buffer_size = BUFSIZE;
|
299
352
|
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
|
300
353
|
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
|
301
354
|
if (!NIL_P(bufsize)) {
|
302
355
|
buffer_size = NUM2INT(bufsize);
|
356
|
+
// buffer_size = 0 can cause segmentation faults.
|
357
|
+
if (buffer_size == 0) {
|
358
|
+
buffer_size = BUFSIZE;
|
359
|
+
}
|
303
360
|
}
|
304
361
|
}
|
305
362
|
|
@@ -312,26 +369,34 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
312
369
|
while (!done) {
|
313
370
|
VALUE str;
|
314
371
|
char *p, *pe;
|
315
|
-
int len, space = buffer_size - have, tokstart_diff, tokend_diff;
|
372
|
+
int len, space = buffer_size - have, tokstart_diff, tokend_diff, start_diff, mark_row_sep_diff;
|
316
373
|
|
317
374
|
if (io) {
|
318
375
|
if (space == 0) {
|
319
|
-
|
320
|
-
|
376
|
+
// Not moving d->start will cause intermittent segmentation faults.
|
377
|
+
tokstart_diff = ts - buf;
|
378
|
+
tokend_diff = te - buf;
|
379
|
+
start_diff = d->start - buf;
|
380
|
+
mark_row_sep_diff = mark_row_sep - buf;
|
321
381
|
|
322
|
-
|
323
|
-
|
382
|
+
buffer_size += BUFSIZE;
|
383
|
+
REALLOC_N(buf, char, buffer_size);
|
324
384
|
|
325
|
-
|
385
|
+
space = buffer_size - have;
|
326
386
|
|
327
|
-
|
328
|
-
|
387
|
+
ts = buf + tokstart_diff;
|
388
|
+
te = buf + tokend_diff;
|
389
|
+
d->start = buf + start_diff;
|
390
|
+
mark_row_sep = buf + mark_row_sep_diff;
|
329
391
|
}
|
330
392
|
p = buf + have;
|
331
393
|
|
394
|
+
// Reads "`length` bytes without any conversion (binary mode)."
|
395
|
+
// "The resulted string is always ASCII-8BIT encoding."
|
396
|
+
// @see http://www.ruby-doc.org/core-2.1.4/IO.html#method-i-read
|
332
397
|
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
333
398
|
if (NIL_P(str)) {
|
334
|
-
//
|
399
|
+
// "`nil` means it met EOF at beginning," e.g. for `StringIO.new("")`.
|
335
400
|
len = 0;
|
336
401
|
}
|
337
402
|
else {
|
@@ -339,6 +404,7 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
339
404
|
memcpy(p, StringValuePtr(str), len);
|
340
405
|
}
|
341
406
|
|
407
|
+
// "The 1 to `length`-1 bytes string means it met EOF after reading the result."
|
342
408
|
if (len < space) {
|
343
409
|
// EOF actions don't work in scanners, so we add a sentinel value.
|
344
410
|
// @see http://www.complang.org/pipermail/ragel-users/2007-May/001516.html
|
@@ -354,22 +420,21 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
354
420
|
done = 1;
|
355
421
|
}
|
356
422
|
|
423
|
+
if (d->start == 0) {
|
424
|
+
d->start = p;
|
425
|
+
}
|
426
|
+
|
357
427
|
pe = p + len;
|
358
428
|
%% write exec;
|
359
429
|
|
360
|
-
if (done && cs <
|
361
|
-
|
362
|
-
|
363
|
-
}
|
430
|
+
if (done && cs < raw_parse_first_final) {
|
431
|
+
FREE;
|
432
|
+
|
364
433
|
if (unclosed_line) {
|
365
|
-
rb_raise(
|
434
|
+
rb_raise(eError, "Unclosed quoted field on line %d.", unclosed_line);
|
366
435
|
}
|
367
|
-
// Ruby raises different errors for illegal quoting, depending on whether
|
368
|
-
// a quoted string is followed by a string ("Unclosed quoted field on line
|
369
|
-
// %d.") or by a string ending in a quote ("Missing or stray quote in line
|
370
|
-
// %d"). These precisions are kind of bogus, but we can try using $!.
|
371
436
|
else {
|
372
|
-
rb_raise(
|
437
|
+
rb_raise(eError, "Illegal quoting in line %d.", curline);
|
373
438
|
}
|
374
439
|
}
|
375
440
|
|
@@ -384,23 +449,35 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
384
449
|
}
|
385
450
|
}
|
386
451
|
|
387
|
-
|
388
|
-
free(buf);
|
389
|
-
}
|
452
|
+
FREE;
|
390
453
|
|
391
454
|
return Qnil;
|
392
455
|
}
|
393
456
|
|
457
|
+
// @see https://github.com/ruby/ruby/blob/trunk/README.EXT#L616
|
458
|
+
static VALUE allocate(VALUE class) {
|
459
|
+
// @see https://github.com/nofxx/georuby_c/blob/b3b91fd90980d7c295ac8f6012d89878ea7cd569/ext/line.c#L66
|
460
|
+
Data *d = ALLOC(Data);
|
461
|
+
d->start = 0;
|
462
|
+
// @see https://github.com/nofxx/georuby_c/blob/b3b91fd90980d7c295ac8f6012d89878ea7cd569/ext/point.h#L26
|
463
|
+
// rb_gc_mark(d->start) or rb_gc_mark(d) causes the warning "passing argument 1 of ‘rb_gc_mark’ makes integer from pointer without a cast"
|
464
|
+
// free(d->start) causes error "pointer being freed was not allocated"
|
465
|
+
return Data_Wrap_Struct(class, NULL, free, d);
|
466
|
+
}
|
467
|
+
|
468
|
+
// @see http://tenderlovemaking.com/2009/12/18/writing-ruby-c-extensions-part-1.html
|
469
|
+
// @see http://tenderlovemaking.com/2010/12/11/writing-ruby-c-extensions-part-2.html
|
394
470
|
void Init_fastcsv() {
|
395
471
|
s_read = rb_intern("read");
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
rb_define_attr(
|
404
|
-
|
405
|
-
|
472
|
+
s_row = rb_intern("@row");
|
473
|
+
|
474
|
+
cClass = rb_define_class("FastCSV", rb_const_get(rb_cObject, rb_intern("CSV"))); // class FastCSV < CSV
|
475
|
+
cParser = rb_define_class_under(cClass, "Parser", rb_cObject); // class Parser
|
476
|
+
rb_define_alloc_func(cParser, allocate); //
|
477
|
+
rb_define_method(cParser, "raw_parse", raw_parse, -1); // def raw_parse(port, opts = nil); end
|
478
|
+
rb_define_attr(cParser, "row", 1, 0); // attr_reader :row
|
479
|
+
rb_define_attr(cParser, "buffer_size", 1, 1); // attr_accessor :buffer_size
|
480
|
+
// end
|
481
|
+
eError = rb_define_class_under(cClass, "MalformedCSVError", rb_eRuntimeError); // class MalformedCSVError < RuntimeError
|
482
|
+
// end
|
406
483
|
}
|
data/fastcsv.gemspec
CHANGED
data/lib/fastcsv.rb
CHANGED
@@ -1 +1,131 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
1
3
|
require 'fastcsv/fastcsv'
|
4
|
+
|
5
|
+
# @see https://github.com/ruby/ruby/blob/ab337e61ecb5f42384ba7d710c36faf96a454e5c/lib/csv.rb
|
6
|
+
class FastCSV < CSV
|
7
|
+
def self.raw_parse(*args, &block)
|
8
|
+
Parser.new.raw_parse(*args, &block)
|
9
|
+
end
|
10
|
+
|
11
|
+
def shift
|
12
|
+
# COPY
|
13
|
+
# handle headers not based on document content
|
14
|
+
if header_row? and @return_headers and
|
15
|
+
[Array, String].include? @use_headers.class
|
16
|
+
if @unconverted_fields
|
17
|
+
return add_unconverted_fields(parse_headers, Array.new)
|
18
|
+
else
|
19
|
+
return parse_headers
|
20
|
+
end
|
21
|
+
end
|
22
|
+
# PASTE
|
23
|
+
|
24
|
+
# The CSV library wraps File objects, whereas `FastCSV.raw_parse` accepts
|
25
|
+
# IO-like objects that implement `#read(length)`.
|
26
|
+
begin
|
27
|
+
unless csv = fiber.resume # was unless parse = @io.gets(@row_sep)
|
28
|
+
return nil
|
29
|
+
end
|
30
|
+
rescue FiberError
|
31
|
+
return nil
|
32
|
+
end
|
33
|
+
|
34
|
+
row = parser.row
|
35
|
+
|
36
|
+
# COPY
|
37
|
+
if csv.empty?
|
38
|
+
#
|
39
|
+
# I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
|
40
|
+
# CSV's <tt>[nil]</tt>
|
41
|
+
#
|
42
|
+
if row.empty? # was if parse.empty?
|
43
|
+
@lineno += 1
|
44
|
+
if @skip_blanks
|
45
|
+
return shift # was next
|
46
|
+
elsif @unconverted_fields
|
47
|
+
return add_unconverted_fields(Array.new, Array.new)
|
48
|
+
elsif @use_headers
|
49
|
+
return self.class::Row.new(Array.new, Array.new)
|
50
|
+
else
|
51
|
+
return Array.new
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
55
|
+
# PASTE
|
56
|
+
|
57
|
+
return shift if @skip_lines and @skip_lines.match row # was next if @skip_lines and @skip_lines.match parse
|
58
|
+
|
59
|
+
# COPY
|
60
|
+
@lineno += 1
|
61
|
+
|
62
|
+
# save unconverted fields, if needed...
|
63
|
+
unconverted = csv.dup if @unconverted_fields
|
64
|
+
|
65
|
+
# convert fields, if needed...
|
66
|
+
csv = convert_fields(csv) unless @use_headers or @converters.empty?
|
67
|
+
# parse out header rows and handle CSV::Row conversions...
|
68
|
+
csv = parse_headers(csv) if @use_headers
|
69
|
+
|
70
|
+
# inject unconverted fields and accessor, if requested...
|
71
|
+
if @unconverted_fields and not csv.respond_to? :unconverted_fields
|
72
|
+
add_unconverted_fields(csv, unconverted)
|
73
|
+
end
|
74
|
+
# PASTE
|
75
|
+
|
76
|
+
csv # was break csv
|
77
|
+
end
|
78
|
+
|
79
|
+
# CSV's delegated and overridden IO methods move the pointer within the file,
|
80
|
+
# but FastCSV doesn't notice, so we need to recreate the fiber. The old fiber
|
81
|
+
# is garbage collected.
|
82
|
+
|
83
|
+
def pos=(*args)
|
84
|
+
super
|
85
|
+
@parser = nil
|
86
|
+
@fiber = nil
|
87
|
+
end
|
88
|
+
def reopen(*args)
|
89
|
+
super
|
90
|
+
@parser = nil
|
91
|
+
@fiber = nil
|
92
|
+
end
|
93
|
+
def seek(*args)
|
94
|
+
super
|
95
|
+
@parser = nil
|
96
|
+
@fiber = nil
|
97
|
+
end
|
98
|
+
def rewind
|
99
|
+
super
|
100
|
+
@parser = nil
|
101
|
+
@fiber = nil
|
102
|
+
end
|
103
|
+
|
104
|
+
private
|
105
|
+
|
106
|
+
def parser
|
107
|
+
@parser ||= Parser.new
|
108
|
+
end
|
109
|
+
|
110
|
+
def fiber
|
111
|
+
# @see http://www.ruby-doc.org/core-2.1.4/Fiber.html
|
112
|
+
@fiber ||= Fiber.new do
|
113
|
+
if @io.respond_to?(:internal_encoding)
|
114
|
+
enc2 = @io.external_encoding
|
115
|
+
enc = @io.internal_encoding || '-'
|
116
|
+
if enc2
|
117
|
+
encoding = "#{enc2}:#{enc}"
|
118
|
+
else
|
119
|
+
encoding = enc
|
120
|
+
end
|
121
|
+
end
|
122
|
+
parser.raw_parse(@io, encoding: encoding) do |row|
|
123
|
+
Fiber.yield(row)
|
124
|
+
end
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
def FastCSV(*args, &block)
|
130
|
+
FastCSV.instance(*args, &block)
|
131
|
+
end
|