fastcsv 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +11 -0
- data/README.md +37 -2
- data/TESTS.md +42 -0
- data/ext/fastcsv/fastcsv.c +281 -223
- data/ext/fastcsv/fastcsv.rl +149 -72
- data/fastcsv.gemspec +1 -1
- data/lib/fastcsv.rb +130 -0
- data/spec/fastcsv_spec.rb +189 -57
- data/spec/fixtures/csv.csv +3 -0
- data/spec/fixtures/iso-8859-1-quoted.csv +1 -0
- data/spec/fixtures/utf-8-quoted.csv +1 -0
- data/spec/spec_helper.rb +5 -0
- data/test/csv/base.rb +8 -0
- data/test/csv/line_endings.gz +0 -0
- data/test/csv/test_csv_parsing.rb +221 -0
- data/test/csv/test_csv_writing.rb +97 -0
- data/test/csv/test_data_converters.rb +263 -0
- data/test/csv/test_encodings.rb +339 -0
- data/test/csv/test_features.rb +317 -0
- data/test/csv/test_headers.rb +289 -0
- data/test/csv/test_interface.rb +362 -0
- data/test/csv/test_row.rb +349 -0
- data/test/csv/test_table.rb +420 -0
- data/test/csv/ts_all.rb +20 -0
- data/test/runner.rb +36 -0
- data/test/with_different_ofs.rb +17 -0
- metadata +38 -2
data/ext/fastcsv/fastcsv.rl
CHANGED
@@ -5,7 +5,7 @@
|
|
5
5
|
// http://w3c.github.io/csvw/syntax/#ebnf
|
6
6
|
|
7
7
|
// CSV implementation.
|
8
|
-
// https://github.com/ruby/ruby/blob/
|
8
|
+
// https://github.com/ruby/ruby/blob/trunk/lib/csv.rb
|
9
9
|
|
10
10
|
// Ruby C extensions help.
|
11
11
|
// https://github.com/ruby/ruby/blob/trunk/README.EXT
|
@@ -19,15 +19,24 @@ if (enc2 != NULL) { \
|
|
19
19
|
field = rb_str_encode(field, rb_enc_from_encoding(enc), 0, Qnil); \
|
20
20
|
}
|
21
21
|
|
22
|
-
|
23
|
-
|
22
|
+
// Releases the two buffers owned by raw_parse: the read buffer `buf` and the
// remembered row separator `row_sep`. Both names must be in scope where the
// macro is expanded.
//
// - Wrapped in do { ... } while (0) so that `FREE;` is a single statement and
//   stays correct inside an unbraced if/else.
// - Uses xfree because the buffers are obtained from Ruby's allocator
//   (ALLOC_N / REALLOC_N), not from plain malloc.
// - Resets each pointer to NULL so a second expansion is harmless.
#define FREE \
  do { \
    if (buf != NULL) { \
      xfree(buf); \
      buf = NULL; \
    } \
    if (row_sep != NULL) { \
      xfree(row_sep); \
      row_sep = NULL; \
    } \
  } while (0)
|
24
29
|
|
25
|
-
|
26
|
-
|
30
|
+
// Ruby class handles, assigned in Init_fastcsv: FastCSV, FastCSV::Parser and
// FastCSV::MalformedCSVError respectively.
static VALUE cClass, cParser, eError;
// Interned IDs, assigned in Init_fastcsv: the `read` method name and the
// `@row` instance variable name.
static ID s_read, s_row;
|
27
32
|
|
28
|
-
|
29
|
-
|
30
|
-
|
33
|
+
// @see https://github.com/nofxx/georuby_c/blob/b3b91fd90980d7c295ac8f6012d89878ea7cd569/ext/types.h#L22
// Native state wrapped by each FastCSV::Parser object (see `allocate`).
typedef struct {
  // Start of the current row's raw text inside the read buffer; 0 until the
  // parser sets it. The `new_row` and `last_row` actions copy the bytes
  // between this pointer and `p` into @row.
  char *start;
} Data;
|
37
|
+
|
38
|
+
%%{
|
39
|
+
machine raw_parse;
|
31
40
|
|
32
41
|
action open_quote {
|
33
42
|
unclosed_line = curline;
|
@@ -74,7 +83,7 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
74
83
|
reader++;
|
75
84
|
}
|
76
85
|
|
77
|
-
field = rb_enc_str_new(copy, writer - copy,
|
86
|
+
field = rb_enc_str_new(copy, writer - copy, encoding);
|
78
87
|
ENCODE;
|
79
88
|
|
80
89
|
if (copy != NULL) {
|
@@ -88,7 +97,35 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
88
97
|
field = Qnil;
|
89
98
|
}
|
90
99
|
|
100
|
+
action mark_row {
|
101
|
+
d->start = p;
|
102
|
+
|
103
|
+
if (len_row_sep) {
|
104
|
+
if (p - mark_row_sep != len_row_sep || row_sep[0] != *mark_row_sep || len_row_sep == 2 && row_sep[1] != *(mark_row_sep + 1)) {
|
105
|
+
FREE;
|
106
|
+
|
107
|
+
rb_raise(eError, "Unquoted fields do not allow \\r or \\n (line %d).", curline - 1);
|
108
|
+
}
|
109
|
+
}
|
110
|
+
else {
|
111
|
+
len_row_sep = p - mark_row_sep;
|
112
|
+
row_sep = ALLOC_N(char, p - mark_row_sep);
|
113
|
+
memcpy(row_sep, mark_row_sep, p - mark_row_sep);
|
114
|
+
}
|
115
|
+
}
|
116
|
+
|
91
117
|
action new_row {
|
118
|
+
mark_row_sep = p;
|
119
|
+
|
120
|
+
curline++;
|
121
|
+
|
122
|
+
if (d->start == 0 || p == d->start) {
|
123
|
+
rb_ivar_set(self, s_row, rb_str_new2(""));
|
124
|
+
}
|
125
|
+
else if (p > d->start) {
|
126
|
+
rb_ivar_set(self, s_row, rb_str_new(d->start, p - d->start));
|
127
|
+
}
|
128
|
+
|
92
129
|
if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
|
93
130
|
rb_ary_push(row, field);
|
94
131
|
field = Qnil;
|
@@ -99,18 +136,26 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
99
136
|
}
|
100
137
|
|
101
138
|
action last_row {
|
139
|
+
if (d->start == 0 || p == d->start) {
|
140
|
+
rb_ivar_set(self, s_row, rb_str_new2(""));
|
141
|
+
}
|
142
|
+
else if (p > d->start) {
|
143
|
+
rb_ivar_set(self, s_row, rb_str_new(d->start, p - d->start));
|
144
|
+
}
|
145
|
+
|
102
146
|
if (!NIL_P(field) || RARRAY_LEN(row)) {
|
103
147
|
rb_ary_push(row, field);
|
104
148
|
}
|
149
|
+
|
105
150
|
if (RARRAY_LEN(row)) {
|
106
151
|
rb_yield(row);
|
107
152
|
}
|
108
153
|
}
|
109
154
|
|
110
|
-
EOF = 0
|
155
|
+
EOF = 0;
|
111
156
|
quote_char = '"';
|
112
157
|
col_sep = ',' >new_field;
|
113
|
-
row_sep = ('\r' '\n'? | '\n')
|
158
|
+
row_sep = ('\r' '\n'? | '\n');
|
114
159
|
unquoted = (any* -- quote_char -- col_sep -- row_sep - EOF) %read_unquoted;
|
115
160
|
quoted = quote_char >open_quote (any - quote_char - EOF | quote_char quote_char | row_sep)* %read_quoted quote_char >close_quote;
|
116
161
|
field = unquoted | quoted;
|
@@ -118,9 +163,9 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
118
163
|
# @see Ragel Guide: 6.3 Scanners
|
119
164
|
# An unquoted field can be zero-length.
|
120
165
|
main := |*
|
121
|
-
field col_sep
|
122
|
-
field row_sep >new_row
|
123
|
-
field EOF;
|
166
|
+
field col_sep;
|
167
|
+
field row_sep >new_row %mark_row;
|
168
|
+
field EOF >last_row;
|
124
169
|
*|;
|
125
170
|
}%%
|
126
171
|
|
@@ -130,9 +175,7 @@ static ID s_read, s_to_str, s_internal_encoding, s_external_encoding, s_string,
|
|
130
175
|
#define BUFSIZE 16384
|
131
176
|
|
132
177
|
// @see http://rxr.whitequark.org/mri/source/io.c#4845
|
133
|
-
static void
|
134
|
-
rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode)
|
135
|
-
{
|
178
|
+
static void rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc, rb_encoding **enc2, int fmode) {
|
136
179
|
int default_ext = 0;
|
137
180
|
|
138
181
|
if (ext == NULL) {
|
@@ -157,15 +200,17 @@ rb_io_ext_int_to_encs(rb_encoding *ext, rb_encoding *intern, rb_encoding **enc,
|
|
157
200
|
}
|
158
201
|
}
|
159
202
|
|
160
|
-
VALUE
|
203
|
+
static VALUE raw_parse(int argc, VALUE *argv, VALUE self) {
|
161
204
|
int cs, act, have = 0, curline = 1, io = 0;
|
162
|
-
char *ts = 0, *te = 0, *buf = 0, *eof = 0;
|
205
|
+
char *ts = 0, *te = 0, *buf = 0, *eof = 0, *mark_row_sep = 0, *row_sep = NULL;
|
163
206
|
|
164
|
-
VALUE port, opts;
|
207
|
+
VALUE port, opts, r_encoding;
|
165
208
|
VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil;
|
166
|
-
int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0;
|
209
|
+
int done = 0, unclosed_line = 0, len_row_sep = 0, buffer_size = 0, taint = 0;
|
167
210
|
rb_encoding *enc = NULL, *enc2 = NULL, *encoding = NULL;
|
168
|
-
|
211
|
+
|
212
|
+
Data *d;
|
213
|
+
Data_Get_Struct(self, Data, d);
|
169
214
|
|
170
215
|
VALUE option;
|
171
216
|
char quote_char = '"';
|
@@ -174,8 +219,8 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
174
219
|
taint = OBJ_TAINTED(port);
|
175
220
|
io = rb_respond_to(port, s_read);
|
176
221
|
if (!io) {
|
177
|
-
if (rb_respond_to(port,
|
178
|
-
port = rb_funcall(port,
|
222
|
+
if (rb_respond_to(port, rb_intern("to_str"))) {
|
223
|
+
port = rb_funcall(port, rb_intern("to_str"), 0);
|
179
224
|
StringValue(port);
|
180
225
|
}
|
181
226
|
else {
|
@@ -199,7 +244,7 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
199
244
|
// @see http://ruby-doc.org/core-2.1.1/IO.html#method-c-new-label-Open+Mode
|
200
245
|
option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
|
201
246
|
if (TYPE(option) == T_STRING) {
|
202
|
-
// parse_mode_enc is not in header file.
|
247
|
+
// `parse_mode_enc` is not in header file.
|
203
248
|
const char *estr = StringValueCStr(option), *ptr;
|
204
249
|
char encname[ENCODING_MAXNAMELEN+1];
|
205
250
|
int idx, idx2;
|
@@ -210,17 +255,17 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
210
255
|
ptr = strrchr(estr, ':');
|
211
256
|
if (ptr) {
|
212
257
|
long len = (ptr++) - estr;
|
213
|
-
if (len == 0 || len > ENCODING_MAXNAMELEN) {
|
258
|
+
if (len == 0 || len > ENCODING_MAXNAMELEN) { // ":enc"
|
214
259
|
idx = -1;
|
215
260
|
}
|
216
|
-
else {
|
261
|
+
else { // "enc2:enc" or "enc:-"
|
217
262
|
memcpy(encname, estr, len);
|
218
263
|
encname[len] = '\0';
|
219
264
|
estr = encname;
|
220
265
|
idx = rb_enc_find_index(encname);
|
221
266
|
}
|
222
267
|
}
|
223
|
-
else {
|
268
|
+
else { // "enc"
|
224
269
|
idx = rb_enc_find_index(estr);
|
225
270
|
}
|
226
271
|
|
@@ -228,7 +273,7 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
228
273
|
ext_enc = rb_enc_from_index(idx);
|
229
274
|
}
|
230
275
|
else {
|
231
|
-
if (idx != -2) {
|
276
|
+
if (idx != -2) { // ":enc"
|
232
277
|
// `unsupported_encoding` is not in header file.
|
233
278
|
rb_warn("Unsupported encoding %s ignored", estr);
|
234
279
|
}
|
@@ -237,11 +282,11 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
237
282
|
|
238
283
|
int_enc = NULL;
|
239
284
|
if (ptr) {
|
240
|
-
if (*ptr == '-' && *(ptr+1) == '\0') {
|
285
|
+
if (*ptr == '-' && *(ptr+1) == '\0') { // "enc:-"
|
241
286
|
/* Special case - "-" => no transcoding */
|
242
287
|
int_enc = (rb_encoding *)Qnil;
|
243
288
|
}
|
244
|
-
else {
|
289
|
+
else { // "enc2:enc"
|
245
290
|
idx2 = rb_enc_find_index(ptr);
|
246
291
|
if (idx2 < 0) {
|
247
292
|
// `unsupported_encoding` is not in header file.
|
@@ -262,29 +307,33 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
262
307
|
rb_raise(rb_eArgError, ":encoding has to be a String");
|
263
308
|
}
|
264
309
|
|
265
|
-
// @see
|
266
|
-
// @see https://github.com/ruby/ruby/blob/
|
267
|
-
if (rb_respond_to(port,
|
268
|
-
r_encoding = rb_funcall(port,
|
310
|
+
// @see CSV#raw_encoding
|
311
|
+
// @see https://github.com/ruby/ruby/blob/ab337e61ecb5f42384ba7d710c36faf96a454e5c/lib/csv.rb#L2290
|
312
|
+
if (rb_respond_to(port, rb_intern("internal_encoding"))) {
|
313
|
+
r_encoding = rb_funcall(port, rb_intern("internal_encoding"), 0);
|
269
314
|
if (NIL_P(r_encoding)) {
|
270
|
-
r_encoding = rb_funcall(port,
|
315
|
+
r_encoding = rb_funcall(port, rb_intern("external_encoding"), 0);
|
271
316
|
}
|
272
317
|
}
|
273
|
-
else if (rb_respond_to(port,
|
274
|
-
r_encoding = rb_funcall(rb_funcall(port,
|
318
|
+
else if (rb_respond_to(port, rb_intern("string"))) {
|
319
|
+
r_encoding = rb_funcall(rb_funcall(port, rb_intern("string"), 0), rb_intern("encoding"), 0);
|
275
320
|
}
|
276
|
-
else if (rb_respond_to(port,
|
277
|
-
r_encoding = rb_funcall(port,
|
321
|
+
else if (rb_respond_to(port, rb_intern("encoding"))) {
|
322
|
+
r_encoding = rb_funcall(port, rb_intern("encoding"), 0);
|
278
323
|
}
|
279
324
|
else {
|
280
325
|
r_encoding = rb_enc_from_encoding(rb_ascii8bit_encoding());
|
281
326
|
}
|
327
|
+
|
328
|
+
// @see CSV#initialize
|
329
|
+
// @see https://github.com/ruby/ruby/blob/ab337e61ecb5f42384ba7d710c36faf96a454e5c/lib/csv.rb#L1510
|
282
330
|
if (NIL_P(r_encoding)) {
|
283
331
|
r_encoding = rb_enc_from_encoding(rb_default_internal_encoding());
|
284
332
|
}
|
285
333
|
if (NIL_P(r_encoding)) {
|
286
334
|
r_encoding = rb_enc_from_encoding(rb_default_external_encoding());
|
287
335
|
}
|
336
|
+
|
288
337
|
if (enc2 != NULL) {
|
289
338
|
encoding = enc2;
|
290
339
|
}
|
@@ -295,11 +344,19 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
295
344
|
encoding = rb_enc_get(r_encoding);
|
296
345
|
}
|
297
346
|
|
347
|
+
// In case #raw_parse is called multiple times on the same parser. Note that
|
348
|
+
// using IO methods on a re-used parser can cause segmentation faults.
|
349
|
+
rb_ivar_set(self, s_row, Qnil);
|
350
|
+
|
298
351
|
buffer_size = BUFSIZE;
|
299
352
|
if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
|
300
353
|
bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
|
301
354
|
if (!NIL_P(bufsize)) {
|
302
355
|
buffer_size = NUM2INT(bufsize);
|
356
|
+
// buffer_size = 0 can cause segmentation faults.
|
357
|
+
if (buffer_size == 0) {
|
358
|
+
buffer_size = BUFSIZE;
|
359
|
+
}
|
303
360
|
}
|
304
361
|
}
|
305
362
|
|
@@ -312,26 +369,34 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
312
369
|
while (!done) {
|
313
370
|
VALUE str;
|
314
371
|
char *p, *pe;
|
315
|
-
int len, space = buffer_size - have, tokstart_diff, tokend_diff;
|
372
|
+
int len, space = buffer_size - have, tokstart_diff, tokend_diff, start_diff, mark_row_sep_diff;
|
316
373
|
|
317
374
|
if (io) {
|
318
375
|
if (space == 0) {
|
319
|
-
|
320
|
-
|
376
|
+
// Not moving d->start will cause intermittent segmentation faults.
|
377
|
+
tokstart_diff = ts - buf;
|
378
|
+
tokend_diff = te - buf;
|
379
|
+
start_diff = d->start - buf;
|
380
|
+
mark_row_sep_diff = mark_row_sep - buf;
|
321
381
|
|
322
|
-
|
323
|
-
|
382
|
+
buffer_size += BUFSIZE;
|
383
|
+
REALLOC_N(buf, char, buffer_size);
|
324
384
|
|
325
|
-
|
385
|
+
space = buffer_size - have;
|
326
386
|
|
327
|
-
|
328
|
-
|
387
|
+
ts = buf + tokstart_diff;
|
388
|
+
te = buf + tokend_diff;
|
389
|
+
d->start = buf + start_diff;
|
390
|
+
mark_row_sep = buf + mark_row_sep_diff;
|
329
391
|
}
|
330
392
|
p = buf + have;
|
331
393
|
|
394
|
+
// Reads "`length` bytes without any conversion (binary mode)."
|
395
|
+
// "The resulted string is always ASCII-8BIT encoding."
|
396
|
+
// @see http://www.ruby-doc.org/core-2.1.4/IO.html#method-i-read
|
332
397
|
str = rb_funcall(port, s_read, 1, INT2FIX(space));
|
333
398
|
if (NIL_P(str)) {
|
334
|
-
//
|
399
|
+
// "`nil` means it met EOF at beginning," e.g. for `StringIO.new("")`.
|
335
400
|
len = 0;
|
336
401
|
}
|
337
402
|
else {
|
@@ -339,6 +404,7 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
339
404
|
memcpy(p, StringValuePtr(str), len);
|
340
405
|
}
|
341
406
|
|
407
|
+
// "The 1 to `length`-1 bytes string means it met EOF after reading the result."
|
342
408
|
if (len < space) {
|
343
409
|
// EOF actions don't work in scanners, so we add a sentinel value.
|
344
410
|
// @see http://www.complang.org/pipermail/ragel-users/2007-May/001516.html
|
@@ -354,22 +420,21 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
354
420
|
done = 1;
|
355
421
|
}
|
356
422
|
|
423
|
+
if (d->start == 0) {
|
424
|
+
d->start = p;
|
425
|
+
}
|
426
|
+
|
357
427
|
pe = p + len;
|
358
428
|
%% write exec;
|
359
429
|
|
360
|
-
if (done && cs <
|
361
|
-
|
362
|
-
|
363
|
-
}
|
430
|
+
if (done && cs < raw_parse_first_final) {
|
431
|
+
FREE;
|
432
|
+
|
364
433
|
if (unclosed_line) {
|
365
|
-
rb_raise(
|
434
|
+
rb_raise(eError, "Unclosed quoted field on line %d.", unclosed_line);
|
366
435
|
}
|
367
|
-
// Ruby raises different errors for illegal quoting, depending on whether
|
368
|
-
// a quoted string is followed by a string ("Unclosed quoted field on line
|
369
|
-
// %d.") or by a string ending in a quote ("Missing or stray quote in line
|
370
|
-
// %d"). These precisions are kind of bogus, but we can try using $!.
|
371
436
|
else {
|
372
|
-
rb_raise(
|
437
|
+
rb_raise(eError, "Illegal quoting in line %d.", curline);
|
373
438
|
}
|
374
439
|
}
|
375
440
|
|
@@ -384,23 +449,35 @@ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
|
|
384
449
|
}
|
385
450
|
}
|
386
451
|
|
387
|
-
|
388
|
-
free(buf);
|
389
|
-
}
|
452
|
+
FREE;
|
390
453
|
|
391
454
|
return Qnil;
|
392
455
|
}
|
393
456
|
|
457
|
+
// @see https://github.com/ruby/ruby/blob/trunk/README.EXT#L616
|
458
|
+
static VALUE allocate(VALUE class) {
|
459
|
+
// @see https://github.com/nofxx/georuby_c/blob/b3b91fd90980d7c295ac8f6012d89878ea7cd569/ext/line.c#L66
|
460
|
+
Data *d = ALLOC(Data);
|
461
|
+
d->start = 0;
|
462
|
+
// @see https://github.com/nofxx/georuby_c/blob/b3b91fd90980d7c295ac8f6012d89878ea7cd569/ext/point.h#L26
|
463
|
+
// rb_gc_mark(d->start) or rb_gc_mark(d) cause warning "passing argument 1 of ‘rb_gc_mark’ makes integer from pointer without a cast"
|
464
|
+
// free(d->start) causes error "pointer being freed was not allocated"
|
465
|
+
return Data_Wrap_Struct(class, NULL, free, d);
|
466
|
+
}
|
467
|
+
|
468
|
+
// @see http://tenderlovemaking.com/2009/12/18/writing-ruby-c-extensions-part-1.html
|
469
|
+
// @see http://tenderlovemaking.com/2010/12/11/writing-ruby-c-extensions-part-2.html
|
394
470
|
void Init_fastcsv() {
|
395
471
|
s_read = rb_intern("read");
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
rb_define_attr(
|
404
|
-
|
405
|
-
|
472
|
+
s_row = rb_intern("@row");
|
473
|
+
|
474
|
+
cClass = rb_define_class("FastCSV", rb_const_get(rb_cObject, rb_intern("CSV"))); // class FastCSV < CSV
|
475
|
+
cParser = rb_define_class_under(cClass, "Parser", rb_cObject); // class Parser
|
476
|
+
rb_define_alloc_func(cParser, allocate); //
|
477
|
+
rb_define_method(cParser, "raw_parse", raw_parse, -1); // def raw_parse(port, opts = nil); end
|
478
|
+
rb_define_attr(cParser, "row", 1, 0); // attr_reader :row
|
479
|
+
rb_define_attr(cParser, "buffer_size", 1, 1); // attr_accessor :buffer_size
|
480
|
+
// end
|
481
|
+
eError = rb_define_class_under(cClass, "MalformedCSVError", rb_eRuntimeError); // class MalformedCSVError < RuntimeError
|
482
|
+
// end
|
406
483
|
}
|
data/fastcsv.gemspec
CHANGED
data/lib/fastcsv.rb
CHANGED
@@ -1 +1,131 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
1
3
|
require 'fastcsv/fastcsv'
|
4
|
+
|
5
|
+
# @see https://github.com/ruby/ruby/blob/ab337e61ecb5f42384ba7d710c36faf96a454e5c/lib/csv.rb
#
# CSV subclass whose rows come from FastCSV::Parser#raw_parse, driven through a
# Fiber, instead of from CSV's own line parser. Sections between the COPY and
# PASTE markers are taken from CSV#shift.
class FastCSV < CSV
  # Runs Parser#raw_parse on a fresh parser, forwarding all arguments and the
  # block.
  def self.raw_parse(*args, &block)
    Parser.new.raw_parse(*args, &block)
  end

  # Returns the next row, processed according to the header, converter and
  # skip options, or nil once the parsing fiber has nothing more to yield.
  def shift
    # COPY
    # handle headers not based on document content
    if header_row? and @return_headers and
       [Array, String].include? @use_headers.class
      if @unconverted_fields
        return add_unconverted_fields(parse_headers, Array.new)
      else
        return parse_headers
      end
    end
    # PASTE

    # The CSV library wraps File objects, whereas `FastCSV.raw_parse` accepts
    # IO-like objects that implement `#read(length)`.
    begin
      unless csv = fiber.resume # was unless parse = @io.gets(@row_sep)
        return nil
      end
    rescue FiberError
      # Raised when the fiber has already finished: treat as end of input.
      return nil
    end

    # Raw text of the row that was just yielded, as recorded by the parser.
    row = parser.row

    # COPY
    if csv.empty?
      #
      # I believe a blank line should be an <tt>Array.new</tt>, not Ruby 1.8
      # CSV's <tt>[nil]</tt>
      #
      if row.empty? # was if parse.empty?
        @lineno += 1
        if @skip_blanks
          return shift # was next
        elsif @unconverted_fields
          return add_unconverted_fields(Array.new, Array.new)
        elsif @use_headers
          return self.class::Row.new(Array.new, Array.new)
        else
          return Array.new
        end
      end
    end
    # PASTE

    return shift if @skip_lines and @skip_lines.match row # was next if @skip_lines and @skip_lines.match parse

    # COPY
    @lineno += 1

    # save unconverted fields, if needed...
    unconverted = csv.dup if @unconverted_fields

    # convert fields, if needed...
    csv = convert_fields(csv) unless @use_headers or @converters.empty?
    # parse out header rows and handle CSV::Row conversions...
    csv = parse_headers(csv) if @use_headers

    # inject unconverted fields and accessor, if requested...
    if @unconverted_fields and not csv.respond_to? :unconverted_fields
      add_unconverted_fields(csv, unconverted)
    end
    # PASTE

    csv # was break csv
  end

  # CSV's delegated and overwritten IO methods move the pointer within the file,
  # but FastCSV doesn't notice, so we need to recreate the fiber. The old fiber
  # is garbage collected.
  #
  # Each override returns whatever the superclass method returned, so callers
  # still see the usual result (e.g. the value of `rewind` or `seek`) rather
  # than nil.

  def pos=(*args)
    result = super
    reset_parser
    result
  end

  def reopen(*args)
    result = super
    reset_parser
    result
  end

  def seek(*args)
    result = super
    reset_parser
    result
  end

  def rewind
    result = super
    reset_parser
    result
  end

  private

  # Drops the current parser and fiber so that the next #shift starts a new
  # parse from the IO's current position.
  def reset_parser
    @parser = nil
    @fiber = nil
  end

  # Lazily created Parser used by the fiber and queried for the raw row text.
  def parser
    @parser ||= Parser.new
  end

  # Lazily created Fiber that runs Parser#raw_parse over @io and yields one
  # parsed row per #resume.
  def fiber
    # @see http://www.ruby-doc.org/core-2.1.4/Fiber.html
    @fiber ||= Fiber.new do
      # Build the :encoding option in "external:internal" form, using "-" when
      # the IO has no internal encoding.
      # NOTE(review): `encoding` stays nil when @io does not respond to
      # #internal_encoding, and is not a String when the external encoding is
      # nil but an internal encoding is set — confirm raw_parse accepts both.
      if @io.respond_to?(:internal_encoding)
        enc2 = @io.external_encoding
        enc = @io.internal_encoding || '-'
        if enc2
          encoding = "#{enc2}:#{enc}"
        else
          encoding = enc
        end
      end
      parser.raw_parse(@io, encoding: encoding) do |row|
        Fiber.yield(row)
      end
    end
  end
end
|
128
|
+
|
129
|
+
# Top-level shortcut that forwards its arguments and block to
# FastCSV.instance (not defined in this file; presumably inherited from CSV).
def FastCSV(*args, &block)
  FastCSV.instance(*args, &block)
end
|