fastcsv 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,356 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ // CSV specifications.
4
+ // http://tools.ietf.org/html/rfc4180
5
+ // http://w3c.github.io/csvw/syntax/#ebnf
6
+
7
+ // CSV implementation.
8
+ // https://github.com/ruby/ruby/blob/master/lib/csv.rb
9
+
10
+ // Ruby C extensions help.
11
+ // https://github.com/ruby/ruby/blob/trunk/README.EXT
12
+ // http://rxr.whitequark.org/mri/source
13
+
14
+ // Ragel help.
15
+ // https://www.mail-archive.com/ragel-users@complang.org/
16
+ // ASSOCIATE_INDEX finalizes the encoding of `field` using the enclosing scope's internal_index and external_encoding: when internal_index >= 0 the bytes are tagged with that encoding and then transcoded to external_encoding; otherwise the bytes are only tagged with external_encoding. NOTE: it expands to a bare if/else, so it is only safe as a full statement inside braces.
17
+ # define ASSOCIATE_INDEX \
18
+ if (internal_index >= 0) { \
19
+ rb_enc_associate_index(field, internal_index); \
20
+ field = rb_str_encode(field, rb_enc_from_encoding(external_encoding), 0, Qnil); \
21
+ } \
22
+ else { \
23
+ rb_enc_associate_index(field, rb_enc_to_index(external_encoding)); \
24
+ }
25
+
26
+ static VALUE mModule, rb_eParseError;
27
+ static ID s_read, s_to_str;
28
+
29
+ %%{
30
+ machine fastcsv;
31
+
32
+ action new_line {
33
+ curline++;
34
+ }
35
+
36
+ action open_quote {
37
+ unclosed_line = curline;
38
+ }
39
+
40
+ action close_quote {
41
+ unclosed_line = 0;
42
+ }
43
+
44
+ action read_unquoted {
45
+ if (p == ts) {
46
+ // Unquoted empty fields are nil, not "", in Ruby.
47
+ field = Qnil;
48
+ }
49
+ else if (p > ts) {
50
+ field = rb_str_new(ts, p - ts);
51
+ ASSOCIATE_INDEX;
52
+ }
53
+ }
54
+
55
+ action read_quoted {
56
+ // Builds `field` from the bytes in ts..p: a quote_char that does not follow an unconsumed quote_char is dropped, every other byte is kept.
57
+ if (p == ts) {
58
+ field = rb_str_new2("");
59
+ ASSOCIATE_INDEX;
60
+ }
61
+ // @note If we add an action on '""', we can skip some steps if no '""' is found.
62
+ else if (p > ts) {
63
+ // Operating on ts in-place produces odd behavior, FYI.
64
+ // The scratch buffer is write-only until rb_str_new reads back exactly the bytes the loop wrote, so it needs no initial copy of the input.
65
+ char *copy = ALLOC_N(char, p - ts);
66
+ char *reader = ts, *writer = copy;
67
+ int escaped = 0;
68
+
69
+ while (p > reader) {
70
+ if (*reader == quote_char && !escaped) {
71
+ // Skip the escaping character.
72
+ escaped = 1;
73
+ }
74
+ else {
75
+ escaped = 0;
76
+ *writer++ = *reader;
77
+ }
78
+ reader++;
79
+ }
80
+
81
+ field = rb_str_new(copy, writer - copy);
82
+ // ALLOC_N memory belongs to Ruby's allocator and must be released with xfree(), not free().
83
+ // Release it before ASSOCIATE_INDEX: transcoding can raise, which would otherwise leak the buffer.
84
+ xfree(copy);
85
+
86
+ ASSOCIATE_INDEX;
87
+ }
88
+ }
89
+
90
+ action new_field {
91
+ rb_ary_push(row, field);
92
+ field = Qnil;
93
+ }
94
+
95
+ action new_row {
96
+ if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
97
+ rb_ary_push(row, field);
98
+ field = Qnil;
99
+ }
100
+
101
+ rb_yield(row);
102
+ row = rb_ary_new();
103
+ }
104
+
105
+ action last_row {
106
+ if (!NIL_P(field) || RARRAY_LEN(row)) {
107
+ rb_ary_push(row, field);
108
+ }
109
+ if (RARRAY_LEN(row)) {
110
+ rb_yield(row);
111
+ }
112
+ }
113
+
114
+ EOF = 0 >last_row;
115
+ quote_char = '"';
116
+ col_sep = ',' >new_field;
117
+ row_sep = ('\r' '\n'? | '\n') @new_line;
118
+ unquoted = (any* -- quote_char -- col_sep -- row_sep - EOF) %read_unquoted;
119
+ quoted = quote_char >open_quote (any - quote_char - EOF | quote_char quote_char | row_sep)* %read_quoted quote_char >close_quote;
120
+ field = unquoted | quoted;
121
+ # fields = (field col_sep)* field?;
122
+ # file = (fields row_sep >new_row)* fields?;
123
+
124
+ # @see Ragel Guide: 6.3 Scanners
125
+ # Remember that an unquoted field can be zero-length.
126
+ main := |*
127
+ field col_sep EOF?;
128
+ field row_sep >new_row EOF?;
129
+ field EOF;
130
+ *|;
131
+
132
+ # Non-scanner version requires very large buffer.
133
+ # main := file $/{
134
+ # if (!NIL_P(field) || RARRAY_LEN(row)) {
135
+ # rb_ary_push(row, field);
136
+ # rb_yield(row);
137
+ # }
138
+ # };
139
+ }%%
140
+
141
+ %% write data;
142
+
143
+ #define BUFSIZE 16384
144
+ // FastCSV.raw_parse(port[, opts]): parses CSV from `port` (an object responding to #read, otherwise to #to_str) and yields each row as an Array to the given block. `opts` is nil or a Hash; only :encoding is read. Raises ArgumentError for unusable arguments and FastCSV::ParseError for malformed input. Returns nil.
145
+ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
146
+ int cs, act, have = 0, curline = 1, io = 0;
147
+ char *ts = 0, *te = 0, *buf = 0, *eof = 0;
148
+ // internal_index starts at -1 ("no source encoding requested") so that, without an :encoding option, ASSOCIATE_INDEX only tags fields instead of transcoding them from index 0 (binary), which raises on non-ASCII bytes.
149
+ VALUE port, opts;
150
+ VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil;
151
+ int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0;
152
+ int internal_index = -1, external_index = rb_enc_to_index(rb_default_external_encoding());
153
+ rb_encoding *external_encoding = rb_default_external_encoding();
154
+
155
+ VALUE option;
156
+ char quote_char = '"'; //, *col_sep = ",", *row_sep = "\r\n";
157
+
158
+ rb_scan_args(argc, argv, "11", &port, &opts);
159
+ taint = OBJ_TAINTED(port);
160
+ io = rb_respond_to(port, s_read);
161
+ if (!io) {
162
+ if (rb_respond_to(port, s_to_str)) {
163
+ port = rb_funcall(port, s_to_str, 0);
164
+ StringValue(port);
165
+ }
166
+ else {
167
+ rb_raise(rb_eArgError, "data has to respond to #read or #to_str");
168
+ }
169
+ }
170
+
171
+ if (NIL_P(opts)) {
172
+ opts = rb_hash_new();
173
+ }
174
+ else if (TYPE(opts) != T_HASH) {
175
+ rb_raise(rb_eArgError, "options has to be a Hash or nil");
176
+ }
177
+
178
+ // @note Add machines for common CSV dialects, or see if we can use "when"
179
+ // from Chapter 6 to compare the character to the host program's variable.
180
+ // option = rb_hash_aref(opts, ID2SYM(rb_intern("quote_char")));
181
+ // if (TYPE(option) == T_STRING && RSTRING_LEN(option) == 1) {
182
+ // quote_char = *StringValueCStr(option);
183
+ // }
184
+ // else if (!NIL_P(option)) {
185
+ // rb_raise(rb_eArgError, ":quote_char has to be a single character String");
186
+ // }
187
+
188
+ // option = rb_hash_aref(opts, ID2SYM(rb_intern("col_sep")));
189
+ // if (TYPE(option) == T_STRING) {
190
+ // col_sep = StringValueCStr(option);
191
+ // }
192
+ // else if (!NIL_P(option)) {
193
+ // rb_raise(rb_eArgError, ":col_sep has to be a String");
194
+ // }
195
+
196
+ // option = rb_hash_aref(opts, ID2SYM(rb_intern("row_sep")));
197
+ // if (TYPE(option) == T_STRING) {
198
+ // row_sep = StringValueCStr(option);
199
+ // }
200
+ // else if (!NIL_P(option)) {
201
+ // rb_raise(rb_eArgError, ":row_sep has to be a String");
202
+ // }
203
+ // :encoding is "first" or "first:second". The name before the colon goes to internal_index (the encoding the bytes are tagged with); the name after it goes to external_encoding (the transcoding target).
204
+ option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
205
+ if (TYPE(option) == T_STRING) {
206
+ // @see parse_mode_enc in Ruby's io.c
207
+ const char *string = StringValueCStr(option), *pointer;
208
+ char internal_encoding_name[ENCODING_MAXNAMELEN + 1];
209
+
210
+ pointer = strrchr(string, ':');
211
+ if (pointer) {
212
+ long len = (pointer++) - string;
213
+ if (len == 0 || len > ENCODING_MAXNAMELEN) {
214
+ internal_index = -1;
215
+ }
216
+ else {
217
+ memcpy(internal_encoding_name, string, len);
218
+ internal_encoding_name[len] = '\0';
219
+ string = internal_encoding_name;
220
+ internal_index = rb_enc_find_index(internal_encoding_name);
221
+ }
222
+ }
223
+ else {
224
+ internal_index = rb_enc_find_index(string);
225
+ }
226
+
227
+ if (internal_index < 0 && internal_index != -2) {
228
+ rb_warn("Unsupported encoding %s ignored", string);
229
+ }
230
+
231
+ if (pointer) {
232
+ external_index = rb_enc_find_index(pointer);
233
+ if (external_index >= 0) {
234
+ external_encoding = rb_enc_from_index(external_index);
235
+ }
236
+ else {
237
+ rb_warn("Unsupported encoding %s ignored", pointer); // Report the name that actually failed to resolve.
238
+ }
239
+ }
240
+ else if (internal_index >= 0) {
241
+ external_encoding = rb_enc_from_index(internal_index);
242
+ }
243
+ }
244
+ else if (!NIL_P(option)) {
245
+ rb_raise(rb_eArgError, ":encoding has to be a String");
246
+ }
247
+
248
+ buffer_size = BUFSIZE;
249
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
250
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
251
+ if (!NIL_P(bufsize)) {
252
+ buffer_size = NUM2INT(bufsize);
253
+ }
254
+ }
255
+
256
+ if (io) {
257
+ buf = ALLOC_N(char, buffer_size);
258
+ }
259
+
260
+ %% write init;
261
+
262
+ while (!done) {
263
+ VALUE str;
264
+ char *p, *pe;
265
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff;
266
+
267
+ if (io) {
268
+ if (space == 0) {
269
+ tokstart_diff = ts - buf;
270
+ tokend_diff = te - buf;
271
+
272
+ buffer_size += BUFSIZE;
273
+ REALLOC_N(buf, char, buffer_size);
274
+
275
+ space = buffer_size - have;
276
+
277
+ ts = buf + tokstart_diff;
278
+ te = buf + tokend_diff;
279
+ }
280
+ p = buf + have;
281
+
282
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
283
+ if (NIL_P(str)) {
284
+ // StringIO#read returns nil for empty string.
285
+ len = 0;
286
+ }
287
+ else {
288
+ len = RSTRING_LEN(str);
289
+ memcpy(p, StringValuePtr(str), len);
290
+ }
291
+
292
+ if (len < space) {
293
+ // EOF actions don't work in scanners, so we add a sentinel value.
294
+ // @see http://www.complang.org/pipermail/ragel-users/2007-May/001516.html
295
+ // @see https://github.com/leeonix/lua-csv-ragel/blob/master/src/csv.rl
296
+ p[len++] = 0;
297
+ done = 1;
298
+ }
299
+ }
300
+ else {
301
+ p = RSTRING_PTR(port);
302
+ len = RSTRING_LEN(port);
303
+ p[len++] = 0;
304
+ done = 1;
305
+ }
306
+
307
+ pe = p + len;
308
+ // if (done) {
309
+ // // This triggers the eof action in the non-scanner version.
310
+ // eof = pe;
311
+ // }
312
+ %% write exec;
313
+
314
+ if (done && cs < fastcsv_first_final) {
315
+ if (buf != NULL) {
316
+ xfree(buf); // buf came from ALLOC_N/REALLOC_N, so it must be released with xfree().
317
+ }
318
+ if (unclosed_line) {
319
+ rb_raise(rb_eParseError, "Unclosed quoted field on line %d.", unclosed_line);
320
+ }
321
+ // Ruby raises different errors for illegal quoting, depending on whether
322
+ // a quoted string is followed by a string ("Unclosed quoted field on line
323
+ // %d.") or by a string ending in a quote ("Missing or stray quote in line
324
+ // %d"). These precisions are kind of bogus, but we can try using $!.
325
+ else {
326
+ rb_raise(rb_eParseError, "Illegal quoting in line %d.", curline);
327
+ }
328
+ }
329
+ // Carry the unfinished token (ts..pe) to the front of the buffer so the next read appends to it.
330
+ if (ts == 0) {
331
+ have = 0;
332
+ }
333
+ else if (io) {
334
+ have = pe - ts;
335
+ memmove(buf, ts, have);
336
+ te = buf + (te - ts);
337
+ ts = buf;
338
+ }
339
+ }
340
+
341
+ if (buf != NULL) {
342
+ xfree(buf); // Same allocator rule as above: ALLOC_N memory goes back through xfree().
343
+ }
344
+
345
+ return Qnil;
346
+ }
347
+ // Extension initializer (Ruby's Init_<name> convention): interns the "read" and "to_str" method IDs, defines the FastCSV module with a singleton buffer_size reader/writer and the raw_parse singleton method (bound to fastcsv with variable arity), and defines FastCSV::ParseError as a StandardError subclass.
348
+ void Init_fastcsv() {
349
+ s_read = rb_intern("read");
350
+ s_to_str = rb_intern("to_str");
351
+
352
+ mModule = rb_define_module("FastCSV");
353
+ rb_define_attr(rb_singleton_class(mModule), "buffer_size", 1, 1);
354
+ rb_define_singleton_method(mModule, "raw_parse", fastcsv, -1);
355
+ rb_eParseError = rb_define_class_under(mModule, "ParseError", rb_eStandardError);
356
+ }
data/fastcsv.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = "fastcsv"
5
+ s.version = '0.0.1'
6
+ s.platform = Gem::Platform::RUBY
7
+ s.authors = ["Open North"]
8
+ s.email = ["info@opennorth.ca"]
9
+ s.homepage = "http://github.com/opennorth/fastcsv"
10
+ s.summary = %q{A fast Ragel-based CSV parser}
11
+ s.license = 'MIT'
12
+
13
+ s.files = `git ls-files`.split("\n")
14
+ s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
15
+ s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
16
+ s.require_paths = ["lib"]
17
+ s.extensions = ["ext/fastcsv/extconf.rb"]
18
+
19
+ s.add_development_dependency('coveralls')
20
+ s.add_development_dependency('json', '~> 1.7.7') # to silence coveralls warning
21
+ s.add_development_dependency('rake')
22
+ s.add_development_dependency('rake-compiler')
23
+ s.add_development_dependency('rspec', '~> 3.1')
24
+ end
data/lib/fastcsv.rb ADDED
@@ -0,0 +1 @@
1
+ require 'fastcsv/fastcsv'
@@ -0,0 +1,218 @@
1
+ require 'spec_helper'
2
+
3
+ require 'csv'
4
+
5
+ RSpec.shared_examples 'a CSV parser' do
6
+ let :simple do
7
+ "foo\nbar\nbaz"
8
+ end
9
+
10
+ [
11
+ # Single tokens.
12
+ "",
13
+ "x",
14
+ %(""),
15
+ %("x"),
16
+ ",",
17
+ "\n",
18
+
19
+ # Last tokens.
20
+ "x,y",
21
+ %(x,"y"),
22
+ "x,",
23
+ "x\n",
24
+
25
+ # Line endings.
26
+ "\n\n\n",
27
+ "\r\r\r",
28
+ "\r\n\r\n\r\n",
29
+ "foo\rbar\rbaz\r",
30
+ "foo\nbar\nbaz\n",
31
+ "foo\r\nbar\r\nbaz\r\n",
32
+
33
+ # Repetition.
34
+ "x,x,x",
35
+ "x\nx\nx",
36
+ %("x","x","x"),
37
+ %("x"\n"x"\n"x"),
38
+ ",,,",
39
+ ",\n,\n,",
40
+
41
+ # Blank.
42
+ %(,""),
43
+ %("",),
44
+ "\n\n\nfoo\n\n\n",
45
+
46
+ # Whitespace.
47
+ " x",
48
+ "x ",
49
+ " x ",
50
+ # Tab.
51
+ " x",
52
+ "x ",
53
+ " x ",
54
+
55
+ # Quoting.
56
+ %(foo,"bar,baz",bzz),
57
+ %(foo,"bar\nbaz",bzz),
58
+ %(foo,"""bar""baz""bzz""",zzz),
59
+
60
+ # Buffers.
61
+ "01234567890" * 2_000, # 20,000 > BUFSIZE
62
+ "0123456789," * 2_000,
63
+
64
+ # Uneven rows.
65
+ "1,2,3\n1,2",
66
+ "1,2\n1,2,3",
67
+
68
+ # Uneven data types.
69
+ "2000-01-01,2,x\nx,2000-01-01,2",
70
+ ].each do |csv|
71
+ it "should parse: #{csv}" do
72
+ expect(parse(csv)).to eq(CSV.parse(csv))
73
+ end
74
+ end
75
+
76
+ [
77
+ # Whitespace.
78
+ # @note Ruby's CSV library has inexplicably inconsistent error messages for
79
+ # the same class of error.
80
+ [%( "x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
81
+ [%("x" ), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
82
+ [%( "x" ), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
83
+ # Tab.
84
+ [%( "x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
85
+ [%("x" ), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
86
+ [%( "x" ), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
87
+
88
+ # Quoted next to unquoted.
89
+ [%("x"x), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
90
+ [%(x"x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
91
+ [%(x"x"x), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
92
+ [%("x"x"x"), 'Missing or stray quote in line %d', 'Illegal quoting in line %d.'],
93
+
94
+ # Unclosed quote.
95
+ [%("x), 'Unclosed quoted field on line %d.', 'Unclosed quoted field on line %d.'],
96
+
97
+ # Quote in unquoted field.
98
+ [%(x"x), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
99
+
100
+ # Unescaped quote in quoted field.
101
+ [%("x"x"), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
102
+ ].each do |csv,csv_error,fastcsv_error|
103
+ it "should raise an error on: #{csv.inspect.gsub('\"', '"')}" do
104
+ expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, csv_error % 1)
105
+ expect{parse(csv)}.to raise_error(FastCSV::ParseError, fastcsv_error % 1)
106
+ end
107
+
108
+ it "should raise an error with the correct line number on: #{"\n#{csv}\n".inspect.gsub('\"', '"')}" do
109
+ csv = "\n#{csv}\n"
110
+ expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, csv_error % 2)
111
+ expect{parse(csv)}.to raise_error(FastCSV::ParseError, fastcsv_error % 2)
112
+ end
113
+ end
114
+
115
+ it 'should raise an error on mixed row separators are' do
116
+ csv = "foo\rbar\nbaz\r\n"
117
+ expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, 'Unquoted fields do not allow \r or \n (line 2).')
118
+ skip
119
+ end
120
+
121
+ it 'should raise an error if no block is given' do
122
+ expect{parse_without_block('x')}.to raise_error(LocalJumpError, 'no block given')
123
+ end
124
+
125
+ it 'should not raise an error if no block and empty input' do
126
+ expect{parse_without_block('')}.to_not raise_error
127
+ end
128
+
129
+ it 'should raise an error if the options are not a Hash or nil' do
130
+ expect{parse('', '')}.to raise_error(ArgumentError, 'options has to be a Hash or nil')
131
+ end
132
+
133
+ it 'should allow nil buffer size' do
134
+ FastCSV.buffer_size = nil
135
+ expect(parse(simple)).to eq(CSV.parse(simple))
136
+ FastCSV.buffer_size = nil
137
+ end
138
+
139
+ it 'should recover from a zero buffer size' do
140
+ FastCSV.buffer_size = 0
141
+ expect(parse(simple)).to eq(CSV.parse(simple))
142
+ FastCSV.buffer_size = nil
143
+ end
144
+ end
145
+
146
+ RSpec.describe FastCSV do
147
+ context "with String" do
148
+ def parse(csv, options = nil)
149
+ rows = []
150
+ FastCSV.raw_parse(csv, options){|row| rows << row}
151
+ rows
152
+ end
153
+
154
+ def parse_without_block(csv, options = nil)
155
+ FastCSV.raw_parse(csv, options)
156
+ end
157
+
158
+ include_examples 'a CSV parser'
159
+
160
+ it 'should not raise an error on negative buffer size' do
161
+ FastCSV.buffer_size = -1
162
+ expect{parse(simple)}.to_not raise_error
163
+ FastCSV.buffer_size = nil
164
+ end
165
+ end
166
+
167
+ context "with StringIO" do
168
+ def parse(csv, options = nil)
169
+ rows = []
170
+ FastCSV.raw_parse(StringIO.new(csv), options){|row| rows << row}
171
+ rows
172
+ end
173
+
174
+ def parse_without_block(csv, options = nil)
175
+ FastCSV.raw_parse(StringIO.new(csv), options)
176
+ end
177
+
178
+ include_examples 'a CSV parser'
179
+
180
+ it 'should raise an error on negative buffer size' do
181
+ FastCSV.buffer_size = -1
182
+ expect{parse(simple)}.to raise_error(NoMemoryError)
183
+ FastCSV.buffer_size = nil
184
+ end
185
+ end
186
+
187
+ def parse_with_encoding(basename, encoding)
188
+ filename = File.expand_path(File.join('..', 'fixtures', basename), __FILE__)
189
+ options = {encoding: encoding}
190
+ File.open(filename) do |io|
191
+ rows = []
192
+ FastCSV.raw_parse(io, options){|row| rows << row}
193
+ expected = CSV.read(filename, options)
194
+ expect(rows).to eq(expected)
195
+ expect(rows[0][0].encoding).to eq(expected[0][0].encoding)
196
+ end
197
+ end
198
+
199
+ it 'should encode the input' do
200
+ parse_with_encoding('iso-8859-1.csv', 'iso-8859-1')
201
+ end
202
+
203
+ it 'should encode the input with a blank internal encoding' do
204
+ parse_with_encoding('utf-8.csv', ':utf-8')
205
+ end
206
+
207
+ it 'should transcode the input' do
208
+ parse_with_encoding('iso-8859-1.csv', 'iso-8859-1:utf-8')
209
+ end
210
+
211
+ it 'should invalid encoding' do
212
+ parse_with_encoding('utf-8.csv', 'invalid')
213
+ end
214
+
215
+ it 'should raise an error if the input is not a String or IO' do
216
+ expect{FastCSV.raw_parse(nil)}.to raise_error(ArgumentError, 'data has to respond to #read or #to_str')
217
+ end
218
+ end