fastcsv 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,356 @@
1
+ #include <ruby.h>
2
+ #include <ruby/encoding.h>
3
+ // CSV specifications.
4
+ // http://tools.ietf.org/html/rfc4180
5
+ // http://w3c.github.io/csvw/syntax/#ebnf
6
+
7
+ // CSV implementation.
8
+ // https://github.com/ruby/ruby/blob/master/lib/csv.rb
9
+
10
+ // Ruby C extensions help.
11
+ // https://github.com/ruby/ruby/blob/trunk/README.EXT
12
+ // http://rxr.whitequark.org/mri/source
13
+
14
+ // Ragel help.
15
+ // https://www.mail-archive.com/ragel-users@complang.org/
16
+
// Tags the current `field` string with its encoding.
//
// Expects these names in the enclosing scope: `field` (VALUE),
// `internal_index` (int) and `external_encoding` (rb_encoding *).
//
// - internal_index >= 0: the bytes are tagged with that encoding index and the
//   field is replaced by a copy transcoded to `external_encoding`.
// - otherwise: the bytes are only tagged with `external_encoding`.
//
// Wrapped in do { } while (0) so that `ASSOCIATE_INDEX;` is a single statement
// and stays correct inside an unbraced if/else.
#define ASSOCIATE_INDEX \
  do { \
    if (internal_index >= 0) { \
      rb_enc_associate_index(field, internal_index); \
      field = rb_str_encode(field, rb_enc_from_encoding(external_encoding), 0, Qnil); \
    } \
    else { \
      rb_enc_associate_index(field, rb_enc_to_index(external_encoding)); \
    } \
  } while (0)
25
+
26
+ static VALUE mModule, rb_eParseError;
27
+ static ID s_read, s_to_str;
28
+
29
+ %%{
30
+ machine fastcsv;
31
+
32
+ action new_line {
33
+ curline++;
34
+ }
35
+
36
+ action open_quote {
37
+ unclosed_line = curline;
38
+ }
39
+
40
+ action close_quote {
41
+ unclosed_line = 0;
42
+ }
43
+
44
+ action read_unquoted {
45
+ if (p == ts) {
46
+ // Unquoted empty fields are nil, not "", in Ruby.
47
+ field = Qnil;
48
+ }
49
+ else if (p > ts) {
50
+ field = rb_str_new(ts, p - ts);
51
+ ASSOCIATE_INDEX;
52
+ }
53
+ }
54
+
55
+ action read_quoted {
56
+ if (p == ts) {
57
+ field = rb_str_new2("");
58
+ ASSOCIATE_INDEX;
59
+ }
60
+ // @note If we add an action on '""', we can skip some steps if no '""' is found.
61
+ else if (p > ts) {
62
+ // Operating on ts in-place produces odd behavior, FYI.
63
+ char *copy = ALLOC_N(char, p - ts);
64
+ memcpy(copy, ts, p - ts);
65
+
66
+ char *reader = ts, *writer = copy;
67
+ int escaped = 0;
68
+
69
+ while (p > reader) {
70
+ if (*reader == quote_char && !escaped) {
71
+ // Skip the escaping character.
72
+ escaped = 1;
73
+ }
74
+ else {
75
+ escaped = 0;
76
+ *writer++ = *reader;
77
+ }
78
+ reader++;
79
+ }
80
+
81
+ field = rb_str_new(copy, writer - copy);
82
+ ASSOCIATE_INDEX;
83
+
84
+ if (copy != NULL) {
85
+ free(copy);
86
+ }
87
+ }
88
+ }
89
+
90
+ action new_field {
91
+ rb_ary_push(row, field);
92
+ field = Qnil;
93
+ }
94
+
95
+ action new_row {
96
+ if (!NIL_P(field) || RARRAY_LEN(row)) { // same as new_field
97
+ rb_ary_push(row, field);
98
+ field = Qnil;
99
+ }
100
+
101
+ rb_yield(row);
102
+ row = rb_ary_new();
103
+ }
104
+
105
+ action last_row {
106
+ if (!NIL_P(field) || RARRAY_LEN(row)) {
107
+ rb_ary_push(row, field);
108
+ }
109
+ if (RARRAY_LEN(row)) {
110
+ rb_yield(row);
111
+ }
112
+ }
113
+
114
+ EOF = 0 >last_row;
115
+ quote_char = '"';
116
+ col_sep = ',' >new_field;
117
+ row_sep = ('\r' '\n'? | '\n') @new_line;
118
+ unquoted = (any* -- quote_char -- col_sep -- row_sep - EOF) %read_unquoted;
119
+ quoted = quote_char >open_quote (any - quote_char - EOF | quote_char quote_char | row_sep)* %read_quoted quote_char >close_quote;
120
+ field = unquoted | quoted;
121
+ # fields = (field col_sep)* field?;
122
+ # file = (fields row_sep >new_row)* fields?;
123
+
124
+ # @see Ragel Guide: 6.3 Scanners
125
+ # Remember that an unquoted field can be zero-length.
126
+ main := |*
127
+ field col_sep EOF?;
128
+ field row_sep >new_row EOF?;
129
+ field EOF;
130
+ *|;
131
+
132
+ # Non-scanner version requires very large buffer.
133
+ # main := file $/{
134
+ # if (!NIL_P(field) || RARRAY_LEN(row)) {
135
+ # rb_ary_push(row, field);
136
+ # rb_yield(row);
137
+ # }
138
+ # };
139
+ }%%
140
+
141
+ %% write data;
142
+
// Default size of the read buffer, and the amount it grows by, in bytes (16 KiB).
#define BUFSIZE (16 * 1024)
144
+
145
+ VALUE fastcsv(int argc, VALUE *argv, VALUE self) {
146
+ int cs, act, have = 0, curline = 1, io = 0;
147
+ char *ts = 0, *te = 0, *buf = 0, *eof = 0;
148
+
149
+ VALUE port, opts;
150
+ VALUE row = rb_ary_new(), field = Qnil, bufsize = Qnil;
151
+ int done = 0, unclosed_line = 0, buffer_size = 0, taint = 0;
152
+ int internal_index = 0, external_index = rb_enc_to_index(rb_default_external_encoding());
153
+ rb_encoding *external_encoding = rb_default_external_encoding();
154
+
155
+ VALUE option;
156
+ char quote_char = '"'; //, *col_sep = ",", *row_sep = "\r\n";
157
+
158
+ rb_scan_args(argc, argv, "11", &port, &opts);
159
+ taint = OBJ_TAINTED(port);
160
+ io = rb_respond_to(port, s_read);
161
+ if (!io) {
162
+ if (rb_respond_to(port, s_to_str)) {
163
+ port = rb_funcall(port, s_to_str, 0);
164
+ StringValue(port);
165
+ }
166
+ else {
167
+ rb_raise(rb_eArgError, "data has to respond to #read or #to_str");
168
+ }
169
+ }
170
+
171
+ if (NIL_P(opts)) {
172
+ opts = rb_hash_new();
173
+ }
174
+ else if (TYPE(opts) != T_HASH) {
175
+ rb_raise(rb_eArgError, "options has to be a Hash or nil");
176
+ }
177
+
178
+ // @note Add machines for common CSV dialects, or see if we can use "when"
179
+ // from Chapter 6 to compare the character to the host program's variable.
180
+ // option = rb_hash_aref(opts, ID2SYM(rb_intern("quote_char")));
181
+ // if (TYPE(option) == T_STRING && RSTRING_LEN(option) == 1) {
182
+ // quote_char = *StringValueCStr(option);
183
+ // }
184
+ // else if (!NIL_P(option)) {
185
+ // rb_raise(rb_eArgError, ":quote_char has to be a single character String");
186
+ // }
187
+
188
+ // option = rb_hash_aref(opts, ID2SYM(rb_intern("col_sep")));
189
+ // if (TYPE(option) == T_STRING) {
190
+ // col_sep = StringValueCStr(option);
191
+ // }
192
+ // else if (!NIL_P(option)) {
193
+ // rb_raise(rb_eArgError, ":col_sep has to be a String");
194
+ // }
195
+
196
+ // option = rb_hash_aref(opts, ID2SYM(rb_intern("row_sep")));
197
+ // if (TYPE(option) == T_STRING) {
198
+ // row_sep = StringValueCStr(option);
199
+ // }
200
+ // else if (!NIL_P(option)) {
201
+ // rb_raise(rb_eArgError, ":row_sep has to be a String");
202
+ // }
203
+
204
+ option = rb_hash_aref(opts, ID2SYM(rb_intern("encoding")));
205
+ if (TYPE(option) == T_STRING) {
206
+ // @see parse_mode_enc in Ruby's io.c
207
+ const char *string = StringValueCStr(option), *pointer;
208
+ char internal_encoding_name[ENCODING_MAXNAMELEN + 1];
209
+
210
+ pointer = strrchr(string, ':');
211
+ if (pointer) {
212
+ long len = (pointer++) - string;
213
+ if (len == 0 || len > ENCODING_MAXNAMELEN) {
214
+ internal_index = -1;
215
+ }
216
+ else {
217
+ memcpy(internal_encoding_name, string, len);
218
+ internal_encoding_name[len] = '\0';
219
+ string = internal_encoding_name;
220
+ internal_index = rb_enc_find_index(internal_encoding_name);
221
+ }
222
+ }
223
+ else {
224
+ internal_index = rb_enc_find_index(string);
225
+ }
226
+
227
+ if (internal_index < 0 && internal_index != -2) {
228
+ rb_warn("Unsupported encoding %s ignored", string);
229
+ }
230
+
231
+ if (pointer) {
232
+ external_index = rb_enc_find_index(pointer);
233
+ if (external_index >= 0) {
234
+ external_encoding = rb_enc_from_index(external_index);
235
+ }
236
+ else {
237
+ rb_warn("Unsupported encoding %s ignored", string);
238
+ }
239
+ }
240
+ else if (internal_index >= 0) {
241
+ external_encoding = rb_enc_from_index(internal_index);
242
+ }
243
+ }
244
+ else if (!NIL_P(option)) {
245
+ rb_raise(rb_eArgError, ":encoding has to be a String");
246
+ }
247
+
248
+ buffer_size = BUFSIZE;
249
+ if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
250
+ bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
251
+ if (!NIL_P(bufsize)) {
252
+ buffer_size = NUM2INT(bufsize);
253
+ }
254
+ }
255
+
256
+ if (io) {
257
+ buf = ALLOC_N(char, buffer_size);
258
+ }
259
+
260
+ %% write init;
261
+
262
+ while (!done) {
263
+ VALUE str;
264
+ char *p, *pe;
265
+ int len, space = buffer_size - have, tokstart_diff, tokend_diff;
266
+
267
+ if (io) {
268
+ if (space == 0) {
269
+ tokstart_diff = ts - buf;
270
+ tokend_diff = te - buf;
271
+
272
+ buffer_size += BUFSIZE;
273
+ REALLOC_N(buf, char, buffer_size);
274
+
275
+ space = buffer_size - have;
276
+
277
+ ts = buf + tokstart_diff;
278
+ te = buf + tokend_diff;
279
+ }
280
+ p = buf + have;
281
+
282
+ str = rb_funcall(port, s_read, 1, INT2FIX(space));
283
+ if (NIL_P(str)) {
284
+ // StringIO#read returns nil for empty string.
285
+ len = 0;
286
+ }
287
+ else {
288
+ len = RSTRING_LEN(str);
289
+ memcpy(p, StringValuePtr(str), len);
290
+ }
291
+
292
+ if (len < space) {
293
+ // EOF actions don't work in scanners, so we add a sentinel value.
294
+ // @see http://www.complang.org/pipermail/ragel-users/2007-May/001516.html
295
+ // @see https://github.com/leeonix/lua-csv-ragel/blob/master/src/csv.rl
296
+ p[len++] = 0;
297
+ done = 1;
298
+ }
299
+ }
300
+ else {
301
+ p = RSTRING_PTR(port);
302
+ len = RSTRING_LEN(port);
303
+ p[len++] = 0;
304
+ done = 1;
305
+ }
306
+
307
+ pe = p + len;
308
+ // if (done) {
309
+ // // This triggers the eof action in the non-scanner version.
310
+ // eof = pe;
311
+ // }
312
+ %% write exec;
313
+
314
+ if (done && cs < fastcsv_first_final) {
315
+ if (buf != NULL) {
316
+ free(buf);
317
+ }
318
+ if (unclosed_line) {
319
+ rb_raise(rb_eParseError, "Unclosed quoted field on line %d.", unclosed_line);
320
+ }
321
+ // Ruby raises different errors for illegal quoting, depending on whether
322
+ // a quoted string is followed by a string ("Unclosed quoted field on line
323
+ // %d.") or by a string ending in a quote ("Missing or stray quote in line
324
+ // %d"). These precisions are kind of bogus, but we can try using $!.
325
+ else {
326
+ rb_raise(rb_eParseError, "Illegal quoting in line %d.", curline);
327
+ }
328
+ }
329
+
330
+ if (ts == 0) {
331
+ have = 0;
332
+ }
333
+ else if (io) {
334
+ have = pe - ts;
335
+ memmove(buf, ts, have);
336
+ te = buf + (te - ts);
337
+ ts = buf;
338
+ }
339
+ }
340
+
341
+ if (buf != NULL) {
342
+ free(buf);
343
+ }
344
+
345
+ return Qnil;
346
+ }
347
+
348
+ void Init_fastcsv() {
349
+ s_read = rb_intern("read");
350
+ s_to_str = rb_intern("to_str");
351
+
352
+ mModule = rb_define_module("FastCSV");
353
+ rb_define_attr(rb_singleton_class(mModule), "buffer_size", 1, 1);
354
+ rb_define_singleton_method(mModule, "raw_parse", fastcsv, -1);
355
+ rb_eParseError = rb_define_class_under(mModule, "ParseError", rb_eStandardError);
356
+ }
data/fastcsv.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+
# Gem metadata for fastcsv. File lists are taken from what git tracks, so the
# gem must be built from a git checkout.
Gem::Specification.new do |spec|
  spec.name     = 'fastcsv'
  spec.version  = '0.0.1'
  spec.platform = Gem::Platform::RUBY
  spec.authors  = ['Open North']
  spec.email    = ['info@opennorth.ca']
  spec.homepage = 'http://github.com/opennorth/fastcsv'
  spec.summary  = 'A fast Ragel-based CSV parser'
  spec.license  = 'MIT'

  spec.files         = %x(git ls-files).split("\n")
  spec.test_files    = %x(git ls-files -- {test,spec,features}/*).split("\n")
  spec.executables   = %x(git ls-files -- bin/*).split("\n").map { |path| File.basename(path) }
  spec.require_paths = %w[lib]
  # The native extension is compiled at install time.
  spec.extensions    = %w[ext/fastcsv/extconf.rb]

  spec.add_development_dependency 'coveralls'
  spec.add_development_dependency 'json', '~> 1.7.7' # to silence coveralls warning
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rake-compiler'
  spec.add_development_dependency 'rspec', '~> 3.1'
end
data/lib/fastcsv.rb ADDED
@@ -0,0 +1 @@
1
+ require 'fastcsv/fastcsv'
@@ -0,0 +1,218 @@
1
+ require 'spec_helper'
2
+
3
+ require 'csv'
4
+
# Examples shared by every kind of input FastCSV.raw_parse accepts.
#
# The including group must define:
#   parse(csv, options = nil)               - returns the parsed rows
#   parse_without_block(csv, options = nil) - calls raw_parse without a block
RSpec.shared_examples 'a CSV parser' do
  let :simple do
    "foo\nbar\nbaz"
  end

  # FastCSV.buffer_size is global state: restore the default after every
  # example, including examples that fail before reaching their last line.
  after do
    FastCSV.buffer_size = nil
  end

  [
    # Single tokens.
    "",
    "x",
    %(""),
    %("x"),
    ",",
    "\n",

    # Last tokens.
    "x,y",
    %(x,"y"),
    "x,",
    "x\n",

    # Line endings.
    "\n\n\n",
    "\r\r\r",
    "\r\n\r\n\r\n",
    "foo\rbar\rbaz\r",
    "foo\nbar\nbaz\n",
    "foo\r\nbar\r\nbaz\r\n",

    # Repetition.
    "x,x,x",
    "x\nx\nx",
    %("x","x","x"),
    %("x"\n"x"\n"x"),
    ",,,",
    ",\n,\n,",

    # Blank.
    %(,""),
    %("",),
    "\n\n\nfoo\n\n\n",

    # Whitespace.
    " x",
    "x ",
    " x ",
    # Tab. Written as \t so the cases differ from the space cases above.
    "\tx",
    "x\t",
    "\tx\t",

    # Quoting.
    %(foo,"bar,baz",bzz),
    %(foo,"bar\nbaz",bzz),
    %(foo,"""bar""baz""bzz""",zzz),

    # Buffers.
    "01234567890" * 2_000, # 22,000 bytes > BUFSIZE
    "0123456789," * 2_000,

    # Uneven rows.
    "1,2,3\n1,2",
    "1,2\n1,2,3",

    # Uneven data types.
    "2000-01-01,2,x\nx,2000-01-01,2",
  ].each do |csv|
    it "should parse: #{csv}" do
      expect(parse(csv)).to eq(CSV.parse(csv))
    end
  end

  # Each entry is [input, CSV's message, FastCSV's message]; %d is the line.
  [
    # Whitespace.
    # @note Ruby's CSV library has inexplicably inconsistent error messages for
    #   the same class of error.
    [%( "x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
    [%("x" ), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
    [%( "x" ), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
    # Tab. Written as \t so the cases differ from the space cases above.
    [%(\t"x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
    [%("x"\t), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
    [%(\t"x"\t), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],

    # Quoted next to unquoted.
    [%("x"x), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
    [%(x"x"), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
    [%(x"x"x), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],
    [%("x"x"x"), 'Missing or stray quote in line %d', 'Illegal quoting in line %d.'],

    # Unclosed quote.
    [%("x), 'Unclosed quoted field on line %d.', 'Unclosed quoted field on line %d.'],

    # Quote in unquoted field.
    [%(x"x), 'Illegal quoting in line %d.', 'Illegal quoting in line %d.'],

    # Unescaped quote in quoted field.
    [%("x"x"), 'Unclosed quoted field on line %d.', 'Illegal quoting in line %d.'],
  ].each do |csv,csv_error,fastcsv_error|
    it "should raise an error on: #{csv.inspect.gsub('\"', '"')}" do
      expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, csv_error % 1)
      expect{parse(csv)}.to raise_error(FastCSV::ParseError, fastcsv_error % 1)
    end

    it "should raise an error with the correct line number on: #{"\n#{csv}\n".inspect.gsub('\"', '"')}" do
      # A leading blank line moves the malformed field to line 2.
      csv = "\n#{csv}\n"
      expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, csv_error % 2)
      expect{parse(csv)}.to raise_error(FastCSV::ParseError, fastcsv_error % 2)
    end
  end

  it 'should raise an error on mixed row separators' do
    csv = "foo\rbar\nbaz\r\n"
    expect{CSV.parse(csv)}.to raise_error(CSV::MalformedCSVError, 'Unquoted fields do not allow \r or \n (line 2).')
    # Only CSV's behavior is asserted; the FastCSV half is pending.
    skip
  end

  it 'should raise an error if no block is given' do
    expect{parse_without_block('x')}.to raise_error(LocalJumpError, 'no block given')
  end

  it 'should not raise an error if no block and empty input' do
    expect{parse_without_block('')}.to_not raise_error
  end

  it 'should raise an error if the options are not a Hash or nil' do
    expect{parse('', '')}.to raise_error(ArgumentError, 'options has to be a Hash or nil')
  end

  it 'should allow nil buffer size' do
    FastCSV.buffer_size = nil
    expect(parse(simple)).to eq(CSV.parse(simple))
  end

  it 'should recover from a zero buffer size' do
    FastCSV.buffer_size = 0
    expect(parse(simple)).to eq(CSV.parse(simple))
  end
end
145
+
# Runs the shared parser examples against String and StringIO input, then
# checks encoding handling against CSV.read using fixture files.
RSpec.describe FastCSV do
  # FastCSV.buffer_size is global state: restore the default after every
  # example, including examples that fail before reaching their last line.
  after do
    FastCSV.buffer_size = nil
  end

  context "with String" do
    def parse(csv, options = nil)
      rows = []
      FastCSV.raw_parse(csv, options){|row| rows << row}
      rows
    end

    def parse_without_block(csv, options = nil)
      FastCSV.raw_parse(csv, options)
    end

    include_examples 'a CSV parser'

    # No read buffer is allocated for String input, so the size is never used.
    it 'should not raise an error on negative buffer size' do
      FastCSV.buffer_size = -1
      expect{parse(simple)}.to_not raise_error
    end
  end

  context "with StringIO" do
    def parse(csv, options = nil)
      rows = []
      FastCSV.raw_parse(StringIO.new(csv), options){|row| rows << row}
      rows
    end

    def parse_without_block(csv, options = nil)
      FastCSV.raw_parse(StringIO.new(csv), options)
    end

    include_examples 'a CSV parser'

    it 'should raise an error on negative buffer size' do
      FastCSV.buffer_size = -1
      expect{parse(simple)}.to raise_error(NoMemoryError)
    end
  end

  # Parses the fixture with FastCSV and with CSV.read using the same
  # :encoding option, then compares the rows and the first field's encoding.
  def parse_with_encoding(basename, encoding)
    filename = File.expand_path(File.join('..', 'fixtures', basename), __FILE__)
    options = {encoding: encoding}
    File.open(filename) do |io|
      rows = []
      FastCSV.raw_parse(io, options){|row| rows << row}
      expected = CSV.read(filename, options)
      expect(rows).to eq(expected)
      expect(rows[0][0].encoding).to eq(expected[0][0].encoding)
    end
  end

  it 'should encode the input' do
    parse_with_encoding('iso-8859-1.csv', 'iso-8859-1')
  end

  it 'should encode the input with a blank internal encoding' do
    parse_with_encoding('utf-8.csv', ':utf-8')
  end

  it 'should transcode the input' do
    parse_with_encoding('iso-8859-1.csv', 'iso-8859-1:utf-8')
  end

  it 'should ignore an invalid encoding' do
    parse_with_encoding('utf-8.csv', 'invalid')
  end

  it 'should raise an error if the input is not a String or IO' do
    expect{FastCSV.raw_parse(nil)}.to raise_error(ArgumentError, 'data has to respond to #read or #to_str')
  end
end