csvscan 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/History.txt +6 -0
- data/Manifest.txt +11 -0
- data/README.ja +33 -0
- data/README.txt +46 -0
- data/Rakefile +16 -0
- data/ext/csvscan/MANIFEST +4 -0
- data/ext/csvscan/csvscan.c +332 -0
- data/ext/csvscan/csvscan.rl +161 -0
- data/ext/csvscan/extconf.rb +3 -0
- data/setup.rb +1585 -0
- data/test/test_csvscan.rb +8 -0
- metadata +127 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.ja
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
CSVScan
|
2
|
+
|
3
|
+
CSVScan はCSVを高速にパースするためのライブラリです。
|
4
|
+
|
5
|
+
|
6
|
+
1. 必要環境
|
7
|
+
|
8
|
+
* ruby 1.8
|
9
|
+
* C コンパイラ
|
10
|
+
|
11
|
+
|
12
|
+
2. インストール方法
|
13
|
+
|
14
|
+
コマンドラインで以下のように入力してください。
|
15
|
+
UNIX 系 OS ではおそらく root 権限が必要になります。
|
16
|
+
|
17
|
+
# ruby setup.rb
|
18
|
+
|
19
|
+
3. 使い方
|
20
|
+
|
21
|
+
require "csvscan" # ライブラリのロード
|
22
|
+
|
23
|
+
open(ARGV.shift) {|io|
|
24
|
+
CSVScan.scan(io) {|row|
|
25
|
+
p row
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
4. ライセンス
|
30
|
+
|
31
|
+
ライセンスはRubyのライセンスに従います。
|
32
|
+
|
33
|
+
MoonWolf <moonwolf@moonwolf.com>
|
data/README.txt
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
= csvscan
|
2
|
+
|
3
|
+
http://github.com/sandofsky/csvscan
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
This is a packaged version of CSVScan, written by MoonWolf. If you can read Japanese, check out README.ja for the original author's notes.
|
8
|
+
|
9
|
+
On a 10,000 line file:
|
10
|
+
|
11
|
+
time cat example.csv | ruby fastercsv_benchmark.rb
|
12
|
+
|
13
|
+
real 0m8.804s
|
14
|
+
user 0m8.502s
|
15
|
+
sys 0m0.304s
|
16
|
+
|
17
|
+
time cat example.csv | ruby csvscan_benchmark.rb
|
18
|
+
|
19
|
+
real 0m0.860s
|
20
|
+
user 0m0.782s
|
21
|
+
sys 0m0.088s
|
22
|
+
|
23
|
+
|
24
|
+
== FEATURES/PROBLEMS:
|
25
|
+
|
26
|
+
* First version.
|
27
|
+
* I have not tested this on Windows, and have no intention to.
|
28
|
+
|
29
|
+
== SYNOPSIS:
|
30
|
+
|
31
|
+
require 'csvscan'
|
32
|
+
CSVScan.scan(STDIN) do |row|
|
33
|
+
puts row.inspect
|
34
|
+
end
|
35
|
+
|
36
|
+
== REQUIREMENTS:
|
37
|
+
|
38
|
+
* Ruby 1.8 and a C compiler (as listed in README.ja)
|
39
|
+
|
40
|
+
== INSTALL:
|
41
|
+
|
42
|
+
* gem install sandofsky-csvscan --source http://gems.github.com
|
43
|
+
|
44
|
+
== LICENSE:
|
45
|
+
|
46
|
+
Looks like the original source was LGPL, so I'm stuck with that.
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- ruby -*-
#
# Hoe build definition for the csvscan gem (a C extension under ext/csvscan).

require 'rubygems'
require 'hoe'

# Declared as a module (not a class) so that this constant agrees with the
# C extension, which creates CSVScan with rb_define_module.
module CSVScan
  VERSION = "0.1.0"
end

Hoe.spec 'csvscan' do
  developer('Ben Sandofsky', 'sandofsky@gmail.com')

  # Build the native extension at install time.
  spec_extras[:extensions] = "ext/csvscan/extconf.rb"

  # Remove build products only. The previous "ext/csvscan/csvscan.*" glob
  # also matched csvscan.c and csvscan.rl, so `rake clean` deleted the
  # extension's sources. mkmf writes its Makefile next to extconf.rb, i.e. in
  # ext/csvscan; the old "ext/Makefile" entry is kept for compatibility.
  clean_globs << "ext/csvscan/*.{o,so,bundle,dll}" \
              << "ext/csvscan/Makefile" \
              << "ext/csvscan/mkmf.log" \
              << "ext/Makefile"
end

# vim: syntax=ruby
|
@@ -0,0 +1,332 @@
|
|
1
|
+
#line 1 "csvscan.rl"
|
2
|
+
#include <ruby.h>
|
3
|
+
|
4
|
+
static VALUE rb_eCSVParseError;
|
5
|
+
static ID s_read, s_to_str;
|
6
|
+
|
7
|
+
#line 70 "csvscan.rl"
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
#line 12 "csvscan.c"
|
12
|
+
static const int csv_scan_start = 2;
|
13
|
+
|
14
|
+
static const int csv_scan_error = 1;
|
15
|
+
|
16
|
+
#line 73 "csvscan.rl"
|
17
|
+
|
18
|
+
#define BUFSIZE 131072
|
19
|
+
|
20
|
+
/*
 * Accessors for String internals.  Older 1.8 headers only expose the
 * RSTRING(str)->ptr / ->len struct fields, while newer headers only support
 * the macros; define the macros in terms of the fields when they are missing.
 */
#ifndef RSTRING_PTR
#define RSTRING_PTR(s) (RSTRING(s)->ptr)
#endif
#ifndef RSTRING_LEN
#define RSTRING_LEN(s) (RSTRING(s)->len)
#endif

/*
 * CSVScan.scan(port) { |row| ... }  ->  nil
 *
 * Tokenizes CSV text from `port` and yields one Array per input line.
 *
 * port:  an object responding to `read` (called as read(nbytes)), or an
 *        object responding to `to_str`, whose String is scanned in place.
 * self:  the receiver; its @buffer_size instance variable, when set and
 *        non-nil, overrides the default BUFSIZE working-buffer size.
 *
 * Cells are Strings, or nil for an empty cell.  Unquoted cells have trailing
 * spaces/tabs removed; quoted cells have the surrounding quotes removed and
 * doubled quotes collapsed.
 *
 * Raises ArgumentError when `port` is neither readable nor string-like, or
 * when `read` hands back more bytes than requested.  Raises
 * CSVScan::ParseError on a scanner error or when a single token does not fit
 * into the working buffer.
 *
 * The scanner body between the "#line" markers is Ragel output generated
 * from csvscan.rl and is intentionally left untouched; changes to the
 * hand-written parts should be mirrored in csvscan.rl.
 *
 * NOTE(review): an exception raised by the caller's block (rb_yield) or by
 * port.read still unwinds past the xfree() calls below; wrapping the loop in
 * rb_ensure would close that remaining leak.
 */
VALUE csv_scan(VALUE self, VALUE port) {
    int cs, act, have = 0, nread = 0, curline = 1;
    unsigned char *tokstart = NULL, *tokend = NULL, *buf;
    VALUE row, coldata;
    VALUE bufsize = Qnil;
    int done=0, buffer_size;

    if ( !rb_respond_to( port, s_read ) ) {
        if ( rb_respond_to( port, s_to_str ) ) {
            port = rb_funcall( port, s_to_str, 0 );
            StringValue(port);
        } else {
            rb_raise( rb_eArgError, "bad argument, String or IO only please." );
        }
    }

    buffer_size = BUFSIZE;
    if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
        bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
        if (!NIL_P(bufsize)) {
            buffer_size = NUM2INT(bufsize);
        }
    }
    buf = ALLOC_N(unsigned char, buffer_size);


#line 47 "csvscan.c"
    {
    cs = csv_scan_start;
    tokstart = 0;
    tokend = 0;
    act = 0;
    }
#line 102 "csvscan.rl"

    row = rb_ary_new();
    coldata = Qnil;

    while( !done ) {
        VALUE str;
        /* Bytes [buf, buf+have) hold the unfinished token carried over from
         * the previous pass; new input is appended after it. */
        unsigned char *p = buf + have, *pe;
        int len, space = buffer_size - have;

        if ( space == 0 ) {
            /* The pending token already fills the whole buffer. Release the
             * buffer before raising, as the parse-error path does. */
            xfree(buf);
            rb_raise(rb_eCSVParseError, "ran out of buffer on line %d.", curline);
        }

        if ( rb_respond_to( port, s_read ) ) {
            str = rb_funcall( port, s_read, 1, INT2FIX(space) );
        } else {
            str = rb_str_substr( port, nread, space );
        }

        if ( NIL_P(str) ) {
            /* IO#read(n) returns nil at end of input; treat it as an empty
             * chunk so the EOF marker below is appended. */
            len = 0;
        } else {
            long chunk_len;

            StringValue(str);
            chunk_len = RSTRING_LEN(str);
            if ( chunk_len > (long)space ) {
                /* A custom `read` may ignore the requested size; copying its
                 * result unchecked would overrun buf. */
                xfree(buf);
                rb_raise( rb_eArgError, "read returned more bytes than requested." );
            }
            memcpy( p, RSTRING_PTR(str), chunk_len );
            len = (int)chunk_len;
        }
        nread += len;

        /* If this is the last buffer, tack on an EOF. */
        if ( len < space ) {
            p[len++] = 0;
            done = 1;
        }

        pe = p + len;

#line 87 "csvscan.c"
    {
    if ( p == pe )
        goto _out;
    switch ( cs )
    {
tr0:
#line 19 "csvscan.rl"
    {tokend = p;{p = ((tokend))-1;}}
    goto st2;
tr1:
#line 10 "csvscan.rl"
    {
    curline += 1;
    }
#line 20 "csvscan.rl"
    {
    rb_ary_push(row, coldata);
    rb_yield(row);
    coldata = Qnil;
    row = rb_ary_new();
    }
#line 20 "csvscan.rl"
    {tokend = p+1;{p = ((tokend))-1;}}
    goto st2;
tr2:
#line 49 "csvscan.rl"
    {tokend = p;{
    /* Quoted cell: drop the enclosing quotes and collapse "" to ",
     * rewriting the token in place. */
    unsigned char ch, *start_p, *wptr, *rptr;
    int rest, datalen;
    start_p = wptr = tokstart;
    rptr = tokstart + 1;
    rest = tokend - tokstart - 2;
    datalen = 0;
    while(rest>0) {
        ch = *rptr++;
        if (ch=='"') {
            rptr++;
            rest--;
        }
        *wptr++ = ch;
        datalen++;
        rest--;
    }
    coldata = rb_str_new( (const char *)start_p, datalen );
    }{p = ((tokend))-1;}}
    goto st2;
tr5:
#line 1 "csvscan.rl"
    { switch( act ) {
    case 0: tokend = tokstart; {goto st1;}
    case 4:
    {
    /* Unquoted cell: strip trailing spaces/tabs; empty becomes nil. */
    unsigned char ch, *endp;
    int datalen;
    datalen = tokend - tokstart;
    endp = tokend - 1;
    while(datalen>0) {
        ch = *endp--;
        if (ch==' ' || ch=='\t') {
            datalen--;
        } else {
            break;
        }
    }
    if (datalen==0) {
        coldata = Qnil;
    } else {
        coldata = rb_str_new((const char *)tokstart, datalen);
    }
    }
    break;
    case 5:
    {
    /* Quoted cell (same unescaping as the tr2 action). */
    unsigned char ch, *start_p, *wptr, *rptr;
    int rest, datalen;
    start_p = wptr = tokstart;
    rptr = tokstart + 1;
    rest = tokend - tokstart - 2;
    datalen = 0;
    while(rest>0) {
        ch = *rptr++;
        if (ch=='"') {
            rptr++;
            rest--;
        }
        *wptr++ = ch;
        datalen++;
        rest--;
    }
    coldata = rb_str_new( (const char *)start_p, datalen );
    }
    break;
    default: break;
    }
    {p = ((tokend))-1;}}
    goto st2;
tr6:
#line 19 "csvscan.rl"
    {tokend = p+1;{p = ((tokend))-1;}}
    goto st2;
tr7:
#line 19 "csvscan.rl"
    {tokend = p+1;{p = ((tokend))-1;}}
#line 10 "csvscan.rl"
    {
    curline += 1;
    }
#line 20 "csvscan.rl"
    {
    rb_ary_push(row, coldata);
    rb_yield(row);
    coldata = Qnil;
    row = rb_ary_new();
    }
    goto st2;
tr10:
#line 26 "csvscan.rl"
    {tokend = p+1;{
    rb_ary_push(row, coldata);
    coldata = Qnil;
    }{p = ((tokend))-1;}}
    goto st2;
st2:
#line 1 "csvscan.rl"
    {tokstart = 0;}
#line 1 "csvscan.rl"
    {act = 0;}
    if ( ++p == pe )
        goto _out2;
case 2:
#line 1 "csvscan.rl"
    {tokstart = p;}
#line 220 "csvscan.c"
    switch( (*p) ) {
        case 9u: goto tr6;
        case 10u: goto tr7;
        case 13u: goto st4;
        case 32u: goto tr6;
        case 34u: goto st0;
        case 44u: goto tr10;
    }
    if ( 11u <= (*p) && (*p) <= 12u )
        goto tr8;
    goto tr4;
tr4:
#line 1 "csvscan.rl"
    {tokend = p+1;}
#line 30 "csvscan.rl"
    {act = 4;}
    goto st3;
tr8:
#line 1 "csvscan.rl"
    {tokend = p+1;}
#line 19 "csvscan.rl"
    {act = 1;}
    goto st3;
st3:
    if ( ++p == pe )
        goto _out3;
case 3:
#line 248 "csvscan.c"
    switch( (*p) ) {
        case 10u: goto tr5;
        case 13u: goto tr5;
        case 34u: goto tr5;
        case 44u: goto tr5;
    }
    goto tr4;
st4:
    if ( ++p == pe )
        goto _out4;
case 4:
    if ( (*p) == 10u )
        goto tr1;
    goto tr0;
tr11:
#line 10 "csvscan.rl"
    {
    curline += 1;
    }
    goto st0;
st0:
    if ( ++p == pe )
        goto _out0;
case 0:
#line 273 "csvscan.c"
    switch( (*p) ) {
        case 10u: goto tr11;
        case 34u: goto tr12;
    }
    goto st0;
tr12:
#line 1 "csvscan.rl"
    {tokend = p+1;}
#line 49 "csvscan.rl"
    {act = 5;}
    goto st5;
st5:
    if ( ++p == pe )
        goto _out5;
case 5:
#line 289 "csvscan.c"
    if ( (*p) == 34u )
        goto st0;
    goto tr2;
st1:
    goto _out1;
    }
    _out2: cs = 2; goto _out;
    _out3: cs = 3; goto _out;
    _out4: cs = 4; goto _out;
    _out0: cs = 0; goto _out;
    _out5: cs = 5; goto _out;
    _out1: cs = 1; goto _out;

    _out: {}
    }
#line 134 "csvscan.rl"

        if ( cs == csv_scan_error ) {
            xfree(buf);
            rb_raise(rb_eCSVParseError, "parse error on line %d.", curline);
        }

        if ( tokstart == 0 ) {
            /* No token in progress: the next read may reuse the whole buffer. */
            have = 0;
        } else {
            /* Slide the unfinished token to the front of the buffer and
             * rebase the token pointers onto its new position. */
            have = pe - tokstart;
            memmove( buf, tokstart, have );
            tokend = buf + (tokend - tokstart);
            tokstart = buf;
        }
    }
    /* ALLOC_N memory belongs to Ruby's allocator and is released with xfree. */
    xfree(buf);
    return Qnil;
}
|
323
|
+
|
324
|
+
void Init_csvscan() {
|
325
|
+
VALUE mCSVScan = rb_define_module("CSVScan");
|
326
|
+
rb_define_attr(rb_singleton_class(mCSVScan), "buffer_size", 1, 1);
|
327
|
+
rb_define_singleton_method(mCSVScan, "scan", csv_scan, 1);
|
328
|
+
rb_eCSVParseError = rb_define_class_under(mCSVScan, "ParseError", rb_eException);
|
329
|
+
|
330
|
+
s_read = rb_intern("read");
|
331
|
+
s_to_str = rb_intern("to_str");
|
332
|
+
}
|