sandofsky-csvscan 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +6 -0
- data/Manifest.txt +11 -0
- data/README.ja +33 -0
- data/README.txt +46 -0
- data/Rakefile +16 -0
- data/ext/csvscan/MANIFEST +4 -0
- data/ext/csvscan/csvscan.c +332 -0
- data/ext/csvscan/csvscan.rl +161 -0
- data/ext/csvscan/extconf.rb +3 -0
- data/setup.rb +1585 -0
- data/test/test_csvscan.rb +8 -0
- metadata +77 -0
data/History.txt
ADDED
data/Manifest.txt
ADDED
data/README.ja
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
CSVScan
|
2
|
+
|
3
|
+
CSVScan はCSVを高速にパースするためのライブラリです。
|
4
|
+
|
5
|
+
|
6
|
+
1. 必要環境
|
7
|
+
|
8
|
+
* ruby 1.8
|
9
|
+
* C コンパイラ
|
10
|
+
|
11
|
+
|
12
|
+
2. インストール方法
|
13
|
+
|
14
|
+
コマンドラインで以下のように入力してください。
|
15
|
+
UNIX 系 OS ではおそらく root 権限が必要になります。
|
16
|
+
|
17
|
+
# ruby setup.rb
|
18
|
+
|
19
|
+
3. 使い方
|
20
|
+
|
21
|
+
require "csvscan" # ライブラリのロード
|
22
|
+
|
23
|
+
open(ARGV.shift) {|io|
|
24
|
+
CSVScan.scan(io) {|row|
|
25
|
+
p row
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
4. ライセンス
|
30
|
+
|
31
|
+
ライセンスはRubyのライセンスに従います。
|
32
|
+
|
33
|
+
MoonWolf <moonwolf@moonwolf.com>
|
data/README.txt
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
= csvscan
|
2
|
+
|
3
|
+
http://github.com/sandofsky/csvscan
|
4
|
+
|
5
|
+
== DESCRIPTION:
|
6
|
+
|
7
|
+
This is a packaged version of CSVScan, written by MoonWolf. If you can read Japanese, check out README.ja for his original documentation.
|
8
|
+
|
9
|
+
On a 10,000 line file:
|
10
|
+
|
11
|
+
time cat example.csv | ruby fastercsv_benchmark.rb
|
12
|
+
|
13
|
+
real 0m8.804s
|
14
|
+
user 0m8.502s
|
15
|
+
sys 0m0.304s
|
16
|
+
|
17
|
+
time cat example.csv | ruby csvscan_benchmark.rb
|
18
|
+
|
19
|
+
real 0m0.860s
|
20
|
+
user 0m0.782s
|
21
|
+
sys 0m0.088s
|
22
|
+
|
23
|
+
|
24
|
+
== FEATURES/PROBLEMS:
|
25
|
+
|
26
|
+
* First version.
|
27
|
+
* I have not tested this on Windows, and have no intention to.
|
28
|
+
|
29
|
+
== SYNOPSIS:
|
30
|
+
|
31
|
+
require 'csvscan'
|
32
|
+
CSVScan.scan(STDIN) do |row|
|
33
|
+
puts row.inspect
|
34
|
+
end
|
35
|
+
|
36
|
+
== REQUIREMENTS:
|
37
|
+
|
38
|
+
* Ruby 1.8 and a C compiler (as listed in README.ja)
|
39
|
+
|
40
|
+
== INSTALL:
|
41
|
+
|
42
|
+
* gem install sandofsky-csvscan --source http://gems.github.com
|
43
|
+
|
44
|
+
== LICENSE:
|
45
|
+
|
46
|
+
Looks like the original source was LGPL, so I'm stuck with that.
|
data/Rakefile
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
# -*- ruby -*-

require 'rubygems'
require 'hoe'

# Gem version constant (presumably what Hoe reads for the gem spec -- confirm).
# Declared as a module so it agrees with the C extension, which defines
# CSVScan via rb_define_module.
module CSVScan
  VERSION = "0.1.0"
end

Hoe.spec 'csvscan' do
  developer('Ben Sandofsky', 'sandofsky@gmail.com')
  spec_extras[:extensions] = "ext/csvscan/extconf.rb"

  # Only build products are listed here. The previous "ext/csvscan/csvscan.*"
  # glob also matched csvscan.c and csvscan.rl, which are shipped sources
  # (see Manifest), so `rake clean` would have deleted them.
  clean_globs << "ext/csvscan/csvscan.so" <<
                 "ext/csvscan/csvscan.bundle" <<
                 "ext/csvscan/csvscan.dll" <<
                 "ext/csvscan/*.o" <<
                 "ext/csvscan/Makefile" <<
                 "ext/Makefile"
end

# vim: syntax=ruby
|
@@ -0,0 +1,332 @@
|
|
1
|
+
#line 1 "csvscan.rl"
|
2
|
+
#include <ruby.h>
|
3
|
+
|
4
|
+
static VALUE rb_eCSVParseError;
|
5
|
+
static ID s_read, s_to_str;
|
6
|
+
|
7
|
+
#line 70 "csvscan.rl"
|
8
|
+
|
9
|
+
|
10
|
+
|
11
|
+
#line 12 "csvscan.c"
|
12
|
+
static const int csv_scan_start = 2;
|
13
|
+
|
14
|
+
static const int csv_scan_error = 1;
|
15
|
+
|
16
|
+
#line 73 "csvscan.rl"
|
17
|
+
|
18
|
+
#define BUFSIZE 131072
|
19
|
+
|
20
|
+
/*
 * NOTE: csvscan.c is generated by Ragel from csvscan.rl. The hand-written
 * changes in this function (buffer cleanup via rb_ensure, nil/oversized read
 * handling) should be mirrored in csvscan.rl so they survive regeneration.
 * The #line directives emitted by Ragel were dropped because they no longer
 * match this file's layout; the state machine itself is unchanged.
 */

/* Everything csv_scan_run needs, passed through rb_ensure as one pointer. */
struct csv_scan_job {
    VALUE port;           /* object responding to #read, or a String */
    unsigned char *buf;   /* scratch buffer of buffer_size bytes */
    int buffer_size;
};

/* rb_ensure cleanup: releases the scratch buffer allocated with ALLOC_N. */
static VALUE csv_scan_release(VALUE bufp) {
    xfree((void *)bufp);
    return Qnil;
}

/*
 * Runs the scanner over job->port, yielding one Array per record.
 * May leave via rb_raise or via a non-local exit from rb_yield; the caller
 * is responsible for freeing job->buf (done through rb_ensure).
 */
static VALUE csv_scan_run(VALUE jobp) {
    struct csv_scan_job *job = (struct csv_scan_job *)jobp;
    VALUE port = job->port;
    unsigned char *buf = job->buf;
    int buffer_size = job->buffer_size;
    int cs, act, have = 0, nread = 0, curline = 1;
    unsigned char *tokstart = NULL, *tokend = NULL;
    VALUE row, coldata;
    int done = 0;

    {
    cs = csv_scan_start;
    tokstart = 0;
    tokend = 0;
    act = 0;
    }

    row = rb_ary_new();
    coldata = Qnil;

    while( !done ) {
        VALUE str;
        /* `have` bytes of an unfinished token sit at the front of buf. */
        unsigned char *p = buf + have, *pe;
        int len, space = buffer_size - have;

        /* The pending token already fills the whole buffer. */
        if ( space == 0 ) {
            rb_raise(rb_eCSVParseError, "ran out of buffer on line %d.", curline);
        }

        if ( rb_respond_to( port, s_read ) ) {
            str = rb_funcall( port, s_read, 1, INT2FIX(space) );
        } else {
            str = rb_str_substr( port, nread, space );
        }

        if ( NIL_P(str) ) {
            /* read(n) signals EOF with nil; treat it as an empty final chunk
             * instead of letting StringValue raise TypeError. */
            len = 0;
        } else {
            StringValue(str);
            /* Never copy more than was asked for, whatever #read returned. */
            if ( RSTRING(str)->len > space ) {
                rb_raise(rb_eArgError, "read returned more than the %d bytes requested.", space);
            }
            len = RSTRING(str)->len;
            memcpy( p, RSTRING(str)->ptr, len );
        }
        nread += len;

        /* If this is the last buffer, tack on an EOF. */
        /* NOTE(review): the NUL sentinel appears to be consumed as an ordinary
         * field byte and nothing after the loop flushes `row`, so a final
         * record without a trailing newline looks like it is never yielded --
         * confirm against csvscan.rl. */
        if ( len < space ) {
            p[len++] = 0;
            done = 1;
        }

        pe = p + len;

        /* ---- Ragel-generated state machine (goto-driven) ---- */
        {
        if ( p == pe )
            goto _out;
        switch ( cs )
        {
tr0:
        {tokend = p;{p = ((tokend))-1;}}
        goto st2;
tr1:
        {
            curline += 1;
        }
        {
            rb_ary_push(row, coldata);
            rb_yield(row);
            coldata = Qnil;
            row = rb_ary_new();
        }
        {tokend = p+1;{p = ((tokend))-1;}}
        goto st2;
tr2:
        /* Quoted field: strip the outer quotes and collapse "" to ". */
        {tokend = p;{
            unsigned char ch, *start_p, *wptr, *rptr;
            int rest, datalen;
            start_p = wptr = tokstart;
            rptr = tokstart + 1;
            rest = tokend - tokstart - 2;
            datalen = 0;
            while(rest>0) {
                ch = *rptr++;
                if (ch=='"') {
                    rptr++;
                    rest--;
                }
                *wptr++ = ch;
                datalen++;
                rest--;
            }
            coldata = rb_str_new( start_p, datalen );
        }{p = ((tokend))-1;}}
        goto st2;
tr5:
        { switch( act ) {
        case 0: tokend = tokstart; {goto st1;}
        case 4:
        /* Unquoted field: drop trailing spaces/tabs; empty becomes nil. */
        {
            unsigned char ch, *endp;
            int datalen;
            datalen = tokend - tokstart;
            endp = tokend - 1;
            while(datalen>0) {
                ch = *endp--;
                if (ch==' ' || ch=='\t') {
                    datalen--;
                } else {
                    break;
                }
            }
            if (datalen==0) {
                coldata = Qnil;
            } else {
                coldata = rb_str_new(tokstart, datalen);
            }
        }
        break;
        case 5:
        /* Quoted field (same unescaping as tr2). */
        {
            unsigned char ch, *start_p, *wptr, *rptr;
            int rest, datalen;
            start_p = wptr = tokstart;
            rptr = tokstart + 1;
            rest = tokend - tokstart - 2;
            datalen = 0;
            while(rest>0) {
                ch = *rptr++;
                if (ch=='"') {
                    rptr++;
                    rest--;
                }
                *wptr++ = ch;
                datalen++;
                rest--;
            }
            coldata = rb_str_new( start_p, datalen );
        }
        break;
        default: break;
        }
        {p = ((tokend))-1;}}
        goto st2;
tr6:
        {tokend = p+1;{p = ((tokend))-1;}}
        goto st2;
tr7:
        {tokend = p+1;{p = ((tokend))-1;}}
        {
            curline += 1;
        }
        {
            rb_ary_push(row, coldata);
            rb_yield(row);
            coldata = Qnil;
            row = rb_ary_new();
        }
        goto st2;
tr10:
        /* Comma: finish the current column. */
        {tokend = p+1;{
            rb_ary_push(row, coldata);
            coldata = Qnil;
        }{p = ((tokend))-1;}}
        goto st2;
st2:
        {tokstart = 0;}
        {act = 0;}
        if ( ++p == pe )
            goto _out2;
case 2:
        {tokstart = p;}
        switch( (*p) ) {
            case 9u: goto tr6;
            case 10u: goto tr7;
            case 13u: goto st4;
            case 32u: goto tr6;
            case 34u: goto st0;
            case 44u: goto tr10;
        }
        if ( 11u <= (*p) && (*p) <= 12u )
            goto tr8;
        goto tr4;
tr4:
        {tokend = p+1;}
        {act = 4;}
        goto st3;
tr8:
        {tokend = p+1;}
        {act = 1;}
        goto st3;
st3:
        if ( ++p == pe )
            goto _out3;
case 3:
        switch( (*p) ) {
            case 10u: goto tr5;
            case 13u: goto tr5;
            case 34u: goto tr5;
            case 44u: goto tr5;
        }
        goto tr4;
st4:
        if ( ++p == pe )
            goto _out4;
case 4:
        if ( (*p) == 10u )
            goto tr1;
        goto tr0;
tr11:
        {
            curline += 1;
        }
        goto st0;
st0:
        if ( ++p == pe )
            goto _out0;
case 0:
        switch( (*p) ) {
            case 10u: goto tr11;
            case 34u: goto tr12;
        }
        goto st0;
tr12:
        {tokend = p+1;}
        {act = 5;}
        goto st5;
st5:
        if ( ++p == pe )
            goto _out5;
case 5:
        if ( (*p) == 34u )
            goto st0;
        goto tr2;
st1:
        goto _out1;
        }
        _out2: cs = 2; goto _out;
        _out3: cs = 3; goto _out;
        _out4: cs = 4; goto _out;
        _out0: cs = 0; goto _out;
        _out5: cs = 5; goto _out;
        _out1: cs = 1; goto _out;

        _out: {}
        }
        /* ---- end of generated state machine ---- */

        if ( cs == csv_scan_error ) {
            /* buf is released by the rb_ensure handler, not here. */
            rb_raise(rb_eCSVParseError, "parse error on line %d.", curline);
        }

        /* Carry an unfinished token over to the front of the buffer so the
         * next read appends to it. */
        if ( tokstart == 0 ) {
            have = 0;
        } else {
            have = pe - tokstart;
            memmove( buf, tokstart, have );
            tokend = buf + (tokend - tokstart);
            tokstart = buf;
        }
    }
    return Qnil;
}

/*
 * CSVScan.scan(port) { |row| ... }
 *
 * Scans CSV data from `port` and yields each record to the block as an
 * Array whose elements are Strings, or nil for empty unquoted fields.
 *
 * port: an object responding to #read(n); otherwise an object responding
 *       to #to_str, whose String value is scanned directly.
 *
 * The working buffer is @buffer_size bytes when that instance variable is
 * set on the receiver and non-nil, BUFSIZE otherwise.
 *
 * Raises ArgumentError when port is neither readable nor string-like, and
 * CSVScan::ParseError when a single token fills the whole buffer or the
 * scanner enters its error state. Returns nil.
 *
 * The buffer is freed through rb_ensure, so it is released on every exit
 * path: normal return, rb_raise, or an exception/break from the block.
 */
VALUE csv_scan(VALUE self, VALUE port) {
    struct csv_scan_job job;
    VALUE bufsize = Qnil;
    int buffer_size;

    if ( !rb_respond_to( port, s_read ) ) {
        if ( rb_respond_to( port, s_to_str ) ) {
            port = rb_funcall( port, s_to_str, 0 );
            StringValue(port);
        } else {
            rb_raise( rb_eArgError, "bad argument, String or IO only please." );
        }
    }

    buffer_size = BUFSIZE;
    if (rb_ivar_defined(self, rb_intern("@buffer_size")) == Qtrue) {
        bufsize = rb_ivar_get(self, rb_intern("@buffer_size"));
        if (!NIL_P(bufsize)) {
            buffer_size = NUM2INT(bufsize);
        }
    }

    job.port = port;
    job.buffer_size = buffer_size;
    job.buf = ALLOC_N(unsigned char, buffer_size);

    return rb_ensure(csv_scan_run, (VALUE)&job, csv_scan_release, (VALUE)job.buf);
}
|
323
|
+
|
324
|
+
void Init_csvscan() {
|
325
|
+
VALUE mCSVScan = rb_define_module("CSVScan");
|
326
|
+
rb_define_attr(rb_singleton_class(mCSVScan), "buffer_size", 1, 1);
|
327
|
+
rb_define_singleton_method(mCSVScan, "scan", csv_scan, 1);
|
328
|
+
rb_eCSVParseError = rb_define_class_under(mCSVScan, "ParseError", rb_eException);
|
329
|
+
|
330
|
+
s_read = rb_intern("read");
|
331
|
+
s_to_str = rb_intern("to_str");
|
332
|
+
}
|