rcsv 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/.travis.yml +7 -0
- data/Gemfile.lock +1 -1
- data/README.md +29 -27
- data/ext/rcsv/rcsv.c +79 -79
- data/lib/rcsv/version.rb +1 -1
- metadata +2 -1
data/.travis.yml
ADDED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
# Rcsv
|
2
2
|
|
3
|
+
[![Build Status](https://travis-ci.org/fiksu/rcsv.png)](https://travis-ci.org/fiksu/rcsv)
|
4
|
+
|
3
5
|
Rcsv is a fast CSV parsing library for MRI Ruby. Tested on REE 1.8.7 and Ruby 1.9.3.
|
4
6
|
|
5
|
-
Contrary to many other gems that implement their own parsers, Rcsv uses libcsv 3.
|
7
|
+
Contrary to many other gems that implement their own parsers, Rcsv uses libcsv 3.0.2 (http://sourceforge.net/projects/libcsv/). As long as libcsv's API is stable, getting Rcsv to use a newer libcsv version is as simple as updating two files (csv.h and libcsv.c).
|
6
8
|
|
7
9
|
## Benchmarks
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
user system total real
|
11
|
+
FasterCSV 0.580000 0.000000 0.580000 ( 0.618837)
|
12
|
+
rcsv 0.060000 0.000000 0.060000 ( 0.062248)
|
11
13
|
|
12
14
|
## License
|
13
15
|
|
@@ -30,7 +32,7 @@ Or install it yourself as:
|
|
30
32
|
|
31
33
|
## Building the latest source
|
32
34
|
|
33
|
-
|
35
|
+
First, check out the master branch. Then cd there and run:
|
34
36
|
|
35
37
|
$ bundle # Installs development dependencies
|
36
38
|
$ bundle exec rake # Runs tests
|
@@ -42,7 +44,7 @@ Currently, Rcsv only supports CSV parsing. CSV write support is planned.
|
|
42
44
|
|
43
45
|
Quickstart:
|
44
46
|
|
45
|
-
|
47
|
+
parsed = Rcsv.parse(csv_data)
|
46
48
|
|
47
49
|
|
48
50
|
Rcsv class exposes a class method *parse* that accepts a CSV string as its first parameter and options hash as its second parameter.
|
@@ -99,36 +101,36 @@ A boolean flag. If enabled, only parses columns that are listed in :columns. Dis
|
|
99
101
|
|
100
102
|
This example parses a 3-column CSV file and only returns parsed rows where "Age" values are set to "35".
|
101
103
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
104
|
+
Rcsv.parse some_csv, :row_as_hash => true,
|
105
|
+
:columns => {
|
106
|
+
'First Name' => { :alias => :first_name, :default => "Unknown" },
|
107
|
+
'Last Name' => { :alias => :last_name, :default => "Unknown"},
|
108
|
+
'Age' => { :alias => :age, :type => :int, :match => "35"}
|
109
|
+
}
|
108
110
|
|
109
111
|
The result would look like this:
|
110
112
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
113
|
+
[
|
114
|
+
{ :first_name => "Mary", :last_name => "Jane", :age => 35 },
|
115
|
+
{ :first_name => "Unknown", :last_name => "Alien", :age => 35}
|
116
|
+
]
|
115
117
|
|
116
118
|
Another example, for a miserable headerless Tab-separated CSV:
|
117
119
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
120
|
+
Rcsv.parse some_csv, :column_separator => "\t",
|
121
|
+
:header => :none,
|
122
|
+
:columns => {
|
123
|
+
1 => { :type => :float, :default => 0 }
|
124
|
+
}
|
123
125
|
|
124
126
|
The result would look like this:
|
125
127
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
128
|
+
[
|
129
|
+
[ "Very hot", 3.7, "Mercury" ],
|
130
|
+
[ "Very hot and cloudy", 8.87, "Venus" ],
|
131
|
+
[ "Just about ok", 9.78, "Earth"],
|
132
|
+
[ nil, 0, "Vacuum" ]
|
133
|
+
]
|
132
134
|
|
133
135
|
|
134
136
|
## To do
|
data/ext/rcsv/rcsv.c
CHANGED
@@ -3,37 +3,37 @@
|
|
3
3
|
|
4
4
|
#include "csv.h"
|
5
5
|
|
6
|
-
static VALUE rcsv_parse_error;
|
6
|
+
static VALUE rcsv_parse_error; /* class Rcsv::ParseError << StandardError; end */
|
7
7
|
|
8
|
-
|
8
|
+
/* It is useful to know the exact row/column position and field contents where a parse-time exception was raised */
|
9
9
|
#define RAISE_WITH_LOCATION(row, column, contents, fmt, ...) \
|
10
10
|
rb_raise(rcsv_parse_error, "[%d:%d '%s'] " fmt, (int)(row), (int)(column), (char *)(contents), ##__VA_ARGS__);
|
11
11
|
|
12
12
|
struct rcsv_metadata {
|
13
|
-
|
14
|
-
bool row_as_hash;
|
15
|
-
size_t offset_rows;
|
16
|
-
|
17
|
-
char * row_conversions;
|
18
|
-
char ** only_rows;
|
19
|
-
VALUE * row_defaults;
|
20
|
-
VALUE * column_names;
|
21
|
-
|
22
|
-
|
23
|
-
size_t num_row_conversions;
|
24
|
-
size_t num_only_rows;
|
25
|
-
size_t num_row_defaults;
|
26
|
-
size_t num_columns;
|
27
|
-
|
28
|
-
|
29
|
-
bool skip_current_row;
|
30
|
-
size_t current_col;
|
31
|
-
size_t current_row;
|
32
|
-
|
33
|
-
VALUE * result;
|
13
|
+
/* Derived from user-specified options */
|
14
|
+
bool row_as_hash; /* Used to return array of hashes rather than array of arrays */
|
15
|
+
size_t offset_rows; /* Number of rows to skip before parsing */
|
16
|
+
|
17
|
+
char * row_conversions; /* A pointer to string/array of row conversions char specifiers */
|
18
|
+
char ** only_rows; /* A pointer to array of strings for only_rows filter */
|
19
|
+
VALUE * row_defaults; /* A pointer to array of row defaults */
|
20
|
+
VALUE * column_names; /* A pointer to array of column names to be used with hashes */
|
21
|
+
|
22
|
+
/* Pointer options lengths */
|
23
|
+
size_t num_row_conversions; /* Number of converter types in row_conversions array */
|
24
|
+
size_t num_only_rows; /* Number of items in only_rows filter */
|
25
|
+
size_t num_row_defaults; /* Number of default values in row_defaults array */
|
26
|
+
size_t num_columns; /* Number of columns detected from column_names.size */
|
27
|
+
|
28
|
+
/* Internal state */
|
29
|
+
bool skip_current_row; /* Used by only_rows filter to skip parsing of the row remainder */
|
30
|
+
size_t current_col; /* Current column's index */
|
31
|
+
size_t current_row; /* Current row's index */
|
32
|
+
|
33
|
+
VALUE * result; /* A pointer to the parsed data */
|
34
34
|
};
|
35
35
|
|
36
|
-
|
36
|
+
/* Internal callbacks */
|
37
37
|
|
38
38
|
/* This procedure is called for every parsed field */
|
39
39
|
void end_of_field_callback(void * field, size_t field_size, void * data) {
|
@@ -41,20 +41,20 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
41
41
|
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
42
42
|
char row_conversion = 0;
|
43
43
|
VALUE parsed_field;
|
44
|
-
VALUE last_entry = rb_ary_entry(*(meta->result), -1);
|
44
|
+
VALUE last_entry = rb_ary_entry(*(meta->result), -1); /* result.last */
|
45
45
|
|
46
|
-
|
46
|
+
/* No need to parse anything until the end of the line if skip_current_row is set */
|
47
47
|
if (meta->skip_current_row) {
|
48
48
|
return;
|
49
49
|
}
|
50
50
|
|
51
|
-
|
51
|
+
/* Skip the row if its position is less than the specified offset */
|
52
52
|
if (meta->current_row < meta->offset_rows) {
|
53
53
|
meta->skip_current_row = true;
|
54
54
|
return;
|
55
55
|
}
|
56
56
|
|
57
|
-
|
57
|
+
/* Filter by string row values listed in meta->only_rows */
|
58
58
|
if ((meta->only_rows != NULL) &&
|
59
59
|
(meta->current_col < meta->num_only_rows) &&
|
60
60
|
(meta->only_rows[meta->current_col] != NULL) &&
|
@@ -63,33 +63,33 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
63
63
|
return;
|
64
64
|
}
|
65
65
|
|
66
|
-
|
66
|
+
/* Get row conversion char specifier */
|
67
67
|
if (meta->current_col < meta->num_row_conversions) {
|
68
68
|
row_conversion = (char)meta->row_conversions[meta->current_col];
|
69
69
|
}
|
70
70
|
|
71
|
-
|
72
|
-
if (row_conversion != ' ') {
|
71
|
+
/* Convert the field from string into Ruby type specified by row_conversion */
|
72
|
+
if (row_conversion != ' ') { /* spacebar skips the column */
|
73
73
|
if (field_size == 0) {
|
74
|
-
|
74
|
+
/* Assigning appropriate default value if applicable. */
|
75
75
|
if (meta->current_col < meta->num_row_defaults) {
|
76
76
|
parsed_field = meta->row_defaults[meta->current_col];
|
77
|
-
} else {
|
77
|
+
} else { /* By default, default is nil */
|
78
78
|
parsed_field = Qnil;
|
79
79
|
}
|
80
80
|
} else {
|
81
81
|
if (meta->current_col < meta->num_row_conversions) {
|
82
82
|
switch (row_conversion){
|
83
|
-
case 's':
|
83
|
+
case 's': /* String */
|
84
84
|
parsed_field = rb_str_new(field_str, field_size);
|
85
85
|
break;
|
86
|
-
case 'i':
|
86
|
+
case 'i': /* Integer */
|
87
87
|
parsed_field = INT2NUM(atol(field_str));
|
88
88
|
break;
|
89
|
-
case 'f':
|
89
|
+
case 'f': /* Float */
|
90
90
|
parsed_field = rb_float_new(atof(field_str));
|
91
91
|
break;
|
92
|
-
case 'b':
|
92
|
+
case 'b': /* TrueClass/FalseClass */
|
93
93
|
switch (field_str[0]) {
|
94
94
|
case 't':
|
95
95
|
case 'T':
|
@@ -119,12 +119,12 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
119
119
|
row_conversion
|
120
120
|
);
|
121
121
|
}
|
122
|
-
} else {
|
123
|
-
parsed_field = rb_str_new(field_str, field_size);
|
122
|
+
} else { /* No conversion happens */
|
123
|
+
parsed_field = rb_str_new(field_str, field_size); /* field */
|
124
124
|
}
|
125
125
|
}
|
126
126
|
|
127
|
-
|
127
|
+
/* Assign the value to appropriate hash key if parsing into Hash */
|
128
128
|
if (meta->row_as_hash) {
|
129
129
|
if (meta->current_col >= meta->num_columns) {
|
130
130
|
RAISE_WITH_LOCATION(
|
@@ -138,12 +138,12 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
138
138
|
} else {
|
139
139
|
rb_hash_aset(last_entry, meta->column_names[meta->current_col], parsed_field);
|
140
140
|
}
|
141
|
-
} else {
|
142
|
-
rb_ary_push(last_entry, parsed_field);
|
141
|
+
} else { /* Parse into Array */
|
142
|
+
rb_ary_push(last_entry, parsed_field); /* result << field */
|
143
143
|
}
|
144
144
|
}
|
145
145
|
|
146
|
-
|
146
|
+
/* Increment column counter */
|
147
147
|
meta->current_col++;
|
148
148
|
return;
|
149
149
|
}
|
@@ -152,30 +152,30 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
152
152
|
void end_of_line_callback(int last_char, void * data) {
|
153
153
|
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
154
154
|
|
155
|
-
|
155
|
+
/* If filters didn't match, current row parsing is reverted */
|
156
156
|
if (meta->skip_current_row) {
|
157
|
-
rb_ary_pop(*(meta->result));
|
157
|
+
rb_ary_pop(*(meta->result)); /* result.pop */
|
158
158
|
meta->skip_current_row = false;
|
159
159
|
}
|
160
160
|
|
161
|
-
|
161
|
+
/* Add a new empty array/hash for the next line unless EOF reached */
|
162
162
|
if (last_char != -1) {
|
163
163
|
if (meta->row_as_hash) {
|
164
|
-
rb_ary_push(*(meta->result), rb_hash_new());
|
164
|
+
rb_ary_push(*(meta->result), rb_hash_new()); /* result << {} */
|
165
165
|
} else {
|
166
|
-
rb_ary_push(*(meta->result), rb_ary_new());
|
166
|
+
rb_ary_push(*(meta->result), rb_ary_new()); /* result << [] */
|
167
167
|
}
|
168
168
|
}
|
169
169
|
|
170
|
-
|
170
|
+
/* Resetting column counter */
|
171
171
|
meta->current_col = 0;
|
172
172
|
|
173
|
-
|
173
|
+
/* Incrementing row counter */
|
174
174
|
meta->current_row++;
|
175
175
|
return;
|
176
176
|
}
|
177
177
|
|
178
|
-
|
178
|
+
/* C API */
|
179
179
|
|
180
180
|
/* The main method that handles parsing */
|
181
181
|
static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
@@ -189,7 +189,7 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
189
189
|
int error;
|
190
190
|
size_t i = 0;
|
191
191
|
|
192
|
-
|
192
|
+
/* Setting up some sane defaults */
|
193
193
|
meta.row_as_hash = false;
|
194
194
|
meta.skip_current_row = false;
|
195
195
|
meta.num_columns = 0;
|
@@ -203,50 +203,50 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
203
203
|
meta.row_defaults = NULL;
|
204
204
|
meta.row_conversions = NULL;
|
205
205
|
meta.column_names = NULL;
|
206
|
-
meta.result = (VALUE[]){rb_ary_new()};
|
206
|
+
meta.result = (VALUE[]){rb_ary_new()}; /* [] */
|
207
207
|
|
208
|
-
|
208
|
+
/* str is required, options is optional (pun intended) */
|
209
209
|
rb_scan_args(argc, argv, "11", &str, &options);
|
210
210
|
csv_string = StringValuePtr(str);
|
211
211
|
csv_string_len = strlen(csv_string);
|
212
212
|
|
213
|
-
|
213
|
+
/* options ||= {} */
|
214
214
|
if (NIL_P(options)) {
|
215
215
|
options = rb_hash_new();
|
216
216
|
}
|
217
217
|
|
218
|
-
|
218
|
+
/* By default, parsing is strict */
|
219
219
|
option = rb_hash_aref(options, ID2SYM(rb_intern("nostrict")));
|
220
220
|
if (!option || (option == Qnil)) {
|
221
221
|
csv_options |= CSV_STRICT;
|
222
222
|
}
|
223
223
|
|
224
|
-
|
224
|
+
/* Try to initialize libcsv */
|
225
225
|
if (csv_init(&cp, csv_options) == -1) {
|
226
226
|
rb_raise(rcsv_parse_error, "Couldn't initialize libcsv");
|
227
227
|
}
|
228
228
|
|
229
|
-
|
229
|
+
/* By default, parse as Array of Arrays */
|
230
230
|
option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
|
231
231
|
if (option && (option != Qnil)) {
|
232
232
|
meta.row_as_hash = true;
|
233
233
|
}
|
234
234
|
|
235
|
-
|
235
|
+
/* :col_sep sets the column separator, default is comma (,) */
|
236
236
|
option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
|
237
237
|
if (option != Qnil) {
|
238
238
|
csv_set_delim(&cp, (unsigned char)*StringValuePtr(option));
|
239
239
|
}
|
240
240
|
|
241
|
-
|
241
|
+
/* Specify how many rows to skip from the beginning of CSV */
|
242
242
|
option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
|
243
243
|
if (option != Qnil) {
|
244
244
|
meta.offset_rows = (size_t)NUM2INT(option);
|
245
245
|
}
|
246
246
|
|
247
|
-
|
248
|
-
|
249
|
-
|
247
|
+
/* :only_rows is a string mask where row is only parsed
|
248
|
+
if its fields match those in the passed array.
|
249
|
+
[nil, nil, "ABC"] skips all rows where 3rd column isn't equal to "ABC" */
|
250
250
|
option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
|
251
251
|
if (option != Qnil) {
|
252
252
|
meta.num_only_rows = (size_t)RARRAY_LEN(option);
|
@@ -262,8 +262,8 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
262
262
|
}
|
263
263
|
}
|
264
264
|
|
265
|
-
|
266
|
-
|
265
|
+
/* :row_defaults is an array of default values that are assigned to fields containing empty strings
|
266
|
+
according to matching field positions */
|
267
267
|
option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
|
268
268
|
if (option != Qnil) {
|
269
269
|
meta.num_row_defaults = RARRAY_LEN(option);
|
@@ -275,16 +275,16 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
275
275
|
}
|
276
276
|
}
|
277
277
|
|
278
|
-
|
279
|
-
|
278
|
+
/* :row_conversions specifies Ruby types that CSV field values should be converted into.
|
279
|
+
Each char of row_conversions string represents Ruby type for CSV field with matching position. */
|
280
280
|
option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
|
281
281
|
if (option != Qnil) {
|
282
282
|
meta.num_row_conversions = RSTRING_LEN(option);
|
283
283
|
meta.row_conversions = StringValuePtr(option);
|
284
284
|
}
|
285
285
|
|
286
|
-
|
287
|
-
if (meta.row_as_hash) {
|
286
|
+
/* Column names should be declared explicitly when parsing fields as Hashes */
|
287
|
+
if (meta.row_as_hash) { /* Only matters for hash results */
|
288
288
|
option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
|
289
289
|
if (option == Qnil) {
|
290
290
|
rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
|
@@ -298,14 +298,14 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
298
298
|
}
|
299
299
|
}
|
300
300
|
|
301
|
-
|
301
|
+
/* Seeding result with an empty first row (Hash or Array, depending on row_as_hash) */
|
302
302
|
if (meta.row_as_hash) {
|
303
|
-
rb_ary_push(*(meta.result), rb_hash_new());
|
303
|
+
rb_ary_push(*(meta.result), rb_hash_new()); /* [{}] */
|
304
304
|
} else {
|
305
|
-
rb_ary_push(*(meta.result), rb_ary_new());
|
305
|
+
rb_ary_push(*(meta.result), rb_ary_new()); /* [[]] */
|
306
306
|
}
|
307
307
|
|
308
|
-
|
308
|
+
/* Actual parsing and error handling */
|
309
309
|
if (csv_string_len != csv_parse(&cp, csv_string, strlen(csv_string),
|
310
310
|
&end_of_field_callback, &end_of_line_callback, &meta)) {
|
311
311
|
error = csv_error(&cp);
|
@@ -327,7 +327,7 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
327
327
|
}
|
328
328
|
}
|
329
329
|
|
330
|
-
|
330
|
+
/* Flushing libcsv's buffer and freeing up allocated memory */
|
331
331
|
csv_fini(&cp, &end_of_field_callback, &end_of_line_callback, &meta);
|
332
332
|
csv_free(&cp);
|
333
333
|
|
@@ -343,23 +343,23 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
343
343
|
free(meta.column_names);
|
344
344
|
}
|
345
345
|
|
346
|
-
|
346
|
+
/* Remove the last row if it's empty. That happens if CSV file ends with a newline. */
|
347
347
|
if (RARRAY_LEN(rb_ary_entry(*(meta.result), -1)) == 0) {
|
348
348
|
rb_ary_pop(*(meta.result));
|
349
349
|
}
|
350
350
|
|
351
|
-
|
351
|
+
/* An array of rows is returned: each row is an Array, or a Hash when row_as_hash is set, of converted field values. */
|
352
352
|
return *(meta.result);
|
353
353
|
}
|
354
354
|
|
355
355
|
|
356
356
|
/* Define Ruby API */
|
357
357
|
void Init_rcsv(void) {
|
358
|
-
VALUE klass = rb_define_class("Rcsv", rb_cObject);
|
358
|
+
VALUE klass = rb_define_class("Rcsv", rb_cObject); /* class Rcsv; end */
|
359
359
|
|
360
|
-
|
360
|
+
/* Error is initialized through static variable in order to access it from rb_rcsv_raw_parse */
|
361
361
|
rcsv_parse_error = rb_define_class_under(klass, "ParseError", rb_eStandardError);
|
362
362
|
|
363
|
-
|
363
|
+
/* def Rcsv.raw_parse; ...; end */
|
364
364
|
rb_define_singleton_method(klass, "raw_parse", rb_rcsv_raw_parse, -1);
|
365
365
|
}
|
data/lib/rcsv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rcsv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extensions:
|
|
20
20
|
extra_rdoc_files: []
|
21
21
|
files:
|
22
22
|
- .gitignore
|
23
|
+
- .travis.yml
|
23
24
|
- COPYING.LESSER
|
24
25
|
- Gemfile
|
25
26
|
- Gemfile.lock
|