rcsv 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.travis.yml +7 -0
- data/Gemfile.lock +1 -1
- data/README.md +29 -27
- data/ext/rcsv/rcsv.c +79 -79
- data/lib/rcsv/version.rb +1 -1
- metadata +2 -1
data/.travis.yml
ADDED
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
# Rcsv
|
2
2
|
|
3
|
+
[](https://travis-ci.org/fiksu/rcsv)
|
4
|
+
|
3
5
|
Rcsv is a fast CSV parsing library for MRI Ruby. Tested on REE 1.8.7 and Ruby 1.9.3.
|
4
6
|
|
5
|
-
Contrary to many other gems that implement their own parsers, Rcsv uses libcsv 3.
|
7
|
+
Contrary to many other gems that implement their own parsers, Rcsv uses libcsv 3.0.2 (http://sourceforge.net/projects/libcsv/). As long as libcsv's API is stable, getting Rcsv to use a newer libcsv version is as simple as updating two files (csv.h and libcsv.c).
|
6
8
|
|
7
9
|
## Benchmarks
|
8
|
-
|
9
|
-
|
10
|
-
|
10
|
+
user system total real
|
11
|
+
FasterCSV 0.580000 0.000000 0.580000 ( 0.618837)
|
12
|
+
rcsv 0.060000 0.000000 0.060000 ( 0.062248)
|
11
13
|
|
12
14
|
## License
|
13
15
|
|
@@ -30,7 +32,7 @@ Or install it yourself as:
|
|
30
32
|
|
31
33
|
## Building the latest source
|
32
34
|
|
33
|
-
|
35
|
+
First, check out the master branch. Then cd there and run:
|
34
36
|
|
35
37
|
$ bundle # Installs development dependencies
|
36
38
|
$ bundle exec rake # Runs tests
|
@@ -42,7 +44,7 @@ Currently, Rcsv only supports CSV parsing. CSV write support is planned.
|
|
42
44
|
|
43
45
|
Quickstart:
|
44
46
|
|
45
|
-
|
47
|
+
parsed = Rcsv.parse(csv_data)
|
46
48
|
|
47
49
|
|
48
50
|
Rcsv class exposes a class method *parse* that accepts a CSV string as its first parameter and options hash as its second parameter.
|
@@ -99,36 +101,36 @@ A boolean flag. If enabled, only parses columns that are listed in :columns. Dis
|
|
99
101
|
|
100
102
|
This example parses a 3-column CSV file and only returns parsed rows where "Age" values are set to "35".
|
101
103
|
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
104
|
+
Rcsv.parse some_csv, :row_as_hash => true,
|
105
|
+
:columns => {
|
106
|
+
'First Name' => { :alias => :first_name, :default => "Unknown" },
|
107
|
+
'Last Name' => { :alias => :last_name, :default => "Unknown"},
|
108
|
+
'Age' => { :alias => :age, :type => :int, :match => "35"}
|
109
|
+
}
|
108
110
|
|
109
111
|
The result would look like this:
|
110
112
|
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
113
|
+
[
|
114
|
+
{ :first_name => "Mary", :last_name => "Jane", :age => 35 },
|
115
|
+
{ :first_name => "Unknown", :last_name => "Alien", :age => 35}
|
116
|
+
]
|
115
117
|
|
116
118
|
Another example, for a miserable headerless Tab-separated CSV:
|
117
119
|
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
120
|
+
Rcsv.parse some_csv, :column_separator => "\t",
|
121
|
+
:header => :none,
|
122
|
+
:columns => {
|
123
|
+
1 => { :type => :float, :default => 0 }
|
124
|
+
}
|
123
125
|
|
124
126
|
The result would look like this:
|
125
127
|
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
128
|
+
[
|
129
|
+
[ "Very hot", 3.7, "Mercury" ],
|
130
|
+
[ "Very hot and cloudy", 8.87, "Venus" ],
|
131
|
+
[ "Just about ok", 9.78, "Earth"],
|
132
|
+
[ nil, 0, "Vacuum" ]
|
133
|
+
]
|
132
134
|
|
133
135
|
|
134
136
|
## To do
|
data/ext/rcsv/rcsv.c
CHANGED
@@ -3,37 +3,37 @@
|
|
3
3
|
|
4
4
|
#include "csv.h"
|
5
5
|
|
6
|
-
static VALUE rcsv_parse_error;
|
6
|
+
static VALUE rcsv_parse_error; /* class Rcsv::ParseError << StandardError; end */
|
7
7
|
|
8
|
-
|
8
|
+
/* It is useful to know the exact row/column position and field contents where a parse-time exception was raised */
|
9
9
|
#define RAISE_WITH_LOCATION(row, column, contents, fmt, ...) \
|
10
10
|
rb_raise(rcsv_parse_error, "[%d:%d '%s'] " fmt, (int)(row), (int)(column), (char *)(contents), ##__VA_ARGS__);
|
11
11
|
|
12
12
|
struct rcsv_metadata {
|
13
|
-
|
14
|
-
bool row_as_hash;
|
15
|
-
size_t offset_rows;
|
16
|
-
|
17
|
-
char * row_conversions;
|
18
|
-
char ** only_rows;
|
19
|
-
VALUE * row_defaults;
|
20
|
-
VALUE * column_names;
|
21
|
-
|
22
|
-
|
23
|
-
size_t num_row_conversions;
|
24
|
-
size_t num_only_rows;
|
25
|
-
size_t num_row_defaults;
|
26
|
-
size_t num_columns;
|
27
|
-
|
28
|
-
|
29
|
-
bool skip_current_row;
|
30
|
-
size_t current_col;
|
31
|
-
size_t current_row;
|
32
|
-
|
33
|
-
VALUE * result;
|
13
|
+
/* Derived from user-specified options */
|
14
|
+
bool row_as_hash; /* Used to return array of hashes rather than array of arrays */
|
15
|
+
size_t offset_rows; /* Number of rows to skip before parsing */
|
16
|
+
|
17
|
+
char * row_conversions; /* A pointer to string/array of row conversions char specifiers */
|
18
|
+
char ** only_rows; /* A pointer to array of strings for only_rows filter */
|
19
|
+
VALUE * row_defaults; /* A pointer to array of row defaults */
|
20
|
+
VALUE * column_names; /* A pointer to array of column names to be used with hashes */
|
21
|
+
|
22
|
+
/* Pointer options lengths */
|
23
|
+
size_t num_row_conversions; /* Number of converter types in row_conversions array */
|
24
|
+
size_t num_only_rows; /* Number of items in only_rows filter */
|
25
|
+
size_t num_row_defaults; /* Number of default values in row_defaults array */
|
26
|
+
size_t num_columns; /* Number of columns detected from column_names.size */
|
27
|
+
|
28
|
+
/* Internal state */
|
29
|
+
bool skip_current_row; /* Used by only_rows filter to skip parsing of the row remainder */
|
30
|
+
size_t current_col; /* Current column's index */
|
31
|
+
size_t current_row; /* Current row's index */
|
32
|
+
|
33
|
+
VALUE * result; /* A pointer to the parsed data */
|
34
34
|
};
|
35
35
|
|
36
|
-
|
36
|
+
/* Internal callbacks */
|
37
37
|
|
38
38
|
/* This procedure is called for every parsed field */
|
39
39
|
void end_of_field_callback(void * field, size_t field_size, void * data) {
|
@@ -41,20 +41,20 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
41
41
|
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
42
42
|
char row_conversion = 0;
|
43
43
|
VALUE parsed_field;
|
44
|
-
VALUE last_entry = rb_ary_entry(*(meta->result), -1);
|
44
|
+
VALUE last_entry = rb_ary_entry(*(meta->result), -1); /* result.last */
|
45
45
|
|
46
|
-
|
46
|
+
/* No need to parse anything until the end of the line if skip_current_row is set */
|
47
47
|
if (meta->skip_current_row) {
|
48
48
|
return;
|
49
49
|
}
|
50
50
|
|
51
|
-
|
51
|
+
/* Skip the row if its position is less than the specified offset */
|
52
52
|
if (meta->current_row < meta->offset_rows) {
|
53
53
|
meta->skip_current_row = true;
|
54
54
|
return;
|
55
55
|
}
|
56
56
|
|
57
|
-
|
57
|
+
/* Filter by string row values listed in meta->only_rows */
|
58
58
|
if ((meta->only_rows != NULL) &&
|
59
59
|
(meta->current_col < meta->num_only_rows) &&
|
60
60
|
(meta->only_rows[meta->current_col] != NULL) &&
|
@@ -63,33 +63,33 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
63
63
|
return;
|
64
64
|
}
|
65
65
|
|
66
|
-
|
66
|
+
/* Get row conversion char specifier */
|
67
67
|
if (meta->current_col < meta->num_row_conversions) {
|
68
68
|
row_conversion = (char)meta->row_conversions[meta->current_col];
|
69
69
|
}
|
70
70
|
|
71
|
-
|
72
|
-
if (row_conversion != ' ') {
|
71
|
+
/* Convert the field from string into Ruby type specified by row_conversion */
|
72
|
+
if (row_conversion != ' ') { /* spacebar skips the column */
|
73
73
|
if (field_size == 0) {
|
74
|
-
|
74
|
+
/* Assigning appropriate default value if applicable. */
|
75
75
|
if (meta->current_col < meta->num_row_defaults) {
|
76
76
|
parsed_field = meta->row_defaults[meta->current_col];
|
77
|
-
} else {
|
77
|
+
} else { /* By default, default is nil */
|
78
78
|
parsed_field = Qnil;
|
79
79
|
}
|
80
80
|
} else {
|
81
81
|
if (meta->current_col < meta->num_row_conversions) {
|
82
82
|
switch (row_conversion){
|
83
|
-
case 's':
|
83
|
+
case 's': /* String */
|
84
84
|
parsed_field = rb_str_new(field_str, field_size);
|
85
85
|
break;
|
86
|
-
case 'i':
|
86
|
+
case 'i': /* Integer */
|
87
87
|
parsed_field = INT2NUM(atol(field_str));
|
88
88
|
break;
|
89
|
-
case 'f':
|
89
|
+
case 'f': /* Float */
|
90
90
|
parsed_field = rb_float_new(atof(field_str));
|
91
91
|
break;
|
92
|
-
case 'b':
|
92
|
+
case 'b': /* TrueClass/FalseClass */
|
93
93
|
switch (field_str[0]) {
|
94
94
|
case 't':
|
95
95
|
case 'T':
|
@@ -119,12 +119,12 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
119
119
|
row_conversion
|
120
120
|
);
|
121
121
|
}
|
122
|
-
} else {
|
123
|
-
parsed_field = rb_str_new(field_str, field_size);
|
122
|
+
} else { /* No conversion happens */
|
123
|
+
parsed_field = rb_str_new(field_str, field_size); /* field */
|
124
124
|
}
|
125
125
|
}
|
126
126
|
|
127
|
-
|
127
|
+
/* Assign the value to appropriate hash key if parsing into Hash */
|
128
128
|
if (meta->row_as_hash) {
|
129
129
|
if (meta->current_col >= meta->num_columns) {
|
130
130
|
RAISE_WITH_LOCATION(
|
@@ -138,12 +138,12 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
138
138
|
} else {
|
139
139
|
rb_hash_aset(last_entry, meta->column_names[meta->current_col], parsed_field);
|
140
140
|
}
|
141
|
-
} else {
|
142
|
-
rb_ary_push(last_entry, parsed_field);
|
141
|
+
} else { /* Parse into Array */
|
142
|
+
rb_ary_push(last_entry, parsed_field); /* result << field */
|
143
143
|
}
|
144
144
|
}
|
145
145
|
|
146
|
-
|
146
|
+
/* Increment column counter */
|
147
147
|
meta->current_col++;
|
148
148
|
return;
|
149
149
|
}
|
@@ -152,30 +152,30 @@ void end_of_field_callback(void * field, size_t field_size, void * data) {
|
|
152
152
|
void end_of_line_callback(int last_char, void * data) {
|
153
153
|
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
154
154
|
|
155
|
-
|
155
|
+
/* If filters didn't match, current row parsing is reverted */
|
156
156
|
if (meta->skip_current_row) {
|
157
|
-
rb_ary_pop(*(meta->result));
|
157
|
+
rb_ary_pop(*(meta->result)); /* result.pop */
|
158
158
|
meta->skip_current_row = false;
|
159
159
|
}
|
160
160
|
|
161
|
-
|
161
|
+
/* Add a new empty array/hash for the next line unless EOF reached */
|
162
162
|
if (last_char != -1) {
|
163
163
|
if (meta->row_as_hash) {
|
164
|
-
rb_ary_push(*(meta->result), rb_hash_new());
|
164
|
+
rb_ary_push(*(meta->result), rb_hash_new()); /* result << {} */
|
165
165
|
} else {
|
166
|
-
rb_ary_push(*(meta->result), rb_ary_new());
|
166
|
+
rb_ary_push(*(meta->result), rb_ary_new()); /* result << [] */
|
167
167
|
}
|
168
168
|
}
|
169
169
|
|
170
|
-
|
170
|
+
/* Resetting column counter */
|
171
171
|
meta->current_col = 0;
|
172
172
|
|
173
|
-
|
173
|
+
/* Incrementing row counter */
|
174
174
|
meta->current_row++;
|
175
175
|
return;
|
176
176
|
}
|
177
177
|
|
178
|
-
|
178
|
+
/* C API */
|
179
179
|
|
180
180
|
/* The main method that handles parsing */
|
181
181
|
static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
@@ -189,7 +189,7 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
189
189
|
int error;
|
190
190
|
size_t i = 0;
|
191
191
|
|
192
|
-
|
192
|
+
/* Setting up some sane defaults */
|
193
193
|
meta.row_as_hash = false;
|
194
194
|
meta.skip_current_row = false;
|
195
195
|
meta.num_columns = 0;
|
@@ -203,50 +203,50 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
203
203
|
meta.row_defaults = NULL;
|
204
204
|
meta.row_conversions = NULL;
|
205
205
|
meta.column_names = NULL;
|
206
|
-
meta.result = (VALUE[]){rb_ary_new()};
|
206
|
+
meta.result = (VALUE[]){rb_ary_new()}; /* [] */
|
207
207
|
|
208
|
-
|
208
|
+
/* str is required, options is optional (pun intended) */
|
209
209
|
rb_scan_args(argc, argv, "11", &str, &options);
|
210
210
|
csv_string = StringValuePtr(str);
|
211
211
|
csv_string_len = strlen(csv_string);
|
212
212
|
|
213
|
-
|
213
|
+
/* options ||= {} */
|
214
214
|
if (NIL_P(options)) {
|
215
215
|
options = rb_hash_new();
|
216
216
|
}
|
217
217
|
|
218
|
-
|
218
|
+
/* By default, parsing is strict */
|
219
219
|
option = rb_hash_aref(options, ID2SYM(rb_intern("nostrict")));
|
220
220
|
if (!option || (option == Qnil)) {
|
221
221
|
csv_options |= CSV_STRICT;
|
222
222
|
}
|
223
223
|
|
224
|
-
|
224
|
+
/* Try to initialize libcsv */
|
225
225
|
if (csv_init(&cp, csv_options) == -1) {
|
226
226
|
rb_raise(rcsv_parse_error, "Couldn't initialize libcsv");
|
227
227
|
}
|
228
228
|
|
229
|
-
|
229
|
+
/* By default, parse as Array of Arrays */
|
230
230
|
option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
|
231
231
|
if (option && (option != Qnil)) {
|
232
232
|
meta.row_as_hash = true;
|
233
233
|
}
|
234
234
|
|
235
|
-
|
235
|
+
/* :col_sep sets the column separator, default is comma (,) */
|
236
236
|
option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
|
237
237
|
if (option != Qnil) {
|
238
238
|
csv_set_delim(&cp, (unsigned char)*StringValuePtr(option));
|
239
239
|
}
|
240
240
|
|
241
|
-
|
241
|
+
/* Specify how many rows to skip from the beginning of CSV */
|
242
242
|
option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
|
243
243
|
if (option != Qnil) {
|
244
244
|
meta.offset_rows = (size_t)NUM2INT(option);
|
245
245
|
}
|
246
246
|
|
247
|
-
|
248
|
-
|
249
|
-
|
247
|
+
/* :only_rows is a string mask where row is only parsed
|
248
|
+
if its fields match those in the passed array.
|
249
|
+
[nil, nil, "ABC"] skips all rows where 3rd column isn't equal to "ABC" */
|
250
250
|
option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
|
251
251
|
if (option != Qnil) {
|
252
252
|
meta.num_only_rows = (size_t)RARRAY_LEN(option);
|
@@ -262,8 +262,8 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
262
262
|
}
|
263
263
|
}
|
264
264
|
|
265
|
-
|
266
|
-
|
265
|
+
/* :row_defaults is an array of default values that are assigned to fields containing empty strings
|
266
|
+
according to matching field positions */
|
267
267
|
option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
|
268
268
|
if (option != Qnil) {
|
269
269
|
meta.num_row_defaults = RARRAY_LEN(option);
|
@@ -275,16 +275,16 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
275
275
|
}
|
276
276
|
}
|
277
277
|
|
278
|
-
|
279
|
-
|
278
|
+
/* :row_conversions specifies Ruby types that CSV field values should be converted into.
|
279
|
+
Each char of row_conversions string represents Ruby type for CSV field with matching position. */
|
280
280
|
option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
|
281
281
|
if (option != Qnil) {
|
282
282
|
meta.num_row_conversions = RSTRING_LEN(option);
|
283
283
|
meta.row_conversions = StringValuePtr(option);
|
284
284
|
}
|
285
285
|
|
286
|
-
|
287
|
-
if (meta.row_as_hash) {
|
286
|
+
/* Column names should be declared explicitly when parsing fields as Hashes */
|
287
|
+
if (meta.row_as_hash) { /* Only matters for hash results */
|
288
288
|
option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
|
289
289
|
if (option == Qnil) {
|
290
290
|
rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
|
@@ -298,14 +298,14 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
298
298
|
}
|
299
299
|
}
|
300
300
|
|
301
|
-
|
301
|
+
/* Seeding result with the first empty row (Hash or Array, depending on row_as_hash) */
|
302
302
|
if (meta.row_as_hash) {
|
303
|
-
rb_ary_push(*(meta.result), rb_hash_new());
|
303
|
+
rb_ary_push(*(meta.result), rb_hash_new()); /* [{}] */
|
304
304
|
} else {
|
305
|
-
rb_ary_push(*(meta.result), rb_ary_new());
|
305
|
+
rb_ary_push(*(meta.result), rb_ary_new()); /* [[]] */
|
306
306
|
}
|
307
307
|
|
308
|
-
|
308
|
+
/* Actual parsing and error handling */
|
309
309
|
if (csv_string_len != csv_parse(&cp, csv_string, strlen(csv_string),
|
310
310
|
&end_of_field_callback, &end_of_line_callback, &meta)) {
|
311
311
|
error = csv_error(&cp);
|
@@ -327,7 +327,7 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
327
327
|
}
|
328
328
|
}
|
329
329
|
|
330
|
-
|
330
|
+
/* Flushing libcsv's buffer and freeing up allocated memory */
|
331
331
|
csv_fini(&cp, &end_of_field_callback, &end_of_line_callback, &meta);
|
332
332
|
csv_free(&cp);
|
333
333
|
|
@@ -343,23 +343,23 @@ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
|
343
343
|
free(meta.column_names);
|
344
344
|
}
|
345
345
|
|
346
|
-
|
346
|
+
/* Remove the last row if it's empty. That happens if CSV file ends with a newline. */
|
347
347
|
if (RARRAY_LEN(rb_ary_entry(*(meta.result), -1)) == 0) {
|
348
348
|
rb_ary_pop(*(meta.result));
|
349
349
|
}
|
350
350
|
|
351
|
-
|
351
|
+
/* The parsed rows are returned: an array of arrays, or an array of hashes when row_as_hash is set. */
|
352
352
|
return *(meta.result);
|
353
353
|
}
|
354
354
|
|
355
355
|
|
356
356
|
/* Define Ruby API */
|
357
357
|
void Init_rcsv(void) {
|
358
|
-
VALUE klass = rb_define_class("Rcsv", rb_cObject);
|
358
|
+
VALUE klass = rb_define_class("Rcsv", rb_cObject); /* class Rcsv; end */
|
359
359
|
|
360
|
-
|
360
|
+
/* Error is initialized through static variable in order to access it from rb_rcsv_raw_parse */
|
361
361
|
rcsv_parse_error = rb_define_class_under(klass, "ParseError", rb_eStandardError);
|
362
362
|
|
363
|
-
|
363
|
+
/* def Rcsv.raw_parse; ...; end */
|
364
364
|
rb_define_singleton_method(klass, "raw_parse", rb_rcsv_raw_parse, -1);
|
365
365
|
}
|
data/lib/rcsv/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rcsv
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -20,6 +20,7 @@ extensions:
|
|
20
20
|
extra_rdoc_files: []
|
21
21
|
files:
|
22
22
|
- .gitignore
|
23
|
+
- .travis.yml
|
23
24
|
- COPYING.LESSER
|
24
25
|
- Gemfile
|
25
26
|
- Gemfile.lock
|