rcsv 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/COPYING.LESSER +458 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +30 -0
- data/README.md +148 -0
- data/Rakefile +18 -0
- data/bench.rb +32 -0
- data/ext/rcsv/csv.h +86 -0
- data/ext/rcsv/extconf.rb +3 -0
- data/ext/rcsv/libcsv.c +579 -0
- data/ext/rcsv/rcsv.c +365 -0
- data/ext/rcsv/test.rb +5 -0
- data/lib/lib_csv.rb +88 -0
- data/lib/rcsv.rb +91 -0
- data/lib/rcsv/version.rb +3 -0
- data/rcsv.gemspec +19 -0
- data/test/test_rcsv.csv +889 -0
- data/test/test_rcsv_raw_parse.rb +156 -0
- metadata +70 -0
data/ext/rcsv/rcsv.c
ADDED
@@ -0,0 +1,365 @@
|
|
1
|
+
#include <stdbool.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
|
4
|
+
#include "csv.h"
|
5
|
+
|
6
|
+
static VALUE rcsv_parse_error; // Rcsv::ParseError (class Rcsv::ParseError < StandardError; end), assigned in Init_rcsv
|
7
|
+
|
8
|
+
// Raises Rcsv::ParseError with the message prefixed by "[row:column 'contents'] ",
// so it is clear at which row/column and on which field contents parsing failed.
// `contents` must be a NUL-terminated string; `fmt` must be a string literal
// (it is concatenated with the prefix at compile time).
// The macro intentionally has no trailing semicolon: the call site supplies it,
// which keeps the macro usable as a single statement inside if/else.
#define RAISE_WITH_LOCATION(row, column, contents, fmt, ...) \
  rb_raise(rcsv_parse_error, "[%d:%d '%s'] " fmt, (int)(row), (int)(column), (const char *)(contents), ##__VA_ARGS__)
|
11
|
+
|
12
|
+
struct rcsv_metadata {
|
13
|
+
// Derived from user-specified options
|
14
|
+
bool row_as_hash; // Used to return array of hashes rather than array of arrays
|
15
|
+
size_t offset_rows; // Number of rows to skip before parsing
|
16
|
+
|
17
|
+
char * row_conversions; // A pointer to string/array of row conversions char specifiers
|
18
|
+
char ** only_rows; // A pointer to array of strings for only_rows filter
|
19
|
+
VALUE * row_defaults; // A pointer to array of row defaults
|
20
|
+
VALUE * column_names; // A pointer to array of column names to be used with hashes
|
21
|
+
|
22
|
+
// Pointer options lengths
|
23
|
+
size_t num_row_conversions; // Number of converter types in row_conversions array
|
24
|
+
size_t num_only_rows; // Number of items in only_rows filter
|
25
|
+
size_t num_row_defaults; // Number of default values in row_defaults array
|
26
|
+
size_t num_columns; // Number of columns detected from column_names.size
|
27
|
+
|
28
|
+
// Internal state
|
29
|
+
bool skip_current_row; // Used by only_rows filter to skip parsing of the row remainder
|
30
|
+
size_t current_col; // Current column's index
|
31
|
+
size_t current_row; // Current row's index
|
32
|
+
|
33
|
+
VALUE * result; // A pointer to the parsed data
|
34
|
+
};
|
35
|
+
|
36
|
+
//// Internal callbacks ////
|
37
|
+
|
38
|
+
/* This procedure is called for every parsed field */
|
39
|
+
void end_of_field_callback(void * field, size_t field_size, void * data) {
|
40
|
+
const char * field_str = (char *)field;
|
41
|
+
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
42
|
+
char row_conversion = 0;
|
43
|
+
VALUE parsed_field;
|
44
|
+
VALUE last_entry = rb_ary_entry(*(meta->result), -1); // result.last
|
45
|
+
|
46
|
+
// No need to parse anything until the end of the line if skip_current_row is set
|
47
|
+
if (meta->skip_current_row) {
|
48
|
+
return;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Skip the row if its position is less than specifed offset
|
52
|
+
if (meta->current_row < meta->offset_rows) {
|
53
|
+
meta->skip_current_row = true;
|
54
|
+
return;
|
55
|
+
}
|
56
|
+
|
57
|
+
// Filter by string row values listed in meta->only_rows.
|
58
|
+
if ((meta->only_rows != NULL) &&
|
59
|
+
(meta->current_col < meta->num_only_rows) &&
|
60
|
+
(meta->only_rows[meta->current_col] != NULL) &&
|
61
|
+
(strcmp(meta->only_rows[meta->current_col], field_str))) {
|
62
|
+
meta->skip_current_row = true;
|
63
|
+
return;
|
64
|
+
}
|
65
|
+
|
66
|
+
// Get row conversion char specifier
|
67
|
+
if (meta->current_col < meta->num_row_conversions) {
|
68
|
+
row_conversion = (char)meta->row_conversions[meta->current_col];
|
69
|
+
}
|
70
|
+
|
71
|
+
// Convert the field from string into Ruby type specified by row_conversion
|
72
|
+
if (row_conversion != ' ') { // spacebar skips the column
|
73
|
+
if (field_size == 0) {
|
74
|
+
// Assigning appropriate default value if applicable.
|
75
|
+
if (meta->current_col < meta->num_row_defaults) {
|
76
|
+
parsed_field = meta->row_defaults[meta->current_col];
|
77
|
+
} else { // By default, default is nil
|
78
|
+
parsed_field = Qnil;
|
79
|
+
}
|
80
|
+
} else {
|
81
|
+
if (meta->current_col < meta->num_row_conversions) {
|
82
|
+
switch (row_conversion){
|
83
|
+
case 's': // String
|
84
|
+
parsed_field = rb_str_new(field_str, field_size);
|
85
|
+
break;
|
86
|
+
case 'i': // Integer
|
87
|
+
parsed_field = INT2NUM(atol(field_str));
|
88
|
+
break;
|
89
|
+
case 'f': // Float
|
90
|
+
parsed_field = rb_float_new(atof(field_str));
|
91
|
+
break;
|
92
|
+
case 'b': // TrueClass/FalseClass
|
93
|
+
switch (field_str[0]) {
|
94
|
+
case 't':
|
95
|
+
case 'T':
|
96
|
+
case '1':
|
97
|
+
parsed_field = Qtrue;
|
98
|
+
break;
|
99
|
+
case 'f':
|
100
|
+
case 'F':
|
101
|
+
case '0':
|
102
|
+
parsed_field = Qfalse;
|
103
|
+
break;
|
104
|
+
default:
|
105
|
+
RAISE_WITH_LOCATION(
|
106
|
+
meta->current_row,
|
107
|
+
meta->current_col,
|
108
|
+
field_str,
|
109
|
+
"Bad Boolean value. Valid values are strings where the first character is T/t/1 for true or F/f/0 for false."
|
110
|
+
);
|
111
|
+
}
|
112
|
+
break;
|
113
|
+
default:
|
114
|
+
RAISE_WITH_LOCATION(
|
115
|
+
meta->current_row,
|
116
|
+
meta->current_col,
|
117
|
+
field_str,
|
118
|
+
"Unknown deserializer '%c'.",
|
119
|
+
row_conversion
|
120
|
+
);
|
121
|
+
}
|
122
|
+
} else { // No conversion happens
|
123
|
+
parsed_field = rb_str_new(field_str, field_size); // field
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
// Assign the value to appropriate hash key if parsing into Hash
|
128
|
+
if (meta->row_as_hash) {
|
129
|
+
if (meta->current_col >= meta->num_columns) {
|
130
|
+
RAISE_WITH_LOCATION(
|
131
|
+
meta->current_row,
|
132
|
+
meta->current_col,
|
133
|
+
field_str,
|
134
|
+
"There are at least %d columns in a row, which is beyond the number of provided column names (%d).",
|
135
|
+
(int)meta->current_col + 1,
|
136
|
+
(int)meta->num_columns
|
137
|
+
);
|
138
|
+
} else {
|
139
|
+
rb_hash_aset(last_entry, meta->column_names[meta->current_col], parsed_field);
|
140
|
+
}
|
141
|
+
} else { // Parse into Array
|
142
|
+
rb_ary_push(last_entry, parsed_field); // result << field
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
// Increment column counter
|
147
|
+
meta->current_col++;
|
148
|
+
return;
|
149
|
+
}
|
150
|
+
|
151
|
+
/* This procedure is called for every line ending */
|
152
|
+
void end_of_line_callback(int last_char, void * data) {
|
153
|
+
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
154
|
+
|
155
|
+
// If filters didn't match, current row parsing is reverted.
|
156
|
+
if (meta->skip_current_row) {
|
157
|
+
rb_ary_pop(*(meta->result)); // result.pop
|
158
|
+
meta->skip_current_row = false;
|
159
|
+
}
|
160
|
+
|
161
|
+
// Add a new empty array/hash for the next line unless EOF reached.
|
162
|
+
if (last_char != -1) {
|
163
|
+
if (meta->row_as_hash) {
|
164
|
+
rb_ary_push(*(meta->result), rb_hash_new()); // result << {}
|
165
|
+
} else {
|
166
|
+
rb_ary_push(*(meta->result), rb_ary_new()); // result << []
|
167
|
+
}
|
168
|
+
}
|
169
|
+
|
170
|
+
// Resetting column counter.
|
171
|
+
meta->current_col = 0;
|
172
|
+
|
173
|
+
// Incrementing row counter.
|
174
|
+
meta->current_row++;
|
175
|
+
return;
|
176
|
+
}
|
177
|
+
|
178
|
+
//// C API ////
|
179
|
+
|
180
|
+
/* The main method that handles parsing */
|
181
|
+
static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
182
|
+
struct rcsv_metadata meta;
|
183
|
+
VALUE str, options, option;
|
184
|
+
|
185
|
+
struct csv_parser cp;
|
186
|
+
unsigned char csv_options = CSV_STRICT_FINI | CSV_APPEND_NULL;
|
187
|
+
char * csv_string;
|
188
|
+
size_t csv_string_len;
|
189
|
+
int error;
|
190
|
+
size_t i = 0;
|
191
|
+
|
192
|
+
// Setting up some sane defaults
|
193
|
+
meta.row_as_hash = false;
|
194
|
+
meta.skip_current_row = false;
|
195
|
+
meta.num_columns = 0;
|
196
|
+
meta.current_col = 0;
|
197
|
+
meta.current_row = 0;
|
198
|
+
meta.offset_rows = 0;
|
199
|
+
meta.num_only_rows = 0;
|
200
|
+
meta.num_row_defaults = 0;
|
201
|
+
meta.num_row_conversions = 0;
|
202
|
+
meta.only_rows = NULL;
|
203
|
+
meta.row_defaults = NULL;
|
204
|
+
meta.row_conversions = NULL;
|
205
|
+
meta.column_names = NULL;
|
206
|
+
meta.result = (VALUE[]){rb_ary_new()}; // []
|
207
|
+
|
208
|
+
// str is required, options is optional (pun intended)
|
209
|
+
rb_scan_args(argc, argv, "11", &str, &options);
|
210
|
+
csv_string = StringValuePtr(str);
|
211
|
+
csv_string_len = strlen(csv_string);
|
212
|
+
|
213
|
+
// options ||= nil
|
214
|
+
if (NIL_P(options)) {
|
215
|
+
options = rb_hash_new();
|
216
|
+
}
|
217
|
+
|
218
|
+
// By default, parsing is strict
|
219
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("nostrict")));
|
220
|
+
if (!option || (option == Qnil)) {
|
221
|
+
csv_options |= CSV_STRICT;
|
222
|
+
}
|
223
|
+
|
224
|
+
// Try to initialize libcsv
|
225
|
+
if (csv_init(&cp, csv_options) == -1) {
|
226
|
+
rb_raise(rcsv_parse_error, "Couldn't initialize libcsv");
|
227
|
+
}
|
228
|
+
|
229
|
+
// By default, parse as Array of Arrays
|
230
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
|
231
|
+
if (option && (option != Qnil)) {
|
232
|
+
meta.row_as_hash = true;
|
233
|
+
}
|
234
|
+
|
235
|
+
// :col_sep sets the column separator, default is comma (,)
|
236
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
|
237
|
+
if (option != Qnil) {
|
238
|
+
csv_set_delim(&cp, (unsigned char)*StringValuePtr(option));
|
239
|
+
}
|
240
|
+
|
241
|
+
// Specify how many rows to skip from the beginning of CSV
|
242
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
|
243
|
+
if (option != Qnil) {
|
244
|
+
meta.offset_rows = (size_t)NUM2INT(option);
|
245
|
+
}
|
246
|
+
|
247
|
+
// :only_rows is a string mask where row is only parsed
|
248
|
+
// if its fields match those in the passed array.
|
249
|
+
// [nil, nil, "ABC"] skips all rows where 3rd column isn't equal to "ABC"
|
250
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
|
251
|
+
if (option != Qnil) {
|
252
|
+
meta.num_only_rows = (size_t)RARRAY_LEN(option);
|
253
|
+
meta.only_rows = (char **)malloc(meta.num_only_rows * sizeof(char *));
|
254
|
+
|
255
|
+
for (i = 0; i < meta.num_only_rows; i++) {
|
256
|
+
VALUE only_row = rb_ary_entry(option, i);
|
257
|
+
if (only_row == Qnil) {
|
258
|
+
meta.only_rows[i] = NULL;
|
259
|
+
} else {
|
260
|
+
meta.only_rows[i] = StringValueCStr(only_row);
|
261
|
+
}
|
262
|
+
}
|
263
|
+
}
|
264
|
+
|
265
|
+
// :row_defaults is an array of default values that are assigned to fields containing empty strings
|
266
|
+
// according to matching field positions
|
267
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
|
268
|
+
if (option != Qnil) {
|
269
|
+
meta.num_row_defaults = RARRAY_LEN(option);
|
270
|
+
meta.row_defaults = (VALUE*)malloc(meta.num_row_defaults * sizeof(VALUE*));
|
271
|
+
|
272
|
+
for (i = 0; i < meta.num_row_defaults; i++) {
|
273
|
+
VALUE row_default = rb_ary_entry(option, i);
|
274
|
+
meta.row_defaults[i] = row_default;
|
275
|
+
}
|
276
|
+
}
|
277
|
+
|
278
|
+
// :row_conversions specifies Ruby types that CSV field values should be converted into.
|
279
|
+
// Each char of row_conversions string represents Ruby type for CSV field with matching position.
|
280
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
|
281
|
+
if (option != Qnil) {
|
282
|
+
meta.num_row_conversions = RSTRING_LEN(option);
|
283
|
+
meta.row_conversions = StringValuePtr(option);
|
284
|
+
}
|
285
|
+
|
286
|
+
// Column names should be declared explicitly when parsing fields as Hashes
|
287
|
+
if (meta.row_as_hash) { // Only matters for hash results
|
288
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
|
289
|
+
if (option == Qnil) {
|
290
|
+
rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
|
291
|
+
} else {
|
292
|
+
meta.num_columns = (size_t)RARRAY_LEN(option);
|
293
|
+
meta.column_names = (VALUE*)malloc(meta.num_columns * sizeof(VALUE*));
|
294
|
+
|
295
|
+
for (i = 0; i < meta.num_columns; i++) {
|
296
|
+
meta.column_names[i] = rb_ary_entry(option, i);
|
297
|
+
}
|
298
|
+
}
|
299
|
+
}
|
300
|
+
|
301
|
+
// Initializing result with empty Array
|
302
|
+
if (meta.row_as_hash) {
|
303
|
+
rb_ary_push(*(meta.result), rb_hash_new()); // [{}]
|
304
|
+
} else {
|
305
|
+
rb_ary_push(*(meta.result), rb_ary_new()); // [[]]
|
306
|
+
}
|
307
|
+
|
308
|
+
// Actual parsing and error handling
|
309
|
+
if (csv_string_len != csv_parse(&cp, csv_string, strlen(csv_string),
|
310
|
+
&end_of_field_callback, &end_of_line_callback, &meta)) {
|
311
|
+
error = csv_error(&cp);
|
312
|
+
switch(error) {
|
313
|
+
case CSV_EPARSE:
|
314
|
+
rb_raise(rcsv_parse_error, "Error when parsing malformed data");
|
315
|
+
break;
|
316
|
+
case CSV_ENOMEM:
|
317
|
+
rb_raise(rcsv_parse_error, "No memory");
|
318
|
+
break;
|
319
|
+
case CSV_ETOOBIG:
|
320
|
+
rb_raise(rcsv_parse_error, "Field data is too large");
|
321
|
+
break;
|
322
|
+
case CSV_EINVALID:
|
323
|
+
rb_raise(rcsv_parse_error, "%s", (const char *)csv_strerror(error));
|
324
|
+
break;
|
325
|
+
default:
|
326
|
+
rb_raise(rcsv_parse_error, "Failed due to unknown reason");
|
327
|
+
}
|
328
|
+
}
|
329
|
+
|
330
|
+
// Flushing libcsv's buffer and freeing up allocated memory
|
331
|
+
csv_fini(&cp, &end_of_field_callback, &end_of_line_callback, &meta);
|
332
|
+
csv_free(&cp);
|
333
|
+
|
334
|
+
if (meta.only_rows != NULL) {
|
335
|
+
free(meta.only_rows);
|
336
|
+
}
|
337
|
+
|
338
|
+
if (meta.row_defaults != NULL) {
|
339
|
+
free(meta.row_defaults);
|
340
|
+
}
|
341
|
+
|
342
|
+
if (meta.column_names != NULL) {
|
343
|
+
free(meta.column_names);
|
344
|
+
}
|
345
|
+
|
346
|
+
// Remove the last row if it's empty. That happens if CSV file ends with a newline.
|
347
|
+
if (RARRAY_LEN(rb_ary_entry(*(meta.result), -1)) == 0) {
|
348
|
+
rb_ary_pop(*(meta.result));
|
349
|
+
}
|
350
|
+
|
351
|
+
// An array of arrays of strings is returned.
|
352
|
+
return *(meta.result);
|
353
|
+
}
|
354
|
+
|
355
|
+
|
356
|
+
/* Define Ruby API */
|
357
|
+
void Init_rcsv(void) {
|
358
|
+
VALUE klass = rb_define_class("Rcsv", rb_cObject); // class Rcsv; end
|
359
|
+
|
360
|
+
// Error is initialized through static variable in order to access it from rb_rcsv_raw_parse
|
361
|
+
rcsv_parse_error = rb_define_class_under(klass, "ParseError", rb_eStandardError);
|
362
|
+
|
363
|
+
// def Rcsv.raw_parse; ...; end
|
364
|
+
rb_define_singleton_method(klass, "raw_parse", rb_rcsv_raw_parse, -1);
|
365
|
+
}
|
data/ext/rcsv/test.rb
ADDED
data/lib/lib_csv.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
# Minimal FFI binding to the system libcsv library, exposing LibCsv.parse.
class LibCsv
  extend FFI::Library
  ffi_lib 'libcsv'

  # Error codes returned by csv_error, mirroring the constants in libcsv's csv.h.
  CSV_SUCCESS  = 0
  CSV_EPARSE   = 1
  CSV_ENOMEM   = 2
  CSV_ETOOBIG  = 3
  CSV_EINVALID = 4

  # Memory layout of libcsv's `struct csv_parser`; only used to size the parser buffer.
  class CsvParser < FFI::Struct
    layout :pstate, :int,
           :qouted, :int,
           :spaces, :size_t,
           :entry_buf, :string,
           :entry_pos, :size_t,
           :entry_size, :size_t,
           :status, :int,
           :options, :uchar,
           :quote_char, :uchar,
           :delim_char, :uchar,
           :is_space, :pointer,
           :is_term, :pointer,
           :blk_size, :size_t,
           :malloc_func, :pointer,
           :realloc_func, :pointer,
           :free_func, :pointer
  end

  callback :end_of_field_callback, [:pointer, :size_t, :pointer], :void
  callback :end_of_record_callback, [:int, :pointer], :void

  attach_function :csv_init, [:pointer, :uchar], :int
  attach_function :csv_parse, [:pointer, :pointer, :size_t, :end_of_field_callback, :end_of_record_callback, :pointer], :size_t
  attach_function :csv_fini, [:pointer, :end_of_field_callback, :end_of_record_callback, :pointer], :int
  attach_function :csv_free, [:pointer], :void

  attach_function :csv_set_delim, [:pointer, :uchar], :void
  attach_function :csv_get_delim, [:pointer], :uchar

  attach_function :csv_error, [:pointer], :int
  attach_function :csv_strerror, [:int], :string

  # Parses +string+ as CSV and returns an Array of rows, each an Array of field Strings.
  #
  # options[:col_sep] - optional column separator; its first character is used.
  #
  # Raises RuntimeError when libcsv cannot be initialized or reports a parse error.
  def self.parse(string, options = {})
    pointer = FFI::MemoryPointer.new :char, CsvParser.size, false
    parser = CsvParser.new pointer

    # Check initialization before touching the parser in any other way
    fail "Couldn't initialize libcsv" if csv_init(parser, 0) == -1

    begin
      if options[:col_sep]
        csv_set_delim(parser, options[:col_sep].ord)
      end

      rows = [[]]

      end_of_field_callback = Proc.new { |p_field, field_size, p_data|
        # A NULL field pointer means "no value"; otherwise copy exactly field_size bytes
        rows.last << (p_field.null? ? nil : p_field.read_string(field_size))
      }

      end_of_record_callback = Proc.new { |last_char, p_data|
        # last_char is -1 when the record was closed by the end of input
        rows << [] unless last_char == -1
      }

      original_length = string.bytesize
      length = csv_parse(parser, string, original_length, end_of_field_callback, end_of_record_callback, nil)

      # csv_parse returns the number of bytes consumed; fewer than given means an error
      unless length == original_length
        case error = csv_error(parser)
        when CSV_EPARSE
          fail "Error when parsing malformed data"
        when CSV_ENOMEM
          fail "No memory"
        when CSV_ETOOBIG
          fail "Too large field data"
        when CSV_EINVALID
          fail csv_strerror(error)
        else
          fail "Failed due to unknown reason"
        end
      end

      csv_fini(parser, end_of_field_callback, end_of_record_callback, nil)

      # A trailing newline leaves an empty last row behind
      rows.pop if rows.last == []

      rows
    ensure
      # Release libcsv's internal buffer even when parsing failed
      csv_free(parser)
    end
  end
end
|