rcsv 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +8 -0
- data/COPYING.LESSER +458 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +30 -0
- data/README.md +148 -0
- data/Rakefile +18 -0
- data/bench.rb +32 -0
- data/ext/rcsv/csv.h +86 -0
- data/ext/rcsv/extconf.rb +3 -0
- data/ext/rcsv/libcsv.c +579 -0
- data/ext/rcsv/rcsv.c +365 -0
- data/ext/rcsv/test.rb +5 -0
- data/lib/lib_csv.rb +88 -0
- data/lib/rcsv.rb +91 -0
- data/lib/rcsv/version.rb +3 -0
- data/rcsv.gemspec +19 -0
- data/test/test_rcsv.csv +889 -0
- data/test/test_rcsv_raw_parse.rb +156 -0
- metadata +70 -0
data/ext/rcsv/rcsv.c
ADDED
@@ -0,0 +1,365 @@
|
|
1
|
+
#include <stdbool.h>
|
2
|
+
#include <ruby.h>
|
3
|
+
|
4
|
+
#include "csv.h"
|
5
|
+
|
6
|
+
static VALUE rcsv_parse_error; // class Rcsv::ParseError << StandardError; end
|
7
|
+
|
8
|
+
// Raises Rcsv::ParseError with the message prefixed by "[row:column 'contents'] ",
// so the caller can see where in the CSV the parse-time error happened.
//   row, column - positions, cast to int for the "%d" specifiers
//   contents    - C string with the offending field, printed with "%s"
//   fmt, ...    - printf-style format (must be a string literal) and its arguments
// The expansion is a single expression without a trailing semicolon, so the macro
// behaves like an ordinary function call inside if/else and other statements.
#define RAISE_WITH_LOCATION(row, column, contents, fmt, ...) \
  rb_raise(rcsv_parse_error, "[%d:%d '%s'] " fmt, (int)(row), (int)(column), (char *)(contents), ##__VA_ARGS__)
|
11
|
+
|
12
|
+
struct rcsv_metadata {
|
13
|
+
// Derived from user-specified options
|
14
|
+
bool row_as_hash; // Used to return array of hashes rather than array of arrays
|
15
|
+
size_t offset_rows; // Number of rows to skip before parsing
|
16
|
+
|
17
|
+
char * row_conversions; // A pointer to string/array of row conversions char specifiers
|
18
|
+
char ** only_rows; // A pointer to array of strings for only_rows filter
|
19
|
+
VALUE * row_defaults; // A pointer to array of row defaults
|
20
|
+
VALUE * column_names; // A pointer to array of column names to be used with hashes
|
21
|
+
|
22
|
+
// Pointer options lengths
|
23
|
+
size_t num_row_conversions; // Number of converter types in row_conversions array
|
24
|
+
size_t num_only_rows; // Number of items in only_rows filter
|
25
|
+
size_t num_row_defaults; // Number of default values in row_defaults array
|
26
|
+
size_t num_columns; // Number of columns detected from column_names.size
|
27
|
+
|
28
|
+
// Internal state
|
29
|
+
bool skip_current_row; // Used by only_rows filter to skip parsing of the row remainder
|
30
|
+
size_t current_col; // Current column's index
|
31
|
+
size_t current_row; // Current row's index
|
32
|
+
|
33
|
+
VALUE * result; // A pointer to the parsed data
|
34
|
+
};
|
35
|
+
|
36
|
+
//// Internal callbacks ////
|
37
|
+
|
38
|
+
/* This procedure is called for every parsed field */
|
39
|
+
void end_of_field_callback(void * field, size_t field_size, void * data) {
|
40
|
+
const char * field_str = (char *)field;
|
41
|
+
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
42
|
+
char row_conversion = 0;
|
43
|
+
VALUE parsed_field;
|
44
|
+
VALUE last_entry = rb_ary_entry(*(meta->result), -1); // result.last
|
45
|
+
|
46
|
+
// No need to parse anything until the end of the line if skip_current_row is set
|
47
|
+
if (meta->skip_current_row) {
|
48
|
+
return;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Skip the row if its position is less than specifed offset
|
52
|
+
if (meta->current_row < meta->offset_rows) {
|
53
|
+
meta->skip_current_row = true;
|
54
|
+
return;
|
55
|
+
}
|
56
|
+
|
57
|
+
// Filter by string row values listed in meta->only_rows.
|
58
|
+
if ((meta->only_rows != NULL) &&
|
59
|
+
(meta->current_col < meta->num_only_rows) &&
|
60
|
+
(meta->only_rows[meta->current_col] != NULL) &&
|
61
|
+
(strcmp(meta->only_rows[meta->current_col], field_str))) {
|
62
|
+
meta->skip_current_row = true;
|
63
|
+
return;
|
64
|
+
}
|
65
|
+
|
66
|
+
// Get row conversion char specifier
|
67
|
+
if (meta->current_col < meta->num_row_conversions) {
|
68
|
+
row_conversion = (char)meta->row_conversions[meta->current_col];
|
69
|
+
}
|
70
|
+
|
71
|
+
// Convert the field from string into Ruby type specified by row_conversion
|
72
|
+
if (row_conversion != ' ') { // spacebar skips the column
|
73
|
+
if (field_size == 0) {
|
74
|
+
// Assigning appropriate default value if applicable.
|
75
|
+
if (meta->current_col < meta->num_row_defaults) {
|
76
|
+
parsed_field = meta->row_defaults[meta->current_col];
|
77
|
+
} else { // By default, default is nil
|
78
|
+
parsed_field = Qnil;
|
79
|
+
}
|
80
|
+
} else {
|
81
|
+
if (meta->current_col < meta->num_row_conversions) {
|
82
|
+
switch (row_conversion){
|
83
|
+
case 's': // String
|
84
|
+
parsed_field = rb_str_new(field_str, field_size);
|
85
|
+
break;
|
86
|
+
case 'i': // Integer
|
87
|
+
parsed_field = INT2NUM(atol(field_str));
|
88
|
+
break;
|
89
|
+
case 'f': // Float
|
90
|
+
parsed_field = rb_float_new(atof(field_str));
|
91
|
+
break;
|
92
|
+
case 'b': // TrueClass/FalseClass
|
93
|
+
switch (field_str[0]) {
|
94
|
+
case 't':
|
95
|
+
case 'T':
|
96
|
+
case '1':
|
97
|
+
parsed_field = Qtrue;
|
98
|
+
break;
|
99
|
+
case 'f':
|
100
|
+
case 'F':
|
101
|
+
case '0':
|
102
|
+
parsed_field = Qfalse;
|
103
|
+
break;
|
104
|
+
default:
|
105
|
+
RAISE_WITH_LOCATION(
|
106
|
+
meta->current_row,
|
107
|
+
meta->current_col,
|
108
|
+
field_str,
|
109
|
+
"Bad Boolean value. Valid values are strings where the first character is T/t/1 for true or F/f/0 for false."
|
110
|
+
);
|
111
|
+
}
|
112
|
+
break;
|
113
|
+
default:
|
114
|
+
RAISE_WITH_LOCATION(
|
115
|
+
meta->current_row,
|
116
|
+
meta->current_col,
|
117
|
+
field_str,
|
118
|
+
"Unknown deserializer '%c'.",
|
119
|
+
row_conversion
|
120
|
+
);
|
121
|
+
}
|
122
|
+
} else { // No conversion happens
|
123
|
+
parsed_field = rb_str_new(field_str, field_size); // field
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
// Assign the value to appropriate hash key if parsing into Hash
|
128
|
+
if (meta->row_as_hash) {
|
129
|
+
if (meta->current_col >= meta->num_columns) {
|
130
|
+
RAISE_WITH_LOCATION(
|
131
|
+
meta->current_row,
|
132
|
+
meta->current_col,
|
133
|
+
field_str,
|
134
|
+
"There are at least %d columns in a row, which is beyond the number of provided column names (%d).",
|
135
|
+
(int)meta->current_col + 1,
|
136
|
+
(int)meta->num_columns
|
137
|
+
);
|
138
|
+
} else {
|
139
|
+
rb_hash_aset(last_entry, meta->column_names[meta->current_col], parsed_field);
|
140
|
+
}
|
141
|
+
} else { // Parse into Array
|
142
|
+
rb_ary_push(last_entry, parsed_field); // result << field
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
// Increment column counter
|
147
|
+
meta->current_col++;
|
148
|
+
return;
|
149
|
+
}
|
150
|
+
|
151
|
+
/* This procedure is called for every line ending */
|
152
|
+
void end_of_line_callback(int last_char, void * data) {
|
153
|
+
struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
|
154
|
+
|
155
|
+
// If filters didn't match, current row parsing is reverted.
|
156
|
+
if (meta->skip_current_row) {
|
157
|
+
rb_ary_pop(*(meta->result)); // result.pop
|
158
|
+
meta->skip_current_row = false;
|
159
|
+
}
|
160
|
+
|
161
|
+
// Add a new empty array/hash for the next line unless EOF reached.
|
162
|
+
if (last_char != -1) {
|
163
|
+
if (meta->row_as_hash) {
|
164
|
+
rb_ary_push(*(meta->result), rb_hash_new()); // result << {}
|
165
|
+
} else {
|
166
|
+
rb_ary_push(*(meta->result), rb_ary_new()); // result << []
|
167
|
+
}
|
168
|
+
}
|
169
|
+
|
170
|
+
// Resetting column counter.
|
171
|
+
meta->current_col = 0;
|
172
|
+
|
173
|
+
// Incrementing row counter.
|
174
|
+
meta->current_row++;
|
175
|
+
return;
|
176
|
+
}
|
177
|
+
|
178
|
+
//// C API ////
|
179
|
+
|
180
|
+
/* The main method that handles parsing */
|
181
|
+
static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
|
182
|
+
struct rcsv_metadata meta;
|
183
|
+
VALUE str, options, option;
|
184
|
+
|
185
|
+
struct csv_parser cp;
|
186
|
+
unsigned char csv_options = CSV_STRICT_FINI | CSV_APPEND_NULL;
|
187
|
+
char * csv_string;
|
188
|
+
size_t csv_string_len;
|
189
|
+
int error;
|
190
|
+
size_t i = 0;
|
191
|
+
|
192
|
+
// Setting up some sane defaults
|
193
|
+
meta.row_as_hash = false;
|
194
|
+
meta.skip_current_row = false;
|
195
|
+
meta.num_columns = 0;
|
196
|
+
meta.current_col = 0;
|
197
|
+
meta.current_row = 0;
|
198
|
+
meta.offset_rows = 0;
|
199
|
+
meta.num_only_rows = 0;
|
200
|
+
meta.num_row_defaults = 0;
|
201
|
+
meta.num_row_conversions = 0;
|
202
|
+
meta.only_rows = NULL;
|
203
|
+
meta.row_defaults = NULL;
|
204
|
+
meta.row_conversions = NULL;
|
205
|
+
meta.column_names = NULL;
|
206
|
+
meta.result = (VALUE[]){rb_ary_new()}; // []
|
207
|
+
|
208
|
+
// str is required, options is optional (pun intended)
|
209
|
+
rb_scan_args(argc, argv, "11", &str, &options);
|
210
|
+
csv_string = StringValuePtr(str);
|
211
|
+
csv_string_len = strlen(csv_string);
|
212
|
+
|
213
|
+
// options ||= nil
|
214
|
+
if (NIL_P(options)) {
|
215
|
+
options = rb_hash_new();
|
216
|
+
}
|
217
|
+
|
218
|
+
// By default, parsing is strict
|
219
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("nostrict")));
|
220
|
+
if (!option || (option == Qnil)) {
|
221
|
+
csv_options |= CSV_STRICT;
|
222
|
+
}
|
223
|
+
|
224
|
+
// Try to initialize libcsv
|
225
|
+
if (csv_init(&cp, csv_options) == -1) {
|
226
|
+
rb_raise(rcsv_parse_error, "Couldn't initialize libcsv");
|
227
|
+
}
|
228
|
+
|
229
|
+
// By default, parse as Array of Arrays
|
230
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
|
231
|
+
if (option && (option != Qnil)) {
|
232
|
+
meta.row_as_hash = true;
|
233
|
+
}
|
234
|
+
|
235
|
+
// :col_sep sets the column separator, default is comma (,)
|
236
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
|
237
|
+
if (option != Qnil) {
|
238
|
+
csv_set_delim(&cp, (unsigned char)*StringValuePtr(option));
|
239
|
+
}
|
240
|
+
|
241
|
+
// Specify how many rows to skip from the beginning of CSV
|
242
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
|
243
|
+
if (option != Qnil) {
|
244
|
+
meta.offset_rows = (size_t)NUM2INT(option);
|
245
|
+
}
|
246
|
+
|
247
|
+
// :only_rows is a string mask where row is only parsed
|
248
|
+
// if its fields match those in the passed array.
|
249
|
+
// [nil, nil, "ABC"] skips all rows where 3rd column isn't equal to "ABC"
|
250
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
|
251
|
+
if (option != Qnil) {
|
252
|
+
meta.num_only_rows = (size_t)RARRAY_LEN(option);
|
253
|
+
meta.only_rows = (char **)malloc(meta.num_only_rows * sizeof(char *));
|
254
|
+
|
255
|
+
for (i = 0; i < meta.num_only_rows; i++) {
|
256
|
+
VALUE only_row = rb_ary_entry(option, i);
|
257
|
+
if (only_row == Qnil) {
|
258
|
+
meta.only_rows[i] = NULL;
|
259
|
+
} else {
|
260
|
+
meta.only_rows[i] = StringValueCStr(only_row);
|
261
|
+
}
|
262
|
+
}
|
263
|
+
}
|
264
|
+
|
265
|
+
// :row_defaults is an array of default values that are assigned to fields containing empty strings
|
266
|
+
// according to matching field positions
|
267
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
|
268
|
+
if (option != Qnil) {
|
269
|
+
meta.num_row_defaults = RARRAY_LEN(option);
|
270
|
+
meta.row_defaults = (VALUE*)malloc(meta.num_row_defaults * sizeof(VALUE*));
|
271
|
+
|
272
|
+
for (i = 0; i < meta.num_row_defaults; i++) {
|
273
|
+
VALUE row_default = rb_ary_entry(option, i);
|
274
|
+
meta.row_defaults[i] = row_default;
|
275
|
+
}
|
276
|
+
}
|
277
|
+
|
278
|
+
// :row_conversions specifies Ruby types that CSV field values should be converted into.
|
279
|
+
// Each char of row_conversions string represents Ruby type for CSV field with matching position.
|
280
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
|
281
|
+
if (option != Qnil) {
|
282
|
+
meta.num_row_conversions = RSTRING_LEN(option);
|
283
|
+
meta.row_conversions = StringValuePtr(option);
|
284
|
+
}
|
285
|
+
|
286
|
+
// Column names should be declared explicitly when parsing fields as Hashes
|
287
|
+
if (meta.row_as_hash) { // Only matters for hash results
|
288
|
+
option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
|
289
|
+
if (option == Qnil) {
|
290
|
+
rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
|
291
|
+
} else {
|
292
|
+
meta.num_columns = (size_t)RARRAY_LEN(option);
|
293
|
+
meta.column_names = (VALUE*)malloc(meta.num_columns * sizeof(VALUE*));
|
294
|
+
|
295
|
+
for (i = 0; i < meta.num_columns; i++) {
|
296
|
+
meta.column_names[i] = rb_ary_entry(option, i);
|
297
|
+
}
|
298
|
+
}
|
299
|
+
}
|
300
|
+
|
301
|
+
// Initializing result with empty Array
|
302
|
+
if (meta.row_as_hash) {
|
303
|
+
rb_ary_push(*(meta.result), rb_hash_new()); // [{}]
|
304
|
+
} else {
|
305
|
+
rb_ary_push(*(meta.result), rb_ary_new()); // [[]]
|
306
|
+
}
|
307
|
+
|
308
|
+
// Actual parsing and error handling
|
309
|
+
if (csv_string_len != csv_parse(&cp, csv_string, strlen(csv_string),
|
310
|
+
&end_of_field_callback, &end_of_line_callback, &meta)) {
|
311
|
+
error = csv_error(&cp);
|
312
|
+
switch(error) {
|
313
|
+
case CSV_EPARSE:
|
314
|
+
rb_raise(rcsv_parse_error, "Error when parsing malformed data");
|
315
|
+
break;
|
316
|
+
case CSV_ENOMEM:
|
317
|
+
rb_raise(rcsv_parse_error, "No memory");
|
318
|
+
break;
|
319
|
+
case CSV_ETOOBIG:
|
320
|
+
rb_raise(rcsv_parse_error, "Field data is too large");
|
321
|
+
break;
|
322
|
+
case CSV_EINVALID:
|
323
|
+
rb_raise(rcsv_parse_error, "%s", (const char *)csv_strerror(error));
|
324
|
+
break;
|
325
|
+
default:
|
326
|
+
rb_raise(rcsv_parse_error, "Failed due to unknown reason");
|
327
|
+
}
|
328
|
+
}
|
329
|
+
|
330
|
+
// Flushing libcsv's buffer and freeing up allocated memory
|
331
|
+
csv_fini(&cp, &end_of_field_callback, &end_of_line_callback, &meta);
|
332
|
+
csv_free(&cp);
|
333
|
+
|
334
|
+
if (meta.only_rows != NULL) {
|
335
|
+
free(meta.only_rows);
|
336
|
+
}
|
337
|
+
|
338
|
+
if (meta.row_defaults != NULL) {
|
339
|
+
free(meta.row_defaults);
|
340
|
+
}
|
341
|
+
|
342
|
+
if (meta.column_names != NULL) {
|
343
|
+
free(meta.column_names);
|
344
|
+
}
|
345
|
+
|
346
|
+
// Remove the last row if it's empty. That happens if CSV file ends with a newline.
|
347
|
+
if (RARRAY_LEN(rb_ary_entry(*(meta.result), -1)) == 0) {
|
348
|
+
rb_ary_pop(*(meta.result));
|
349
|
+
}
|
350
|
+
|
351
|
+
// An array of arrays of strings is returned.
|
352
|
+
return *(meta.result);
|
353
|
+
}
|
354
|
+
|
355
|
+
|
356
|
+
/* Define Ruby API */
|
357
|
+
void Init_rcsv(void) {
|
358
|
+
VALUE klass = rb_define_class("Rcsv", rb_cObject); // class Rcsv; end
|
359
|
+
|
360
|
+
// Error is initialized through static variable in order to access it from rb_rcsv_raw_parse
|
361
|
+
rcsv_parse_error = rb_define_class_under(klass, "ParseError", rb_eStandardError);
|
362
|
+
|
363
|
+
// def Rcsv.raw_parse; ...; end
|
364
|
+
rb_define_singleton_method(klass, "raw_parse", rb_rcsv_raw_parse, -1);
|
365
|
+
}
|
data/ext/rcsv/test.rb
ADDED
data/lib/lib_csv.rb
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
require 'ffi'
|
2
|
+
|
3
|
+
# FFI binding to the libcsv C library with a minimal string parser on top of it.
class LibCsv
  extend FFI::Library
  ffi_lib 'libcsv'

  # Error codes returned by csv_error, as defined in libcsv's csv.h.
  CSV_SUCCESS  = 0
  CSV_EPARSE   = 1 # parse error in strict mode
  CSV_ENOMEM   = 2 # out of memory while growing the field buffer
  CSV_ETOOBIG  = 3 # a field would need a buffer larger than SIZE_MAX
  CSV_EINVALID = 4 # invalid code

  # Mirrors libcsv's struct csv_parser so that enough memory can be allocated for it.
  # NOTE(review): :qouted is a misspelling of the C field name "quoted"; it is kept
  # because renaming the key would change the struct's accessor.
  class CsvParser < FFI::Struct
    layout :pstate, :int,
           :qouted, :int,
           :spaces, :size_t,
           :entry_buf, :string,
           :entry_pos, :size_t,
           :entry_size, :size_t,
           :status, :int,
           :options, :uchar,
           :quote_char, :uchar,
           :delim_char, :uchar,
           :is_space, :pointer,
           :is_term, :pointer,
           :blk_size, :size_t,
           :malloc_func, :pointer,
           :realloc_func, :pointer,
           :free_func, :pointer
  end

  callback :end_of_field_callback, [:pointer, :size_t, :pointer], :void
  callback :end_of_record_callback, [:int, :pointer], :void

  attach_function :csv_init, [:pointer, :uchar], :int
  attach_function :csv_parse, [:pointer, :pointer, :size_t, :end_of_field_callback, :end_of_record_callback, :pointer], :size_t
  attach_function :csv_fini, [:pointer, :end_of_field_callback, :end_of_record_callback, :pointer], :int
  attach_function :csv_free, [:pointer], :void

  attach_function :csv_set_delim, [:pointer, :uchar], :void
  attach_function :csv_get_delim, [:pointer], :uchar

  attach_function :csv_error, [:pointer], :int
  attach_function :csv_strerror, [:int], :string

  # Parses a CSV string into an Array of rows, each row being an Array of
  # Strings (nil for a field that libcsv reports with a NULL pointer).
  #
  # string  - String with CSV data
  # options - Hash; :col_sep sets the column separator (its first character is used)
  #
  # Raises RuntimeError when libcsv cannot be initialized or the data cannot be parsed.
  def self.parse(string, options = {})
    pointer = FFI::MemoryPointer.new :char, CsvParser.size, false
    parser = CsvParser.new pointer

    # The parser must not be touched unless initialization succeeded.
    fail "Couldn't initialize libcsv" if csv_init(parser, 0) == -1

    begin
      csv_set_delim(parser, options[:col_sep].ord) if options[:col_sep]

      result = [[]]

      end_of_field_callback = Proc.new { |p_field, field_size, p_data|
        # p_field is the field buffer itself, so the pointer is tested directly.
        str = p_field.null? ? nil : p_field.read_string(field_size)
        result.last << str
      }

      end_of_record_callback = Proc.new { |last_char, p_data|
        result << [] unless last_char == -1
      }

      original_length = string.bytesize
      length = csv_parse(parser, string, original_length, end_of_field_callback, end_of_record_callback, nil)

      # csv_parse returns the number of bytes it consumed; anything less is an error.
      unless length == original_length
        case error = csv_error(parser)
        when CSV_EPARSE
          fail "Error when parsing malformed data"
        when CSV_ENOMEM
          fail "No memory"
        when CSV_ETOOBIG
          fail "Too large field data"
        when CSV_EINVALID
          fail csv_strerror(error)
        else
          fail "Failed due to unknown reason"
        end
      end

      csv_fini(parser, end_of_field_callback, end_of_record_callback, nil)
    ensure
      # Release libcsv's internal buffer on success and on failure alike.
      csv_free(parser)
    end

    # A trailing newline leaves an empty last row behind.
    result.pop if result.last == []

    return result
  end
end
|