rcsv 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/ext/rcsv/rcsv.c ADDED
@@ -0,0 +1,365 @@
1
+ #include <stdbool.h>
2
+ #include <ruby.h>
3
+
4
+ #include "csv.h"
5
+
6
+ static VALUE rcsv_parse_error; // class Rcsv::ParseError < StandardError; end
7
+
8
// Raises Rcsv::ParseError with the message prefixed by "[row:column 'contents'] ",
// so the exact position and field contents of a parse-time failure are visible.
// `contents` must be a NUL-terminated string; `fmt` must be a string literal
// (it is concatenated with the prefix at compile time).
// The expansion deliberately has no trailing semicolon: the call site supplies it,
// which keeps the macro usable as the single statement of an unbraced if/else.
#define RAISE_WITH_LOCATION(row, column, contents, fmt, ...) \
  rb_raise(rcsv_parse_error, "[%d:%d '%s'] " fmt, (int)(row), (int)(column), (char *)(contents), ##__VA_ARGS__)
11
+
12
+ struct rcsv_metadata {
13
+ // Derived from user-specified options
14
+ bool row_as_hash; // Used to return array of hashes rather than array of arrays
15
+ size_t offset_rows; // Number of rows to skip before parsing
16
+
17
+ char * row_conversions; // A pointer to string/array of row conversions char specifiers
18
+ char ** only_rows; // A pointer to array of strings for only_rows filter
19
+ VALUE * row_defaults; // A pointer to array of row defaults
20
+ VALUE * column_names; // A pointer to array of column names to be used with hashes
21
+
22
+ // Pointer options lengths
23
+ size_t num_row_conversions; // Number of converter types in row_conversions array
24
+ size_t num_only_rows; // Number of items in only_rows filter
25
+ size_t num_row_defaults; // Number of default values in row_defaults array
26
+ size_t num_columns; // Number of columns detected from column_names.size
27
+
28
+ // Internal state
29
+ bool skip_current_row; // Used by only_rows filter to skip parsing of the row remainder
30
+ size_t current_col; // Current column's index
31
+ size_t current_row; // Current row's index
32
+
33
+ VALUE * result; // A pointer to the parsed data
34
+ };
35
+
36
+ //// Internal callbacks ////
37
+
38
+ /* This procedure is called for every parsed field */
39
+ void end_of_field_callback(void * field, size_t field_size, void * data) {
40
+ const char * field_str = (char *)field;
41
+ struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
42
+ char row_conversion = 0;
43
+ VALUE parsed_field;
44
+ VALUE last_entry = rb_ary_entry(*(meta->result), -1); // result.last
45
+
46
+ // No need to parse anything until the end of the line if skip_current_row is set
47
+ if (meta->skip_current_row) {
48
+ return;
49
+ }
50
+
51
+ // Skip the row if its position is less than specifed offset
52
+ if (meta->current_row < meta->offset_rows) {
53
+ meta->skip_current_row = true;
54
+ return;
55
+ }
56
+
57
+ // Filter by string row values listed in meta->only_rows.
58
+ if ((meta->only_rows != NULL) &&
59
+ (meta->current_col < meta->num_only_rows) &&
60
+ (meta->only_rows[meta->current_col] != NULL) &&
61
+ (strcmp(meta->only_rows[meta->current_col], field_str))) {
62
+ meta->skip_current_row = true;
63
+ return;
64
+ }
65
+
66
+ // Get row conversion char specifier
67
+ if (meta->current_col < meta->num_row_conversions) {
68
+ row_conversion = (char)meta->row_conversions[meta->current_col];
69
+ }
70
+
71
+ // Convert the field from string into Ruby type specified by row_conversion
72
+ if (row_conversion != ' ') { // spacebar skips the column
73
+ if (field_size == 0) {
74
+ // Assigning appropriate default value if applicable.
75
+ if (meta->current_col < meta->num_row_defaults) {
76
+ parsed_field = meta->row_defaults[meta->current_col];
77
+ } else { // By default, default is nil
78
+ parsed_field = Qnil;
79
+ }
80
+ } else {
81
+ if (meta->current_col < meta->num_row_conversions) {
82
+ switch (row_conversion){
83
+ case 's': // String
84
+ parsed_field = rb_str_new(field_str, field_size);
85
+ break;
86
+ case 'i': // Integer
87
+ parsed_field = INT2NUM(atol(field_str));
88
+ break;
89
+ case 'f': // Float
90
+ parsed_field = rb_float_new(atof(field_str));
91
+ break;
92
+ case 'b': // TrueClass/FalseClass
93
+ switch (field_str[0]) {
94
+ case 't':
95
+ case 'T':
96
+ case '1':
97
+ parsed_field = Qtrue;
98
+ break;
99
+ case 'f':
100
+ case 'F':
101
+ case '0':
102
+ parsed_field = Qfalse;
103
+ break;
104
+ default:
105
+ RAISE_WITH_LOCATION(
106
+ meta->current_row,
107
+ meta->current_col,
108
+ field_str,
109
+ "Bad Boolean value. Valid values are strings where the first character is T/t/1 for true or F/f/0 for false."
110
+ );
111
+ }
112
+ break;
113
+ default:
114
+ RAISE_WITH_LOCATION(
115
+ meta->current_row,
116
+ meta->current_col,
117
+ field_str,
118
+ "Unknown deserializer '%c'.",
119
+ row_conversion
120
+ );
121
+ }
122
+ } else { // No conversion happens
123
+ parsed_field = rb_str_new(field_str, field_size); // field
124
+ }
125
+ }
126
+
127
+ // Assign the value to appropriate hash key if parsing into Hash
128
+ if (meta->row_as_hash) {
129
+ if (meta->current_col >= meta->num_columns) {
130
+ RAISE_WITH_LOCATION(
131
+ meta->current_row,
132
+ meta->current_col,
133
+ field_str,
134
+ "There are at least %d columns in a row, which is beyond the number of provided column names (%d).",
135
+ (int)meta->current_col + 1,
136
+ (int)meta->num_columns
137
+ );
138
+ } else {
139
+ rb_hash_aset(last_entry, meta->column_names[meta->current_col], parsed_field);
140
+ }
141
+ } else { // Parse into Array
142
+ rb_ary_push(last_entry, parsed_field); // result << field
143
+ }
144
+ }
145
+
146
+ // Increment column counter
147
+ meta->current_col++;
148
+ return;
149
+ }
150
+
151
+ /* This procedure is called for every line ending */
152
+ void end_of_line_callback(int last_char, void * data) {
153
+ struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
154
+
155
+ // If filters didn't match, current row parsing is reverted.
156
+ if (meta->skip_current_row) {
157
+ rb_ary_pop(*(meta->result)); // result.pop
158
+ meta->skip_current_row = false;
159
+ }
160
+
161
+ // Add a new empty array/hash for the next line unless EOF reached.
162
+ if (last_char != -1) {
163
+ if (meta->row_as_hash) {
164
+ rb_ary_push(*(meta->result), rb_hash_new()); // result << {}
165
+ } else {
166
+ rb_ary_push(*(meta->result), rb_ary_new()); // result << []
167
+ }
168
+ }
169
+
170
+ // Resetting column counter.
171
+ meta->current_col = 0;
172
+
173
+ // Incrementing row counter.
174
+ meta->current_row++;
175
+ return;
176
+ }
177
+
178
+ //// C API ////
179
+
180
+ /* The main method that handles parsing */
181
+ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
182
+ struct rcsv_metadata meta;
183
+ VALUE str, options, option;
184
+
185
+ struct csv_parser cp;
186
+ unsigned char csv_options = CSV_STRICT_FINI | CSV_APPEND_NULL;
187
+ char * csv_string;
188
+ size_t csv_string_len;
189
+ int error;
190
+ size_t i = 0;
191
+
192
+ // Setting up some sane defaults
193
+ meta.row_as_hash = false;
194
+ meta.skip_current_row = false;
195
+ meta.num_columns = 0;
196
+ meta.current_col = 0;
197
+ meta.current_row = 0;
198
+ meta.offset_rows = 0;
199
+ meta.num_only_rows = 0;
200
+ meta.num_row_defaults = 0;
201
+ meta.num_row_conversions = 0;
202
+ meta.only_rows = NULL;
203
+ meta.row_defaults = NULL;
204
+ meta.row_conversions = NULL;
205
+ meta.column_names = NULL;
206
+ meta.result = (VALUE[]){rb_ary_new()}; // []
207
+
208
+ // str is required, options is optional (pun intended)
209
+ rb_scan_args(argc, argv, "11", &str, &options);
210
+ csv_string = StringValuePtr(str);
211
+ csv_string_len = strlen(csv_string);
212
+
213
+ // options ||= nil
214
+ if (NIL_P(options)) {
215
+ options = rb_hash_new();
216
+ }
217
+
218
+ // By default, parsing is strict
219
+ option = rb_hash_aref(options, ID2SYM(rb_intern("nostrict")));
220
+ if (!option || (option == Qnil)) {
221
+ csv_options |= CSV_STRICT;
222
+ }
223
+
224
+ // Try to initialize libcsv
225
+ if (csv_init(&cp, csv_options) == -1) {
226
+ rb_raise(rcsv_parse_error, "Couldn't initialize libcsv");
227
+ }
228
+
229
+ // By default, parse as Array of Arrays
230
+ option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
231
+ if (option && (option != Qnil)) {
232
+ meta.row_as_hash = true;
233
+ }
234
+
235
+ // :col_sep sets the column separator, default is comma (,)
236
+ option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
237
+ if (option != Qnil) {
238
+ csv_set_delim(&cp, (unsigned char)*StringValuePtr(option));
239
+ }
240
+
241
+ // Specify how many rows to skip from the beginning of CSV
242
+ option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
243
+ if (option != Qnil) {
244
+ meta.offset_rows = (size_t)NUM2INT(option);
245
+ }
246
+
247
+ // :only_rows is a string mask where row is only parsed
248
+ // if its fields match those in the passed array.
249
+ // [nil, nil, "ABC"] skips all rows where 3rd column isn't equal to "ABC"
250
+ option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
251
+ if (option != Qnil) {
252
+ meta.num_only_rows = (size_t)RARRAY_LEN(option);
253
+ meta.only_rows = (char **)malloc(meta.num_only_rows * sizeof(char *));
254
+
255
+ for (i = 0; i < meta.num_only_rows; i++) {
256
+ VALUE only_row = rb_ary_entry(option, i);
257
+ if (only_row == Qnil) {
258
+ meta.only_rows[i] = NULL;
259
+ } else {
260
+ meta.only_rows[i] = StringValueCStr(only_row);
261
+ }
262
+ }
263
+ }
264
+
265
+ // :row_defaults is an array of default values that are assigned to fields containing empty strings
266
+ // according to matching field positions
267
+ option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
268
+ if (option != Qnil) {
269
+ meta.num_row_defaults = RARRAY_LEN(option);
270
+ meta.row_defaults = (VALUE*)malloc(meta.num_row_defaults * sizeof(VALUE*));
271
+
272
+ for (i = 0; i < meta.num_row_defaults; i++) {
273
+ VALUE row_default = rb_ary_entry(option, i);
274
+ meta.row_defaults[i] = row_default;
275
+ }
276
+ }
277
+
278
+ // :row_conversions specifies Ruby types that CSV field values should be converted into.
279
+ // Each char of row_conversions string represents Ruby type for CSV field with matching position.
280
+ option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
281
+ if (option != Qnil) {
282
+ meta.num_row_conversions = RSTRING_LEN(option);
283
+ meta.row_conversions = StringValuePtr(option);
284
+ }
285
+
286
+ // Column names should be declared explicitly when parsing fields as Hashes
287
+ if (meta.row_as_hash) { // Only matters for hash results
288
+ option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
289
+ if (option == Qnil) {
290
+ rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
291
+ } else {
292
+ meta.num_columns = (size_t)RARRAY_LEN(option);
293
+ meta.column_names = (VALUE*)malloc(meta.num_columns * sizeof(VALUE*));
294
+
295
+ for (i = 0; i < meta.num_columns; i++) {
296
+ meta.column_names[i] = rb_ary_entry(option, i);
297
+ }
298
+ }
299
+ }
300
+
301
+ // Initializing result with empty Array
302
+ if (meta.row_as_hash) {
303
+ rb_ary_push(*(meta.result), rb_hash_new()); // [{}]
304
+ } else {
305
+ rb_ary_push(*(meta.result), rb_ary_new()); // [[]]
306
+ }
307
+
308
+ // Actual parsing and error handling
309
+ if (csv_string_len != csv_parse(&cp, csv_string, strlen(csv_string),
310
+ &end_of_field_callback, &end_of_line_callback, &meta)) {
311
+ error = csv_error(&cp);
312
+ switch(error) {
313
+ case CSV_EPARSE:
314
+ rb_raise(rcsv_parse_error, "Error when parsing malformed data");
315
+ break;
316
+ case CSV_ENOMEM:
317
+ rb_raise(rcsv_parse_error, "No memory");
318
+ break;
319
+ case CSV_ETOOBIG:
320
+ rb_raise(rcsv_parse_error, "Field data is too large");
321
+ break;
322
+ case CSV_EINVALID:
323
+ rb_raise(rcsv_parse_error, "%s", (const char *)csv_strerror(error));
324
+ break;
325
+ default:
326
+ rb_raise(rcsv_parse_error, "Failed due to unknown reason");
327
+ }
328
+ }
329
+
330
+ // Flushing libcsv's buffer and freeing up allocated memory
331
+ csv_fini(&cp, &end_of_field_callback, &end_of_line_callback, &meta);
332
+ csv_free(&cp);
333
+
334
+ if (meta.only_rows != NULL) {
335
+ free(meta.only_rows);
336
+ }
337
+
338
+ if (meta.row_defaults != NULL) {
339
+ free(meta.row_defaults);
340
+ }
341
+
342
+ if (meta.column_names != NULL) {
343
+ free(meta.column_names);
344
+ }
345
+
346
+ // Remove the last row if it's empty. That happens if CSV file ends with a newline.
347
+ if (RARRAY_LEN(rb_ary_entry(*(meta.result), -1)) == 0) {
348
+ rb_ary_pop(*(meta.result));
349
+ }
350
+
351
+ // An array of arrays of strings is returned.
352
+ return *(meta.result);
353
+ }
354
+
355
+
356
+ /* Define Ruby API */
357
+ void Init_rcsv(void) {
358
+ VALUE klass = rb_define_class("Rcsv", rb_cObject); // class Rcsv; end
359
+
360
+ // Error is initialized through static variable in order to access it from rb_rcsv_raw_parse
361
+ rcsv_parse_error = rb_define_class_under(klass, "ParseError", rb_eStandardError);
362
+
363
+ // def Rcsv.raw_parse; ...; end
364
+ rb_define_singleton_method(klass, "raw_parse", rb_rcsv_raw_parse, -1);
365
+ }
data/ext/rcsv/test.rb ADDED
@@ -0,0 +1,5 @@
1
+ require './rcsv'
2
+
3
# Smoke test: parse two CSV lines (quoted fields, an empty field, a comma inside quotes)
# and print the resulting rows.
csv_lines = ['1,2,3,4,5,abc,"def"', '5,,7,8,9,"yo, kmon", foo']
parsed_rows = Rcsv.parse(csv_lines.join("\n"), :col_sep => ',')

puts parsed_rows.inspect
data/lib/lib_csv.rb ADDED
@@ -0,0 +1,88 @@
1
+ require 'ffi'
2
+
3
# Thin FFI binding to libcsv plus a convenience parser built on top of it.
class LibCsv
  extend FFI::Library
  ffi_lib 'libcsv'

  # Status codes reported by csv_error (values as defined in libcsv's csv.h)
  CSV_SUCCESS  = 0
  CSV_EPARSE   = 1 # Parse error in strict mode
  CSV_ENOMEM   = 2 # Out of memory while growing the field buffer
  CSV_ETOOBIG  = 3 # Field buffer would have to exceed SIZE_MAX
  CSV_EINVALID = 4 # Invalid status code

  # Mirror of libcsv's struct csv_parser; only used to allocate correctly sized memory.
  # The member name :qouted is kept as it was so existing references keep working.
  class CsvParser < FFI::Struct
    layout :pstate, :int,
           :qouted, :int,
           :spaces, :size_t,
           :entry_buf, :string,
           :entry_pos, :size_t,
           :entry_size, :size_t,
           :status, :int,
           :options, :uchar,
           :quote_char, :uchar,
           :delim_char, :uchar,
           :is_space, :pointer,
           :is_term, :pointer,
           :blk_size, :size_t,
           :malloc_func, :pointer,
           :realloc_func, :pointer,
           :free_func, :pointer
  end

  callback :end_of_field_callback, [:pointer, :size_t, :pointer], :void
  callback :end_of_record_callback, [:int, :pointer], :void

  attach_function :csv_init, [:pointer, :uchar], :int
  attach_function :csv_parse, [:pointer, :pointer, :size_t, :end_of_field_callback, :end_of_record_callback, :pointer], :size_t
  attach_function :csv_fini, [:pointer, :end_of_field_callback, :end_of_record_callback, :pointer], :int
  attach_function :csv_free, [:pointer], :void

  attach_function :csv_set_delim, [:pointer, :uchar], :void
  attach_function :csv_get_delim, [:pointer], :uchar

  attach_function :csv_error, [:pointer], :int
  attach_function :csv_strerror, [:int], :string

  # Parses a CSV string into an Array of rows, each row being an Array of
  # Strings (nil for a field libcsv reports as a NULL pointer).
  #
  # string  - CSV data
  # options - Hash; :col_sep (single-character String) overrides the delimiter
  #
  # Raises RuntimeError when libcsv cannot be initialized or reports an error.
  def self.parse(string, options = {})
    parser = CsvParser.new

    # The parser must not be touched unless initialization succeeded
    fail "Couldn't initialize libcsv" if csv_init(parser, 0) == -1

    begin
      if options[:col_sep]
        csv_set_delim(parser, options[:col_sep].ord)
      end

      rows = [[]]

      # The callbacks are held in locals so they stay referenced while libcsv uses them
      end_of_field_callback = Proc.new { |p_field, field_size, p_data|
        # Test the field pointer itself; its contents are data, not a pointer
        str = p_field.null? ? nil : p_field.read_string(field_size)
        rows.last << str
      }

      end_of_record_callback = Proc.new { |last_char, p_data|
        rows << [] unless last_char == -1
      }

      original_length = string.bytesize
      length = csv_parse(parser, string, original_length, end_of_field_callback, end_of_record_callback, nil)

      unless length == original_length
        case error = csv_error(parser)
        when CSV_EPARSE
          fail "Error when parsing malformed data"
        when CSV_ENOMEM
          fail "No memory"
        when CSV_ETOOBIG
          fail "Too large field data"
        when CSV_EINVALID
          fail csv_strerror(error)
        else
          fail "Failed due to unknown reason"
        end
      end

      csv_fini(parser, end_of_field_callback, end_of_record_callback, nil)

      # A trailing newline leaves an empty row behind
      rows.pop if rows.last == []

      return rows
    ensure
      # Release libcsv's internal buffer on success and on failure alike
      csv_free(parser)
    end
  end
end