rcsv 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/rcsv/rcsv.c ADDED
@@ -0,0 +1,365 @@
1
+ #include <stdbool.h>
2
+ #include <ruby.h>
3
+
4
+ #include "csv.h"
5
+
6
+ static VALUE rcsv_parse_error; // class Rcsv::ParseError << StandardError; end
7
+
8
// Raises Rcsv::ParseError with the message prefixed by "[row:column 'contents'] ",
// so the caller can see where in the CSV data a parse-time error occurred.
//   row, column - position of the offending field (converted to int for printing)
//   contents    - NUL-terminated text of the offending field
//   fmt, ...    - printf-style format (must be a string literal) and its arguments
// The expansion is a single expression with no trailing semicolon, so the macro
// behaves like an ordinary function call inside if/else and switch bodies.
#define RAISE_WITH_LOCATION(row, column, contents, fmt, ...) \
  rb_raise(rcsv_parse_error, "[%d:%d '%s'] " fmt, (int)(row), (int)(column), (char *)(contents), ##__VA_ARGS__)
11
+
12
+ struct rcsv_metadata {
13
+ // Derived from user-specified options
14
+ bool row_as_hash; // Used to return array of hashes rather than array of arrays
15
+ size_t offset_rows; // Number of rows to skip before parsing
16
+
17
+ char * row_conversions; // A pointer to string/array of row conversions char specifiers
18
+ char ** only_rows; // A pointer to array of strings for only_rows filter
19
+ VALUE * row_defaults; // A pointer to array of row defaults
20
+ VALUE * column_names; // A pointer to array of column names to be used with hashes
21
+
22
+ // Pointer options lengths
23
+ size_t num_row_conversions; // Number of converter types in row_conversions array
24
+ size_t num_only_rows; // Number of items in only_rows filter
25
+ size_t num_row_defaults; // Number of default values in row_defaults array
26
+ size_t num_columns; // Number of columns detected from column_names.size
27
+
28
+ // Internal state
29
+ bool skip_current_row; // Used by only_rows filter to skip parsing of the row remainder
30
+ size_t current_col; // Current column's index
31
+ size_t current_row; // Current row's index
32
+
33
+ VALUE * result; // A pointer to the parsed data
34
+ };
35
+
36
+ //// Internal callbacks ////
37
+
38
+ /* This procedure is called for every parsed field */
39
+ void end_of_field_callback(void * field, size_t field_size, void * data) {
40
+ const char * field_str = (char *)field;
41
+ struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
42
+ char row_conversion = 0;
43
+ VALUE parsed_field;
44
+ VALUE last_entry = rb_ary_entry(*(meta->result), -1); // result.last
45
+
46
+ // No need to parse anything until the end of the line if skip_current_row is set
47
+ if (meta->skip_current_row) {
48
+ return;
49
+ }
50
+
51
+ // Skip the row if its position is less than specifed offset
52
+ if (meta->current_row < meta->offset_rows) {
53
+ meta->skip_current_row = true;
54
+ return;
55
+ }
56
+
57
+ // Filter by string row values listed in meta->only_rows.
58
+ if ((meta->only_rows != NULL) &&
59
+ (meta->current_col < meta->num_only_rows) &&
60
+ (meta->only_rows[meta->current_col] != NULL) &&
61
+ (strcmp(meta->only_rows[meta->current_col], field_str))) {
62
+ meta->skip_current_row = true;
63
+ return;
64
+ }
65
+
66
+ // Get row conversion char specifier
67
+ if (meta->current_col < meta->num_row_conversions) {
68
+ row_conversion = (char)meta->row_conversions[meta->current_col];
69
+ }
70
+
71
+ // Convert the field from string into Ruby type specified by row_conversion
72
+ if (row_conversion != ' ') { // spacebar skips the column
73
+ if (field_size == 0) {
74
+ // Assigning appropriate default value if applicable.
75
+ if (meta->current_col < meta->num_row_defaults) {
76
+ parsed_field = meta->row_defaults[meta->current_col];
77
+ } else { // By default, default is nil
78
+ parsed_field = Qnil;
79
+ }
80
+ } else {
81
+ if (meta->current_col < meta->num_row_conversions) {
82
+ switch (row_conversion){
83
+ case 's': // String
84
+ parsed_field = rb_str_new(field_str, field_size);
85
+ break;
86
+ case 'i': // Integer
87
+ parsed_field = INT2NUM(atol(field_str));
88
+ break;
89
+ case 'f': // Float
90
+ parsed_field = rb_float_new(atof(field_str));
91
+ break;
92
+ case 'b': // TrueClass/FalseClass
93
+ switch (field_str[0]) {
94
+ case 't':
95
+ case 'T':
96
+ case '1':
97
+ parsed_field = Qtrue;
98
+ break;
99
+ case 'f':
100
+ case 'F':
101
+ case '0':
102
+ parsed_field = Qfalse;
103
+ break;
104
+ default:
105
+ RAISE_WITH_LOCATION(
106
+ meta->current_row,
107
+ meta->current_col,
108
+ field_str,
109
+ "Bad Boolean value. Valid values are strings where the first character is T/t/1 for true or F/f/0 for false."
110
+ );
111
+ }
112
+ break;
113
+ default:
114
+ RAISE_WITH_LOCATION(
115
+ meta->current_row,
116
+ meta->current_col,
117
+ field_str,
118
+ "Unknown deserializer '%c'.",
119
+ row_conversion
120
+ );
121
+ }
122
+ } else { // No conversion happens
123
+ parsed_field = rb_str_new(field_str, field_size); // field
124
+ }
125
+ }
126
+
127
+ // Assign the value to appropriate hash key if parsing into Hash
128
+ if (meta->row_as_hash) {
129
+ if (meta->current_col >= meta->num_columns) {
130
+ RAISE_WITH_LOCATION(
131
+ meta->current_row,
132
+ meta->current_col,
133
+ field_str,
134
+ "There are at least %d columns in a row, which is beyond the number of provided column names (%d).",
135
+ (int)meta->current_col + 1,
136
+ (int)meta->num_columns
137
+ );
138
+ } else {
139
+ rb_hash_aset(last_entry, meta->column_names[meta->current_col], parsed_field);
140
+ }
141
+ } else { // Parse into Array
142
+ rb_ary_push(last_entry, parsed_field); // result << field
143
+ }
144
+ }
145
+
146
+ // Increment column counter
147
+ meta->current_col++;
148
+ return;
149
+ }
150
+
151
+ /* This procedure is called for every line ending */
152
+ void end_of_line_callback(int last_char, void * data) {
153
+ struct rcsv_metadata * meta = (struct rcsv_metadata *) data;
154
+
155
+ // If filters didn't match, current row parsing is reverted.
156
+ if (meta->skip_current_row) {
157
+ rb_ary_pop(*(meta->result)); // result.pop
158
+ meta->skip_current_row = false;
159
+ }
160
+
161
+ // Add a new empty array/hash for the next line unless EOF reached.
162
+ if (last_char != -1) {
163
+ if (meta->row_as_hash) {
164
+ rb_ary_push(*(meta->result), rb_hash_new()); // result << {}
165
+ } else {
166
+ rb_ary_push(*(meta->result), rb_ary_new()); // result << []
167
+ }
168
+ }
169
+
170
+ // Resetting column counter.
171
+ meta->current_col = 0;
172
+
173
+ // Incrementing row counter.
174
+ meta->current_row++;
175
+ return;
176
+ }
177
+
178
+ //// C API ////
179
+
180
+ /* The main method that handles parsing */
181
+ static VALUE rb_rcsv_raw_parse(int argc, VALUE * argv, VALUE self) {
182
+ struct rcsv_metadata meta;
183
+ VALUE str, options, option;
184
+
185
+ struct csv_parser cp;
186
+ unsigned char csv_options = CSV_STRICT_FINI | CSV_APPEND_NULL;
187
+ char * csv_string;
188
+ size_t csv_string_len;
189
+ int error;
190
+ size_t i = 0;
191
+
192
+ // Setting up some sane defaults
193
+ meta.row_as_hash = false;
194
+ meta.skip_current_row = false;
195
+ meta.num_columns = 0;
196
+ meta.current_col = 0;
197
+ meta.current_row = 0;
198
+ meta.offset_rows = 0;
199
+ meta.num_only_rows = 0;
200
+ meta.num_row_defaults = 0;
201
+ meta.num_row_conversions = 0;
202
+ meta.only_rows = NULL;
203
+ meta.row_defaults = NULL;
204
+ meta.row_conversions = NULL;
205
+ meta.column_names = NULL;
206
+ meta.result = (VALUE[]){rb_ary_new()}; // []
207
+
208
+ // str is required, options is optional (pun intended)
209
+ rb_scan_args(argc, argv, "11", &str, &options);
210
+ csv_string = StringValuePtr(str);
211
+ csv_string_len = strlen(csv_string);
212
+
213
+ // options ||= nil
214
+ if (NIL_P(options)) {
215
+ options = rb_hash_new();
216
+ }
217
+
218
+ // By default, parsing is strict
219
+ option = rb_hash_aref(options, ID2SYM(rb_intern("nostrict")));
220
+ if (!option || (option == Qnil)) {
221
+ csv_options |= CSV_STRICT;
222
+ }
223
+
224
+ // Try to initialize libcsv
225
+ if (csv_init(&cp, csv_options) == -1) {
226
+ rb_raise(rcsv_parse_error, "Couldn't initialize libcsv");
227
+ }
228
+
229
+ // By default, parse as Array of Arrays
230
+ option = rb_hash_aref(options, ID2SYM(rb_intern("row_as_hash")));
231
+ if (option && (option != Qnil)) {
232
+ meta.row_as_hash = true;
233
+ }
234
+
235
+ // :col_sep sets the column separator, default is comma (,)
236
+ option = rb_hash_aref(options, ID2SYM(rb_intern("col_sep")));
237
+ if (option != Qnil) {
238
+ csv_set_delim(&cp, (unsigned char)*StringValuePtr(option));
239
+ }
240
+
241
+ // Specify how many rows to skip from the beginning of CSV
242
+ option = rb_hash_aref(options, ID2SYM(rb_intern("offset_rows")));
243
+ if (option != Qnil) {
244
+ meta.offset_rows = (size_t)NUM2INT(option);
245
+ }
246
+
247
+ // :only_rows is a string mask where row is only parsed
248
+ // if its fields match those in the passed array.
249
+ // [nil, nil, "ABC"] skips all rows where 3rd column isn't equal to "ABC"
250
+ option = rb_hash_aref(options, ID2SYM(rb_intern("only_rows")));
251
+ if (option != Qnil) {
252
+ meta.num_only_rows = (size_t)RARRAY_LEN(option);
253
+ meta.only_rows = (char **)malloc(meta.num_only_rows * sizeof(char *));
254
+
255
+ for (i = 0; i < meta.num_only_rows; i++) {
256
+ VALUE only_row = rb_ary_entry(option, i);
257
+ if (only_row == Qnil) {
258
+ meta.only_rows[i] = NULL;
259
+ } else {
260
+ meta.only_rows[i] = StringValueCStr(only_row);
261
+ }
262
+ }
263
+ }
264
+
265
+ // :row_defaults is an array of default values that are assigned to fields containing empty strings
266
+ // according to matching field positions
267
+ option = rb_hash_aref(options, ID2SYM(rb_intern("row_defaults")));
268
+ if (option != Qnil) {
269
+ meta.num_row_defaults = RARRAY_LEN(option);
270
+ meta.row_defaults = (VALUE*)malloc(meta.num_row_defaults * sizeof(VALUE*));
271
+
272
+ for (i = 0; i < meta.num_row_defaults; i++) {
273
+ VALUE row_default = rb_ary_entry(option, i);
274
+ meta.row_defaults[i] = row_default;
275
+ }
276
+ }
277
+
278
+ // :row_conversions specifies Ruby types that CSV field values should be converted into.
279
+ // Each char of row_conversions string represents Ruby type for CSV field with matching position.
280
+ option = rb_hash_aref(options, ID2SYM(rb_intern("row_conversions")));
281
+ if (option != Qnil) {
282
+ meta.num_row_conversions = RSTRING_LEN(option);
283
+ meta.row_conversions = StringValuePtr(option);
284
+ }
285
+
286
+ // Column names should be declared explicitly when parsing fields as Hashes
287
+ if (meta.row_as_hash) { // Only matters for hash results
288
+ option = rb_hash_aref(options, ID2SYM(rb_intern("column_names")));
289
+ if (option == Qnil) {
290
+ rb_raise(rcsv_parse_error, ":row_as_hash requires :column_names to be set.");
291
+ } else {
292
+ meta.num_columns = (size_t)RARRAY_LEN(option);
293
+ meta.column_names = (VALUE*)malloc(meta.num_columns * sizeof(VALUE*));
294
+
295
+ for (i = 0; i < meta.num_columns; i++) {
296
+ meta.column_names[i] = rb_ary_entry(option, i);
297
+ }
298
+ }
299
+ }
300
+
301
+ // Initializing result with empty Array
302
+ if (meta.row_as_hash) {
303
+ rb_ary_push(*(meta.result), rb_hash_new()); // [{}]
304
+ } else {
305
+ rb_ary_push(*(meta.result), rb_ary_new()); // [[]]
306
+ }
307
+
308
+ // Actual parsing and error handling
309
+ if (csv_string_len != csv_parse(&cp, csv_string, strlen(csv_string),
310
+ &end_of_field_callback, &end_of_line_callback, &meta)) {
311
+ error = csv_error(&cp);
312
+ switch(error) {
313
+ case CSV_EPARSE:
314
+ rb_raise(rcsv_parse_error, "Error when parsing malformed data");
315
+ break;
316
+ case CSV_ENOMEM:
317
+ rb_raise(rcsv_parse_error, "No memory");
318
+ break;
319
+ case CSV_ETOOBIG:
320
+ rb_raise(rcsv_parse_error, "Field data is too large");
321
+ break;
322
+ case CSV_EINVALID:
323
+ rb_raise(rcsv_parse_error, "%s", (const char *)csv_strerror(error));
324
+ break;
325
+ default:
326
+ rb_raise(rcsv_parse_error, "Failed due to unknown reason");
327
+ }
328
+ }
329
+
330
+ // Flushing libcsv's buffer and freeing up allocated memory
331
+ csv_fini(&cp, &end_of_field_callback, &end_of_line_callback, &meta);
332
+ csv_free(&cp);
333
+
334
+ if (meta.only_rows != NULL) {
335
+ free(meta.only_rows);
336
+ }
337
+
338
+ if (meta.row_defaults != NULL) {
339
+ free(meta.row_defaults);
340
+ }
341
+
342
+ if (meta.column_names != NULL) {
343
+ free(meta.column_names);
344
+ }
345
+
346
+ // Remove the last row if it's empty. That happens if CSV file ends with a newline.
347
+ if (RARRAY_LEN(rb_ary_entry(*(meta.result), -1)) == 0) {
348
+ rb_ary_pop(*(meta.result));
349
+ }
350
+
351
+ // An array of arrays of strings is returned.
352
+ return *(meta.result);
353
+ }
354
+
355
+
356
+ /* Define Ruby API */
357
+ void Init_rcsv(void) {
358
+ VALUE klass = rb_define_class("Rcsv", rb_cObject); // class Rcsv; end
359
+
360
+ // Error is initialized through static variable in order to access it from rb_rcsv_raw_parse
361
+ rcsv_parse_error = rb_define_class_under(klass, "ParseError", rb_eStandardError);
362
+
363
+ // def Rcsv.raw_parse; ...; end
364
+ rb_define_singleton_method(klass, "raw_parse", rb_rcsv_raw_parse, -1);
365
+ }
data/ext/rcsv/test.rb ADDED
@@ -0,0 +1,5 @@
1
require './rcsv'

# Two sample records joined by a newline; the second one contains an empty
# field, a quoted field with an embedded comma and a field with a leading space.
sample_records = ['1,2,3,4,5,abc,"def"', '5,,7,8,9,"yo, kmon", foo']
parsed_rows = Rcsv.parse(sample_records.join("\n"), :col_sep => ',')

puts parsed_rows.inspect
data/lib/lib_csv.rb ADDED
@@ -0,0 +1,88 @@
1
+ require 'ffi'
2
+
3
# Thin FFI binding around the libcsv C library, plus a convenience
# LibCsv.parse that turns a CSV string into an array of rows.
class LibCsv
  extend FFI::Library
  ffi_lib 'libcsv'

  # Status codes reported by csv_error (values as defined in libcsv's csv.h).
  CSV_SUCCESS  = 0
  CSV_EPARSE   = 1
  CSV_ENOMEM   = 2
  CSV_ETOOBIG  = 3
  CSV_EINVALID = 4

  # Memory layout used for libcsv's parser struct.
  # NOTE: the :qouted spelling is kept as-is so existing member lookups keep working.
  class CsvParser < FFI::Struct
    layout :pstate, :int,
           :qouted, :int,
           :spaces, :size_t,
           :entry_buf, :pointer, # raw field buffer, not a NUL-terminated Ruby-readable string
           :entry_pos, :size_t,
           :entry_size, :size_t,
           :status, :int,
           :options, :uchar,
           :quote_char, :uchar,
           :delim_char, :uchar,
           :is_space, :pointer,
           :is_term, :pointer,
           :blk_size, :size_t,
           :malloc_func, :pointer,
           :realloc_func, :pointer,
           :free_func, :pointer
  end

  callback :end_of_field_callback, [:pointer, :size_t, :pointer], :void
  callback :end_of_record_callback, [:int, :pointer], :void

  attach_function :csv_init, [:pointer, :uchar], :int
  attach_function :csv_parse, [:pointer, :pointer, :size_t, :end_of_field_callback, :end_of_record_callback, :pointer], :size_t
  attach_function :csv_fini, [:pointer, :end_of_field_callback, :end_of_record_callback, :pointer], :int
  attach_function :csv_free, [:pointer], :void

  attach_function :csv_set_delim, [:pointer, :uchar], :void
  attach_function :csv_get_delim, [:pointer], :uchar

  attach_function :csv_error, [:pointer], :int
  attach_function :csv_strerror, [:int], :string

  # Parses +string+ as CSV and returns an array of rows, each row being an
  # array of field strings (nil for a field libcsv reports with a NULL pointer).
  #
  # options:
  #   :col_sep - single-character column separator (libcsv's default is used when absent)
  #
  # Raises RuntimeError when libcsv cannot be initialized or when it stops
  # before consuming the whole input.
  def self.parse(string, options = {})
    pointer = FFI::MemoryPointer.new :char, CsvParser.size, false
    parser = CsvParser.new pointer

    # The parser must be initialized successfully before any other call touches it.
    fail "Couldn't initialize libcsv" if csv_init(parser, 0) == -1

    begin
      if options[:col_sep]
        csv_set_delim(parser, options[:col_sep].ord)
      end

      result = [[]]

      end_of_field_callback = Proc.new { |p_field, field_size, p_data|
        # Test the field pointer itself; its contents are field bytes, not a pointer.
        str = p_field.null? ? nil : p_field.read_string(field_size)
        result.last << str
      }

      end_of_record_callback = Proc.new { |last_char, p_data|
        result << [] unless last_char == -1
      }

      original_length = string.bytesize
      length = csv_parse(parser, string, original_length, end_of_field_callback, end_of_record_callback, nil)

      unless length == original_length
        case error = csv_error(parser)
        when CSV_EPARSE
          fail "Error when parsing malformed data"
        when CSV_ENOMEM
          fail "No memory"
        when CSV_ETOOBIG
          fail "Too large field data"
        when CSV_EINVALID
          fail csv_strerror(error)
        else
          fail "Failed due to unknown reason"
        end
      end

      csv_fini(parser, end_of_field_callback, end_of_record_callback, nil)
      result.pop if result.last == []

      result
    ensure
      # Release libcsv's internal buffer on success and on failure alike.
      csv_free(parser)
    end
  end
end