rcsv 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in rcsv.gemspec
4
+ gemspec
5
+
6
+ gem "rake-compiler", :group => :development
data/Gemfile.lock ADDED
@@ -0,0 +1,18 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rcsv (0.0.5)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ rake (0.9.2.2)
10
+ rake-compiler (0.8.1)
11
+ rake
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ rake-compiler
18
+ rcsv!
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Copyright (c) 2012, Fiksu, Inc.
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ o Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ o Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer in the
13
+ documentation and/or other materials provided with the
14
+ distribution.
15
+
16
+ o Fiksu, Inc. nor the names of its contributors may be used to
17
+ endorse or promote products derived from this software without
18
+ specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,148 @@
1
+ # Rcsv
2
+
3
+ Rcsv is a fast CSV parsing library for MRI Ruby. Tested on REE 1.8.7 and Ruby 1.9.3.
4
+
5
+ Unlike many other gems that implement their own parsers, Rcsv uses libcsv 3.1.0 (http://sourceforge.net/projects/libcsv/). As long as libcsv's API remains stable, getting Rcsv to use a newer libcsv version is as simple as updating two files (csv.h and libcsv.c).
6
+
7
+ ## Benchmarks
8
+ user system total real
9
+ FasterCSV 0.580000 0.000000 0.580000 ( 0.618837)
10
+ rcsv 0.060000 0.000000 0.060000 ( 0.062248)
11
+
12
+ ## License
13
+
14
+ Rcsv itself is distributed under a BSD-derived license (see LICENSE), except for the included csv.h and libcsv.c source files, which are distributed under LGPL v2.1 (see COPYING.LESSER). Libcsv sources were not modified in any manner.
15
+
16
+ ## Installation
17
+
18
+ Add this line to your application's Gemfile:
19
+
20
+ gem 'rcsv'
21
+
22
+ And then execute:
23
+
24
+ $ bundle
25
+
26
+ Or install it yourself as:
27
+
28
+ $ gem install rcsv
29
+
30
+
31
+ ## Building the latest source
32
+
33
+ First, check out the master branch. Then cd there and run:
34
+
35
+ $ bundle # Installs development dependencies
36
+ $ bundle exec rake # Runs tests
37
+ $ gem build rcsv.gemspec # Builds the gem
38
+
39
+ ## Usage
40
+
41
+ Currently, Rcsv only supports CSV parsing. CSV write support is planned.
42
+
43
+ Quickstart:
44
+
45
+ parsed = Rcsv.parse(csv_data)
46
+
47
+
48
+ Rcsv class exposes a class method *parse* that accepts a CSV string as its first parameter and options hash as its second parameter.
49
+
50
+
51
+ Options supported:
52
+
53
+ ### :column_separator
54
+
55
+ A single-character string that is used as a separator. Default is ",".
56
+
57
+ ### :nostrict
58
+
59
+ A boolean flag. When enabled, oddly quoted CSV data is parsed without raising exceptions. Disabled by default.
60
+
61
+ Anything that does not conform to http://www.ietf.org/rfc/rfc4180.txt is best parsed with this option enabled.
62
+
63
+ ### :offset_rows
64
+
65
+ A non-negative integer that specifies how many rows should be skipped, counting from the beginning. Default is 0.
66
+
67
+ ### :columns
68
+ A hash that contains per-column parsing instructions. By default, every CSV cell is parsed as a raw string without conversions. Empty strings are parsed as nils.
69
+
70
+ If CSV has a header, :columns keys can be strings that are equal to column names in the header. If there is no header, keys should represent integer column positions.
71
+
72
+ :columns values are in turn hashes that provide parsing options:
73
+
74
+ * :alias - Object of any type (though usually a Symbol) that is used as the key representing the column name when :row_as_hash is set.
75
+ * :type - A Ruby Symbol that specifies Ruby data type that CSV cell value should be converted into. Supported types: :int, :float, :string, :bool. :string is the default.
76
+ * :default - Object of any type (though usually of the same type that is specified by :type option). If CSV doesn't have any value for a cell, this default value is used.
77
+ * :match - A string. If set, makes Rcsv skip all the rows where any column doesn't match its :match value. Useful for filtering data.
78
+
79
+
80
+ ### :header
81
+ A Ruby symbol that specifies how CSV header should be processed. Accepted values:
82
+
83
+ * :use (default) - If :columns is set, instructs Rcsv to parse the first CSV line and use column names from there as :columns keys. Ignores the header when :columns is not set.
84
+
85
+ * :skip - Skips the header, treats :columns keys as column positions.
86
+
87
+ * :none - Tells Rcsv that CSV header is not present. :columns keys are treated as column positions.
88
+
89
+ ### :row_as_hash
90
+ A boolean flag. Disabled by default.
91
+ When enabled, *parse* return value is represented as array of hashes. If :header is set to :use, keys for hashes are either string column names from CSV header or their aliases. Otherwise, column indexes are used.
92
+ When :row_as_hash is disabled, return value is represented as array of arrays.
93
+
94
+ ### :only_listed_columns
95
+ A boolean flag. If enabled, only parses columns that are listed in :columns. Disabled by default.
96
+
97
+
98
+ ## Examples
99
+
100
+ This example parses a 3-column CSV file and only returns parsed rows where "Age" values are set to "35".
101
+
102
+ Rcsv.parse some_csv, :row_as_hash => true,
103
+ :columns => {
104
+ 'First Name' => { :alias => :first_name, :default => "Unknown" },
105
+ 'Last Name' => { :alias => :last_name, :default => "Unknown"},
106
+ 'Age' => { :alias => :age, :type => :int, :match => "35"}
107
+ }
108
+
109
+ The result would look like this:
110
+
111
+ [
112
+ { :first_name => "Mary", :last_name => "Jane", :age => 35 },
113
+ { :first_name => "Unknown", :last_name => "Alien", :age => 35}
114
+ ]
115
+
116
+ Another example, for a miserable headerless Tab-separated CSV:
117
+
118
+ Rcsv.parse some_csv, :column_separator => "\t",
119
+ :header => :none,
120
+ :columns => {
121
+ 1 => { :type => :float, :default => 0 }
122
+ }
123
+
124
+ The result would look like this:
125
+
126
+ [
127
+ [ "Very hot", 3.7, "Mercury" ],
128
+ [ "Very hot and cloudy", 8.87, "Venus" ],
129
+ [ "Just about ok", 9.78, "Earth"],
130
+ [ nil, 0, "Vacuum" ]
131
+ ]
132
+
133
+
134
+ ## To do
135
+
136
+ * More specs for boolean values
137
+ * Specs for Ruby parse
138
+ * Add custom Ruby callbacks (if block is passed)
139
+ * Add CSV write support
140
+
141
+
142
+ ## Contributing
143
+
144
+ 1. Fork it
145
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
146
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
147
+ 4. Push to the branch (`git push origin my-new-feature`)
148
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require "rake/extensiontask"
4
+ require 'rake/testtask'
5
+
6
+ Rake::ExtensionTask.new('rcsv') do |ext|
7
+ ext.lib_dir = 'lib/rcsv'
8
+ end
9
+
10
+ Rake::TestTask.new do |t|
11
+ t.libs << 'test'
12
+ end
13
+
14
+ desc "Recompile native code"
15
+ task :recompile => [:clobber, :compile] # clean build
16
+
17
+ desc "Recompile native code and run tests"
18
+ task :default => [:recompile, :test] # clean testing FTW
data/bench.rb ADDED
@@ -0,0 +1,32 @@
1
# Benchmark: parse the same CSV document TIMES times with Ruby's CSV library
# and with Rcsv, and print the timings side by side.
require 'benchmark'

require 'csv'
require 'rcsv'

# Number of parse passes per report.
TIMES = 10

# That CSV file contains "broken" headers that FasterCSV doesn't like.
# Remove all quotes from the header in order to fix this benchmark.
# Testing against a much bigger CSV file would be even better.
data = File.read('./test/test_rcsv.csv')

Benchmark.bmbm do |b|
  b.report("FasterCSV") do
    TIMES.times { CSV.parse(data) }
  end

  b.report("rcsv") do
    TIMES.times { Rcsv.parse(data) }
  end
end
data/ext/rcsv/csv.h ADDED
@@ -0,0 +1,86 @@
1
+ #ifndef LIBCSV_H__
2
+ #define LIBCSV_H__
3
+ #include <stdlib.h>
4
+ #include <stdio.h>
5
+
6
+ #ifdef __cplusplus
7
+ extern "C" {
8
+ #endif
9
+
10
+ #define CSV_MAJOR 3
11
+ #define CSV_MINOR 1
12
+ #define CSV_RELEASE 0
13
+
14
+ /* Error Codes */
15
+ #define CSV_SUCCESS 0
16
+ #define CSV_EPARSE 1 /* Parse error in strict mode */
17
+ #define CSV_ENOMEM 2 /* Out of memory while increasing buffer size */
18
+ #define CSV_ETOOBIG 3 /* Buffer larger than SIZE_MAX needed */
19
+ #define CSV_EINVALID 4 /* Invalid code,should never be received from csv_error*/
20
+
21
+
22
+ /* parser options */
23
+ #define CSV_STRICT 1 /* enable strict mode */
24
+ #define CSV_REPALL_NL 2 /* report all unquoted carriage returns and linefeeds */
25
+ #define CSV_STRICT_FINI 4 /* causes csv_fini to return CSV_EPARSE if last
26
+ field is quoted and doesn't contain an ending
27
+ quote */
28
+ #define CSV_APPEND_NULL 8 /* Ensure that all fields are null-terminated */
29
+
30
+
31
+ /* Character values */
32
+ #define CSV_TAB 0x09
33
+ #define CSV_SPACE 0x20
34
+ #define CSV_CR 0x0d
35
+ #define CSV_LF 0x0a
36
+ #define CSV_COMMA 0x2c
37
+ #define CSV_QUOTE 0x22
38
+
39
/* All state belonging to one parser instance; csv_init() fills in every member. */
struct csv_parser {
  /* --- parsing state --- */
  int pstate;                 /* Current state of the parser state machine */
  int quoted;                 /* Non-zero while the current field is a quoted field */
  size_t spaces;              /* Count of contiguous spaces after a quote or at the end of a non-quoted field */

  /* --- buffer holding the field being collected --- */
  unsigned char * entry_buf;  /* Entry buffer (NULL until first allocated) */
  size_t entry_pos;           /* Write position in entry_buf, i.e. current length of the entry */
  size_t entry_size;          /* Allocated size of entry_buf */

  int status;                 /* Status of the last operation (a CSV_* error code) */

  /* --- configuration --- */
  unsigned char options;      /* Bitwise OR of parser option flags */
  unsigned char quote_char;   /* Quote character */
  unsigned char delim_char;   /* Field delimiter character */
  int (*is_space)(unsigned char);  /* Optional custom "is space" predicate; NULL selects the built-in test */
  int (*is_term)(unsigned char);   /* Optional custom "is line terminator" predicate; NULL selects the built-in test */
  size_t blk_size;            /* Number of bytes by which entry_buf is grown */

  /* --- memory management hooks --- */
  void *(*malloc_func)(size_t);           /* Set to NULL by csv_init() */
  void *(*realloc_func)(void *, size_t);  /* Used to grow entry_buf */
  void (*free_func)(void *);              /* Used to release entry_buf */
};
57
+
58
+ /* Function Prototypes */
59
+ int csv_init(struct csv_parser *p, unsigned char options);
60
+ int csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int, void *), void *data);
61
+ void csv_free(struct csv_parser *p);
62
+ int csv_error(struct csv_parser *p);
63
+ char * csv_strerror(int error);
64
+ size_t csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int, void *), void *data);
65
+ size_t csv_write(void *dest, size_t dest_size, const void *src, size_t src_size);
66
+ int csv_fwrite(FILE *fp, const void *src, size_t src_size);
67
+ size_t csv_write2(void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote);
68
+ int csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote);
69
+ int csv_get_opts(struct csv_parser *p);
70
+ int csv_set_opts(struct csv_parser *p, unsigned char options);
71
+ void csv_set_delim(struct csv_parser *p, unsigned char c);
72
+ void csv_set_quote(struct csv_parser *p, unsigned char c);
73
+ unsigned char csv_get_delim(struct csv_parser *p);
74
+ unsigned char csv_get_quote(struct csv_parser *p);
75
+ void csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char));
76
+ void csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char));
77
+ void csv_set_realloc_func(struct csv_parser *p, void *(*)(void *, size_t));
78
+ void csv_set_free_func(struct csv_parser *p, void (*)(void *));
79
+ void csv_set_blk_size(struct csv_parser *p, size_t);
80
+ size_t csv_get_buffer_size(struct csv_parser *p);
81
+
82
+ #ifdef __cplusplus
83
+ }
84
+ #endif
85
+
86
+ #endif
@@ -0,0 +1,3 @@
1
+ require 'mkmf'
2
+
3
+ create_makefile('rcsv/rcsv')
data/ext/rcsv/libcsv.c ADDED
@@ -0,0 +1,579 @@
1
+ /*
2
+ libcsv - parse and write csv data
3
+ Copyright (C) 2008 Robert Gamble
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
/* SIZE_MAX comes from <stdint.h> on C99 and later.  C89 has neither the
   header nor the macro, so fall back to the largest value a size_t can hold.
   The #ifndef also covers environments whose <stdint.h> does not expose
   SIZE_MAX, and prevents a redefinition when it does. */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#  include <stdint.h>
#endif
#ifndef SIZE_MAX
#  define SIZE_MAX ((size_t)-1)
#endif
25
+
26
+ #include "csv.h"
27
+
28
+ #define VERSION "3.0.2"
29
+
30
+ #define ROW_NOT_BEGUN 0
31
+ #define FIELD_NOT_BEGUN 1
32
+ #define FIELD_BEGUN 2
33
+ #define FIELD_MIGHT_HAVE_ENDED 3
34
+
35
+ /*
36
+ Explanation of states
37
+ ROW_NOT_BEGUN There have not been any fields encountered for this row
38
+ FIELD_NOT_BEGUN There have been fields but we are currently not in one
39
+ FIELD_BEGUN We are in a field
40
+ FIELD_MIGHT_HAVE_ENDED
41
+ We encountered a double quote inside a quoted field, the
42
+ field is either ended or the quote is literal
43
+ */
44
+
45
+ #define MEM_BLK_SIZE 128
46
+
47
/*
  Helper macros for the parsing routines.  They are NOT self-contained: each
  one reads and writes local variables of the function that expands it
  (quoted, spaces, entry_pos, pstate) and uses that function's cb1, cb2 and
  data parameters.
*/

/* Finish the current field: drop trailing spaces from a non-quoted field,
   NUL-terminate it when CSV_APPEND_NULL is set, hand it to the field
   callback (if any) and reset the per-field state. */
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      ((p)->entry_buf[entry_pos]) = '\0'; \
    if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* Finish the current row: pass the terminating character c to the row
   callback (if any) and reset the state for a new row. */
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2((c), data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* Append character c to the entry buffer.  The caller must already have
   made sure there is room for it. */
#define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
68
+
69
/* Human-readable messages, indexed by the CSV_* error codes */
static char *csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
74
+
75
+ int
76
+ csv_error(struct csv_parser *p)
77
+ {
78
+ /* Return the current status of the parser */
79
+ return p->status;
80
+ }
81
+
82
+ char *
83
+ csv_strerror(int status)
84
+ {
85
+ /* Return a textual description of status */
86
+ if (status >= CSV_EINVALID || status < 0)
87
+ return csv_errors[CSV_EINVALID];
88
+ else
89
+ return csv_errors[status];
90
+ }
91
+
92
+ int
93
+ csv_get_opts(struct csv_parser *p)
94
+ {
95
+ /* Return the currently set options of parser */
96
+ if (p == NULL)
97
+ return -1;
98
+
99
+ return p->options;
100
+ }
101
+
102
+ int
103
+ csv_set_opts(struct csv_parser *p, unsigned char options)
104
+ {
105
+ /* Set the options */
106
+ if (p == NULL)
107
+ return -1;
108
+
109
+ p->options = options;
110
+ return 0;
111
+ }
112
+
113
+ int
114
+ csv_init(struct csv_parser *p, unsigned char options)
115
+ {
116
+ /* Initialize a csv_parser object returns 0 on success, -1 on error */
117
+ if (p == NULL)
118
+ return -1;
119
+
120
+ p->entry_buf = NULL;
121
+ p->pstate = ROW_NOT_BEGUN;
122
+ p->quoted = 0;
123
+ p->spaces = 0;
124
+ p->entry_pos = 0;
125
+ p->entry_size = 0;
126
+ p->status = 0;
127
+ p->options = options;
128
+ p->quote_char = CSV_QUOTE;
129
+ p->delim_char = CSV_COMMA;
130
+ p->is_space = NULL;
131
+ p->is_term = NULL;
132
+ p->blk_size = MEM_BLK_SIZE;
133
+ p->malloc_func = NULL;
134
+ p->realloc_func = realloc;
135
+ p->free_func = free;
136
+
137
+ return 0;
138
+ }
139
+
140
+ void
141
+ csv_free(struct csv_parser *p)
142
+ {
143
+ /* Free the entry_buffer of csv_parser object */
144
+ if (p == NULL)
145
+ return;
146
+
147
+ if (p->entry_buf)
148
+ p->free_func(p->entry_buf);
149
+
150
+ p->entry_buf = NULL;
151
+ p->entry_size = 0;
152
+
153
+ return;
154
+ }
155
+
156
+ int
157
+ csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
158
+ {
159
+ /* Finalize parsing. Needed, for example, when file does not end in a newline */
160
+ int quoted = p->quoted;
161
+ int pstate = p->pstate;
162
+ size_t spaces = p->spaces;
163
+ size_t entry_pos = p->entry_pos;
164
+
165
+ if (p == NULL)
166
+ return -1;
167
+
168
+
169
+ if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
170
+ /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
171
+ p->status = CSV_EPARSE;
172
+ return -1;
173
+ }
174
+
175
+ switch (p->pstate) {
176
+ case FIELD_MIGHT_HAVE_ENDED:
177
+ p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
178
+ /* Fall-through */
179
+ case FIELD_NOT_BEGUN:
180
+ case FIELD_BEGUN:
181
+ quoted = p->quoted, pstate = p->pstate;
182
+ spaces = p->spaces, entry_pos = p->entry_pos;
183
+ SUBMIT_FIELD(p);
184
+ SUBMIT_ROW(p, -1);
185
+ case ROW_NOT_BEGUN: /* Already ended properly */
186
+ ;
187
+ }
188
+
189
+ /* Reset parser */
190
+ p->spaces = p->quoted = p->entry_pos = p->status = 0;
191
+ p->pstate = ROW_NOT_BEGUN;
192
+
193
+ return 0;
194
+ }
195
+
196
+ void
197
+ csv_set_delim(struct csv_parser *p, unsigned char c)
198
+ {
199
+ /* Set the delimiter */
200
+ if (p) p->delim_char = c;
201
+ }
202
+
203
+ void
204
+ csv_set_quote(struct csv_parser *p, unsigned char c)
205
+ {
206
+ /* Set the quote character */
207
+ if (p) p->quote_char = c;
208
+ }
209
+
210
+ unsigned char
211
+ csv_get_delim(struct csv_parser *p)
212
+ {
213
+ /* Get the delimiter */
214
+ return p->delim_char;
215
+ }
216
+
217
+ unsigned char
218
+ csv_get_quote(struct csv_parser *p)
219
+ {
220
+ /* Get the quote character */
221
+ return p->quote_char;
222
+ }
223
+
224
+ void
225
+ csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
226
+ {
227
+ /* Set the space function */
228
+ if (p) p->is_space = f;
229
+ }
230
+
231
+ void
232
+ csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
233
+ {
234
+ /* Set the term function */
235
+ if (p) p->is_term = f;
236
+ }
237
+
238
+ void
239
+ csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
240
+ {
241
+ /* Set the realloc function used to increase buffer size */
242
+ if (p && f) p->realloc_func = f;
243
+ }
244
+
245
+ void
246
+ csv_set_free_func(struct csv_parser *p, void (*f)(void *))
247
+ {
248
+ /* Set the free function used to free the buffer */
249
+ if (p && f) p->free_func = f;
250
+ }
251
+
252
+ void
253
+ csv_set_blk_size(struct csv_parser *p, size_t size)
254
+ {
255
+ /* Set the block size used to increment buffer size */
256
+ if (p) p->blk_size = size;
257
+ }
258
+
259
+ size_t
260
+ csv_get_buffer_size(struct csv_parser *p)
261
+ {
262
+ /* Get the size of the entry buffer */
263
+ if (p)
264
+ return p->entry_size;
265
+ return 0;
266
+ }
267
+
268
+ static int
269
+ csv_increase_buffer(struct csv_parser *p)
270
+ {
271
+ /* Increase the size of the entry buffer. Attempt to increase size by
272
+ * p->blk_size, if this is larger than SIZE_MAX try to increase current
273
+ * buffer size to SIZE_MAX. If allocation fails, try to allocate halve
274
+ * the size and try again until successful or increment size is zero.
275
+ */
276
+
277
+ size_t to_add = p->blk_size;
278
+ void *vp;
279
+
280
+ if ( p->entry_size >= SIZE_MAX - to_add )
281
+ to_add = SIZE_MAX - p->entry_size;
282
+
283
+ if (!to_add) {
284
+ p->status = CSV_ETOOBIG;
285
+ return -1;
286
+ }
287
+
288
+ while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
289
+ to_add /= 2;
290
+ if (!to_add) {
291
+ p->status = CSV_ENOMEM;
292
+ return -1;
293
+ }
294
+ }
295
+
296
+ /* Update entry buffer pointer and entry_size if successful */
297
+ p->entry_buf = vp;
298
+ p->entry_size += to_add;
299
+ return 0;
300
+ }
301
+
302
+ size_t
303
+ csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
304
+ {
305
+ unsigned const char *us = s; /* Access input data as array of unsigned char */
306
+ unsigned char c; /* The character we are currently processing */
307
+ size_t pos = 0; /* The number of characters we have processed in this call */
308
+
309
+ /* Store key fields into local variables for performance */
310
+ unsigned char delim = p->delim_char;
311
+ unsigned char quote = p->quote_char;
312
+ int (*is_space)(unsigned char) = p->is_space;
313
+ int (*is_term)(unsigned char) = p->is_term;
314
+ int quoted = p->quoted;
315
+ int pstate = p->pstate;
316
+ size_t spaces = p->spaces;
317
+ size_t entry_pos = p->entry_pos;
318
+
319
+
320
+ if (!p->entry_buf && pos < len) {
321
+ /* Buffer hasn't been allocated yet and len > 0 */
322
+ if (csv_increase_buffer(p) != 0) {
323
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
324
+ return pos;
325
+ }
326
+ }
327
+
328
+ while (pos < len) {
329
+ /* Check memory usage, increase buffer if neccessary */
330
+ if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
331
+ if (csv_increase_buffer(p) != 0) {
332
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
333
+ return pos;
334
+ }
335
+ }
336
+
337
+ c = us[pos++];
338
+
339
+ switch (pstate) {
340
+ case ROW_NOT_BEGUN:
341
+ case FIELD_NOT_BEGUN:
342
+ if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
343
+ continue;
344
+ } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
345
+ if (pstate == FIELD_NOT_BEGUN) {
346
+ SUBMIT_FIELD(p);
347
+ SUBMIT_ROW(p, (unsigned char)c);
348
+ } else { /* ROW_NOT_BEGUN */
349
+ /* Don't submit empty rows by default */
350
+ if (p->options & CSV_REPALL_NL) {
351
+ SUBMIT_ROW(p, (unsigned char)c);
352
+ }
353
+ }
354
+ continue;
355
+ } else if (c == delim) { /* Comma */
356
+ SUBMIT_FIELD(p);
357
+ break;
358
+ } else if (c == quote) { /* Quote */
359
+ pstate = FIELD_BEGUN;
360
+ quoted = 1;
361
+ } else { /* Anything else */
362
+ pstate = FIELD_BEGUN;
363
+ quoted = 0;
364
+ SUBMIT_CHAR(p, c);
365
+ }
366
+ break;
367
+ case FIELD_BEGUN:
368
+ if (c == quote) { /* Quote */
369
+ if (quoted) {
370
+ SUBMIT_CHAR(p, c);
371
+ pstate = FIELD_MIGHT_HAVE_ENDED;
372
+ } else {
373
+ /* STRICT ERROR - double quote inside non-quoted field */
374
+ if (p->options & CSV_STRICT) {
375
+ p->status = CSV_EPARSE;
376
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
377
+ return pos-1;
378
+ }
379
+ SUBMIT_CHAR(p, c);
380
+ spaces = 0;
381
+ }
382
+ } else if (c == delim) { /* Comma */
383
+ if (quoted) {
384
+ SUBMIT_CHAR(p, c);
385
+ } else {
386
+ SUBMIT_FIELD(p);
387
+ }
388
+ } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
389
+ if (!quoted) {
390
+ SUBMIT_FIELD(p);
391
+ SUBMIT_ROW(p, (unsigned char)c);
392
+ } else {
393
+ SUBMIT_CHAR(p, c);
394
+ }
395
+ } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
396
+ SUBMIT_CHAR(p, c);
397
+ spaces++;
398
+ } else { /* Anything else */
399
+ SUBMIT_CHAR(p, c);
400
+ spaces = 0;
401
+ }
402
+ break;
403
+ case FIELD_MIGHT_HAVE_ENDED:
404
+ /* This only happens when a quote character is encountered in a quoted field */
405
+ if (c == delim) { /* Comma */
406
+ entry_pos -= spaces + 1; /* get rid of spaces and original quote */
407
+ SUBMIT_FIELD(p);
408
+ } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
409
+ entry_pos -= spaces + 1; /* get rid of spaces and original quote */
410
+ SUBMIT_FIELD(p);
411
+ SUBMIT_ROW(p, (unsigned char)c);
412
+ } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
413
+ SUBMIT_CHAR(p, c);
414
+ spaces++;
415
+ } else if (c == quote) { /* Quote */
416
+ if (spaces) {
417
+ /* STRICT ERROR - unescaped double quote */
418
+ if (p->options & CSV_STRICT) {
419
+ p->status = CSV_EPARSE;
420
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
421
+ return pos-1;
422
+ }
423
+ spaces = 0;
424
+ SUBMIT_CHAR(p, c);
425
+ } else {
426
+ /* Two quotes in a row */
427
+ pstate = FIELD_BEGUN;
428
+ }
429
+ } else { /* Anything else */
430
+ /* STRICT ERROR - unescaped double quote */
431
+ if (p->options & CSV_STRICT) {
432
+ p->status = CSV_EPARSE;
433
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
434
+ return pos-1;
435
+ }
436
+ pstate = FIELD_BEGUN;
437
+ spaces = 0;
438
+ SUBMIT_CHAR(p, c);
439
+ }
440
+ break;
441
+ default:
442
+ break;
443
+ }
444
+ }
445
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
446
+ return pos;
447
+ }
448
+
449
size_t
csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
{
  /* Copy src_size bytes from src into dest as a double-quoted CSV field,
     doubling every embedded double quote.  At most dest_size bytes are
     stored; a NULL dest is treated as a buffer of size 0.  The return
     value is the number of bytes the complete field needs (saturating at
     SIZE_MAX), or 0 when src is NULL. */
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t needed = 1;  /* the opening quote is always counted */
  size_t i;

  if (in == NULL)
    return 0;

  if (out == NULL)
    dest_size = 0;

  /* Opening quote */
  if (dest_size > 0)
    *out++ = '"';

  for (i = 0; i < src_size; i++) {
    const unsigned char ch = in[i];

    if (ch == '"') {
      /* Escape an embedded quote by emitting it twice */
      if (needed < dest_size)
        *out++ = '"';
      if (needed != SIZE_MAX)
        needed++;
    }
    if (needed < dest_size)
      *out++ = ch;
    if (needed != SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    *out = '"';
  if (needed != SIZE_MAX)
    needed++;

  return needed;
}
485
+
486
int
csv_fwrite (FILE *fp, const void *src, size_t src_size)
{
  /* Write src_size bytes from src to fp as a double-quoted CSV field,
     doubling every embedded double quote.  Returns 0 on success (and also
     when fp or src is NULL, in which case nothing is written) and EOF as
     soon as a write fails. */
  const unsigned char *cur = src;
  const unsigned char *stop;

  if (!fp || !src)
    return 0;

  /* Opening quote */
  if (fputc('"', fp) == EOF)
    return EOF;

  for (stop = cur + src_size; cur != stop; cur++) {
    /* An embedded quote is preceded by an extra quote */
    if (*cur == '"' && fputc('"', fp) == EOF)
      return EOF;
    if (fputc(*cur, fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc('"', fp) == EOF) ? EOF : 0;
}
514
+
515
size_t
csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
{
  /* Same job as csv_write(), but with a caller-chosen quote character:
     the field is wrapped in quote and every occurrence of quote inside
     src is doubled.  At most dest_size bytes are stored; a NULL dest is
     treated as a buffer of size 0.  Returns the number of bytes the
     complete field needs (saturating at SIZE_MAX), or 0 when src is NULL. */
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t remaining = src_size;
  size_t needed = 0;

  if (in == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote: stored only if there is room, but always counted */
  if (dest_size != 0)
    *out++ = quote;
  needed++;

  for (; remaining != 0; remaining--, in++) {
    if (*in == quote) {
      /* Escape an embedded quote by emitting it twice */
      if (needed < dest_size)
        *out++ = quote;
      if (needed < SIZE_MAX)
        needed++;
    }
    if (needed < dest_size)
      *out++ = *in;
    if (needed < SIZE_MAX)
      needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    *out = quote;
  if (needed < SIZE_MAX)
    needed++;

  return needed;
}
551
+
552
int
csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
{
  /* Same job as csv_fwrite(), but with a caller-chosen quote character:
     the field is wrapped in quote and every occurrence of quote inside
     src is doubled.  Returns 0 on success (and also when fp or src is
     NULL, in which case nothing is written) and EOF as soon as a write
     fails. */
  const unsigned char *bytes = src;
  size_t i;

  if (!fp || !src)
    return 0;

  /* Opening quote */
  if (fputc(quote, fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote is preceded by an extra quote */
    if (bytes[i] == quote && fputc(quote, fp) == EOF)
      return EOF;
    if (fputc(bytes[i], fp) == EOF)
      return EOF;
  }

  /* Closing quote */
  return (fputc(quote, fp) == EOF) ? EOF : 0;
}