rcsv 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
source 'https://rubygems.org'

# Runtime dependencies are declared in rcsv.gemspec.
gemspec

# Development only: the Rakefile requires rake/extensiontask from this gem.
group :development do
  gem "rake-compiler"
end
data/Gemfile.lock ADDED
@@ -0,0 +1,18 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ rcsv (0.0.5)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ rake (0.9.2.2)
10
+ rake-compiler (0.8.1)
11
+ rake
12
+
13
+ PLATFORMS
14
+ ruby
15
+
16
+ DEPENDENCIES
17
+ rake-compiler
18
+ rcsv!
data/LICENSE ADDED
@@ -0,0 +1,30 @@
1
+ Copyright (c) 2012, Fiksu, Inc.
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are
6
+ met:
7
+
8
+ o Redistributions of source code must retain the above copyright
9
+ notice, this list of conditions and the following disclaimer.
10
+
11
+ o Redistributions in binary form must reproduce the above copyright
12
+ notice, this list of conditions and the following disclaimer in the
13
+ documentation and/or other materials provided with the
14
+ distribution.
15
+
16
+ o Fiksu, Inc. nor the names of its contributors may be used to
17
+ endorse or promote products derived from this software without
18
+ specific prior written permission.
19
+
20
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
data/README.md ADDED
@@ -0,0 +1,148 @@
1
+ # Rcsv
2
+
3
+ Rcsv is a fast CSV parsing library for MRI Ruby. Tested on REE 1.8.7 and Ruby 1.9.3.
4
+
5
+ Unlike many other gems that implement their own parsers, Rcsv uses libcsv 3.1.0 (http://sourceforge.net/projects/libcsv/). As long as libcsv's API is stable, getting Rcsv to use a newer libcsv version is as simple as updating two files (csv.h and libcsv.c).
6
+
7
+ ## Benchmarks
8
+ user system total real
9
+ FasterCSV 0.580000 0.000000 0.580000 ( 0.618837)
10
+ rcsv 0.060000 0.000000 0.060000 ( 0.062248)
11
+
12
+ ## License
13
+
14
+ Rcsv itself is distributed under a BSD-derived license (see LICENSE), except for the included csv.h and libcsv.c source files, which are distributed under LGPL v2.1 (see COPYING.LESSER). Libcsv sources were not modified in any manner.
15
+
16
+ ## Installation
17
+
18
+ Add this line to your application's Gemfile:
19
+
20
+ gem 'rcsv'
21
+
22
+ And then execute:
23
+
24
+ $ bundle
25
+
26
+ Or install it yourself as:
27
+
28
+ $ gem install rcsv
29
+
30
+
31
+ ## Building the latest source
32
+
33
+ First, check out the master branch. Then cd there and run:
34
+
35
+ $ bundle # Installs development dependencies
36
+ $ bundle exec rake # Runs tests
37
+ $ gem build rcsv.gemspec # Builds the gem
38
+
39
+ ## Usage
40
+
41
+ Currently, Rcsv only supports CSV parsing. CSV write support is planned.
42
+
43
+ Quickstart:
44
+
45
+ parsed = Rcsv.parse(csv_data)
46
+
47
+
48
+ Rcsv class exposes a class method *parse* that accepts a CSV string as its first parameter and options hash as its second parameter.
49
+
50
+
51
+ Options supported:
52
+
53
+ ### :column_separator
54
+
55
+ A single-character string that is used as a separator. Default is ",".
56
+
57
+ ### :nostrict
58
+
59
+ A boolean flag. When enabled, oddly quoted CSV data is parsed without raising exceptions. Disabled by default.
60
+
61
+ Anything that does not conform to http://www.ietf.org/rfc/rfc4180.txt is best parsed with this option enabled.
62
+
63
+ ### :offset_rows
64
+
65
+ A non-negative integer that specifies how many rows should be skipped, counting from the beginning. Default is 0.
66
+
67
+ ### :columns
68
+ A hash that contains per-column parsing instructions. By default, every CSV cell is parsed as a raw string without conversions. Empty strings are parsed as nils.
69
+
70
+ If CSV has a header, :columns keys can be strings that are equal to column names in the header. If there is no header, keys should represent integer column positions.
71
+
72
+ :columns values are in turn hashes that provide parsing options:
73
+
74
+ * :alias - Object of any type (though usually a Symbol) that is used as the key representing the column name when :row_as_hash is set.
75
+ * :type - A Ruby Symbol that specifies Ruby data type that CSV cell value should be converted into. Supported types: :int, :float, :string, :bool. :string is the default.
76
+ * :default - Object of any type (though usually of the same type that is specified by :type option). If CSV doesn't have any value for a cell, this default value is used.
77
+ * :match - A string. If set, makes Rcsv skip all the rows where any column doesn't match its :match value. Useful for filtering data.
78
+
79
+
80
+ ### :header
81
+ A Ruby symbol that specifies how CSV header should be processed. Accepted values:
82
+
83
+ * :use (default) - If :columns is set, instructs Rcsv to parse the first CSV line and use column names from there as :columns keys. Ignores the header when :columns is not set.
84
+
85
+ * :skip - Skips the header, treats :columns keys as column positions.
86
+
87
+ * :none - Tells Rcsv that CSV header is not present. :columns keys are treated as column positions.
88
+
89
+ ### :row_as_hash
90
+ A boolean flag. Disabled by default.
91
+ When enabled, *parse* return value is represented as array of hashes. If :header is set to :use, keys for hashes are either string column names from CSV header or their aliases. Otherwise, column indexes are used.
92
+ When :row_as_hash is disabled, return value is represented as array of arrays.
93
+
94
+ ### :only_listed_columns
95
+ A boolean flag. If enabled, only parses columns that are listed in :columns. Disabled by default.
96
+
97
+
98
+ ## Examples
99
+
100
+ This example parses a 3-column CSV file and only returns parsed rows where "Age" values are set to "35".
101
+
102
+ Rcsv.parse some_csv, :row_as_hash => true,
103
+ :columns => {
104
+ 'First Name' => { :alias => :first_name, :default => "Unknown" },
105
+ 'Last Name' => { :alias => :last_name, :default => "Unknown"},
106
+ 'Age' => { :alias => :age, :type => :int, :match => "35"}
107
+ }
108
+
109
+ The result would look like this:
110
+
111
+ [
112
+ { :first_name => "Mary", :last_name => "Jane", :age => 35 },
113
+ { :first_name => "Unknown", :last_name => "Alien", :age => 35}
114
+ ]
115
+
116
+ Another example, for a miserable headerless Tab-separated CSV:
117
+
118
+ Rcsv.parse some_csv, :column_separator => "\t",
119
+ :header => :none,
120
+ :columns => {
121
+ 1 => { :type => :float, :default => 0 }
122
+ }
123
+
124
+ The result would look like this:
125
+
126
+ [
127
+ [ "Very hot", 3.7, "Mercury" ],
128
+ [ "Very hot and cloudy", 8.87, "Venus" ],
129
+ [ "Just about ok", 9.78, "Earth"],
130
+ [ nil, 0, "Vacuum" ]
131
+ ]
132
+
133
+
134
+ ## To do
135
+
136
+ * More specs for boolean values
137
+ * Specs for Ruby parse
138
+ * Add custom Ruby callbacks (if block is passed)
139
+ * Add CSV write support
140
+
141
+
142
+ ## Contributing
143
+
144
+ 1. Fork it
145
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
146
+ 3. Commit your changes (`git commit -am 'Added some feature'`)
147
+ 4. Push to the branch (`git push origin my-new-feature`)
148
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+ require "rake/extensiontask"
4
+ require 'rake/testtask'
5
+
6
+ Rake::ExtensionTask.new('rcsv') do |ext|
7
+ ext.lib_dir = 'lib/rcsv'
8
+ end
9
+
10
+ Rake::TestTask.new do |t|
11
+ t.libs << 'test'
12
+ end
13
+
14
+ desc "Recompile native code"
15
+ task :recompile => [:clobber, :compile] # clean build
16
+
17
+ desc "Recompile native code and run tests"
18
+ task :default => [:recompile, :test] # clean testing FTW
data/bench.rb ADDED
@@ -0,0 +1,32 @@
1
require 'benchmark'

require 'csv'
require 'rcsv'

# Number of times each parser processes the sample file per report.
TIMES = 10

# That CSV file contains "broken" headers that FasterCSV doesn't like.
# Remove all quotes from the header in order to fix this benchmark.
# But even better would be to test against much bigger CSV file.
#
# The path is resolved relative to this script so the benchmark can be
# started from any working directory.
data = File.read(File.expand_path('../test/test_rcsv.csv', __FILE__))

Benchmark.bmbm do |b|
  b.report("FasterCSV") { TIMES.times { CSV.parse(data) } }
  b.report("rcsv")      { TIMES.times { Rcsv.parse(data) } }
end
data/ext/rcsv/csv.h ADDED
@@ -0,0 +1,86 @@
1
#ifndef LIBCSV_H__
#define LIBCSV_H__
#include <stdlib.h>
#include <stdio.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Library version numbers */
#define CSV_MAJOR   3
#define CSV_MINOR   1
#define CSV_RELEASE 0

/* Error codes (the values stored in csv_parser.status) */
#define CSV_SUCCESS  0
#define CSV_EPARSE   1  /* Parse error in strict mode */
#define CSV_ENOMEM   2  /* Out of memory while increasing buffer size */
#define CSV_ETOOBIG  3  /* Buffer larger than SIZE_MAX needed */
#define CSV_EINVALID 4  /* Invalid code, should never be received from csv_error */

/* Parser options (bit flags, may be OR-ed together) */
#define CSV_STRICT      1  /* Enable strict mode */
#define CSV_REPALL_NL   2  /* Report all unquoted carriage returns and linefeeds */
#define CSV_STRICT_FINI 4  /* Causes csv_fini to return CSV_EPARSE if the last
                              field is quoted and doesn't contain an ending
                              quote */
#define CSV_APPEND_NULL 8  /* Ensure that all fields are null-terminated */

/* Character values */
#define CSV_TAB   0x09
#define CSV_SPACE 0x20
#define CSV_CR    0x0d
#define CSV_LF    0x0a
#define CSV_COMMA 0x2c
#define CSV_QUOTE 0x22

/* Parser state; initialize with csv_init() and release with csv_free(). */
struct csv_parser {
  int pstate;                 /* Parser state */
  int quoted;                 /* Is the current field a quoted field? */
  size_t spaces;              /* Number of consecutive spaces after a quote or in a non-quoted field */
  unsigned char * entry_buf;  /* Entry buffer */
  size_t entry_pos;           /* Current position in entry_buf (and current size of entry) */
  size_t entry_size;          /* Size of entry buffer */
  int status;                 /* Operation status (one of the error codes above) */
  unsigned char options;      /* Bitwise OR of the parser option flags */
  unsigned char quote_char;   /* Quote character, CSV_QUOTE after csv_init */
  unsigned char delim_char;   /* Field delimiter, CSV_COMMA after csv_init */
  int (*is_space)(unsigned char);        /* Optional custom "is space" predicate */
  int (*is_term)(unsigned char);         /* Optional custom "is line terminator" predicate */
  size_t blk_size;                       /* Increment used when growing entry_buf */
  void *(*malloc_func)(size_t);          /* Set to NULL by csv_init */
  void *(*realloc_func)(void *, size_t); /* Used to grow entry_buf */
  void (*free_func)(void *);             /* Used to release entry_buf */
};

/* Function Prototypes */

/* Lifecycle */
int csv_init(struct csv_parser *p, unsigned char options);
int csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int, void *), void *data);
void csv_free(struct csv_parser *p);

/* Error reporting */
int csv_error(struct csv_parser *p);
char * csv_strerror(int error);

/* Parsing: cb1 is called once per field, cb2 once per row */
size_t csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int, void *), void *data);

/* Writing: quote a field, doubling any embedded quote characters */
size_t csv_write(void *dest, size_t dest_size, const void *src, size_t src_size);
int csv_fwrite(FILE *fp, const void *src, size_t src_size);
size_t csv_write2(void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote);
int csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote);

/* Configuration accessors */
int csv_get_opts(struct csv_parser *p);
int csv_set_opts(struct csv_parser *p, unsigned char options);
void csv_set_delim(struct csv_parser *p, unsigned char c);
void csv_set_quote(struct csv_parser *p, unsigned char c);
unsigned char csv_get_delim(struct csv_parser *p);
unsigned char csv_get_quote(struct csv_parser *p);
void csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char));
void csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char));
void csv_set_realloc_func(struct csv_parser *p, void *(*)(void *, size_t));
void csv_set_free_func(struct csv_parser *p, void (*)(void *));
void csv_set_blk_size(struct csv_parser *p, size_t);
size_t csv_get_buffer_size(struct csv_parser *p);

#ifdef __cplusplus
}
#endif

#endif
@@ -0,0 +1,3 @@
1
require "mkmf"

# Generate the Makefile for the native extension target rcsv/rcsv.
create_makefile 'rcsv/rcsv'
data/ext/rcsv/libcsv.c ADDED
@@ -0,0 +1,579 @@
1
+ /*
2
+ libcsv - parse and write csv data
3
+ Copyright (C) 2008 Robert Gamble
4
+
5
+ This library is free software; you can redistribute it and/or
6
+ modify it under the terms of the GNU Lesser General Public
7
+ License as published by the Free Software Foundation; either
8
+ version 2.1 of the License, or (at your option) any later version.
9
+
10
+ This library is distributed in the hope that it will be useful,
11
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ Lesser General Public License for more details.
14
+
15
+ You should have received a copy of the GNU Lesser General Public
16
+ License along with this library; if not, write to the Free Software
17
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
18
+ */
19
+
20
/*
 * SIZE_MAX is provided by <stdint.h> on C99 and later.  The predefined
 * macro is __STDC_VERSION__ (two leading underscores); on C89 compilers it
 * is undefined, so test for its presence first and fall back to a local
 * definition whenever SIZE_MAX is still missing.
 */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#  include <stdint.h>
#endif
#ifndef SIZE_MAX
#  define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
#endif
25
+
26
+ #include "csv.h"
27
+
28
/* NOTE(review): csv.h declares CSV_MAJOR/MINOR/RELEASE as 3.1.0 while this
   string says 3.0.2 -- confirm which one is intended. */
#define VERSION "3.0.2"

/*
  Parser states (values of pstate):

  ROW_NOT_BEGUN           No field has been encountered for this row yet
  FIELD_NOT_BEGUN         The row has fields, but we are currently between
                          fields
  FIELD_BEGUN             We are inside a field
  FIELD_MIGHT_HAVE_ENDED  A quote character was seen inside a quoted field;
                          either the field has ended or the quote is literal
*/
#define ROW_NOT_BEGUN           0
#define FIELD_NOT_BEGUN         1
#define FIELD_BEGUN             2
#define FIELD_MIGHT_HAVE_ENDED  3

/* Default increment used when growing the entry buffer (see csv_init) */
#define MEM_BLK_SIZE 128
46
+
47
/*
 * Helper macros for the parser.  They deliberately operate on the caller's
 * local variables (quoted, spaces, entry_pos, pstate, cb1, cb2, data), so
 * they can only be used inside functions that declare all of them.
 * Every use of a macro parameter is parenthesized so that any pointer
 * expression may be passed as `p`.
 */

/* Finish the current field: drop trailing spaces of an unquoted field,
   optionally null-terminate, invoke the field callback, reset field state. */
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      ((p)->entry_buf[entry_pos]) = '\0'; \
    if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* Finish the current row: invoke the row callback with the terminating
   character `c` and reset row state.  `p` is accepted for symmetry only. */
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2((c), data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* Append one character to the entry buffer; the caller guarantees room. */
#define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
68
+
69
/* Human-readable messages, indexed by the CSV_* error codes (0..4). */
static char *csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
74
+
75
+ int
76
+ csv_error(struct csv_parser *p)
77
+ {
78
+ /* Return the current status of the parser */
79
+ return p->status;
80
+ }
81
+
82
+ char *
83
+ csv_strerror(int status)
84
+ {
85
+ /* Return a textual description of status */
86
+ if (status >= CSV_EINVALID || status < 0)
87
+ return csv_errors[CSV_EINVALID];
88
+ else
89
+ return csv_errors[status];
90
+ }
91
+
92
+ int
93
+ csv_get_opts(struct csv_parser *p)
94
+ {
95
+ /* Return the currently set options of parser */
96
+ if (p == NULL)
97
+ return -1;
98
+
99
+ return p->options;
100
+ }
101
+
102
+ int
103
+ csv_set_opts(struct csv_parser *p, unsigned char options)
104
+ {
105
+ /* Set the options */
106
+ if (p == NULL)
107
+ return -1;
108
+
109
+ p->options = options;
110
+ return 0;
111
+ }
112
+
113
+ int
114
+ csv_init(struct csv_parser *p, unsigned char options)
115
+ {
116
+ /* Initialize a csv_parser object returns 0 on success, -1 on error */
117
+ if (p == NULL)
118
+ return -1;
119
+
120
+ p->entry_buf = NULL;
121
+ p->pstate = ROW_NOT_BEGUN;
122
+ p->quoted = 0;
123
+ p->spaces = 0;
124
+ p->entry_pos = 0;
125
+ p->entry_size = 0;
126
+ p->status = 0;
127
+ p->options = options;
128
+ p->quote_char = CSV_QUOTE;
129
+ p->delim_char = CSV_COMMA;
130
+ p->is_space = NULL;
131
+ p->is_term = NULL;
132
+ p->blk_size = MEM_BLK_SIZE;
133
+ p->malloc_func = NULL;
134
+ p->realloc_func = realloc;
135
+ p->free_func = free;
136
+
137
+ return 0;
138
+ }
139
+
140
+ void
141
+ csv_free(struct csv_parser *p)
142
+ {
143
+ /* Free the entry_buffer of csv_parser object */
144
+ if (p == NULL)
145
+ return;
146
+
147
+ if (p->entry_buf)
148
+ p->free_func(p->entry_buf);
149
+
150
+ p->entry_buf = NULL;
151
+ p->entry_size = 0;
152
+
153
+ return;
154
+ }
155
+
156
+ int
157
+ csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
158
+ {
159
+ /* Finalize parsing. Needed, for example, when file does not end in a newline */
160
+ int quoted = p->quoted;
161
+ int pstate = p->pstate;
162
+ size_t spaces = p->spaces;
163
+ size_t entry_pos = p->entry_pos;
164
+
165
+ if (p == NULL)
166
+ return -1;
167
+
168
+
169
+ if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
170
+ /* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
171
+ p->status = CSV_EPARSE;
172
+ return -1;
173
+ }
174
+
175
+ switch (p->pstate) {
176
+ case FIELD_MIGHT_HAVE_ENDED:
177
+ p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
178
+ /* Fall-through */
179
+ case FIELD_NOT_BEGUN:
180
+ case FIELD_BEGUN:
181
+ quoted = p->quoted, pstate = p->pstate;
182
+ spaces = p->spaces, entry_pos = p->entry_pos;
183
+ SUBMIT_FIELD(p);
184
+ SUBMIT_ROW(p, -1);
185
+ case ROW_NOT_BEGUN: /* Already ended properly */
186
+ ;
187
+ }
188
+
189
+ /* Reset parser */
190
+ p->spaces = p->quoted = p->entry_pos = p->status = 0;
191
+ p->pstate = ROW_NOT_BEGUN;
192
+
193
+ return 0;
194
+ }
195
+
196
+ void
197
+ csv_set_delim(struct csv_parser *p, unsigned char c)
198
+ {
199
+ /* Set the delimiter */
200
+ if (p) p->delim_char = c;
201
+ }
202
+
203
+ void
204
+ csv_set_quote(struct csv_parser *p, unsigned char c)
205
+ {
206
+ /* Set the quote character */
207
+ if (p) p->quote_char = c;
208
+ }
209
+
210
+ unsigned char
211
+ csv_get_delim(struct csv_parser *p)
212
+ {
213
+ /* Get the delimiter */
214
+ return p->delim_char;
215
+ }
216
+
217
+ unsigned char
218
+ csv_get_quote(struct csv_parser *p)
219
+ {
220
+ /* Get the quote character */
221
+ return p->quote_char;
222
+ }
223
+
224
+ void
225
+ csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
226
+ {
227
+ /* Set the space function */
228
+ if (p) p->is_space = f;
229
+ }
230
+
231
+ void
232
+ csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
233
+ {
234
+ /* Set the term function */
235
+ if (p) p->is_term = f;
236
+ }
237
+
238
+ void
239
+ csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
240
+ {
241
+ /* Set the realloc function used to increase buffer size */
242
+ if (p && f) p->realloc_func = f;
243
+ }
244
+
245
+ void
246
+ csv_set_free_func(struct csv_parser *p, void (*f)(void *))
247
+ {
248
+ /* Set the free function used to free the buffer */
249
+ if (p && f) p->free_func = f;
250
+ }
251
+
252
+ void
253
+ csv_set_blk_size(struct csv_parser *p, size_t size)
254
+ {
255
+ /* Set the block size used to increment buffer size */
256
+ if (p) p->blk_size = size;
257
+ }
258
+
259
+ size_t
260
+ csv_get_buffer_size(struct csv_parser *p)
261
+ {
262
+ /* Get the size of the entry buffer */
263
+ if (p)
264
+ return p->entry_size;
265
+ return 0;
266
+ }
267
+
268
+ static int
269
+ csv_increase_buffer(struct csv_parser *p)
270
+ {
271
+ /* Increase the size of the entry buffer. Attempt to increase size by
272
+ * p->blk_size, if this is larger than SIZE_MAX try to increase current
273
+ * buffer size to SIZE_MAX. If allocation fails, try to allocate halve
274
+ * the size and try again until successful or increment size is zero.
275
+ */
276
+
277
+ size_t to_add = p->blk_size;
278
+ void *vp;
279
+
280
+ if ( p->entry_size >= SIZE_MAX - to_add )
281
+ to_add = SIZE_MAX - p->entry_size;
282
+
283
+ if (!to_add) {
284
+ p->status = CSV_ETOOBIG;
285
+ return -1;
286
+ }
287
+
288
+ while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
289
+ to_add /= 2;
290
+ if (!to_add) {
291
+ p->status = CSV_ENOMEM;
292
+ return -1;
293
+ }
294
+ }
295
+
296
+ /* Update entry buffer pointer and entry_size if successful */
297
+ p->entry_buf = vp;
298
+ p->entry_size += to_add;
299
+ return 0;
300
+ }
301
+
302
+ size_t
303
+ csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
304
+ {
305
+ unsigned const char *us = s; /* Access input data as array of unsigned char */
306
+ unsigned char c; /* The character we are currently processing */
307
+ size_t pos = 0; /* The number of characters we have processed in this call */
308
+
309
+ /* Store key fields into local variables for performance */
310
+ unsigned char delim = p->delim_char;
311
+ unsigned char quote = p->quote_char;
312
+ int (*is_space)(unsigned char) = p->is_space;
313
+ int (*is_term)(unsigned char) = p->is_term;
314
+ int quoted = p->quoted;
315
+ int pstate = p->pstate;
316
+ size_t spaces = p->spaces;
317
+ size_t entry_pos = p->entry_pos;
318
+
319
+
320
+ if (!p->entry_buf && pos < len) {
321
+ /* Buffer hasn't been allocated yet and len > 0 */
322
+ if (csv_increase_buffer(p) != 0) {
323
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
324
+ return pos;
325
+ }
326
+ }
327
+
328
+ while (pos < len) {
329
+ /* Check memory usage, increase buffer if neccessary */
330
+ if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
331
+ if (csv_increase_buffer(p) != 0) {
332
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
333
+ return pos;
334
+ }
335
+ }
336
+
337
+ c = us[pos++];
338
+
339
+ switch (pstate) {
340
+ case ROW_NOT_BEGUN:
341
+ case FIELD_NOT_BEGUN:
342
+ if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
343
+ continue;
344
+ } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
345
+ if (pstate == FIELD_NOT_BEGUN) {
346
+ SUBMIT_FIELD(p);
347
+ SUBMIT_ROW(p, (unsigned char)c);
348
+ } else { /* ROW_NOT_BEGUN */
349
+ /* Don't submit empty rows by default */
350
+ if (p->options & CSV_REPALL_NL) {
351
+ SUBMIT_ROW(p, (unsigned char)c);
352
+ }
353
+ }
354
+ continue;
355
+ } else if (c == delim) { /* Comma */
356
+ SUBMIT_FIELD(p);
357
+ break;
358
+ } else if (c == quote) { /* Quote */
359
+ pstate = FIELD_BEGUN;
360
+ quoted = 1;
361
+ } else { /* Anything else */
362
+ pstate = FIELD_BEGUN;
363
+ quoted = 0;
364
+ SUBMIT_CHAR(p, c);
365
+ }
366
+ break;
367
+ case FIELD_BEGUN:
368
+ if (c == quote) { /* Quote */
369
+ if (quoted) {
370
+ SUBMIT_CHAR(p, c);
371
+ pstate = FIELD_MIGHT_HAVE_ENDED;
372
+ } else {
373
+ /* STRICT ERROR - double quote inside non-quoted field */
374
+ if (p->options & CSV_STRICT) {
375
+ p->status = CSV_EPARSE;
376
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
377
+ return pos-1;
378
+ }
379
+ SUBMIT_CHAR(p, c);
380
+ spaces = 0;
381
+ }
382
+ } else if (c == delim) { /* Comma */
383
+ if (quoted) {
384
+ SUBMIT_CHAR(p, c);
385
+ } else {
386
+ SUBMIT_FIELD(p);
387
+ }
388
+ } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
389
+ if (!quoted) {
390
+ SUBMIT_FIELD(p);
391
+ SUBMIT_ROW(p, (unsigned char)c);
392
+ } else {
393
+ SUBMIT_CHAR(p, c);
394
+ }
395
+ } else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
396
+ SUBMIT_CHAR(p, c);
397
+ spaces++;
398
+ } else { /* Anything else */
399
+ SUBMIT_CHAR(p, c);
400
+ spaces = 0;
401
+ }
402
+ break;
403
+ case FIELD_MIGHT_HAVE_ENDED:
404
+ /* This only happens when a quote character is encountered in a quoted field */
405
+ if (c == delim) { /* Comma */
406
+ entry_pos -= spaces + 1; /* get rid of spaces and original quote */
407
+ SUBMIT_FIELD(p);
408
+ } else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
409
+ entry_pos -= spaces + 1; /* get rid of spaces and original quote */
410
+ SUBMIT_FIELD(p);
411
+ SUBMIT_ROW(p, (unsigned char)c);
412
+ } else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
413
+ SUBMIT_CHAR(p, c);
414
+ spaces++;
415
+ } else if (c == quote) { /* Quote */
416
+ if (spaces) {
417
+ /* STRICT ERROR - unescaped double quote */
418
+ if (p->options & CSV_STRICT) {
419
+ p->status = CSV_EPARSE;
420
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
421
+ return pos-1;
422
+ }
423
+ spaces = 0;
424
+ SUBMIT_CHAR(p, c);
425
+ } else {
426
+ /* Two quotes in a row */
427
+ pstate = FIELD_BEGUN;
428
+ }
429
+ } else { /* Anything else */
430
+ /* STRICT ERROR - unescaped double quote */
431
+ if (p->options & CSV_STRICT) {
432
+ p->status = CSV_EPARSE;
433
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
434
+ return pos-1;
435
+ }
436
+ pstate = FIELD_BEGUN;
437
+ spaces = 0;
438
+ SUBMIT_CHAR(p, c);
439
+ }
440
+ break;
441
+ default:
442
+ break;
443
+ }
444
+ }
445
+ p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
446
+ return pos;
447
+ }
448
+
449
/*
 * Write src to dest as a double-quoted CSV field, doubling every embedded
 * '"'.  At most dest_size bytes are stored and no terminating null is
 * added.
 *
 * Returns the number of bytes the complete field needs (capped at
 * SIZE_MAX), which may exceed dest_size; passing a NULL dest therefore
 * only measures.  Returns 0 when src is NULL.
 */
size_t
csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
{
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t chars = 0;
  size_t i;

  if (src == NULL)
    return 0;

  if (out == NULL)
    dest_size = 0;

  /* Opening quote: always counted, stored only when there is room. */
  if (dest_size > 0) {
    *out++ = '"';
  }
  chars++;

  for (i = 0; i < src_size; i++) {
    unsigned char ch = in[i];

    if (ch == '"') {
      /* Escape an embedded quote by emitting it twice. */
      if (dest_size > chars) {
        *out++ = '"';
      }
      if (chars < SIZE_MAX) {
        chars++;
      }
    }

    if (dest_size > chars) {
      *out++ = ch;
    }
    if (chars < SIZE_MAX) {
      chars++;
    }
  }

  /* Closing quote */
  if (dest_size > chars) {
    *out = '"';
  }
  if (chars < SIZE_MAX) {
    chars++;
  }

  return chars;
}
485
+
486
/*
 * Write src to the stream fp as a double-quoted CSV field, doubling every
 * embedded '"'.
 *
 * Returns 0 on success (and also when fp or src is NULL, in which case
 * nothing is written), or EOF as soon as a write fails.
 */
int
csv_fwrite (FILE *fp, const void *src, size_t src_size)
{
  const unsigned char *bytes = src;
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  if (fputc('"', fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote is escaped by writing it twice. */
    if (bytes[i] == '"' && fputc('"', fp) == EOF)
      return EOF;
    if (fputc(bytes[i], fp) == EOF)
      return EOF;
  }

  return (fputc('"', fp) == EOF) ? EOF : 0;
}
514
+
515
/*
 * Write src to dest as a quoted CSV field using `quote` as the quote
 * character, doubling every embedded occurrence of it.  At most dest_size
 * bytes are stored and no terminating null is added.
 *
 * Returns the number of bytes the complete field needs (capped at
 * SIZE_MAX), which may exceed dest_size; passing a NULL dest therefore
 * only measures.  Returns 0 when src is NULL.
 */
size_t
csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
{
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t chars = 0;
  size_t i;

  if (src == NULL)
    return 0;

  if (dest == NULL)
    dest_size = 0;

  /* Opening quote: always counted, stored only when there is room. */
  if (dest_size > 0) {
    *out++ = quote;
  }
  chars++;

  for (i = 0; i < src_size; i++) {
    unsigned char ch = in[i];

    if (ch == quote) {
      /* Escape an embedded quote character by emitting it twice. */
      if (dest_size > chars) {
        *out++ = quote;
      }
      if (chars < SIZE_MAX) {
        chars++;
      }
    }

    if (dest_size > chars) {
      *out++ = ch;
    }
    if (chars < SIZE_MAX) {
      chars++;
    }
  }

  /* Closing quote */
  if (dest_size > chars) {
    *out = quote;
  }
  if (chars < SIZE_MAX) {
    chars++;
  }

  return chars;
}
551
+
552
/*
 * Write src to the stream fp as a quoted CSV field using `quote` as the
 * quote character, doubling every embedded occurrence of it.
 *
 * Returns 0 on success (and also when fp or src is NULL, in which case
 * nothing is written), or EOF as soon as a write fails.
 */
int
csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
{
  const unsigned char *bytes = src;
  size_t i;

  if (fp == NULL || src == NULL)
    return 0;

  if (fputc(quote, fp) == EOF)
    return EOF;

  for (i = 0; i < src_size; i++) {
    /* An embedded quote character is escaped by writing it twice. */
    if (bytes[i] == quote && fputc(quote, fp) == EOF)
      return EOF;
    if (fputc(bytes[i], fp) == EOF)
      return EOF;
  }

  return (fputc(quote, fp) == EOF) ? EOF : 0;
}