rcsv 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +8 -0
- data/COPYING.LESSER +458 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +18 -0
- data/LICENSE +30 -0
- data/README.md +148 -0
- data/Rakefile +18 -0
- data/bench.rb +32 -0
- data/ext/rcsv/csv.h +86 -0
- data/ext/rcsv/extconf.rb +3 -0
- data/ext/rcsv/libcsv.c +579 -0
- data/ext/rcsv/rcsv.c +365 -0
- data/ext/rcsv/test.rb +5 -0
- data/lib/lib_csv.rb +88 -0
- data/lib/rcsv.rb +91 -0
- data/lib/rcsv/version.rb +3 -0
- data/rcsv.gemspec +19 -0
- data/test/test_rcsv.csv +889 -0
- data/test/test_rcsv_raw_parse.rb +156 -0
- metadata +70 -0
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
Copyright (c) 2012, Fiksu, Inc.
|
2
|
+
All rights reserved.
|
3
|
+
|
4
|
+
Redistribution and use in source and binary forms, with or without
|
5
|
+
modification, are permitted provided that the following conditions are
|
6
|
+
met:
|
7
|
+
|
8
|
+
o Redistributions of source code must retain the above copyright
|
9
|
+
notice, this list of conditions and the following disclaimer.
|
10
|
+
|
11
|
+
o Redistributions in binary form must reproduce the above copyright
|
12
|
+
notice, this list of conditions and the following disclaimer in the
|
13
|
+
documentation and/or other materials provided with the
|
14
|
+
distribution.
|
15
|
+
|
16
|
+
o Fiksu, Inc. nor the names of its contributors may be used to
|
17
|
+
endorse or promote products derived from this software without
|
18
|
+
specific prior written permission.
|
19
|
+
|
20
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
21
|
+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
22
|
+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
23
|
+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
24
|
+
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
25
|
+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
26
|
+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
27
|
+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
28
|
+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
29
|
+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
30
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
# Rcsv
|
2
|
+
|
3
|
+
Rcsv is a fast CSV parsing library for MRI Ruby. Tested on REE 1.8.7 and Ruby 1.9.3.
|
4
|
+
|
5
|
+
Contrary to many other gems that implement their own parsers, Rcsv uses libcsv 3.1.0 (http://sourceforge.net/projects/libcsv/). As long as libcsv's API is stable, getting Rcsv to use a newer libcsv version is as simple as updating two files (csv.h and libcsv.c).
|
6
|
+
|
7
|
+
## Benchmarks
|
8
|
+
user system total real
|
9
|
+
FasterCSV 0.580000 0.000000 0.580000 ( 0.618837)
|
10
|
+
rcsv 0.060000 0.000000 0.060000 ( 0.062248)
|
11
|
+
|
12
|
+
## License
|
13
|
+
|
14
|
+
Rcsv itself is distributed under a BSD-derived license (see LICENSE), except for the included csv.h and libcsv.c source files, which are distributed under LGPL v2.1 (see COPYING.LESSER). Libcsv sources were not modified in any manner.
|
15
|
+
|
16
|
+
## Installation
|
17
|
+
|
18
|
+
Add this line to your application's Gemfile:
|
19
|
+
|
20
|
+
gem 'rcsv'
|
21
|
+
|
22
|
+
And then execute:
|
23
|
+
|
24
|
+
$ bundle
|
25
|
+
|
26
|
+
Or install it yourself as:
|
27
|
+
|
28
|
+
$ gem install rcsv
|
29
|
+
|
30
|
+
|
31
|
+
## Building the latest source
|
32
|
+
|
33
|
+
First, check out the master branch. Then cd there and run:
|
34
|
+
|
35
|
+
$ bundle # Installs development dependencies
|
36
|
+
$ bundle exec rake # Runs tests
|
37
|
+
$ gem build rcsv.gemspec # Builds the gem
|
38
|
+
|
39
|
+
## Usage
|
40
|
+
|
41
|
+
Currently, Rcsv only supports CSV parsing. CSV write support is planned.
|
42
|
+
|
43
|
+
Quickstart:
|
44
|
+
|
45
|
+
parsed = Rcsv.parse(csv_data)
|
46
|
+
|
47
|
+
|
48
|
+
The Rcsv class exposes a class method *parse* that accepts a CSV string as its first parameter and an options hash as its second parameter.
|
49
|
+
|
50
|
+
|
51
|
+
Options supported:
|
52
|
+
|
53
|
+
### :column_separator
|
54
|
+
|
55
|
+
A single-character string that is used as a separator. Default is ",".
|
56
|
+
|
57
|
+
### :nostrict
|
58
|
+
|
59
|
+
A boolean flag. When enabled, oddly quoted CSV data is parsed without exceptions being raised. Disabled by default.
|
60
|
+
|
61
|
+
Anything that does not conform to http://www.ietf.org/rfc/rfc4180.txt is better parsed with this option enabled.
|
62
|
+
|
63
|
+
### :offset_rows
|
64
|
+
|
65
|
+
A non-negative integer that specifies how many rows should be skipped, counting from the beginning. Default is 0.
|
66
|
+
|
67
|
+
### :columns
|
68
|
+
A hash that contains per-column parsing instructions. By default, every CSV cell is parsed as a raw string without conversions. Empty strings are parsed as nils.
|
69
|
+
|
70
|
+
If CSV has a header, :columns keys can be strings that are equal to column names in the header. If there is no header, keys should represent integer column positions.
|
71
|
+
|
72
|
+
:columns values are in turn hashes that provide parsing options:
|
73
|
+
|
74
|
+
* :alias - Object of any type (though usually a Symbol) that is used as the key that represents the column name when :row_as_hash is set.
|
75
|
+
* :type - A Ruby Symbol that specifies Ruby data type that CSV cell value should be converted into. Supported types: :int, :float, :string, :bool. :string is the default.
|
76
|
+
* :default - Object of any type (though usually of the same type that is specified by :type option). If CSV doesn't have any value for a cell, this default value is used.
|
77
|
+
* :match - A string. If set, makes Rcsv skip all the rows where any column doesn't match its :match value. Useful for filtering data.
|
78
|
+
|
79
|
+
|
80
|
+
### :header
|
81
|
+
A Ruby symbol that specifies how CSV header should be processed. Accepted values:
|
82
|
+
|
83
|
+
* :use (default) - If :columns is set, instructs Rcsv to parse the first CSV line and use column names from there as :columns keys. Ignores the header when :columns is not set.
|
84
|
+
|
85
|
+
* :skip - Skips the header, treats :columns keys as column positions.
|
86
|
+
|
87
|
+
* :none - Tells Rcsv that CSV header is not present. :columns keys are treated as column positions.
|
88
|
+
|
89
|
+
### :row_as_hash
|
90
|
+
A boolean flag. Disabled by default.
|
91
|
+
When enabled, *parse* return value is represented as array of hashes. If :header is set to :use, keys for hashes are either string column names from CSV header or their aliases. Otherwise, column indexes are used.
|
92
|
+
When :row_as_hash is disabled, return value is represented as array of arrays.
|
93
|
+
|
94
|
+
### :only_listed_columns
|
95
|
+
A boolean flag. If enabled, only parses columns that are listed in :columns. Disabled by default.
|
96
|
+
|
97
|
+
|
98
|
+
## Examples
|
99
|
+
|
100
|
+
This example parses a 3-column CSV file and only returns parsed rows where "Age" values are set to "35".
|
101
|
+
|
102
|
+
Rcsv.parse some_csv, :row_as_hash => true,
|
103
|
+
:columns => {
|
104
|
+
'First Name' => { :alias => :first_name, :default => "Unknown" },
|
105
|
+
'Last Name' => { :alias => :last_name, :default => "Unknown"},
|
106
|
+
'Age' => { :alias => :age, :type => :int, :match => "35"}
|
107
|
+
}
|
108
|
+
|
109
|
+
The result would look like this:
|
110
|
+
|
111
|
+
[
|
112
|
+
{ :first_name => "Mary", :last_name => "Jane", :age => 35 },
|
113
|
+
{ :first_name => "Unknown", :last_name => "Alien", :age => 35}
|
114
|
+
]
|
115
|
+
|
116
|
+
Another example, for a miserable headerless Tab-separated CSV:
|
117
|
+
|
118
|
+
Rcsv.parse some_csv, :column_separator => "\t",
|
119
|
+
:header => :none,
|
120
|
+
:columns => {
|
121
|
+
1 => { :type => :float, :default => 0 }
|
122
|
+
}
|
123
|
+
|
124
|
+
The result would look like this:
|
125
|
+
|
126
|
+
[
|
127
|
+
[ "Very hot", 3.7, "Mercury" ],
|
128
|
+
[ "Very hot and cloudy", 8.87, "Venus" ],
|
129
|
+
[ "Just about ok", 9.78, "Earth"],
|
130
|
+
[ nil, 0, "Vacuum" ]
|
131
|
+
]
|
132
|
+
|
133
|
+
|
134
|
+
## To do
|
135
|
+
|
136
|
+
* More specs for boolean values
|
137
|
+
* Specs for Ruby parse
|
138
|
+
* Add custom Ruby callbacks (if block is passed)
|
139
|
+
* Add CSV write support
|
140
|
+
|
141
|
+
|
142
|
+
## Contributing
|
143
|
+
|
144
|
+
1. Fork it
|
145
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
146
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
147
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
148
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require 'bundler/gem_tasks'
require 'rake/extensiontask'
require 'rake/testtask'

# Native extension task for 'rcsv'; its lib_dir is set to lib/rcsv
Rake::ExtensionTask.new('rcsv') { |ext| ext.lib_dir = 'lib/rcsv' }

# Test task with the 'test' directory appended to its libs
Rake::TestTask.new { |t| t.libs << 'test' }

desc 'Recompile native code'
task :recompile => [:clobber, :compile] # clean build

desc 'Recompile native code and run tests'
task :default => [:recompile, :test] # clean testing FTW
|
data/bench.rb
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# Benchmark: parses the same CSV data TIMES times with Ruby's CSV library
# and with Rcsv, and prints the timings via Benchmark.bmbm.
require 'benchmark'

require 'csv'
#require './lib/lib_csv'
require 'rcsv'

# Number of parse runs per report
TIMES = 10

# That CSV file contains "broken" headers that FasterCSV doesn't like.
# Remove all quotes from the header in order to fix this benchmark.
# But even better would be to test against a much bigger CSV file.
data = File.read('./test/test_rcsv.csv')

Benchmark.bmbm do |b|
  b.report("FasterCSV") {
    TIMES.times {
      # Result is intentionally discarded; only the parse time matters
      CSV.parse(data)
    }
  }

#  b.report("lib_csv") {
#    TIMES.times {
#      LibCsv.parse(data)
#    }
#  }

  b.report("rcsv") {
    TIMES.times {
      Rcsv.parse(data)
    }
  }
end
|
data/ext/rcsv/csv.h
ADDED
@@ -0,0 +1,86 @@
|
|
1
|
+
#ifndef LIBCSV_H__
|
2
|
+
#define LIBCSV_H__
|
3
|
+
#include <stdlib.h>
|
4
|
+
#include <stdio.h>
|
5
|
+
|
6
|
+
#ifdef __cplusplus
|
7
|
+
extern "C" {
|
8
|
+
#endif
|
9
|
+
|
10
|
+
#define CSV_MAJOR 3
|
11
|
+
#define CSV_MINOR 1
|
12
|
+
#define CSV_RELEASE 0
|
13
|
+
|
14
|
+
/* Error Codes */
|
15
|
+
#define CSV_SUCCESS 0
|
16
|
+
#define CSV_EPARSE 1 /* Parse error in strict mode */
|
17
|
+
#define CSV_ENOMEM 2 /* Out of memory while increasing buffer size */
|
18
|
+
#define CSV_ETOOBIG 3 /* Buffer larger than SIZE_MAX needed */
|
19
|
+
#define CSV_EINVALID 4 /* Invalid code,should never be received from csv_error*/
|
20
|
+
|
21
|
+
|
22
|
+
/* parser options */
|
23
|
+
#define CSV_STRICT 1 /* enable strict mode */
|
24
|
+
#define CSV_REPALL_NL 2 /* report all unquoted carriage returns and linefeeds */
|
25
|
+
#define CSV_STRICT_FINI 4 /* causes csv_fini to return CSV_EPARSE if the last
                             field is quoted and doesn't contain an ending
                             quote (csv_fini only checks this when CSV_STRICT
                             is set as well) */
|
28
|
+
#define CSV_APPEND_NULL 8 /* Ensure that all fields are null-terminated */
|
29
|
+
|
30
|
+
|
31
|
+
/* Character values */
|
32
|
+
#define CSV_TAB 0x09
|
33
|
+
#define CSV_SPACE 0x20
|
34
|
+
#define CSV_CR 0x0d
|
35
|
+
#define CSV_LF 0x0a
|
36
|
+
#define CSV_COMMA 0x2c
|
37
|
+
#define CSV_QUOTE 0x22
|
38
|
+
|
39
|
+
/* Parser object: state machine position, entry buffer and configuration */
struct csv_parser {
  int pstate;                            /* Current parser state */
  int quoted;                            /* Non-zero while the current field is a quoted field */
  size_t spaces;                         /* Number of contiguous spaces after a quote or in a non-quoted field */
  unsigned char * entry_buf;             /* Buffer holding the entry being assembled */
  size_t entry_pos;                      /* Current position in entry_buf (and current size of entry) */
  size_t entry_size;                     /* Allocated size of entry_buf */
  int status;                            /* Status of the last operation (a CSV_* error code) */
  unsigned char options;                 /* Parser option flags (CSV_STRICT, CSV_REPALL_NL, ...) */
  unsigned char quote_char;              /* Character treated as the quote */
  unsigned char delim_char;              /* Character treated as the field delimiter */
  int (*is_space)(unsigned char);        /* Custom space test; NULL selects space/tab */
  int (*is_term)(unsigned char);         /* Custom row-terminator test; NULL selects CR/LF */
  size_t blk_size;                       /* Increment used when growing entry_buf */
  void *(*malloc_func)(size_t);          /* Allocation hook; not settable through this API */
  void *(*realloc_func)(void *, size_t); /* Function used to grow entry_buf */
  void (*free_func)(void *);             /* Function used to release entry_buf */
};
|
57
|
+
|
58
|
+
/* Function Prototypes */

/* Parser lifecycle and status */
int csv_init(struct csv_parser *p, unsigned char options);
int csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int, void *), void *data);
void csv_free(struct csv_parser *p);
int csv_error(struct csv_parser *p);
char * csv_strerror(int error);

/* Parsing: cb1 receives each field, cb2 each end of row */
size_t csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int, void *), void *data);

/* Writing quoted fields to memory or to a stream */
size_t csv_write(void *dest, size_t dest_size, const void *src, size_t src_size);
int csv_fwrite(FILE *fp, const void *src, size_t src_size);
size_t csv_write2(void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote);
int csv_fwrite2(FILE *fp, const void *src, size_t src_size, unsigned char quote);

/* Parser configuration accessors */
int csv_get_opts(struct csv_parser *p);
int csv_set_opts(struct csv_parser *p, unsigned char options);
void csv_set_delim(struct csv_parser *p, unsigned char c);
void csv_set_quote(struct csv_parser *p, unsigned char c);
unsigned char csv_get_delim(struct csv_parser *p);
unsigned char csv_get_quote(struct csv_parser *p);
void csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char));
void csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char));
void csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t));
void csv_set_free_func(struct csv_parser *p, void (*f)(void *));
void csv_set_blk_size(struct csv_parser *p, size_t size);
size_t csv_get_buffer_size(struct csv_parser *p);
|
81
|
+
|
82
|
+
#ifdef __cplusplus
|
83
|
+
}
|
84
|
+
#endif
|
85
|
+
|
86
|
+
#endif
|
data/ext/rcsv/extconf.rb
ADDED
data/ext/rcsv/libcsv.c
ADDED
@@ -0,0 +1,579 @@
|
|
1
|
+
/*
|
2
|
+
libcsv - parse and write csv data
|
3
|
+
Copyright (C) 2008 Robert Gamble
|
4
|
+
|
5
|
+
This library is free software; you can redistribute it and/or
|
6
|
+
modify it under the terms of the GNU Lesser General Public
|
7
|
+
License as published by the Free Software Foundation; either
|
8
|
+
version 2.1 of the License, or (at your option) any later version.
|
9
|
+
|
10
|
+
This library is distributed in the hope that it will be useful,
|
11
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
12
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
13
|
+
Lesser General Public License for more details.
|
14
|
+
|
15
|
+
You should have received a copy of the GNU Lesser General Public
|
16
|
+
License along with this library; if not, write to the Free Software
|
17
|
+
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
18
|
+
*/
|
19
|
+
|
20
|
+
/* SIZE_MAX comes from <stdint.h> on C99 and later.  C89 has neither, so fall
   back to the largest value a size_t can hold.  (The macro to test is
   __STDC_VERSION__ with two leading underscores; a misspelt name is simply
   undefined and silently selects the fallback.) */
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
# include <stdint.h>
#endif
#ifndef SIZE_MAX
# define SIZE_MAX ((size_t)-1) /* C89 doesn't have stdint.h or SIZE_MAX */
#endif
|
25
|
+
|
26
|
+
#include "csv.h"
|
27
|
+
|
28
|
+
#define VERSION "3.0.2"
|
29
|
+
|
30
|
+
#define ROW_NOT_BEGUN 0
|
31
|
+
#define FIELD_NOT_BEGUN 1
|
32
|
+
#define FIELD_BEGUN 2
|
33
|
+
#define FIELD_MIGHT_HAVE_ENDED 3
|
34
|
+
|
35
|
+
/*
|
36
|
+
Explanation of states
|
37
|
+
ROW_NOT_BEGUN There have not been any fields encountered for this row
|
38
|
+
FIELD_NOT_BEGUN There have been fields but we are currently not in one
|
39
|
+
FIELD_BEGUN We are in a field
|
40
|
+
FIELD_MIGHT_HAVE_ENDED
|
41
|
+
We encountered a double quote inside a quoted field, the
|
42
|
+
field is either ended or the quote is literal
|
43
|
+
*/
|
44
|
+
|
45
|
+
#define MEM_BLK_SIZE 128
|
46
|
+
|
47
|
+
/* The SUBMIT_* macros operate on the caller's locals quoted, spaces,
   entry_pos, pstate, cb1, cb2 and data, which must be in scope.  Macro
   arguments are parenthesized so any pointer expression may be passed. */

/* Hand the accumulated entry to cb1 (if set) and reset the per-field state.
   For non-quoted fields the counted trailing spaces are dropped first; with
   CSV_APPEND_NULL a terminating '\0' is stored after the entry. */
#define SUBMIT_FIELD(p) \
  do { \
    if (!quoted) \
      entry_pos -= spaces; \
    if ((p)->options & CSV_APPEND_NULL) \
      ((p)->entry_buf[entry_pos]) = '\0'; \
    if (cb1) \
      cb1((p)->entry_buf, entry_pos, data); \
    pstate = FIELD_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* Report the end of a row to cb2 (if set), passing c, and reset the state */
#define SUBMIT_ROW(p, c) \
  do { \
    if (cb2) \
      cb2((c), data); \
    pstate = ROW_NOT_BEGUN; \
    entry_pos = quoted = spaces = 0; \
  } while (0)

/* Append character c to the entry buffer */
#define SUBMIT_CHAR(p, c) ((p)->entry_buf[entry_pos++] = (c))
|
68
|
+
|
69
|
+
/* Messages returned by csv_strerror, indexed by the CSV_* status codes
   (CSV_SUCCESS through CSV_EINVALID) */
static char *csv_errors[] = {
  "success",
  "error parsing data while strict checking enabled",
  "memory exhausted while increasing buffer size",
  "data size too large",
  "invalid status code"
};
|
74
|
+
|
75
|
+
int
|
76
|
+
csv_error(struct csv_parser *p)
|
77
|
+
{
|
78
|
+
/* Return the current status of the parser */
|
79
|
+
return p->status;
|
80
|
+
}
|
81
|
+
|
82
|
+
char *
|
83
|
+
csv_strerror(int status)
|
84
|
+
{
|
85
|
+
/* Return a textual description of status */
|
86
|
+
if (status >= CSV_EINVALID || status < 0)
|
87
|
+
return csv_errors[CSV_EINVALID];
|
88
|
+
else
|
89
|
+
return csv_errors[status];
|
90
|
+
}
|
91
|
+
|
92
|
+
int
|
93
|
+
csv_get_opts(struct csv_parser *p)
|
94
|
+
{
|
95
|
+
/* Return the currently set options of parser */
|
96
|
+
if (p == NULL)
|
97
|
+
return -1;
|
98
|
+
|
99
|
+
return p->options;
|
100
|
+
}
|
101
|
+
|
102
|
+
int
|
103
|
+
csv_set_opts(struct csv_parser *p, unsigned char options)
|
104
|
+
{
|
105
|
+
/* Set the options */
|
106
|
+
if (p == NULL)
|
107
|
+
return -1;
|
108
|
+
|
109
|
+
p->options = options;
|
110
|
+
return 0;
|
111
|
+
}
|
112
|
+
|
113
|
+
int
|
114
|
+
csv_init(struct csv_parser *p, unsigned char options)
|
115
|
+
{
|
116
|
+
/* Initialize a csv_parser object returns 0 on success, -1 on error */
|
117
|
+
if (p == NULL)
|
118
|
+
return -1;
|
119
|
+
|
120
|
+
p->entry_buf = NULL;
|
121
|
+
p->pstate = ROW_NOT_BEGUN;
|
122
|
+
p->quoted = 0;
|
123
|
+
p->spaces = 0;
|
124
|
+
p->entry_pos = 0;
|
125
|
+
p->entry_size = 0;
|
126
|
+
p->status = 0;
|
127
|
+
p->options = options;
|
128
|
+
p->quote_char = CSV_QUOTE;
|
129
|
+
p->delim_char = CSV_COMMA;
|
130
|
+
p->is_space = NULL;
|
131
|
+
p->is_term = NULL;
|
132
|
+
p->blk_size = MEM_BLK_SIZE;
|
133
|
+
p->malloc_func = NULL;
|
134
|
+
p->realloc_func = realloc;
|
135
|
+
p->free_func = free;
|
136
|
+
|
137
|
+
return 0;
|
138
|
+
}
|
139
|
+
|
140
|
+
void
|
141
|
+
csv_free(struct csv_parser *p)
|
142
|
+
{
|
143
|
+
/* Free the entry_buffer of csv_parser object */
|
144
|
+
if (p == NULL)
|
145
|
+
return;
|
146
|
+
|
147
|
+
if (p->entry_buf)
|
148
|
+
p->free_func(p->entry_buf);
|
149
|
+
|
150
|
+
p->entry_buf = NULL;
|
151
|
+
p->entry_size = 0;
|
152
|
+
|
153
|
+
return;
|
154
|
+
}
|
155
|
+
|
156
|
+
int
|
157
|
+
csv_fini(struct csv_parser *p, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
|
158
|
+
{
|
159
|
+
/* Finalize parsing. Needed, for example, when file does not end in a newline */
|
160
|
+
int quoted = p->quoted;
|
161
|
+
int pstate = p->pstate;
|
162
|
+
size_t spaces = p->spaces;
|
163
|
+
size_t entry_pos = p->entry_pos;
|
164
|
+
|
165
|
+
if (p == NULL)
|
166
|
+
return -1;
|
167
|
+
|
168
|
+
|
169
|
+
if (p->pstate == FIELD_BEGUN && p->quoted && p->options & CSV_STRICT && p->options & CSV_STRICT_FINI) {
|
170
|
+
/* Current field is quoted, no end-quote was seen, and CSV_STRICT_FINI is set */
|
171
|
+
p->status = CSV_EPARSE;
|
172
|
+
return -1;
|
173
|
+
}
|
174
|
+
|
175
|
+
switch (p->pstate) {
|
176
|
+
case FIELD_MIGHT_HAVE_ENDED:
|
177
|
+
p->entry_pos -= p->spaces + 1; /* get rid of spaces and original quote */
|
178
|
+
/* Fall-through */
|
179
|
+
case FIELD_NOT_BEGUN:
|
180
|
+
case FIELD_BEGUN:
|
181
|
+
quoted = p->quoted, pstate = p->pstate;
|
182
|
+
spaces = p->spaces, entry_pos = p->entry_pos;
|
183
|
+
SUBMIT_FIELD(p);
|
184
|
+
SUBMIT_ROW(p, -1);
|
185
|
+
case ROW_NOT_BEGUN: /* Already ended properly */
|
186
|
+
;
|
187
|
+
}
|
188
|
+
|
189
|
+
/* Reset parser */
|
190
|
+
p->spaces = p->quoted = p->entry_pos = p->status = 0;
|
191
|
+
p->pstate = ROW_NOT_BEGUN;
|
192
|
+
|
193
|
+
return 0;
|
194
|
+
}
|
195
|
+
|
196
|
+
void
|
197
|
+
csv_set_delim(struct csv_parser *p, unsigned char c)
|
198
|
+
{
|
199
|
+
/* Set the delimiter */
|
200
|
+
if (p) p->delim_char = c;
|
201
|
+
}
|
202
|
+
|
203
|
+
void
|
204
|
+
csv_set_quote(struct csv_parser *p, unsigned char c)
|
205
|
+
{
|
206
|
+
/* Set the quote character */
|
207
|
+
if (p) p->quote_char = c;
|
208
|
+
}
|
209
|
+
|
210
|
+
unsigned char
|
211
|
+
csv_get_delim(struct csv_parser *p)
|
212
|
+
{
|
213
|
+
/* Get the delimiter */
|
214
|
+
return p->delim_char;
|
215
|
+
}
|
216
|
+
|
217
|
+
unsigned char
|
218
|
+
csv_get_quote(struct csv_parser *p)
|
219
|
+
{
|
220
|
+
/* Get the quote character */
|
221
|
+
return p->quote_char;
|
222
|
+
}
|
223
|
+
|
224
|
+
void
|
225
|
+
csv_set_space_func(struct csv_parser *p, int (*f)(unsigned char))
|
226
|
+
{
|
227
|
+
/* Set the space function */
|
228
|
+
if (p) p->is_space = f;
|
229
|
+
}
|
230
|
+
|
231
|
+
void
|
232
|
+
csv_set_term_func(struct csv_parser *p, int (*f)(unsigned char))
|
233
|
+
{
|
234
|
+
/* Set the term function */
|
235
|
+
if (p) p->is_term = f;
|
236
|
+
}
|
237
|
+
|
238
|
+
void
|
239
|
+
csv_set_realloc_func(struct csv_parser *p, void *(*f)(void *, size_t))
|
240
|
+
{
|
241
|
+
/* Set the realloc function used to increase buffer size */
|
242
|
+
if (p && f) p->realloc_func = f;
|
243
|
+
}
|
244
|
+
|
245
|
+
void
|
246
|
+
csv_set_free_func(struct csv_parser *p, void (*f)(void *))
|
247
|
+
{
|
248
|
+
/* Set the free function used to free the buffer */
|
249
|
+
if (p && f) p->free_func = f;
|
250
|
+
}
|
251
|
+
|
252
|
+
void
|
253
|
+
csv_set_blk_size(struct csv_parser *p, size_t size)
|
254
|
+
{
|
255
|
+
/* Set the block size used to increment buffer size */
|
256
|
+
if (p) p->blk_size = size;
|
257
|
+
}
|
258
|
+
|
259
|
+
size_t
|
260
|
+
csv_get_buffer_size(struct csv_parser *p)
|
261
|
+
{
|
262
|
+
/* Get the size of the entry buffer */
|
263
|
+
if (p)
|
264
|
+
return p->entry_size;
|
265
|
+
return 0;
|
266
|
+
}
|
267
|
+
|
268
|
+
static int
|
269
|
+
csv_increase_buffer(struct csv_parser *p)
|
270
|
+
{
|
271
|
+
/* Increase the size of the entry buffer. Attempt to increase size by
|
272
|
+
* p->blk_size, if this is larger than SIZE_MAX try to increase current
|
273
|
+
* buffer size to SIZE_MAX. If allocation fails, try to allocate halve
|
274
|
+
* the size and try again until successful or increment size is zero.
|
275
|
+
*/
|
276
|
+
|
277
|
+
size_t to_add = p->blk_size;
|
278
|
+
void *vp;
|
279
|
+
|
280
|
+
if ( p->entry_size >= SIZE_MAX - to_add )
|
281
|
+
to_add = SIZE_MAX - p->entry_size;
|
282
|
+
|
283
|
+
if (!to_add) {
|
284
|
+
p->status = CSV_ETOOBIG;
|
285
|
+
return -1;
|
286
|
+
}
|
287
|
+
|
288
|
+
while ((vp = p->realloc_func(p->entry_buf, p->entry_size + to_add)) == NULL) {
|
289
|
+
to_add /= 2;
|
290
|
+
if (!to_add) {
|
291
|
+
p->status = CSV_ENOMEM;
|
292
|
+
return -1;
|
293
|
+
}
|
294
|
+
}
|
295
|
+
|
296
|
+
/* Update entry buffer pointer and entry_size if successful */
|
297
|
+
p->entry_buf = vp;
|
298
|
+
p->entry_size += to_add;
|
299
|
+
return 0;
|
300
|
+
}
|
301
|
+
|
302
|
+
size_t
|
303
|
+
csv_parse(struct csv_parser *p, const void *s, size_t len, void (*cb1)(void *, size_t, void *), void (*cb2)(int c, void *), void *data)
|
304
|
+
{
|
305
|
+
unsigned const char *us = s; /* Access input data as array of unsigned char */
|
306
|
+
unsigned char c; /* The character we are currently processing */
|
307
|
+
size_t pos = 0; /* The number of characters we have processed in this call */
|
308
|
+
|
309
|
+
/* Store key fields into local variables for performance */
|
310
|
+
unsigned char delim = p->delim_char;
|
311
|
+
unsigned char quote = p->quote_char;
|
312
|
+
int (*is_space)(unsigned char) = p->is_space;
|
313
|
+
int (*is_term)(unsigned char) = p->is_term;
|
314
|
+
int quoted = p->quoted;
|
315
|
+
int pstate = p->pstate;
|
316
|
+
size_t spaces = p->spaces;
|
317
|
+
size_t entry_pos = p->entry_pos;
|
318
|
+
|
319
|
+
|
320
|
+
if (!p->entry_buf && pos < len) {
|
321
|
+
/* Buffer hasn't been allocated yet and len > 0 */
|
322
|
+
if (csv_increase_buffer(p) != 0) {
|
323
|
+
p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
|
324
|
+
return pos;
|
325
|
+
}
|
326
|
+
}
|
327
|
+
|
328
|
+
while (pos < len) {
|
329
|
+
/* Check memory usage, increase buffer if neccessary */
|
330
|
+
if (entry_pos == ((p->options & CSV_APPEND_NULL) ? p->entry_size - 1 : p->entry_size) ) {
|
331
|
+
if (csv_increase_buffer(p) != 0) {
|
332
|
+
p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
|
333
|
+
return pos;
|
334
|
+
}
|
335
|
+
}
|
336
|
+
|
337
|
+
c = us[pos++];
|
338
|
+
|
339
|
+
switch (pstate) {
|
340
|
+
case ROW_NOT_BEGUN:
|
341
|
+
case FIELD_NOT_BEGUN:
|
342
|
+
if ((is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) && c!=delim) { /* Space or Tab */
|
343
|
+
continue;
|
344
|
+
} else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
|
345
|
+
if (pstate == FIELD_NOT_BEGUN) {
|
346
|
+
SUBMIT_FIELD(p);
|
347
|
+
SUBMIT_ROW(p, (unsigned char)c);
|
348
|
+
} else { /* ROW_NOT_BEGUN */
|
349
|
+
/* Don't submit empty rows by default */
|
350
|
+
if (p->options & CSV_REPALL_NL) {
|
351
|
+
SUBMIT_ROW(p, (unsigned char)c);
|
352
|
+
}
|
353
|
+
}
|
354
|
+
continue;
|
355
|
+
} else if (c == delim) { /* Comma */
|
356
|
+
SUBMIT_FIELD(p);
|
357
|
+
break;
|
358
|
+
} else if (c == quote) { /* Quote */
|
359
|
+
pstate = FIELD_BEGUN;
|
360
|
+
quoted = 1;
|
361
|
+
} else { /* Anything else */
|
362
|
+
pstate = FIELD_BEGUN;
|
363
|
+
quoted = 0;
|
364
|
+
SUBMIT_CHAR(p, c);
|
365
|
+
}
|
366
|
+
break;
|
367
|
+
case FIELD_BEGUN:
|
368
|
+
if (c == quote) { /* Quote */
|
369
|
+
if (quoted) {
|
370
|
+
SUBMIT_CHAR(p, c);
|
371
|
+
pstate = FIELD_MIGHT_HAVE_ENDED;
|
372
|
+
} else {
|
373
|
+
/* STRICT ERROR - double quote inside non-quoted field */
|
374
|
+
if (p->options & CSV_STRICT) {
|
375
|
+
p->status = CSV_EPARSE;
|
376
|
+
p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
|
377
|
+
return pos-1;
|
378
|
+
}
|
379
|
+
SUBMIT_CHAR(p, c);
|
380
|
+
spaces = 0;
|
381
|
+
}
|
382
|
+
} else if (c == delim) { /* Comma */
|
383
|
+
if (quoted) {
|
384
|
+
SUBMIT_CHAR(p, c);
|
385
|
+
} else {
|
386
|
+
SUBMIT_FIELD(p);
|
387
|
+
}
|
388
|
+
} else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
|
389
|
+
if (!quoted) {
|
390
|
+
SUBMIT_FIELD(p);
|
391
|
+
SUBMIT_ROW(p, (unsigned char)c);
|
392
|
+
} else {
|
393
|
+
SUBMIT_CHAR(p, c);
|
394
|
+
}
|
395
|
+
} else if (!quoted && (is_space? is_space(c) : c == CSV_SPACE || c == CSV_TAB)) { /* Tab or space for non-quoted field */
|
396
|
+
SUBMIT_CHAR(p, c);
|
397
|
+
spaces++;
|
398
|
+
} else { /* Anything else */
|
399
|
+
SUBMIT_CHAR(p, c);
|
400
|
+
spaces = 0;
|
401
|
+
}
|
402
|
+
break;
|
403
|
+
case FIELD_MIGHT_HAVE_ENDED:
|
404
|
+
/* This only happens when a quote character is encountered in a quoted field */
|
405
|
+
if (c == delim) { /* Comma */
|
406
|
+
entry_pos -= spaces + 1; /* get rid of spaces and original quote */
|
407
|
+
SUBMIT_FIELD(p);
|
408
|
+
} else if (is_term ? is_term(c) : c == CSV_CR || c == CSV_LF) { /* Carriage Return or Line Feed */
|
409
|
+
entry_pos -= spaces + 1; /* get rid of spaces and original quote */
|
410
|
+
SUBMIT_FIELD(p);
|
411
|
+
SUBMIT_ROW(p, (unsigned char)c);
|
412
|
+
} else if (is_space ? is_space(c) : c == CSV_SPACE || c == CSV_TAB) { /* Space or Tab */
|
413
|
+
SUBMIT_CHAR(p, c);
|
414
|
+
spaces++;
|
415
|
+
} else if (c == quote) { /* Quote */
|
416
|
+
if (spaces) {
|
417
|
+
/* STRICT ERROR - unescaped double quote */
|
418
|
+
if (p->options & CSV_STRICT) {
|
419
|
+
p->status = CSV_EPARSE;
|
420
|
+
p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
|
421
|
+
return pos-1;
|
422
|
+
}
|
423
|
+
spaces = 0;
|
424
|
+
SUBMIT_CHAR(p, c);
|
425
|
+
} else {
|
426
|
+
/* Two quotes in a row */
|
427
|
+
pstate = FIELD_BEGUN;
|
428
|
+
}
|
429
|
+
} else { /* Anything else */
|
430
|
+
/* STRICT ERROR - unescaped double quote */
|
431
|
+
if (p->options & CSV_STRICT) {
|
432
|
+
p->status = CSV_EPARSE;
|
433
|
+
p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
|
434
|
+
return pos-1;
|
435
|
+
}
|
436
|
+
pstate = FIELD_BEGUN;
|
437
|
+
spaces = 0;
|
438
|
+
SUBMIT_CHAR(p, c);
|
439
|
+
}
|
440
|
+
break;
|
441
|
+
default:
|
442
|
+
break;
|
443
|
+
}
|
444
|
+
}
|
445
|
+
p->quoted = quoted, p->pstate = pstate, p->spaces = spaces, p->entry_pos = entry_pos;
|
446
|
+
return pos;
|
447
|
+
}
|
448
|
+
|
449
|
+
size_t
csv_write (void *dest, size_t dest_size, const void *src, size_t src_size)
{
  /* Write src_size bytes of src into dest as a double-quoted field, doubling
   * every embedded '"'.  At most dest_size bytes are stored and no
   * terminator is appended.  Returns the number of bytes the complete
   * field needs (capped at SIZE_MAX), or 0 when src is NULL.  A NULL dest
   * is treated as a zero-sized buffer.
   */
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t needed = 0;

  if (!src)
    return 0;

  if (!out)
    dest_size = 0;

  /* Opening quote */
  if (dest_size > 0)
    *out++ = '"';
  needed++;

  for (; src_size > 0; src_size--, in++) {
    if (*in == '"') {
      /* Escape an embedded quote by doubling it */
      if (needed < dest_size)
        *out++ = '"';
      if (needed < SIZE_MAX) needed++;
    }
    if (needed < dest_size)
      *out++ = *in;
    if (needed < SIZE_MAX) needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    *out = '"';
  if (needed < SIZE_MAX) needed++;

  return needed;
}
|
485
|
+
|
486
|
+
int
csv_fwrite (FILE *fp, const void *src, size_t src_size)
{
  /* Write src_size bytes of src to fp as a double-quoted field, doubling
   * every embedded '"'.  Returns 0 on success (and also when fp or src is
   * NULL, in which case nothing is written), EOF as soon as a write fails.
   */
  const unsigned char *in = src;
  const unsigned char *end;

  if (!fp || !src)
    return 0;

  if (fputc('"', fp) == EOF)
    return EOF;

  for (end = in + src_size; in != end; in++) {
    /* An embedded quote is preceded by an extra quote */
    if (*in == '"' && fputc('"', fp) == EOF)
      return EOF;
    if (fputc(*in, fp) == EOF)
      return EOF;
  }

  return (fputc('"', fp) == EOF) ? EOF : 0;
}
|
514
|
+
|
515
|
+
size_t
csv_write2 (void *dest, size_t dest_size, const void *src, size_t src_size, unsigned char quote)
{
  /* Same as csv_write, but with a caller-chosen quote character: the field
   * is wrapped in quote and every embedded occurrence of quote is doubled.
   * At most dest_size bytes are stored and no terminator is appended.
   * Returns the number of bytes the complete field needs (capped at
   * SIZE_MAX), or 0 when src is NULL.  A NULL dest is treated as a
   * zero-sized buffer.
   */
  unsigned char *out = dest;
  const unsigned char *in = src;
  size_t needed = 0;

  if (!src)
    return 0;

  if (!dest)
    dest_size = 0;

  /* Opening quote */
  if (dest_size > 0)
    *out++ = quote;
  needed++;

  for (; src_size > 0; src_size--, in++) {
    if (*in == quote) {
      /* Escape an embedded quote by doubling it */
      if (needed < dest_size)
        *out++ = quote;
      if (needed < SIZE_MAX) needed++;
    }
    if (needed < dest_size)
      *out++ = *in;
    if (needed < SIZE_MAX) needed++;
  }

  /* Closing quote */
  if (needed < dest_size)
    *out = quote;
  if (needed < SIZE_MAX) needed++;

  return needed;
}
|
551
|
+
|
552
|
+
int
csv_fwrite2 (FILE *fp, const void *src, size_t src_size, unsigned char quote)
{
  /* Same as csv_fwrite, but with a caller-chosen quote character: the field
   * is wrapped in quote and every embedded occurrence of quote is doubled.
   * Returns 0 on success (and also when fp or src is NULL, in which case
   * nothing is written), EOF as soon as a write fails.
   */
  const unsigned char *in = src;
  const unsigned char *end;

  if (!fp || !src)
    return 0;

  if (fputc(quote, fp) == EOF)
    return EOF;

  for (end = in + src_size; in != end; in++) {
    /* An embedded quote is preceded by an extra quote */
    if (*in == quote && fputc(quote, fp) == EOF)
      return EOF;
    if (fputc(*in, fp) == EOF)
      return EOF;
  }

  return (fputc(quote, fp) == EOF) ? EOF : 0;
}
|