csvreader 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -2
- data/README.md +682 -682
- data/Rakefile +33 -32
- data/datasets/cars11.csv +10 -10
- data/datasets/cities11.csv +12 -12
- data/datasets/customers11.csv +13 -13
- data/datasets/iris.attrib.csv +25 -25
- data/datasets/iris11.csv +163 -163
- data/datasets/lcc.attrib.csv +14 -14
- data/datasets/shakespeare.csv +9 -9
- data/lib/csvreader/base.rb +6 -2
- data/lib/csvreader/buffer.rb +0 -1
- data/lib/csvreader/builder.rb +0 -1
- data/lib/csvreader/converter.rb +0 -1
- data/lib/csvreader/parser.rb +32 -33
- data/lib/csvreader/parser_fixed.rb +105 -106
- data/lib/csvreader/parser_json.rb +23 -24
- data/lib/csvreader/parser_std.rb +582 -583
- data/lib/csvreader/parser_strict.rb +290 -291
- data/lib/csvreader/parser_tab.rb +22 -23
- data/lib/csvreader/parser_table.rb +122 -123
- data/lib/csvreader/parser_yaml.rb +23 -24
- data/lib/csvreader/reader.rb +2 -3
- data/lib/csvreader/reader_hash.rb +1 -2
- data/lib/csvreader/version.rb +30 -32
- data/lib/csvreader.rb +0 -1
- data/test/test_parser_formats.rb +66 -66
- data/test/test_parser_java.rb +208 -208
- metadata +18 -15
- data/LICENSE.md +0 -116
@@ -1,291 +1,290 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
##
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
l =
|
24
|
-
l
|
25
|
-
|
26
|
-
end
|
27
|
-
def
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
@config
|
42
|
-
@config[:
|
43
|
-
@config[:
|
44
|
-
@config[:
|
45
|
-
@config[:
|
46
|
-
@config[:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
##
|
53
|
-
##
|
54
|
-
|
55
|
-
def
|
56
|
-
def
|
57
|
-
def
|
58
|
-
def
|
59
|
-
def
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
input.
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
input.
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
##
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
##
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
value
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
input.getc
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
## no
|
245
|
-
## no
|
246
|
-
##
|
247
|
-
##
|
248
|
-
##
|
249
|
-
##
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
##
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
end # class
|
291
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader

  ##
  ## Strict CSV parser.
  ##
  ##  Reads the input one character at a time (via peek / getc / eof?)
  ##  and turns it into records, that is, arrays of values
  ##  where every value is a string or nil (see the null option).
  ##
  ##  What "strict" means here (as implemented below):
  ##   - leading and trailing whitespace is kept as is (never trimmed)
  ##   - blank lines are NOT skipped
  ##   - a quote char inside an unquoted value raises a ParseError
  ##   - comment lines get skipped only if a comment char is configured
  class ParserStrict

    ## char constants
    BACKSLASH = "\\"    ## use BACKSLASH_ESCAPE ??
    LF        = "\n"    ##  \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
    CR        = "\r"    ##  \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)


    ###################################
    ## add simple logger with debug flag/switch
    #
    #  use ParserStrict.logger.level = :debug    # to turn on debug output
    #
    #  todo/fix: use logutils instead of std logger - why? why not?

    ## builds the logger (writes to STDOUT) shared by all parser instances
    def self.build_logger()
      l = Logger.new( STDOUT )
      l.level = :info    ## set to :info on start; note: is 0 (debug) by default
      l
    end

    ## lazily built on first use; one logger for the class
    def self.logger() @@logger ||= build_logger; end
    def logger()  self.class.logger; end



    attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

    ##
    ## sep         - field separator char
    ## quote       - quote char; set to false/nil for no quote
    ## doublequote - true/false; doubled-up quote inside a quoted value
    ##                 stands for one quote char
    ## escape      - true/false; turns on backslash escapes
    ## null        - nil (off) | String | Array | callable;
    ##                 UNQUOTED values matching get turned into nil
    ## comment     - comment char e.g. # or false/nil
    def initialize( sep:         ',',
                    quote:       '"',    ## note: set to false/nil for no quote
                    doublequote: true,
                    escape:      false,  ## true/false
                    null:        nil,    ## note: set to nil for no null values / not available (na)
                    comment:     false   ## note: comment char e.g. # or false/nil
                  )
      @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?
      @config[:sep]         = sep
      @config[:quote]       = quote
      @config[:doublequote] = doublequote
      @config[:escape]      = escape
      @config[:null]        = null
      @config[:comment]     = comment
    end


    #########################################
    ## config convenience helpers
    ##   e.g. use like  Csv.mysql.sep = ','  etc.   instead of
    ##                  Csv.mysql.config[:sep] = ','
    def sep=( value )         @config[:sep]=value; end
    def quote=( value )       @config[:quote]=value; end
    def doublequote=( value ) @config[:doublequote]=value; end
    def escape=( value )      @config[:escape]=value; end
    def null=( value )        @config[:null]=value; end
    def comment=( value )     @config[:comment]=value; end



    ##
    ## Parses data into records.
    ##
    ##  data - a Buffer (used as is) or anything Buffer.new accepts
    ##           (a String or IO object according to the original notes)
    ##  sep  - field separator for this call; defaults to config[:sep]
    ##
    ##  With a block every record gets passed to the block
    ##    (the return value then is whatever the record loop returns);
    ##  without a block all records get returned in an array.
    def parse( data, sep: config[:sep], &block )
      ## note: data - will wrap either a String or IO object passed in data

      ## make sure data (string or io) is a wrapped into Buffer!!!!!!
      if data.is_a?( Buffer )    ### allow (re)use of Buffer if managed from "outside"
        input = data
      else
        input = Buffer.new( data )
      end


      if block_given?
        parse_lines( input, sep: sep, &block )
      else
        records = []

        parse_lines( input, sep: sep ) do |record|
          records << record
        end

        records
      end

    end ## method parse



    private

    ##
    ## Consumes a backslash escape and returns the resulting text.
    ##   Backslash followed by backslash, LF, CR, sep or the quote char
    ##     returns that following char (and consumes it);
    ##   any other sequence returns the backslash only
    ##     (the following char stays in the input).
    ##   Raises ParseError if the input is not at a backslash.
    def parse_escape( input, sep: )
      ## note: dup - never mutate a string literal (frozen / "chilled" string literals)
      value = "".dup

      quote = config[:quote]

      if input.peek == BACKSLASH
        input.getc ## eat-up backslash
        if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote) )
          value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
        else
          ## unknown escape sequence; no special handling/escaping
          value << BACKSLASH
        end
      else
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
      end
      value
    end



    ##
    ## Consumes a quoted value (incl. opening and closing quote)
    ##   and returns the text in-between.
    ##   Hitting the end of input before the closing quote ends the value
    ##     (no error raised).
    ##   Raises ParseError if the input is not at the quote char.
    def parse_quote( input, sep: )
      ## note: dup - never mutate a string literal (frozen / "chilled" string literals)
      value = "".dup

      quote        = config[:quote]         # char (e.g.",') | nil
      doublequote  = config[:doublequote]   # true|false
      escape       = config[:escape]        # true|false

      if input.peek == quote
        input.getc  ## eat-up double_quote

        loop do
          while (c=input.peek; !(c==quote || input.eof? || (escape && c==BACKSLASH)))
            value << input.getc   ## eat-up everything until hitting double_quote (") or backslash (escape)
          end

          if input.eof?
            break
          elsif input.peek == BACKSLASH
            ## note: only reachable with escape turned on
            ##        (otherwise the loop above eats up the backslash as regular char)
            value << parse_escape( input, sep: sep )
          else   ## assume input.peek == DOUBLE_QUOTE
            input.getc ## eat-up double_quote
            if doublequote && input.peek == quote  ## doubled up quote?
              value << input.getc   ## add double quote and continue!!!!
            else
              break
            end
          end
        end
      else
        ## note(review): message still says parse_double_quote and DOUBLE_QUOTE (")
        ##    even if another quote char is configured; kept as is for compatibility
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
      end
      value
    end



    ##
    ## Consumes one field and returns its value (String or nil).
    ##   The input is left at the char following the field
    ##     (sep, newline, end of input or - on errors - something else).
    def parse_field( input, sep: )
      ## note: dup - never mutate a string literal (frozen / "chilled" string literals)
      value = "".dup

      quote  = config[:quote]
      escape = config[:escape]

      logger.debug "parse field - sep: >#{sep}< (#{sep.ord})"  if logger.debug?

      if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
        ## note: allows null = '' that is turn unquoted empty strings into null/nil
        ##   or if using numeric into NotANumber (NaN)
        value = nil  if is_null?( value )
        ## do nothing - keep value as is :-) e.g. "".
      elsif quote && input.peek == quote
        logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
        value << parse_quote( input, sep: sep )
        logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      else
        logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
        ## consume simple value
        ##   until we hit "," or "\n" or "\r" or stray (double) quote e.g (")
        while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof? || (quote && c==quote)))
          if escape && input.peek == BACKSLASH
            value << parse_escape( input, sep: sep )
          else
            logger.debug "  add char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
            value << input.getc
          end
        end


        value = nil  if is_null?( value )   ## note: null check only for UNQUOTED (not quoted/escaped) values
        # do nothing - keep value as is :-).

        logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      end

      value
    end



    ##
    ## Consumes one record (line) and returns its values in an array.
    ##   Raises ParseError if a field is followed by anything else
    ##     than sep, a newline or the end of input.
    def parse_record( input, sep: )
      values = []

      loop do
        value = parse_field( input, sep: sep )
        logger.debug "value: »#{value}«"  if logger.debug?
        values << value

        if input.eof?
          break
        elsif (c=input.peek; c==LF || c==CR)
          skip_newline( input )   ## note: singular / single newline only (NOT plural)
          break
        elsif input.peek == sep
          input.getc  ## eat-up FS (,)
        else
          raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
        end
      end

      values
    end



    ## consumes exactly one line break (if present)
    def skip_newline( input )    ## note: singular (strict) version
      return if input.eof?

      ## only skip CR LF or LF or CR
      if input.peek == CR
        input.getc ## eat-up
        input.getc if input.peek == LF
      elsif input.peek == LF
        input.getc ## eat-up
      else
        # do nothing
      end
    end



    ## consumes everything up to (but NOT including) the next line break
    def skip_until_eol( input )
      return if input.eof?

      while (c=input.peek; !(c==LF || c==CR || input.eof?))
        input.getc    ## eat-up all until end of line
      end
    end



    ##
    ## Record loop - passes every record to the block
    ##   until the end of input gets reached.
    def parse_lines( input, sep:, &block )
      ## no leading and trailing whitespaces trimmed/stripped
      ## no blanks skipped
      ## comment lines get skipped ONLY if a comment char is configured
      ##   (note: the comment char must be the very first char of the line)
      ##
      ## note(review): the original note here ("follows strict rules of ...")
      ##   was cut off; presumably referring to a csv standard - to be confirmed
      ##
      ## note: this csv format is NOT recommended;
      ##    please, use a format with comments, leading and trailing whitespaces, etc.
      ##    only added for checking compatibility

      comment = config[:comment]

      loop do
        break if input.eof?

        logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?

        if comment && input.peek == comment        ## comment line
          logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
          skip_until_eol( input )
          skip_newline( input )
        else
          record = parse_record( input, sep: sep )
          ## note: requires block - enforce? how? why? why not?
          block.call( record )   ## yield( record )
        end
      end  # loop

    end # method parse_lines


    ##
    ## Checks if an (unquoted) value counts as null
    ##   according to config[:null]:
    ##    nil      - null handling off; always false
    ##    callable - anything responding to call (Proc, lambda, Method, ...)
    ##                 gets called with the value
    ##    Array    - true if the array includes the value
    ##    String   - true if equal to the value
    ##    else     - unknown setting; always false
    def is_null?( value )
      null = @config[:null]
      if null.nil?
        false  ## nothing set; return always false (not null)
      elsif null.respond_to?( :call )   ## note: duck typing - NOT limited to Proc
        null.call( value )
      elsif null.is_a?( Array )
        null.include?( value )
      elsif null.is_a?( String )
        value == null
      else  ## unknown config style / setting
        ##  todo: issue a warning or error - why? why not?
        false  ## nothing set; return always false (not null)
      end
    end


  end # class ParserStrict
end # class CsvReader
|
data/lib/csvreader/parser_tab.rb
CHANGED
@@ -1,23 +1,22 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
##
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end # class
|
23
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader

  ##
  ## Parser for tab-separated data; hands off all the work to TabReader.
  class ParserTab

    ##
    ## data   - passed on as is to TabReader.new
    ##            (string or io/file handle according to the original notes)
    ## kwargs - accepted but ignored; only here to match
    ##            the parse "protocol/interface" of the other parsers
    ##
    ## With a block every record gets passed to the block;
    ##   without a block all records get returned in an array.
    def parse( data, **kwargs, &block )
      reader = TabReader.new( data )

      return reader.to_a  unless block_given?

      reader.each( &block )
    end ## method parse

  end # class ParserTab
end # class CsvReader
|