csvreader 1.2.1 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +13 -12
- data/README.md +682 -677
- data/Rakefile +33 -26
- data/{test/data → datasets}/beer.csv +0 -0
- data/{test/data → datasets}/beer11.csv +0 -0
- data/{test/data → datasets}/cars11.csv +10 -10
- data/{test/data → datasets}/cities11.csv +12 -12
- data/{test/data → datasets}/customers11.csv +13 -13
- data/{test/data → datasets}/iris.attrib.csv +25 -25
- data/{test/data → datasets}/iris11.csv +163 -163
- data/{test/data → datasets}/lcc.attrib.csv +14 -14
- data/{test/data → datasets}/shakespeare.csv +9 -9
- data/{test/data → datasets}/test.csv +0 -0
- data/lib/csvreader/base.rb +36 -2
- data/lib/csvreader/buffer.rb +0 -1
- data/lib/csvreader/builder.rb +0 -1
- data/lib/csvreader/converter.rb +0 -1
- data/lib/csvreader/parser.rb +32 -33
- data/lib/csvreader/parser_fixed.rb +105 -106
- data/lib/csvreader/parser_json.rb +23 -5
- data/lib/csvreader/parser_std.rb +582 -534
- data/lib/csvreader/parser_strict.rb +290 -291
- data/lib/csvreader/parser_tab.rb +22 -62
- data/lib/csvreader/parser_table.rb +122 -123
- data/lib/csvreader/parser_yaml.rb +23 -0
- data/lib/csvreader/reader.rb +2 -3
- data/lib/csvreader/reader_hash.rb +3 -2
- data/lib/csvreader/version.rb +30 -32
- data/lib/csvreader.rb +0 -1
- data/test/helper.rb +1 -1
- data/test/test_parser_autofix.rb +28 -0
- data/test/test_parser_formats.rb +66 -66
- data/test/test_parser_java.rb +208 -208
- metadata +72 -25
- data/LICENSE.md +0 -116
@@ -1,291 +1,290 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
##
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
l =
|
24
|
-
l
|
25
|
-
|
26
|
-
end
|
27
|
-
def
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
@config
|
42
|
-
@config[:
|
43
|
-
@config[:
|
44
|
-
@config[:
|
45
|
-
@config[:
|
46
|
-
@config[:
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
##
|
53
|
-
##
|
54
|
-
|
55
|
-
def
|
56
|
-
def
|
57
|
-
def
|
58
|
-
def
|
59
|
-
def
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
input.
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
input.
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
##
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
##
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
value
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
input.getc
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
## no
|
245
|
-
## no
|
246
|
-
##
|
247
|
-
##
|
248
|
-
##
|
249
|
-
##
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
##
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
end # class
|
291
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader

  ##
  #  Character-by-character CSV parser that applies "strict" rules:
  #    - no leading or trailing whitespace is trimmed/stripped
  #    - no blank lines are skipped
  #    - comment lines are skipped only if a comment char is configured
  #
  #  The dialect (separator, quote char, doubled-up quotes, backslash
  #  escapes, null values, comment char) is kept in the config hash;
  #  see #initialize for the available settings.
  #
  #  Input is read through a Buffer; this class only relies on the
  #  buffer's peek, getc and eof? methods.
  class ParserStrict

    ## char constants
    BACKSLASH = "\\".freeze   ## use BACKSLASH_ESCAPE ??
    LF        = "\n".freeze   ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
    CR        = "\r".freeze   ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)


    ###################################
    ## add simple logger with debug flag/switch
    #
    #  use ParserStrict.logger.level = :debug    # to turn on
    #
    #  todo/fix: use logutils instead of std logger - why? why not?

    ## Builds a new logger that writes to STDOUT.
    def self.build_logger()
      l = Logger.new( STDOUT )
      l.level = :info    ## set to :info on start; note: is 0 (debug) by default
      l
    end

    ## Returns the logger shared by all parsers of this class (built on first use).
    ##   note: kept as a class variable on purpose; subclasses share the same logger
    def self.logger() @@logger ||= build_logger; end

    ## Instance-level shortcut for the shared class-level logger.
    def logger()  self.class.logger; end



    attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

    ##
    #  sep         - field separator char
    #  quote       - quote char; set to false/nil for no quote
    #  doublequote - true/false; a doubled-up quote inside a quoted value
    #                is read as one literal quote char
    #  escape      - true/false; turns on backslash escapes
    #  null        - nil (for no null values / not available (na)) or
    #                a String, an Array or a Proc; see is_null?
    #  comment     - comment char e.g. # or false/nil
    def initialize( sep:         ',',
                    quote:       '"',
                    doublequote: true,
                    escape:      false,
                    null:        nil,
                    comment:     false
                  )
      ## todo/fix: change config to proper dialect class/struct - why? why not?
      @config = { sep:         sep,
                  quote:       quote,
                  doublequote: doublequote,
                  escape:      escape,
                  null:        null,
                  comment:     comment }
    end


    #########################################
    ## config convenience helpers
    ##   e.g. use like  Csv.mysql.sep = ','  etc.   instead of
    ##                  Csv.mysql.config[:sep] = ','
    def sep=( value )          @config[:sep]=value; end
    def quote=( value )        @config[:quote]=value; end
    def doublequote=( value )  @config[:doublequote]=value; end
    def escape=( value )       @config[:escape]=value; end
    def null=( value )         @config[:null]=value; end
    def comment=( value )      @config[:comment]=value; end



    ##
    #  Parses data record by record.
    #
    #  data - a Buffer (used as is; allows (re)use of a buffer managed
    #         from "outside") or anything else, which gets wrapped
    #         into a new Buffer
    #  sep  - field separator; defaults to the configured separator
    #
    #  With a block, every record (array of values) is passed to the
    #  block. Without a block, all records are collected and returned
    #  as an array of records.
    def parse( data, sep: config[:sep], &block )
      ## make sure data is wrapped into a Buffer
      input = data.is_a?( Buffer ) ? data : Buffer.new( data )

      if block_given?
        parse_lines( input, sep: sep, &block )
      else
        records = []
        parse_lines( input, sep: sep ) { |record| records << record }
        records
      end
    end ## method parse



    private

    ##
    #  Reads one backslash escape sequence and returns the resulting text.
    #
    #  If the backslash is followed by a backslash, LF, CR, the separator
    #  or the configured quote char, that char gets consumed and returned.
    #  Otherwise only the backslash is returned and the following char
    #  is left in the input.
    #
    #  Raises ParseError if the next char is not a backslash.
    def parse_escape( input, sep: )
      unless input.peek == BACKSLASH
        raise ParseError, "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!"
      end

      value = ""
      quote = config[:quote]

      input.getc     ## eat-up backslash
      c = input.peek
      if c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote)
        value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
      else
        ## unknown escape sequence; no special handling/escaping
        value << BACKSLASH
      end
      value
    end



    ##
    #  Reads a quoted value (without the enclosing quotes) and returns it.
    #
    #  Reading stops after the closing quote or at the end of input;
    #  note: a missing closing quote at the end of input is NOT reported.
    #
    #  Raises ParseError if the next char is not the configured quote char.
    def parse_quote( input, sep: )
      quote       = config[:quote]        # char (e.g.",') | nil
      doublequote = config[:doublequote]  # true|false
      escape      = config[:escape]       # true|false

      unless input.peek == quote
        raise ParseError, "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!"
      end

      value = ""
      input.getc  ## eat-up opening quote

      loop do
        ## eat-up everything until hitting quote or backslash (escape) or end of input
        while (c=input.peek; !(c==quote || input.eof? || (escape && c==BACKSLASH)))
          value << input.getc
        end

        if input.eof?
          break
        elsif input.peek == BACKSLASH
          value << parse_escape( input, sep: sep )
        else   ## assume input.peek == quote
          input.getc ## eat-up quote
          if doublequote && input.peek == quote  ## doubled up quote?
            value << input.getc   ## add quote and continue!!!!
          else
            break    ## closing quote - done
          end
        end
      end

      value
    end



    ##
    #  Reads one field and returns its value (or nil for null values).
    #
    #  note: the null check is only applied to UNQUOTED values
    #    (including the empty unquoted value);
    #    quoted values are always returned as is.
    def parse_field( input, sep: )
      value = ""

      quote  = config[:quote]
      escape = config[:escape]

      logger.debug "parse field - sep: >#{sep}< (#{sep.ord})"  if logger.debug?

      if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
        ## note: allows null = '' that is turn unquoted empty strings into null/nil
        ##   or if using numeric into NotANumber (NaN)
        ## otherwise keep value as is :-) e.g. "".
        value = nil  if is_null?( value )
      elsif quote && input.peek == quote
        logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
        value << parse_quote( input, sep: sep )
        logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      else
        logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
        ## consume simple value
        ##   until we hit the separator or "\n" or "\r" or a stray quote
        while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof? || (quote && c==quote)))
          if escape && input.peek == BACKSLASH
            value << parse_escape( input, sep: sep )
          else
            logger.debug "  add char >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
            value << input.getc
          end
        end

        ## note: null check only for UNQUOTED (not quoted/escaped) values
        value = nil  if is_null?( value )

        logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
      end

      value
    end



    ##
    #  Reads all fields up to (and including) the end of the line or
    #  the end of input and returns them as an array of values.
    #
    #  Raises ParseError if a field is followed by anything but the
    #  separator, a newline or the end of input
    #  (e.g. by text after a closing quote or by a stray quote).
    def parse_record( input, sep: )
      values = []

      loop do
        value = parse_field( input, sep: sep )
        logger.debug "value: »#{value}«"  if logger.debug?
        values << value

        if input.eof?
          break
        elsif (c=input.peek; c==LF || c==CR)
          skip_newline( input )   ## note: singular / single newline only (NOT plural)
          break
        elsif input.peek == sep
          input.getc   ## eat-up separator
        else
          ## note: report the separator in use (not always a comma)
          raise ParseError, "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!"
        end
      end

      values
    end



    ## Skips a single newline, that is, CR LF or LF or CR (if present).
    def skip_newline( input )    ## note: singular (strict) version
      return if input.eof?

      if input.peek == CR
        input.getc ## eat-up
        input.getc if input.peek == LF
      elsif input.peek == LF
        input.getc ## eat-up
      end
    end



    ## Skips all chars up to (but NOT including) the next LF or CR or the end of input.
    def skip_until_eol( input )
      return if input.eof?

      while (c=input.peek; !(c==LF || c==CR || input.eof?))
        input.getc    ## eat-up all until end of line
      end
    end



    ##
    #  Reads input until the end and calls the block for every record.
    #
    #  Lines starting with the comment char (if configured) are skipped.
    #
    #   no leading and trailing whitespaces trimmed/stripped
    #   no blanks skipped
    #
    #  note: this csv format is NOT recommended;
    #    please, use a format with comments, leading and trailing whitespaces, etc.
    #    only added for checking compatibility
    def parse_lines( input, sep:, &block )
      comment = config[:comment]

      loop do
        break if input.eof?

        logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?

        if comment && input.peek == comment        ## comment line
          logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})"  if logger.debug?
          skip_until_eol( input )
          skip_newline( input )
        else
          record = parse_record( input, sep: sep )
          ## note: requires block - enforce? how? why? why not?
          block.call( record )   ## yield( record )
        end
      end  # loop

    end # method parse_lines



    ##
    #  Checks value against the configured null setting:
    #    nil     - nothing set; never null
    #    Proc    - result of calling the proc with value
    #    Array   - true if the array includes value
    #    String  - true if value equals the string
    #  Any other setting is ignored (never null).
    def is_null?( value )
      null = @config[:null]
      case null
      when nil    then false     ## nothing set; return always false (not null)
      when Proc   then null.call( value )
      when Array  then null.include?( value )
      when String then value == null
      else
        ## unknown config style / setting
        ##  todo: issue a warning or error - why? why not?
        false
      end
    end


  end # class ParserStrict
end # class CsvReader
|