csvreader 1.2.4 → 1.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -2
- data/README.md +682 -682
- data/Rakefile +33 -32
- data/datasets/cars11.csv +10 -10
- data/datasets/cities11.csv +12 -12
- data/datasets/customers11.csv +13 -13
- data/datasets/iris.attrib.csv +25 -25
- data/datasets/iris11.csv +163 -163
- data/datasets/lcc.attrib.csv +14 -14
- data/datasets/shakespeare.csv +9 -9
- data/lib/csvreader/base.rb +6 -2
- data/lib/csvreader/buffer.rb +0 -1
- data/lib/csvreader/builder.rb +0 -1
- data/lib/csvreader/converter.rb +0 -1
- data/lib/csvreader/parser.rb +32 -33
- data/lib/csvreader/parser_fixed.rb +105 -106
- data/lib/csvreader/parser_json.rb +23 -24
- data/lib/csvreader/parser_std.rb +582 -583
- data/lib/csvreader/parser_strict.rb +290 -291
- data/lib/csvreader/parser_tab.rb +22 -23
- data/lib/csvreader/parser_table.rb +122 -123
- data/lib/csvreader/parser_yaml.rb +23 -24
- data/lib/csvreader/reader.rb +2 -3
- data/lib/csvreader/reader_hash.rb +1 -2
- data/lib/csvreader/version.rb +30 -32
- data/lib/csvreader.rb +0 -1
- data/test/test_parser_formats.rb +66 -66
- data/test/test_parser_java.rb +208 -208
- metadata +18 -15
- data/LICENSE.md +0 -116
data/lib/csvreader/parser_std.rb
CHANGED
@@ -1,583 +1,582 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
l =
|
35
|
-
l
|
36
|
-
|
37
|
-
end
|
38
|
-
def
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
attr_reader :
|
45
|
-
|
46
|
-
|
47
|
-
##
|
48
|
-
##
|
49
|
-
##
|
50
|
-
##
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
##
|
65
|
-
##
|
66
|
-
##
|
67
|
-
@config[:
|
68
|
-
@config[:
|
69
|
-
|
70
|
-
|
71
|
-
##
|
72
|
-
##
|
73
|
-
##
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
##
|
78
|
-
##
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
##
|
90
|
-
##
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
##
|
103
|
-
##
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
def
|
108
|
-
def
|
109
|
-
def
|
110
|
-
def
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
## note:
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
input.
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
input.
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
##
|
204
|
-
##
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
##
|
215
|
-
|
216
|
-
value
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
##
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
##
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
##
|
259
|
-
|
260
|
-
|
261
|
-
##
|
262
|
-
|
263
|
-
##
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
##
|
301
|
-
##
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
##
|
308
|
-
##
|
309
|
-
##
|
310
|
-
##
|
311
|
-
##
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
##
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
value =
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
##
|
382
|
-
|
383
|
-
|
384
|
-
input.getc
|
385
|
-
input.getc
|
386
|
-
|
387
|
-
|
388
|
-
##
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
## ::YAML.load("")
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
##
|
410
|
-
input.getc
|
411
|
-
input.getc
|
412
|
-
input
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
##
|
424
|
-
|
425
|
-
|
426
|
-
data
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
input.getc
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
##
|
476
|
-
|
477
|
-
|
478
|
-
##
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
comment
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
##
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
##
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
##
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
end # class
|
583
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
class ParserStd
|
9
|
+
|
10
|
+
|
11
|
+
## char constants
##   note: all frozen - the parser only ever compares against or appends these;
##     freezing makes sure an accidental in-place change (e.g. via <<)
##     raises instead of silently changing a delimiter for every parser
DOUBLE_QUOTE    = "\"".freeze
SINGLE_QUOTE    = "'".freeze
BACKSLASH       = "\\".freeze  ## use BACKSLASH_ESCAPE ??
COMMENT_HASH    = "#".freeze   ## use COMMENT1 or COMMENT_HASH or HASH or ??
COMMENT_PERCENT = "%".freeze   ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
DIRECTIVE       = "@".freeze   ## use a different name e.g. AT or ??
SPACE           = " ".freeze   ## \s == ASCII 32 (dec) = (Space)
TAB             = "\t".freeze  ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
LF              = "\n".freeze  ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
CR              = "\r".freeze  ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
###################################
|
26
|
+
## add simple logger with debug flag/switch
|
27
|
+
#
|
28
|
+
# use Parser.debug = true # to turn on
|
29
|
+
#
|
30
|
+
# todo/fix: use logutils instead of std logger - why? why not?
|
31
|
+
|
32
|
+
## builds the default logger; writes to STDOUT
##   note: starts at level :info - a fresh Logger is at 0 (debug) by default
def self.build_logger
  Logger.new( STDOUT ).tap do |log|
    log.level = :info
  end
end
|
37
|
+
## class-wide logger; gets built on first use (see build_logger) and then reused
def self.logger
  @@logger ||= build_logger
end

## instance-level shortcut for the class-wide logger
def logger
  self.class.logger
end
|
39
|
+
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
## readers for
##   config - the parser settings hash (sep, null, numeric, nan, space, hashtag)
##              todo/fix: change config to proper dialect class/struct - why? why not?
##   meta   - the (optional) meta data block; reset to nil at the start of every parse
attr_reader :config, :meta
|
45
|
+
|
46
|
+
##
|
47
|
+
## todo/check:
|
48
|
+
## null values - include NA - why? why not?
|
49
|
+
## make null values case sensitive or add an option for case sensitive
|
50
|
+
## or better allow a proc as option for checking too!!!
|
51
|
+
## sets up a parser with the given settings (all stored in the config hash)
##
##   sep     - field separator; gets checked by check_sep
##   null    - null values; note: set to nil for no null values / not available (na)
##               note: null values must get handled by the parser -
##                 only get checked for unquoted strings (and NOT for quoted strings);
##                 "higher-level" code only knows about strings and
##                 has no longer any info if string was quoted or unquoted
##   numeric - (auto-)convert all non-quoted values to float
##   nan     - note: only if numeric - set mappings for Float::NAN (not a number) values
##   space   - treat/convert char to space e.g. _-+• etc  (Man_Utd => Man Utd)
##               or use it for leading and trailing spaces without quotes
##               todo/check: only use for unquoted values? why? why not?
##   hashtag - hxl (humanitarian eXchange language) uses a hashtag row for "meta data"
##               e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
##               do NOT treat # as a comment (always use % for now)
def initialize( sep:     ',',
                null:    ['\N', 'NA'],
                numeric: false,
                nan:     nil,
                space:   nil,
                hashtag: false )
  @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?

  check_sep( sep )

  @config.update( sep:     sep,
                  null:    null,
                  numeric: numeric,
                  nan:     nan,       # not a number (NaN) e.g. Float::NAN
                  space:   space,
                  hashtag: hashtag )

  @meta = nil    ## no meta data block (use empty hash {} - why? why not?)
end
|
83
|
+
|
84
|
+
|
85
|
+
## all supported field separators (one char each)
SEPARATORS = ",;|^:"

## checks if sep is a supported field separator, that is,
##   exactly one of the SEPARATORS chars
##
##   returns nil if ok; raises ArgumentError otherwise
##
## note: parse does NOT support space or tab as separator!!
##   leading and trailing space or tab (whitespace) gets by default trimmed
##   unless quoted (or alternative space char used e.g. _-+ if configured)
def check_sep( sep )
  ## note: String#include? is a substring test and, thus, on its own
  ##   lets "" and multi-char strings (e.g. ",;") pass (and blows up with
  ##   a TypeError on nil) - insist on a single-char string first
  return if sep.is_a?( String ) && sep.size == 1 && SEPARATORS.include?( sep )

  raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
end
|
98
|
+
|
99
|
+
|
100
|
+
#########################################
|
101
|
+
## config convenience helpers
|
102
|
+
## e.g. use like Csv.default.null = '\N' etc. instead of
|
103
|
+
## Csv.default.config[:null] = '\N'
|
104
|
+
## note: sep gets checked first (see check_sep) - raises ArgumentError if unsupported
def sep=( value )
  check_sep( value )
  @config[:sep] = value
end

## all others get stored in the config hash as is (no checks)
def null=( value )
  @config[:null] = value
end

def numeric=( value )
  @config[:numeric] = value
end

def nan=( value )
  @config[:nan] = value
end

def space=( value )
  @config[:space] = value
end

def hashtag=( value )
  @config[:hashtag] = value
end
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
## parses str_or_readable record by record
##
##   with a block    - passes every record on to the block
##   without a block - returns all records collected in an array
##
##   sep defaults to the configured separator; gets checked by check_sep
##
## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
def parse( str_or_readable, sep: config[:sep], &block )
  check_sep( sep )

  ## make sure data (string or io) is wrapped into a Buffer!!!
  ##   note: allow (re)use of Buffer if managed from "outside"
  input = str_or_readable.is_a?( Buffer ) ? str_or_readable : Buffer.new( str_or_readable )

  return parse_lines( input, sep: sep, &block )   if block_given?

  [].tap do |records|
    parse_lines( input, sep: sep ) { |record| records << record }
  end
end ## method parse
|
141
|
+
|
142
|
+
|
143
|
+
|
144
|
+
|
145
|
+
private
|
146
|
+
|
147
|
+
## reads a backslash escape sequence and returns its value as a new string
##
##   backslash followed by backslash, lf, cr, sep, double or single quote
##     - returns the escaped char (the backslash gets dropped)
##   backslash followed by anything else (unknown escape sequence)
##     - returns the backslash only; the following char stays in the input
##
##   raises ParseError if the input does not start with a backslash
def parse_escape( input, sep: )
  unless input.peek == BACKSLASH
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
  end

  input.getc     ## eat-up backslash

  escapable = [BACKSLASH, LF, CR, sep, DOUBLE_QUOTE, SINGLE_QUOTE]
  escaped   = ""

  if escapable.include?( input.peek )
    logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    escaped << input.getc     ## add escaped char (e.g. lf, cr, etc.)
  else
    ## unknown escape sequence; no special handling/escaping
    logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    escaped << BACKSLASH
  end

  escaped
end
|
164
|
+
|
165
|
+
|
166
|
+
|
167
|
+
## reads a quoted value and returns it without the enclosing quotes
##
##   - backslash escapes get handed on to parse_escape
##   - a doubled-up quote stands for one quote char
##       note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
##   - hitting the end of input before the closing quote ends the value (no error)
##
##   raises ParseError if the input does not start with the opening quote
def parse_quote( input, sep:, opening_quote:, closing_quote:)
  ## note: the check is for the opening quote - report it as such
  ##   (and not as a missing closing quote)
  unless input.peek == opening_quote
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - OPENING QUOTE (#{opening_quote}) expected in parse_quote!!!!" )
  end

  input.getc     ## eat-up opening quote
  value = ""

  loop do
    while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
      value << input.getc     ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
    end

    break if input.eof?

    if input.peek == BACKSLASH
      value << parse_escape( input, sep: sep )
    else    ## assume input.peek == quote
      input.getc     ## eat-up quote
      ## doubled up quote? if not we are done
      break unless opening_quote == closing_quote && input.peek == closing_quote

      value << input.getc     ## add double quote and continue!!!!
    end
  end

  value
end
|
197
|
+
|
198
|
+
|
199
|
+
## reads a simple (unquoted) value until hitting sep, "\n", "\r" or the end of input
##   and returns it with trailing spaces and tabs stripped
##
##   note: will eat-up quotes too!!!
##   note: backslash escapes get handed on to parse_escape
def parse_field_until_sep( input, sep: )
  value = ""
  logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

  until (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
    if c == BACKSLASH
      value << parse_escape( input, sep: sep )
    else
      logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      value << input.getc     ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
    end
  end

  ## note: only strip **trailing** spaces (space and tab only)
  ##   do NOT strip newlines etc. might have been added via escape! e.g. \\\n
  ## note: MUST anchor with \z (end of string) and NOT with $ (end of line) -
  ##   with an (escaped) newline in the value $ matches before the newline, that is,
  ##   strips the spaces in front of the newline and keeps the trailing ones
  value.sub( /[ \t]+\z/, '' )
end
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
## closing quote for every "alternative" opening quote
##   note: "" and '' get handled on their own in parse_field
GUILLEMETS = { "«" => "»", "»" => "«", "‹" => "›", "›" => "‹" }.freeze

## reads the next field (value) and returns
##   a string, nil (null value) or - if numeric is configured - a float / Float::NAN
##
##   leading spaces and tabs always get skipped; stops in front of
##   sep, "\n", "\r" or at the end of input (does NOT eat-up the sep or newline)
##
##   - empty field      - "" or nil / Float::NAN if "" is configured as null / nan
##   - "..."            - quoted; incl. auto-fix for trailing data e.g. "Fredy" Mercury
##   - '...' «...» etc. - quoted; trailing spaces skipped (no auto-fix for now)
##   - everything else  - unquoted; trailing spaces stripped plus
##                          null / nan / numeric handling
##
##   note: null and numeric checks only for UNQUOTED (not quoted/escaped) values
def parse_field( input, sep: )
  value = ""

  numeric = config[:numeric]
  hashtag = config[:hashtag]

  logger.debug "parse field" if logger.debug?

  skip_spaces( input )   ## strip leading spaces

  if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)     ## empty field
    ## note: allows null = '' that is turn unquoted empty strings into null/nil
    ##   or if using numeric into NotANumber (NaN)
    if is_null?( value )
      value = nil
    elsif numeric && is_nan?( value )   ## todo: check - how to handle numeric? return nil, NaN, or "" ???
      value = Float::NAN
    end
    ## otherwise do nothing - keep value as is :-) e.g. "".
  elsif input.peek == DOUBLE_QUOTE
    logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    value << parse_quote( input, sep: sep,
                                 opening_quote: DOUBLE_QUOTE,
                                 closing_quote: DOUBLE_QUOTE )

    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
    spaces_count = skip_spaces( input )

    ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
    ##   todo/fix: add auto-fix for all quote variants!!!!
    unless (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
      ## try auto-fix
      ##   todo: report warning/issue error (if configured)!!!
      extra_value = parse_field_until_sep( input, sep: sep )
      ## "reconstruct" non-quoted value
      spaces = ' ' * spaces_count   ## todo: preserve tab (\t) - why? why not?
      ## note: minor (theoretical) issue (doubled quotes got "collapsed/escaped" to one from two in quoted value)
      ##   e.g. "hello """ extra,  (becomes)=> "hello "" extra  (one quote less/"eaten up")
      value = %Q{"#{value}"#{spaces}#{extra_value}}
    end

    logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
  elsif input.peek == SINGLE_QUOTE   ## allow single quote too (by default)
    logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    value << parse_quote( input, sep: sep,
                                 opening_quote: SINGLE_QUOTE,
                                 closing_quote: SINGLE_QUOTE )

    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
    skip_spaces( input )
    logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
  elsif (closing_quote = GUILLEMETS[ input.peek ])   ## «...», »...«, ‹...› or ›...‹
    value << parse_quote( input, sep: sep,
                                 opening_quote: input.peek,
                                 closing_quote: closing_quote )
    skip_spaces( input )
  else
    logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    ## consume simple value
    ##   until we hit "," or "\n" or "\r"
    ##    note: will eat-up quotes too!!!
    until (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
      if c == BACKSLASH
        value << parse_escape( input, sep: sep )
      ### check for end-of-line comments (e.g. # ...)
      ##   note: quick hack for now
      ##     will NOT work in hashtag (hxl) mode and for % comments
      ##     for now ALWAYS assumes # for comments
      ##     and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quoted values) for now
      ##   todo/fix: note: require leading space for comment hash (#) for now - why? why not?
      ##     require trailing space after comment hash (#) - why? why not?
      elsif !hashtag && c == COMMENT_HASH && (value.empty? || value[-1] == ' ')
        ## eat-up everything until end-of-line (eol)
        skip_until_eol( input )
      else
        logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
        value << input.getc   ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
      end
    end

    ## note: only strip **trailing** spaces (space and tab only)
    ##   do NOT strip newlines etc. might have been added via escape! e.g. \\\n
    ## note: MUST anchor with \z (end of string) and NOT with $ (end of line) -
    ##   with an (escaped) newline in the value $ matches before the newline, that is,
    ##   strips the spaces in front of the newline and keeps the trailing ones
    value = value.sub( /[ \t]+\z/, '' )

    if is_null?( value )   ## note: null check only for UNQUOTED (not quoted/escaped) values
      value = nil
    elsif numeric
      value = if is_nan?( value )
                Float::NAN
              elsif numeric.is_a?( Proc )
                ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
                numeric.call( value )
              else
                ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
                ##   default (fails silently) keep string value if cannot convert - change - why? why not?
                convert_to_float( value )
              end
    end
    ## otherwise do nothing - keep value as is :-).

    logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
  end

  value
end
|
347
|
+
|
348
|
+
|
349
|
+
|
350
|
+
## reads all fields of one record (line) and returns the values as an array
##
##   stops at the end of input or after eating-up the newline ending the record
##   if configured, every space char in a string value gets converted to a space (see config[:space])
##
##   raises ParseError if a field is followed by anything else than sep or a newline
def parse_record( input, sep: )
  space  = config[:space]
  values = []

  loop do
    field = parse_field( input, sep: sep )
    field = field.tr( space, ' ' )  if space && field.is_a?( String )

    logger.debug "value: »#{field}«" if logger.debug?
    values << field

    break if input.eof?

    if (c=input.peek; c==LF || c==CR)
      skip_newline( input )
      break
    end

    unless input.peek == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
    end

    input.getc   ## eat-up FS(,)
  end

  values
end
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
## reads a meta data block (yaml front matter) enclosed by "---" lines
##   and returns the loaded yaml (or an empty hash {} if the yaml loads as nil)
##
##   note: expects the input positioned at the opening "---" (gets skipped unchecked)
##   raises ParseError if the input ends before the closing "---"
##
## todo/check:
##   check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
def parse_meta( input )
  3.times { input.getc }   ## eat-up (add document header ---) - skip "---"

  ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
  ##   use match() or something to always match regexp
  skip_spaces( input )     # eat-up optional whitespaces in header line
  skip_newline( input )

  ## note: start buffer with yaml header line - why?
  ##   ::YAML.load("") return false !!!
  ##   ::YAML.load("---\n") returns nil  -- yes!! if we get nil return empty hash {}
  yaml = "---\n"

  at_line_start = true

  ## eat-up until we hit "---" again
  loop do
    raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )  if input.eof?

    if (c=input.peek; c==LF || c==CR)
      ## add newlines
      yaml << input.getc  while (c=input.peek; c==LF || c==CR)
      at_line_start = true
    elsif at_line_start && input.peekn(4) =~ /^---[\n\r \t]?$/    ## check if meta block end marker?
      ## todo/fix/check: allow (ignore) spaces after --- why? why not?
      3.times { input.getc }   ## eat-up - skip "---"
      skip_spaces( input )     # eat-up optional whitespaces in footer line
      skip_newline( input )
      break
    else
      yaml << input.getc
      at_line_start = false
    end
  end

  ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
  ## NOTE(review): loads yaml coming straight from the file;
  ##   if input may be untrusted consider ::YAML.safe_load - confirm with callers
  ## todo: check edge cases - always should return a hash or nil
  ##   what to do with just integer, string or array etc. ???
  meta = ::YAML.load( yaml )

  meta.nil? ? {} : meta    ## note: if nil return empty hash e.g. {}
end ## parse_meta
|
428
|
+
|
429
|
+
|
430
|
+
|
431
|
+
## skips ONE newline, that is, CR LF or LF or CR (if any)
##   note: singular (strict) version
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc                         ## eat-up
    input.getc if input.peek == LF     ## CR LF counts as one newline
  when LF
    input.getc                         ## eat-up
  end
  ## otherwise do nothing
end
|
444
|
+
|
445
|
+
|
446
|
+
|
447
|
+
## eats-up everything up to (but not including) the next LF or CR
##   or up to the end of input
def skip_until_eol( input )
  return if input.eof?

  loop do
    ch = input.peek
    break if ch == LF || ch == CR || input.eof?

    input.getc   ## eat-up all until end of line
  end
end
|
454
|
+
|
455
|
+
|
456
|
+
## eats-up all spaces (" ") and tabs (\t) up next
##   note: returns the number of chars skipped (e.g. 0,1,2,etc.)
def skip_spaces( input )
  return 0 if input.eof?

  skipped = 0
  loop do
    ch = input.peek
    break unless ch == SPACE || ch == TAB

    input.getc   ## note: always eat-up all spaces (" ") and tabs (\t)
    skipped += 1
  end
  skipped
end
|
467
|
+
|
468
|
+
|
469
|
+
|
470
|
+
|
471
|
+
|
472
|
+
|
473
|
+
## reads the input line by line and passes every record on to the block
##
##   skipped (that is, NOT passed on) are
##   - comment lines
##       in hashtag mode always starting with %; otherwise
##       note: can either use '#' or '%' but NOT both; first one "wins"
##   - blank lines
##   - directive lines (starting with @) before the first record
##   - a meta data block (---) before the first record; gets stored in meta
##       note: can either use directives (@) or frontmatter (---) block; first one "wins"
##       note: directives and meta data blocks are NOT supported in hashtag mode
##
##   note: requires block - enforce? how? why? why not?
def parse_lines( input, sep:, &block )
  ## note: reset (optional) meta data block
  @meta = nil   ## no meta data block (use empty hash {} - why? why not?)

  ## note: track number of records
  ##   used for meta block (can only start before any records e.g. if record_num == 0)
  record_num = 0

  ## note: hashtag mode is off for false AND nil (same as in parse_field) -
  ##   checking for == false only turns off directives and
  ##   meta data blocks if hashtag is set to nil
  hashtag = config[:hashtag]

  ## todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
  comment = hashtag ? COMMENT_PERCENT : nil

  has_seen_directive   = false
  has_seen_frontmatter = false   ## - rename to has_seen_dash (---) - why? why not???

  loop do
    break if input.eof?

    skipped_spaces = skip_spaces( input )

    if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      comment = input.getc   ## first comment line (determines/fixes "allowed" comment-style)
      skip_until_eol( input )
      skip_newline( input )
    elsif comment && input.peek == comment   ## (another) comment line
      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif (c=input.peek; c==LF || c==CR || input.eof?)
      logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_frontmatter && input.peek==DIRECTIVE
      ## note: "skip" directives for now
      has_seen_directive = true
      logger.debug "skip directive" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_directive && !has_seen_frontmatter &&
          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
      ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
      has_seen_frontmatter = true
      logger.debug "start meta block" if logger.debug?
      ## note: meta gets stored as object attribute (state/state/state!!)
      ##   use meta attribute to get meta data after reading first record
      @meta = parse_meta( input )   ## note: assumes a hash gets returned
      logger.debug " meta: >#{meta.inspect}<" if logger.debug?
    else
      logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

      record = parse_record( input, sep: sep )
      record_num +=1

      block.call( record )   ## yield( record )
    end
  end # loop
end # method parse_lines
|
541
|
+
|
542
|
+
|
543
|
+
|
544
|
+
|
545
|
+
## converts value to a float; note: fails silently, that is,
##   returns the value as is (unchanged) if it cannot get converted
def convert_to_float( value )
  Float( value )
rescue
  value
end
|
546
|
+
|
547
|
+
## checks if value is a not a number (nan) value as configured via config[:nan]
##   nil    - nothing set; return always false (not NaN)
##   proc   - returns whatever the proc returns for the value
##   array  - true if the value is included
##   string - true if the value is equal
##   anything else (unknown config style / setting) - always false
##     todo: issue a warning or error - why? why not?
def is_nan?( value )
  nan = @config[:nan]

  case nan
  when nil    then false
  when Proc   then nan.call( value )
  when Array  then nan.include?( value )
  when String then value == nan
  else             false
  end
end
|
562
|
+
|
563
|
+
|
564
|
+
## checks if value is a null value as configured via config[:null]
##   nil    - nothing set; return always false (not null)
##   proc   - returns whatever the proc returns for the value
##   array  - true if the value is included
##   string - true if the value is equal
##   anything else (unknown config style / setting) - always false
##     todo: issue a warning or error - why? why not?
def is_null?( value )
  null = @config[:null]

  case null
  when nil    then false
  when Proc   then null.call( value )
  when Array  then null.include?( value )
  when String then value == null
  else             false
  end
end
|
579
|
+
|
580
|
+
|
581
|
+
end # class ParserStd
|
582
|
+
end # class CsvReader
|