csvreader 1.2.4 → 1.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/{HISTORY.md → CHANGELOG.md} +3 -3
- data/Manifest.txt +1 -2
- data/README.md +682 -682
- data/Rakefile +33 -32
- data/datasets/cars11.csv +10 -10
- data/datasets/cities11.csv +12 -12
- data/datasets/customers11.csv +13 -13
- data/datasets/iris.attrib.csv +25 -25
- data/datasets/iris11.csv +163 -163
- data/datasets/lcc.attrib.csv +14 -14
- data/datasets/shakespeare.csv +9 -9
- data/lib/csvreader/base.rb +6 -2
- data/lib/csvreader/buffer.rb +0 -1
- data/lib/csvreader/builder.rb +0 -1
- data/lib/csvreader/converter.rb +0 -1
- data/lib/csvreader/parser.rb +32 -33
- data/lib/csvreader/parser_fixed.rb +105 -106
- data/lib/csvreader/parser_json.rb +23 -24
- data/lib/csvreader/parser_std.rb +582 -583
- data/lib/csvreader/parser_strict.rb +290 -291
- data/lib/csvreader/parser_tab.rb +22 -23
- data/lib/csvreader/parser_table.rb +122 -123
- data/lib/csvreader/parser_yaml.rb +23 -24
- data/lib/csvreader/reader.rb +2 -3
- data/lib/csvreader/reader_hash.rb +1 -2
- data/lib/csvreader/version.rb +30 -32
- data/lib/csvreader.rb +0 -1
- data/test/test_parser_formats.rb +66 -66
- data/test/test_parser_java.rb +208 -208
- metadata +18 -15
- data/LICENSE.md +0 -116
data/lib/csvreader/parser_std.rb
CHANGED
@@ -1,583 +1,582 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
#
|
29
|
-
#
|
30
|
-
#
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
l =
|
35
|
-
l
|
36
|
-
|
37
|
-
end
|
38
|
-
def
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
attr_reader :
|
45
|
-
|
46
|
-
|
47
|
-
##
|
48
|
-
##
|
49
|
-
##
|
50
|
-
##
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
##
|
65
|
-
##
|
66
|
-
##
|
67
|
-
@config[:
|
68
|
-
@config[:
|
69
|
-
|
70
|
-
|
71
|
-
##
|
72
|
-
##
|
73
|
-
##
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
##
|
78
|
-
##
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
##
|
90
|
-
##
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
##
|
103
|
-
##
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
def
|
108
|
-
def
|
109
|
-
def
|
110
|
-
def
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
## note:
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
input.
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
173
|
-
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
input.
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
##
|
204
|
-
##
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
##
|
215
|
-
|
216
|
-
value
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
##
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
##
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
##
|
259
|
-
|
260
|
-
|
261
|
-
##
|
262
|
-
|
263
|
-
##
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
##
|
301
|
-
##
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
##
|
308
|
-
##
|
309
|
-
##
|
310
|
-
##
|
311
|
-
##
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
##
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
|
348
|
-
|
349
|
-
|
350
|
-
|
351
|
-
|
352
|
-
|
353
|
-
|
354
|
-
|
355
|
-
|
356
|
-
|
357
|
-
value =
|
358
|
-
|
359
|
-
|
360
|
-
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
|
372
|
-
|
373
|
-
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
##
|
382
|
-
|
383
|
-
|
384
|
-
input.getc
|
385
|
-
input.getc
|
386
|
-
|
387
|
-
|
388
|
-
##
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
## ::YAML.load("")
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
408
|
-
|
409
|
-
##
|
410
|
-
input.getc
|
411
|
-
input.getc
|
412
|
-
input
|
413
|
-
|
414
|
-
|
415
|
-
|
416
|
-
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
##
|
424
|
-
|
425
|
-
|
426
|
-
data
|
427
|
-
|
428
|
-
|
429
|
-
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
|
436
|
-
|
437
|
-
input.getc
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
446
|
-
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
|
455
|
-
|
456
|
-
|
457
|
-
|
458
|
-
|
459
|
-
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
|
465
|
-
|
466
|
-
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
##
|
476
|
-
|
477
|
-
|
478
|
-
##
|
479
|
-
|
480
|
-
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
485
|
-
|
486
|
-
|
487
|
-
comment
|
488
|
-
|
489
|
-
|
490
|
-
|
491
|
-
|
492
|
-
|
493
|
-
|
494
|
-
|
495
|
-
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
|
514
|
-
|
515
|
-
|
516
|
-
|
517
|
-
|
518
|
-
|
519
|
-
|
520
|
-
|
521
|
-
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
|
526
|
-
|
527
|
-
##
|
528
|
-
|
529
|
-
|
530
|
-
|
531
|
-
|
532
|
-
|
533
|
-
|
534
|
-
|
535
|
-
|
536
|
-
|
537
|
-
|
538
|
-
|
539
|
-
|
540
|
-
|
541
|
-
|
542
|
-
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
558
|
-
|
559
|
-
##
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
|
574
|
-
|
575
|
-
|
576
|
-
##
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
end # class
|
583
|
-
end # class CsvReader
|
1
|
+
|
2
|
+
class CsvReader
|
3
|
+
|
4
|
+
|
5
|
+
|
6
|
+
|
7
|
+
|
8
|
+
class ParserStd
|
9
|
+
|
10
|
+
|
11
|
+
## char constants
##   note: all frozen - the constants are shared by every parser instance
##   and only ever get compared against or appended to other strings
##   (never mutated in place)
DOUBLE_QUOTE    = "\"".freeze
SINGLE_QUOTE    = "'".freeze
BACKSLASH       = "\\".freeze    ## use BACKSLASH_ESCAPE ??
COMMENT_HASH    = "#".freeze     ## use COMMENT1 or COMMENT_HASH or HASH or ??
COMMENT_PERCENT = "%".freeze     ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
DIRECTIVE       = "@".freeze     ## use a different name e.g. AT or ??
SPACE           = " ".freeze     ## \s == ASCII 32 (dec)            = (Space)
TAB             = "\t".freeze    ## \t == ASCII 0x09 (hex)          = HT (Tab/horizontal tab)
LF              = "\n".freeze    ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
CR              = "\r".freeze    ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
|
22
|
+
|
23
|
+
|
24
|
+
|
25
|
+
###################################
|
26
|
+
## add simple logger with debug flag/switch
|
27
|
+
#
|
28
|
+
# use Parser.debug = true # to turn on
|
29
|
+
#
|
30
|
+
# todo/fix: use logutils instead of std logger - why? why not?
|
31
|
+
|
32
|
+
## build the logger used by the parser class;
##   writes to STDOUT with the level raised to :info
##   (a freshly created Logger starts at level 0, that is, debug)
def self.build_logger
  Logger.new( STDOUT ).tap { |log| log.level = :info }
end
|
37
|
+
## class-level logger; built via build_logger on first use and then reused
def self.logger
  @@logger ||= build_logger
end

## instance-level shortcut - hands back the class-level logger
def logger
  self.class.logger
end
|
39
|
+
|
40
|
+
|
41
|
+
|
42
|
+
|
43
|
+
attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
|
44
|
+
attr_reader :meta
|
45
|
+
|
46
|
+
##
|
47
|
+
## todo/check:
|
48
|
+
## null values - include NA - why? why not?
|
49
|
+
## make null values case sensitive or add an option for case sensitive
|
50
|
+
## or better allow a proc as option for checking too!!!
|
51
|
+
## set up a parser with its config (dialect) options
##
##   sep     - field separator; validated by check_sep (raises ArgumentError if rejected)
##   null    - value(s) that turn an unquoted field into nil;
##               set to nil for no null values / not available (na)
##   numeric - (auto-)convert all non-quoted values to float
##   nan     - only used if numeric - mappings for Float::NAN (not a number) values
##   space   - char(s) to treat/convert to a space e.g. _-+• etc.
##               Man_Utd => Man Utd
##               or use it for leading and trailing spaces without quotes
##               todo/check: only use for unquoted values? why? why not?
##   hashtag - hxl (humanitarian eXchange language) uses a hashtag row for "meta data"
##               e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
##               if set do NOT treat # as a comment (always use % for now)
def initialize( sep:     ',',
                null:    ['\N', 'NA'],
                numeric: false,
                nan:     nil,
                space:   nil,
                hashtag: false )
  @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?

  check_sep( sep )

  ## note: null values must get handled by the parser;
  ##   they only get checked for unquoted strings (and NOT for quoted strings) -
  ##   "higher-level" code only knows about strings and has no longer
  ##   any info if a string was quoted or unquoted
  @config.update( sep:     sep,
                  null:    null,
                  numeric: numeric,
                  nan:     nan,      # not a number (NaN) e.g. Float::NAN
                  space:   space,
                  hashtag: hashtag )

  @meta = nil   ## no meta data block (use empty hash {} - why? why not?)
end
|
83
|
+
|
84
|
+
|
85
|
+
## all supported field separators (one char each)
SEPARATORS = ",;|^:".freeze

##
## validate a field separator
##
##   sep - must be a String of exactly one char listed in SEPARATORS
##
## returns nil if the separator is ok;
## raises ArgumentError for everything else (incl. nil, "" and multi-char strings)
##
## note: parse does NOT support space or tab as separator!!
##   leading and trailing space or tab (whitespace) gets by default trimmed
##   unless quoted (or alternative space char used e.g. _-+ if configured)
def check_sep( sep )
  ## note: String#include? is a substring test - on its own it would also
  ##   accept "" and multi-char values such as ",;" (that can never match
  ##   the single char the parser compares against)
  return if sep.is_a?( String ) && sep.size == 1 && SEPARATORS.include?( sep )

  raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
end
|
98
|
+
|
99
|
+
|
100
|
+
#########################################
|
101
|
+
## config convenience helpers
|
102
|
+
## e.g. use like Csv.default.null = '\N' etc. instead of
|
103
|
+
## Csv.default.config[:null] = '\N'
|
104
|
+
## change the field separator; gets validated (see check_sep) before it is stored
def sep=( value )
  check_sep( value )
  @config[:sep] = value
end

## change the null value setting
def null=( value )
  @config[:null] = value
end

## change the numeric (auto-convert) setting
def numeric=( value )
  @config[:numeric] = value
end

## change the not a number (NaN) setting
def nan=( value )
  @config[:nan] = value
end

## change the space (replacement char) setting
def space=( value )
  @config[:space] = value
end

## change the hashtag setting
def hashtag=( value )
  @config[:hashtag] = value
end
|
111
|
+
|
112
|
+
|
113
|
+
|
114
|
+
|
115
|
+
## parse all records
##
##   str_or_readable - a Buffer (gets used as is - allows (re)use of a Buffer
##                       managed from "outside") or anything else
##                       (gets wrapped into a new Buffer)
##   sep             - field separator; defaults to the configured one
##   block           - optional; gets called with every record
##
## returns whatever parse_lines returns if a block is passed in;
##   otherwise an array holding all records
##
## raises ArgumentError (via check_sep) if sep gets rejected
def parse( str_or_readable, sep: config[:sep], &block )
  check_sep( sep )

  ## make sure data (string or io) is wrapped into a Buffer!!!
  input = str_or_readable.is_a?( Buffer ) ? str_or_readable : Buffer.new( str_or_readable )

  return parse_lines( input, sep: sep, &block )  if block_given?

  records = []
  parse_lines( input, sep: sep ) { |record| records << record }
  records
end ## method parse
|
141
|
+
|
142
|
+
|
143
|
+
|
144
|
+
|
145
|
+
private
|
146
|
+
|
147
|
+
## parse a backslash escape sequence; input MUST be positioned on the backslash
##
## returns a new string holding
##   - the escaped char (backslash gets dropped) for \\, \<lf>, \<cr>, \<sep>, \" and \'
##   - the backslash only for any other (unknown) sequence;
##       the char following the backslash stays in the input
##
## raises ParseError if the next char is not a backslash
def parse_escape( input, sep: )
  unless input.peek == BACKSLASH
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
  end

  input.getc    ## eat-up backslash

  escaped   = ""
  nxt       = input.peek
  known_seq = nxt == BACKSLASH    || nxt == LF           ||
              nxt == CR           || nxt == sep          ||
              nxt == DOUBLE_QUOTE || nxt == SINGLE_QUOTE

  if known_seq
    logger.debug "  add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    escaped << input.getc     ## add escaped char (e.g. lf, cr, etc.)
  else
    ## unknown escape sequence; no special handling/escaping
    logger.debug "  add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    escaped << BACKSLASH
  end

  escaped
end
|
164
|
+
|
165
|
+
|
166
|
+
|
167
|
+
## parse a quoted value; input MUST be positioned on the opening quote
##
##   opening_quote / closing_quote - quote chars e.g. "" or '' or «» etc.
##
## returns the value without the enclosing quotes;
##   backslash escapes get handled via parse_escape and
##   a doubled-up quote gets collapsed into one - but only if opening and
##   closing quote are the same (works for "" or '' and NOT for «», ‹›, ..)
##
## note: reading stops at the end of input too (no error for a missing closing quote)
##
## raises ParseError if the next char is not the opening quote
def parse_quote( input, sep:, opening_quote:, closing_quote:)
  unless input.peek == opening_quote
    raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
  end

  input.getc    ## eat-up opening quote

  can_double_up = opening_quote == closing_quote
  text = ""

  loop do
    ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
    until (ch=input.peek; ch==closing_quote || ch==BACKSLASH || input.eof?)
      text << input.getc
    end

    break if input.eof?

    if input.peek == BACKSLASH
      text << parse_escape( input, sep: sep )
      next
    end

    input.getc    ## eat-up quote (assume input.peek == quote)

    ## doubled up quote? if not we are done
    break unless can_double_up && input.peek == closing_quote

    text << input.getc    ## add quote and continue!!!!
  end

  text
end
|
197
|
+
|
198
|
+
|
199
|
+
## consume a simple (unquoted) value until hitting the separator,
##   a newline ("\n" or "\r") or the end of input
##
##   input - char source; only #peek, #getc and #eof? get used here
##   sep   - field separator char
##
## returns the value as a string with trailing spaces and tabs stripped;
##   backslash escapes get handled via parse_escape
##
## note: will eat-up quotes too!!!
## note: the terminating separator or newline is NOT consumed
def parse_field_until_sep( input, sep: )
  value = ""
  logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

  while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
    if input.peek == BACKSLASH
      value << parse_escape( input, sep: sep )
    else
      logger.debug "  add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      value << input.getc   ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
    end
  end

  ##  note: only strip **trailing** spaces (space and tab only)
  ##    do NOT strip newlines etc. might have been added via escape! e.g. \\\n
  ##  note: MUST anchor with \z (end of string) and NOT with $ (end of line);
  ##    with $ the spaces in front of an escaped newline inside the value
  ##    would get removed instead of the trailing ones
  value.sub( /[ \t]+\z/, '' )
end
|
218
|
+
|
219
|
+
|
220
|
+
|
221
|
+
## parse a single field (value) starting at the current input position
##
##   input - char source; only #peek, #getc and #eof? get used here
##   sep   - field separator char
##
## returns
##   - for an empty field: nil if "" is a null value,
##       Float::NAN if numeric is on and "" is a nan value - otherwise ""
##   - for a quoted field ("", '', «», »«, ‹›, ›‹): always a string
##       (no null / nan / numeric handling)
##   - for an unquoted field: nil for a null value; if numeric is on
##       Float::NAN for a nan value or else the converted value
##       (via the numeric proc or convert_to_float); otherwise the string
##
## note: leading spaces and tabs get skipped; the terminating separator or
##   newline is NOT consumed
def parse_field( input, sep: )
  value = ""

  numeric = config[:numeric]
  hashtag = config[:hashtag]

  logger.debug "parse field" if logger.debug?

  skip_spaces( input )   ## strip leading spaces

  if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
    ## note: allows null = '' that is turn unquoted empty strings into null/nil
    ##   or if using numeric into NotANumber (NaN)
    if is_null?( value )
      value = nil
    elsif numeric && is_nan?( value )   ## todo: check - how to handle numeric? return nil, NaN, or "" ???
      value = Float::NAN
    else
      # do nothing - keep value as is :-) e.g. "".
    end
  elsif input.peek == DOUBLE_QUOTE
    logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    value << parse_quote( input, sep: sep,
                          opening_quote: DOUBLE_QUOTE,
                          closing_quote: DOUBLE_QUOTE )

    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
    spaces_count = skip_spaces( input )

    ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
    ##  todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
    if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
      ## everything ok (that is, regular quoted value)!!!
    else
      ## try auto-fix
      ##   todo: report warning/issue error (if configured)!!!
      extra_value = parse_field_until_sep( input, sep: sep )
      ## "reconstruct" non-quoted value
      spaces = ' ' * spaces_count   ## todo: preserve tab (\t) - why? why not?
      ## note: minor (theoretical) issue (doubled quotes got "collapsed/escaped" to one from two in quoted value)
      ##   e.g. "hello """ extra,  (becomes)=> "hello "" extra (one quote less/"eaten up")
      value = %Q{"#{value}"#{spaces}#{extra_value}}
    end

    logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
  elsif input.peek == SINGLE_QUOTE   ## allow single quote too (by default)
    logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    value << parse_quote( input, sep: sep,
                          opening_quote: SINGLE_QUOTE,
                          closing_quote: SINGLE_QUOTE )

    ## note: always eat-up all trailing spaces (" ") and tabs (\t)
    skip_spaces( input )
    logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
  elsif input.peek == "«"
    value << parse_quote( input, sep: sep,
                          opening_quote: "«",
                          closing_quote: "»" )
    skip_spaces( input )
  elsif input.peek == "»"
    value << parse_quote( input, sep: sep,
                          opening_quote: "»",
                          closing_quote: "«" )
    skip_spaces( input )
  elsif input.peek == "‹"
    value << parse_quote( input, sep: sep,
                          opening_quote: "‹",
                          closing_quote: "›" )
    skip_spaces( input )
  elsif input.peek == "›"
    value << parse_quote( input, sep: sep,
                          opening_quote: "›",
                          closing_quote: "‹" )
    skip_spaces( input )
  else
    logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
    ## consume simple value
    ##   until we hit "," or "\n" or "\r"
    ##    note: will eat-up quotes too!!!
    while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
      if input.peek == BACKSLASH
        value << parse_escape( input, sep: sep )
      ### check for end-of-line comments (e.g. # ...)
      ##   note: quick hack for now
      ##     will NOT work in hashtag (hxl) mode and for % comments
      ##     for now ALWAYS assumes # for comments
      ##     and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quoted values) for now
      ##   todo/fix: note: require leading space for comment hash (#) for now - why? why not?
      ##               require trailing space after comment hash (#) - why? why not?
      elsif !hashtag && input.peek == COMMENT_HASH &&
            (value.empty? || value[-1] == ' ')
        ## eat-up everything until end-of-line (eol)
        skip_until_eol( input )
      else
        logger.debug "  add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
        value << input.getc   ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
      end
    end

    ##  note: only strip **trailing** spaces (space and tab only)
    ##    do NOT strip newlines etc. might have been added via escape! e.g. \\\n
    ##  note: MUST anchor with \z (end of string) and NOT with $ (end of line);
    ##    with $ the spaces in front of an escaped newline inside the value
    ##    would get removed instead of the trailing ones
    value = value.sub( /[ \t]+\z/, '' )

    if is_null?( value )   ## note: null check only for UNQUOTED (not quoted/escaped) values
      value = nil
    elsif numeric
      if is_nan?( value )
        value = Float::NAN
      else
        ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
        if numeric.is_a?( Proc )
          value = numeric.call( value )   ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
        else
          value = convert_to_float( value )  # default (fails silently) keep string value if cannot convert - change - why? why not?
        end
      end
    else
      # do nothing - keep value as is :-).
    end

    logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
  end

  value
end
|
347
|
+
|
348
|
+
|
349
|
+
|
350
|
+
## parse one record (line) - that is, all fields up to the next newline
##   or the end of input
##
## returns an array with all values; if the space option is configured
##   its chars get replaced with a space in all string values
##
## note: the newline ending the record gets consumed (via skip_newline)
##
## raises ParseError if a field is followed by anything but
##   the separator, a newline or the end of input
def parse_record( input, sep: )
  space_chars = config[:space]
  fields      = []

  loop do
    field = parse_field( input, sep: sep )
    field = field.tr( space_chars, ' ' )  if space_chars && field.is_a?( String )

    logger.debug "value: »#{field}«"  if logger.debug?
    fields << field

    break if input.eof?

    nxt = input.peek
    if nxt == LF || nxt == CR
      skip_newline( input )
      break
    end

    unless input.peek == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
    end

    input.getc   ## eat-up FS(,)
  end

  fields
end
|
376
|
+
|
377
|
+
|
378
|
+
|
379
|
+
## parse a meta data block (front matter) enclosed by "---" lines;
##   input MUST be positioned on the opening "---"
##
## returns whatever ::YAML.load returns for the block - or an empty hash {} for nil
##   todo: check edge cases - always should return a hash or nil
##     what to do with just integer, string or array etc. ???
##
## raises ParseError if the input ends before the closing "---"
##
## NOTE(review): ::YAML.load gets used on the raw block text; depending on the
##   yaml library version this may not be restricted to plain data types -
##   confirm and consider a safe_load variant if the input is untrusted
def parse_meta( input )
  ## todo/check:
  ##   check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?

  3.times { input.getc }   ## eat-up (add document header ---) - skip "---"

  ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
  ##    use match() or something to always match regexp
  skip_spaces( input )   # eat-up optional whitespaces in header line
  skip_newline( input )

  buf = "---\n"    ## note: start buffer with yaml header line - why?
                   ##   ::YAML.load("") return false !!!
                   ##   ::YAML.load("---\n") returns nil -- yes!! if we get nil return empty hash {}

  at_line_start = true

  ## eat-up until we hit "---" again
  loop do
    if input.eof?
      raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
    end

    ch = input.peek
    if ch == LF || ch == CR
      ## add (all) newlines
      while (ch=input.peek; ch==LF || ch==CR)
        buf << input.getc
      end
      at_line_start = true
    elsif at_line_start && input.peekn(4) =~ /^---[\n\r \t]?$/   ## check if meta block end marker?
      ## todo/fix/check: allow (ignore) spaces after --- why? why not?
      3.times { input.getc }   ## skip "---"
      skip_spaces( input )     # eat-up optional whitespaces in footer line
      skip_newline( input )
      break
    else
      buf << input.getc
      at_line_start = false
    end
  end

  data = ::YAML.load( buf )   ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)

  ## note: if nil return empty hash e.g. {}
  data.nil? ? {} : data
end  ## parse_meta
|
428
|
+
|
429
|
+
|
430
|
+
|
431
|
+
## skip a single newline if present  - note: singular (strict) version
##   only skips CR LF or LF or CR (and nothing else)
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc                      ## eat-up CR
    input.getc if input.peek == LF  ##   and the LF of a CR LF pair
  when LF
    input.getc                      ## eat-up LF
  end
end
|
444
|
+
|
445
|
+
|
446
|
+
|
447
|
+
## eat-up everything until the end of line (eol);
##   stops in front of the newline ("\n" or "\r") or at the end of input
def skip_until_eol( input )
  return if input.eof?

  until (ch=input.peek; ch==LF || ch==CR || input.eof?)
    input.getc
  end
end
|
454
|
+
|
455
|
+
|
456
|
+
## eat-up all spaces (" ") and tabs (\t) at the current position
##
## returns the number of chars skipped (e.g. 0,1,2,etc.)
def skip_spaces( input )
  return 0 if input.eof?

  skipped = 0
  loop do
    ch = input.peek
    break unless ch == SPACE || ch == TAB

    input.getc
    skipped += 1
  end
  skipped
end
|
467
|
+
|
468
|
+
|
469
|
+
|
470
|
+
|
471
|
+
|
472
|
+
|
473
|
+
## parse the input line by line and pass every record to the block
##
##   input - char source; #peek, #peekn, #getc and #eof? get used here
##   sep   - field separator char
##   block - gets called with every record (array of values)
##             note: requires block - enforce? how? why? why not?
##
## skipped (that is, not passed on as records):
##   - comment lines - with hashtag on always starting with %;
##       otherwise starting with # or % (can either use '#' or '%' but NOT both;
##       the first comment line "wins")
##   - blank lines
##   - directive lines (@) - only before the first record
##   - a meta data block (---) - only before the first record and
##       without leading spaces; gets stored in @meta (see parse_meta)
##   note: can either use directives (@) or a meta data block (---);
##     the first one "wins"; both are off if hashtag is on
##
## note: @meta gets reset to nil on every call
def parse_lines( input, sep:, &block )
  ## note: reset (optional) meta data block
  @meta = nil   ## no meta data block (use empty hash {} - why? why not?)

  ## note: track number of records
  ##   used for meta block (can only start before any records e.g. if record_num == 0)
  record_num = 0

  hashtag = config[:hashtag]

  ## todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
  comment = hashtag ? COMMENT_PERCENT : nil

  has_seen_directive   = false
  has_seen_frontmatter = false   ## - rename to has_seen_dash (---) - why? why not???

  loop do
    break if input.eof?

    skipped_spaces = skip_spaces( input )

    if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      comment = input.getc   ## first comment line (determines/fixes "allowed" comment-style)
      skip_until_eol( input )
      skip_newline( input )
    elsif comment && input.peek == comment   ## (another) comment line
      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif (c=input.peek; c==LF || c==CR || input.eof?)
      logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_newline( input )
    ## note: check for hashtag off with !hashtag (and NOT hashtag == false);
    ##   a hashtag setting of nil counts as off for the comment handling
    ##   above and in parse_field too
    elsif record_num == 0 && !hashtag && !has_seen_frontmatter && input.peek == DIRECTIVE
      ## note: "skip" directives for now
      has_seen_directive = true
      logger.debug "skip directive" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_directive && !has_seen_frontmatter &&
          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
      ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
      has_seen_frontmatter = true
      logger.debug "start meta block" if logger.debug?
      ## note: meta gets stored as object attribute (state/state/state!!)
      ##   use meta attribute to get meta data after reading first record
      @meta = parse_meta( input )   ## note: assumes a hash gets returned
      logger.debug "  meta: >#{meta.inspect}<" if logger.debug?
    else
      logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

      record = parse_record( input, sep: sep )
      record_num += 1

      block.call( record )   ## yield( record )
    end
  end  # loop
end # method parse_lines
|
541
|
+
|
542
|
+
|
543
|
+
|
544
|
+
|
545
|
+
## convert value to a float;
##   fails silently, that is, hands back the value as is if it cannot be converted
def convert_to_float( value )
  Float( value )
rescue StandardError
  value
end
|
546
|
+
|
547
|
+
## check if value is a not a number (NaN) value as configured in config[:nan]
##
##   nil    - nothing set; always false (not NaN)
##   Proc   - result of calling the proc with the value
##   Array  - true if the array includes the value
##   String - true if equal to the value
##   anything else (unknown config style / setting) - always false
##     todo: issue a warning or error - why? why not?
def is_nan?( value )
  nan = @config[:nan]

  case nan
  when nil    then false
  when Proc   then nan.call( value )
  when Array  then nan.include?( value )
  when String then value == nan
  else             false
  end
end
|
562
|
+
|
563
|
+
|
564
|
+
## check if value is a null value as configured in config[:null]
##
##   nil    - nothing set; always false (not null)
##   Proc   - result of calling the proc with the value
##   Array  - true if the array includes the value
##   String - true if equal to the value
##   anything else (unknown config style / setting) - always false
##     todo: issue a warning or error - why? why not?
def is_null?( value )
  null = @config[:null]

  case null
  when nil    then false
  when Proc   then null.call( value )
  when Array  then null.include?( value )
  when String then value == null
  else             false
  end
end
|
579
|
+
|
580
|
+
|
581
|
+
end # class ParserStd
|
582
|
+
end # class CsvReader
|