csvreader 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,583 +1,582 @@
1
- # encoding: utf-8
2
-
3
- class CsvReader
4
-
5
-
6
-
7
-
8
-
9
- class ParserStd
10
-
11
-
12
- ## char constants
13
- DOUBLE_QUOTE = "\""
14
- SINGLE_QUOTE = "'"
15
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
16
- COMMENT_HASH = "#" ## use COMMENT1 or COMMENT_HASH or HASH or ??
17
- COMMENT_PERCENT = "%" ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
18
- DIRECTIVE = "@" ## use a different name e.g. AT or ??
19
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
20
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
21
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
22
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
23
-
24
-
25
-
26
- ###################################
27
- ## add simple logger with debug flag/switch
28
- #
29
- # use Parser.debug = true # to turn on
30
- #
31
- # todo/fix: use logutils instead of std logger - why? why not?
32
-
33
- def self.build_logger()
34
- l = Logger.new( STDOUT )
35
- l.level = :info ## set to :info on start; note: is 0 (debug) by default
36
- l
37
- end
38
- def self.logger() @@logger ||= build_logger; end
39
- def logger() self.class.logger; end
40
-
41
-
42
-
43
-
44
- attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
45
- attr_reader :meta
46
-
47
- ##
48
- ## todo/check:
49
- ## null values - include NA - why? why not?
50
- ## make null values case sensitive or add an option for case sensitive
51
- ## or better allow a proc as option for checking too!!!
52
- def initialize( sep: ',',
53
- null: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
54
- numeric: false, ## (auto-)convert all non-quoted values to float
55
- nan: nil, ## note: only if numeric - set mappings for Float::NAN (not a number) values
56
- space: nil,
57
- hashtag: false
58
- )
59
- @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
60
-
61
- check_sep( sep )
62
- @config[:sep] = sep
63
-
64
- ## note: null values must get handled by parser
65
- ## only get checked for unquoted strings (and NOT for quoted strings)
66
- ## "higher-level" code only knows about strings and has no longer any info if string was quoted or unquoted
67
- @config[:null] = null ## null values
68
- @config[:numeric] = numeric
69
- @config[:nan] = nan # not a number (NaN) e.g. Float::NAN
70
-
71
- ## e.g. treat/convert char to space e.g. _-+• etc
72
- ## Man_Utd => Man Utd
73
- ## or use it for leading and trailing spaces without quotes
74
- ## todo/check: only use for unquoted values? why? why not?
75
- @config[:space] = space
76
-
77
- ## hxl - humanitarian eXchange language uses a hashtag row for "meta data"
78
- ## e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
79
- ## do NOT treat # as a comment (always use % for now)
80
- @config[:hashtag] = hashtag
81
-
82
- @meta = nil ## no meta data block (use empty hash {} - why? why not?)
83
- end
84
-
85
-
86
- SEPARATORS = ",;|^:"
87
-
88
- def check_sep( sep )
89
- ## note: parse does NOT support space or tab as separator!!
90
- ## leading and trailing space or tab (whitespace) gets by default trimmed
91
- ## unless quoted (or alternative space char used e.g. _-+ if configured)
92
-
93
- if SEPARATORS.include?( sep )
94
- ## everything ok
95
- else
96
- raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
97
- end
98
- end
99
-
100
-
101
- #########################################
102
- ## config convenience helpers
103
- ## e.g. use like Csv.defaultl.null = '\N' etc. instead of
104
- ## Csv.default.config[:null] = '\N'
105
- def sep=( value ) check_sep( value ); @config[:sep]=value; end
106
-
107
- def null=( value ) @config[:null]=value; end
108
- def numeric=( value ) @config[:numeric]=value; end
109
- def nan=( value ) @config[:nan]=value; end
110
- def space=( value ) @config[:space]=value; end
111
- def hashtag=( value ) @config[:hashtag]=value; end
112
-
113
-
114
-
115
-
116
- def parse( str_or_readable, sep: config[:sep], &block )
117
-
118
- check_sep( sep )
119
-
120
- ## note: data - will wrap either a String or IO object passed in data
121
- ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
122
-
123
- ## make sure data (string or io) is a wrapped into Buffer!!!!!!
124
- if str_or_readable.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
125
- input = str_or_readable
126
- else
127
- input = Buffer.new( str_or_readable )
128
- end
129
-
130
- if block_given?
131
- parse_lines( input, sep: sep, &block )
132
- else
133
- records = []
134
-
135
- parse_lines( input, sep: sep ) do |record|
136
- records << record
137
- end
138
-
139
- records
140
- end
141
- end ## method parse
142
-
143
-
144
-
145
-
146
- private
147
-
148
- def parse_escape( input, sep: )
149
- value = ""
150
- if input.peek == BACKSLASH
151
- input.getc ## eat-up backslash
152
- if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
153
- logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
154
- value << input.getc ## add escaped char (e.g. lf, cr, etc.)
155
- else
156
- ## unknown escape sequence; no special handling/escaping
157
- logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
158
- value << BACKSLASH
159
- end
160
- else
161
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
162
- end
163
- value
164
- end
165
-
166
-
167
-
168
- def parse_quote( input, sep:, opening_quote:, closing_quote:)
169
- value = ""
170
- if input.peek == opening_quote
171
- input.getc ## eat-up opening quote
172
-
173
- loop do
174
- while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
175
- value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
176
- end
177
-
178
- if input.eof?
179
- break
180
- elsif input.peek == BACKSLASH
181
- value << parse_escape( input, sep: sep )
182
- else ## assume input.peek == quote
183
- input.getc ## eat-up quote
184
- if opening_quote == closing_quote && input.peek == closing_quote
185
- ## doubled up quote?
186
- # note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
187
- value << input.getc ## add doube quote and continue!!!!
188
- else
189
- break
190
- end
191
- end
192
- end
193
- else
194
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
195
- end
196
- value
197
- end
198
-
199
-
200
- def parse_field_until_sep( input, sep: )
201
- value = ""
202
- logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
203
- ## consume simple value
204
- ## until we hit "," or "\n" or "\r"
205
- ## note: will eat-up quotes too!!!
206
- while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
207
- if input.peek == BACKSLASH
208
- value << parse_escape( input, sep: sep )
209
- else
210
- logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
211
- value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
212
- end
213
- end
214
- ## note: only strip **trailing** spaces (space and tab only)
215
- ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
216
- value = value.sub( /[ \t]+$/, '' )
217
- value
218
- end
219
-
220
-
221
-
222
- def parse_field( input, sep: )
223
- value = ""
224
-
225
- numeric = config[:numeric]
226
- hashtag = config[:hashtag]
227
-
228
-
229
- logger.debug "parse field" if logger.debug?
230
-
231
- skip_spaces( input ) ## strip leading spaces
232
-
233
-
234
- if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
235
- ## note: allows null = '' that is turn unquoted empty strings into null/nil
236
- ## or if using numeric into NotANumber (NaN)
237
- if is_null?( value )
238
- value = nil
239
- elsif numeric && is_nan?( value ) ## todo: check - how to handle numeric? return nil, NaN, or "" ???
240
- value = Float::NAN
241
- else
242
- # do nothing - keep value as is :-) e.g. "".
243
- end
244
- elsif input.peek == DOUBLE_QUOTE
245
- logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
246
- value << parse_quote( input, sep: sep,
247
- opening_quote: DOUBLE_QUOTE,
248
- closing_quote: DOUBLE_QUOTE )
249
-
250
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
251
- spaces_count = skip_spaces( input )
252
-
253
- ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
254
- ## todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
255
- if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
256
- ## everything ok (that is, regular quoted value)!!!
257
- else
258
- ## try auto-fix
259
- ## todo: report warning/issue error (if configured)!!!
260
- extra_value = parse_field_until_sep( input, sep: sep )
261
- ## "reconstruct" non-quoted value
262
- spaces = ' ' * spaces_count ## todo: preserve tab (\t) - why? why not?
263
- ## note: minor (theoratical) issue (doubled quoted got "collapsed/escaped" to one from two in quoted value)
264
- ## e.g. "hello """ extra, (becomes)=> "hello "" extra (one quote less/"eaten up")
265
- value = %Q{"#{value}"#{spaces}#{extra_value}}
266
- end
267
-
268
- logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
269
- elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
270
- logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
271
- value << parse_quote( input, sep: sep,
272
- opening_quote: SINGLE_QUOTE,
273
- closing_quote: SINGLE_QUOTE )
274
-
275
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
276
- skip_spaces( input )
277
- logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
278
- elsif input.peek == "«"
279
- value << parse_quote( input, sep: sep,
280
- opening_quote: "«",
281
- closing_quote: "»" )
282
- skip_spaces( input )
283
- elsif input.peek == "»"
284
- value << parse_quote( input, sep: sep,
285
- opening_quote: "»",
286
- closing_quote: "«" )
287
- skip_spaces( input )
288
- elsif input.peek == "‹"
289
- value << parse_quote( input, sep: sep,
290
- opening_quote: "‹",
291
- closing_quote: "›" )
292
- skip_spaces( input )
293
- elsif input.peek == "›"
294
- value << parse_quote( input, sep: sep,
295
- opening_quote: "›",
296
- closing_quote: "‹" )
297
- skip_spaces( input )
298
- else
299
- logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
300
- ## consume simple value
301
- ## until we hit "," or "\n" or "\r"
302
- ## note: will eat-up quotes too!!!
303
- while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
304
- if input.peek == BACKSLASH
305
- value << parse_escape( input, sep: sep )
306
- ### check for end-of-line comments (e.g. # ...)
307
- ## note: quick hack for now
308
- ## will NOT work in hashtag (hxl) mode and for % comments
309
- ## for now ALWAYS assumes # for comments
310
- ## and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quotes values) for now
311
- ## todo/fix: note: require leading space for comment hash (#) for now- why? why not?
312
- ## require trailing space after comment hash (#) - why? why not?
313
- elsif (hashtag == false || hashtag.nil?) && input.peek == COMMENT_HASH &&
314
- (value.size == 0 || (value.size > 0 && value[-1] == ' '))
315
- ## eat-up everything until end-of-line (eol)
316
- skip_until_eol( input )
317
- else
318
- logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
319
- value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
320
- end
321
- end
322
- ## note: only strip **trailing** spaces (space and tab only)
323
- ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
324
- value = value.sub( /[ \t]+$/, '' )
325
-
326
- if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
327
- value = nil
328
- elsif numeric
329
- if is_nan?( value )
330
- value = Float::NAN
331
- else
332
- ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
333
- if numeric.is_a?( Proc )
334
- value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
335
- else
336
- value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
337
- end
338
- end
339
- else
340
- # do nothing - keep value as is :-).
341
- end
342
-
343
- logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
344
- end
345
-
346
- value
347
- end
348
-
349
-
350
-
351
- def parse_record( input, sep: )
352
- values = []
353
-
354
- space = config[:space]
355
-
356
- loop do
357
- value = parse_field( input, sep: sep )
358
- value = value.tr( space, ' ' ) if space && value.is_a?( String )
359
-
360
- logger.debug "value: »#{value}«" if logger.debug?
361
- values << value
362
-
363
- if input.eof?
364
- break
365
- elsif (c=input.peek; c==LF || c==CR)
366
- skip_newline( input )
367
- break
368
- elsif input.peek == sep
369
- input.getc ## eat-up FS(,)
370
- else
371
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
372
- end
373
- end
374
-
375
- values
376
- end
377
-
378
-
379
-
380
- def parse_meta( input )
381
- ## todo/check:
382
- ## check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
383
-
384
- input.getc ## eat-up (add document header ---) - skip "---"
385
- input.getc
386
- input.getc
387
-
388
- ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
389
- ## use match() or something to always match regexp
390
- skip_spaces( input ) # eat-up optional whitespaces in header line
391
- skip_newline( input )
392
-
393
- buf = "---\n" ## note: start buffer with yaml header line - why?
394
- ## ::YAML.load("") return false !!!
395
- ## ::YAML.load("---\n") returns nil -- yes!! if we get nil return empty hash {}
396
-
397
- newline = true
398
-
399
- ## eat-up until we hit "---" again
400
- loop do
401
- if input.eof?
402
- raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
403
- elsif (c=input.peek; c==LF || c==CR)
404
- while (c=input.peek; c==LF || c==CR ) ## add newlines
405
- buf << input.getc ## eat-up all until end of line
406
- end
407
- newline = true
408
- elsif newline && input.peekn(4) =~ /^---[\n\r \t]?$/ ## check if meta block end marker?
409
- ## todo/fix/check: allow (ignore) spaces after --- why? why not?
410
- input.getc ## eat-up (add document header ---) - skip "---"
411
- input.getc
412
- input.getc
413
- skip_spaces( input ) # eat-up optional whitespaces in header line
414
- skip_newline( input )
415
- break
416
- else
417
- buf << input.getc
418
- newline = false
419
- end
420
- end
421
-
422
- data = ::YAML.load( buf ) ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
423
- ## todo: check edge cases - always should return a hash or nil
424
- ## what to do with just integer, string or array etc. ???
425
-
426
- data = {} if data.nil? ## note: if nil return empty hash e.g. {}
427
- data
428
- end ## parse_meta
429
-
430
-
431
-
432
- def skip_newline( input ) ## note: singular (strict) version
433
- return if input.eof?
434
-
435
- ## only skip CR LF or LF or CR
436
- if input.peek == CR
437
- input.getc ## eat-up
438
- input.getc if input.peek == LF
439
- elsif input.peek == LF
440
- input.getc ## eat-up
441
- else
442
- # do nothing
443
- end
444
- end
445
-
446
-
447
-
448
- def skip_until_eol( input )
449
- return if input.eof?
450
-
451
- while (c=input.peek; !(c==LF || c==CR || input.eof?))
452
- input.getc ## eat-up all until end of line
453
- end
454
- end
455
-
456
-
457
- def skip_spaces( input )
458
- return 0 if input.eof?
459
-
460
- ## note: return number of spaces skipped (e.g. 0,1,2,etc.)
461
- spaces_count = 0
462
- while (c=input.peek; c==SPACE || c==TAB)
463
- input.getc ## note: always eat-up all spaces (" ") and tabs (\t)
464
- spaces_count += 1
465
- end
466
- spaces_count
467
- end
468
-
469
-
470
-
471
-
472
-
473
-
474
- def parse_lines( input, sep:, &block )
475
- ## note: reset (optional) meta data block
476
- @meta = nil ## no meta data block (use empty hash {} - why? why not?)
477
-
478
- ## note: track number of records
479
- ## used for meta block (can only start before any records e.g. if record_num == 0)
480
- record_num = 0
481
-
482
-
483
-
484
- hashtag = config[:hashtag]
485
-
486
- if hashtag
487
- comment = COMMENT_PERCENT
488
- ## todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
489
- else
490
- ## note: can either use '#' or '%' but NOT both; first one "wins"
491
- comment = nil
492
- end
493
-
494
-
495
- has_seen_directive = false
496
- has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
497
- ## note: can either use directives (@) or frontmatter (---) block; first one "wins"
498
-
499
- loop do
500
- break if input.eof?
501
-
502
- skipped_spaces = skip_spaces( input )
503
-
504
- if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
505
- logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
506
- comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
507
- skip_until_eol( input )
508
- skip_newline( input )
509
- elsif comment && input.peek == comment ## (anther) comment line
510
- logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
511
- skip_until_eol( input )
512
- skip_newline( input )
513
- elsif (c=input.peek; c==LF || c==CR || input.eof?)
514
- logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
515
- skip_newline( input )
516
- elsif record_num == 0 && hashtag == false && has_seen_frontmatter == false && input.peek==DIRECTIVE
517
- ## note: "skip" directives for now
518
- has_seen_directive = true
519
- logger.debug "skip directive" if logger.debug?
520
- skip_until_eol( input )
521
- skip_newline( input )
522
- elsif record_num == 0 && hashtag == false && has_seen_directive == false && has_seen_frontmatter == false &&
523
- skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
524
- ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
525
- has_seen_frontmatter = true
526
- logger.debug "start meta block" if logger.debug?
527
- ## note: meta gets stored as object attribute (state/state/state!!)
528
- ## use meta attribute to get meta data after reading first record
529
- @meta = parse_meta( input ) ## note: assumes a hash gets returned
530
- logger.debug " meta: >#{meta.inspect}<" if logger.debug?
531
- else
532
- logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
533
-
534
- record = parse_record( input, sep: sep )
535
- record_num +=1
536
-
537
- ## note: requires block - enforce? how? why? why not?
538
- block.call( record ) ## yield( record )
539
- end
540
- end # loop
541
- end # method parse_lines
542
-
543
-
544
-
545
-
546
- def convert_to_float( value ) Float( value ) rescue value; end
547
-
548
- def is_nan?( value )
549
- nan = @config[:nan]
550
- if nan.nil?
551
- false ## nothing set; return always false (not NaN)
552
- elsif nan.is_a?( Proc )
553
- nan.call( value )
554
- elsif nan.is_a?( Array )
555
- nan.include?( value )
556
- elsif nan.is_a?( String )
557
- value == nan
558
- else ## unknown config style / setting
559
- ## todo: issue a warning or error - why? why not?
560
- false ## nothing set; return always false (not nan)
561
- end
562
- end
563
-
564
-
565
- def is_null?( value )
566
- null = @config[:null]
567
- if null.nil?
568
- false ## nothing set; return always false (not null)
569
- elsif null.is_a?( Proc )
570
- null.call( value )
571
- elsif null.is_a?( Array )
572
- null.include?( value )
573
- elsif null.is_a?( String )
574
- value == null
575
- else ## unknown config style / setting
576
- ## todo: issue a warning or error - why? why not?
577
- false ## nothing set; return always false (not null)
578
- end
579
- end
580
-
581
-
582
- end # class ParserStd
583
- end # class CsvReader
1
+
2
+ class CsvReader
3
+
4
+
5
+
6
+
7
+
8
+ class ParserStd
9
+
10
+
11
+ ## char constants
12
+ DOUBLE_QUOTE = "\""
13
+ SINGLE_QUOTE = "'"
14
+ BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
+ COMMENT_HASH = "#" ## use COMMENT1 or COMMENT_HASH or HASH or ??
16
+ COMMENT_PERCENT = "%" ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
17
+ DIRECTIVE = "@" ## use a different name e.g. AT or ??
18
+ SPACE = " " ## \s == ASCII 32 (dec) = (Space)
19
+ TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
20
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
21
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
22
+
23
+
24
+
25
+ ###################################
26
+ ## add simple logger with debug flag/switch
27
+ #
28
+ # use Parser.debug = true # to turn on
29
+ #
30
+ # todo/fix: use logutils instead of std logger - why? why not?
31
+
32
+ def self.build_logger()
33
+ l = Logger.new( STDOUT )
34
+ l.level = :info ## set to :info on start; note: is 0 (debug) by default
35
+ l
36
+ end
37
+ def self.logger() @@logger ||= build_logger; end
38
+ def logger() self.class.logger; end
39
+
40
+
41
+
42
+
43
+ attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
44
+ attr_reader :meta
45
+
46
+ ##
47
+ ## todo/check:
48
+ ## null values - include NA - why? why not?
49
+ ## make null values case sensitive or add an option for case sensitive
50
+ ## or better allow a proc as option for checking too!!!
51
+ def initialize( sep: ',',
52
+ null: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
53
+ numeric: false, ## (auto-)convert all non-quoted values to float
54
+ nan: nil, ## note: only if numeric - set mappings for Float::NAN (not a number) values
55
+ space: nil,
56
+ hashtag: false
57
+ )
58
+ @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
59
+
60
+ check_sep( sep )
61
+ @config[:sep] = sep
62
+
63
+ ## note: null values must get handled by parser
64
+ ## only get checked for unquoted strings (and NOT for quoted strings)
65
+ ## "higher-level" code only knows about strings and has no longer any info if string was quoted or unquoted
66
+ @config[:null] = null ## null values
67
+ @config[:numeric] = numeric
68
+ @config[:nan] = nan # not a number (NaN) e.g. Float::NAN
69
+
70
+ ## e.g. treat/convert char to space e.g. _-+• etc
71
+ ## Man_Utd => Man Utd
72
+ ## or use it for leading and trailing spaces without quotes
73
+ ## todo/check: only use for unquoted values? why? why not?
74
+ @config[:space] = space
75
+
76
+ ## hxl - humanitarian eXchange language uses a hashtag row for "meta data"
77
+ ## e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
78
+ ## do NOT treat # as a comment (always use % for now)
79
+ @config[:hashtag] = hashtag
80
+
81
+ @meta = nil ## no meta data block (use empty hash {} - why? why not?)
82
+ end
83
+
84
+
85
+ SEPARATORS = ",;|^:"
86
+
87
+ def check_sep( sep )
88
+ ## note: parse does NOT support space or tab as separator!!
89
+ ## leading and trailing space or tab (whitespace) gets by default trimmed
90
+ ## unless quoted (or alternative space char used e.g. _-+ if configured)
91
+
92
+ if SEPARATORS.include?( sep )
93
+ ## everything ok
94
+ else
95
+ raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
96
+ end
97
+ end
98
+
99
+
100
+ #########################################
101
+ ## config convenience helpers
102
+ ## e.g. use like Csv.defaultl.null = '\N' etc. instead of
103
+ ## Csv.default.config[:null] = '\N'
104
+ def sep=( value ) check_sep( value ); @config[:sep]=value; end
105
+
106
+ def null=( value ) @config[:null]=value; end
107
+ def numeric=( value ) @config[:numeric]=value; end
108
+ def nan=( value ) @config[:nan]=value; end
109
+ def space=( value ) @config[:space]=value; end
110
+ def hashtag=( value ) @config[:hashtag]=value; end
111
+
112
+
113
+
114
+
115
+ def parse( str_or_readable, sep: config[:sep], &block )
116
+
117
+ check_sep( sep )
118
+
119
+ ## note: data - will wrap either a String or IO object passed in data
120
+ ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
121
+
122
+ ## make sure data (string or io) is a wrapped into Buffer!!!!!!
123
+ if str_or_readable.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
124
+ input = str_or_readable
125
+ else
126
+ input = Buffer.new( str_or_readable )
127
+ end
128
+
129
+ if block_given?
130
+ parse_lines( input, sep: sep, &block )
131
+ else
132
+ records = []
133
+
134
+ parse_lines( input, sep: sep ) do |record|
135
+ records << record
136
+ end
137
+
138
+ records
139
+ end
140
+ end ## method parse
141
+
142
+
143
+
144
+
145
+ private
146
+
147
+ def parse_escape( input, sep: )
148
+ value = ""
149
+ if input.peek == BACKSLASH
150
+ input.getc ## eat-up backslash
151
+ if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
152
+ logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
153
+ value << input.getc ## add escaped char (e.g. lf, cr, etc.)
154
+ else
155
+ ## unknown escape sequence; no special handling/escaping
156
+ logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
157
+ value << BACKSLASH
158
+ end
159
+ else
160
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
161
+ end
162
+ value
163
+ end
164
+
165
+
166
+
167
+ def parse_quote( input, sep:, opening_quote:, closing_quote:)
168
+ value = ""
169
+ if input.peek == opening_quote
170
+ input.getc ## eat-up opening quote
171
+
172
+ loop do
173
+ while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
174
+ value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
175
+ end
176
+
177
+ if input.eof?
178
+ break
179
+ elsif input.peek == BACKSLASH
180
+ value << parse_escape( input, sep: sep )
181
+ else ## assume input.peek == quote
182
+ input.getc ## eat-up quote
183
+ if opening_quote == closing_quote && input.peek == closing_quote
184
+ ## doubled up quote?
185
+ # note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
186
+ value << input.getc ## add doube quote and continue!!!!
187
+ else
188
+ break
189
+ end
190
+ end
191
+ end
192
+ else
193
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
194
+ end
195
+ value
196
+ end
197
+
198
+
199
+ def parse_field_until_sep( input, sep: )
200
+ value = ""
201
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
202
+ ## consume simple value
203
+ ## until we hit "," or "\n" or "\r"
204
+ ## note: will eat-up quotes too!!!
205
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
206
+ if input.peek == BACKSLASH
207
+ value << parse_escape( input, sep: sep )
208
+ else
209
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
210
+ value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
211
+ end
212
+ end
213
+ ## note: only strip **trailing** spaces (space and tab only)
214
+ ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
215
+ value = value.sub( /[ \t]+$/, '' )
216
+ value
217
+ end
218
+
219
+
220
+
221
+ def parse_field( input, sep: )
222
+ value = ""
223
+
224
+ numeric = config[:numeric]
225
+ hashtag = config[:hashtag]
226
+
227
+
228
+ logger.debug "parse field" if logger.debug?
229
+
230
+ skip_spaces( input ) ## strip leading spaces
231
+
232
+
233
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
234
+ ## note: allows null = '' that is turn unquoted empty strings into null/nil
235
+ ## or if using numeric into NotANumber (NaN)
236
+ if is_null?( value )
237
+ value = nil
238
+ elsif numeric && is_nan?( value ) ## todo: check - how to handle numeric? return nil, NaN, or "" ???
239
+ value = Float::NAN
240
+ else
241
+ # do nothing - keep value as is :-) e.g. "".
242
+ end
243
+ elsif input.peek == DOUBLE_QUOTE
244
+ logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
245
+ value << parse_quote( input, sep: sep,
246
+ opening_quote: DOUBLE_QUOTE,
247
+ closing_quote: DOUBLE_QUOTE )
248
+
249
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
250
+ spaces_count = skip_spaces( input )
251
+
252
+ ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
253
+ ## todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
254
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
255
+ ## everything ok (that is, regular quoted value)!!!
256
+ else
257
+ ## try auto-fix
258
+ ## todo: report warning/issue error (if configured)!!!
259
+ extra_value = parse_field_until_sep( input, sep: sep )
260
+ ## "reconstruct" non-quoted value
261
+ spaces = ' ' * spaces_count ## todo: preserve tab (\t) - why? why not?
262
+ ## note: minor (theoratical) issue (doubled quoted got "collapsed/escaped" to one from two in quoted value)
263
+ ## e.g. "hello """ extra, (becomes)=> "hello "" extra (one quote less/"eaten up")
264
+ value = %Q{"#{value}"#{spaces}#{extra_value}}
265
+ end
266
+
267
+ logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
268
+ elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
269
+ logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
270
+ value << parse_quote( input, sep: sep,
271
+ opening_quote: SINGLE_QUOTE,
272
+ closing_quote: SINGLE_QUOTE )
273
+
274
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
275
+ skip_spaces( input )
276
+ logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
277
+ elsif input.peek == "«"
278
+ value << parse_quote( input, sep: sep,
279
+ opening_quote: "«",
280
+ closing_quote: "»" )
281
+ skip_spaces( input )
282
+ elsif input.peek == "»"
283
+ value << parse_quote( input, sep: sep,
284
+ opening_quote: "»",
285
+ closing_quote: "«" )
286
+ skip_spaces( input )
287
+ elsif input.peek == "‹"
288
+ value << parse_quote( input, sep: sep,
289
+ opening_quote: "‹",
290
+ closing_quote: "›" )
291
+ skip_spaces( input )
292
+ elsif input.peek == "›"
293
+ value << parse_quote( input, sep: sep,
294
+ opening_quote: "›",
295
+ closing_quote: "‹" )
296
+ skip_spaces( input )
297
+ else
298
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
299
+ ## consume simple value
300
+ ## until we hit "," or "\n" or "\r"
301
+ ## note: will eat-up quotes too!!!
302
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
303
+ if input.peek == BACKSLASH
304
+ value << parse_escape( input, sep: sep )
305
+ ### check for end-of-line comments (e.g. # ...)
306
+ ## note: quick hack for now
307
+ ## will NOT work in hashtag (hxl) mode and for % comments
308
+ ## for now ALWAYS assumes # for comments
309
+ ## and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quotes values) for now
310
+ ## todo/fix: note: require leading space for comment hash (#) for now- why? why not?
311
+ ## require trailing space after comment hash (#) - why? why not?
312
+ elsif (hashtag == false || hashtag.nil?) && input.peek == COMMENT_HASH &&
313
+ (value.size == 0 || (value.size > 0 && value[-1] == ' '))
314
+ ## eat-up everything until end-of-line (eol)
315
+ skip_until_eol( input )
316
+ else
317
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
318
+ value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
319
+ end
320
+ end
321
+ ## note: only strip **trailing** spaces (space and tab only)
322
+ ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
323
+ value = value.sub( /[ \t]+$/, '' )
324
+
325
+ if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
326
+ value = nil
327
+ elsif numeric
328
+ if is_nan?( value )
329
+ value = Float::NAN
330
+ else
331
+ ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
332
+ if numeric.is_a?( Proc )
333
+ value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
334
+ else
335
+ value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
336
+ end
337
+ end
338
+ else
339
+ # do nothing - keep value as is :-).
340
+ end
341
+
342
+ logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
343
+ end
344
+
345
+ value
346
+ end
347
+
348
+
349
+
350
## Parse a single record (row) from the input.
##
##  Reads field after field (via parse_field) and collects the values
##  until the end of the input or the end of the line is reached.
##
##  input - character buffer (uses peek, getc and eof?)
##  sep   - field separator (FS) character
##
##  Returns the array of field values.
##  Raises ParseError if a field is followed by anything other than
##  the separator, a newline or the end of the input.
def parse_record( input, sep: )
  fields      = []
  space_chars = config[:space]   ## optional - chars to turn into a "plain" space

  loop do
    field = parse_field( input, sep: sep )
    ## note: only string values get translated (nil / numbers stay as is)
    field = field.tr( space_chars, ' ' )   if space_chars && field.is_a?( String )

    logger.debug "value: »#{field}«"  if logger.debug?
    fields << field

    break if input.eof?        ## no more input - record is complete

    nxt = input.peek
    if nxt == LF || nxt == CR  ## end of line - record is complete
      skip_newline( input )
      break
    end

    unless nxt == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
    end

    input.getc                 ## eat-up FS(,) and continue with next field
  end

  fields
end
376
+
377
+
378
+
379
## Parse a (yaml) meta data block / front matter enclosed in "---" lines.
##
##  Expects the input to be positioned at the opening "---" marker;
##  consumes everything up to and including the closing "---" line.
##
##  input - character buffer (uses peek, peekn, getc and eof?)
##
##  Returns the loaded yaml data (or an empty hash {} if the yaml loader returns nil).
##  Raises ParseError if the input ends before the closing "---" marker.
def parse_meta( input )
  ## todo/check:
  ##  check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
  3.times { input.getc }    ## eat-up opening marker - skip "---"

  skip_spaces( input )      ## eat-up optional whitespaces in header line
  skip_newline( input )

  ## note: start buffer with yaml header line - why?
  ##   ::YAML.load("") return false !!!
  ##   ::YAML.load("---\n") returns nil  -- yes!! if we get nil return empty hash {}
  yaml          = "---\n"
  at_line_start = true

  ## collect everything until we hit "---" (at the start of a line) again
  loop do
    raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )  if input.eof?

    nxt = input.peek
    if nxt == LF || nxt == CR
      ## keep all newlines in the buffer
      yaml << input.getc   while (c=input.peek; c==LF || c==CR )
      at_line_start = true
    elsif at_line_start && input.peekn(4) =~ /^---[\n\r \t]?$/    ## meta block end marker?
      3.times { input.getc }   ## eat-up closing marker - skip "---"
      skip_spaces( input )     ## eat-up optional whitespaces after marker
      skip_newline( input )
      break
    else
      yaml << input.getc
      at_line_start = false
    end
  end

  ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
  ## NOTE(review): the meta block comes straight from the (possibly untrusted) input;
  ##   on older yaml (psych) versions load may instantiate arbitrary objects -
  ##   consider safe_load - confirm supported ruby/psych versions first
  ## todo: check edge cases - always should return a hash or nil
  ##   what to do with just integer, string or array etc. ???
  meta = ::YAML.load( yaml )

  meta.nil? ? {} : meta    ## note: if nil return empty hash e.g. {}
end  ## parse_meta
428
+
429
+
430
+
431
## Skip exactly ONE newline (singular / strict version).
##
##  Eats-up CR LF or LF or CR; does nothing if the next char
##  is not a newline char or if the input is at its end.
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc                           ## eat-up CR
    input.getc   if input.peek == LF     ##  and (optional) LF of a CR LF pair
  when LF
    input.getc                           ## eat-up LF
  end
end
444
+
445
+
446
+
447
## Eat-up all chars up to (but NOT including) the next newline char (LF or CR)
##  or up to the end of the input.
def skip_until_eol( input )
  return if input.eof?

  loop do
    nxt = input.peek
    break if nxt == LF || nxt == CR || input.eof?
    input.getc     ## eat-up all until end of line
  end
end
454
+
455
+
456
## Eat-up all leading spaces (" ") and tabs (\t).
##
##  Returns the number of chars skipped (e.g. 0,1,2,etc.).
def skip_spaces( input )
  skipped = 0
  return skipped if input.eof?

  loop do
    nxt = input.peek
    break unless nxt == SPACE || nxt == TAB
    input.getc     ## note: always eat-up all spaces (" ") and tabs (\t)
    skipped += 1
  end

  skipped
end
467
+
468
+
469
+
470
+
471
+
472
+
473
## Parse the complete input line by line and pass every record on to the block.
##
##  Skips comment lines and blank lines; before the first record
##  also skips directive lines (@) and reads an (optional) meta data block
##  enclosed in "---" lines (stored in @meta; nil if there is no meta block).
##
##  input - character buffer (uses peek, peekn, getc and eof?)
##  sep   - field separator (FS) character
##  block - called with every record (array of values); required
##
##  note: the hashtag setting gets checked by truthiness everywhere, that is,
##   nil is handled like false (hashtag mode off); before, a nil setting
##   selected the non-hashtag comment style BUT (silently) turned off
##   directives and the meta data block
def parse_lines( input, sep:, &block )
  ## note: reset (optional) meta data block
  @meta = nil  ## no meta data block   (use empty hash {} - why? why not?)

  ## note: track number of records
  ##   used for meta block (can only start before any records e.g. if record_num == 0)
  record_num = 0

  hashtag = config[:hashtag]

  if hashtag
    comment = COMMENT_PERCENT
    ## todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
  else
    ## note: can either use '#' or '%' but NOT both; first one "wins"
    comment = nil
  end

  has_seen_directive   = false
  has_seen_frontmatter = false   ## - renameto  has_seen_dash (---) - why? why not???
     ## note: can either use directives (@) or frontmatter (---) block; first one "wins"

  loop do
    break if input.eof?

    skipped_spaces = skip_spaces( input )

    ## note: directives and the meta block are only allowed
    ##   before the first record and NOT in hashtag mode (nil or false == off)
    preamble = record_num == 0 && !hashtag

    if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      comment = input.getc  ## first comment line (determines/fixes "allowed" comment-style)
      skip_until_eol( input )
      skip_newline( input )
    elsif comment && input.peek == comment     ## (anther) comment line
      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif (c=input.peek; c==LF || c==CR || input.eof?)
      logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_newline( input )
    elsif preamble && !has_seen_frontmatter && input.peek==DIRECTIVE
      ## note: "skip" directives for now
      has_seen_directive = true
      logger.debug "skip directive" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif preamble && !has_seen_directive && !has_seen_frontmatter &&
          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
      ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
      has_seen_frontmatter = true
      logger.debug "start meta block" if logger.debug?
      ## note: meta gets stored as object attribute (state/state/state!!)
      ##         use meta attribute to get meta data after reading first record
      @meta = parse_meta( input )  ## note: assumes a hash gets returned
      logger.debug "  meta: >#{meta.inspect}<" if logger.debug?
    else
      logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

      record = parse_record( input, sep: sep )
      record_num +=1

      ## note: requires block - enforce? how? why? why not?
      block.call( record )   ## yield( record )
    end
  end  # loop
end # method parse_lines
541
+
542
+
543
+
544
+
545
## Convert the value to a float; note: fails silently, that is,
##  returns the value as is (unchanged) if the conversion raises an error.
def convert_to_float( value )
  Float( value )
rescue StandardError
  value
end
546
+
547
## Check if the value is a not a number (NaN) value
##  using the nan setting from the config:
##
##   Proc   - result of calling the proc with the value
##   Array  - true if the array includes the value
##   String - true if the value equals the string
##   nil or anything else - always false (not NaN)
def is_nan?( value )
  setting = @config[:nan]

  case setting
  when Proc   then setting.call( value )
  when Array  then setting.include?( value )
  when String then value == setting
  else
    ## nothing set (nil) or unknown config style / setting
    ##  todo: issue a warning or error for unknown settings - why? why not?
    false
  end
end
562
+
563
+
564
## Check if the value is a null (nil / not available) value
##  using the null setting from the config:
##
##   Proc   - result of calling the proc with the value
##   Array  - true if the array includes the value
##   String - true if the value equals the string
##   nil or anything else - always false (not null)
def is_null?( value )
  setting = @config[:null]

  case setting
  when Proc   then setting.call( value )
  when Array  then setting.include?( value )
  when String then value == setting
  else
    ## nothing set (nil) or unknown config style / setting
    ##  todo: issue a warning or error for unknown settings - why? why not?
    false
  end
end
579
+
580
+
581
+ end # class ParserStd
582
+ end # class CsvReader