csvreader 1.2.4 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,583 +1,582 @@
1
- # encoding: utf-8
2
-
3
- class CsvReader
4
-
5
-
6
-
7
-
8
-
9
- class ParserStd
10
-
11
-
12
- ## char constants
13
- DOUBLE_QUOTE = "\""
14
- SINGLE_QUOTE = "'"
15
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
16
- COMMENT_HASH = "#" ## use COMMENT1 or COMMENT_HASH or HASH or ??
17
- COMMENT_PERCENT = "%" ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
18
- DIRECTIVE = "@" ## use a different name e.g. AT or ??
19
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
20
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
21
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
22
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
23
-
24
-
25
-
26
- ###################################
27
- ## add simple logger with debug flag/switch
28
- #
29
- # use Parser.debug = true # to turn on
30
- #
31
- # todo/fix: use logutils instead of std logger - why? why not?
32
-
33
- def self.build_logger()
34
- l = Logger.new( STDOUT )
35
- l.level = :info ## set to :info on start; note: is 0 (debug) by default
36
- l
37
- end
38
- def self.logger() @@logger ||= build_logger; end
39
- def logger() self.class.logger; end
40
-
41
-
42
-
43
-
44
- attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
45
- attr_reader :meta
46
-
47
- ##
48
- ## todo/check:
49
- ## null values - include NA - why? why not?
50
- ## make null values case sensitive or add an option for case sensitive
51
- ## or better allow a proc as option for checking too!!!
52
- def initialize( sep: ',',
53
- null: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
54
- numeric: false, ## (auto-)convert all non-quoted values to float
55
- nan: nil, ## note: only if numeric - set mappings for Float::NAN (not a number) values
56
- space: nil,
57
- hashtag: false
58
- )
59
- @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
60
-
61
- check_sep( sep )
62
- @config[:sep] = sep
63
-
64
- ## note: null values must get handled by parser
65
- ## only get checked for unquoted strings (and NOT for quoted strings)
66
- ## "higher-level" code only knows about strings and has no longer any info if string was quoted or unquoted
67
- @config[:null] = null ## null values
68
- @config[:numeric] = numeric
69
- @config[:nan] = nan # not a number (NaN) e.g. Float::NAN
70
-
71
- ## e.g. treat/convert char to space e.g. _-+• etc
72
- ## Man_Utd => Man Utd
73
- ## or use it for leading and trailing spaces without quotes
74
- ## todo/check: only use for unquoted values? why? why not?
75
- @config[:space] = space
76
-
77
- ## hxl - humanitarian eXchange language uses a hashtag row for "meta data"
78
- ## e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
79
- ## do NOT treat # as a comment (always use % for now)
80
- @config[:hashtag] = hashtag
81
-
82
- @meta = nil ## no meta data block (use empty hash {} - why? why not?)
83
- end
84
-
85
-
86
- SEPARATORS = ",;|^:"
87
-
88
- def check_sep( sep )
89
- ## note: parse does NOT support space or tab as separator!!
90
- ## leading and trailing space or tab (whitespace) gets by default trimmed
91
- ## unless quoted (or alternative space char used e.g. _-+ if configured)
92
-
93
- if SEPARATORS.include?( sep )
94
- ## everything ok
95
- else
96
- raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
97
- end
98
- end
99
-
100
-
101
- #########################################
102
- ## config convenience helpers
103
- ## e.g. use like Csv.defaultl.null = '\N' etc. instead of
104
- ## Csv.default.config[:null] = '\N'
105
- def sep=( value ) check_sep( value ); @config[:sep]=value; end
106
-
107
- def null=( value ) @config[:null]=value; end
108
- def numeric=( value ) @config[:numeric]=value; end
109
- def nan=( value ) @config[:nan]=value; end
110
- def space=( value ) @config[:space]=value; end
111
- def hashtag=( value ) @config[:hashtag]=value; end
112
-
113
-
114
-
115
-
116
- def parse( str_or_readable, sep: config[:sep], &block )
117
-
118
- check_sep( sep )
119
-
120
- ## note: data - will wrap either a String or IO object passed in data
121
- ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
122
-
123
- ## make sure data (string or io) is a wrapped into Buffer!!!!!!
124
- if str_or_readable.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
125
- input = str_or_readable
126
- else
127
- input = Buffer.new( str_or_readable )
128
- end
129
-
130
- if block_given?
131
- parse_lines( input, sep: sep, &block )
132
- else
133
- records = []
134
-
135
- parse_lines( input, sep: sep ) do |record|
136
- records << record
137
- end
138
-
139
- records
140
- end
141
- end ## method parse
142
-
143
-
144
-
145
-
146
- private
147
-
148
- def parse_escape( input, sep: )
149
- value = ""
150
- if input.peek == BACKSLASH
151
- input.getc ## eat-up backslash
152
- if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
153
- logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
154
- value << input.getc ## add escaped char (e.g. lf, cr, etc.)
155
- else
156
- ## unknown escape sequence; no special handling/escaping
157
- logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
158
- value << BACKSLASH
159
- end
160
- else
161
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
162
- end
163
- value
164
- end
165
-
166
-
167
-
168
- def parse_quote( input, sep:, opening_quote:, closing_quote:)
169
- value = ""
170
- if input.peek == opening_quote
171
- input.getc ## eat-up opening quote
172
-
173
- loop do
174
- while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
175
- value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
176
- end
177
-
178
- if input.eof?
179
- break
180
- elsif input.peek == BACKSLASH
181
- value << parse_escape( input, sep: sep )
182
- else ## assume input.peek == quote
183
- input.getc ## eat-up quote
184
- if opening_quote == closing_quote && input.peek == closing_quote
185
- ## doubled up quote?
186
- # note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
187
- value << input.getc ## add doube quote and continue!!!!
188
- else
189
- break
190
- end
191
- end
192
- end
193
- else
194
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
195
- end
196
- value
197
- end
198
-
199
-
200
- def parse_field_until_sep( input, sep: )
201
- value = ""
202
- logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
203
- ## consume simple value
204
- ## until we hit "," or "\n" or "\r"
205
- ## note: will eat-up quotes too!!!
206
- while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
207
- if input.peek == BACKSLASH
208
- value << parse_escape( input, sep: sep )
209
- else
210
- logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
211
- value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
212
- end
213
- end
214
- ## note: only strip **trailing** spaces (space and tab only)
215
- ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
216
- value = value.sub( /[ \t]+$/, '' )
217
- value
218
- end
219
-
220
-
221
-
222
- def parse_field( input, sep: )
223
- value = ""
224
-
225
- numeric = config[:numeric]
226
- hashtag = config[:hashtag]
227
-
228
-
229
- logger.debug "parse field" if logger.debug?
230
-
231
- skip_spaces( input ) ## strip leading spaces
232
-
233
-
234
- if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
235
- ## note: allows null = '' that is turn unquoted empty strings into null/nil
236
- ## or if using numeric into NotANumber (NaN)
237
- if is_null?( value )
238
- value = nil
239
- elsif numeric && is_nan?( value ) ## todo: check - how to handle numeric? return nil, NaN, or "" ???
240
- value = Float::NAN
241
- else
242
- # do nothing - keep value as is :-) e.g. "".
243
- end
244
- elsif input.peek == DOUBLE_QUOTE
245
- logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
246
- value << parse_quote( input, sep: sep,
247
- opening_quote: DOUBLE_QUOTE,
248
- closing_quote: DOUBLE_QUOTE )
249
-
250
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
251
- spaces_count = skip_spaces( input )
252
-
253
- ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
254
- ## todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
255
- if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
256
- ## everything ok (that is, regular quoted value)!!!
257
- else
258
- ## try auto-fix
259
- ## todo: report warning/issue error (if configured)!!!
260
- extra_value = parse_field_until_sep( input, sep: sep )
261
- ## "reconstruct" non-quoted value
262
- spaces = ' ' * spaces_count ## todo: preserve tab (\t) - why? why not?
263
- ## note: minor (theoratical) issue (doubled quoted got "collapsed/escaped" to one from two in quoted value)
264
- ## e.g. "hello """ extra, (becomes)=> "hello "" extra (one quote less/"eaten up")
265
- value = %Q{"#{value}"#{spaces}#{extra_value}}
266
- end
267
-
268
- logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
269
- elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
270
- logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
271
- value << parse_quote( input, sep: sep,
272
- opening_quote: SINGLE_QUOTE,
273
- closing_quote: SINGLE_QUOTE )
274
-
275
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
276
- skip_spaces( input )
277
- logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
278
- elsif input.peek == "«"
279
- value << parse_quote( input, sep: sep,
280
- opening_quote: "«",
281
- closing_quote: "»" )
282
- skip_spaces( input )
283
- elsif input.peek == "»"
284
- value << parse_quote( input, sep: sep,
285
- opening_quote: "»",
286
- closing_quote: "«" )
287
- skip_spaces( input )
288
- elsif input.peek == "‹"
289
- value << parse_quote( input, sep: sep,
290
- opening_quote: "‹",
291
- closing_quote: "›" )
292
- skip_spaces( input )
293
- elsif input.peek == "›"
294
- value << parse_quote( input, sep: sep,
295
- opening_quote: "›",
296
- closing_quote: "‹" )
297
- skip_spaces( input )
298
- else
299
- logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
300
- ## consume simple value
301
- ## until we hit "," or "\n" or "\r"
302
- ## note: will eat-up quotes too!!!
303
- while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
304
- if input.peek == BACKSLASH
305
- value << parse_escape( input, sep: sep )
306
- ### check for end-of-line comments (e.g. # ...)
307
- ## note: quick hack for now
308
- ## will NOT work in hashtag (hxl) mode and for % comments
309
- ## for now ALWAYS assumes # for comments
310
- ## and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quotes values) for now
311
- ## todo/fix: note: require leading space for comment hash (#) for now- why? why not?
312
- ## require trailing space after comment hash (#) - why? why not?
313
- elsif (hashtag == false || hashtag.nil?) && input.peek == COMMENT_HASH &&
314
- (value.size == 0 || (value.size > 0 && value[-1] == ' '))
315
- ## eat-up everything until end-of-line (eol)
316
- skip_until_eol( input )
317
- else
318
- logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
319
- value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
320
- end
321
- end
322
- ## note: only strip **trailing** spaces (space and tab only)
323
- ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
324
- value = value.sub( /[ \t]+$/, '' )
325
-
326
- if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
327
- value = nil
328
- elsif numeric
329
- if is_nan?( value )
330
- value = Float::NAN
331
- else
332
- ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
333
- if numeric.is_a?( Proc )
334
- value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
335
- else
336
- value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
337
- end
338
- end
339
- else
340
- # do nothing - keep value as is :-).
341
- end
342
-
343
- logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
344
- end
345
-
346
- value
347
- end
348
-
349
-
350
-
351
- def parse_record( input, sep: )
352
- values = []
353
-
354
- space = config[:space]
355
-
356
- loop do
357
- value = parse_field( input, sep: sep )
358
- value = value.tr( space, ' ' ) if space && value.is_a?( String )
359
-
360
- logger.debug "value: »#{value}«" if logger.debug?
361
- values << value
362
-
363
- if input.eof?
364
- break
365
- elsif (c=input.peek; c==LF || c==CR)
366
- skip_newline( input )
367
- break
368
- elsif input.peek == sep
369
- input.getc ## eat-up FS(,)
370
- else
371
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
372
- end
373
- end
374
-
375
- values
376
- end
377
-
378
-
379
-
380
- def parse_meta( input )
381
- ## todo/check:
382
- ## check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
383
-
384
- input.getc ## eat-up (add document header ---) - skip "---"
385
- input.getc
386
- input.getc
387
-
388
- ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
389
- ## use match() or something to always match regexp
390
- skip_spaces( input ) # eat-up optional whitespaces in header line
391
- skip_newline( input )
392
-
393
- buf = "---\n" ## note: start buffer with yaml header line - why?
394
- ## ::YAML.load("") return false !!!
395
- ## ::YAML.load("---\n") returns nil -- yes!! if we get nil return empty hash {}
396
-
397
- newline = true
398
-
399
- ## eat-up until we hit "---" again
400
- loop do
401
- if input.eof?
402
- raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
403
- elsif (c=input.peek; c==LF || c==CR)
404
- while (c=input.peek; c==LF || c==CR ) ## add newlines
405
- buf << input.getc ## eat-up all until end of line
406
- end
407
- newline = true
408
- elsif newline && input.peekn(4) =~ /^---[\n\r \t]?$/ ## check if meta block end marker?
409
- ## todo/fix/check: allow (ignore) spaces after --- why? why not?
410
- input.getc ## eat-up (add document header ---) - skip "---"
411
- input.getc
412
- input.getc
413
- skip_spaces( input ) # eat-up optional whitespaces in header line
414
- skip_newline( input )
415
- break
416
- else
417
- buf << input.getc
418
- newline = false
419
- end
420
- end
421
-
422
- data = ::YAML.load( buf ) ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
423
- ## todo: check edge cases - always should return a hash or nil
424
- ## what to do with just integer, string or array etc. ???
425
-
426
- data = {} if data.nil? ## note: if nil return empty hash e.g. {}
427
- data
428
- end ## parse_meta
429
-
430
-
431
-
432
- def skip_newline( input ) ## note: singular (strict) version
433
- return if input.eof?
434
-
435
- ## only skip CR LF or LF or CR
436
- if input.peek == CR
437
- input.getc ## eat-up
438
- input.getc if input.peek == LF
439
- elsif input.peek == LF
440
- input.getc ## eat-up
441
- else
442
- # do nothing
443
- end
444
- end
445
-
446
-
447
-
448
- def skip_until_eol( input )
449
- return if input.eof?
450
-
451
- while (c=input.peek; !(c==LF || c==CR || input.eof?))
452
- input.getc ## eat-up all until end of line
453
- end
454
- end
455
-
456
-
457
- def skip_spaces( input )
458
- return 0 if input.eof?
459
-
460
- ## note: return number of spaces skipped (e.g. 0,1,2,etc.)
461
- spaces_count = 0
462
- while (c=input.peek; c==SPACE || c==TAB)
463
- input.getc ## note: always eat-up all spaces (" ") and tabs (\t)
464
- spaces_count += 1
465
- end
466
- spaces_count
467
- end
468
-
469
-
470
-
471
-
472
-
473
-
474
- def parse_lines( input, sep:, &block )
475
- ## note: reset (optional) meta data block
476
- @meta = nil ## no meta data block (use empty hash {} - why? why not?)
477
-
478
- ## note: track number of records
479
- ## used for meta block (can only start before any records e.g. if record_num == 0)
480
- record_num = 0
481
-
482
-
483
-
484
- hashtag = config[:hashtag]
485
-
486
- if hashtag
487
- comment = COMMENT_PERCENT
488
- ## todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
489
- else
490
- ## note: can either use '#' or '%' but NOT both; first one "wins"
491
- comment = nil
492
- end
493
-
494
-
495
- has_seen_directive = false
496
- has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
497
- ## note: can either use directives (@) or frontmatter (---) block; first one "wins"
498
-
499
- loop do
500
- break if input.eof?
501
-
502
- skipped_spaces = skip_spaces( input )
503
-
504
- if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
505
- logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
506
- comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
507
- skip_until_eol( input )
508
- skip_newline( input )
509
- elsif comment && input.peek == comment ## (anther) comment line
510
- logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
511
- skip_until_eol( input )
512
- skip_newline( input )
513
- elsif (c=input.peek; c==LF || c==CR || input.eof?)
514
- logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
515
- skip_newline( input )
516
- elsif record_num == 0 && hashtag == false && has_seen_frontmatter == false && input.peek==DIRECTIVE
517
- ## note: "skip" directives for now
518
- has_seen_directive = true
519
- logger.debug "skip directive" if logger.debug?
520
- skip_until_eol( input )
521
- skip_newline( input )
522
- elsif record_num == 0 && hashtag == false && has_seen_directive == false && has_seen_frontmatter == false &&
523
- skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
524
- ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
525
- has_seen_frontmatter = true
526
- logger.debug "start meta block" if logger.debug?
527
- ## note: meta gets stored as object attribute (state/state/state!!)
528
- ## use meta attribute to get meta data after reading first record
529
- @meta = parse_meta( input ) ## note: assumes a hash gets returned
530
- logger.debug " meta: >#{meta.inspect}<" if logger.debug?
531
- else
532
- logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
533
-
534
- record = parse_record( input, sep: sep )
535
- record_num +=1
536
-
537
- ## note: requires block - enforce? how? why? why not?
538
- block.call( record ) ## yield( record )
539
- end
540
- end # loop
541
- end # method parse_lines
542
-
543
-
544
-
545
-
546
- def convert_to_float( value ) Float( value ) rescue value; end
547
-
548
- def is_nan?( value )
549
- nan = @config[:nan]
550
- if nan.nil?
551
- false ## nothing set; return always false (not NaN)
552
- elsif nan.is_a?( Proc )
553
- nan.call( value )
554
- elsif nan.is_a?( Array )
555
- nan.include?( value )
556
- elsif nan.is_a?( String )
557
- value == nan
558
- else ## unknown config style / setting
559
- ## todo: issue a warning or error - why? why not?
560
- false ## nothing set; return always false (not nan)
561
- end
562
- end
563
-
564
-
565
- def is_null?( value )
566
- null = @config[:null]
567
- if null.nil?
568
- false ## nothing set; return always false (not null)
569
- elsif null.is_a?( Proc )
570
- null.call( value )
571
- elsif null.is_a?( Array )
572
- null.include?( value )
573
- elsif null.is_a?( String )
574
- value == null
575
- else ## unknown config style / setting
576
- ## todo: issue a warning or error - why? why not?
577
- false ## nothing set; return always false (not null)
578
- end
579
- end
580
-
581
-
582
- end # class ParserStd
583
- end # class CsvReader
1
+
2
+ class CsvReader
3
+
4
+
5
+
6
+
7
+
8
+ class ParserStd
9
+
10
+
11
+ ## char constants
12
+ DOUBLE_QUOTE = "\""
13
+ SINGLE_QUOTE = "'"
14
+ BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
+ COMMENT_HASH = "#" ## use COMMENT1 or COMMENT_HASH or HASH or ??
16
+ COMMENT_PERCENT = "%" ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
17
+ DIRECTIVE = "@" ## use a different name e.g. AT or ??
18
+ SPACE = " " ## \s == ASCII 32 (dec) = (Space)
19
+ TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
20
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
21
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
22
+
23
+
24
+
25
+ ###################################
26
+ ## add simple logger with debug flag/switch
27
+ #
28
+ # use Parser.debug = true # to turn on
29
+ #
30
+ # todo/fix: use logutils instead of std logger - why? why not?
31
+
32
+ def self.build_logger()
33
+ l = Logger.new( STDOUT )
34
+ l.level = :info ## set to :info on start; note: is 0 (debug) by default
35
+ l
36
+ end
37
+ def self.logger() @@logger ||= build_logger; end
38
+ def logger() self.class.logger; end
39
+
40
+
41
+
42
+
43
+ attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
44
+ attr_reader :meta
45
+
46
+ ##
47
+ ## todo/check:
48
+ ## null values - include NA - why? why not?
49
+ ## make null values case sensitive or add an option for case sensitive
50
+ ## or better allow a proc as option for checking too!!!
51
+ def initialize( sep: ',',
52
+ null: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
53
+ numeric: false, ## (auto-)convert all non-quoted values to float
54
+ nan: nil, ## note: only if numeric - set mappings for Float::NAN (not a number) values
55
+ space: nil,
56
+ hashtag: false
57
+ )
58
+ @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
59
+
60
+ check_sep( sep )
61
+ @config[:sep] = sep
62
+
63
+ ## note: null values must get handled by parser
64
+ ## only get checked for unquoted strings (and NOT for quoted strings)
65
+ ## "higher-level" code only knows about strings and has no longer any info if string was quoted or unquoted
66
+ @config[:null] = null ## null values
67
+ @config[:numeric] = numeric
68
+ @config[:nan] = nan # not a number (NaN) e.g. Float::NAN
69
+
70
+ ## e.g. treat/convert char to space e.g. _-+• etc
71
+ ## Man_Utd => Man Utd
72
+ ## or use it for leading and trailing spaces without quotes
73
+ ## todo/check: only use for unquoted values? why? why not?
74
+ @config[:space] = space
75
+
76
+ ## hxl - humanitarian eXchange language uses a hashtag row for "meta data"
77
+ ## e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
78
+ ## do NOT treat # as a comment (always use % for now)
79
+ @config[:hashtag] = hashtag
80
+
81
+ @meta = nil ## no meta data block (use empty hash {} - why? why not?)
82
+ end
83
+
84
+
85
+ SEPARATORS = ",;|^:"
86
+
87
+ def check_sep( sep )
88
+ ## note: parse does NOT support space or tab as separator!!
89
+ ## leading and trailing space or tab (whitespace) gets by default trimmed
90
+ ## unless quoted (or alternative space char used e.g. _-+ if configured)
91
+
92
+ if SEPARATORS.include?( sep )
93
+ ## everything ok
94
+ else
95
+ raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
96
+ end
97
+ end
98
+
99
+
100
+ #########################################
101
+ ## config convenience helpers
102
+ ## e.g. use like Csv.defaultl.null = '\N' etc. instead of
103
+ ## Csv.default.config[:null] = '\N'
104
+ def sep=( value ) check_sep( value ); @config[:sep]=value; end
105
+
106
+ def null=( value ) @config[:null]=value; end
107
+ def numeric=( value ) @config[:numeric]=value; end
108
+ def nan=( value ) @config[:nan]=value; end
109
+ def space=( value ) @config[:space]=value; end
110
+ def hashtag=( value ) @config[:hashtag]=value; end
111
+
112
+
113
+
114
+
115
+ def parse( str_or_readable, sep: config[:sep], &block )
116
+
117
+ check_sep( sep )
118
+
119
+ ## note: data - will wrap either a String or IO object passed in data
120
+ ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
121
+
122
+ ## make sure data (string or io) is a wrapped into Buffer!!!!!!
123
+ if str_or_readable.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
124
+ input = str_or_readable
125
+ else
126
+ input = Buffer.new( str_or_readable )
127
+ end
128
+
129
+ if block_given?
130
+ parse_lines( input, sep: sep, &block )
131
+ else
132
+ records = []
133
+
134
+ parse_lines( input, sep: sep ) do |record|
135
+ records << record
136
+ end
137
+
138
+ records
139
+ end
140
+ end ## method parse
141
+
142
+
143
+
144
+
145
+ private
146
+
147
+ def parse_escape( input, sep: )
148
+ value = ""
149
+ if input.peek == BACKSLASH
150
+ input.getc ## eat-up backslash
151
+ if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
152
+ logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
153
+ value << input.getc ## add escaped char (e.g. lf, cr, etc.)
154
+ else
155
+ ## unknown escape sequence; no special handling/escaping
156
+ logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
157
+ value << BACKSLASH
158
+ end
159
+ else
160
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
161
+ end
162
+ value
163
+ end
164
+
165
+
166
+
167
+ def parse_quote( input, sep:, opening_quote:, closing_quote:)
168
+ value = ""
169
+ if input.peek == opening_quote
170
+ input.getc ## eat-up opening quote
171
+
172
+ loop do
173
+ while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
174
+ value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
175
+ end
176
+
177
+ if input.eof?
178
+ break
179
+ elsif input.peek == BACKSLASH
180
+ value << parse_escape( input, sep: sep )
181
+ else ## assume input.peek == quote
182
+ input.getc ## eat-up quote
183
+ if opening_quote == closing_quote && input.peek == closing_quote
184
+ ## doubled up quote?
185
+ # note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
186
+ value << input.getc ## add doube quote and continue!!!!
187
+ else
188
+ break
189
+ end
190
+ end
191
+ end
192
+ else
193
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
194
+ end
195
+ value
196
+ end
197
+
198
+
199
+ def parse_field_until_sep( input, sep: )
200
+ value = ""
201
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
202
+ ## consume simple value
203
+ ## until we hit "," or "\n" or "\r"
204
+ ## note: will eat-up quotes too!!!
205
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
206
+ if input.peek == BACKSLASH
207
+ value << parse_escape( input, sep: sep )
208
+ else
209
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
210
+ value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
211
+ end
212
+ end
213
+ ## note: only strip **trailing** spaces (space and tab only)
214
+ ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
215
+ value = value.sub( /[ \t]+$/, '' )
216
+ value
217
+ end
218
+
219
+
220
+
221
+ def parse_field( input, sep: )
222
+ value = ""
223
+
224
+ numeric = config[:numeric]
225
+ hashtag = config[:hashtag]
226
+
227
+
228
+ logger.debug "parse field" if logger.debug?
229
+
230
+ skip_spaces( input ) ## strip leading spaces
231
+
232
+
233
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
234
+ ## note: allows null = '' that is turn unquoted empty strings into null/nil
235
+ ## or if using numeric into NotANumber (NaN)
236
+ if is_null?( value )
237
+ value = nil
238
+ elsif numeric && is_nan?( value ) ## todo: check - how to handle numeric? return nil, NaN, or "" ???
239
+ value = Float::NAN
240
+ else
241
+ # do nothing - keep value as is :-) e.g. "".
242
+ end
243
+ elsif input.peek == DOUBLE_QUOTE
244
+ logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
245
+ value << parse_quote( input, sep: sep,
246
+ opening_quote: DOUBLE_QUOTE,
247
+ closing_quote: DOUBLE_QUOTE )
248
+
249
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
250
+ spaces_count = skip_spaces( input )
251
+
252
+ ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
253
+ ## todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
254
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
255
+ ## everything ok (that is, regular quoted value)!!!
256
+ else
257
+ ## try auto-fix
258
+ ## todo: report warning/issue error (if configured)!!!
259
+ extra_value = parse_field_until_sep( input, sep: sep )
260
+ ## "reconstruct" non-quoted value
261
+ spaces = ' ' * spaces_count ## todo: preserve tab (\t) - why? why not?
262
+ ## note: minor (theoratical) issue (doubled quoted got "collapsed/escaped" to one from two in quoted value)
263
+ ## e.g. "hello """ extra, (becomes)=> "hello "" extra (one quote less/"eaten up")
264
+ value = %Q{"#{value}"#{spaces}#{extra_value}}
265
+ end
266
+
267
+ logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
268
+ elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
269
+ logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
270
+ value << parse_quote( input, sep: sep,
271
+ opening_quote: SINGLE_QUOTE,
272
+ closing_quote: SINGLE_QUOTE )
273
+
274
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
275
+ skip_spaces( input )
276
+ logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
277
+ elsif input.peek == "«"
278
+ value << parse_quote( input, sep: sep,
279
+ opening_quote: "«",
280
+ closing_quote: "»" )
281
+ skip_spaces( input )
282
+ elsif input.peek == "»"
283
+ value << parse_quote( input, sep: sep,
284
+ opening_quote: "»",
285
+ closing_quote: "«" )
286
+ skip_spaces( input )
287
+ elsif input.peek == "‹"
288
+ value << parse_quote( input, sep: sep,
289
+ opening_quote: "‹",
290
+ closing_quote: "›" )
291
+ skip_spaces( input )
292
+ elsif input.peek == "›"
293
+ value << parse_quote( input, sep: sep,
294
+ opening_quote: "›",
295
+ closing_quote: "‹" )
296
+ skip_spaces( input )
297
+ else
298
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
299
+ ## consume simple value
300
+ ## until we hit "," or "\n" or "\r"
301
+ ## note: will eat-up quotes too!!!
302
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
303
+ if input.peek == BACKSLASH
304
+ value << parse_escape( input, sep: sep )
305
+ ### check for end-of-line comments (e.g. # ...)
306
+ ## note: quick hack for now
307
+ ## will NOT work in hashtag (hxl) mode and for % comments
308
+ ## for now ALWAYS assumes # for comments
309
+ ## and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quotes values) for now
310
+ ## todo/fix: note: require leading space for comment hash (#) for now- why? why not?
311
+ ## require trailing space after comment hash (#) - why? why not?
312
+ elsif (hashtag == false || hashtag.nil?) && input.peek == COMMENT_HASH &&
313
+ (value.size == 0 || (value.size > 0 && value[-1] == ' '))
314
+ ## eat-up everything until end-of-line (eol)
315
+ skip_until_eol( input )
316
+ else
317
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
318
+ value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
319
+ end
320
+ end
321
+ ## note: only strip **trailing** spaces (space and tab only)
322
+ ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
323
+ value = value.sub( /[ \t]+$/, '' )
324
+
325
+ if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
326
+ value = nil
327
+ elsif numeric
328
+ if is_nan?( value )
329
+ value = Float::NAN
330
+ else
331
+ ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
332
+ if numeric.is_a?( Proc )
333
+ value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
334
+ else
335
+ value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
336
+ end
337
+ end
338
+ else
339
+ # do nothing - keep value as is :-).
340
+ end
341
+
342
+ logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
343
+ end
344
+
345
+ value
346
+ end
347
+
348
+
349
+
350
##
## Parse a single record (line) and return its values as an array.
##
##   Fields get read one-by-one via parse_field; after every field
##   the next char decides how to go on:
##     - end of input    => record is complete
##     - newline (LF/CR) => newline gets eaten-up; record is complete
##     - separator (sep) => separator gets eaten-up; read the next field
##     - anything else   => raise a ParseError
##
##   note: if config[:space] is set, all chars listed get replaced
##         by a "regular" space in string values (via String#tr)
def parse_record( input, sep: )
  record      = []
  space_chars = config[:space]

  loop do
    field = parse_field( input, sep: sep )
    if space_chars && field.is_a?( String )
      field = field.tr( space_chars, ' ' )
    end

    logger.debug "value: »#{field}«" if logger.debug?
    record << field

    break if input.eof?

    nxt = input.peek
    if nxt == LF || nxt == CR
      skip_newline( input )
      break
    end

    unless nxt == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
    end

    input.getc    ## eat-up FS(,)
  end

  record
end
376
+
377
+
378
+
379
##
## Parse a meta data block (front matter) enclosed in "---" marker lines.
##
##   note: the opening "---" marker is NOT checked again here;
##         the first three chars get eaten-up "blindly"
##         (the caller is expected to have checked with peekn).
##
##   All lines up to the closing "---" marker get collected and loaded as YAML.
##
##   returns the loaded YAML data; an empty hash {} if the YAML loads as nil
##   raises ParseError if the input ends before the closing "---" marker
def parse_meta( input )
  ## todo/check:
  ##   check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?

  3.times { input.getc }   ## eat-up (add document header ---) - skip "---"

  ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
  ##   use match() or something to always match regexp
  skip_spaces( input )     ## eat-up optional whitespaces in header line
  skip_newline( input )

  ## note: start buffer with yaml header line - why?
  ##   ::YAML.load("") return false !!!
  ##   ::YAML.load("---\n") returns nil -- yes!! if we get nil return empty hash {}
  ##
  ## note: use String.new for an explicit mutable buffer;
  ##   a plain string literal is frozen with frozen_string_literal enabled
  ##   and appending (<<) would raise a FrozenError
  buf = String.new( "---\n" )

  newline = true    ## true if we are at the start of a line

  ## eat-up until we hit "---" again
  loop do
    if input.eof?
      raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
    elsif (c=input.peek; c==LF || c==CR)
      while (c=input.peek; c==LF || c==CR )    ## add newlines
        buf << input.getc     ## eat-up all until end of line
      end
      newline = true
    elsif newline && input.peekn(4) =~ /^---[\n\r \t]?$/   ## check if meta block end marker?
      ## todo/fix/check: allow (ignore) spaces after --- why? why not?
      3.times { input.getc }  ## eat-up (add document header ---) - skip "---"
      skip_spaces( input )    ## eat-up optional whitespaces in header line
      skip_newline( input )
      break
    else
      buf << input.getc
      newline = false
    end
  end

  ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
  ## note(security): the buffer holds external (untrusted) data;
  ##   depending on the installed psych version YAML.load may instantiate arbitrary objects
  ##   - todo/check: change to ::YAML.safe_load - why? why not?
  data = ::YAML.load( buf )
  ## todo: check edge cases - always should return a hash or nil
  ##   what to do with just integer, string or array etc. ???

  data.nil? ? {} : data    ## note: if nil return empty hash e.g. {}
end ## parse_meta
428
+
429
+
430
+
431
##
## Skip (eat-up) exactly one newline, that is, CR LF or LF or CR.
##
##   note: singular (strict) version - a second newline is left untouched;
##         does nothing at the end of input or if the next char is no newline
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc                         ## eat-up CR
    input.getc if input.peek == LF     ## eat-up (optional) LF of CR LF pair
  when LF
    input.getc                         ## eat-up LF
  end
end
444
+
445
+
446
+
447
##
## Skip (eat-up) all chars up to the end of the line.
##
##   note: the newline (LF or CR) itself is NOT eaten-up;
##         stops at the end of input too
def skip_until_eol( input )
  until input.eof?
    ch = input.peek
    break if ch == LF || ch == CR
    input.getc     ## eat-up all until end of line
  end
end
454
+
455
+
456
##
## Skip (eat-up) all leading spaces (" ") and tabs (\t).
##
##   returns the number of chars skipped (e.g. 0,1,2,etc.)
def skip_spaces( input )
  skipped = 0
  return skipped if input.eof?

  loop do
    ch = input.peek
    break unless ch == SPACE || ch == TAB
    input.getc      ## note: always eat-up all spaces (" ") and tabs (\t)
    skipped += 1
  end

  skipped
end
467
+
468
+
469
+
470
+
471
+
472
+
473
##
## Parse all lines of the input and pass every record (array of values) on to the block.
##
##   Handled line-by-line (after skipping leading spaces/tabs):
##     - comment lines get skipped; in hashtag mode only "%" starts a comment,
##       otherwise the first comment char found ("#" or "%") "wins"
##     - blank lines get skipped
##     - directive lines ("@...") get skipped (for now)
##     - a meta block ("---") gets parsed and stored in @meta
##     - everything else gets parsed as a record via parse_record
##
##   note: directives and the meta block are only checked before the first record
##         and NOT in hashtag mode; first one (directive or meta block) "wins"
##
##   note: config[:hashtag] set to nil gets treated like false
##         (same as in the end-of-line comment check in parse_field)
##
##   note: requires a block - enforce? how? why? why not?
def parse_lines( input, sep:, &block )
  ## note: reset (optional) meta data block
  @meta = nil   ## no meta data block (use empty hash {} - why? why not?)

  ## note: track number of records
  ##   used for meta block (can only start before any records e.g. if record_num == 0)
  record_num = 0

  hashtag = config[:hashtag]

  ## hashtag mode:     comment char is always "%"
  ##   todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
  ## non-hashtag mode: can either use '#' or '%' but NOT both; first one "wins"
  comment = hashtag ? COMMENT_PERCENT : nil

  has_seen_directive   = false
  has_seen_frontmatter = false   ## - renameto has_seen_dash (---) - why? why not???
  ## note: can either use directives (@) or frontmatter (---) block; first one "wins"

  loop do
    break if input.eof?

    skipped_spaces = skip_spaces( input )

    if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      comment = input.getc   ## first comment line (determines/fixes "allowed" comment-style)
      skip_until_eol( input )
      skip_newline( input )
    elsif comment && input.peek == comment    ## (another) comment line
      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif (c=input.peek; c==LF || c==CR || input.eof?)
      logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_frontmatter && input.peek == DIRECTIVE
      ## note: "skip" directives for now
      has_seen_directive = true
      logger.debug "skip directive" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_directive && !has_seen_frontmatter &&
          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
      ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
      has_seen_frontmatter = true
      logger.debug "start meta block" if logger.debug?
      ## note: meta gets stored as object attribute (state/state/state!!)
      ##   use meta attribute to get meta data after reading first record
      @meta = parse_meta( input )   ## note: assumes a hash gets returned
      logger.debug " meta: >#{meta.inspect}<" if logger.debug?
    else
      logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

      record = parse_record( input, sep: sep )
      record_num += 1

      ## note: requires block - enforce? how? why? why not?
      block.call( record )   ## yield( record )
    end
  end # loop
end # method parse_lines
541
+
542
+
543
+
544
+
545
##
## Convert the value to a float (via Kernel#Float).
##
##   note: fails silently, that is,
##         returns the value as is (unchanged) if the conversion raises an error
def convert_to_float( value )
  Float( value )
rescue StandardError
  value
end
546
+
547
##
## Check if the value is a not a number (NaN) marker as set in @config[:nan].
##
##   supported settings:
##     nil    - nothing set; return always false (not NaN)
##     Proc   - returns the result of calling the proc with the value
##     Array  - true if the array includes the value
##     String - true if the value is equal to the string
##   any other setting returns always false
def is_nan?( value )
  setting = @config[:nan]

  case setting
  when nil    then false    ## nothing set; return always false (not NaN)
  when Proc   then setting.call( value )
  when Array  then setting.include?( value )
  when String then value == setting
  else
    ## unknown config style / setting
    ##  todo: issue a warning or error - why? why not?
    false
  end
end
562
+
563
+
564
##
## Check if the value is a null (nil) marker as set in @config[:null].
##
##   supported settings:
##     nil    - nothing set; return always false (not null)
##     Proc   - returns the result of calling the proc with the value
##     Array  - true if the array includes the value
##     String - true if the value is equal to the string
##   any other setting returns always false
def is_null?( value )
  setting = @config[:null]

  case setting
  when nil    then false    ## nothing set; return always false (not null)
  when Proc   then setting.call( value )
  when Array  then setting.include?( value )
  when String then value == setting
  else
    ## unknown config style / setting
    ##  todo: issue a warning or error - why? why not?
    false
  end
end
579
+
580
+
581
+ end # class ParserStd
582
+ end # class CsvReader