csvreader 1.2.1 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,534 +1,582 @@
1
- # encoding: utf-8
2
-
3
- class CsvReader
4
-
5
-
6
-
7
-
8
-
9
- class ParserStd
10
-
11
-
12
- ## char constants
13
- DOUBLE_QUOTE = "\""
14
- SINGLE_QUOTE = "'"
15
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
16
- COMMENT_HASH = "#" ## use COMMENT1 or COMMENT_HASH or HASH or ??
17
- COMMENT_PERCENT = "%" ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
18
- DIRECTIVE = "@" ## use a different name e.g. AT or ??
19
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
20
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
21
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
22
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
23
-
24
-
25
-
26
- ###################################
27
- ## add simple logger with debug flag/switch
28
- #
29
- # use Parser.debug = true # to turn on
30
- #
31
- # todo/fix: use logutils instead of std logger - why? why not?
32
-
33
- def self.build_logger()
34
- l = Logger.new( STDOUT )
35
- l.level = :info ## set to :info on start; note: is 0 (debug) by default
36
- l
37
- end
38
- def self.logger() @@logger ||= build_logger; end
39
- def logger() self.class.logger; end
40
-
41
-
42
-
43
-
44
- attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
45
- attr_reader :meta
46
-
47
- ##
48
- ## todo/check:
49
- ## null values - include NA - why? why not?
50
- ## make null values case sensitive or add an option for case sensitive
51
- ## or better allow a proc as option for checking too!!!
52
- def initialize( sep: ',',
53
- null: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
54
- numeric: false, ## (auto-)convert all non-quoted values to float
55
- nan: nil, ## note: only if numeric - set mappings for Float::NAN (not a number) values
56
- space: nil,
57
- hashtag: false
58
- )
59
- @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
60
-
61
- check_sep( sep )
62
- @config[:sep] = sep
63
-
64
- ## note: null values must get handled by parser
65
- ## only get checked for unquoted strings (and NOT for quoted strings)
66
- ## "higher-level" code only knows about strings and has no longer any info if string was quoted or unquoted
67
- @config[:null] = null ## null values
68
- @config[:numeric] = numeric
69
- @config[:nan] = nan # not a number (NaN) e.g. Float::NAN
70
-
71
- ## e.g. treat/convert char to space e.g. _-+• etc
72
- ## Man_Utd => Man Utd
73
- ## or use it for leading and trailing spaces without quotes
74
- ## todo/check: only use for unquoted values? why? why not?
75
- @config[:space] = space
76
-
77
- ## hxl - humanitarian eXchange language uses a hashtag row for "meta data"
78
- ## e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
79
- ## do NOT treat # as a comment (always use % for now)
80
- @config[:hashtag] = hashtag
81
-
82
- @meta = nil ## no meta data block (use empty hash {} - why? why not?)
83
- end
84
-
85
-
86
- SEPARATORS = ",;|^:"
87
-
88
- def check_sep( sep )
89
- ## note: parse does NOT support space or tab as separator!!
90
- ## leading and trailing space or tab (whitespace) gets by default trimmed
91
- ## unless quoted (or alternative space char used e.g. _-+ if configured)
92
-
93
- if SEPARATORS.include?( sep )
94
- ## everything ok
95
- else
96
- raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
97
- end
98
- end
99
-
100
-
101
- #########################################
102
- ## config convenience helpers
103
- ## e.g. use like Csv.defaultl.null = '\N' etc. instead of
104
- ## Csv.default.config[:null] = '\N'
105
- def sep=( value ) check_sep( value ); @config[:sep]=value; end
106
-
107
- def null=( value ) @config[:null]=value; end
108
- def numeric=( value ) @config[:numeric]=value; end
109
- def nan=( value ) @config[:nan]=value; end
110
- def space=( value ) @config[:space]=value; end
111
- def hashtag=( value ) @config[:hashtag]=value; end
112
-
113
-
114
-
115
-
116
- def parse( str_or_readable, sep: config[:sep], &block )
117
-
118
- check_sep( sep )
119
-
120
- ## note: data - will wrap either a String or IO object passed in data
121
- ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
122
-
123
- ## make sure data (string or io) is a wrapped into Buffer!!!!!!
124
- if str_or_readable.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
125
- input = str_or_readable
126
- else
127
- input = Buffer.new( str_or_readable )
128
- end
129
-
130
- if block_given?
131
- parse_lines( input, sep: sep, &block )
132
- else
133
- records = []
134
-
135
- parse_lines( input, sep: sep ) do |record|
136
- records << record
137
- end
138
-
139
- records
140
- end
141
- end ## method parse
142
-
143
-
144
-
145
-
146
- private
147
-
148
- def parse_escape( input, sep: )
149
- value = ""
150
- if input.peek == BACKSLASH
151
- input.getc ## eat-up backslash
152
- if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
153
- logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
154
- value << input.getc ## add escaped char (e.g. lf, cr, etc.)
155
- else
156
- ## unknown escape sequence; no special handling/escaping
157
- logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
158
- value << BACKSLASH
159
- end
160
- else
161
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
162
- end
163
- value
164
- end
165
-
166
-
167
-
168
- def parse_quote( input, sep:, opening_quote:, closing_quote:)
169
- value = ""
170
- if input.peek == opening_quote
171
- input.getc ## eat-up opening quote
172
-
173
- loop do
174
- while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
175
- value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
176
- end
177
-
178
- if input.eof?
179
- break
180
- elsif input.peek == BACKSLASH
181
- value << parse_escape( input, sep: sep )
182
- else ## assume input.peek == quote
183
- input.getc ## eat-up quote
184
- if opening_quote == closing_quote && input.peek == closing_quote
185
- ## doubled up quote?
186
- # note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
187
- value << input.getc ## add doube quote and continue!!!!
188
- else
189
- break
190
- end
191
- end
192
- end
193
- else
194
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
195
- end
196
- value
197
- end
198
-
199
-
200
-
201
-
202
- def parse_field( input, sep: )
203
- value = ""
204
-
205
- numeric = config[:numeric]
206
-
207
- logger.debug "parse field" if logger.debug?
208
-
209
- skip_spaces( input ) ## strip leading spaces
210
-
211
-
212
- if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
213
- ## note: allows null = '' that is turn unquoted empty strings into null/nil
214
- ## or if using numeric into NotANumber (NaN)
215
- if is_null?( value )
216
- value = nil
217
- elsif numeric && is_nan?( value ) ## todo: check - how to handle numeric? return nil, NaN, or "" ???
218
- value = Float::NAN
219
- else
220
- # do nothing - keep value as is :-) e.g. "".
221
- end
222
- elsif input.peek == DOUBLE_QUOTE
223
- logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
224
- value << parse_quote( input, sep: sep,
225
- opening_quote: DOUBLE_QUOTE,
226
- closing_quote: DOUBLE_QUOTE )
227
-
228
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
229
- skip_spaces( input )
230
- logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
231
- elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
232
- logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
233
- value << parse_quote( input, sep: sep,
234
- opening_quote: SINGLE_QUOTE,
235
- closing_quote: SINGLE_QUOTE )
236
-
237
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
238
- skip_spaces( input )
239
- logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
240
- elsif input.peek == "«"
241
- value << parse_quote( input, sep: sep,
242
- opening_quote: "«",
243
- closing_quote: "»" )
244
- skip_spaces( input )
245
- elsif input.peek == "»"
246
- value << parse_quote( input, sep: sep,
247
- opening_quote: "»",
248
- closing_quote: "«" )
249
- skip_spaces( input )
250
- elsif input.peek == "‹"
251
- value << parse_quote( input, sep: sep,
252
- opening_quote: "‹",
253
- closing_quote: "›" )
254
- skip_spaces( input )
255
- elsif input.peek == "›"
256
- value << parse_quote( input, sep: sep,
257
- opening_quote: "›",
258
- closing_quote: "‹" )
259
- skip_spaces( input )
260
- else
261
- logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
262
- ## consume simple value
263
- ## until we hit "," or "\n" or "\r"
264
- ## note: will eat-up quotes too!!!
265
- while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
266
- if input.peek == BACKSLASH
267
- value << parse_escape( input, sep: sep )
268
- else
269
- logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
270
- value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
271
- end
272
- end
273
- ## note: only strip **trailing** spaces (space and tab only)
274
- ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
275
- value = value.sub( /[ \t]+$/, '' )
276
-
277
- if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
278
- value = nil
279
- elsif numeric
280
- if is_nan?( value )
281
- value = Float::NAN
282
- else
283
- ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
284
- if numeric.is_a?( Proc )
285
- value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
286
- else
287
- value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
288
- end
289
- end
290
- else
291
- # do nothing - keep value as is :-).
292
- end
293
-
294
- logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
295
- end
296
-
297
- value
298
- end
299
-
300
-
301
-
302
- def parse_record( input, sep: )
303
- values = []
304
-
305
- space = config[:space]
306
-
307
- loop do
308
- value = parse_field( input, sep: sep )
309
- value = value.tr( space, ' ' ) if space && value.is_a?( String )
310
-
311
- logger.debug "value: »#{value}«" if logger.debug?
312
- values << value
313
-
314
- if input.eof?
315
- break
316
- elsif (c=input.peek; c==LF || c==CR)
317
- skip_newline( input )
318
- break
319
- elsif input.peek == sep
320
- input.getc ## eat-up FS(,)
321
- else
322
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
323
- end
324
- end
325
-
326
- values
327
- end
328
-
329
-
330
-
331
- def parse_meta( input )
332
- ## todo/check:
333
- ## check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
334
-
335
- input.getc ## eat-up (add document header ---) - skip "---"
336
- input.getc
337
- input.getc
338
-
339
- ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
340
- ## use match() or something to always match regexp
341
- skip_spaces( input ) # eat-up optional whitespaces in header line
342
- skip_newline( input )
343
-
344
- buf = "---\n" ## note: start buffer with yaml header line - why?
345
- ## YAML.load("") return false !!!
346
- ## YAML.load("---\n") returns nil -- yes!! if we get nil return empty hash {}
347
-
348
- newline = true
349
-
350
- ## eat-up until we hit "---" again
351
- loop do
352
- if input.eof?
353
- raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
354
- elsif (c=input.peek; c==LF || c==CR)
355
- while (c=input.peek; c==LF || c==CR ) ## add newlines
356
- buf << input.getc ## eat-up all until end of line
357
- end
358
- newline = true
359
- elsif newline && input.peekn(4) =~ /^---[\n\r \t]?$/ ## check if meta block end marker?
360
- ## todo/fix/check: allow (ignore) spaces after --- why? why not?
361
- input.getc ## eat-up (add document header ---) - skip "---"
362
- input.getc
363
- input.getc
364
- skip_spaces( input ) # eat-up optional whitespaces in header line
365
- skip_newline( input )
366
- break
367
- else
368
- buf << input.getc
369
- newline = false
370
- end
371
- end
372
-
373
- data = YAML.load( buf )
374
- ## todo: check edge cases - always should return a hash or nil
375
- ## what to do with just integer, string or array etc. ???
376
-
377
- data = {} if data.nil? ## note: if nil return empty hash e.g. {}
378
- data
379
- end ## parse_meta
380
-
381
-
382
-
383
- def skip_newline( input ) ## note: singular (strict) version
384
- return if input.eof?
385
-
386
- ## only skip CR LF or LF or CR
387
- if input.peek == CR
388
- input.getc ## eat-up
389
- input.getc if input.peek == LF
390
- elsif input.peek == LF
391
- input.getc ## eat-up
392
- else
393
- # do nothing
394
- end
395
- end
396
-
397
-
398
-
399
- def skip_until_eol( input )
400
- return if input.eof?
401
-
402
- while (c=input.peek; !(c==LF || c==CR || input.eof?))
403
- input.getc ## eat-up all until end of line
404
- end
405
- end
406
-
407
-
408
- def skip_spaces( input )
409
- return 0 if input.eof?
410
-
411
- ## note: return number of spaces skipped (e.g. 0,1,2,etc.)
412
- spaces_count = 0
413
- while (c=input.peek; c==SPACE || c==TAB)
414
- input.getc ## note: always eat-up all spaces (" ") and tabs (\t)
415
- spaces_count += 1
416
- end
417
- spaces_count
418
- end
419
-
420
-
421
-
422
-
423
-
424
-
425
- def parse_lines( input, sep:, &block )
426
- ## note: reset (optional) meta data block
427
- @meta = nil ## no meta data block (use empty hash {} - why? why not?)
428
-
429
- ## note: track number of records
430
- ## used for meta block (can only start before any records e.g. if record_num == 0)
431
- record_num = 0
432
-
433
-
434
-
435
- hashtag = config[:hashtag]
436
-
437
- if hashtag
438
- comment = COMMENT_PERCENT
439
- ## todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
440
- else
441
- ## note: can either use '#' or '%' but NOT both; first one "wins"
442
- comment = nil
443
- end
444
-
445
-
446
- has_seen_directive = false
447
- has_seen_frontmatter = false ## - renameto has_seen_dash (---) - why? why not???
448
- ## note: can either use directives (@) or frontmatter (---) block; first one "wins"
449
-
450
- loop do
451
- break if input.eof?
452
-
453
- skipped_spaces = skip_spaces( input )
454
-
455
- if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
456
- logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
457
- comment = input.getc ## first comment line (determines/fixes "allowed" comment-style)
458
- skip_until_eol( input )
459
- skip_newline( input )
460
- elsif comment && input.peek == comment ## (anther) comment line
461
- logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
462
- skip_until_eol( input )
463
- skip_newline( input )
464
- elsif (c=input.peek; c==LF || c==CR || input.eof?)
465
- logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
466
- skip_newline( input )
467
- elsif record_num == 0 && hashtag == false && has_seen_frontmatter == false && input.peek==DIRECTIVE
468
- ## note: "skip" directives for now
469
- has_seen_directive = true
470
- logger.debug "skip directive" if logger.debug?
471
- skip_until_eol( input )
472
- skip_newline( input )
473
- elsif record_num == 0 && hashtag == false && has_seen_directive == false && has_seen_frontmatter == false &&
474
- skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
475
- ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
476
- has_seen_frontmatter = true
477
- logger.debug "start meta block" if logger.debug?
478
- ## note: meta gets stored as object attribute (state/state/state!!)
479
- ## use meta attribute to get meta data after reading first record
480
- @meta = parse_meta( input ) ## note: assumes a hash gets returned
481
- logger.debug " meta: >#{meta.inspect}<" if logger.debug?
482
- else
483
- logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
484
-
485
- record = parse_record( input, sep: sep )
486
- record_num +=1
487
-
488
- ## note: requires block - enforce? how? why? why not?
489
- block.call( record ) ## yield( record )
490
- end
491
- end # loop
492
- end # method parse_lines
493
-
494
-
495
-
496
-
497
- def convert_to_float( value ) Float( value ) rescue value; end
498
-
499
- def is_nan?( value )
500
- nan = @config[:nan]
501
- if nan.nil?
502
- false ## nothing set; return always false (not NaN)
503
- elsif nan.is_a?( Proc )
504
- nan.call( value )
505
- elsif nan.is_a?( Array )
506
- nan.include?( value )
507
- elsif nan.is_a?( String )
508
- value == nan
509
- else ## unknown config style / setting
510
- ## todo: issue a warning or error - why? why not?
511
- false ## nothing set; return always false (not nan)
512
- end
513
- end
514
-
515
-
516
- def is_null?( value )
517
- null = @config[:null]
518
- if null.nil?
519
- false ## nothing set; return always false (not null)
520
- elsif null.is_a?( Proc )
521
- null.call( value )
522
- elsif null.is_a?( Array )
523
- null.include?( value )
524
- elsif null.is_a?( String )
525
- value == null
526
- else ## unknown config style / setting
527
- ## todo: issue a warning or error - why? why not?
528
- false ## nothing set; return always false (not null)
529
- end
530
- end
531
-
532
-
533
- end # class ParserStd
534
- end # class CsvReader
1
+
2
+ class CsvReader
3
+
4
+
5
+
6
+
7
+
8
+ class ParserStd
9
+
10
+
11
+ ## char constants
12
+ DOUBLE_QUOTE = "\""
13
+ SINGLE_QUOTE = "'"
14
+ BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
+ COMMENT_HASH = "#" ## use COMMENT1 or COMMENT_HASH or HASH or ??
16
+ COMMENT_PERCENT = "%" ## use COMMENT2 or COMMENT_PERCENT or PERCENT or ??
17
+ DIRECTIVE = "@" ## use a different name e.g. AT or ??
18
+ SPACE = " " ## \s == ASCII 32 (dec) = (Space)
19
+ TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
20
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
21
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
22
+
23
+
24
+
25
+ ###################################
26
+ ## add simple logger with debug flag/switch
27
+ #
28
+ # use Parser.debug = true # to turn on
29
+ #
30
+ # todo/fix: use logutils instead of std logger - why? why not?
31
+
32
+ def self.build_logger()
33
+ l = Logger.new( STDOUT )
34
+ l.level = :info ## set to :info on start; note: is 0 (debug) by default
35
+ l
36
+ end
37
+ def self.logger() @@logger ||= build_logger; end
38
+ def logger() self.class.logger; end
39
+
40
+
41
+
42
+
43
+ attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
44
+ attr_reader :meta
45
+
46
+ ##
47
+ ## todo/check:
48
+ ## null values - include NA - why? why not?
49
+ ## make null values case sensitive or add an option for case sensitive
50
+ ## or better allow a proc as option for checking too!!!
51
+ def initialize( sep: ',',
52
+ null: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
53
+ numeric: false, ## (auto-)convert all non-quoted values to float
54
+ nan: nil, ## note: only if numeric - set mappings for Float::NAN (not a number) values
55
+ space: nil,
56
+ hashtag: false
57
+ )
58
+ @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
59
+
60
+ check_sep( sep )
61
+ @config[:sep] = sep
62
+
63
+ ## note: null values must get handled by parser
64
+ ## only get checked for unquoted strings (and NOT for quoted strings)
65
+ ## "higher-level" code only knows about strings and has no longer any info if string was quoted or unquoted
66
+ @config[:null] = null ## null values
67
+ @config[:numeric] = numeric
68
+ @config[:nan] = nan # not a number (NaN) e.g. Float::NAN
69
+
70
+ ## e.g. treat/convert char to space e.g. _-+• etc
71
+ ## Man_Utd => Man Utd
72
+ ## or use it for leading and trailing spaces without quotes
73
+ ## todo/check: only use for unquoted values? why? why not?
74
+ @config[:space] = space
75
+
76
+ ## hxl - humanitarian eXchange language uses a hashtag row for "meta data"
77
+ ## e.g. #sector+en,#subsector,#org,#country,#sex+#targeted,#sex+#targeted,#adm1
78
+ ## do NOT treat # as a comment (always use % for now)
79
+ @config[:hashtag] = hashtag
80
+
81
+ @meta = nil ## no meta data block (use empty hash {} - why? why not?)
82
+ end
83
+
84
+
85
+ SEPARATORS = ",;|^:"
86
+
87
+ def check_sep( sep )
88
+ ## note: parse does NOT support space or tab as separator!!
89
+ ## leading and trailing space or tab (whitespace) gets by default trimmed
90
+ ## unless quoted (or alternative space char used e.g. _-+ if configured)
91
+
92
+ if SEPARATORS.include?( sep )
93
+ ## everything ok
94
+ else
95
+ raise ArgumentError, "invalid/unsupported sep >#{sep}< - for now only >#{SEPARATORS}< allowed; sorry"
96
+ end
97
+ end
98
+
99
+
100
+ #########################################
101
+ ## config convenience helpers
102
+ ## e.g. use like Csv.defaultl.null = '\N' etc. instead of
103
+ ## Csv.default.config[:null] = '\N'
104
+ def sep=( value ) check_sep( value ); @config[:sep]=value; end
105
+
106
+ def null=( value ) @config[:null]=value; end
107
+ def numeric=( value ) @config[:numeric]=value; end
108
+ def nan=( value ) @config[:nan]=value; end
109
+ def space=( value ) @config[:space]=value; end
110
+ def hashtag=( value ) @config[:hashtag]=value; end
111
+
112
+
113
+
114
+
115
+ def parse( str_or_readable, sep: config[:sep], &block )
116
+
117
+ check_sep( sep )
118
+
119
+ ## note: data - will wrap either a String or IO object passed in data
120
+ ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
121
+
122
+ ## make sure data (string or io) is a wrapped into Buffer!!!!!!
123
+ if str_or_readable.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
124
+ input = str_or_readable
125
+ else
126
+ input = Buffer.new( str_or_readable )
127
+ end
128
+
129
+ if block_given?
130
+ parse_lines( input, sep: sep, &block )
131
+ else
132
+ records = []
133
+
134
+ parse_lines( input, sep: sep ) do |record|
135
+ records << record
136
+ end
137
+
138
+ records
139
+ end
140
+ end ## method parse
141
+
142
+
143
+
144
+
145
+ private
146
+
147
+ def parse_escape( input, sep: )
148
+ value = ""
149
+ if input.peek == BACKSLASH
150
+ input.getc ## eat-up backslash
151
+ if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || c==DOUBLE_QUOTE || c==SINGLE_QUOTE )
152
+ logger.debug " add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
153
+ value << input.getc ## add escaped char (e.g. lf, cr, etc.)
154
+ else
155
+ ## unknown escape sequence; no special handling/escaping
156
+ logger.debug " add backspace (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
157
+ value << BACKSLASH
158
+ end
159
+ else
160
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
161
+ end
162
+ value
163
+ end
164
+
165
+
166
+
167
+ def parse_quote( input, sep:, opening_quote:, closing_quote:)
168
+ value = ""
169
+ if input.peek == opening_quote
170
+ input.getc ## eat-up opening quote
171
+
172
+ loop do
173
+ while (c=input.peek; !(c==closing_quote || c==BACKSLASH || input.eof?))
174
+ value << input.getc ## eat-up everything until hitting quote (e.g. " or ') or backslash (escape)
175
+ end
176
+
177
+ if input.eof?
178
+ break
179
+ elsif input.peek == BACKSLASH
180
+ value << parse_escape( input, sep: sep )
181
+ else ## assume input.peek == quote
182
+ input.getc ## eat-up quote
183
+ if opening_quote == closing_quote && input.peek == closing_quote
184
+ ## doubled up quote?
185
+ # note: only works (enabled) for "" or '' and NOT for «»,‹›.. (if opening and closing differ)
186
+ value << input.getc ## add doube quote and continue!!!!
187
+ else
188
+ break
189
+ end
190
+ end
191
+ end
192
+ else
193
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - CLOSING QUOTE (#{closing_quote}) expected in parse_quote!!!!" )
194
+ end
195
+ value
196
+ end
197
+
198
+
199
+ def parse_field_until_sep( input, sep: )
200
+ value = ""
201
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
202
+ ## consume simple value
203
+ ## until we hit "," or "\n" or "\r"
204
+ ## note: will eat-up quotes too!!!
205
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
206
+ if input.peek == BACKSLASH
207
+ value << parse_escape( input, sep: sep )
208
+ else
209
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
210
+ value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
211
+ end
212
+ end
213
+ ## note: only strip **trailing** spaces (space and tab only)
214
+ ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
215
+ value = value.sub( /[ \t]+$/, '' )
216
+ value
217
+ end
218
+
219
+
220
+
221
+ def parse_field( input, sep: )
222
+ value = ""
223
+
224
+ numeric = config[:numeric]
225
+ hashtag = config[:hashtag]
226
+
227
+
228
+ logger.debug "parse field" if logger.debug?
229
+
230
+ skip_spaces( input ) ## strip leading spaces
231
+
232
+
233
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty field
234
+ ## note: allows null = '' that is turn unquoted empty strings into null/nil
235
+ ## or if using numeric into NotANumber (NaN)
236
+ if is_null?( value )
237
+ value = nil
238
+ elsif numeric && is_nan?( value ) ## todo: check - how to handle numeric? return nil, NaN, or "" ???
239
+ value = Float::NAN
240
+ else
241
+ # do nothing - keep value as is :-) e.g. "".
242
+ end
243
+ elsif input.peek == DOUBLE_QUOTE
244
+ logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
245
+ value << parse_quote( input, sep: sep,
246
+ opening_quote: DOUBLE_QUOTE,
247
+ closing_quote: DOUBLE_QUOTE )
248
+
249
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
250
+ spaces_count = skip_spaces( input )
251
+
252
+ ## check for auto-fix trailing data after quoted value e.g. ---,"Fredy" Mercury,---
253
+ ## todo/fix: add auto-fix for all quote variants!!!!!!!!!!!!!!!!!!!!
254
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?)
255
+ ## everything ok (that is, regular quoted value)!!!
256
+ else
257
+ ## try auto-fix
258
+ ## todo: report warning/issue error (if configured)!!!
259
+ extra_value = parse_field_until_sep( input, sep: sep )
260
+ ## "reconstruct" non-quoted value
261
+ spaces = ' ' * spaces_count ## todo: preserve tab (\t) - why? why not?
262
+ ## note: minor (theoratical) issue (doubled quoted got "collapsed/escaped" to one from two in quoted value)
263
+ ## e.g. "hello """ extra, (becomes)=> "hello "" extra (one quote less/"eaten up")
264
+ value = %Q{"#{value}"#{spaces}#{extra_value}}
265
+ end
266
+
267
+ logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
268
+ elsif input.peek == SINGLE_QUOTE ## allow single quote too (by default)
269
+ logger.debug "start single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
270
+ value << parse_quote( input, sep: sep,
271
+ opening_quote: SINGLE_QUOTE,
272
+ closing_quote: SINGLE_QUOTE )
273
+
274
+ ## note: always eat-up all trailing spaces (" ") and tabs (\t)
275
+ skip_spaces( input )
276
+ logger.debug "end single_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
277
+ elsif input.peek == "«"
278
+ value << parse_quote( input, sep: sep,
279
+ opening_quote: "«",
280
+ closing_quote: "»" )
281
+ skip_spaces( input )
282
+ elsif input.peek == "»"
283
+ value << parse_quote( input, sep: sep,
284
+ opening_quote: "»",
285
+ closing_quote: "«" )
286
+ skip_spaces( input )
287
+ elsif input.peek == "‹"
288
+ value << parse_quote( input, sep: sep,
289
+ opening_quote: "‹",
290
+ closing_quote: "›" )
291
+ skip_spaces( input )
292
+ elsif input.peek == "›"
293
+ value << parse_quote( input, sep: sep,
294
+ opening_quote: "›",
295
+ closing_quote: "‹" )
296
+ skip_spaces( input )
297
+ else
298
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
299
+ ## consume simple value
300
+ ## until we hit "," or "\n" or "\r"
301
+ ## note: will eat-up quotes too!!!
302
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof?))
303
+ if input.peek == BACKSLASH
304
+ value << parse_escape( input, sep: sep )
305
+ ### check for end-of-line comments (e.g. # ...)
306
+ ## note: quick hack for now
307
+ ## will NOT work in hashtag (hxl) mode and for % comments
308
+ ## for now ALWAYS assumes # for comments
309
+ ## and end-of-line comments ONLY work here (that is, in unquoted values and NOT in quotes values) for now
310
+ ## todo/fix: note: require leading space for comment hash (#) for now- why? why not?
311
+ ## require trailing space after comment hash (#) - why? why not?
312
+ elsif (hashtag == false || hashtag.nil?) && input.peek == COMMENT_HASH &&
313
+ (value.size == 0 || (value.size > 0 && value[-1] == ' '))
314
+ ## eat-up everything until end-of-line (eol)
315
+ skip_until_eol( input )
316
+ else
317
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
318
+ value << input.getc ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
319
+ end
320
+ end
321
+ ## note: only strip **trailing** spaces (space and tab only)
322
+ ## do NOT strip newlines etc. might have been added via escape! e.g. \\\n
323
+ value = value.sub( /[ \t]+$/, '' )
324
+
325
+ if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
326
+ value = nil
327
+ elsif numeric
328
+ if is_nan?( value )
329
+ value = Float::NAN
330
+ else
331
+ ## numeric - (auto-convert) non-quoted values (if NOT nil) to floats
332
+ if numeric.is_a?( Proc )
333
+ value = numeric.call( value ) ## allow custom converter proc (e.g. how to handle NaN and conversion errors?)
334
+ else
335
+ value = convert_to_float( value ) # default (fails silently) keep string value if cannot convert - change - why? why not?
336
+ end
337
+ end
338
+ else
339
+ # do nothing - keep value as is :-).
340
+ end
341
+
342
+ logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
343
+ end
344
+
345
+ value
346
+ end
347
+
348
+
349
+
350
## Parse a single record (line) and return its values as an array.
##
## Fields get read one-by-one via parse_field. What follows a field
## decides how to go on:
##   - end of input          => record is complete
##   - newline (LF or CR)    => eat-up the newline; record is complete
##   - field separator (sep) => eat-up the separator; read the next field
##   - anything else         => raise ParseError
##
## If config[:space] is set, every char listed there gets replaced by
## a space (" ") in string values; non-string values (e.g. nil) are kept as is.
def parse_record( input, sep: )
  record      = []
  space_chars = config[:space]

  loop do
    field = parse_field( input, sep: sep )
    field = field.tr( space_chars, ' ' ) if space_chars && field.is_a?( String )

    logger.debug "value: »#{field}«" if logger.debug?
    record << field

    break if input.eof?

    next_char = input.peek
    if next_char == LF || next_char == CR
      skip_newline( input )
      break
    end

    unless next_char == sep
      raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (#{sep}) or RS (\\n) expected!!!!" )
    end

    input.getc   ## eat-up FS(,)
  end

  record
end
376
+
377
+
378
+
379
## Parse a meta data block (front matter) and return the data. Example:
##
##   ---
##   title: Hello
##   ---
##
## Expects input to be positioned on the opening "---" marker
## (the caller checks for the marker; it is skipped here unchecked).
## Everything up to a line starting with "---" gets collected and
## loaded as YAML.
##
## Returns whatever ::YAML.load returns; nil gets replaced by an empty hash {}.
## Raises ParseError if input ends before the closing "---" marker.
##
## NOTE(review): ::YAML.load may instantiate arbitrary objects on older
##   psych versions - consider ::YAML.safe_load if input is untrusted.
def parse_meta( input )
  ## todo/check:
  ##   check again for input.peekn(4) =~ /^---[\n\r \t]$/ - why? why not?
  3.times { input.getc }    ## eat-up opening marker - skip "---"

  ## todo/fix: make peekn(4)=~/^---[\n\r \t]$/ "more strict"
  skip_spaces( input )      ## eat-up optional whitespaces in header line
  skip_newline( input )

  ## note: start with yaml header line - why?
  ##   ::YAML.load("") returns false but ::YAML.load("---\n") returns nil
  yaml_text  = "---\n"
  line_start = true         ## true if next char is the first char of a line

  loop do
    if input.eof?
      raise ParseError.new( "end of input/stream - meta block footer >---< expected!!!!" )
    end

    next_char = input.peek
    if next_char == LF || next_char == CR
      ## copy all (consecutive) newlines
      yaml_text << input.getc while (next_char = input.peek; next_char == LF || next_char == CR)
      line_start = true
    elsif line_start && input.peekn(4) =~ /^---[\n\r \t]?$/   ## meta block end marker?
      3.times { input.getc }   ## eat-up closing marker - skip "---"
      skip_spaces( input )     ## eat-up optional whitespaces in footer line
      skip_newline( input )
      break
    else
      yaml_text << input.getc
      line_start = false
    end
  end

  ## note: MUST use "outer" scope (CsvReader defines its own YAML parser)
  ## todo: check edge cases - what to do with just integer, string or array etc. ???
  meta = ::YAML.load( yaml_text )
  meta.nil? ? {} : meta
end
428
+
429
+
430
+
431
## Skip exactly one newline - note: singular (strict) version.
##
## Eats-up CR LF or a single CR or a single LF.
## Does nothing at end of input or if the next char is not a newline.
def skip_newline( input )
  return if input.eof?

  case input.peek
  when CR
    input.getc                         ## eat-up CR
    input.getc if input.peek == LF     ##   plus LF if CR LF pair
  when LF
    input.getc                         ## eat-up LF
  end
end
444
+
445
+
446
+
447
## Eat-up all chars until end-of-line (eol).
##
## Stops in front of the first LF or CR (the newline itself is NOT eaten-up)
## or at end of input - whatever comes first.
def skip_until_eol( input )
  return if input.eof?

  until (ch = input.peek) == LF || ch == CR || input.eof?
    input.getc    ## eat-up char
  end
end
454
+
455
+
456
## Eat-up all leading spaces (" ") and tabs (\t).
##
## Returns the number of chars skipped (e.g. 0,1,2,etc.);
## always 0 at end of input.
def skip_spaces( input )
  return 0 if input.eof?

  skipped = 0
  ## stop on the first char that is neither a space nor a tab
  until (ch = input.peek) != SPACE && ch != TAB
    input.getc
    skipped += 1
  end
  skipped
end
467
+
468
+
469
+
470
+
471
+
472
+
473
## Parse all lines from input and pass every record to the block.
##
## For every line leading spaces and tabs get skipped first; then
##   - comment lines get skipped. In hashtag mode (config[:hashtag] set)
##     only '%' starts a comment; otherwise the first comment char seen
##     ('#' or '%') "wins" and fixes the comment style for the rest of the input.
##   - blank lines get skipped.
##   - directive lines (starting with @) get skipped (for now);
##     only before the first record, not in hashtag mode and
##     not after a meta block.
##   - a meta block (starting with "---" followed by newline, space or tab
##     in the very first column) gets parsed via parse_meta and stored in @meta;
##     only before the first record, not in hashtag mode and
##     not after a directive or another meta block.
##   - everything else gets parsed via parse_record and passed to the block.
##
## input - buffer; must respond to eof?, peek, peekn and getc
## sep   - field separator; passed on to parse_record
## block - gets called with every record (array of values); required
##
## note: resets @meta to nil on start (state/state/state!!)
def parse_lines( input, sep:, &block )
  ## note: reset (optional) meta data block
  @meta = nil  ## no meta data block (use empty hash {} - why? why not?)

  ## note: track number of records
  ##  used for meta block (can only start before any records e.g. if record_num == 0)
  record_num = 0

  hashtag = config[:hashtag]

  ## hashtag mode: only '%' starts a comment
  ##  todo/check: use a "heuristic" to check if its a comment or a hashtag line? why? why not?
  ## otherwise: can either use '#' or '%' but NOT both; first one "wins"
  comment = hashtag ? COMMENT_PERCENT : nil

  has_seen_directive   = false
  has_seen_frontmatter = false   ## - rename to has_seen_dash (---) - why? why not???
  ## note: can either use directives (@) or frontmatter (---) block; first one "wins"

  loop do
    break if input.eof?

    skipped_spaces = skip_spaces( input )

    if comment.nil? && (c=input.peek; c==COMMENT_HASH || c==COMMENT_PERCENT)
      logger.debug "skipping comment (first) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      comment = input.getc  ## first comment line (determines/fixes "allowed" comment-style)
      skip_until_eol( input )
      skip_newline( input )
    elsif comment && input.peek == comment    ## (another) comment line
      logger.debug "skipping comment (follow-up) - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif (c=input.peek; c==LF || c==CR || input.eof?)
      logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      skip_newline( input )
    ## note: use !hashtag (and NOT hashtag == false) - an unset (nil) hashtag
    ##   setting counts as "off" too (same as in the comment setup above)
    elsif record_num == 0 && !hashtag && !has_seen_frontmatter && input.peek==DIRECTIVE
      ## note: "skip" directives for now
      has_seen_directive = true
      logger.debug "skip directive" if logger.debug?
      skip_until_eol( input )
      skip_newline( input )
    elsif record_num == 0 && !hashtag && !has_seen_directive && !has_seen_frontmatter &&
          skipped_spaces == 0 && input.peekn(4) =~ /^---[\n\r \t]$/
      ## note: assume "---" (MUST BE) followed by newline (\r or \n) or space starts a meta block
      has_seen_frontmatter = true
      logger.debug "start meta block" if logger.debug?
      ## note: meta gets stored as object attribute (state/state/state!!)
      ##   use meta attribute to get meta data after reading first record
      @meta = parse_meta( input )  ## note: assumes a hash gets returned
      logger.debug "  meta: >#{meta.inspect}<" if logger.debug?
    else
      logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

      record = parse_record( input, sep: sep )
      record_num +=1

      ## note: requires block - enforce? how? why? why not?
      block.call( record )   ## yield( record )
    end
  end  # loop
end  # method parse_lines
541
+
542
+
543
+
544
+
545
## Convert value to a float via Float().
##
## Fails silently - returns the value as is (unchanged)
## if the conversion raises an error (StandardError).
def convert_to_float( value )
  Float( value )
rescue StandardError
  value
end
546
+
547
## Check if value is a not a number (NaN) marker.
##
## Depends on the @config[:nan] setting:
##   nil    - nothing set; always false (not NaN)
##   Proc   - result of calling the proc with the value
##   Array  - true if the array includes the value
##   String - true if the value equals the string
##   else   - unknown config style / setting; always false
def is_nan?( value )
  nan = @config[:nan]

  case nan
  when nil    then false                   ## nothing set
  when Proc   then nan.call( value )
  when Array  then nan.include?( value )
  when String then value == nan
  else
    ## todo: issue a warning or error - why? why not?
    false
  end
end
562
+
563
+
564
## Check if value is a null (nil / not available) marker.
##
## Depends on the @config[:null] setting:
##   nil    - nothing set; always false (not null)
##   Proc   - result of calling the proc with the value
##   Array  - true if the array includes the value
##   String - true if the value equals the string
##   else   - unknown config style / setting; always false
def is_null?( value )
  null = @config[:null]

  case null
  when nil    then false                   ## nothing set
  when Proc   then null.call( value )
  when Array  then null.include?( value )
  when String then value == null
  else
    ## todo: issue a warning or error - why? why not?
    false
  end
end
579
+
580
+
581
+ end # class ParserStd
582
+ end # class CsvReader