csvreader 1.2.4 → 1.2.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,291 +1,290 @@
1
- # encoding: utf-8
2
-
3
- class CsvReader
4
-
5
-
6
- class ParserStrict
7
-
8
-
9
- ## char constants
10
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
11
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
12
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
13
-
14
-
15
- ###################################
16
- ## add simple logger with debug flag/switch
17
- #
18
- # use Parser.debug = true # to turn on
19
- #
20
- # todo/fix: use logutils instead of std logger - why? why not?
21
-
22
- def self.build_logger()
23
- l = Logger.new( STDOUT )
24
- l.level = :info ## set to :info on start; note: is 0 (debug) by default
25
- l
26
- end
27
- def self.logger() @@logger ||= build_logger; end
28
- def logger() self.class.logger; end
29
-
30
-
31
-
32
- attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
33
-
34
- def initialize( sep: ',',
35
- quote: '"', ## note: set to false/nil for no quote
36
- doublequote: true,
37
- escape: false, ## true/false
38
- null: nil, ## note: set to nil for no null vales / not availabe (na)
39
- comment: false ## note: comment char e.g. # or false/nil
40
- )
41
- @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
42
- @config[:sep] = sep
43
- @config[:quote] = quote
44
- @config[:doublequote] = doublequote
45
- @config[:escape] = escape
46
- @config[:null] = null
47
- @config[:comment] = comment
48
- end
49
-
50
-
51
- #########################################
52
- ## config convenience helpers
53
- ## e.g. use like Csv.mysql.sep = ',' etc. instead of
54
- ## Csv.mysql.config[:sep] = ','
55
- def sep=( value ) @config[:sep]=value; end
56
- def quote=( value ) @config[:quote]=value; end
57
- def doublequote=( value ) @config[:doublequote]=value; end
58
- def escape=( value ) @config[:escape]=value; end
59
- def null=( value ) @config[:null]=value; end
60
- def comment=( value ) @config[:comment]=value; end
61
-
62
-
63
-
64
- def parse( data, sep: config[:sep], &block )
65
- ## note: data - will wrap either a String or IO object passed in data
66
-
67
- ## make sure data (string or io) is a wrapped into Buffer!!!!!!
68
- if data.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
69
- input = data
70
- else
71
- input = Buffer.new( data )
72
- end
73
-
74
-
75
- if block_given?
76
- parse_lines( input, sep: sep, &block )
77
- else
78
- records = []
79
-
80
- parse_lines( input, sep: sep ) do |record|
81
- records << record
82
- end
83
-
84
- records
85
- end
86
-
87
- end ## method parse
88
-
89
-
90
-
91
- private
92
-
93
- def parse_escape( input, sep: )
94
- value = ""
95
-
96
- quote = config[:quote]
97
-
98
- if input.peek == BACKSLASH
99
- input.getc ## eat-up backslash
100
- if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote) )
101
- value << input.getc ## add escaped char (e.g. lf, cr, etc.)
102
- else
103
- ## unknown escape sequence; no special handling/escaping
104
- value << BACKSLASH
105
- end
106
- else
107
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
108
- end
109
- value
110
- end
111
-
112
-
113
-
114
- def parse_quote( input, sep: )
115
- value = ""
116
-
117
- quote = config[:quote] # char (e.g.",') | nil
118
- doublequote = config[:doublequote] # true|false
119
- escape = config[:escape] # true|false
120
-
121
- if input.peek == quote
122
- input.getc ## eat-up double_quote
123
-
124
- loop do
125
- while (c=input.peek; !(c==quote || input.eof? || (escape && c==BACKSLASH)))
126
- value << input.getc ## eat-up everything until hitting double_quote (") or backslash (escape)
127
- end
128
-
129
- if input.eof?
130
- break
131
- elsif input.peek == BACKSLASH
132
- value << parse_escape( input, sep: sep )
133
- else ## assume input.peek == DOUBLE_QUOTE
134
- input.getc ## eat-up double_quote
135
- if doublequote && input.peek == quote ## doubled up quote?
136
- value << input.getc ## add doube quote and continue!!!!
137
- else
138
- break
139
- end
140
- end
141
- end
142
- else
143
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
144
- end
145
- value
146
- end
147
-
148
-
149
-
150
- def parse_field( input, sep: )
151
- value = ""
152
-
153
- quote = config[:quote]
154
- escape = config[:escape]
155
-
156
- logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?
157
-
158
- if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
159
- ## note: allows null = '' that is turn unquoted empty strings into null/nil
160
- ## or if using numeric into NotANumber (NaN)
161
- value = nil if is_null?( value )
162
- ## do nothing - keep value as is :-) e.g. "".
163
- elsif quote && input.peek == quote
164
- logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
165
- value << parse_quote( input, sep: sep )
166
- logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
167
- else
168
- logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
169
- ## consume simple value
170
- ## until we hit "," or "\n" or "\r" or stray (double) quote e.g (")
171
- while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof? || (quote && c==quote)))
172
- if escape && input.peek == BACKSLASH
173
- value << parse_escape( input, sep: sep )
174
- else
175
- logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
176
- value << input.getc
177
- end
178
- end
179
-
180
-
181
- value = nil if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
182
- # do nothing - keep value as is :-).
183
-
184
- logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
185
- end
186
-
187
- value
188
- end
189
-
190
-
191
-
192
- def parse_record( input, sep: )
193
- values = []
194
-
195
- loop do
196
- value = parse_field( input, sep: sep )
197
- logger.debug "value: »#{value}«" if logger.debug?
198
- values << value
199
-
200
- if input.eof?
201
- break
202
- elsif (c=input.peek; c==LF || c==CR)
203
- skip_newline( input ) ## note: singular / single newline only (NOT plural)
204
- break
205
- elsif input.peek == sep
206
- input.getc ## eat-up FS (,)
207
- else
208
- raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
209
- end
210
- end
211
-
212
- values
213
- end
214
-
215
-
216
-
217
- def skip_newline( input ) ## note: singular (strict) version
218
- return if input.eof?
219
-
220
- ## only skip CR LF or LF or CR
221
- if input.peek == CR
222
- input.getc ## eat-up
223
- input.getc if input.peek == LF
224
- elsif input.peek == LF
225
- input.getc ## eat-up
226
- else
227
- # do nothing
228
- end
229
- end
230
-
231
-
232
-
233
- def skip_until_eol( input )
234
- return if input.eof?
235
-
236
- while (c=input.peek; !(c==LF || c==CR || input.eof?))
237
- input.getc ## eat-up all until end of line
238
- end
239
- end
240
-
241
-
242
-
243
- def parse_lines( input, sep:, &block )
244
- ## no leading and trailing whitespaces trimmed/stripped
245
- ## no comments skipped
246
- ## no blanks skipped
247
- ## - follows strict rules of
248
- ## note: this csv format is NOT recommended;
249
- ## please, use a format with comments, leading and trailing whitespaces, etc.
250
- ## only added for checking compatibility
251
-
252
- comment = config[:comment]
253
-
254
- loop do
255
- break if input.eof?
256
-
257
- logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
258
-
259
- if comment && input.peek == comment ## comment line
260
- logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
261
- skip_until_eol( input )
262
- skip_newline( input )
263
- else
264
- record = parse_record( input, sep: sep )
265
- ## note: requires block - enforce? how? why? why not?
266
- block.call( record ) ## yield( record )
267
- end
268
- end # loop
269
-
270
- end # method parse_lines
271
-
272
-
273
- def is_null?( value )
274
- null = @config[:null]
275
- if null.nil?
276
- false ## nothing set; return always false (not null)
277
- elsif null.is_a?( Proc )
278
- null.call( value )
279
- elsif null.is_a?( Array )
280
- null.include?( value )
281
- elsif null.is_a?( String )
282
- value == null
283
- else ## unknown config style / setting
284
- ## todo: issue a warning or error - why? why not?
285
- false ## nothing set; return always false (not null)
286
- end
287
- end
288
-
289
-
290
- end # class ParserStrict
291
- end # class CsvReader
1
+
2
+ class CsvReader
3
+
4
+
5
+ class ParserStrict
6
+
7
+
8
+ ## char constants
9
+ BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
10
+ LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
11
+ CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
12
+
13
+
14
+ ###################################
15
+ ## add simple logger with debug flag/switch
16
+ #
17
+ # use Parser.debug = true # to turn on
18
+ #
19
+ # todo/fix: use logutils instead of std logger - why? why not?
20
+
21
+ def self.build_logger()
22
+ l = Logger.new( STDOUT )
23
+ l.level = :info ## set to :info on start; note: is 0 (debug) by default
24
+ l
25
+ end
26
+ def self.logger() @@logger ||= build_logger; end
27
+ def logger() self.class.logger; end
28
+
29
+
30
+
31
+ attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
32
+
33
+ def initialize( sep: ',',
34
+ quote: '"', ## note: set to false/nil for no quote
35
+ doublequote: true,
36
+ escape: false, ## true/false
37
+ null: nil, ## note: set to nil for no null vales / not availabe (na)
38
+ comment: false ## note: comment char e.g. # or false/nil
39
+ )
40
+ @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
41
+ @config[:sep] = sep
42
+ @config[:quote] = quote
43
+ @config[:doublequote] = doublequote
44
+ @config[:escape] = escape
45
+ @config[:null] = null
46
+ @config[:comment] = comment
47
+ end
48
+
49
+
50
+ #########################################
51
+ ## config convenience helpers
52
+ ## e.g. use like Csv.mysql.sep = ',' etc. instead of
53
+ ## Csv.mysql.config[:sep] = ','
54
+ def sep=( value ) @config[:sep]=value; end
55
+ def quote=( value ) @config[:quote]=value; end
56
+ def doublequote=( value ) @config[:doublequote]=value; end
57
+ def escape=( value ) @config[:escape]=value; end
58
+ def null=( value ) @config[:null]=value; end
59
+ def comment=( value ) @config[:comment]=value; end
60
+
61
+
62
+
63
+ def parse( data, sep: config[:sep], &block )
64
+ ## note: data - will wrap either a String or IO object passed in data
65
+
66
+ ## make sure data (string or io) is a wrapped into Buffer!!!!!!
67
+ if data.is_a?( Buffer ) ### allow (re)use of Buffer if managed from "outside"
68
+ input = data
69
+ else
70
+ input = Buffer.new( data )
71
+ end
72
+
73
+
74
+ if block_given?
75
+ parse_lines( input, sep: sep, &block )
76
+ else
77
+ records = []
78
+
79
+ parse_lines( input, sep: sep ) do |record|
80
+ records << record
81
+ end
82
+
83
+ records
84
+ end
85
+
86
+ end ## method parse
87
+
88
+
89
+
90
+ private
91
+
92
+ def parse_escape( input, sep: )
93
+ value = ""
94
+
95
+ quote = config[:quote]
96
+
97
+ if input.peek == BACKSLASH
98
+ input.getc ## eat-up backslash
99
+ if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote) )
100
+ value << input.getc ## add escaped char (e.g. lf, cr, etc.)
101
+ else
102
+ ## unknown escape sequence; no special handling/escaping
103
+ value << BACKSLASH
104
+ end
105
+ else
106
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
107
+ end
108
+ value
109
+ end
110
+
111
+
112
+
113
+ def parse_quote( input, sep: )
114
+ value = ""
115
+
116
+ quote = config[:quote] # char (e.g.",') | nil
117
+ doublequote = config[:doublequote] # true|false
118
+ escape = config[:escape] # true|false
119
+
120
+ if input.peek == quote
121
+ input.getc ## eat-up double_quote
122
+
123
+ loop do
124
+ while (c=input.peek; !(c==quote || input.eof? || (escape && c==BACKSLASH)))
125
+ value << input.getc ## eat-up everything until hitting double_quote (") or backslash (escape)
126
+ end
127
+
128
+ if input.eof?
129
+ break
130
+ elsif input.peek == BACKSLASH
131
+ value << parse_escape( input, sep: sep )
132
+ else ## assume input.peek == DOUBLE_QUOTE
133
+ input.getc ## eat-up double_quote
134
+ if doublequote && input.peek == quote ## doubled up quote?
135
+ value << input.getc ## add doube quote and continue!!!!
136
+ else
137
+ break
138
+ end
139
+ end
140
+ end
141
+ else
142
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
143
+ end
144
+ value
145
+ end
146
+
147
+
148
+
149
+ def parse_field( input, sep: )
150
+ value = ""
151
+
152
+ quote = config[:quote]
153
+ escape = config[:escape]
154
+
155
+ logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?
156
+
157
+ if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
158
+ ## note: allows null = '' that is turn unquoted empty strings into null/nil
159
+ ## or if using numeric into NotANumber (NaN)
160
+ value = nil if is_null?( value )
161
+ ## do nothing - keep value as is :-) e.g. "".
162
+ elsif quote && input.peek == quote
163
+ logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
164
+ value << parse_quote( input, sep: sep )
165
+ logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
166
+ else
167
+ logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
168
+ ## consume simple value
169
+ ## until we hit "," or "\n" or "\r" or stray (double) quote e.g (")
170
+ while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof? || (quote && c==quote)))
171
+ if escape && input.peek == BACKSLASH
172
+ value << parse_escape( input, sep: sep )
173
+ else
174
+ logger.debug " add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
175
+ value << input.getc
176
+ end
177
+ end
178
+
179
+
180
+ value = nil if is_null?( value ) ## note: null check only for UNQUOTED (not quoted/escaped) values
181
+ # do nothing - keep value as is :-).
182
+
183
+ logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
184
+ end
185
+
186
+ value
187
+ end
188
+
189
+
190
+
191
+ def parse_record( input, sep: )
192
+ values = []
193
+
194
+ loop do
195
+ value = parse_field( input, sep: sep )
196
+ logger.debug "value: »#{value}«" if logger.debug?
197
+ values << value
198
+
199
+ if input.eof?
200
+ break
201
+ elsif (c=input.peek; c==LF || c==CR)
202
+ skip_newline( input ) ## note: singular / single newline only (NOT plural)
203
+ break
204
+ elsif input.peek == sep
205
+ input.getc ## eat-up FS (,)
206
+ else
207
+ raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
208
+ end
209
+ end
210
+
211
+ values
212
+ end
213
+
214
+
215
+
216
+ def skip_newline( input ) ## note: singular (strict) version
217
+ return if input.eof?
218
+
219
+ ## only skip CR LF or LF or CR
220
+ if input.peek == CR
221
+ input.getc ## eat-up
222
+ input.getc if input.peek == LF
223
+ elsif input.peek == LF
224
+ input.getc ## eat-up
225
+ else
226
+ # do nothing
227
+ end
228
+ end
229
+
230
+
231
+
232
+ def skip_until_eol( input )
233
+ return if input.eof?
234
+
235
+ while (c=input.peek; !(c==LF || c==CR || input.eof?))
236
+ input.getc ## eat-up all until end of line
237
+ end
238
+ end
239
+
240
+
241
+
242
+ def parse_lines( input, sep:, &block )
243
+ ## no leading and trailing whitespaces trimmed/stripped
244
+ ## no comments skipped
245
+ ## no blanks skipped
246
+ ## - follows strict rules of
247
+ ## note: this csv format is NOT recommended;
248
+ ## please, use a format with comments, leading and trailing whitespaces, etc.
249
+ ## only added for checking compatibility
250
+
251
+ comment = config[:comment]
252
+
253
+ loop do
254
+ break if input.eof?
255
+
256
+ logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
257
+
258
+ if comment && input.peek == comment ## comment line
259
+ logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
260
+ skip_until_eol( input )
261
+ skip_newline( input )
262
+ else
263
+ record = parse_record( input, sep: sep )
264
+ ## note: requires block - enforce? how? why? why not?
265
+ block.call( record ) ## yield( record )
266
+ end
267
+ end # loop
268
+
269
+ end # method parse_lines
270
+
271
+
272
+ def is_null?( value )
273
+ null = @config[:null]
274
+ if null.nil?
275
+ false ## nothing set; return always false (not null)
276
+ elsif null.is_a?( Proc )
277
+ null.call( value )
278
+ elsif null.is_a?( Array )
279
+ null.include?( value )
280
+ elsif null.is_a?( String )
281
+ value == null
282
+ else ## unknown config style / setting
283
+ ## todo: issue a warning or error - why? why not?
284
+ false ## nothing set; return always false (not null)
285
+ end
286
+ end
287
+
288
+
289
+ end # class ParserStrict
290
+ end # class CsvReader
@@ -1,23 +1,22 @@
1
- # encoding: utf-8
2
-
3
- class CsvReader
4
-
5
- class ParserTab
6
-
7
- def parse( data, **kwargs, &block )
8
- ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
9
-
10
- ## note: input: required each_line (string or io/file for example)
11
- ## assume data is a string or io/file handle
12
- tab = TabReader.new( data )
13
-
14
- if block_given?
15
- tab.each( &block )
16
- else
17
- tab.to_a
18
- end
19
- end ## method parse
20
-
21
-
22
- end # class ParserTab
23
- end # class CsvReader
1
+
2
+ class CsvReader
3
+
4
+ class ParserTab
5
+
6
+ def parse( data, **kwargs, &block )
7
+ ## note: kwargs NOT used for now (but required for "protocol/interface" by other parsers)
8
+
9
+ ## note: input: required each_line (string or io/file for example)
10
+ ## assume data is a string or io/file handle
11
+ tab = TabReader.new( data )
12
+
13
+ if block_given?
14
+ tab.each( &block )
15
+ else
16
+ tab.to_a
17
+ end
18
+ end ## method parse
19
+
20
+
21
+ end # class ParserTab
22
+ end # class CsvReader