csvreader 1.2.1 → 1.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,291 +1,290 @@
1
- # encoding: utf-8
2
-
3
class CsvReader

  ## Strict CSV parser - reads the input one character at a time.
  ##
  ##  - records end at a single LF, CR or CR LF
  ##  - fields are split on the separator (sep)
  ##  - leading and trailing whitespace is kept as is
  ##  - blank lines are NOT skipped (a blank line is a record with one empty field)
  ##  - lines starting with the comment char are skipped,
  ##    but only if a comment char is configured (off by default)
  ##
  ## note: relies on Buffer (peek / getc / eof?) and ParseError;
  ##       both are referenced here but defined elsewhere
  class ParserStrict

    ## char constants
    BACKSLASH = "\\".freeze   ## use BACKSLASH_ESCAPE ??
    LF        = "\n".freeze   ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
    CR        = "\r".freeze   ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)


    ###################################
    ## simple logger with debug flag/switch
    #
    #  use ParserStrict.logger.level = :debug    # to turn on
    #
    #  todo/fix: use logutils instead of std logger - why? why not?

    ## build a new logger writing to STDOUT
    def self.build_logger()
      l = Logger.new( STDOUT )
      l.level = :info    ## set to :info on start; note: is 0 (debug) by default
      l
    end

    ## note: one logger (built on first use) for the class and all instances
    def self.logger() @@logger ||= build_logger; end
    def logger()  self.class.logger; end



    ## settings hash with the keys
    ##   :sep, :quote, :doublequote, :escape, :null, :comment
    attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

    ## sep         - field separator char
    ## quote       - quote char; set to false/nil for no quote
    ## doublequote - true/false; a doubled quote inside a quoted value is a literal quote
    ## escape      - true/false; turn on backslash escapes
    ## null        - String, Array of Strings or Proc that marks UNQUOTED values as nil;
    ##               set to nil for no null values / not available (na)
    ## comment     - comment char e.g. # or false/nil
    def initialize( sep:         ',',
                    quote:       '"',
                    doublequote: true,
                    escape:      false,
                    null:        nil,
                    comment:     false
                  )
      ## todo/fix: change config to proper dialect class/struct - why? why not?
      @config = { sep:         sep,
                  quote:       quote,
                  doublequote: doublequote,
                  escape:      escape,
                  null:        null,
                  comment:     comment }
    end


    #########################################
    ## config convenience helpers
    ##   e.g. use like  Csv.mysql.sep = ','  etc.   instead of
    ##                  Csv.mysql.config[:sep] = ','
    def sep=( value )          @config[:sep]=value; end
    def quote=( value )        @config[:quote]=value; end
    def doublequote=( value )  @config[:doublequote]=value; end
    def escape=( value )       @config[:escape]=value; end
    def null=( value )         @config[:null]=value; end
    def comment=( value )      @config[:comment]=value; end



    ## parse all records from data
    ##
    ##  data - String or IO object (gets wrapped into a Buffer) or
    ##         a Buffer (used as is; allows (re)use if managed from "outside")
    ##  sep  - field separator; defaults to the configured separator
    ##
    ##  with a block every record (array of values) is passed to the block
    ##  and nil is returned; without a block returns an array of all records
    ##
    ##  raises ParseError if a field is followed by anything other than
    ##  the separator, a newline or the end of input
    def parse( data, sep: config[:sep], &block )
      input = data.is_a?( Buffer ) ? data : Buffer.new( data )

      if block_given?
        parse_lines( input, sep: sep, &block )
      else
        records = []
        parse_lines( input, sep: sep ) { |record| records << record }
        records
      end
    end ## method parse



    private

    ## format the next char for debug messages as >char< (ord)
    ##  NOTE(review): what Buffer#peek returns at the end of input is not visible here;
    ##   nil is tolerated so that debug logging can never abort the parse
    def peek_info( input )
      c = input.peek
      c.nil? ? ">< (eof)" : ">#{c}< (#{c.ord})"
    end


    ## consume a backslash escape and return the resulting text
    ##  - backslash followed by backslash, LF, CR, sep or quote => that char (consumed)
    ##  - any other (unknown) escape => the backslash only;
    ##    the following char is NOT consumed and left for the caller
    ##  raises ParseError if the next char is not a backslash
    def parse_escape( input, sep: )
      if input.peek != BACKSLASH
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
      end

      value = ''.dup     ## note: use dup - string literals might be frozen
      quote = config[:quote]

      input.getc         ## eat-up backslash
      c = input.peek
      if c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote)
        value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
      else
        ## unknown escape sequence; no special handling/escaping
        value << BACKSLASH
      end
      value
    end



    ## consume a quoted value and return the text between the quotes
    ##  - a doubled quote adds one literal quote (if doublequote is on)
    ##  - a backslash starts an escape (if escape is on)
    ##  note: a missing closing quote is NOT an error;
    ##        everything up to the end of input gets returned
    ##  raises ParseError if the next char is not the quote char
    def parse_quote( input, sep: )
      quote       = config[:quote]        # char (e.g.",') | nil
      doublequote = config[:doublequote]  # true|false
      escape      = config[:escape]       # true|false

      if input.peek != quote
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
      end

      value = ''.dup     ## note: use dup - string literals might be frozen
      input.getc         ## eat-up opening quote

      loop do
        ## eat-up everything until hitting quote (") or backslash (escape) or the end
        ##  note: check eof? first - no need to peek past the end of input
        until input.eof? || (c=input.peek; c==quote || (escape && c==BACKSLASH))
          value << input.getc
        end

        if input.eof?
          break
        elsif input.peek == BACKSLASH
          value << parse_escape( input, sep: sep )
        else   ## assume input.peek == quote
          input.getc    ## eat-up quote
          if doublequote && input.peek == quote  ## doubled up quote?
            value << input.getc   ## add double quote and continue!!!!
          else
            break                 ## closing quote - done
          end
        end
      end
      value
    end



    ## consume one field and return its value (String or nil)
    ##  note: the null check is for UNQUOTED (not quoted/escaped) values only
    ##        incl. the empty unquoted field
    def parse_field( input, sep: )
      quote  = config[:quote]
      escape = config[:escape]

      logger.debug "parse field - sep: >#{sep}< (#{sep.ord})"  if logger.debug?

      if input.eof? || (c=input.peek; c==sep || c==LF || c==CR)  ## empty unquoted field
        ## note: allows null = '' that is turn unquoted empty strings into null/nil
        ##   or if using numeric into NotANumber (NaN)
        value = ''.dup
        value = nil  if is_null?( value )
      elsif quote && input.peek == quote
        logger.debug "start quote field - peek #{peek_info( input )}"  if logger.debug?
        value = parse_quote( input, sep: sep )
        logger.debug "end double_quote field - peek #{peek_info( input )}"  if logger.debug?
      else
        logger.debug "start reg field - peek #{peek_info( input )}"  if logger.debug?
        value = ''.dup
        ## consume simple value
        ##   until we hit "," or "\n" or "\r" or stray (double) quote e.g (")
        until input.eof? || (c=input.peek; c==sep || c==LF || c==CR || (quote && c==quote))
          if escape && c == BACKSLASH
            value << parse_escape( input, sep: sep )
          else
            logger.debug "  add char #{peek_info( input )}"  if logger.debug?
            value << input.getc
          end
        end

        value = nil  if is_null?( value )   ## note: null check only for UNQUOTED (not quoted/escaped) values

        logger.debug "end reg field - peek #{peek_info( input )}"  if logger.debug?
      end

      value
    end



    ## consume one record (incl. its newline) and return the array of values
    ##  raises ParseError if a field is followed by anything other than
    ##  the separator, a newline or the end of input
    def parse_record( input, sep: )
      values = []

      loop do
        value = parse_field( input, sep: sep )
        logger.debug "value: »#{value}«"  if logger.debug?
        values << value

        break if input.eof?

        c = input.peek
        if c==LF || c==CR
          skip_newline( input )   ## note: singular / single newline only (NOT plural)
          break
        elsif c==sep
          input.getc              ## eat-up FS (,)
        else
          raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
        end
      end

      values
    end



    ## skip a single newline, that is, CR LF or LF or CR; everything else is left as is
    def skip_newline( input )    ## note: singular (strict) version
      return if input.eof?

      c = input.peek
      if c == CR
        input.getc                       ## eat-up
        input.getc if input.peek == LF   ## CR LF counts as one newline
      elsif c == LF
        input.getc                       ## eat-up
      end
    end



    ## skip everything up to (but NOT including) the next LF or CR or the end of input
    def skip_until_eol( input )
      until input.eof? || (c=input.peek; c==LF || c==CR)
        input.getc    ## eat-up all until end of line
      end
    end



    ## read record by record until the end of input and
    ## pass every record to the block (required)
    def parse_lines( input, sep:, &block )
      ## no leading and trailing whitespaces trimmed/stripped
      ## no blanks skipped
      ## comment lines get skipped only if a comment char is configured
      ##  - follows strict rules of
      ##  note: this csv format is NOT recommended;
      ##    please, use a format with comments, leading and trailing whitespaces, etc.
      ##    only added for checking compatibility

      comment = config[:comment]

      until input.eof?
        logger.debug "start record - peek #{peek_info( input )}"  if logger.debug?

        if comment && input.peek == comment        ## comment line
          logger.debug "skipping comment - peek #{peek_info( input )}"  if logger.debug?
          skip_until_eol( input )
          skip_newline( input )
        else
          record = parse_record( input, sep: sep )
          ## note: requires block - enforce? how? why? why not?
          block.call( record )   ## yield( record )
        end
      end
    end # method parse_lines


    ## check if the value counts as null (nil) using the null setting
    ##   Proc   => result of calling the proc with the value
    ##   Array  => true if the value is included
    ##   String => true if the value is equal
    ##   nil or anything else => always false (not null)
    def is_null?( value )
      null = @config[:null]
      case null
      when Proc   then null.call( value )
      when Array  then null.include?( value )
      when String then value == null
      else
        ## nothing set (nil) or unknown config style / setting
        ##  todo: issue a warning or error for unknown - why? why not?
        false
      end
    end


  end # class ParserStrict
end # class CsvReader
1
+
2
class CsvReader

  ## Strict CSV parser - reads the input one character at a time.
  ##
  ##  - records end at a single LF, CR or CR LF
  ##  - fields are split on the separator (sep)
  ##  - leading and trailing whitespace is kept as is
  ##  - blank lines are NOT skipped (a blank line is a record with one empty field)
  ##  - lines starting with the comment char are skipped,
  ##    but only if a comment char is configured (off by default)
  ##
  ## note: relies on Buffer (peek / getc / eof?) and ParseError;
  ##       both are referenced here but defined elsewhere
  class ParserStrict

    ## char constants
    BACKSLASH = "\\".freeze   ## use BACKSLASH_ESCAPE ??
    LF        = "\n".freeze   ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
    CR        = "\r".freeze   ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)


    ###################################
    ## simple logger with debug flag/switch
    #
    #  use ParserStrict.logger.level = :debug    # to turn on
    #
    #  todo/fix: use logutils instead of std logger - why? why not?

    ## build a new logger writing to STDOUT
    def self.build_logger()
      l = Logger.new( STDOUT )
      l.level = :info    ## set to :info on start; note: is 0 (debug) by default
      l
    end

    ## note: one logger (built on first use) for the class and all instances
    def self.logger() @@logger ||= build_logger; end
    def logger()  self.class.logger; end



    ## settings hash with the keys
    ##   :sep, :quote, :doublequote, :escape, :null, :comment
    attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

    ## sep         - field separator char
    ## quote       - quote char; set to false/nil for no quote
    ## doublequote - true/false; a doubled quote inside a quoted value is a literal quote
    ## escape      - true/false; turn on backslash escapes
    ## null        - String, Array of Strings or Proc that marks UNQUOTED values as nil;
    ##               set to nil for no null values / not available (na)
    ## comment     - comment char e.g. # or false/nil
    def initialize( sep:         ',',
                    quote:       '"',
                    doublequote: true,
                    escape:      false,
                    null:        nil,
                    comment:     false
                  )
      ## todo/fix: change config to proper dialect class/struct - why? why not?
      @config = { sep:         sep,
                  quote:       quote,
                  doublequote: doublequote,
                  escape:      escape,
                  null:        null,
                  comment:     comment }
    end


    #########################################
    ## config convenience helpers
    ##   e.g. use like  Csv.mysql.sep = ','  etc.   instead of
    ##                  Csv.mysql.config[:sep] = ','
    def sep=( value )          @config[:sep]=value; end
    def quote=( value )        @config[:quote]=value; end
    def doublequote=( value )  @config[:doublequote]=value; end
    def escape=( value )       @config[:escape]=value; end
    def null=( value )         @config[:null]=value; end
    def comment=( value )      @config[:comment]=value; end



    ## parse all records from data
    ##
    ##  data - String or IO object (gets wrapped into a Buffer) or
    ##         a Buffer (used as is; allows (re)use if managed from "outside")
    ##  sep  - field separator; defaults to the configured separator
    ##
    ##  with a block every record (array of values) is passed to the block
    ##  and nil is returned; without a block returns an array of all records
    ##
    ##  raises ParseError if a field is followed by anything other than
    ##  the separator, a newline or the end of input
    def parse( data, sep: config[:sep], &block )
      input = data.is_a?( Buffer ) ? data : Buffer.new( data )

      if block_given?
        parse_lines( input, sep: sep, &block )
      else
        records = []
        parse_lines( input, sep: sep ) { |record| records << record }
        records
      end
    end ## method parse



    private

    ## format the next char for debug messages as >char< (ord)
    ##  NOTE(review): what Buffer#peek returns at the end of input is not visible here;
    ##   nil is tolerated so that debug logging can never abort the parse
    def peek_info( input )
      c = input.peek
      c.nil? ? ">< (eof)" : ">#{c}< (#{c.ord})"
    end


    ## consume a backslash escape and return the resulting text
    ##  - backslash followed by backslash, LF, CR, sep or quote => that char (consumed)
    ##  - any other (unknown) escape => the backslash only;
    ##    the following char is NOT consumed and left for the caller
    ##  raises ParseError if the next char is not a backslash
    def parse_escape( input, sep: )
      if input.peek != BACKSLASH
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
      end

      value = ''.dup     ## note: use dup - string literals might be frozen
      quote = config[:quote]

      input.getc         ## eat-up backslash
      c = input.peek
      if c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote)
        value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
      else
        ## unknown escape sequence; no special handling/escaping
        value << BACKSLASH
      end
      value
    end



    ## consume a quoted value and return the text between the quotes
    ##  - a doubled quote adds one literal quote (if doublequote is on)
    ##  - a backslash starts an escape (if escape is on)
    ##  note: a missing closing quote is NOT an error;
    ##        everything up to the end of input gets returned
    ##  raises ParseError if the next char is not the quote char
    def parse_quote( input, sep: )
      quote       = config[:quote]        # char (e.g.",') | nil
      doublequote = config[:doublequote]  # true|false
      escape      = config[:escape]       # true|false

      if input.peek != quote
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
      end

      value = ''.dup     ## note: use dup - string literals might be frozen
      input.getc         ## eat-up opening quote

      loop do
        ## eat-up everything until hitting quote (") or backslash (escape) or the end
        ##  note: check eof? first - no need to peek past the end of input
        until input.eof? || (c=input.peek; c==quote || (escape && c==BACKSLASH))
          value << input.getc
        end

        if input.eof?
          break
        elsif input.peek == BACKSLASH
          value << parse_escape( input, sep: sep )
        else   ## assume input.peek == quote
          input.getc    ## eat-up quote
          if doublequote && input.peek == quote  ## doubled up quote?
            value << input.getc   ## add double quote and continue!!!!
          else
            break                 ## closing quote - done
          end
        end
      end
      value
    end



    ## consume one field and return its value (String or nil)
    ##  note: the null check is for UNQUOTED (not quoted/escaped) values only
    ##        incl. the empty unquoted field
    def parse_field( input, sep: )
      quote  = config[:quote]
      escape = config[:escape]

      logger.debug "parse field - sep: >#{sep}< (#{sep.ord})"  if logger.debug?

      if input.eof? || (c=input.peek; c==sep || c==LF || c==CR)  ## empty unquoted field
        ## note: allows null = '' that is turn unquoted empty strings into null/nil
        ##   or if using numeric into NotANumber (NaN)
        value = ''.dup
        value = nil  if is_null?( value )
      elsif quote && input.peek == quote
        logger.debug "start quote field - peek #{peek_info( input )}"  if logger.debug?
        value = parse_quote( input, sep: sep )
        logger.debug "end double_quote field - peek #{peek_info( input )}"  if logger.debug?
      else
        logger.debug "start reg field - peek #{peek_info( input )}"  if logger.debug?
        value = ''.dup
        ## consume simple value
        ##   until we hit "," or "\n" or "\r" or stray (double) quote e.g (")
        until input.eof? || (c=input.peek; c==sep || c==LF || c==CR || (quote && c==quote))
          if escape && c == BACKSLASH
            value << parse_escape( input, sep: sep )
          else
            logger.debug "  add char #{peek_info( input )}"  if logger.debug?
            value << input.getc
          end
        end

        value = nil  if is_null?( value )   ## note: null check only for UNQUOTED (not quoted/escaped) values

        logger.debug "end reg field - peek #{peek_info( input )}"  if logger.debug?
      end

      value
    end



    ## consume one record (incl. its newline) and return the array of values
    ##  raises ParseError if a field is followed by anything other than
    ##  the separator, a newline or the end of input
    def parse_record( input, sep: )
      values = []

      loop do
        value = parse_field( input, sep: sep )
        logger.debug "value: »#{value}«"  if logger.debug?
        values << value

        break if input.eof?

        c = input.peek
        if c==LF || c==CR
          skip_newline( input )   ## note: singular / single newline only (NOT plural)
          break
        elsif c==sep
          input.getc              ## eat-up FS (,)
        else
          raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
        end
      end

      values
    end



    ## skip a single newline, that is, CR LF or LF or CR; everything else is left as is
    def skip_newline( input )    ## note: singular (strict) version
      return if input.eof?

      c = input.peek
      if c == CR
        input.getc                       ## eat-up
        input.getc if input.peek == LF   ## CR LF counts as one newline
      elsif c == LF
        input.getc                       ## eat-up
      end
    end



    ## skip everything up to (but NOT including) the next LF or CR or the end of input
    def skip_until_eol( input )
      until input.eof? || (c=input.peek; c==LF || c==CR)
        input.getc    ## eat-up all until end of line
      end
    end



    ## read record by record until the end of input and
    ## pass every record to the block (required)
    def parse_lines( input, sep:, &block )
      ## no leading and trailing whitespaces trimmed/stripped
      ## no blanks skipped
      ## comment lines get skipped only if a comment char is configured
      ##  - follows strict rules of
      ##  note: this csv format is NOT recommended;
      ##    please, use a format with comments, leading and trailing whitespaces, etc.
      ##    only added for checking compatibility

      comment = config[:comment]

      until input.eof?
        logger.debug "start record - peek #{peek_info( input )}"  if logger.debug?

        if comment && input.peek == comment        ## comment line
          logger.debug "skipping comment - peek #{peek_info( input )}"  if logger.debug?
          skip_until_eol( input )
          skip_newline( input )
        else
          record = parse_record( input, sep: sep )
          ## note: requires block - enforce? how? why? why not?
          block.call( record )   ## yield( record )
        end
      end
    end # method parse_lines


    ## check if the value counts as null (nil) using the null setting
    ##   Proc   => result of calling the proc with the value
    ##   Array  => true if the value is included
    ##   String => true if the value is equal
    ##   nil or anything else => always false (not null)
    def is_null?( value )
      null = @config[:null]
      case null
      when Proc   then null.call( value )
      when Array  then null.include?( value )
      when String then value == null
      else
        ## nothing set (nil) or unknown config style / setting
        ##  todo: issue a warning or error for unknown - why? why not?
        false
      end
    end


  end # class ParserStrict
end # class CsvReader