csvreader 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,255 @@
1
+ # encoding: utf-8
2
+
3
class CsvReader

  ##
  # Character-by-character CSV parser with a fixed "standard" dialect:
  #
  # - fields are separated by a comma (,)
  # - records end with LF, CR or CR LF (or at end of input)
  # - fields may be wrapped in double quotes; a doubled-up quote ("")
  #   inside a quoted field stands for one literal quote
  # - a backslash escapes a backslash, LF, CR, comma or double quote
  # - spaces and tabs around a field are stripped
  # - lines starting with # (after optional leading blanks) and
  #   blank lines are skipped
  #
  # The input object only needs to respond to peek, getc and eof?;
  # strings and io objects get wrapped in a Buffer (defined elsewhere).
  class ParserStd

    ## char constants
    DOUBLE_QUOTE = "\""
    BACKSLASH    = "\\"    ## use BACKSLASH_ESCAPE ??
    COMMENT      = "#"     ## use COMMENT_HASH or HASH or ??
    SPACE        = " "     ##  ASCII 32 (dec)             = (Space)
    TAB          = "\t"    ##  ASCII 0x09 (hex)           = HT (Tab/horizontal tab)
    LF           = "\n"    ##  ASCII 0x0A (hex) 10 (dec)  = LF (Newline/line feed)
    CR           = "\r"    ##  ASCII 0x0D (hex) 13 (dec)  = CR (Carriage return)


    ###################################
    ## simple logger with debug flag/switch
    #
    #  use ParserStd.logger.level = :debug    # to turn on debug output
    #
    #  todo/fix: use logutils instead of std logger - why? why not?

    ## Builds the logger (writes to STDOUT, level :info).
    def self.build_logger()
      l = Logger.new( STDOUT )
      l.level = :info    ## set to :info on start; note: is 0 (debug) by default
      l
    end

    ## Returns the class-wide logger (built on first use).
    def self.logger() @@logger ||= build_logger; end

    ## Instance-level shortcut for the class-wide logger.
    def logger()  self.class.logger; end



    attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

    ##
    # null - list of values treated as null / not available (na);
    #        set to nil for no null values.
    #        note: only stored in config here; none of the parse
    #        methods in this class consult it
    def initialize( null: ['\N', 'NA'] )
      @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?
      @config[:null] = null   ## null values
    end


    ##
    # Parses data (a String, an io object or an already wrapped Buffer).
    #
    # With a block every record (an array of string values) gets
    # passed to the block; without a block an array of all records
    # gets returned.
    #
    # note: kwargs NOT used for now (but required for
    #       "protocol/interface" by other parsers)
    #
    # Raises ParseError if a field is followed by anything but
    # a comma, a newline or the end of input.
    def parse( data, **kwargs, &block )

      ## make sure data (string or io) is wrapped into a Buffer
      ##   note: allow (re)use of Buffer if managed from "outside"
      input = data.is_a?( Buffer ) ? data : Buffer.new( data )

      if block_given?
        parse_lines( input, &block )
      else
        records = []

        parse_lines( input ) do |record|
          records << record
        end

        records
      end
    end ## method parse



    private

    ##
    # Consumes a backslash escape and returns the resulting text:
    # the escaped char if it is a backslash, LF, CR, comma or double
    # quote - otherwise a literal backslash (the char following the
    # backslash is left in the input for the caller).
    def parse_escape( input )
      value = ""
      if input.peek == BACKSLASH
        input.getc ## eat-up backslash
        if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==',' || c=='"' )
          logger.debug "  add escaped char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
          value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
        else
          ## unknown escape sequence; no special handling/escaping
          logger.debug "  add backslash (unknown escape seq) >#{input.peek}< (#{input.peek.ord})" if logger.debug?
          value << BACKSLASH
        end
      else
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
      end
      value
    end


    ##
    # Consumes a double-quoted value and returns its content without
    # the wrapping quotes. Doubled-up quotes and backslash escapes
    # get resolved. Reading stops silently at the end of input if
    # the closing quote is missing.
    def parse_doublequote( input )
      value = ""
      if input.peek == DOUBLE_QUOTE
        input.getc  ## eat-up double_quote

        loop do
          ## eat-up everything until hitting double_quote (") or backslash (escape)
          while (c=input.peek; !(c==DOUBLE_QUOTE || c==BACKSLASH || input.eof?))
            value << input.getc
          end

          if input.eof?
            break
          elsif input.peek == BACKSLASH
            value << parse_escape( input )
          else   ## assume input.peek == DOUBLE_QUOTE
            input.getc ## eat-up double_quote
            if input.peek == DOUBLE_QUOTE  ## doubled up quote?
              value << input.getc   ## add double quote and continue!!!!
            else
              break
            end
          end
        end
      else
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - DOUBLE_QUOTE (\") expected in parse_double_quote!!!!" )
      end
      value
    end



    ##
    # Parses one field and returns its value as a string.
    # Leading spaces and tabs always get skipped; trailing ones get
    # skipped (quoted field) or stripped (regular field).
    def parse_field( input )
      logger.debug "parse field" if logger.debug?

      value = ""
      skip_spaces( input )   ## strip leading spaces

      if (c=input.peek; c=="," || c==LF || c==CR || input.eof?) ## empty field
        ## return value; do nothing
      elsif input.peek == DOUBLE_QUOTE
        logger.debug "start double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
        value << parse_doublequote( input )

        ## note: always eat-up all trailing spaces (" ") and tabs (\t)
        skip_spaces( input )
        logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      else
        logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
        ## consume simple value
        ##   until we hit "," or "\n" or "\r"
        ##    note: will eat-up quotes too!!!
        while (c=input.peek; !(c=="," || c==LF || c==CR || input.eof?))
          if input.peek == BACKSLASH
            value << parse_escape( input )
          else
            logger.debug "  add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
            value << input.getc   ## note: eat-up all spaces (" ") and tabs (\t) too (strip trailing spaces at the end)
          end
        end
        ##  note: only strip **trailing** spaces (space and tab only)
        ##    do NOT strip newlines etc. might have been added via escape! e.g. \\\n
        ##  note: anchor with \z (end of string) and NOT with $ (end of line);
        ##    $ would also match right before a newline added via escape and
        ##    strip the blanks there instead of the trailing ones
        value = value.sub( /[ \t]+\z/, '' )
        logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      end

      value
    end



    ##
    # Parses one record (line) and returns its values as an array.
    # Consumes the terminating newline (if any).
    def parse_record( input )
      values = []

      loop do
        value = parse_field( input )
        logger.debug "value: »#{value}«" if logger.debug?
        values << value

        if input.eof?
          break
        elsif (c=input.peek; c==LF || c==CR)
          skip_newline( input )
          break
        elsif input.peek == ","
          input.getc   ## eat-up FS(,)
        else
          raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
        end
      end

      values
    end



    ## Skips a single newline (CR LF or LF or CR) if present.
    def skip_newline( input )    ## note: singular (strict) version
      return if input.eof?

      ## only skip CR LF or LF or CR
      if input.peek == CR
        input.getc ## eat-up
        input.getc if input.peek == LF
      elsif input.peek == LF
        input.getc ## eat-up
      else
        # do nothing
      end
    end



    ## Skips everything up to (but not including) the next LF or CR.
    def skip_until_eol( input )
      return if input.eof?

      while (c=input.peek; !(c==LF || c==CR || input.eof?))
        input.getc    ## eat-up all until end of line
      end
    end

    ## Skips a run of spaces (" ") and tabs (\t).
    def skip_spaces( input )
      return if input.eof?

      while (c=input.peek; c==SPACE || c==TAB)
        input.getc   ## note: always eat-up all spaces (" ") and tabs (\t)
      end
    end



    ##
    # Main loop - walks the input line by line; skips comment and
    # blank lines and passes every parsed record to the block.
    def parse_lines( input, &block )

      loop do
        break if input.eof?

        skip_spaces( input )

        if input.peek == COMMENT        ## comment line
          logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
          skip_until_eol( input )
          skip_newline( input )
        elsif (c=input.peek; c==LF || c==CR || input.eof?)
          logger.debug "skipping blank - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
          skip_newline( input )
        else
          logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

          record = parse_record( input )
          ## note: requires block - enforce? how? why? why not?
          block.call( record )   ## yield( record )
        end
      end  # loop
    end # method parse_lines


  end # class ParserStd
end # class CsvReader
@@ -0,0 +1,269 @@
1
+ # encoding: utf-8
2
+
3
class CsvReader

  ##
  # Character-by-character CSV parser following "strict" rules:
  # no leading or trailing whitespace gets stripped and no blank
  # lines get skipped. The dialect is configurable:
  #
  #   sep                  - field separator char
  #   quote                - quote char; false/nil for no quote
  #   doublequote          - true if a doubled-up quote inside a quoted
  #                          value stands for one literal quote
  #   escape               - true to turn on backslash escapes
  #   null                 - values treated as null / not available;
  #                          note: only stored in config here; none of
  #                          the parse methods in this class consult it
  #   quoted_empty_null    - true to return nil for an empty quoted value
  #   unquoted_empty_null  - true to return nil for an empty unquoted value
  #   comment              - comment char (e.g. #) or false/nil
  #
  # The input object only needs to respond to peek, getc and eof?;
  # strings and io objects get wrapped in a Buffer (defined elsewhere).
  class ParserStrict

    ## char constants
    BACKSLASH    = "\\"    ## use BACKSLASH_ESCAPE ??
    LF           = "\n"    ##  ASCII 0x0A (hex) 10 (dec)  = LF (Newline/line feed)
    CR           = "\r"    ##  ASCII 0x0D (hex) 13 (dec)  = CR (Carriage return)


    ###################################
    ## simple logger with debug flag/switch
    #
    #  use ParserStrict.logger.level = :debug    # to turn on debug output
    #
    #  todo/fix: use logutils instead of std logger - why? why not?

    ## Builds the logger (writes to STDOUT, level :info).
    def self.build_logger()
      l = Logger.new( STDOUT )
      l.level = :info    ## set to :info on start; note: is 0 (debug) by default
      l
    end

    ## Returns the class-wide logger (built on first use).
    def self.logger() @@logger ||= build_logger; end

    ## Instance-level shortcut for the class-wide logger.
    def logger()  self.class.logger; end



    attr_reader :config   ## todo/fix: change config to proper dialect class/struct - why? why not?

    def initialize( sep:                 ',',
                    quote:               '"',            ## note: set to false/nil for no quote
                    doublequote:         true,
                    escape:              false,          ## true/false
                    null:                ['\N', 'NA'],   ## note: set to nil for no null values / not available (na)
                    quoted_empty_null:   false,
                    unquoted_empty_null: false,
                    comment:             false           ## note: comment char e.g. # or false/nil
                  )
      @config = {}   ## todo/fix: change config to proper dialect class/struct - why? why not?
      @config[:sep]                 = sep
      @config[:quote]               = quote
      @config[:doublequote]         = doublequote
      @config[:escape]              = escape
      @config[:null]                = null
      @config[:quoted_empty_null]   = quoted_empty_null
      @config[:unquoted_empty_null] = unquoted_empty_null
      @config[:comment]             = comment
    end

    #########################################
    ## config convenience helpers
    ##   e.g. use like  Csv.mysql.sep = ','  etc.   instead of
    ##                  Csv.mysql.config[:sep] = ','
    ##  note: one writer for every config setting
    def sep=( value )                 @config[:sep]=value; end
    def quote=( value )               @config[:quote]=value; end
    def doublequote=( value )         @config[:doublequote]=value; end
    def escape=( value )              @config[:escape]=value; end
    def null=( value )                @config[:null]=value; end
    def quoted_empty_null=( value )   @config[:quoted_empty_null]=value; end
    def unquoted_empty_null=( value ) @config[:unquoted_empty_null]=value; end
    def comment=( value )             @config[:comment]=value; end



    ##
    # Parses data (a String, an io object or an already wrapped Buffer).
    #
    # With a block every record (an array of values) gets passed to
    # the block; without a block an array of all records gets returned.
    #
    # sep - field separator for this call (defaults to config[:sep])
    #
    # Raises ParseError if a field is followed by anything but the
    # separator, a newline or the end of input (e.g. a stray quote).
    def parse( data, sep: config[:sep], &block )

      ## make sure data (string or io) is wrapped into a Buffer
      ##   note: allow (re)use of Buffer if managed from "outside"
      input = data.is_a?( Buffer ) ? data : Buffer.new( data )

      if block_given?
        parse_lines( input, sep: sep, &block )
      else
        records = []

        parse_lines( input, sep: sep ) do |record|
          records << record
        end

        records
      end

    end ## method parse



    private

    ##
    # Consumes a backslash escape and returns the resulting text:
    # the escaped char if it is a backslash, LF, CR, the separator or
    # the quote char - otherwise a literal backslash (the char
    # following the backslash is left in the input for the caller).
    def parse_escape( input, sep: )
      value = ""

      quote = config[:quote]

      if input.peek == BACKSLASH
        input.getc ## eat-up backslash
        if (c=input.peek; c==BACKSLASH || c==LF || c==CR || c==sep || (quote && c==quote) )
          value << input.getc     ## add escaped char (e.g. lf, cr, etc.)
        else
          ## unknown escape sequence; no special handling/escaping
          value << BACKSLASH
        end
      else
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - BACKSLASH (\\) expected in parse_escape!!!!" )
      end
      value
    end



    ##
    # Consumes a quoted value and returns its content without the
    # wrapping quotes. Doubled-up quotes (if doublequote is on) and
    # backslash escapes (if escape is on) get resolved. Reading stops
    # silently at the end of input if the closing quote is missing.
    def parse_quote( input, sep: )
      value = ""

      quote        = config[:quote]         # char (e.g.",') | nil
      doublequote  = config[:doublequote]   # true|false
      escape       = config[:escape]        # true|false

      if input.peek == quote
        input.getc  ## eat-up quote

        loop do
          ## eat-up everything until hitting quote or backslash (escape)
          ##   note: a backslash only stops the scan if escape is on
          while (c=input.peek; !(c==quote || input.eof? || (escape && c==BACKSLASH)))
            value << input.getc
          end

          if input.eof?
            break
          elsif input.peek == BACKSLASH
            value << parse_escape( input, sep: sep )
          else   ## assume input.peek == quote
            input.getc ## eat-up quote
            if doublequote && input.peek == quote  ## doubled up quote?
              value << input.getc   ## add quote and continue!!!!
            else
              break
            end
          end
        end
      else
        ## note: report the configured quote char and this method's name
        raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - QUOTE (#{quote}) expected in parse_quote!!!!" )
      end
      value
    end



    ##
    # Parses one field and returns its value - a string or nil
    # (for empty values if the matching *_empty_null setting is on).
    # A regular (unquoted) value also stops at a stray quote char.
    def parse_field( input, sep: )
      value = ""

      quote  = config[:quote]
      escape = config[:escape]

      logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?

      if (c=input.peek; c==sep || c==LF || c==CR || input.eof?) ## empty unquoted field
        value = nil  if config[:unquoted_empty_null]
        ## return value; do nothing
      elsif quote && input.peek == quote
        logger.debug "start quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
        value << parse_quote( input, sep: sep )

        value = nil  if config[:quoted_empty_null] && value == ""

        logger.debug "end double_quote field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      else
        logger.debug "start reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
        ## consume simple value
        ##   until we hit "," or "\n" or "\r" or stray (double) quote e.g (")
        while (c=input.peek; !(c==sep || c==LF || c==CR || input.eof? || (quote && c==quote)))
          if escape && input.peek == BACKSLASH
            value << parse_escape( input, sep: sep )
          else
            logger.debug "  add char >#{input.peek}< (#{input.peek.ord})" if logger.debug?
            value << input.getc
          end
        end
        logger.debug "end reg field - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
      end

      value
    end


    ##
    # Parses one record (line) and returns its values as an array.
    # Consumes the terminating newline (if any).
    def parse_record( input, sep: )
      values = []

      loop do
        value = parse_field( input, sep: sep )
        logger.debug "value: »#{value}«" if logger.debug?
        values << value

        if input.eof?
          break
        elsif (c=input.peek; c==LF || c==CR)
          skip_newline( input )   ## note: singular / single newline only (NOT plural)
          break
        elsif input.peek == sep
          input.getc   ## eat-up FS (,)
        else
          raise ParseError.new( "found >#{input.peek} (#{input.peek.ord})< - FS (,) or RS (\\n) expected!!!!" )
        end
      end

      values
    end



    ## Skips a single newline (CR LF or LF or CR) if present.
    def skip_newline( input )    ## note: singular (strict) version
      return if input.eof?

      ## only skip CR LF or LF or CR
      if input.peek == CR
        input.getc ## eat-up
        input.getc if input.peek == LF
      elsif input.peek == LF
        input.getc ## eat-up
      else
        # do nothing
      end
    end



    ## Skips everything up to (but not including) the next LF or CR.
    def skip_until_eol( input )
      return if input.eof?

      while (c=input.peek; !(c==LF || c==CR || input.eof?))
        input.getc    ## eat-up all until end of line
      end
    end



    ##
    # Main loop - walks the input line by line and passes every
    # parsed record to the block.
    #
    #  no leading and trailing whitespaces trimmed/stripped
    #  no blanks skipped
    #  comment lines get skipped only if a comment char is configured
    #
    #  note: this csv format is NOT recommended;
    #    please, use a format with comments, leading and trailing whitespaces, etc.
    #    only added for checking compatibility
    def parse_lines( input, sep:, &block )

      comment = config[:comment]

      loop do
        break if input.eof?

        logger.debug "start record - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?

        if comment && input.peek == comment        ## comment line
          logger.debug "skipping comment - peek >#{input.peek}< (#{input.peek.ord})" if logger.debug?
          skip_until_eol( input )
          skip_newline( input )
        else
          record = parse_record( input, sep: sep )
          ## note: requires block - enforce? how? why? why not?
          block.call( record )   ## yield( record )
        end
      end  # loop

    end # method parse_lines


  end # class ParserStrict
end # class CsvReader