csvreader 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea1d667219773e3a355c81f815d91e92340d61a1
4
- data.tar.gz: ba7a43ccb5e110fc1f6eca76ca2a74a62f1131fb
3
+ metadata.gz: c87e1cac5f0988f4423a0c5aaf96d2a625bf4d60
4
+ data.tar.gz: 5af8f5875ac0e18ade4cc793ba8ad658f905d1df
5
5
  SHA512:
6
- metadata.gz: 0543a4338d2d12e36da16acdad9abff28633e519baa1d92044d1ca8f5e3472d835d00a10d8b19c24561b06e0d724f87414495600f4c83eef7c9e033474b4c09e
7
- data.tar.gz: 8df669bc86f2066b2650a67bda5698fae7b6d58766b9c318f47958b0499671d0a4d39e862b8d3af842105a2777a3cf7ad05168380c338a3053fd3d363697abfb
6
+ metadata.gz: c9528101aa8a2db3a8e0dfb3685e6d15fcd262a76ed16f69b34ca9d54003e772f9441eb1673e11886ee14ac3347a99c22bd06662a8191214189f5c57f0ecfe7b
7
+ data.tar.gz: acc9ada28d539dbc7ce1a2178e904ba247f511327f5828eebfdda78b21b263ca5d153d8fc234d7483cb60229f764094bc3c2fbeefa2381335d4e21a30487a828
data/Manifest.txt CHANGED
@@ -6,7 +6,11 @@ Rakefile
6
6
  lib/csvreader.rb
7
7
  lib/csvreader/buffer.rb
8
8
  lib/csvreader/parser.rb
9
+ lib/csvreader/parser_std.rb
10
+ lib/csvreader/parser_strict.rb
11
+ lib/csvreader/parser_tab.rb
9
12
  lib/csvreader/reader.rb
13
+ lib/csvreader/reader_hash.rb
10
14
  lib/csvreader/version.rb
11
15
  test/data/beer.csv
12
16
  test/data/beer11.csv
@@ -14,6 +18,8 @@ test/data/shakespeare.csv
14
18
  test/helper.rb
15
19
  test/test_parser.rb
16
20
  test/test_parser_formats.rb
17
- test/test_parser_rfc4180.rb
21
+ test/test_parser_java.rb
22
+ test/test_parser_strict.rb
23
+ test/test_parser_tab.rb
18
24
  test/test_reader.rb
19
25
  test/test_reader_hash.rb
data/lib/csvreader.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'csv'
4
- require 'json'
3
+
5
4
  require 'pp'
6
5
  require 'logger'
7
6
 
@@ -10,8 +9,19 @@ require 'logger'
10
9
  # our own code
11
10
  require 'csvreader/version' # let version always go first
12
11
  require 'csvreader/buffer'
12
+ require 'csvreader/parser_std' # best practices pre-configured out-of-the-box
13
+ require 'csvreader/parser_strict' # flexible (strict - no leading/trailing space trimming, blanks, etc.), configure for different formats/dialects
14
+ require 'csvreader/parser_tab'
13
15
  require 'csvreader/parser'
14
16
  require 'csvreader/reader'
17
+ require 'csvreader/reader_hash'
18
+
19
+
20
+
21
+ ## add convenience / shortcut alias
22
+ Csv = CsvReader
23
+ CsvHash = CsvHashReader
24
+
15
25
 
16
26
 
17
27
  puts CsvReader.banner # say hello
@@ -1,7 +1,12 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  class CsvReader
4
- class BufferIO ## todo: find a better name - why? why not? is really just for reading (keep io?)
4
+ class Buffer ## todo: find a better name:
5
+ ## BufferedReader
6
+ ## BufferedInput
7
+ ## BufferI
8
+ ## - why? why not? is really just for reading (keep io?)
9
+
5
10
  def initialize( data )
6
11
  # create the IO object we will read from
7
12
  @io = data.is_a?(String) ? StringIO.new(data) : data
@@ -20,7 +25,7 @@ class BufferIO ## todo: find a better name - why? why not? is really just for
20
25
 
21
26
  def peek
22
27
  if @buf.size == 0 && @io.eof?
23
- puts "peek - hitting eof!!!"
28
+ ## puts "peek - hitting eof!!!"
24
29
  return "\0" ## return NUL char (0) for now
25
30
  end
26
31
 
@@ -33,5 +38,5 @@ class BufferIO ## todo: find a better name - why? why not? is really just for
33
38
  @buf.first
34
39
  end # method peek
35
40
 
36
- end # class BufferIO
41
+ end # class Buffer
37
42
  end # class CsvReader
@@ -2,363 +2,74 @@
2
2
 
3
3
  class CsvReader
4
4
 
5
-
6
-
7
-
8
-
9
5
  class Parser
10
6
 
7
+ ## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
8
+ ## parser must support parse method (with and without block)
9
+ ## e.g. records = parse( data )
10
+ ## -or-
11
+ ## parse( data ) do |record|
12
+ ## end
11
13
 
12
- ## char constants
13
- DOUBLE_QUOTE = "\""
14
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
- COMMENT = "#" ## use COMMENT_HASH or HASH or ??
16
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
17
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
18
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
19
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
20
-
21
-
22
- ###################################
23
- ## add simple logger with debug flag/switch
24
- #
25
- # use Parser.debug = true # to turn on
26
- #
27
- # todo/fix: use logutils instead of std logger - why? why not?
28
-
29
- def self.logger() @@logger ||= Logger.new( STDOUT ); end
30
- def logger() self.class.logger; end
31
-
32
-
33
-
34
- attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
35
-
36
- def initialize( sep: ',',
37
- quote: DOUBLE_QUOTE, ## note: set to nil for no quote
38
- doublequote: true,
39
- escape: BACKSLASH, ## note: set to nil for no escapes
40
- trim: true, ## note: will toggle between human/default and strict mode parser!!!
41
- na: ['\N', 'NA'], ## note: set to nil for no null values / not available (na)
42
- quoted_empty: '', ## note: only available in strict mode (e.g. trim=false)
43
- unquoted_empty: '' ## note: only available in strict mode (e.g. trim=false)
44
- )
45
- @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
46
- @config[:sep] = sep
47
- @config[:quote] = quote
48
- @config[:doublequote] = doublequote
49
- @config[:escape] = escape
50
- @config[:trim] = trim
51
- @config[:na] = na
52
- @config[:quoted_empty] = quoted_empty
53
- @config[:unquoted_empty] = unquoted_empty
54
- end
55
-
56
-
57
-
58
- def strict?
59
- ## note: use trim for separating two different parsers / code paths:
60
- ## - human with trim leading and trailing whitespace and
61
- ## - strict with no leading and trailing whitespaces allowed
62
-
63
- ## for now use - trim == false for strict version flag alias
64
- ## todo/fix: add strict flag - why? why not?
65
- @config[:trim] ? false : true
66
- end
67
-
68
-
69
- DEFAULT = new( sep: ',', trim: true )
70
- RFC4180 = new( sep: ',', trim: false )
71
- EXCEL = new( sep: ',', trim: false )
72
-
73
- def self.default() DEFAULT; end ## alternative alias for DEFAULT
74
- def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
75
- def self.excel() EXCEL; end ## alternative alias for EXCEL
76
-
77
-
78
-
79
-
80
- def parse_field( io, sep: )
81
- logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?
82
-
83
- value = ""
84
- skip_spaces( io ) ## strip leading spaces
85
-
86
- if (c=io.peek; c=="," || c==LF || c==CR || io.eof?) ## empty field
87
- ## return value; do nothing
88
- elsif io.peek == DOUBLE_QUOTE
89
- logger.debug "start double_quote field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
90
- io.getc ## eat-up double_quote
91
14
 
92
- loop do
93
- while (c=io.peek; !(c==DOUBLE_QUOTE || io.eof?))
94
- value << io.getc ## eat-up everything until quote (")
95
- end
15
+ DEFAULT = ParserStd.new
96
16
 
97
- break if io.eof?
17
+ RFC4180 = ParserStrict.new
18
+ STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
19
+ EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
98
20
 
99
- io.getc ## eat-up double_quote
21
+ MYSQL = ParserStrict.new( sep: "\t",
22
+ quote: false,
23
+ escape: true,
24
+ null: "\\N" )
100
25
 
101
- if io.peek == DOUBLE_QUOTE ## doubled up quote?
102
- value << io.getc ## add double quote and continue!!!!
103
- else
104
- break
105
- end
106
- end
26
+ POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
27
+ escape: true,
28
+ unquoted_empty_null: true )
107
29
 
108
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
109
- skip_spaces( io )
110
- logger.debug "end double_quote field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
111
- else
112
- logger.debug "start reg field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
113
- ## consume simple value
114
- ## until we hit "," or "\n" or "\r"
115
- ## note: will eat-up quotes too!!!
116
- while (c=io.peek; !(c=="," || c==LF || c==CR || io.eof?))
117
- logger.debug " add char >#{io.peek}< (#{io.peek.ord})" if logger.debug?
118
- value << io.getc ## eat-up all spaces (" ") and tabs (\t)
119
- end
120
- value = value.strip ## strip all trailing spaces
121
- logger.debug "end reg field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
122
- end
123
-
124
- value
125
- end
126
-
127
-
128
-
129
-
130
- def parse_field_strict( io, sep: )
131
- logger.debug "parse field (strict) - sep: >#{sep}< (#{sep.ord})" if logger.debug?
132
-
133
- value = ""
134
-
135
- if (c=io.peek; c==sep || c==LF || c==CR || io.eof?) ## empty unquoted field
136
- value = config[:unquoted_empty] ## defaults to "" (might be set to nil if needed)
137
- ## return value; do nothing
138
- elsif config[:quote] && io.peek == config[:quote]
139
- logger.debug "start quote field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
140
- io.getc ## eat-up double_quote
141
-
142
- loop do
143
- while (c=io.peek; !(c==config[:quote] || io.eof?))
144
- value << io.getc ## eat-up everything until quote (")
145
- end
146
-
147
- break if io.eof?
148
-
149
- io.getc ## eat-up double_quote
150
-
151
- if config[:doublequote] && io.peek == config[:quote] ## doubled up quote?
152
- value << io.getc ## add double quote and continue!!!!
153
- else
154
- break
155
- end
156
- end
157
-
158
- value = config[:quoted_empty] if value == "" ## defaults to "" (might be set to nil if needed)
159
-
160
- logger.debug "end double_quote field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
161
- else
162
- logger.debug "start reg field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
163
- ## consume simple value
164
- ## until we hit "," or "\n" or "\r" or stray "\"" double quote
165
- while (c=io.peek; !(c==sep || c==LF || c==CR || c==config[:quote] || io.eof?))
166
- logger.debug " add char >#{io.peek}< (#{io.peek.ord})" if logger.debug?
167
- value << io.getc
168
- end
169
- logger.debug "end reg field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
170
- end
171
-
172
- value
173
- end
174
-
175
-
176
-
177
- def parse_record( io, sep: )
178
- values = []
179
-
180
- loop do
181
- value = parse_field( io, sep: sep )
182
- logger.debug "value: »#{value}«" if logger.debug?
183
- values << value
184
-
185
- if io.eof?
186
- break
187
- elsif (c=io.peek; c==LF || c==CR)
188
- skip_newlines( io )
189
- break
190
- elsif io.peek == ","
191
- io.getc ## eat-up FS(,)
192
- else
193
- puts "*** csv parse error: found >#{io.peek} (#{io.peek.ord})< - FS (,) or RS (\\n) expected!!!!"
194
- exit(1)
195
- end
196
- end
197
-
198
- values
199
- end
30
+ POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
31
+ quote: false,
32
+ escape: true,
33
+ null: "\\N" )
200
34
 
35
+ TAB = ParserTab.new
201
36
 
202
37
 
203
- def parse_record_strict( io, sep: )
204
- values = []
38
+ def self.default() DEFAULT; end ## alternative alias for DEFAULT
39
+ def self.strict() STRICT; end ## alternative alias for STRICT
40
+ def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
41
+ def self.excel() EXCEL; end ## alternative alias for EXCEL
42
+ def self.mysql() MYSQL; end
43
+ def self.postgresql() POSTGRESQL; end
44
+ def self.postgres() postgresql; end
45
+ def self.postgresql_text() POSTGRESQL_TEXT; end
46
+ def self.postgres_text() postgresql_text; end
47
+ def self.tab() TAB; end
205
48
 
206
- loop do
207
- value = parse_field_strict( io, sep: sep )
208
- logger.debug "value: »#{value}«" if logger.debug?
209
- values << value
210
-
211
- if io.eof?
212
- break
213
- elsif (c=io.peek; c==LF || c==CR)
214
- skip_newline( io ) ## note: singular / single newline only (NOT plural)
215
- break
216
- elsif io.peek == sep
217
- io.getc ## eat-up FS (,)
218
- else
219
- puts "*** csv parse error (strict): found >#{io.peek} (#{io.peek.ord})< - FS (,) or RS (\\n) expected!!!!"
220
- exit(1)
221
- end
222
- end
223
-
224
- values
225
- end
226
-
227
-
228
-
229
- def skip_newlines( io )
230
- return if io.eof?
49
+ end # class Parser
231
50
 
232
- while (c=io.peek; c==LF || c==CR)
233
- io.getc ## eat-up all \n and \r
234
- end
235
- end
236
51
 
237
52
 
238
- def skip_newline( io ) ## note: singular (strict) version
239
- return if io.eof?
53
+ ####################################
54
+ # define errors / exceptions
55
+ # for all parsers for (re)use
240
56
 
241
- ## only skip CR LF or LF or CR
242
- if io.peek == CR
243
- io.getc ## eat-up
244
- io.getc if io.peek == LF
245
- elsif io.peek == LF
246
- io.getc ## eat-up
247
- else
248
- # do nothing
249
- end
57
+ class Error < StandardError
250
58
  end
251
59
 
60
+ ####
61
+ # todo/check:
62
+ # use "common" error class - why? why not?
252
63
 
64
+ class ParseError < Error
65
+ attr_reader :message
253
66
 
254
- def skip_until_eol( io )
255
- return if io.eof?
256
-
257
- while (c=io.peek; !(c==LF || c==CR || io.eof?))
258
- io.getc ## eat-up all until end of line
67
+ def initialize( message )
68
+ @message = message
259
69
  end
260
- end
261
70
 
262
- def skip_spaces( io )
263
- return if io.eof?
264
-
265
- while (c=io.peek; c==SPACE || c==TAB)
266
- io.getc ## note: always eat-up all spaces (" ") and tabs (\t)
71
+ def to_s
72
+ "*** csv parse error: #{@message}"
267
73
  end
268
- end
269
-
270
-
271
-
272
-
273
-
274
-
275
- def parse_lines_human( io, sep:, &block )
276
-
277
- loop do
278
- break if io.eof?
279
-
280
- skip_spaces( io )
281
-
282
- if io.peek == COMMENT ## comment line
283
- logger.debug "skipping comment - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
284
- skip_until_eol( io )
285
- skip_newlines( io )
286
- elsif (c=io.peek; c==LF || c==CR || io.eof?)
287
- logger.debug "skipping blank - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
288
- skip_newlines( io )
289
- else
290
- logger.debug "start record - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
291
-
292
- record = parse_record( io, sep: sep )
293
- ## note: requires block - enforce? how? why? why not?
294
- block.call( record ) ## yield( record )
295
- end
296
- end # loop
297
- end # method parse_lines_human
298
-
299
-
300
-
301
- def parse_lines_strict( io, sep:, &block )
302
-
303
- ## no leading and trailing whitespaces trimmed/stripped
304
- ## no comments skipped
305
- ## no blanks skipped
306
- ## - follows strict rules of
307
- ## note: this csv format is NOT recommended;
308
- ## please, use a format with comments, leading and trailing whitespaces, etc.
309
- ## only added for checking compatibility
310
-
311
- loop do
312
- break if io.eof?
313
-
314
- logger.debug "start record (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
315
-
316
- record = parse_record_strict( io, sep: sep )
317
-
318
- ## note: requires block - enforce? how? why? why not?
319
- block.call( record ) ## yield( record )
320
- end # loop
321
- end # method parse_lines_strict
322
-
323
-
324
-
325
- def parse_lines( io_maybe, sep: config[:sep], &block )
326
- ## find a better name for io_maybe
327
- ## make sure io is a wrapped into BufferIO!!!!!!
328
- if io_maybe.is_a?( BufferIO ) ### allow (re)use of BufferIO if managed from "outside"
329
- io = io_maybe
330
- else
331
- io = BufferIO.new( io_maybe )
332
- end
333
-
334
- if strict?
335
- parse_lines_strict( io, sep: sep, &block )
336
- else
337
- parse_lines_human( io, sep: sep, &block )
338
- end
339
- end ## parse_lines
340
-
341
-
342
-
343
- ## fix: add optional block - lets you use it like foreach!!!
344
- ## make foreach an alias of parse with block - why? why not?
345
- ##
346
- ## unify with (make one) parse and parse_lines!!!! - why? why not?
347
-
348
- def parse( io_maybe, sep: config[:sep], limit: nil )
349
- records = []
350
-
351
- parse_lines( io_maybe, sep: sep ) do |record|
352
- records << record
353
-
354
- ## set limit to 1 for processing "single" line (that is, get one record)
355
- break if limit && limit >= records.size
356
- end
357
-
358
- records
359
- end ## method parse
360
-
361
-
362
-
363
- end # class Parser
74
+ end # class ParseError
364
75
  end # class CsvReader