csvreader 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ea1d667219773e3a355c81f815d91e92340d61a1
4
- data.tar.gz: ba7a43ccb5e110fc1f6eca76ca2a74a62f1131fb
3
+ metadata.gz: c87e1cac5f0988f4423a0c5aaf96d2a625bf4d60
4
+ data.tar.gz: 5af8f5875ac0e18ade4cc793ba8ad658f905d1df
5
5
  SHA512:
6
- metadata.gz: 0543a4338d2d12e36da16acdad9abff28633e519baa1d92044d1ca8f5e3472d835d00a10d8b19c24561b06e0d724f87414495600f4c83eef7c9e033474b4c09e
7
- data.tar.gz: 8df669bc86f2066b2650a67bda5698fae7b6d58766b9c318f47958b0499671d0a4d39e862b8d3af842105a2777a3cf7ad05168380c338a3053fd3d363697abfb
6
+ metadata.gz: c9528101aa8a2db3a8e0dfb3685e6d15fcd262a76ed16f69b34ca9d54003e772f9441eb1673e11886ee14ac3347a99c22bd06662a8191214189f5c57f0ecfe7b
7
+ data.tar.gz: acc9ada28d539dbc7ce1a2178e904ba247f511327f5828eebfdda78b21b263ca5d153d8fc234d7483cb60229f764094bc3c2fbeefa2381335d4e21a30487a828
data/Manifest.txt CHANGED
@@ -6,7 +6,11 @@ Rakefile
6
6
  lib/csvreader.rb
7
7
  lib/csvreader/buffer.rb
8
8
  lib/csvreader/parser.rb
9
+ lib/csvreader/parser_std.rb
10
+ lib/csvreader/parser_strict.rb
11
+ lib/csvreader/parser_tab.rb
9
12
  lib/csvreader/reader.rb
13
+ lib/csvreader/reader_hash.rb
10
14
  lib/csvreader/version.rb
11
15
  test/data/beer.csv
12
16
  test/data/beer11.csv
@@ -14,6 +18,8 @@ test/data/shakespeare.csv
14
18
  test/helper.rb
15
19
  test/test_parser.rb
16
20
  test/test_parser_formats.rb
17
- test/test_parser_rfc4180.rb
21
+ test/test_parser_java.rb
22
+ test/test_parser_strict.rb
23
+ test/test_parser_tab.rb
18
24
  test/test_reader.rb
19
25
  test/test_reader_hash.rb
data/lib/csvreader.rb CHANGED
@@ -1,7 +1,6 @@
1
1
  # encoding: utf-8
2
2
 
3
- require 'csv'
4
- require 'json'
3
+
5
4
  require 'pp'
6
5
  require 'logger'
7
6
 
@@ -10,8 +9,19 @@ require 'logger'
10
9
  # our own code
11
10
  require 'csvreader/version' # let version always go first
12
11
  require 'csvreader/buffer'
12
+ require 'csvreader/parser_std' # best practices pre-configured out-of-the-box
13
+ require 'csvreader/parser_strict' # flexible (strict - no leading/trailing space triming, blanks, etc.), configure for different formats/dialects
14
+ require 'csvreader/parser_tab'
13
15
  require 'csvreader/parser'
14
16
  require 'csvreader/reader'
17
+ require 'csvreader/reader_hash'
18
+
19
+
20
+
21
+ ## add convenience / shortcut alias
22
+ Csv = CsvReader
23
+ CsvHash = CsvHashReader
24
+
15
25
 
16
26
 
17
27
  puts CsvReader.banner # say hello
data/lib/csvreader/buffer.rb CHANGED
@@ -1,7 +1,12 @@
1
1
  # encoding: utf-8
2
2
 
3
3
  class CsvReader
4
- class BufferIO ## todo: find a better name - why? why not? is really just for reading (keep io?)
4
+ class Buffer ## todo: find a better name:
5
+ ## BufferedReader
6
+ ## BufferedInput
7
+ ## BufferI
8
+ ## - why? why not? is really just for reading (keep io?)
9
+
5
10
  def initialize( data )
6
11
  # create the IO object we will read from
7
12
  @io = data.is_a?(String) ? StringIO.new(data) : data
@@ -20,7 +25,7 @@ class BufferIO ## todo: find a better name - why? why not? is really just for
20
25
 
21
26
  def peek
22
27
  if @buf.size == 0 && @io.eof?
23
- puts "peek - hitting eof!!!"
28
+ ## puts "peek - hitting eof!!!"
24
29
  return "\0" ## return NUL char (0) for now
25
30
  end
26
31
 
@@ -33,5 +38,5 @@ class BufferIO ## todo: find a better name - why? why not? is really just for
33
38
  @buf.first
34
39
  end # method peek
35
40
 
36
- end # class BufferIO
41
+ end # class Buffer
37
42
  end # class CsvReader
data/lib/csvreader/parser.rb CHANGED
@@ -2,363 +2,74 @@
2
2
 
3
3
  class CsvReader
4
4
 
5
-
6
-
7
-
8
-
9
5
  class Parser
10
6
 
7
+ ## use/allow different "backends" e.g. ParserStd, ParserStrict, ParserTab, etc.
8
+ ## parser must support parse method (with and without block)
9
+ ## e.g. records = parse( data )
10
+ ## -or-
11
+ ## parse( data ) do |record|
12
+ ## end
11
13
 
12
- ## char constants
13
- DOUBLE_QUOTE = "\""
14
- BACKSLASH = "\\" ## use BACKSLASH_ESCAPE ??
15
- COMMENT = "#" ## use COMMENT_HASH or HASH or ??
16
- SPACE = " " ## \s == ASCII 32 (dec) = (Space)
17
- TAB = "\t" ## \t == ASCII 0x09 (hex) = HT (Tab/horizontal tab)
18
- LF = "\n" ## \n == ASCII 0x0A (hex) 10 (dec) = LF (Newline/line feed)
19
- CR = "\r" ## \r == ASCII 0x0D (hex) 13 (dec) = CR (Carriage return)
20
-
21
-
22
- ###################################
23
- ## add simple logger with debug flag/switch
24
- #
25
- # use Parser.debug = true # to turn on
26
- #
27
- # todo/fix: use logutils instead of std logger - why? why not?
28
-
29
- def self.logger() @@logger ||= Logger.new( STDOUT ); end
30
- def logger() self.class.logger; end
31
-
32
-
33
-
34
- attr_reader :config ## todo/fix: change config to proper dialect class/struct - why? why not?
35
-
36
- def initialize( sep: ',',
37
- quote: DOUBLE_QUOTE, ## note: set to nil for no quote
38
- doublequote: true,
39
- escape: BACKSLASH, ## note: set to nil for no escapes
40
- trim: true, ## note: will toggle between human/default and strict mode parser!!!
41
- na: ['\N', 'NA'], ## note: set to nil for no null vales / not availabe (na)
42
- quoted_empty: '', ## note: only available in strict mode (e.g. trim=false)
43
- unquoted_empty: '' ## note: only available in strict mode (e.g. trim=false)
44
- )
45
- @config = {} ## todo/fix: change config to proper dialect class/struct - why? why not?
46
- @config[:sep] = sep
47
- @config[:quote] = quote
48
- @config[:doublequote] = doublequote
49
- @config[:escape] = escape
50
- @config[:trim] = trim
51
- @config[:na] = na
52
- @config[:quoted_empty] = quoted_empty
53
- @config[:unquoted_empty] = unquoted_empty
54
- end
55
-
56
-
57
-
58
- def strict?
59
- ## note: use trim for separating two different parsers / code paths:
60
- ## - human with trim leading and trailing whitespace and
61
- ## - strict with no leading and trailing whitespaces allowed
62
-
63
- ## for now use - trim == false for strict version flag alias
64
- ## todo/fix: add strict flag - why? why not?
65
- @config[:trim] ? false : true
66
- end
67
-
68
-
69
- DEFAULT = new( sep: ',', trim: true )
70
- RFC4180 = new( sep: ',', trim: false )
71
- EXCEL = new( sep: ',', trim: false )
72
-
73
- def self.default() DEFAULT; end ## alternative alias for DEFAULT
74
- def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
75
- def self.excel() EXCEL; end ## alternative alias for EXCEL
76
-
77
-
78
-
79
-
80
- def parse_field( io, sep: )
81
- logger.debug "parse field - sep: >#{sep}< (#{sep.ord})" if logger.debug?
82
-
83
- value = ""
84
- skip_spaces( io ) ## strip leading spaces
85
-
86
- if (c=io.peek; c=="," || c==LF || c==CR || io.eof?) ## empty field
87
- ## return value; do nothing
88
- elsif io.peek == DOUBLE_QUOTE
89
- logger.debug "start double_quote field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
90
- io.getc ## eat-up double_quote
91
14
 
92
- loop do
93
- while (c=io.peek; !(c==DOUBLE_QUOTE || io.eof?))
94
- value << io.getc ## eat-up everything unit quote (")
95
- end
15
+ DEFAULT = ParserStd.new
96
16
 
97
- break if io.eof?
17
+ RFC4180 = ParserStrict.new
18
+ STRICT = ParserStrict.new ## note: make strict its own instance (so you can change config without "breaking" rfc4180)
19
+ EXCEL = ParserStrict.new ## note: make excel its own instance (so you can change configs without "breaking" rfc4180/strict)
98
20
 
99
- io.getc ## eat-up double_quote
21
+ MYSQL = ParserStrict.new( sep: "\t",
22
+ quote: false,
23
+ escape: true,
24
+ null: "\\N" )
100
25
 
101
- if io.peek == DOUBLE_QUOTE ## doubled up quote?
102
- value << io.getc ## add doube quote and continue!!!!
103
- else
104
- break
105
- end
106
- end
26
+ POSTGRES = POSTGRESQL = ParserStrict.new( doublequote: false,
27
+ escape: true,
28
+ unquoted_empty_null: true )
107
29
 
108
- ## note: always eat-up all trailing spaces (" ") and tabs (\t)
109
- skip_spaces( io )
110
- logger.debug "end double_quote field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
111
- else
112
- logger.debug "start reg field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
113
- ## consume simple value
114
- ## until we hit "," or "\n" or "\r"
115
- ## note: will eat-up quotes too!!!
116
- while (c=io.peek; !(c=="," || c==LF || c==CR || io.eof?))
117
- logger.debug " add char >#{io.peek}< (#{io.peek.ord})" if logger.debug?
118
- value << io.getc ## eat-up all spaces (" ") and tabs (\t)
119
- end
120
- value = value.strip ## strip all trailing spaces
121
- logger.debug "end reg field - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
122
- end
123
-
124
- value
125
- end
126
-
127
-
128
-
129
-
130
- def parse_field_strict( io, sep: )
131
- logger.debug "parse field (strict) - sep: >#{sep}< (#{sep.ord})" if logger.debug?
132
-
133
- value = ""
134
-
135
- if (c=io.peek; c==sep || c==LF || c==CR || io.eof?) ## empty unquoted field
136
- value = config[:unquoted_empty] ## defaults to "" (might be set to nil if needed)
137
- ## return value; do nothing
138
- elsif config[:quote] && io.peek == config[:quote]
139
- logger.debug "start quote field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
140
- io.getc ## eat-up double_quote
141
-
142
- loop do
143
- while (c=io.peek; !(c==config[:quote] || io.eof?))
144
- value << io.getc ## eat-up everything unit quote (")
145
- end
146
-
147
- break if io.eof?
148
-
149
- io.getc ## eat-up double_quote
150
-
151
- if config[:doublequote] && io.peek == config[:quote] ## doubled up quote?
152
- value << io.getc ## add doube quote and continue!!!!
153
- else
154
- break
155
- end
156
- end
157
-
158
- value = config[:quoted_empty] if value == "" ## defaults to "" (might be set to nil if needed)
159
-
160
- logger.debug "end double_quote field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
161
- else
162
- logger.debug "start reg field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
163
- ## consume simple value
164
- ## until we hit "," or "\n" or "\r" or stroy "\"" double quote
165
- while (c=io.peek; !(c==sep || c==LF || c==CR || c==config[:quote] || io.eof?))
166
- logger.debug " add char >#{io.peek}< (#{io.peek.ord})" if logger.debug?
167
- value << io.getc
168
- end
169
- logger.debug "end reg field (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
170
- end
171
-
172
- value
173
- end
174
-
175
-
176
-
177
- def parse_record( io, sep: )
178
- values = []
179
-
180
- loop do
181
- value = parse_field( io, sep: sep )
182
- logger.debug "value: »#{value}«" if logger.debug?
183
- values << value
184
-
185
- if io.eof?
186
- break
187
- elsif (c=io.peek; c==LF || c==CR)
188
- skip_newlines( io )
189
- break
190
- elsif io.peek == ","
191
- io.getc ## eat-up FS(,)
192
- else
193
- puts "*** csv parse error: found >#{io.peek} (#{io.peek.ord})< - FS (,) or RS (\\n) expected!!!!"
194
- exit(1)
195
- end
196
- end
197
-
198
- values
199
- end
30
+ POSTGRES_TEXT = POSTGRESQL_TEXT = ParserStrict.new( sep: "\t",
31
+ quote: false,
32
+ escape: true,
33
+ null: "\\N" )
200
34
 
35
+ TAB = ParserTab.new
201
36
 
202
37
 
203
- def parse_record_strict( io, sep: )
204
- values = []
38
+ def self.default() DEFAULT; end ## alternative alias for DEFAULT
39
+ def self.strict() STRICT; end ## alternative alias for STRICT
40
+ def self.rfc4180() RFC4180; end ## alternative alias for RFC4180
41
+ def self.excel() EXCEL; end ## alternative alias for EXCEL
42
+ def self.mysql() MYSQL; end
43
+ def self.postgresql() POSTGRESQL; end
44
+ def self.postgres() postgresql; end
45
+ def self.postgresql_text() POSTGRESQL_TEXT; end
46
+ def self.postgres_text() postgresql_text; end
47
+ def self.tab() TAB; end
205
48
 
206
- loop do
207
- value = parse_field_strict( io, sep: sep )
208
- logger.debug "value: »#{value}«" if logger.debug?
209
- values << value
210
-
211
- if io.eof?
212
- break
213
- elsif (c=io.peek; c==LF || c==CR)
214
- skip_newline( io ) ## note: singular / single newline only (NOT plural)
215
- break
216
- elsif io.peek == sep
217
- io.getc ## eat-up FS (,)
218
- else
219
- puts "*** csv parse error (strict): found >#{io.peek} (#{io.peek.ord})< - FS (,) or RS (\\n) expected!!!!"
220
- exit(1)
221
- end
222
- end
223
-
224
- values
225
- end
226
-
227
-
228
-
229
- def skip_newlines( io )
230
- return if io.eof?
49
+ end # class Parser
231
50
 
232
- while (c=io.peek; c==LF || c==CR)
233
- io.getc ## eat-up all \n and \r
234
- end
235
- end
236
51
 
237
52
 
238
- def skip_newline( io ) ## note: singular (strict) version
239
- return if io.eof?
53
+ ####################################
54
+ # define errors / exceptions
55
+ # for all parsers for (re)use
240
56
 
241
- ## only skip CR LF or LF or CR
242
- if io.peek == CR
243
- io.getc ## eat-up
244
- io.getc if io.peek == LF
245
- elsif io.peek == LF
246
- io.getc ## eat-up
247
- else
248
- # do nothing
249
- end
57
+ class Error < StandardError
250
58
  end
251
59
 
60
+ ####
61
+ # todo/check:
62
+ # use "common" error class - why? why not?
252
63
 
64
+ class ParseError < Error
65
+ attr_reader :message
253
66
 
254
- def skip_until_eol( io )
255
- return if io.eof?
256
-
257
- while (c=io.peek; !(c==LF || c==CR || io.eof?))
258
- io.getc ## eat-up all until end of line
67
+ def initialize( message )
68
+ @message = message
259
69
  end
260
- end
261
70
 
262
- def skip_spaces( io )
263
- return if io.eof?
264
-
265
- while (c=io.peek; c==SPACE || c==TAB)
266
- io.getc ## note: always eat-up all spaces (" ") and tabs (\t)
71
+ def to_s
72
+ "*** csv parse error: #{@message}"
267
73
  end
268
- end
269
-
270
-
271
-
272
-
273
-
274
-
275
- def parse_lines_human( io, sep:, &block )
276
-
277
- loop do
278
- break if io.eof?
279
-
280
- skip_spaces( io )
281
-
282
- if io.peek == COMMENT ## comment line
283
- logger.debug "skipping comment - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
284
- skip_until_eol( io )
285
- skip_newlines( io )
286
- elsif (c=io.peek; c==LF || c==CR || io.eof?)
287
- logger.debug "skipping blank - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
288
- skip_newlines( io )
289
- else
290
- logger.debug "start record - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
291
-
292
- record = parse_record( io, sep: sep )
293
- ## note: requires block - enforce? how? why? why not?
294
- block.call( record ) ## yield( record )
295
- end
296
- end # loop
297
- end # method parse_lines_human
298
-
299
-
300
-
301
- def parse_lines_strict( io, sep:, &block )
302
-
303
- ## no leading and trailing whitespaces trimmed/stripped
304
- ## no comments skipped
305
- ## no blanks skipped
306
- ## - follows strict rules of
307
- ## note: this csv format is NOT recommended;
308
- ## please, use a format with comments, leading and trailing whitespaces, etc.
309
- ## only added for checking compatibility
310
-
311
- loop do
312
- break if io.eof?
313
-
314
- logger.debug "start record (strict) - peek >#{io.peek}< (#{io.peek.ord})" if logger.debug?
315
-
316
- record = parse_record_strict( io, sep: sep )
317
-
318
- ## note: requires block - enforce? how? why? why not?
319
- block.call( record ) ## yield( record )
320
- end # loop
321
- end # method parse_lines_strict
322
-
323
-
324
-
325
- def parse_lines( io_maybe, sep: config[:sep], &block )
326
- ## find a better name for io_maybe
327
- ## make sure io is a wrapped into BufferIO!!!!!!
328
- if io_maybe.is_a?( BufferIO ) ### allow (re)use of BufferIO if managed from "outside"
329
- io = io_maybe
330
- else
331
- io = BufferIO.new( io_maybe )
332
- end
333
-
334
- if strict?
335
- parse_lines_strict( io, sep: sep, &block )
336
- else
337
- parse_lines_human( io, sep: sep, &block )
338
- end
339
- end ## parse_lines
340
-
341
-
342
-
343
- ## fix: add optional block - lets you use it like foreach!!!
344
- ## make foreach an alias of parse with block - why? why not?
345
- ##
346
- ## unifiy with (make one) parse and parse_lines!!!! - why? why not?
347
-
348
- def parse( io_maybe, sep: config[:sep], limit: nil )
349
- records = []
350
-
351
- parse_lines( io_maybe, sep: sep ) do |record|
352
- records << record
353
-
354
- ## set limit to 1 for processing "single" line (that is, get one record)
355
- break if limit && limit >= records.size
356
- end
357
-
358
- records
359
- end ## method parse
360
-
361
-
362
-
363
- end # class Parser
74
+ end # class ParseError
364
75
  end # class CsvReader