fastercsv 0.1.4 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +16 -0
- data/Rakefile +1 -1
- data/TODO +0 -5
- data/lib/faster_csv.rb +655 -40
- data/test/tc_csv_parsing.rb +2 -2
- data/test/tc_data_converters.rb +166 -0
- data/test/tc_features.rb +18 -0
- data/test/tc_headers.rb +125 -0
- data/test/tc_interface.rb +28 -0
- data/test/tc_row.rb +264 -0
- data/test/ts_all.rb +3 -0
- metadata +41 -35
data/CHANGELOG
CHANGED
@@ -2,6 +2,22 @@
|
|
2
2
|
|
3
3
|
Below is a complete listing of changes for each revision of FasterCSV.
|
4
4
|
|
5
|
+
== 0.1.6
|
6
|
+
|
7
|
+
* Began using a forked development/stable versioning system.
|
8
|
+
* Reorganized initialization code for easier additions and maintenance.
|
9
|
+
* Added a check for unknown options. Exceptions will now be thrown for them.
|
10
|
+
* Added built-in and custom data converters. Built-in handle numbers and dates.
|
11
|
+
* Added Array#to_csv and String#parse_csv. Both accept normal options.
|
12
|
+
* Project moved to RubyForge Subversion.
|
13
|
+
* Added auto-discovery for <tt>:row_sep</tt> (now the default).
|
14
|
+
* Added FasterCSV::filter() for easy Unix-like CSV filters.
|
15
|
+
* Added support for accessing fields by headers.
|
16
|
+
* Headers can have their own converters.
|
17
|
+
* Headers can be skipped or returned as needed.
|
18
|
+
* FasterCSV::Row allows index or header access while retaining order and
|
19
|
+
allowing for duplicate headers.
|
20
|
+
|
5
21
|
== 0.1.4
|
6
22
|
|
7
23
|
* Fixed <tt>:col_sep</tt> escaping bug (reported by Kev Jackson).
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -3,10 +3,6 @@
|
|
3
3
|
The following is a list of planned expansions for FasterCSV, in no particular
|
4
4
|
order.
|
5
5
|
|
6
|
-
* Add support for accessing fields by headers (from first row of document).
|
7
|
-
* Add "convertors" for switching numbers to Integers or Floats, dates to Date or
|
8
|
-
Time objects, etc.
|
9
|
-
* Add to_csv().
|
10
6
|
* Find a good headers solution for data like this:
|
11
7
|
"Experiment ID: 1",,,,,,,,,,,,
|
12
8
|
"Subject ID: 1013938829432171e868c340.
|
@@ -31,5 +27,4 @@ order.
|
|
31
27
|
### and a new block starts
|
32
28
|
"Experiment ID: 3",,,,,,,,,,,,0.92
|
33
29
|
....
|
34
|
-
* Add FasterCSV.filter().
|
35
30
|
* Add calculated fields.
|
data/lib/faster_csv.rb
CHANGED
@@ -9,6 +9,8 @@
|
|
9
9
|
|
10
10
|
require "stringio"
|
11
11
|
require "forwardable"
|
12
|
+
require "enumerator"
|
13
|
+
require "date"
|
12
14
|
|
13
15
|
#
|
14
16
|
# This class provides a complete interface to CSV files and data. It offers
|
@@ -61,20 +63,374 @@ require "forwardable"
|
|
61
63
|
#
|
62
64
|
# == Convert a Single Line
|
63
65
|
#
|
64
|
-
# csv_string =
|
65
|
-
# csv_array =
|
66
|
+
# csv_string = ["CSV", "data"].to_csv # to CSV
|
67
|
+
# csv_array = "CSV,String".parse_csv # from CSV
|
66
68
|
#
|
67
69
|
class FasterCSV
|
70
|
+
#
|
71
|
+
# A FasterCSV::Row is part Array and part Hash. It retains an order for the
|
72
|
+
# fields and allows duplicates just as an Array would, but also allows you to
|
73
|
+
# access fields by name just as you could if they were in a Hash.
|
74
|
+
#
|
75
|
+
# All rows returned by FasterCSV will be constructed from this class, if
|
76
|
+
# header row processing is activated.
|
77
|
+
#
|
78
|
+
class Row
|
79
|
+
#
|
80
|
+
# Construct a new FasterCSV::Row from +headers+ and +fields+, which are
|
81
|
+
# expected to be Arrays. If one Array is shorter than the other, it will be
|
82
|
+
# padded with +nil+ objects.
|
83
|
+
#
|
84
|
+
def initialize( headers, fields )
|
85
|
+
# handle extra headers or fields
|
86
|
+
@row = if headers.size > fields.size
|
87
|
+
headers.zip(fields)
|
88
|
+
else
|
89
|
+
fields.zip(headers).map { |pair| pair.reverse }
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Returns the headers of this row.
|
94
|
+
def headers
|
95
|
+
@row.map { |pair| pair.first }
|
96
|
+
end
|
97
|
+
|
98
|
+
#
|
99
|
+
# :call-seq:
|
100
|
+
# field( header )
|
101
|
+
# field( header, offset )
|
102
|
+
# field( index )
|
103
|
+
#
|
104
|
+
# This method will fetch the field value by +header+ or +index+. If a field
|
105
|
+
# is not found, +nil+ is returned.
|
106
|
+
#
|
107
|
+
# When provided, +offset+ ensures that a header match occurs on or later
|
108
|
+
# than the +offset+ index. You can use this to find duplicate headers,
|
109
|
+
# without resorting to hard-coding exact indices.
|
110
|
+
#
|
111
|
+
def field( header_or_index, minimum_index = 0 )
|
112
|
+
# locate the pair
|
113
|
+
finder = header_or_index.is_a?(Integer) ? :[] : :assoc
|
114
|
+
pair = @row[minimum_index..-1].send(finder, header_or_index)
|
115
|
+
|
116
|
+
# return the field if we have a pair
|
117
|
+
pair.nil? ? nil : pair.last
|
118
|
+
end
|
119
|
+
alias_method :[], :field
|
120
|
+
|
121
|
+
#
|
122
|
+
# :call-seq:
|
123
|
+
# []=( header, value )
|
124
|
+
# []=( header, offset, value )
|
125
|
+
# []=( index, value )
|
126
|
+
#
|
127
|
+
# Looks up the field by the semantics described in FasterCSV::Row.field()
|
128
|
+
# and assigns the +value+.
|
129
|
+
#
|
130
|
+
# Assigning past the end of the row with an index will set all pairs between
|
131
|
+
# to <tt>[nil, nil]</tt>. Assigning to an unused header appends the new
|
132
|
+
# pair.
|
133
|
+
#
|
134
|
+
def []=( *args )
|
135
|
+
value = args.pop
|
136
|
+
|
137
|
+
if args.first.is_a? Integer
|
138
|
+
if @row[args.first].nil? # extending past the end with index
|
139
|
+
@row[args.first] = [nil, value]
|
140
|
+
@row.map! { |pair| pair.nil? ? [nil, nil] : pair }
|
141
|
+
else # normal index assignment
|
142
|
+
@row[args.first][1] = value
|
143
|
+
end
|
144
|
+
else
|
145
|
+
index = index(*args)
|
146
|
+
if index.nil? # appending a field
|
147
|
+
self << [args.first, value]
|
148
|
+
else # normal header assignment
|
149
|
+
@row[index][1] = value
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
#
|
155
|
+
# :call-seq:
|
156
|
+
# <<( field )
|
157
|
+
# <<( header_and_field_array )
|
158
|
+
# <<( header_and_field_hash )
|
159
|
+
#
|
160
|
+
# If a two-element Array is provided, it is assumed to be a header and field
|
161
|
+
# and the pair is appended. A Hash works the same way with the key being
|
162
|
+
# the header and the value being the field. Anything else is assumed to be
|
163
|
+
# a lone field which is appended with a +nil+ header.
|
164
|
+
#
|
165
|
+
# This method returns the row for chaining.
|
166
|
+
#
|
167
|
+
def <<( arg )
|
168
|
+
if arg.is_a?(Array) and arg.size == 2 # appending a header and name
|
169
|
+
@row << arg
|
170
|
+
elsif arg.is_a?(Hash) # append header and name pairs
|
171
|
+
arg.each { |pair| @row << pair }
|
172
|
+
else # append field value
|
173
|
+
@row << [nil, arg]
|
174
|
+
end
|
175
|
+
|
176
|
+
self # for chaining
|
177
|
+
end
|
178
|
+
|
179
|
+
#
|
180
|
+
# A shortcut for appending multiple fields. Equivalent to:
|
181
|
+
#
|
182
|
+
# args.each { |arg| faster_csv_row << arg }
|
183
|
+
#
|
184
|
+
# This method returns the row for chaining.
|
185
|
+
#
|
186
|
+
def push( *args )
|
187
|
+
args.each { |arg| self << arg }
|
188
|
+
|
189
|
+
self # for chaining
|
190
|
+
end
|
191
|
+
|
192
|
+
#
|
193
|
+
# :call-seq:
|
194
|
+
# delete( header )
|
195
|
+
# delete( header, offset )
|
196
|
+
# delete( index )
|
197
|
+
#
|
198
|
+
# Used to remove a pair from the row by +header+ or +index+. The pair is
|
199
|
+
# located as described in FasterCSV::Row.field(). The deleted pair is
|
200
|
+
# returned, or +nil+ if a pair could not be found.
|
201
|
+
#
|
202
|
+
def delete( header_or_index, minimum_index = 0 )
|
203
|
+
if header_or_index.is_a? Integer # by index
|
204
|
+
@row.delete_at(header_or_index)
|
205
|
+
else # by header
|
206
|
+
@row.delete_at(index(header_or_index, minimum_index))
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
#
|
211
|
+
# The provided +block+ is passed a header and field for each pair in the row
|
212
|
+
# and expected to return +true+ or +false+, depending on whether the pair
|
213
|
+
# should be deleted.
|
214
|
+
#
|
215
|
+
# This method returns the row for chaining.
|
216
|
+
#
|
217
|
+
def delete_if( &block )
|
218
|
+
@row.delete_if(&block)
|
219
|
+
|
220
|
+
self # for chaining
|
221
|
+
end
|
222
|
+
|
223
|
+
#
|
224
|
+
# This method accepts any number of arguments which can be headers, indices,
|
225
|
+
# or two-element Arrays containing a header and offset. Each argument will
|
226
|
+
# be replaced with a field lookup as described in FasterCSV::Row.field().
|
227
|
+
#
|
228
|
+
# If called with no arguments, all fields are returned.
|
229
|
+
#
|
230
|
+
def fields( *headers_and_or_indices )
|
231
|
+
if headers_and_or_indices.empty? # return all fields--no arguments
|
232
|
+
@row.map { |pair| pair.last }
|
233
|
+
else # or work like values_at()
|
234
|
+
headers_and_or_indices.map { |h_or_i| field(*Array(h_or_i)) }
|
235
|
+
end
|
236
|
+
end
|
237
|
+
alias_method :values_at, :fields
|
238
|
+
|
239
|
+
#
|
240
|
+
# :call-seq:
|
241
|
+
# index( header )
|
242
|
+
# index( header, offset )
|
243
|
+
#
|
244
|
+
# This method will return the index of a field with the provided +header+.
|
245
|
+
# The +offset+ can be used to locate duplicate header names, as described in
|
246
|
+
# FasterCSV::Row.field().
|
247
|
+
#
|
248
|
+
def index( header, minimum_index = 0 )
|
249
|
+
# find the pair
|
250
|
+
index = headers[minimum_index..-1].index(header)
|
251
|
+
# return the index at the right offset, if we found one
|
252
|
+
index.nil? ? nil : index + minimum_index
|
253
|
+
end
|
254
|
+
|
255
|
+
# Returns +true+ if +name+ is a header for this row, and +false+ otherwise.
|
256
|
+
def header?( name )
|
257
|
+
headers.include? name
|
258
|
+
end
|
259
|
+
alias_method :include?, :header?
|
260
|
+
|
261
|
+
#
|
262
|
+
# Returns +true+ if +data+ matches a field in this row, and +false+
|
263
|
+
# otherwise.
|
264
|
+
#
|
265
|
+
def field?( data )
|
266
|
+
fields.include? data
|
267
|
+
end
|
268
|
+
|
269
|
+
include Enumerable
|
270
|
+
|
271
|
+
#
|
272
|
+
# Yields each pair of the row as header and field tuples (much like
|
273
|
+
# iterating over a Hash).
|
274
|
+
#
|
275
|
+
# Support for Enumerable.
|
276
|
+
#
|
277
|
+
# This method returns the row for chaining.
|
278
|
+
#
|
279
|
+
def each( &block )
|
280
|
+
@row.each(&block)
|
281
|
+
|
282
|
+
self # for chaining
|
283
|
+
end
|
284
|
+
|
285
|
+
#
|
286
|
+
# Collapses the row into a simple Hash. Be warned that this discards field
|
287
|
+
# order and clobbers duplicate fields.
|
288
|
+
#
|
289
|
+
def to_hash
|
290
|
+
# flatten just one level of the internal Array
|
291
|
+
Hash[*@row.inject(Array.new) { |ary, pair| ary.push(*pair) }]
|
292
|
+
end
|
293
|
+
|
294
|
+
#
|
295
|
+
# Returns the row as a CSV String. Headers are not used. Equivalent to:
|
296
|
+
#
|
297
|
+
# faster_csv_row.fields.to_csv( options )
|
298
|
+
#
|
299
|
+
def to_csv( options = Hash.new )
|
300
|
+
fields.to_csv(options)
|
301
|
+
end
|
302
|
+
alias_method :to_s, :to_csv
|
303
|
+
end
|
304
|
+
|
68
305
|
# The error thrown when the parser encounters illegal CSV formatting.
|
69
306
|
class MalformedCSVError < RuntimeError; end
|
70
307
|
|
308
|
+
#
|
309
|
+
# A FieldInfo Struct contains details about a field's position in the data
|
310
|
+
# source it was read from. FasterCSV will pass this Struct to some blocks
|
311
|
+
# that make decisions based on field structure. See
|
312
|
+
# FasterCSV.convert_fields() for an example.
|
313
|
+
#
|
314
|
+
# <b><tt>index</tt></b>:: The zero-based index of the field in its row.
|
315
|
+
# <b><tt>line</tt></b>:: The line of the data source this row is from.
|
316
|
+
#
|
317
|
+
FieldInfo = Struct.new(:index, :line)
|
318
|
+
|
319
|
+
#
|
320
|
+
# This Hash holds the built-in converters of FasterCSV that can be accessed by
|
321
|
+
# name. You can select Converters with FasterCSV.convert() or through the
|
322
|
+
# +options+ Hash passed to FasterCSV::new().
|
323
|
+
#
|
324
|
+
# <b><tt>:integer</tt></b>:: Converts any field Integer() accepts.
|
325
|
+
# <b><tt>:float</tt></b>:: Converts any field Float() accepts.
|
326
|
+
# <b><tt>:numeric</tt></b>:: A combination of <tt>:integer</tt>
|
327
|
+
# and <tt>:float</tt>.
|
328
|
+
# <b><tt>:date</tt></b>:: Converts any field Date::parse() accepts.
|
329
|
+
# <b><tt>:date_time</tt></b>:: Converts any field DateTime::parse() accepts.
|
330
|
+
# <b><tt>:all</tt></b>:: All built-in converters. A combination of
|
331
|
+
# <tt>:date_time</tt> and <tt>:numeric</tt>.
|
332
|
+
#
|
333
|
+
# This Hash is intentionally left unfrozen and users should feel free to add
|
334
|
+
# values to it that can be accessed by all FasterCSV objects.
|
335
|
+
#
|
336
|
+
# To add a combo field, the value should be an Array of names. Combo fields
|
337
|
+
# can be nested with other combo fields.
|
338
|
+
#
|
339
|
+
Converters = { :integer => lambda { |f| Integer(f) rescue f },
|
340
|
+
:float => lambda { |f| Float(f) rescue f },
|
341
|
+
:numeric => [:integer, :float],
|
342
|
+
:date => lambda { |f| Date.parse(f) rescue f },
|
343
|
+
:date_time => lambda { |f| DateTime.parse(f) rescue f },
|
344
|
+
:all => [:date_time, :numeric] }
|
345
|
+
|
346
|
+
#
|
347
|
+
# This Hash holds the built-in header converters of FasterCSV that can be
|
348
|
+
# accessed by name. You can select HeaderConverters with
|
349
|
+
# FasterCSV.header_convert() or through the +options+ Hash passed to
|
350
|
+
# FasterCSV::new().
|
351
|
+
#
|
352
|
+
# <b><tt>:downcase</tt></b>:: Calls downcase() on the header String.
|
353
|
+
# <b><tt>:symbol</tt></b>:: The header String is downcased, spaces are
|
354
|
+
# replaced with underscores, non-word characters
|
355
|
+
# are dropped, and finally to_sym() is called.
|
356
|
+
#
|
357
|
+
# This Hash is intentionally left unfrozen and users should feel free to add
|
358
|
+
# values to it that can be accessed by all FasterCSV objects.
|
359
|
+
#
|
360
|
+
# To add a combo field, the value should be an Array of names. Combo fields
|
361
|
+
# can be nested with other combo fields.
|
362
|
+
#
|
363
|
+
HeaderConverters = {
|
364
|
+
:downcase => lambda { |h| h.downcase },
|
365
|
+
:symbol => lambda { |h|
|
366
|
+
h.downcase.tr(" ", "_").delete("^a-z0-9_").to_sym
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
71
370
|
#
|
72
371
|
# The options used when no overrides are given by calling code. They are:
|
73
372
|
#
|
74
|
-
# <b><tt>:col_sep</tt></b>::
|
75
|
-
# <b><tt>:row_sep</tt></b>::
|
373
|
+
# <b><tt>:col_sep</tt></b>:: <tt>","</tt>
|
374
|
+
# <b><tt>:row_sep</tt></b>:: <tt>:auto</tt>
|
375
|
+
# <b><tt>:converters</tt></b>:: +nil+
|
376
|
+
# <b><tt>:headers</tt></b>:: +false+
|
377
|
+
# <b><tt>:return_headers</tt></b>:: +false+
|
378
|
+
# <b><tt>:header_converters</tt></b>:: +nil+
|
379
|
+
#
|
380
|
+
DEFAULT_OPTIONS = { :col_sep => ",",
|
381
|
+
:row_sep => :auto,
|
382
|
+
:converters => nil,
|
383
|
+
:headers => false,
|
384
|
+
:return_headers => false,
|
385
|
+
:header_converters => nil }.freeze
|
386
|
+
|
387
|
+
#
|
388
|
+
# :call-seq:
|
389
|
+
# filter( options = Hash.new ) { |row| ... }
|
390
|
+
# filter( input, options = Hash.new ) { |row| ... }
|
391
|
+
# filter( input, output, options = Hash.new ) { |row| ... }
|
392
|
+
#
|
393
|
+
# This method is a convenience for building Unix-like filters for CSV data.
|
394
|
+
# Each row is yielded to the provided block which can alter it as needed.
|
395
|
+
# After the block returns, the row is appended to _output_ altered or not.
|
396
|
+
#
|
397
|
+
# The _input_ and _output_ arguments can be anything FasterCSV::new() accepts
|
398
|
+
# (generally String or IO objects). If not given, they default to
|
399
|
+
# <tt>$stdin</tt> and <tt>$stdout</tt>.
|
76
400
|
#
|
77
|
-
|
401
|
+
# The _options_ parameter is also filtered down to FasterCSV::new() after some
|
402
|
+
# clever key parsing. Any key beginning with <tt>:in_</tt> or
|
403
|
+
# <tt>:input_</tt> will have that leading identifier stripped and will only
|
404
|
+
# be used in the _options_ Hash for the _input_ object. Keys starting with
|
405
|
+
# <tt>:out_</tt> or <tt>:output_</tt> affect only _output_. All other keys
|
406
|
+
# are assigned to both objects.
|
407
|
+
#
|
408
|
+
def self.filter( *args )
|
409
|
+
# parse options for input, output, or both
|
410
|
+
input_options, output_options = Hash.new, Hash.new
|
411
|
+
if args.last.is_a? Hash
|
412
|
+
args.pop.each do |key, value|
|
413
|
+
case key.to_s
|
414
|
+
when /\Ain(?:put)?_(.+)\Z/
|
415
|
+
input_options[$1.to_sym] = value
|
416
|
+
when /\Aout(?:put)?_(.+)\Z/
|
417
|
+
output_options[$1.to_sym] = value
|
418
|
+
else
|
419
|
+
input_options[key] = value
|
420
|
+
output_options[key] = value
|
421
|
+
end
|
422
|
+
end
|
423
|
+
end
|
424
|
+
# build input and output wrappers
|
425
|
+
input = FasterCSV.new(args.shift || $stdin, input_options)
|
426
|
+
output = FasterCSV.new(args.shift || $stdout, output_options)
|
427
|
+
|
428
|
+
# read, yield, write
|
429
|
+
input.each do |row|
|
430
|
+
yield row
|
431
|
+
output << row
|
432
|
+
end
|
433
|
+
end
|
78
434
|
|
79
435
|
#
|
80
436
|
# This method is intended as the primary interface for reading CSV files. You
|
@@ -218,19 +574,6 @@ class FasterCSV
|
|
218
574
|
end
|
219
575
|
end
|
220
576
|
|
221
|
-
#
|
222
|
-
# Use to slurp a CSV file into an Array of Arrays. Pass the +path+ to the
|
223
|
-
# file and any +options+ FasterCSV::new() understands.
|
224
|
-
#
|
225
|
-
def self.read( path, options = Hash.new )
|
226
|
-
open(path, options) { |csv| csv.read }
|
227
|
-
end
|
228
|
-
|
229
|
-
# Alias for FasterCSV::read().
|
230
|
-
def self.readlines( path, options = Hash.new )
|
231
|
-
open(path, options) { |csv| csv.readlines }
|
232
|
-
end
|
233
|
-
|
234
577
|
#
|
235
578
|
# This method is a shortcut for converting a single line of a CSV String into
|
236
579
|
# an Array. Note that if +line+ contains multiple rows, anything
|
@@ -242,6 +585,19 @@ class FasterCSV
|
|
242
585
|
new(line, options).shift
|
243
586
|
end
|
244
587
|
|
588
|
+
#
|
589
|
+
# Use to slurp a CSV file into an Array of Arrays. Pass the +path+ to the
|
590
|
+
# file and any +options+ FasterCSV::new() understands.
|
591
|
+
#
|
592
|
+
def self.read( path, options = Hash.new )
|
593
|
+
open(path, options) { |csv| csv.read }
|
594
|
+
end
|
595
|
+
|
596
|
+
# Alias for FasterCSV::read().
|
597
|
+
def self.readlines( *args )
|
598
|
+
read(*args)
|
599
|
+
end
|
600
|
+
|
245
601
|
#
|
246
602
|
# This constructor will wrap either a String or IO object passed in +data+ for
|
247
603
|
# reading and/or writing. In addition to the FasterCSV instance methods,
|
@@ -257,8 +613,43 @@ class FasterCSV
|
|
257
613
|
# You may set any reading and/or writing preferences in the +options+ Hash.
|
258
614
|
# Available options are:
|
259
615
|
#
|
260
|
-
# <b><tt>:col_sep</tt></b>::
|
261
|
-
# <b><tt>:row_sep</tt></b>::
|
616
|
+
# <b><tt>:col_sep</tt></b>:: The String placed between each field.
|
617
|
+
# <b><tt>:row_sep</tt></b>:: The String appended to the end of each
|
618
|
+
# row. This can be set to the special
|
619
|
+
# <tt>:auto</tt> setting, which requests
|
620
|
+
# that FasterCSV automatically discover
|
621
|
+
# this from the data. Auto-discovery
|
622
|
+
# reads ahead in the data looking for
|
623
|
+
# the next <tt>"\r\n"</tt>,
|
624
|
+
# <tt>"\n"</tt>, or <tt>"\r"</tt>
|
625
|
+
# sequence. A sequence will be selected
|
626
|
+
# even if it occurs in a quoted field,
|
627
|
+
# assuming that you would have the same
|
628
|
+
# line endings there. If none of those
|
629
|
+
# sequences is found, the default
|
630
|
+
# <tt>$/</tt> is used. Obviously,
|
631
|
+
# discovery takes a little time. Set
|
632
|
+
# manually if speed is important.
|
633
|
+
# <b><tt>:converters</tt></b>:: An Array of names from the Converters
|
634
|
+
# Hash and/or lambdas that handle custom
|
635
|
+
# conversion. A single converter
|
636
|
+
# doesn't have to be in an Array.
|
637
|
+
# <b><tt>:headers</tt></b>:: If set to <tt>:first_row</tt> or
|
638
|
+
# +true+, the initial row of the CSV
|
639
|
+
# file will be treated as a row of
|
640
|
+
# headers. This setting causes
|
641
|
+
# FasterCSV.shift() to return rows as
|
642
|
+
# FasterCSV::Row objects instead of
|
643
|
+
# Arrays.
|
644
|
+
# <b><tt>:return_headers</tt></b>:: When +false+, header rows are silently
|
645
|
+
# swallowed. If set to +true+, header
|
646
|
+
# rows are returned in a FasterCSV::Row
|
647
|
+
# object with identical headers and
|
648
|
+
# fields.
|
649
|
+
# <b><tt>:header_converters</tt></b>:: Identical in functionality to
|
650
|
+
# <tt>:converters</tt> save that the
|
651
|
+
# conversions are only made to header
|
652
|
+
# rows.
|
262
653
|
#
|
263
654
|
# See FasterCSV::DEFAULT_OPTIONS for the default settings.
|
264
655
|
#
|
@@ -272,25 +663,14 @@ class FasterCSV
|
|
272
663
|
# create the IO object we will read from
|
273
664
|
@io = if data.is_a? String then StringIO.new(data) else data end
|
274
665
|
|
275
|
-
|
276
|
-
|
277
|
-
|
666
|
+
init_separators(options)
|
667
|
+
init_parsers(options)
|
668
|
+
init_converters(options)
|
669
|
+
init_headers(options)
|
278
670
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
/\A#{Regexp.escape(@col_sep)}+/, # for empty leading fields
|
283
|
-
:csv_row =>
|
284
|
-
### The Primary Parser ###
|
285
|
-
/ \G(?:^|#{Regexp.escape(@col_sep)}) # anchor the match
|
286
|
-
(?: "((?>[^"]*)(?>""[^"]*)*)" # find quoted fields
|
287
|
-
| # ... or ...
|
288
|
-
([^"#{Regexp.escape(@col_sep)}]*) # unquoted fields
|
289
|
-
)/x,
|
290
|
-
### End Primary Parser ###
|
291
|
-
:line_end =>
|
292
|
-
/#{Regexp.escape(@row_sep)}\Z/ # safer than chomp!()
|
293
|
-
}
|
671
|
+
unless options.empty?
|
672
|
+
raise ArgumentError, "Unknown options: #{options.keys.join(', ')}."
|
673
|
+
end
|
294
674
|
end
|
295
675
|
|
296
676
|
### IO and StringIO Delegation ###
|
@@ -330,6 +710,43 @@ class FasterCSV
|
|
330
710
|
alias_method :add_row, :<<
|
331
711
|
alias_method :puts, :<<
|
332
712
|
|
713
|
+
#
|
714
|
+
# :call-seq:
|
715
|
+
# convert( name )
|
716
|
+
# convert { |field| ... }
|
717
|
+
# convert { |field, field_info| ... }
|
718
|
+
#
|
719
|
+
# You can use this method to install a FasterCSV::Converters built-in, or
|
720
|
+
# provide a block that handles a custom conversion.
|
721
|
+
#
|
722
|
+
# If you provide a block that takes one argument, it will be passed the field
|
723
|
+
# and is expected to return the converted value or the field itself. If your
|
724
|
+
# block takes two arguments, it will also be passed a FieldInfo Struct,
|
725
|
+
# containing details about the field. Again, the block should return a
|
726
|
+
# converted field or the field itself.
|
727
|
+
#
|
728
|
+
def convert( name = nil, &converter )
|
729
|
+
add_converter(:converters, self.class::Converters, name, &converter)
|
730
|
+
end
|
731
|
+
|
732
|
+
#
|
733
|
+
# :call-seq:
|
734
|
+
# header_convert( name )
|
735
|
+
# header_convert { |field| ... }
|
736
|
+
# header_convert { |field, field_info| ... }
|
737
|
+
#
|
738
|
+
# Identical to FasterCSV.convert(), but for header rows.
|
739
|
+
#
|
740
|
+
# Note that this method must be called before header rows are read to have any
|
741
|
+
# effect.
|
742
|
+
#
|
743
|
+
def header_convert( name = nil, &converter )
|
744
|
+
add_converter( :header_converters,
|
745
|
+
self.class::HeaderConverters,
|
746
|
+
name,
|
747
|
+
&converter )
|
748
|
+
end
|
749
|
+
|
333
750
|
include Enumerable
|
334
751
|
|
335
752
|
#
|
@@ -355,9 +772,15 @@ class FasterCSV
|
|
355
772
|
end
|
356
773
|
alias_method :readlines, :read
|
357
774
|
|
775
|
+
# Returns +true+ if the next row read will be a header row.
|
776
|
+
def header_row?
|
777
|
+
@use_headers and @headers.nil?
|
778
|
+
end
|
779
|
+
|
358
780
|
#
|
359
781
|
# The primary read method for wrapped Strings and IOs, a single row is pulled
|
360
|
-
# from the data source, parsed and returned as an Array of fields
|
782
|
+
# from the data source, parsed and returned as an Array of fields (if header
|
783
|
+
# rows are not used) or a FasterCSV::Row (when header rows are used).
|
361
784
|
#
|
362
785
|
# The data source must be open for reading.
|
363
786
|
#
|
@@ -415,7 +838,16 @@ class FasterCSV
|
|
415
838
|
end
|
416
839
|
|
417
840
|
# if parse is empty?(), we found all the fields on the line...
|
418
|
-
|
841
|
+
if parse.empty?
|
842
|
+
# convert headers or fields if needed...
|
843
|
+
csv = convert_fields(csv) if ( header_row? and
|
844
|
+
not @header_converters.empty? ) or
|
845
|
+
not @converters.empty?
|
846
|
+
# parse out header rows and handle FasterCSV::Row conversions...
|
847
|
+
csv = parse_headers(csv) if @use_headers
|
848
|
+
# return the results
|
849
|
+
break csv
|
850
|
+
end
|
419
851
|
# if we're not empty?() but at eof?(), a quoted field wasn't closed...
|
420
852
|
raise MalformedCSVError, "Unclosed quoted field." if @io.eof?
|
421
853
|
# otherwise, we need to loop and pull some more data to complete the row
|
@@ -423,4 +855,187 @@ class FasterCSV
|
|
423
855
|
end
|
424
856
|
alias_method :gets, :shift
|
425
857
|
alias_method :readline, :shift
|
858
|
+
|
859
|
+
private
|
860
|
+
|
861
|
+
#
|
862
|
+
# Stores the indicated separators for later use.
|
863
|
+
#
|
864
|
+
# If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
|
865
|
+
# ahead in the <tt>@io</tt> and try to find one.
|
866
|
+
#
|
867
|
+
def init_separators( options )
|
868
|
+
# store the selected separators
|
869
|
+
@col_sep = options.delete(:col_sep)
|
870
|
+
@row_sep = options.delete(:row_sep)
|
871
|
+
|
872
|
+
# automatically discover row separator when requested
|
873
|
+
saved_pos = @io.pos # remember where we were
|
874
|
+
while @row_sep == :auto
|
875
|
+
#
|
876
|
+
# if we run out of data, it's probably a single line
|
877
|
+
# (use a sensible default)
|
878
|
+
#
|
879
|
+
if @io.eof?
|
880
|
+
@row_sep = $/
|
881
|
+
break
|
882
|
+
end
|
883
|
+
|
884
|
+
# read ahead a bit
|
885
|
+
sample = @io.read(1024)
|
886
|
+
sample += @io.read(1) if sample[-1..-1] == "\r" and not @io.eof?
|
887
|
+
|
888
|
+
# try to find a standard separator
|
889
|
+
if sample =~ /\r\n?|\n/
|
890
|
+
@row_sep = $&
|
891
|
+
break
|
892
|
+
end
|
893
|
+
end
|
894
|
+
@io.seek(saved_pos) # reset back to the remembered position
|
895
|
+
end
|
896
|
+
|
897
|
+
# Pre-compiles parsers and stores them by name for access during reads.
|
898
|
+
def init_parsers( options )
|
899
|
+
# prebuild Regexps for faster parsing
|
900
|
+
@parsers = {
|
901
|
+
:leading_fields =>
|
902
|
+
/\A#{Regexp.escape(@col_sep)}+/, # for empty leading fields
|
903
|
+
:csv_row =>
|
904
|
+
### The Primary Parser ###
|
905
|
+
/ \G(?:^|#{Regexp.escape(@col_sep)}) # anchor the match
|
906
|
+
(?: "((?>[^"]*)(?>""[^"]*)*)" # find quoted fields
|
907
|
+
| # ... or ...
|
908
|
+
([^"#{Regexp.escape(@col_sep)}]*) # unquoted fields
|
909
|
+
)/x,
|
910
|
+
### End Primary Parser ###
|
911
|
+
:line_end =>
|
912
|
+
/#{Regexp.escape(@row_sep)}\Z/ # safer than chomp!()
|
913
|
+
}
|
914
|
+
end
|
915
|
+
|
916
|
+
#
|
917
|
+
# Loads any converters requested during construction.
|
918
|
+
#
|
919
|
+
# If +field_name+ is set to <tt>:converters</tt> (the default), field converters
|
920
|
+
# are set. When +field_name+ is <tt>:header_converters</tt> header converters
|
921
|
+
# are added instead.
|
922
|
+
#
|
923
|
+
def init_converters( options, field_name = :converters )
|
924
|
+
instance_variable_set("@#{field_name}", Array.new)
|
925
|
+
|
926
|
+
# find the correct method to add the converters
|
927
|
+
convert = method(field_name.to_s.sub(/ers\Z/, ""))
|
928
|
+
|
929
|
+
# load converters
|
930
|
+
unless options[field_name].nil?
|
931
|
+
# allow a single converter not wrapped in an Array
|
932
|
+
unless options[field_name].is_a? Array
|
933
|
+
options[field_name] = [options[field_name]]
|
934
|
+
end
|
935
|
+
# load each converter...
|
936
|
+
options[field_name].each do |converter|
|
937
|
+
if converter.is_a? Proc # custom code block
|
938
|
+
convert.call(&converter)
|
939
|
+
else # by name
|
940
|
+
convert.call(converter)
|
941
|
+
end
|
942
|
+
end
|
943
|
+
end
|
944
|
+
|
945
|
+
options.delete(field_name)
|
946
|
+
end
|
947
|
+
|
948
|
+
# Stores header row settings and loads header converters, if needed.
|
949
|
+
def init_headers( options )
|
950
|
+
@use_headers = options.delete(:headers)
|
951
|
+
@return_headers = options.delete(:return_headers)
|
952
|
+
|
953
|
+
@headers = nil
|
954
|
+
|
955
|
+
init_converters(options, :header_converters)
|
956
|
+
end
|
957
|
+
|
958
|
+
#
|
959
|
+
# The actual work method for adding converters, used by both
|
960
|
+
# FasterCSV.convert() and FasterCSV.header_convert().
|
961
|
+
#
|
962
|
+
# This method requires the +var_name+ of the instance variable to place the
|
963
|
+
# converters in, the +const+ Hash to lookup named converters in, and the
|
964
|
+
# normal parameters of the FasterCSV.convert() and FasterCSV.header_convert()
|
965
|
+
# methods.
|
966
|
+
#
|
967
|
+
def add_converter( var_name, const, name = nil, &converter )
|
968
|
+
if name.nil? # custom converter
|
969
|
+
instance_variable_get("@#{var_name}") << converter
|
970
|
+
else # named converter
|
971
|
+
combo = const[name]
|
972
|
+
case combo
|
973
|
+
when Array # combo converter
|
974
|
+
combo.each do |converter_name|
|
975
|
+
add_converter(var_name, const, converter_name)
|
976
|
+
end
|
977
|
+
else # individual named converter
|
978
|
+
instance_variable_get("@#{var_name}") << combo
|
979
|
+
end
|
980
|
+
end
|
981
|
+
end
|
982
|
+
|
983
|
+
#
|
984
|
+
# Processes +fields+ with <tt>@converters</tt>, returning the converted field
|
985
|
+
# set. Any converter that changes the field into something other than a
|
986
|
+
# String halts the pipeline of conversion for that field. This is primarily
|
987
|
+
# an efficiency shortcut.
|
988
|
+
#
|
989
|
+
def convert_fields( fields )
|
990
|
+
converters = if header_row? # see if we are converting headers or fields
|
991
|
+
@header_converters
|
992
|
+
else
|
993
|
+
@converters
|
994
|
+
end
|
995
|
+
|
996
|
+
fields.enum_for(:each_with_index).map do |field, index| # map_with_index
|
997
|
+
converters.each do |converter|
|
998
|
+
field = if converter.arity == 1 # straight field converter
|
999
|
+
converter[field]
|
1000
|
+
else # FieldInfo converter
|
1001
|
+
converter[field, FieldInfo.new(index, @io.lineno)]
|
1002
|
+
end
|
1003
|
+
break unless field.is_a? String # short-curcuit pipeline for speed
|
1004
|
+
end
|
1005
|
+
field # return final state of each field, converted or original
|
1006
|
+
end
|
1007
|
+
end
|
1008
|
+
|
1009
|
+
#
|
1010
|
+
# This method is used to turn a finished +row+ into a FasterCSV::Row. Header
|
1011
|
+
# rows are also dealt with here, either by returning a FasterCSV::Row with
|
1012
|
+
# identical headers and fields or by reading past them to return a field row.
|
1013
|
+
# Headers are also saved in <tt>@headers</tt> for use in future rows.
|
1014
|
+
#
|
1015
|
+
def parse_headers( row )
|
1016
|
+
if @headers.nil? # header row
|
1017
|
+
@headers = row # save
|
1018
|
+
if @return_headers # return the headers
|
1019
|
+
FasterCSV::Row.new(@headers, @headers)
|
1020
|
+
else # skip to next field row
|
1021
|
+
shift
|
1022
|
+
end
|
1023
|
+
else # field row
|
1024
|
+
FasterCSV::Row.new(@headers, row)
|
1025
|
+
end
|
1026
|
+
end
|
1027
|
+
end
|
1028
|
+
|
1029
|
+
class Array
|
1030
|
+
# Equivalent to <tt>FasterCSV::generate_line(self, options)</tt>.
|
1031
|
+
def to_csv( options = Hash.new )
|
1032
|
+
FasterCSV.generate_line(self, options)
|
1033
|
+
end
|
1034
|
+
end
|
1035
|
+
|
1036
|
+
class String
|
1037
|
+
# Equivalent to <tt>FasterCSV::parse_line(self, options)</tt>.
|
1038
|
+
def parse_csv( options = Hash.new )
|
1039
|
+
FasterCSV.parse_line(self, options)
|
1040
|
+
end
|
426
1041
|
end
|