fastercsv 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +16 -0
- data/Rakefile +1 -1
- data/TODO +0 -5
- data/lib/faster_csv.rb +655 -40
- data/test/tc_csv_parsing.rb +2 -2
- data/test/tc_data_converters.rb +166 -0
- data/test/tc_features.rb +18 -0
- data/test/tc_headers.rb +125 -0
- data/test/tc_interface.rb +28 -0
- data/test/tc_row.rb +264 -0
- data/test/ts_all.rb +3 -0
- metadata +41 -35
data/CHANGELOG
CHANGED
@@ -2,6 +2,22 @@
|
|
2
2
|
|
3
3
|
Below is a complete listing of changes for each revision of FasterCSV.
|
4
4
|
|
5
|
+
== 0.1.6
|
6
|
+
|
7
|
+
* Began using a forked development/stable versioning system.
|
8
|
+
* Reorganized initialization code for easier additions and maintenance.
|
9
|
+
* Added a check for unknown options. Exceptions will now be thrown for them.
|
10
|
+
* Added built-in and custom data converters. Built-ins handle numbers and dates.
|
11
|
+
* Added Array#to_csv and String#parse_csv. Both accept normal options.
|
12
|
+
* Project moved to RubyForge Subversion.
|
13
|
+
* Added auto-discovery for <tt>:row_sep</tt> (now the default).
|
14
|
+
* Added FasterCSV::filter() for easy Unix-like CSV filters.
|
15
|
+
* Added support for accessing fields by headers.
|
16
|
+
* Headers can have their own converters.
|
17
|
+
* Headers can be skipped or returned as needed.
|
18
|
+
* FasterCSV::Row allows index or header access while retaining order and
|
19
|
+
allowing for duplicate headers.
|
20
|
+
|
5
21
|
== 0.1.4
|
6
22
|
|
7
23
|
* Fixed <tt>:col_sep</tt> escaping bug (reported by Kev Jackson).
|
data/Rakefile
CHANGED
data/TODO
CHANGED
@@ -3,10 +3,6 @@
|
|
3
3
|
The following is a list of planned expansions for FasterCSV, in no particular
|
4
4
|
order.
|
5
5
|
|
6
|
-
* Add support for accessing fields by headers (from first row of document).
|
7
|
-
* Add "convertors" for switching numbers to Integers or Floats, dates to Date or
|
8
|
-
Time objects, etc.
|
9
|
-
* Add to_csv().
|
10
6
|
* Find a good headers solution for data like this:
|
11
7
|
"Experiment ID: 1",,,,,,,,,,,,
|
12
8
|
"Subject ID: 1013938829432171e868c340.
|
@@ -31,5 +27,4 @@ order.
|
|
31
27
|
### and a new block starts
|
32
28
|
"Experiment ID: 3",,,,,,,,,,,,0.92
|
33
29
|
....
|
34
|
-
* Add FasterCSV.filter().
|
35
30
|
* Add calculated fields.
|
data/lib/faster_csv.rb
CHANGED
@@ -9,6 +9,8 @@
|
|
9
9
|
|
10
10
|
require "stringio"
|
11
11
|
require "forwardable"
|
12
|
+
require "enumerator"
|
13
|
+
require "date"
|
12
14
|
|
13
15
|
#
|
14
16
|
# This class provides a complete interface to CSV files and data. It offers
|
@@ -61,20 +63,374 @@ require "forwardable"
|
|
61
63
|
#
|
62
64
|
# == Convert a Single Line
|
63
65
|
#
|
64
|
-
# csv_string =
|
65
|
-
# csv_array =
|
66
|
+
# csv_string = ["CSV", "data"].to_csv # to CSV
|
67
|
+
# csv_array = "CSV,String".parse_csv # from CSV
|
66
68
|
#
|
67
69
|
class FasterCSV
|
70
|
+
#
|
71
|
+
# A FasterCSV::Row is part Array and part Hash. It retains an order for the
|
72
|
+
# fields and allows duplicates just as an Array would, but also allows you to
|
73
|
+
# access fields by name just as you could if they were in a Hash.
|
74
|
+
#
|
75
|
+
# All rows returned by FasterCSV will be constructed from this class, if
|
76
|
+
# header row processing is activated.
|
77
|
+
#
|
78
|
+
class Row
|
79
|
+
#
|
80
|
+
# Construct a new FasterCSV::Row from +headers+ and +fields+, which are
|
81
|
+
# expected to be Arrays. If one Array is shorter than the other, it will be
|
82
|
+
# padded with +nil+ objects.
|
83
|
+
#
|
84
|
+
def initialize( headers, fields )
|
85
|
+
# handle extra headers or fields
|
86
|
+
@row = if headers.size > fields.size
|
87
|
+
headers.zip(fields)
|
88
|
+
else
|
89
|
+
fields.zip(headers).map { |pair| pair.reverse }
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# Returns the headers of this row.
|
94
|
+
def headers
|
95
|
+
@row.map { |pair| pair.first }
|
96
|
+
end
|
97
|
+
|
98
|
+
#
|
99
|
+
# :call-seq:
|
100
|
+
# field( header )
|
101
|
+
# field( header, offset )
|
102
|
+
# field( index )
|
103
|
+
#
|
104
|
+
# This method will fetch the field value by +header+ or +index+. If a field
|
105
|
+
# is not found, +nil+ is returned.
|
106
|
+
#
|
107
|
+
# When provided, +offset+ ensures that a header match occurs on or later
|
108
|
+
# than the +offset+ index. You can use this to find duplicate headers,
|
109
|
+
# without resorting to hard-coding exact indices.
|
110
|
+
#
|
111
|
+
def field( header_or_index, minimum_index = 0 )
|
112
|
+
# locate the pair
|
113
|
+
finder = header_or_index.is_a?(Integer) ? :[] : :assoc
|
114
|
+
pair = @row[minimum_index..-1].send(finder, header_or_index)
|
115
|
+
|
116
|
+
# return the field if we have a pair
|
117
|
+
pair.nil? ? nil : pair.last
|
118
|
+
end
|
119
|
+
alias_method :[], :field
|
120
|
+
|
121
|
+
#
|
122
|
+
# :call-seq:
|
123
|
+
# []=( header, value )
|
124
|
+
# []=( header, offset, value )
|
125
|
+
# []=( index, value )
|
126
|
+
#
|
127
|
+
# Looks up the field by the semantics described in FasterCSV::Row.field()
|
128
|
+
# and assigns the +value+.
|
129
|
+
#
|
130
|
+
# Assigning past the end of the row with an index will set all pairs between
|
131
|
+
# to <tt>[nil, nil]</tt>. Assigning to an unused header appends the new
|
132
|
+
# pair.
|
133
|
+
#
|
134
|
+
def []=( *args )
|
135
|
+
value = args.pop
|
136
|
+
|
137
|
+
if args.first.is_a? Integer
|
138
|
+
if @row[args.first].nil? # extending past the end with index
|
139
|
+
@row[args.first] = [nil, value]
|
140
|
+
@row.map! { |pair| pair.nil? ? [nil, nil] : pair }
|
141
|
+
else # normal index assignment
|
142
|
+
@row[args.first][1] = value
|
143
|
+
end
|
144
|
+
else
|
145
|
+
index = index(*args)
|
146
|
+
if index.nil? # appending a field
|
147
|
+
self << [args.first, value]
|
148
|
+
else # normal header assignment
|
149
|
+
@row[index][1] = value
|
150
|
+
end
|
151
|
+
end
|
152
|
+
end
|
153
|
+
|
154
|
+
#
|
155
|
+
# :call-seq:
|
156
|
+
# <<( field )
|
157
|
+
# <<( header_and_field_array )
|
158
|
+
# <<( header_and_field_hash )
|
159
|
+
#
|
160
|
+
# If a two-element Array is provided, it is assumed to be a header and field
|
161
|
+
# and the pair is appended. A Hash works the same way with the key being
|
162
|
+
# the header and the value being the field. Anything else is assumed to be
|
163
|
+
# a lone field which is appended with a +nil+ header.
|
164
|
+
#
|
165
|
+
# This method returns the row for chaining.
|
166
|
+
#
|
167
|
+
def <<( arg )
|
168
|
+
if arg.is_a?(Array) and arg.size == 2 # appending a header and name
|
169
|
+
@row << arg
|
170
|
+
elsif arg.is_a?(Hash) # append header and name pairs
|
171
|
+
arg.each { |pair| @row << pair }
|
172
|
+
else # append field value
|
173
|
+
@row << [nil, arg]
|
174
|
+
end
|
175
|
+
|
176
|
+
self # for chaining
|
177
|
+
end
|
178
|
+
|
179
|
+
#
|
180
|
+
# A shortcut for appending multiple fields. Equivalent to:
|
181
|
+
#
|
182
|
+
# args.each { |arg| faster_csv_row << arg }
|
183
|
+
#
|
184
|
+
# This method returns the row for chaining.
|
185
|
+
#
|
186
|
+
def push( *args )
|
187
|
+
args.each { |arg| self << arg }
|
188
|
+
|
189
|
+
self # for chaining
|
190
|
+
end
|
191
|
+
|
192
|
+
#
|
193
|
+
# :call-seq:
|
194
|
+
# delete( header )
|
195
|
+
# delete( header, offset )
|
196
|
+
# delete( index )
|
197
|
+
#
|
198
|
+
# Used to remove a pair from the row by +header+ or +index+. The pair is
|
199
|
+
# located as described in FasterCSV::Row.field(). The deleted pair is
|
200
|
+
# returned, or +nil+ if a pair could not be found.
|
201
|
+
#
|
202
|
+
def delete( header_or_index, minimum_index = 0 )
|
203
|
+
if header_or_index.is_a? Integer # by index
|
204
|
+
@row.delete_at(header_or_index)
|
205
|
+
else # by header
|
206
|
+
@row.delete_at(index(header_or_index, minimum_index))
|
207
|
+
end
|
208
|
+
end
|
209
|
+
|
210
|
+
#
|
211
|
+
# The provided +block+ is passed a header and field for each pair in the row
|
212
|
+
# and expected to return +true+ or +false+, depending on whether the pair
|
213
|
+
# should be deleted.
|
214
|
+
#
|
215
|
+
# This method returns the row for chaining.
|
216
|
+
#
|
217
|
+
def delete_if( &block )
|
218
|
+
@row.delete_if(&block)
|
219
|
+
|
220
|
+
self # for chaining
|
221
|
+
end
|
222
|
+
|
223
|
+
#
|
224
|
+
# This method accepts any number of arguments which can be headers, indices,
|
225
|
+
# or two-element Arrays containing a header and offset. Each argument will
|
226
|
+
# be replaced with a field lookup as described in FasterCSV::Row.field().
|
227
|
+
#
|
228
|
+
# If called with no arguments, all fields are returned.
|
229
|
+
#
|
230
|
+
def fields( *headers_and_or_indices )
|
231
|
+
if headers_and_or_indices.empty? # return all fields--no arguments
|
232
|
+
@row.map { |pair| pair.last }
|
233
|
+
else # or work like values_at()
|
234
|
+
headers_and_or_indices.map { |h_or_i| field(*Array(h_or_i)) }
|
235
|
+
end
|
236
|
+
end
|
237
|
+
alias_method :values_at, :fields
|
238
|
+
|
239
|
+
#
|
240
|
+
# :call-seq:
|
241
|
+
# index( header )
|
242
|
+
# index( header, offset )
|
243
|
+
#
|
244
|
+
# This method will return the index of a field with the provided +header+.
|
245
|
+
# The +offset+ can be used to locate duplicate header names, as described in
|
246
|
+
# FasterCSV::Row.field().
|
247
|
+
#
|
248
|
+
def index( header, minimum_index = 0 )
|
249
|
+
# find the pair
|
250
|
+
index = headers[minimum_index..-1].index(header)
|
251
|
+
# return the index at the right offset, if we found one
|
252
|
+
index.nil? ? nil : index + minimum_index
|
253
|
+
end
|
254
|
+
|
255
|
+
# Returns +true+ if +name+ is a header for this row, and +false+ otherwise.
|
256
|
+
def header?( name )
|
257
|
+
headers.include? name
|
258
|
+
end
|
259
|
+
alias_method :include?, :header?
|
260
|
+
|
261
|
+
#
|
262
|
+
# Returns +true+ if +data+ matches a field in this row, and +false+
|
263
|
+
# otherwise.
|
264
|
+
#
|
265
|
+
def field?( data )
|
266
|
+
fields.include? data
|
267
|
+
end
|
268
|
+
|
269
|
+
include Enumerable
|
270
|
+
|
271
|
+
#
|
272
|
+
# Yields each pair of the row as header and field tuples (much like
|
273
|
+
# iterating over a Hash).
|
274
|
+
#
|
275
|
+
# Support for Enumerable.
|
276
|
+
#
|
277
|
+
# This method returns the row for chaining.
|
278
|
+
#
|
279
|
+
def each( &block )
|
280
|
+
@row.each(&block)
|
281
|
+
|
282
|
+
self # for chaining
|
283
|
+
end
|
284
|
+
|
285
|
+
#
|
286
|
+
# Collapses the row into a simple Hash. Be warned that this discards field
|
287
|
+
# order and clobbers duplicate fields.
|
288
|
+
#
|
289
|
+
def to_hash
|
290
|
+
# flatten just one level of the internal Array
|
291
|
+
Hash[*@row.inject(Array.new) { |ary, pair| ary.push(*pair) }]
|
292
|
+
end
|
293
|
+
|
294
|
+
#
|
295
|
+
# Returns the row as a CSV String. Headers are not used. Equivalent to:
|
296
|
+
#
|
297
|
+
# faster_csv_row.fields.to_csv( options )
|
298
|
+
#
|
299
|
+
def to_csv( options = Hash.new )
|
300
|
+
fields.to_csv(options)
|
301
|
+
end
|
302
|
+
alias_method :to_s, :to_csv
|
303
|
+
end
|
304
|
+
|
68
305
|
# The error thrown when the parser encounters illegal CSV formatting.
|
69
306
|
class MalformedCSVError < RuntimeError; end
|
70
307
|
|
308
|
+
#
|
309
|
+
# A FieldInfo Struct contains details about a field's position in the data
|
310
|
+
# source it was read from. FasterCSV will pass this Struct to some blocks
|
311
|
+
# that make decisions based on field structure. See
|
312
|
+
# FasterCSV.convert_fields() for an example.
|
313
|
+
#
|
314
|
+
# <b><tt>index</tt></b>:: The zero-based index of the field in its row.
|
315
|
+
# <b><tt>line</tt></b>:: The line of the data source this row is from.
|
316
|
+
#
|
317
|
+
FieldInfo = Struct.new(:index, :line)
|
318
|
+
|
319
|
+
#
|
320
|
+
# This Hash holds the built-in converters of FasterCSV that can be accessed by
|
321
|
+
# name. You can select Converters with FasterCSV.convert() or through the
|
322
|
+
# +options+ Hash passed to FasterCSV::new().
|
323
|
+
#
|
324
|
+
# <b><tt>:integer</tt></b>:: Converts any field Integer() accepts.
|
325
|
+
# <b><tt>:float</tt></b>:: Converts any field Float() accepts.
|
326
|
+
# <b><tt>:numeric</tt></b>:: A combination of <tt>:integer</tt>
|
327
|
+
# and <tt>:float</tt>.
|
328
|
+
# <b><tt>:date</tt></b>:: Converts any field Date::parse() accepts.
|
329
|
+
# <b><tt>:date_time</tt></b>:: Converts any field DateTime::parse() accepts.
|
330
|
+
# <b><tt>:all</tt></b>:: All built-in converters. A combination of
|
331
|
+
# <tt>:date_time</tt> and <tt>:numeric</tt>.
|
332
|
+
#
|
333
|
+
# This Hash is intentionally left unfrozen and users should feel free to add
|
334
|
+
# values to it that can be accessed by all FasterCSV objects.
|
335
|
+
#
|
336
|
+
# To add a combo field, the value should be an Array of names. Combo fields
|
337
|
+
# can be nested with other combo fields.
|
338
|
+
#
|
339
|
+
Converters = { :integer => lambda { |f| Integer(f) rescue f },
|
340
|
+
:float => lambda { |f| Float(f) rescue f },
|
341
|
+
:numeric => [:integer, :float],
|
342
|
+
:date => lambda { |f| Date.parse(f) rescue f },
|
343
|
+
:date_time => lambda { |f| DateTime.parse(f) rescue f },
|
344
|
+
:all => [:date_time, :numeric] }
|
345
|
+
|
346
|
+
#
|
347
|
+
# This Hash holds the built-in header converters of FasterCSV that can be
|
348
|
+
# accessed by name. You can select HeaderConverters with
|
349
|
+
# FasterCSV.header_convert() or through the +options+ Hash passed to
|
350
|
+
# FasterCSV::new().
|
351
|
+
#
|
352
|
+
# <b><tt>:downcase</tt></b>:: Calls downcase() on the header String.
|
353
|
+
# <b><tt>:symbol</tt></b>:: The header String is downcased, spaces are
|
354
|
+
# replaced with underscores, non-word characters
|
355
|
+
# are dropped, and finally to_sym() is called.
|
356
|
+
#
|
357
|
+
# This Hash is intentionally left unfrozen and users should feel free to add
|
358
|
+
# values to it that can be accessed by all FasterCSV objects.
|
359
|
+
#
|
360
|
+
# To add a combo field, the value should be an Array of names. Combo fields
|
361
|
+
# can be nested with other combo fields.
|
362
|
+
#
|
363
|
+
HeaderConverters = {
|
364
|
+
:downcase => lambda { |h| h.downcase },
|
365
|
+
:symbol => lambda { |h|
|
366
|
+
h.downcase.tr(" ", "_").delete("^a-z0-9_").to_sym
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
71
370
|
#
|
72
371
|
# The options used when no overrides are given by calling code. They are:
|
73
372
|
#
|
74
|
-
# <b><tt>:col_sep</tt></b>::
|
75
|
-
# <b><tt>:row_sep</tt></b>::
|
373
|
+
# <b><tt>:col_sep</tt></b>:: <tt>","</tt>
|
374
|
+
# <b><tt>:row_sep</tt></b>:: <tt>:auto</tt>
|
375
|
+
# <b><tt>:converters</tt></b>:: +nil+
|
376
|
+
# <b><tt>:headers</tt></b>:: +false+
|
377
|
+
# <b><tt>:return_headers</tt></b>:: +false+
|
378
|
+
# <b><tt>:header_converters</tt></b>:: +nil+
|
379
|
+
#
|
380
|
+
DEFAULT_OPTIONS = { :col_sep => ",",
|
381
|
+
:row_sep => :auto,
|
382
|
+
:converters => nil,
|
383
|
+
:headers => false,
|
384
|
+
:return_headers => false,
|
385
|
+
:header_converters => nil }.freeze
|
386
|
+
|
387
|
+
#
|
388
|
+
# :call-seq:
|
389
|
+
# filter( options = Hash.new ) { |row| ... }
|
390
|
+
# filter( input, options = Hash.new ) { |row| ... }
|
391
|
+
# filter( input, output, options = Hash.new ) { |row| ... }
|
392
|
+
#
|
393
|
+
# This method is a convenience for building Unix-like filters for CSV data.
|
394
|
+
# Each row is yielded to the provided block which can alter it as needed.
|
395
|
+
# After the block returns, the row is appended to _output_ altered or not.
|
396
|
+
#
|
397
|
+
# The _input_ and _output_ arguments can be anything FasterCSV::new() accepts
|
398
|
+
# (generally String or IO objects). If not given, they default to
|
399
|
+
# <tt>$stdin</tt> and <tt>$stdout</tt>.
|
76
400
|
#
|
77
|
-
|
401
|
+
# The _options_ parameter is also filtered down to FasterCSV::new() after some
|
402
|
+
# clever key parsing. Any key beginning with <tt>:in_</tt> or
|
403
|
+
# <tt>:input_</tt> will have that leading identifier stripped and will only
|
404
|
+
# be used in the _options_ Hash for the _input_ object. Keys starting with
|
405
|
+
# <tt>:out_</tt> or <tt>:output_</tt> affect only _output_. All other keys
|
406
|
+
# are assigned to both objects.
|
407
|
+
#
|
408
|
+
def self.filter( *args )
|
409
|
+
# parse options for input, output, or both
|
410
|
+
input_options, output_options = Hash.new, Hash.new
|
411
|
+
if args.last.is_a? Hash
|
412
|
+
args.pop.each do |key, value|
|
413
|
+
case key.to_s
|
414
|
+
when /\Ain(?:put)?_(.+)\Z/
|
415
|
+
input_options[$1.to_sym] = value
|
416
|
+
when /\Aout(?:put)?_(.+)\Z/
|
417
|
+
output_options[$1.to_sym] = value
|
418
|
+
else
|
419
|
+
input_options[key] = value
|
420
|
+
output_options[key] = value
|
421
|
+
end
|
422
|
+
end
|
423
|
+
end
|
424
|
+
# build input and output wrappers
|
425
|
+
input = FasterCSV.new(args.shift || $stdin, input_options)
|
426
|
+
output = FasterCSV.new(args.shift || $stdout, output_options)
|
427
|
+
|
428
|
+
# read, yield, write
|
429
|
+
input.each do |row|
|
430
|
+
yield row
|
431
|
+
output << row
|
432
|
+
end
|
433
|
+
end
|
78
434
|
|
79
435
|
#
|
80
436
|
# This method is intended as the primary interface for reading CSV files. You
|
@@ -218,19 +574,6 @@ class FasterCSV
|
|
218
574
|
end
|
219
575
|
end
|
220
576
|
|
221
|
-
#
|
222
|
-
# Use to slurp a CSV file into an Array of Arrays. Pass the +path+ to the
|
223
|
-
# file and any +options+ FasterCSV::new() understands.
|
224
|
-
#
|
225
|
-
def self.read( path, options = Hash.new )
|
226
|
-
open(path, options) { |csv| csv.read }
|
227
|
-
end
|
228
|
-
|
229
|
-
# Alias for FasterCSV::read().
|
230
|
-
def self.readlines( path, options = Hash.new )
|
231
|
-
open(path, options) { |csv| csv.readlines }
|
232
|
-
end
|
233
|
-
|
234
577
|
#
|
235
578
|
# This method is a shortcut for converting a single line of a CSV String into
|
236
579
|
# an Array. Note that if +line+ contains multiple rows, anything
|
@@ -242,6 +585,19 @@ class FasterCSV
|
|
242
585
|
new(line, options).shift
|
243
586
|
end
|
244
587
|
|
588
|
+
#
|
589
|
+
# Use to slurp a CSV file into an Array of Arrays. Pass the +path+ to the
|
590
|
+
# file and any +options+ FasterCSV::new() understands.
|
591
|
+
#
|
592
|
+
def self.read( path, options = Hash.new )
|
593
|
+
open(path, options) { |csv| csv.read }
|
594
|
+
end
|
595
|
+
|
596
|
+
# Alias for FasterCSV::read().
|
597
|
+
def self.readlines( *args )
|
598
|
+
read(*args)
|
599
|
+
end
|
600
|
+
|
245
601
|
#
|
246
602
|
# This constructor will wrap either a String or IO object passed in +data+ for
|
247
603
|
# reading and/or writing. In addition to the FasterCSV instance methods,
|
@@ -257,8 +613,43 @@ class FasterCSV
|
|
257
613
|
# You may set any reading and/or writing preferences in the +options+ Hash.
|
258
614
|
# Available options are:
|
259
615
|
#
|
260
|
-
# <b><tt>:col_sep</tt></b>::
|
261
|
-
# <b><tt>:row_sep</tt></b>::
|
616
|
+
# <b><tt>:col_sep</tt></b>:: The String placed between each field.
|
617
|
+
# <b><tt>:row_sep</tt></b>:: The String appended to the end of each
|
618
|
+
# row. This can be set to the special
|
619
|
+
# <tt>:auto</tt> setting, which requests
|
620
|
+
# that FasterCSV automatically discover
|
621
|
+
# this from the data. Auto-discovery
|
622
|
+
# reads ahead in the data looking for
|
623
|
+
# the next <tt>"\r\n"</tt>,
|
624
|
+
# <tt>"\n"</tt>, or <tt>"\r"</tt>
|
625
|
+
# sequence. A sequence will be selected
|
626
|
+
# even if it occurs in a quoted field,
|
627
|
+
# assuming that you would have the same
|
628
|
+
# line endings there. If none of those
|
629
|
+
# sequences is found, the default
|
630
|
+
# <tt>$/</tt> is used. Obviously,
|
631
|
+
# discovery takes a little time. Set
|
632
|
+
# manually if speed is important.
|
633
|
+
# <b><tt>:converters</tt></b>:: An Array of names from the Converters
|
634
|
+
# Hash and/or lambdas that handle custom
|
635
|
+
# conversion. A single converter
|
636
|
+
# doesn't have to be in an Array.
|
637
|
+
# <b><tt>:headers</tt></b>:: If set to <tt>:first_row</tt> or
|
638
|
+
# +true+, the initial row of the CSV
|
639
|
+
# file will be treated as a row of
|
640
|
+
# headers. This setting causes
|
641
|
+
# FasterCSV.shift() to return rows as
|
642
|
+
# FasterCSV::Row objects instead of
|
643
|
+
# Arrays.
|
644
|
+
# <b><tt>:return_headers</tt></b>:: When +false+, header rows are silently
|
645
|
+
# swallowed. If set to +true+, header
|
646
|
+
# rows are returned in a FasterCSV::Row
|
647
|
+
# object with identical headers and
|
648
|
+
# fields.
|
649
|
+
# <b><tt>:header_converters</tt></b>:: Identical in functionality to
|
650
|
+
# <tt>:converters</tt> save that the
|
651
|
+
# conversions are only made to header
|
652
|
+
# rows.
|
262
653
|
#
|
263
654
|
# See FasterCSV::DEFAULT_OPTIONS for the default settings.
|
264
655
|
#
|
@@ -272,25 +663,14 @@ class FasterCSV
|
|
272
663
|
# create the IO object we will read from
|
273
664
|
@io = if data.is_a? String then StringIO.new(data) else data end
|
274
665
|
|
275
|
-
|
276
|
-
|
277
|
-
|
666
|
+
init_separators(options)
|
667
|
+
init_parsers(options)
|
668
|
+
init_converters(options)
|
669
|
+
init_headers(options)
|
278
670
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
/\A#{Regexp.escape(@col_sep)}+/, # for empty leading fields
|
283
|
-
:csv_row =>
|
284
|
-
### The Primary Parser ###
|
285
|
-
/ \G(?:^|#{Regexp.escape(@col_sep)}) # anchor the match
|
286
|
-
(?: "((?>[^"]*)(?>""[^"]*)*)" # find quoted fields
|
287
|
-
| # ... or ...
|
288
|
-
([^"#{Regexp.escape(@col_sep)}]*) # unquoted fields
|
289
|
-
)/x,
|
290
|
-
### End Primary Parser ###
|
291
|
-
:line_end =>
|
292
|
-
/#{Regexp.escape(@row_sep)}\Z/ # safer than chomp!()
|
293
|
-
}
|
671
|
+
unless options.empty?
|
672
|
+
raise ArgumentError, "Unknown options: #{options.keys.join(', ')}."
|
673
|
+
end
|
294
674
|
end
|
295
675
|
|
296
676
|
### IO and StringIO Delegation ###
|
@@ -330,6 +710,43 @@ class FasterCSV
|
|
330
710
|
alias_method :add_row, :<<
|
331
711
|
alias_method :puts, :<<
|
332
712
|
|
713
|
+
#
|
714
|
+
# :call-seq:
|
715
|
+
# convert( name )
|
716
|
+
# convert { |field| ... }
|
717
|
+
# convert { |field, field_info| ... }
|
718
|
+
#
|
719
|
+
# You can use this method to install a FasterCSV::Converters built-in, or
|
720
|
+
# provide a block that handles a custom conversion.
|
721
|
+
#
|
722
|
+
# If you provide a block that takes one argument, it will be passed the field
|
723
|
+
# and is expected to return the converted value or the field itself. If your
|
724
|
+
# block takes two arguments, it will also be passed a FieldInfo Struct,
|
725
|
+
# containing details about the field. Again, the block should return a
|
726
|
+
# converted field or the field itself.
|
727
|
+
#
|
728
|
+
def convert( name = nil, &converter )
|
729
|
+
add_converter(:converters, self.class::Converters, name, &converter)
|
730
|
+
end
|
731
|
+
|
732
|
+
#
|
733
|
+
# :call-seq:
|
734
|
+
# header_convert( name )
|
735
|
+
# header_convert { |field| ... }
|
736
|
+
# header_convert { |field, field_info| ... }
|
737
|
+
#
|
738
|
+
# Identical to FasterCSV.convert(), but for header rows.
|
739
|
+
#
|
740
|
+
# Note that this method must be called before header rows are read to have any
|
741
|
+
# effect.
|
742
|
+
#
|
743
|
+
def header_convert( name = nil, &converter )
|
744
|
+
add_converter( :header_converters,
|
745
|
+
self.class::HeaderConverters,
|
746
|
+
name,
|
747
|
+
&converter )
|
748
|
+
end
|
749
|
+
|
333
750
|
include Enumerable
|
334
751
|
|
335
752
|
#
|
@@ -355,9 +772,15 @@ class FasterCSV
|
|
355
772
|
end
|
356
773
|
alias_method :readlines, :read
|
357
774
|
|
775
|
+
# Returns +true+ if the next row read will be a header row.
|
776
|
+
def header_row?
|
777
|
+
@use_headers and @headers.nil?
|
778
|
+
end
|
779
|
+
|
358
780
|
#
|
359
781
|
# The primary read method for wrapped Strings and IOs, a single row is pulled
|
360
|
-
# from the data source, parsed and returned as an Array of fields
|
782
|
+
# from the data source, parsed and returned as an Array of fields (if header
|
783
|
+
# rows are not used) or a FasterCSV::Row (when header rows are used).
|
361
784
|
#
|
362
785
|
# The data source must be open for reading.
|
363
786
|
#
|
@@ -415,7 +838,16 @@ class FasterCSV
|
|
415
838
|
end
|
416
839
|
|
417
840
|
# if parse is empty?(), we found all the fields on the line...
|
418
|
-
|
841
|
+
if parse.empty?
|
842
|
+
# convert headers or fields if needed...
|
843
|
+
csv = convert_fields(csv) if ( header_row? and
|
844
|
+
not @header_converters.empty? ) or
|
845
|
+
not @converters.empty?
|
846
|
+
# parse out header rows and handle FasterCSV::Row conversions...
|
847
|
+
csv = parse_headers(csv) if @use_headers
|
848
|
+
# return the results
|
849
|
+
break csv
|
850
|
+
end
|
419
851
|
# if we're not empty?() but at eof?(), a quoted field wasn't closed...
|
420
852
|
raise MalformedCSVError, "Unclosed quoted field." if @io.eof?
|
421
853
|
# otherwise, we need to loop and pull some more data to complete the row
|
@@ -423,4 +855,187 @@ class FasterCSV
|
|
423
855
|
end
|
424
856
|
alias_method :gets, :shift
|
425
857
|
alias_method :readline, :shift
|
858
|
+
|
859
|
+
private
|
860
|
+
|
861
|
+
#
|
862
|
+
# Stores the indicated separators for later use.
|
863
|
+
#
|
864
|
+
# If auto-discovery was requested for <tt>@row_sep</tt>, this method will read
|
865
|
+
# ahead in the <tt>@io</tt> and try to find one.
|
866
|
+
#
|
867
|
+
def init_separators( options )
|
868
|
+
# store the selected separators
|
869
|
+
@col_sep = options.delete(:col_sep)
|
870
|
+
@row_sep = options.delete(:row_sep)
|
871
|
+
|
872
|
+
# automatically discover row separator when requested
|
873
|
+
saved_pos = @io.pos # remember where we were
|
874
|
+
while @row_sep == :auto
|
875
|
+
#
|
876
|
+
# if we run out of data, it's probably a single line
|
877
|
+
# (use a sensible default)
|
878
|
+
#
|
879
|
+
if @io.eof?
|
880
|
+
@row_sep = $/
|
881
|
+
break
|
882
|
+
end
|
883
|
+
|
884
|
+
# read ahead a bit
|
885
|
+
sample = @io.read(1024)
|
886
|
+
sample += @io.read(1) if sample[-1..-1] == "\r" and not @io.eof?
|
887
|
+
|
888
|
+
# try to find a standard separator
|
889
|
+
if sample =~ /\r\n?|\n/
|
890
|
+
@row_sep = $&
|
891
|
+
break
|
892
|
+
end
|
893
|
+
end
|
894
|
+
@io.seek(saved_pos) # reset back to the remembered position
|
895
|
+
end
|
896
|
+
|
897
|
+
# Pre-compiles parsers and stores them by name for access during reads.
|
898
|
+
def init_parsers( options )
|
899
|
+
# prebuild Regexps for faster parsing
|
900
|
+
@parsers = {
|
901
|
+
:leading_fields =>
|
902
|
+
/\A#{Regexp.escape(@col_sep)}+/, # for empty leading fields
|
903
|
+
:csv_row =>
|
904
|
+
### The Primary Parser ###
|
905
|
+
/ \G(?:^|#{Regexp.escape(@col_sep)}) # anchor the match
|
906
|
+
(?: "((?>[^"]*)(?>""[^"]*)*)" # find quoted fields
|
907
|
+
| # ... or ...
|
908
|
+
([^"#{Regexp.escape(@col_sep)}]*) # unquoted fields
|
909
|
+
)/x,
|
910
|
+
### End Primary Parser ###
|
911
|
+
:line_end =>
|
912
|
+
/#{Regexp.escape(@row_sep)}\Z/ # safer than chomp!()
|
913
|
+
}
|
914
|
+
end
|
915
|
+
|
916
|
+
#
|
917
|
+
# Loads any converters requested during construction.
|
918
|
+
#
|
919
|
+
# If +field_name+ is set to <tt>:converters</tt> (the default), field converters
|
920
|
+
# are set. When +field_name+ is <tt>:header_converters</tt> header converters
|
921
|
+
# are added instead.
|
922
|
+
#
|
923
|
+
def init_converters( options, field_name = :converters )
|
924
|
+
instance_variable_set("@#{field_name}", Array.new)
|
925
|
+
|
926
|
+
# find the correct method to add the converters
|
927
|
+
convert = method(field_name.to_s.sub(/ers\Z/, ""))
|
928
|
+
|
929
|
+
# load converters
|
930
|
+
unless options[field_name].nil?
|
931
|
+
# allow a single converter not wrapped in an Array
|
932
|
+
unless options[field_name].is_a? Array
|
933
|
+
options[field_name] = [options[field_name]]
|
934
|
+
end
|
935
|
+
# load each converter...
|
936
|
+
options[field_name].each do |converter|
|
937
|
+
if converter.is_a? Proc # custom code block
|
938
|
+
convert.call(&converter)
|
939
|
+
else # by name
|
940
|
+
convert.call(converter)
|
941
|
+
end
|
942
|
+
end
|
943
|
+
end
|
944
|
+
|
945
|
+
options.delete(field_name)
|
946
|
+
end
|
947
|
+
|
948
|
+
# Stores header row settings and loads header converters, if needed.
|
949
|
+
def init_headers( options )
|
950
|
+
@use_headers = options.delete(:headers)
|
951
|
+
@return_headers = options.delete(:return_headers)
|
952
|
+
|
953
|
+
@headers = nil
|
954
|
+
|
955
|
+
init_converters(options, :header_converters)
|
956
|
+
end
|
957
|
+
|
958
|
+
#
|
959
|
+
# The actual work method for adding converters, used by both
|
960
|
+
# FasterCSV.convert() and FasterCSV.header_convert().
|
961
|
+
#
|
962
|
+
# This method requires the +var_name+ of the instance variable to place the
|
963
|
+
# converters in, the +const+ Hash to lookup named converters in, and the
|
964
|
+
# normal parameters of the FasterCSV.convert() and FasterCSV.header_convert()
|
965
|
+
# methods.
|
966
|
+
#
|
967
|
+
def add_converter( var_name, const, name = nil, &converter )
|
968
|
+
if name.nil? # custom converter
|
969
|
+
instance_variable_get("@#{var_name}") << converter
|
970
|
+
else # named converter
|
971
|
+
combo = const[name]
|
972
|
+
case combo
|
973
|
+
when Array # combo converter
|
974
|
+
combo.each do |converter_name|
|
975
|
+
add_converter(var_name, const, converter_name)
|
976
|
+
end
|
977
|
+
else # individual named converter
|
978
|
+
instance_variable_get("@#{var_name}") << combo
|
979
|
+
end
|
980
|
+
end
|
981
|
+
end
|
982
|
+
|
983
|
+
#
|
984
|
+
# Processes +fields+ with <tt>@converters</tt>, returning the converted field
|
985
|
+
# set. Any converter that changes the field into something other than a
|
986
|
+
# String halts the pipeline of conversion for that field. This is primarily
|
987
|
+
# an efficiency shortcut.
|
988
|
+
#
|
989
|
+
def convert_fields( fields )
|
990
|
+
converters = if header_row? # see if we are converting headers or fields
|
991
|
+
@header_converters
|
992
|
+
else
|
993
|
+
@converters
|
994
|
+
end
|
995
|
+
|
996
|
+
fields.enum_for(:each_with_index).map do |field, index| # map_with_index
|
997
|
+
converters.each do |converter|
|
998
|
+
field = if converter.arity == 1 # straight field converter
|
999
|
+
converter[field]
|
1000
|
+
else # FieldInfo converter
|
1001
|
+
converter[field, FieldInfo.new(index, @io.lineno)]
|
1002
|
+
end
|
1003
|
+
break unless field.is_a? String # short-circuit pipeline for speed
|
1004
|
+
end
|
1005
|
+
field # return final state of each field, converted or original
|
1006
|
+
end
|
1007
|
+
end
|
1008
|
+
|
1009
|
+
#
|
1010
|
+
# This method is used to turn a finished +row+ into a FasterCSV::Row. Header
|
1011
|
+
# rows are also dealt with here, either by returning a FasterCSV::Row with
|
1012
|
+
# identical headers and fields or by reading past them to return a field row.
|
1013
|
+
# Headers are also saved in <tt>@headers</tt> for use in future rows.
|
1014
|
+
#
|
1015
|
+
def parse_headers( row )
|
1016
|
+
if @headers.nil? # header row
|
1017
|
+
@headers = row # save
|
1018
|
+
if @return_headers # return the headers
|
1019
|
+
FasterCSV::Row.new(@headers, @headers)
|
1020
|
+
else # skip to next field row
|
1021
|
+
shift
|
1022
|
+
end
|
1023
|
+
else # field row
|
1024
|
+
FasterCSV::Row.new(@headers, row)
|
1025
|
+
end
|
1026
|
+
end
|
1027
|
+
end
|
1028
|
+
|
1029
|
+
class Array
|
1030
|
+
# Equivalent to <tt>FasterCSV::generate_line(self, options)</tt>.
|
1031
|
+
def to_csv( options = Hash.new )
|
1032
|
+
FasterCSV.generate_line(self, options)
|
1033
|
+
end
|
1034
|
+
end
|
1035
|
+
|
1036
|
+
class String
|
1037
|
+
# Equivalent to <tt>FasterCSV::parse_line(self, options)</tt>.
|
1038
|
+
def parse_csv( options = Hash.new )
|
1039
|
+
FasterCSV.parse_line(self, options)
|
1040
|
+
end
|
426
1041
|
end
|