csv 3.0.1 → 3.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/{news.md → NEWS.md} +29 -0
- data/lib/csv.rb +222 -614
- data/lib/csv/fields_converter.rb +78 -0
- data/lib/csv/match_p.rb +20 -0
- data/lib/csv/parser.rb +713 -0
- data/lib/csv/version.rb +1 -1
- data/lib/csv/writer.rb +144 -0
- metadata +8 -4
@@ -0,0 +1,78 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
class CSV
  # Holds an ordered list of converters and applies them to the fields of a
  # row, after substituting the configured replacements for +nil+ and empty
  # fields.
  class FieldsConverter
    include Enumerable

    # Reads these keys from +options+:
    #
    # :nil_value::          replacement for +nil+ fields
    # :empty_value::        replacement for empty fields
    # :accept_nil::         when true, a +nil+ field stops the converter chain
    # :builtin_converters:: lookup table used by #add_converter for names
    def initialize(options={})
      @converters = []
      @nil_value = options[:nil_value]
      @empty_value = options[:empty_value]
      @empty_value_is_empty_string = (@empty_value == "")
      @accept_nil = options[:accept_nil]
      @builtin_converters = options[:builtin_converters]
      @need_static_convert = need_static_convert?
    end

    # Appends a converter. Without +name+ the given block is appended. With
    # +name+ the entry is looked up in the builtin table; an Array entry is
    # treated as a list of further names and expanded recursively.
    def add_converter(name=nil, &converter)
      if name.nil?  # custom converter
        @converters << converter
      else          # named converter
        combo = @builtin_converters[name]
        case combo
        when Array  # combo converter
          combo.each do |sub_name|
            add_converter(sub_name)
          end
        else        # individual named converter
          @converters << combo
        end
      end
    end

    # Yields each registered converter in registration order.
    def each(&block)
      @converters.each(&block)
    end

    # Returns true when no converter has been registered.
    def empty?
      @converters.empty?
    end

    # Returns +fields+ converted. When neither a nil/empty replacement nor a
    # converter is configured, +fields+ itself is returned untouched.
    #
    # Converters whose arity is 1 receive the field; all others receive the
    # field and a FieldInfo built from the column index, +lineno+ and the
    # matching entry of +headers+ (or +nil+ when +headers+ is not given).
    def convert(fields, headers, lineno)
      return fields unless need_convert?

      fields.collect.with_index do |field, index|
        if field.nil?
          field = @nil_value
        elsif field.respond_to?(:empty?) and field.empty?
          # Only objects that can report emptiness are checked, so fields
          # such as Integers no longer raise NoMethodError here.
          field = @empty_value unless @empty_value_is_empty_string
        end
        @converters.each do |converter|
          break if field.nil? and @accept_nil
          if converter.arity == 1  # straight field converter
            field = converter[field]
          else                     # FieldInfo converter
            header = headers ? headers[index] : nil
            field = converter[field, FieldInfo.new(index, lineno, header)]
          end
          break unless field.is_a?(String)  # short-circuit pipeline for speed
        end
        field  # final state of each field, converted or original
      end
    end

    private
    # True when a nil or empty replacement other than the defaults
    # (+nil+ and "") is configured.
    def need_static_convert?
      not (@nil_value.nil? and @empty_value_is_empty_string)
    end

    # True when #convert has any work to do.
    def need_convert?
      @need_static_convert or
        (not @converters.empty?)
    end
  end
end
|
data/lib/csv/match_p.rb
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# This provides String#match? and Regexp#match? for Ruby 2.3.
#
# The refinements are only defined when String does not already have
# #match?. They return +true+ or +false+ like the built-in methods instead of
# leaking the Integer/nil result of =~.
# NOTE(review): unlike the built-in #match?, =~ still updates $~.
unless String.method_defined?(:match?)
  class CSV
    module MatchP
      refine String do
        def match?(pattern)
          !(self =~ pattern).nil?
        end
      end

      refine Regexp do
        def match?(string)
          !(self =~ string).nil?
        end
      end
    end
  end
end
|
data/lib/csv/parser.rb
ADDED
@@ -0,0 +1,713 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "strscan"
|
4
|
+
|
5
|
+
require_relative "match_p"
|
6
|
+
require_relative "row"
|
7
|
+
require_relative "table"
|
8
|
+
|
9
|
+
using CSV::MatchP if CSV.const_defined?(:MatchP)
|
10
|
+
|
11
|
+
class CSV
|
12
|
+
class Parser
|
13
|
+
# Raised by the scanners when a chunk of input fails String#valid_encoding?;
# #parse rescues it and re-raises it as MalformedCSVError.
class InvalidEncoding < StandardError; end
|
15
|
+
|
16
|
+
# StringScanner extended with a stack of remembered positions ("keeps") so
# the parser can later extract the text scanned since a mark, rewind to the
# mark, or simply forget it.
class Scanner < StringScanner
  # The whole input is a single string here, so one scan sees everything.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    @keeps = []
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Forgets the latest mark and returns the text scanned since it.
  #
  # StringScanner#pos is a byte offset, so the text must be extracted with
  # String#byteslice; String#[] counts characters and returns the wrong
  # range for multibyte input.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Forgets the latest mark and rewinds to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Forgets the latest mark without moving.
  def keep_drop
    @keeps.pop
  end
end
|
41
|
+
|
42
|
+
# Scanner facade over a list of inputs that are read chunk by chunk.
#
# Each input is either a StringIO (consumed in one go through #string) or an
# object answering <tt>gets(nil, size)</tt>. The current chunk is wrapped in
# a StringScanner. Marks are kept as <tt>[start, buffer]</tt> pairs: +start+
# is a byte offset into the current chunk and +buffer+ collects the marked
# text of chunks that have already been replaced.
class InputsScanner
  def initialize(inputs, encoding, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    @keeps = []
    read_chunk
  end

  # Scans +pattern+ in the current chunk only. After a match that exhausts
  # the chunk the next chunk is loaded so that #eos? stays meaningful.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Scans +pattern+ and keeps extending the match across chunk boundaries
  # for as long as each freshly loaded chunk matches at its start.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Marks the current position of the current chunk.
  def keep_start
    @keeps.push([@scanner.pos, nil])
  end

  # Forgets the latest mark and returns everything scanned since it,
  # including text from chunks that have been replaced in the meantime.
  def keep_end
    start, buffer = @keeps.pop
    # Offsets come from StringScanner#pos and are bytes, hence byteslice.
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Forgets the latest mark and rewinds to it. When the mark lies in an
  # earlier chunk, the rest of the current chunk is pushed back as a new
  # first input and the buffered text becomes the current chunk.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      # Byte length, to match the byte offset in +start+.
      keep = string.byteslice(start, string.bytesize - start)
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
  end

  # Forgets the latest mark without moving.
  def keep_drop
    @keeps.pop
  end

  # Unscanned remainder of the current chunk.
  def rest
    @scanner.rest
  end

  private
  # Replaces the current chunk with the next one and returns whether new
  # data became available. Text covered by the latest mark is saved into
  # that mark's buffer first. Raises InvalidEncoding for a chunk that is not
  # valid in its encoding.
  def read_chunk
    return false if @last_scanner

    unless @keeps.empty?
      keep = @keeps.last
      keep_start = keep[0]
      string = @scanner.string
      keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
      if keep_data
        keep_buffer = keep[1]
        if keep_buffer
          keep_buffer << keep_data
        else
          keep[1] = keep_data.dup
        end
      end
      keep[0] = 0  # the mark now starts at the head of the next chunk
    end

    input = @inputs.first
    case input
    when StringIO
      string = input.string
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(nil, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: continue with the next one, if any.
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
|
168
|
+
|
169
|
+
# Stores +input+ and +options+, starts with no buffered samples and then
# derives all parser state through #prepare.
def initialize(input, options)
  @input, @options = input, options
  @samples = []

  prepare
end
|
176
|
+
|
177
|
+
# Column separator in use.
def column_separator; @column_separator; end

# Row separator in use.
def row_separator; @row_separator; end

# Quote character in use.
def quote_character; @quote_character; end

# Configured field size limit, if any.
def field_size_limit; @field_size_limit; end

# Configured line-skipping matcher, if any.
def skip_lines; @skip_lines; end

# Value of the :unconverted_fields option.
def unconverted_fields?; @unconverted_fields; end

# Current (converted) headers, or +nil+ while none are known.
def headers; @headers; end

# Whether headers are in use but have not been determined yet.
def header_row?
  @use_headers && @headers.nil?
end

# Value of the :return_headers option.
def return_headers?; @return_headers; end

# Value of the :skip_blanks option.
def skip_blanks?; @skip_blanks; end

# Value of the :liberal_parsing option.
def liberal_parsing?; @liberal_parsing; end

# Number of rows emitted so far.
def lineno; @lineno; end

# Text of the most recently read line, as reported by #last_line.
def line
  last_line
end
|
228
|
+
|
229
|
+
# Parses the input and yields each row to the block. Without a block an
# Enumerator over the same rows is returned.
#
# Preset headers are yielded first when :return_headers is set. Raises
# MalformedCSVError for oversized fields, stray characters after a field,
# illegal quoting and invalid byte sequences.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers and @headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  fields = []
  begin
    @scanner = build_scanner
    skip_needless_lines
    start_row
    while true
      @quoted_column_value = false
      @unquoted_column_value = false
      value = parse_column_value
      if value && @field_size_limit && value.size >= @field_size_limit
        raise MalformedCSVError.new("Field size exceeded", @lineno + 1)
      end

      # A column separator: the field is complete, the row continues.
      if parse_column_end
        fields << value
        next
      end

      # A row separator: emit the row (or a blank row) and start the next.
      if parse_row_end
        if fields.empty? && value.nil?
          emit_row([], &block) unless @skip_blanks
        else
          fields << value
          emit_row(fields, &block)
          fields = []
        end
        skip_needless_lines
        start_row
        next
      end

      # End of input: emit whatever was collected for the last row.
      if @scanner.eos?
        unless fields.empty? && value.nil?
          fields << value
          emit_row(fields, &block)
        end
        return
      end

      # Anything else after a field is malformed; pick the message.
      message =
        if @quoted_column_value
          "Do not allow except col_sep_split_separator after quoted fields"
        elsif @unquoted_column_value && @scanner.scan(@cr_or_lf)
          "Unquoted fields do not allow \\r or \\n"
        elsif @scanner.rest.start_with?(@quote_character)
          "Illegal quoting"
        else
          "TODO: Meaningful message"
        end
      raise MalformedCSVError.new(message, @lineno + 1)
    end
  rescue InvalidEncoding
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, @lineno + 1)
  end
end
|
291
|
+
|
292
|
+
private
|
293
|
+
# Runs every preparation step. The order matters: later steps read instance
# variables assigned by earlier ones (for example the regexps are built
# from @encoding, and header adjustment reads @lineno).
def prepare
  %i[
    prepare_variable
    prepare_regexp
    prepare_line
    prepare_header
    prepare_parser
  ].each { |step| send(step) }
end
|
300
|
+
|
301
|
+
# Copies the plain option values into instance variables of the same name.
def prepare_variable
  %i[
    encoding
    liberal_parsing
    unconverted_fields
    field_size_limit
    skip_blanks
    fields_converter
    header_fields_converter
  ].each do |key|
    instance_variable_set(:"@#{key}", @options[key])
  end
end
|
310
|
+
|
311
|
+
# Resolves the separators and the quote character, validates :quote_char and
# :skip_lines, and compiles every regular expression the parser scans with.
#
# Raises ArgumentError when the quote character is not exactly one character
# or when :skip_lines is neither a String, a Regexp, nil nor an object
# responding to #match.
def prepare_regexp
  @column_separator = @options[:column_separator].to_s.encode(@encoding)
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)
  @quote_character = @options[:quote_character].to_s.encode(@encoding)
  unless @quote_character.length == 1
    raise ArgumentError, ":quote_char has to be a single character String"
  end

  column_source = Regexp.escape(@column_separator)
  row_source = Regexp.escape(@row_separator)
  quote_source = Regexp.escape(@quote_character)

  skip_lines = @options[:skip_lines]
  if skip_lines.is_a?(String)
    skip_lines = skip_lines.encode(@encoding)
  elsif !skip_lines.nil? && !skip_lines.is_a?(Regexp) &&
        !skip_lines.respond_to?(:match)
    message =
      ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
    raise ArgumentError, message
  end
  @skip_lines = skip_lines

  # For a multi-character separator, one pattern per character; the
  # parse_*_end methods fall back to these when the full separator does not
  # match in one scan.
  per_character = lambda do |separator|
    next nil unless separator.size > 1
    separator.each_char.collect { |char| Regexp.new(Regexp.escape(char)) }
  end

  @column_end = Regexp.new(column_source)
  @column_ends = per_character.call(@column_separator)
  @row_end = Regexp.new(row_source)
  @row_ends = per_character.call(@row_separator)

  @quotes = Regexp.new(quote_source + "+".encode(@encoding))
  @quoted_value = Regexp.new("[^".encode(@encoding) +
                             quote_source +
                             "]+".encode(@encoding))

  # With liberal parsing, quote characters are allowed inside unquoted
  # values, so they are left out of the excluded set.
  unquoted_source = "[^".encode(@encoding)
  unquoted_source += quote_source unless @liberal_parsing
  unquoted_source += column_source
  unquoted_source += "\r\n]+".encode(@encoding)
  @unquoted_value = Regexp.new(unquoted_source)

  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
|
373
|
+
|
374
|
+
# Returns the row separator as a String in @encoding.
#
# Any value other than :auto is simply stringified. For :auto the separator
# is detected from the input: a StringIO is inspected as a whole, other
# inputs answering #gets are sampled chunk by chunk (each sample is kept in
# @samples for the scanner). When detection finds nothing,
# $INPUT_RECORD_SEPARATOR is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    separator = detect_row_separator(@input.string, cr, lf)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # Running out of data probably means a single line; the fallback
        # after this block then supplies the default.
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # A trailing CR might be the first half of CR LF: read one more.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample

        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Nothing to do: the fallback below supplies the default.
    end
  end
  separator = $INPUT_RECORD_SEPARATOR if separator == :auto
  separator.to_s.encode(@encoding)
end
|
411
|
+
|
412
|
+
# Looks for the first line ending in +sample+ and returns it: +cr+ + +lf+,
# +cr+ or +lf+. Returns :auto when +sample+ contains neither.
#
# A +cr+ only counts when it appears before the first +lf+.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  searched = lf_index ? sample[0, lf_index] : sample
  cr_index = searched.index(cr)

  return (lf_index ? lf : :auto) unless cr_index
  return cr unless lf_index

  cr_index + 1 == lf_index ? cr + lf : cr
end
|
435
|
+
|
436
|
+
# Resets line tracking: no rows emitted, no remembered line, no scanner.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
|
441
|
+
|
442
|
+
# Returns the remembered line. With a scanner present, the line is taken
# from the scanner's current keep on first use and cached in @last_line.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
|
449
|
+
|
450
|
+
# Interprets the :headers option.
#
# An Array is used as the raw headers, a String is parsed into them, and
# nil/false disables headers. Any other value enables headers without
# presetting them, so @headers stays nil until a row supplies them.
def prepare_header
  @return_headers = @options[:return_headers]

  requested = @options[:headers]
  @raw_headers, @use_headers =
    case requested
    when Array      then [requested, true]
    when String     then [parse_headers(requested), true]
    when nil, false then [nil, false]
    else                 [nil, true]
    end
  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
|
474
|
+
|
475
|
+
# Parses the header line +row+ with CSV.parse_line, using this parser's
# separators and quote character.
def parse_headers(row)
  parse_options = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **parse_options)
end
|
481
|
+
|
482
|
+
# Runs +headers+ through the header converters and freezes every String in
# the result. Returns the converted headers.
def adjust_headers(headers)
  converted = @header_fields_converter.convert(headers, nil, @lineno)
  converted.each do |header|
    header.freeze if String === header
  end
  converted
end
|
487
|
+
|
488
|
+
# Caches the #may_quoted? heuristic, which #parse_column_value uses to
# decide whether to try a quoted or an unquoted value first.
def prepare_parser
  @may_quoted = may_quoted?
end
|
491
|
+
|
492
|
+
# Heuristic: looks for the quote character within the first 128 characters
# of the available sample. Returns its index (truthy) or nil, and false when
# a non-StringIO input has no sample yet.
def may_quoted?
  string_input = @input.is_a?(StringIO)
  return false if !string_input && @samples.empty?

  sample = string_input ? @input.string : @samples.first
  sample[0, 128].index(@quote_character)
end
|
501
|
+
|
502
|
+
# Setting CSV_PARSER_SCANNER_TEST=yes in the environment forces every input
# through InputsScanner with one-byte chunks, bypassing the single-string
# fast path, so the chunked code path gets exercised.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # StringIO wrapper exposing only #gets and #eof?, so InputsScanner cannot
  # recognize it as a StringIO and read it in one go.
  class UnoptimizedStringIO
    def initialize(string)
      @io = StringIO.new(string)
    end

    def gets(*args)
      @io.gets(*args)
    end

    def eof?
      @io.eof?
    end
  end

  # Builds a chunked scanner over the buffered samples followed by the input.
  def build_scanner
    inputs = @samples.collect { |sample| UnoptimizedStringIO.new(sample) }
    inputs << (@input.is_a?(StringIO) ? UnoptimizedStringIO.new(@input.string) : @input)
    InputsScanner.new(inputs, @encoding, chunk_size: 1)
  end
else
  # Builds the scanner for #parse. When the complete data is already in one
  # string (a StringIO input, or a single sample of an exhausted input) a
  # plain Scanner is used; otherwise an InputsScanner reads the buffered
  # samples followed by the input itself.
  def build_scanner
    whole =
      if @samples.empty? and @input.is_a?(StringIO)
        @input.string
      elsif @samples.size == 1 and @input.respond_to?(:eof?) and @input.eof?
        @samples[0]
      end

    unless whole
      inputs = @samples.collect { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding)
    end

    unless whole.valid_encoding?
      message = "Invalid byte sequence in #{@encoding}"
      raise MalformedCSVError.new(message, @lineno + 1)
    end
    Scanner.new(whole)
  end
end
|
552
|
+
|
553
|
+
# Consumes leading lines that match the :skip_lines setting and leaves the
# scanner at the start of the first line that must be parsed.
#
# The loop stops at end of input. Without that check a skip pattern that
# matches an empty line (for example /\A\s*\z/) would keep "skipping" the
# empty remainder forever.
def skip_needless_lines
  return unless @skip_lines

  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    if skip_line?(line)
      @scanner.keep_drop
    else
      # Not a skipped line: rewind so the parser sees it from its start.
      @scanner.keep_back
      return
    end
  end
end
|
568
|
+
|
569
|
+
# Tells whether +line+ is to be skipped: a String setting skips lines that
# contain it, a Regexp skips lines it matches, and any other object is asked
# through its #match.
def skip_line?(line)
  matcher = @skip_lines
  if matcher.is_a?(String)
    line.include?(matcher)
  elsif matcher.is_a?(Regexp)
    matcher.match?(line)
  else
    matcher.match(line)
  end
end
|
579
|
+
|
580
|
+
# Reads one field value at the scanner position and returns it, or nil when
# no value is present.
#
# In strict mode the quoted and unquoted parsers are tried in the order
# suggested by @may_quoted. In liberal mode a quoted value may be followed
# by unquoted text; the field is then returned with its quotes kept.
def parse_column_value
  unless @liberal_parsing
    return parse_quoted_column_value || parse_unquoted_column_value if @may_quoted
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  trailing = parse_unquoted_column_value
  return quoted unless trailing

  @quote_character + quoted + @quote_character + trailing
end
|
601
|
+
|
602
|
+
# Scans an unquoted value. Returns it (or nil) and records a successful scan
# in @unquoted_column_value for error reporting in #parse.
def parse_unquoted_column_value
  @scanner.scan_all(@unquoted_value).tap do |value|
    @unquoted_column_value = true if value
  end
end
|
607
|
+
|
608
|
+
# Scans a quoted value and returns its content with doubled quote
# characters collapsed to single ones. Returns nil when the scanner is not
# at a quote character. Raises MalformedCSVError when the closing quote is
# missing.
#
# Quote characters are consumed in runs. An even-length run at the start is
# a complete field made only of escaped quotes (possibly empty). An odd
# run opens the field, and the first later odd-length run closes it.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  count = opening.size
  return opening[0, (count - 2) / 2] if count.even?

  value = opening[0, (count - 1) / 2]
  while true
    text = @scanner.scan_all(@quoted_value)
    value << text if text
    run = @scanner.scan_all(@quotes)
    unless run
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno + 1)
    end
    count = run.size
    # Every pair in the run is one literal quote; an odd run additionally
    # contains the closing quote (a lone closing quote adds nothing).
    value << run[0, count / 2]
    break if count.odd?
  end
  value
end
|
639
|
+
|
640
|
+
# Consumes a column separator at the scanner position and returns whether
# one was found.
#
# A multi-character separator that does not match in one scan is retried
# character by character; on failure the scanner is rewound.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  complete = @column_ends.all? { |pattern| @scanner.scan(pattern) }
  complete ? @scanner.keep_drop : @scanner.keep_back
  complete
end
|
653
|
+
|
654
|
+
# Consumes a row separator at the scanner position and returns whether one
# was found.
#
# A multi-character separator that does not match in one scan is retried
# character by character; on failure the scanner is rewound.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  complete = @row_ends.all? { |pattern| @scanner.scan(pattern) }
  complete ? @scanner.keep_drop : @scanner.keep_back
  complete
end
|
666
|
+
|
667
|
+
# Begins a new row: closes the previous row's keep and opens a fresh one.
#
# When #last_line was called for the previous row, the keep has already been
# popped by it, so only the cached line is cleared.
def start_row
  line_was_read = @last_line
  @last_line = nil if line_was_read
  @scanner.keep_drop unless line_was_read
  @scanner.keep_start
end
|
675
|
+
|
676
|
+
# Counts the row, converts it and yields it to the block.
#
# Without headers the converted field Array is yielded. With headers a Row
# is yielded; the first row seen while no headers are known becomes the
# headers and is only yielded when :return_headers is set. When
# :unconverted_fields is set, the raw fields are attached to the result.
def emit_row(row, &block)
  @lineno += 1

  raw_row = row
  row =
    if !@use_headers
      # convert fields, if needed...
      @fields_converter.convert(raw_row, nil, @lineno)
    elsif @headers.nil?
      @headers = adjust_headers(row)
      return unless @return_headers
      Row.new(@headers, row, true)
    else
      Row.new(@headers,
              @fields_converter.convert(raw_row, @headers, @lineno))
    end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
|
701
|
+
|
702
|
+
# Gives +row+ (and only that object) an <tt>unconverted_fields</tt> reader
# and stores +fields+ as the value it returns. Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.send(:attr_reader, :unconverted_fields)
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
|
712
|
+
end
|
713
|
+
end
|