csv 3.0.0 → 3.2.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +882 -0
- data/README.md +6 -3
- data/doc/csv/arguments/io.rdoc +5 -0
- data/doc/csv/options/common/col_sep.rdoc +57 -0
- data/doc/csv/options/common/quote_char.rdoc +42 -0
- data/doc/csv/options/common/row_sep.rdoc +91 -0
- data/doc/csv/options/generating/force_quotes.rdoc +17 -0
- data/doc/csv/options/generating/quote_empty.rdoc +12 -0
- data/doc/csv/options/generating/write_converters.rdoc +25 -0
- data/doc/csv/options/generating/write_empty_value.rdoc +15 -0
- data/doc/csv/options/generating/write_headers.rdoc +29 -0
- data/doc/csv/options/generating/write_nil_value.rdoc +14 -0
- data/doc/csv/options/parsing/converters.rdoc +46 -0
- data/doc/csv/options/parsing/empty_value.rdoc +13 -0
- data/doc/csv/options/parsing/field_size_limit.rdoc +39 -0
- data/doc/csv/options/parsing/header_converters.rdoc +43 -0
- data/doc/csv/options/parsing/headers.rdoc +63 -0
- data/doc/csv/options/parsing/liberal_parsing.rdoc +38 -0
- data/doc/csv/options/parsing/nil_value.rdoc +12 -0
- data/doc/csv/options/parsing/return_headers.rdoc +22 -0
- data/doc/csv/options/parsing/skip_blanks.rdoc +31 -0
- data/doc/csv/options/parsing/skip_lines.rdoc +37 -0
- data/doc/csv/options/parsing/strip.rdoc +15 -0
- data/doc/csv/options/parsing/unconverted_fields.rdoc +27 -0
- data/doc/csv/recipes/filtering.rdoc +158 -0
- data/doc/csv/recipes/generating.rdoc +298 -0
- data/doc/csv/recipes/parsing.rdoc +545 -0
- data/doc/csv/recipes/recipes.rdoc +6 -0
- data/lib/csv/core_ext/array.rb +1 -1
- data/lib/csv/core_ext/string.rb +1 -1
- data/lib/csv/fields_converter.rb +89 -0
- data/lib/csv/input_record_separator.rb +18 -0
- data/lib/csv/parser.rb +1288 -0
- data/lib/csv/row.rb +505 -136
- data/lib/csv/table.rb +791 -114
- data/lib/csv/version.rb +1 -1
- data/lib/csv/writer.rb +210 -0
- data/lib/csv.rb +2433 -1329
- metadata +66 -13
- data/news.md +0 -123
data/lib/csv/parser.rb
ADDED
@@ -0,0 +1,1288 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require "strscan"
|
4
|
+
|
5
|
+
require_relative "input_record_separator"
|
6
|
+
require_relative "row"
|
7
|
+
require_relative "table"
|
8
|
+
|
9
|
+
class CSV
|
10
|
+
# Note: Don't use this class directly. This is an internal class.
|
11
|
+
class Parser
|
12
|
+
#
|
13
|
+
# A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
|
14
|
+
# or String object being read from or written to. Your data is never transcoded
|
15
|
+
# (unless you ask Ruby to transcode it for you) and will literally be parsed in
|
16
|
+
# the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
|
17
|
+
# Encoding of your data. This is accomplished by transcoding the parser itself
|
18
|
+
# into your Encoding.
|
19
|
+
#
|
20
|
+
|
21
|
+
# Raised when encoding is invalid.
|
22
|
+
class InvalidEncoding < StandardError
|
23
|
+
end
|
24
|
+
|
25
|
+
# Raised when an unexpected case happens.
|
26
|
+
class UnexpectedError < StandardError
|
27
|
+
end
|
28
|
+
|
29
|
+
#
|
30
|
+
# CSV::Scanner receives a CSV output, scans it and returns the content.
|
31
|
+
# It also controls the life cycle of the object with its methods +keep_start+,
|
32
|
+
# +keep_end+, +keep_back+, +keep_drop+.
|
33
|
+
#
|
34
|
+
# Uses StringScanner (the official strscan gem). Strscan provides lexical
|
35
|
+
# scanning operations on a String. We inherit its object and take advantage
|
36
|
+
# of its methods. For more information, please visit:
|
37
|
+
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
|
38
|
+
#
|
39
|
+
class Scanner < StringScanner
  # The whole input is held in one String, so a plain +scan+ already sees
  # everything there is to match; +scan_all+ is therefore just an alias.
  alias_method :scan_all, :scan

  # Accepts the same arguments as StringScanner.new and starts with an
  # empty stack of kept positions.
  def initialize(*args)
    super
    @keeps = []
  end

  # Yields each line of the unscanned rest of the string, split on
  # +row_separator+. The scan pointer is advanced past a line (by its byte
  # size) before that line is yielded.
  #
  # Returns an Enumerator when no block is given.
  def each_line(row_separator)
    return enum_for(__method__, row_separator) unless block_given?

    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current byte position on the keep stack.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the most recent kept position and returns the bytes between it and
  # the current position.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the most recent kept position and rewinds the scan pointer to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Discards the most recent kept position without moving the scan pointer.
  def keep_drop
    @keeps.pop
  end
end
|
73
|
+
|
74
|
+
#
|
75
|
+
# CSV::InputsScanner receives IO inputs, encoding, row separator and the chunk_size.
|
76
|
+
# It also controls the life cycle of the object with its methods +keep_start+,
|
77
|
+
# +keep_end+, +keep_back+, +keep_drop+.
|
78
|
+
#
|
79
|
+
# CSV::InputsScanner.scan() tries to match with pattern at the current position.
|
80
|
+
# If there's a match, the scanner advances the "scan pointer" and returns the matched string.
|
81
|
+
# Otherwise, the scanner returns nil.
|
82
|
+
#
|
83
|
+
# CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer).
|
84
|
+
# If there is no more data (eos? = true), it returns "".
|
85
|
+
#
|
86
|
+
class InputsScanner
|
87
|
+
def initialize(inputs, encoding, row_separator, chunk_size: 8192)
|
88
|
+
@inputs = inputs.dup
|
89
|
+
@encoding = encoding
|
90
|
+
@row_separator = row_separator
|
91
|
+
@chunk_size = chunk_size
|
92
|
+
@last_scanner = @inputs.empty?
|
93
|
+
@keeps = []
|
94
|
+
read_chunk
|
95
|
+
end
|
96
|
+
|
97
|
+
def each_line(row_separator)
|
98
|
+
return enum_for(__method__, row_separator) unless block_given?
|
99
|
+
buffer = nil
|
100
|
+
input = @scanner.rest
|
101
|
+
position = @scanner.pos
|
102
|
+
offset = 0
|
103
|
+
n_row_separator_chars = row_separator.size
|
104
|
+
# trace(__method__, :start, input)
|
105
|
+
while true
|
106
|
+
input.each_line(row_separator) do |line|
|
107
|
+
@scanner.pos += line.bytesize
|
108
|
+
if buffer
|
109
|
+
if n_row_separator_chars == 2 and
|
110
|
+
buffer.end_with?(row_separator[0]) and
|
111
|
+
line.start_with?(row_separator[1])
|
112
|
+
buffer << line[0]
|
113
|
+
line = line[1..-1]
|
114
|
+
position += buffer.bytesize + offset
|
115
|
+
@scanner.pos = position
|
116
|
+
offset = 0
|
117
|
+
yield(buffer)
|
118
|
+
buffer = nil
|
119
|
+
next if line.empty?
|
120
|
+
else
|
121
|
+
buffer << line
|
122
|
+
line = buffer
|
123
|
+
buffer = nil
|
124
|
+
end
|
125
|
+
end
|
126
|
+
if line.end_with?(row_separator)
|
127
|
+
position += line.bytesize + offset
|
128
|
+
@scanner.pos = position
|
129
|
+
offset = 0
|
130
|
+
yield(line)
|
131
|
+
else
|
132
|
+
buffer = line
|
133
|
+
end
|
134
|
+
end
|
135
|
+
break unless read_chunk
|
136
|
+
input = @scanner.rest
|
137
|
+
position = @scanner.pos
|
138
|
+
offset = -buffer.bytesize if buffer
|
139
|
+
end
|
140
|
+
yield(buffer) if buffer
|
141
|
+
end
|
142
|
+
|
143
|
+
def scan(pattern)
|
144
|
+
# trace(__method__, pattern, :start)
|
145
|
+
value = @scanner.scan(pattern)
|
146
|
+
# trace(__method__, pattern, :done, :last, value) if @last_scanner
|
147
|
+
return value if @last_scanner
|
148
|
+
|
149
|
+
read_chunk if value and @scanner.eos?
|
150
|
+
# trace(__method__, pattern, :done, value)
|
151
|
+
value
|
152
|
+
end
|
153
|
+
|
154
|
+
def scan_all(pattern)
|
155
|
+
# trace(__method__, pattern, :start)
|
156
|
+
value = @scanner.scan(pattern)
|
157
|
+
# trace(__method__, pattern, :done, :last, value) if @last_scanner
|
158
|
+
return value if @last_scanner
|
159
|
+
|
160
|
+
# trace(__method__, pattern, :done, :nil) if value.nil?
|
161
|
+
return nil if value.nil?
|
162
|
+
while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
|
163
|
+
# trace(__method__, pattern, :sub, sub_value)
|
164
|
+
value << sub_value
|
165
|
+
end
|
166
|
+
# trace(__method__, pattern, :done, value)
|
167
|
+
value
|
168
|
+
end
|
169
|
+
|
170
|
+
def eos?
|
171
|
+
@scanner.eos?
|
172
|
+
end
|
173
|
+
|
174
|
+
def keep_start
|
175
|
+
# trace(__method__, :start)
|
176
|
+
adjust_last_keep
|
177
|
+
@keeps.push([@scanner, @scanner.pos, nil])
|
178
|
+
# trace(__method__, :done)
|
179
|
+
end
|
180
|
+
|
181
|
+
def keep_end
|
182
|
+
# trace(__method__, :start)
|
183
|
+
scanner, start, buffer = @keeps.pop
|
184
|
+
if scanner == @scanner
|
185
|
+
keep = @scanner.string.byteslice(start, @scanner.pos - start)
|
186
|
+
else
|
187
|
+
keep = @scanner.string.byteslice(0, @scanner.pos)
|
188
|
+
end
|
189
|
+
if buffer
|
190
|
+
buffer << keep
|
191
|
+
keep = buffer
|
192
|
+
end
|
193
|
+
# trace(__method__, :done, keep)
|
194
|
+
keep
|
195
|
+
end
|
196
|
+
|
197
|
+
def keep_back
|
198
|
+
# trace(__method__, :start)
|
199
|
+
scanner, start, buffer = @keeps.pop
|
200
|
+
if buffer
|
201
|
+
# trace(__method__, :rescan, start, buffer)
|
202
|
+
string = @scanner.string
|
203
|
+
if scanner == @scanner
|
204
|
+
keep = string.byteslice(start,
|
205
|
+
string.bytesize - @scanner.pos - start)
|
206
|
+
else
|
207
|
+
keep = string
|
208
|
+
end
|
209
|
+
if keep and not keep.empty?
|
210
|
+
@inputs.unshift(StringIO.new(keep))
|
211
|
+
@last_scanner = false
|
212
|
+
end
|
213
|
+
@scanner = StringScanner.new(buffer)
|
214
|
+
else
|
215
|
+
if @scanner != scanner
|
216
|
+
message = "scanners are different but no buffer: "
|
217
|
+
message += "#{@scanner.inspect}(#{@scanner.object_id}): "
|
218
|
+
message += "#{scanner.inspect}(#{scanner.object_id})"
|
219
|
+
raise UnexpectedError, message
|
220
|
+
end
|
221
|
+
# trace(__method__, :repos, start, buffer)
|
222
|
+
@scanner.pos = start
|
223
|
+
end
|
224
|
+
read_chunk if @scanner.eos?
|
225
|
+
end
|
226
|
+
|
227
|
+
def keep_drop
|
228
|
+
_, _, buffer = @keeps.pop
|
229
|
+
# trace(__method__, :done, :empty) unless buffer
|
230
|
+
return unless buffer
|
231
|
+
|
232
|
+
last_keep = @keeps.last
|
233
|
+
# trace(__method__, :done, :no_last_keep) unless last_keep
|
234
|
+
return unless last_keep
|
235
|
+
|
236
|
+
if last_keep[2]
|
237
|
+
last_keep[2] << buffer
|
238
|
+
else
|
239
|
+
last_keep[2] = buffer
|
240
|
+
end
|
241
|
+
# trace(__method__, :done)
|
242
|
+
end
|
243
|
+
|
244
|
+
def rest
|
245
|
+
@scanner.rest
|
246
|
+
end
|
247
|
+
|
248
|
+
def check(pattern)
|
249
|
+
@scanner.check(pattern)
|
250
|
+
end
|
251
|
+
|
252
|
+
private
|
253
|
+
def trace(*args)
|
254
|
+
pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
|
255
|
+
end
|
256
|
+
|
257
|
+
def adjust_last_keep
|
258
|
+
# trace(__method__, :start)
|
259
|
+
|
260
|
+
keep = @keeps.last
|
261
|
+
# trace(__method__, :done, :empty) if keep.nil?
|
262
|
+
return if keep.nil?
|
263
|
+
|
264
|
+
scanner, start, buffer = keep
|
265
|
+
string = @scanner.string
|
266
|
+
if @scanner != scanner
|
267
|
+
start = 0
|
268
|
+
end
|
269
|
+
if start == 0 and @scanner.eos?
|
270
|
+
keep_data = string
|
271
|
+
else
|
272
|
+
keep_data = string.byteslice(start, @scanner.pos - start)
|
273
|
+
end
|
274
|
+
if keep_data
|
275
|
+
if buffer
|
276
|
+
buffer << keep_data
|
277
|
+
else
|
278
|
+
keep[2] = keep_data.dup
|
279
|
+
end
|
280
|
+
end
|
281
|
+
|
282
|
+
# trace(__method__, :done)
|
283
|
+
end
|
284
|
+
|
285
|
+
def read_chunk
|
286
|
+
return false if @last_scanner
|
287
|
+
|
288
|
+
adjust_last_keep
|
289
|
+
|
290
|
+
input = @inputs.first
|
291
|
+
case input
|
292
|
+
when StringIO
|
293
|
+
string = input.read
|
294
|
+
raise InvalidEncoding unless string.valid_encoding?
|
295
|
+
# trace(__method__, :stringio, string)
|
296
|
+
@scanner = StringScanner.new(string)
|
297
|
+
@inputs.shift
|
298
|
+
@last_scanner = @inputs.empty?
|
299
|
+
true
|
300
|
+
else
|
301
|
+
chunk = input.gets(@row_separator, @chunk_size)
|
302
|
+
if chunk
|
303
|
+
raise InvalidEncoding unless chunk.valid_encoding?
|
304
|
+
# trace(__method__, :chunk, chunk)
|
305
|
+
@scanner = StringScanner.new(chunk)
|
306
|
+
if input.respond_to?(:eof?) and input.eof?
|
307
|
+
@inputs.shift
|
308
|
+
@last_scanner = @inputs.empty?
|
309
|
+
end
|
310
|
+
true
|
311
|
+
else
|
312
|
+
# trace(__method__, :no_chunk)
|
313
|
+
@scanner = StringScanner.new("".encode(@encoding))
|
314
|
+
@inputs.shift
|
315
|
+
@last_scanner = @inputs.empty?
|
316
|
+
if @last_scanner
|
317
|
+
false
|
318
|
+
else
|
319
|
+
read_chunk
|
320
|
+
end
|
321
|
+
end
|
322
|
+
end
|
323
|
+
end
|
324
|
+
end
|
325
|
+
|
326
|
+
def initialize(input, options)
|
327
|
+
@input = input
|
328
|
+
@options = options
|
329
|
+
@samples = []
|
330
|
+
|
331
|
+
prepare
|
332
|
+
end
|
333
|
+
|
334
|
+
def column_separator
|
335
|
+
@column_separator
|
336
|
+
end
|
337
|
+
|
338
|
+
def row_separator
|
339
|
+
@row_separator
|
340
|
+
end
|
341
|
+
|
342
|
+
def quote_character
|
343
|
+
@quote_character
|
344
|
+
end
|
345
|
+
|
346
|
+
def field_size_limit
|
347
|
+
@max_field_size&.succ
|
348
|
+
end
|
349
|
+
|
350
|
+
def max_field_size
|
351
|
+
@max_field_size
|
352
|
+
end
|
353
|
+
|
354
|
+
def skip_lines
|
355
|
+
@skip_lines
|
356
|
+
end
|
357
|
+
|
358
|
+
def unconverted_fields?
|
359
|
+
@unconverted_fields
|
360
|
+
end
|
361
|
+
|
362
|
+
def headers
|
363
|
+
@headers
|
364
|
+
end
|
365
|
+
|
366
|
+
def header_row?
|
367
|
+
@use_headers and @headers.nil?
|
368
|
+
end
|
369
|
+
|
370
|
+
def return_headers?
|
371
|
+
@return_headers
|
372
|
+
end
|
373
|
+
|
374
|
+
def skip_blanks?
|
375
|
+
@skip_blanks
|
376
|
+
end
|
377
|
+
|
378
|
+
def liberal_parsing?
|
379
|
+
@liberal_parsing
|
380
|
+
end
|
381
|
+
|
382
|
+
def lineno
|
383
|
+
@lineno
|
384
|
+
end
|
385
|
+
|
386
|
+
def line
|
387
|
+
last_line
|
388
|
+
end
|
389
|
+
|
390
|
+
def parse(&block)
|
391
|
+
return to_enum(__method__) unless block_given?
|
392
|
+
|
393
|
+
if @return_headers and @headers and @raw_headers
|
394
|
+
headers = Row.new(@headers, @raw_headers, true)
|
395
|
+
if @unconverted_fields
|
396
|
+
headers = add_unconverted_fields(headers, [])
|
397
|
+
end
|
398
|
+
yield headers
|
399
|
+
end
|
400
|
+
|
401
|
+
begin
|
402
|
+
@scanner ||= build_scanner
|
403
|
+
if quote_character.nil?
|
404
|
+
parse_no_quote(&block)
|
405
|
+
elsif @need_robust_parsing
|
406
|
+
parse_quotable_robust(&block)
|
407
|
+
else
|
408
|
+
parse_quotable_loose(&block)
|
409
|
+
end
|
410
|
+
rescue InvalidEncoding
|
411
|
+
if @scanner
|
412
|
+
ignore_broken_line
|
413
|
+
lineno = @lineno
|
414
|
+
else
|
415
|
+
lineno = @lineno + 1
|
416
|
+
end
|
417
|
+
raise InvalidEncodingError.new(@encoding, lineno)
|
418
|
+
rescue UnexpectedError => error
|
419
|
+
if @scanner
|
420
|
+
ignore_broken_line
|
421
|
+
lineno = @lineno
|
422
|
+
else
|
423
|
+
lineno = @lineno + 1
|
424
|
+
end
|
425
|
+
message = "This should not be happen: #{error.message}: "
|
426
|
+
message += "Please report this to https://github.com/ruby/csv/issues"
|
427
|
+
raise MalformedCSVError.new(message, lineno)
|
428
|
+
end
|
429
|
+
end
|
430
|
+
|
431
|
+
def use_headers?
|
432
|
+
@use_headers
|
433
|
+
end
|
434
|
+
|
435
|
+
private
|
436
|
+
# A set of tasks to prepare the file in order to parse it
|
437
|
+
def prepare
|
438
|
+
prepare_variable
|
439
|
+
prepare_quote_character
|
440
|
+
prepare_backslash
|
441
|
+
prepare_skip_lines
|
442
|
+
prepare_strip
|
443
|
+
prepare_separators
|
444
|
+
validate_strip_and_col_sep_options
|
445
|
+
prepare_quoted
|
446
|
+
prepare_unquoted
|
447
|
+
prepare_line
|
448
|
+
prepare_header
|
449
|
+
prepare_parser
|
450
|
+
end
|
451
|
+
|
452
|
+
# Copies the simple parser settings out of @options into instance variables
# and resets the robust-parsing flag.
#
# :liberal_parsing may be any truthy value or a Hash carrying the
# :double_quote_outside_quote and :backslash_quote sub-options. Any truthy
# value turns on @liberal_parsing and @need_robust_parsing.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]

  liberal_parsing = @options[:liberal_parsing]
  if liberal_parsing
    @liberal_parsing = true
    if liberal_parsing.is_a?(Hash)
      @double_quote_outside_quote =
        liberal_parsing[:double_quote_outside_quote]
      @backslash_quote = liberal_parsing[:backslash_quote]
    else
      @double_quote_outside_quote = false
      @backslash_quote = false
    end
    @need_robust_parsing = true
  else
    @liberal_parsing = false
    # Assign explicitly so that every branch leaves the same set of
    # instance variables initialized.
    @double_quote_outside_quote = false
    @backslash_quote = false
  end

  @unconverted_fields = @options[:unconverted_fields]
  @max_field_size = @options[:max_field_size]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
|
477
|
+
|
478
|
+
def prepare_quote_character
|
479
|
+
@quote_character = @options[:quote_character]
|
480
|
+
if @quote_character.nil?
|
481
|
+
@escaped_quote_character = nil
|
482
|
+
@escaped_quote = nil
|
483
|
+
else
|
484
|
+
@quote_character = @quote_character.to_s.encode(@encoding)
|
485
|
+
if @quote_character.length != 1
|
486
|
+
message = ":quote_char has to be nil or a single character String"
|
487
|
+
raise ArgumentError, message
|
488
|
+
end
|
489
|
+
@escaped_quote_character = Regexp.escape(@quote_character)
|
490
|
+
@escaped_quote = Regexp.new(@escaped_quote_character)
|
491
|
+
end
|
492
|
+
end
|
493
|
+
|
494
|
+
def prepare_backslash
|
495
|
+
return unless @backslash_quote
|
496
|
+
|
497
|
+
@backslash_character = "\\".encode(@encoding)
|
498
|
+
|
499
|
+
@escaped_backslash_character = Regexp.escape(@backslash_character)
|
500
|
+
@escaped_backslash = Regexp.new(@escaped_backslash_character)
|
501
|
+
if @quote_character.nil?
|
502
|
+
@backslash_quote_character = nil
|
503
|
+
else
|
504
|
+
@backslash_quote_character =
|
505
|
+
@backslash_character + @escaped_quote_character
|
506
|
+
end
|
507
|
+
end
|
508
|
+
|
509
|
+
def prepare_skip_lines
|
510
|
+
skip_lines = @options[:skip_lines]
|
511
|
+
case skip_lines
|
512
|
+
when String
|
513
|
+
@skip_lines = skip_lines.encode(@encoding)
|
514
|
+
when Regexp, nil
|
515
|
+
@skip_lines = skip_lines
|
516
|
+
else
|
517
|
+
unless skip_lines.respond_to?(:match)
|
518
|
+
message =
|
519
|
+
":skip_lines has to respond to \#match: #{skip_lines.inspect}"
|
520
|
+
raise ArgumentError, message
|
521
|
+
end
|
522
|
+
@skip_lines = skip_lines
|
523
|
+
end
|
524
|
+
end
|
525
|
+
|
526
|
+
def prepare_strip
|
527
|
+
@strip = @options[:strip]
|
528
|
+
@escaped_strip = nil
|
529
|
+
@strip_value = nil
|
530
|
+
@rstrip_value = nil
|
531
|
+
if @strip.is_a?(String)
|
532
|
+
case @strip.length
|
533
|
+
when 0
|
534
|
+
raise ArgumentError, ":strip must not be an empty String"
|
535
|
+
when 1
|
536
|
+
# ok
|
537
|
+
else
|
538
|
+
raise ArgumentError, ":strip doesn't support 2 or more characters yet"
|
539
|
+
end
|
540
|
+
@strip = @strip.encode(@encoding)
|
541
|
+
@escaped_strip = Regexp.escape(@strip)
|
542
|
+
if @quote_character
|
543
|
+
@strip_value = Regexp.new(@escaped_strip +
|
544
|
+
"+".encode(@encoding))
|
545
|
+
@rstrip_value = Regexp.new(@escaped_strip +
|
546
|
+
"+\\z".encode(@encoding))
|
547
|
+
end
|
548
|
+
@need_robust_parsing = true
|
549
|
+
elsif @strip
|
550
|
+
strip_values = " \t\f\v"
|
551
|
+
@escaped_strip = strip_values.encode(@encoding)
|
552
|
+
if @quote_character
|
553
|
+
@strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
|
554
|
+
@rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
|
555
|
+
end
|
556
|
+
@need_robust_parsing = true
|
557
|
+
end
|
558
|
+
end
|
559
|
+
|
560
|
+
begin
|
561
|
+
StringScanner.new("x").scan("x")
|
562
|
+
rescue TypeError
|
563
|
+
STRING_SCANNER_SCAN_ACCEPT_STRING = false
|
564
|
+
else
|
565
|
+
STRING_SCANNER_SCAN_ACCEPT_STRING = true
|
566
|
+
end
|
567
|
+
|
568
|
+
def prepare_separators
|
569
|
+
column_separator = @options[:column_separator]
|
570
|
+
@column_separator = column_separator.to_s.encode(@encoding)
|
571
|
+
if @column_separator.size < 1
|
572
|
+
message = ":col_sep must be 1 or more characters: "
|
573
|
+
message += column_separator.inspect
|
574
|
+
raise ArgumentError, message
|
575
|
+
end
|
576
|
+
@row_separator =
|
577
|
+
resolve_row_separator(@options[:row_separator]).encode(@encoding)
|
578
|
+
|
579
|
+
@escaped_column_separator = Regexp.escape(@column_separator)
|
580
|
+
@escaped_first_column_separator = Regexp.escape(@column_separator[0])
|
581
|
+
if @column_separator.size > 1
|
582
|
+
@column_end = Regexp.new(@escaped_column_separator)
|
583
|
+
@column_ends = @column_separator.each_char.collect do |char|
|
584
|
+
Regexp.new(Regexp.escape(char))
|
585
|
+
end
|
586
|
+
@first_column_separators = Regexp.new(@escaped_first_column_separator +
|
587
|
+
"+".encode(@encoding))
|
588
|
+
else
|
589
|
+
if STRING_SCANNER_SCAN_ACCEPT_STRING
|
590
|
+
@column_end = @column_separator
|
591
|
+
else
|
592
|
+
@column_end = Regexp.new(@escaped_column_separator)
|
593
|
+
end
|
594
|
+
@column_ends = nil
|
595
|
+
@first_column_separators = nil
|
596
|
+
end
|
597
|
+
|
598
|
+
escaped_row_separator = Regexp.escape(@row_separator)
|
599
|
+
@row_end = Regexp.new(escaped_row_separator)
|
600
|
+
if @row_separator.size > 1
|
601
|
+
@row_ends = @row_separator.each_char.collect do |char|
|
602
|
+
Regexp.new(Regexp.escape(char))
|
603
|
+
end
|
604
|
+
else
|
605
|
+
@row_ends = nil
|
606
|
+
end
|
607
|
+
|
608
|
+
@cr = "\r".encode(@encoding)
|
609
|
+
@lf = "\n".encode(@encoding)
|
610
|
+
@line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
|
611
|
+
@not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
|
612
|
+
end
|
613
|
+
|
614
|
+
# This method verifies that there are no (obvious) ambiguities with the
|
615
|
+
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
|
616
|
+
# and +strip+ were both equal to +\t+, then there would be no clear way to
|
617
|
+
# parse the input.
|
618
|
+
def validate_strip_and_col_sep_options
|
619
|
+
return unless @strip
|
620
|
+
|
621
|
+
if @strip.is_a?(String)
|
622
|
+
if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
|
623
|
+
raise ArgumentError,
|
624
|
+
"The provided strip (#{@escaped_strip}) and " \
|
625
|
+
"col_sep (#{@escaped_column_separator}) options are incompatible."
|
626
|
+
end
|
627
|
+
else
|
628
|
+
if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
|
629
|
+
raise ArgumentError,
|
630
|
+
"The provided strip (true) and " \
|
631
|
+
"col_sep (#{@escaped_column_separator}) options are incompatible."
|
632
|
+
end
|
633
|
+
end
|
634
|
+
end
|
635
|
+
|
636
|
+
# Builds the regular expressions used for quoted values and the pattern
# used to split a line into columns.
#
# Sets @quotes and @quoted_value when a quote character is configured, and
# always sets @split_column_separator.
def prepare_quoted
  if @quote_character
    # One or more consecutive quote characters.
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    # A run of characters that are neither a quote nor (when backslash
    # quoting is enabled) a backslash.
    no_quoted_values = @escaped_quote_character.dup
    if @backslash_quote
      no_quoted_values << @escaped_backslash_character
    end
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end

  if @escaped_strip
    # @escaped_strip may hold several characters (" \t\f\v" for strip: true),
    # so it has to be wrapped in a character class: "any run of strip
    # characters", not "this exact sequence with its last character repeated".
    optional_strip = "[".encode(@encoding) +
                     @escaped_strip +
                     "]*".encode(@encoding)
    @split_column_separator = Regexp.new(optional_strip +
                                         @escaped_column_separator +
                                         optional_strip)
  elsif @column_separator == " ".encode(@encoding)
    # String#split treats a single-space String specially (awk-style
    # whitespace splitting), so use a Regexp for a literal space.
    @split_column_separator = Regexp.new(@escaped_column_separator)
  else
    @split_column_separator = @column_separator
  end
end
|
662
|
+
|
663
|
+
def prepare_unquoted
|
664
|
+
return if @quote_character.nil?
|
665
|
+
|
666
|
+
no_unquoted_values = "\r\n".encode(@encoding)
|
667
|
+
no_unquoted_values << @escaped_first_column_separator
|
668
|
+
unless @liberal_parsing
|
669
|
+
no_unquoted_values << @escaped_quote_character
|
670
|
+
end
|
671
|
+
@unquoted_value = Regexp.new("[^".encode(@encoding) +
|
672
|
+
no_unquoted_values +
|
673
|
+
"]+".encode(@encoding))
|
674
|
+
end
|
675
|
+
|
676
|
+
def resolve_row_separator(separator)
|
677
|
+
if separator == :auto
|
678
|
+
cr = "\r".encode(@encoding)
|
679
|
+
lf = "\n".encode(@encoding)
|
680
|
+
if @input.is_a?(StringIO)
|
681
|
+
pos = @input.pos
|
682
|
+
separator = detect_row_separator(@input.read, cr, lf)
|
683
|
+
@input.seek(pos)
|
684
|
+
elsif @input.respond_to?(:gets)
|
685
|
+
if @input.is_a?(File)
|
686
|
+
chunk_size = 32 * 1024
|
687
|
+
else
|
688
|
+
chunk_size = 1024
|
689
|
+
end
|
690
|
+
begin
|
691
|
+
while separator == :auto
|
692
|
+
#
|
693
|
+
# if we run out of data, it's probably a single line
|
694
|
+
# (the :auto fallback after this loop will set the default value)
|
695
|
+
#
|
696
|
+
break unless sample = @input.gets(nil, chunk_size)
|
697
|
+
|
698
|
+
# extend sample if we're unsure of the line ending
|
699
|
+
if sample.end_with?(cr)
|
700
|
+
sample << (@input.gets(nil, 1) || "")
|
701
|
+
end
|
702
|
+
|
703
|
+
@samples << sample
|
704
|
+
|
705
|
+
separator = detect_row_separator(sample, cr, lf)
|
706
|
+
end
|
707
|
+
rescue IOError
|
708
|
+
# do nothing: the :auto fallback below will set the default
|
709
|
+
end
|
710
|
+
end
|
711
|
+
separator = InputRecordSeparator.value if separator == :auto
|
712
|
+
end
|
713
|
+
separator.to_s.encode(@encoding)
|
714
|
+
end
|
715
|
+
|
716
|
+
# Guesses the row separator used in +sample+.
#
# +cr+ and +lf+ are the carriage-return and line-feed strings to look for.
#
# Returns +cr + lf+, +cr+ or +lf+ depending on which line ending appears
# first in +sample+, or +:auto+ when the sample contains neither.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  # Only a CR that appears before the first LF is relevant, so when an LF
  # exists the search is limited to the text in front of it.
  cr_index = if lf_index
               sample[0, lf_index].index(cr)
             else
               sample.index(cr)
             end

  if cr_index.nil?
    lf_index ? lf : :auto
  elsif lf_index and cr_index + 1 == lf_index
    # The first CR is immediately followed by the first LF.
    cr + lf
  else
    cr
  end
end
|
739
|
+
|
740
|
+
def prepare_line
|
741
|
+
@lineno = 0
|
742
|
+
@last_line = nil
|
743
|
+
@scanner = nil
|
744
|
+
end
|
745
|
+
|
746
|
+
def last_line
|
747
|
+
if @scanner
|
748
|
+
@last_line ||= @scanner.keep_end
|
749
|
+
else
|
750
|
+
@last_line
|
751
|
+
end
|
752
|
+
end
|
753
|
+
|
754
|
+
def prepare_header
|
755
|
+
@return_headers = @options[:return_headers]
|
756
|
+
|
757
|
+
headers = @options[:headers]
|
758
|
+
case headers
|
759
|
+
when Array
|
760
|
+
@raw_headers = headers
|
761
|
+
quoted_fields = [false] * @raw_headers.size
|
762
|
+
@use_headers = true
|
763
|
+
when String
|
764
|
+
@raw_headers, quoted_fields = parse_headers(headers)
|
765
|
+
@use_headers = true
|
766
|
+
when nil, false
|
767
|
+
@raw_headers = nil
|
768
|
+
@use_headers = false
|
769
|
+
else
|
770
|
+
@raw_headers = nil
|
771
|
+
@use_headers = true
|
772
|
+
end
|
773
|
+
if @raw_headers
|
774
|
+
@headers = adjust_headers(@raw_headers, quoted_fields)
|
775
|
+
else
|
776
|
+
@headers = nil
|
777
|
+
end
|
778
|
+
end
|
779
|
+
|
780
|
+
def parse_headers(row)
|
781
|
+
quoted_fields = []
|
782
|
+
converter = lambda do |field, info|
|
783
|
+
quoted_fields << info.quoted?
|
784
|
+
field
|
785
|
+
end
|
786
|
+
headers = CSV.parse_line(row,
|
787
|
+
col_sep: @column_separator,
|
788
|
+
row_sep: @row_separator,
|
789
|
+
quote_char: @quote_character,
|
790
|
+
converters: [converter])
|
791
|
+
[headers, quoted_fields]
|
792
|
+
end
|
793
|
+
|
794
|
+
def adjust_headers(headers, quoted_fields)
|
795
|
+
adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields)
|
796
|
+
adjusted_headers.each {|h| h.freeze if h.is_a? String}
|
797
|
+
adjusted_headers
|
798
|
+
end
|
799
|
+
|
800
|
+
def prepare_parser
|
801
|
+
@may_quoted = may_quoted?
|
802
|
+
end
|
803
|
+
|
804
|
+
def may_quoted?
|
805
|
+
return false if @quote_character.nil?
|
806
|
+
|
807
|
+
if @input.is_a?(StringIO)
|
808
|
+
pos = @input.pos
|
809
|
+
sample = @input.read
|
810
|
+
@input.seek(pos)
|
811
|
+
else
|
812
|
+
return false if @samples.empty?
|
813
|
+
sample = @samples.first
|
814
|
+
end
|
815
|
+
sample[0, 128].index(@quote_character)
|
816
|
+
end
|
817
|
+
|
818
|
+
class UnoptimizedStringIO # :nodoc:
|
819
|
+
def initialize(string)
|
820
|
+
@io = StringIO.new(string, "rb:#{string.encoding}")
|
821
|
+
end
|
822
|
+
|
823
|
+
def gets(*args)
|
824
|
+
@io.gets(*args)
|
825
|
+
end
|
826
|
+
|
827
|
+
def each_line(*args, &block)
|
828
|
+
@io.each_line(*args, &block)
|
829
|
+
end
|
830
|
+
|
831
|
+
def eof?
|
832
|
+
@io.eof?
|
833
|
+
end
|
834
|
+
end
|
835
|
+
|
836
|
+
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
|
837
|
+
if SCANNER_TEST
|
838
|
+
SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
|
839
|
+
SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
|
840
|
+
def build_scanner
|
841
|
+
inputs = @samples.collect do |sample|
|
842
|
+
UnoptimizedStringIO.new(sample)
|
843
|
+
end
|
844
|
+
if @input.is_a?(StringIO)
|
845
|
+
inputs << UnoptimizedStringIO.new(@input.read)
|
846
|
+
else
|
847
|
+
inputs << @input
|
848
|
+
end
|
849
|
+
begin
|
850
|
+
chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
|
851
|
+
rescue # Ractor::IsolationError
|
852
|
+
# Ractor on Ruby 3.0 can't read ENV value.
|
853
|
+
chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE
|
854
|
+
end
|
855
|
+
chunk_size = Integer((chunk_size_value || "1"), 10)
|
856
|
+
InputsScanner.new(inputs,
|
857
|
+
@encoding,
|
858
|
+
@row_separator,
|
859
|
+
chunk_size: chunk_size)
|
860
|
+
end
|
861
|
+
else
|
862
|
+
def build_scanner
|
863
|
+
string = nil
|
864
|
+
if @samples.empty? and @input.is_a?(StringIO)
|
865
|
+
string = @input.read
|
866
|
+
elsif @samples.size == 1 and
|
867
|
+
@input != ARGF and
|
868
|
+
@input.respond_to?(:eof?) and
|
869
|
+
@input.eof?
|
870
|
+
string = @samples[0]
|
871
|
+
end
|
872
|
+
if string
|
873
|
+
unless string.valid_encoding?
|
874
|
+
index = string.lines(@row_separator).index do |line|
|
875
|
+
!line.valid_encoding?
|
876
|
+
end
|
877
|
+
if index
|
878
|
+
raise InvalidEncodingError.new(@encoding, @lineno + index + 1)
|
879
|
+
end
|
880
|
+
end
|
881
|
+
Scanner.new(string)
|
882
|
+
else
|
883
|
+
inputs = @samples.collect do |sample|
|
884
|
+
StringIO.new(sample)
|
885
|
+
end
|
886
|
+
inputs << @input
|
887
|
+
InputsScanner.new(inputs, @encoding, @row_separator)
|
888
|
+
end
|
889
|
+
end
|
890
|
+
end
|
891
|
+
|
892
|
+
def skip_needless_lines
|
893
|
+
return unless @skip_lines
|
894
|
+
|
895
|
+
until @scanner.eos?
|
896
|
+
@scanner.keep_start
|
897
|
+
line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
|
898
|
+
line << @row_separator if parse_row_end
|
899
|
+
if skip_line?(line)
|
900
|
+
@lineno += 1
|
901
|
+
@scanner.keep_drop
|
902
|
+
else
|
903
|
+
@scanner.keep_back
|
904
|
+
return
|
905
|
+
end
|
906
|
+
end
|
907
|
+
end
|
908
|
+
|
909
|
+
def skip_line?(line)
|
910
|
+
line = line.delete_suffix(@row_separator)
|
911
|
+
case @skip_lines
|
912
|
+
when String
|
913
|
+
line.include?(@skip_lines)
|
914
|
+
when Regexp
|
915
|
+
@skip_lines.match?(line)
|
916
|
+
else
|
917
|
+
@skip_lines.match(line)
|
918
|
+
end
|
919
|
+
end
|
920
|
+
|
921
|
+
def validate_field_size(field)
|
922
|
+
return unless @max_field_size
|
923
|
+
return if field.size <= @max_field_size
|
924
|
+
ignore_broken_line
|
925
|
+
message = "Field size exceeded: #{field.size} > #{@max_field_size}"
|
926
|
+
raise MalformedCSVError.new(message, @lineno)
|
927
|
+
end
|
928
|
+
|
929
|
+
# Parses input line by line without any quote handling: each line is simply
# split on the column separator.
#
# Lines matching +skip_lines+ are ignored, and so are empty lines when
# +skip_blanks+ is set. Empty fields become +nil+ and every field is
# reported as unquoted. Each resulting row is passed to +emit_row+.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines && skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks

      fields = []
      quoted_flags = []
    else
      # A limit of -1 keeps trailing empty fields.
      fields = strip_value(content).split(@split_column_separator, -1)
      quoted_flags = Array.new(fields.size, false)
      fields.each { |field| validate_field_size(field) } if @max_field_size
      fields.map! { |field| field.empty? ? nil : field }
    end

    @last_line = raw_line
    emit_row(fields, quoted_flags, &block)
  end
end
|
959
|
+
|
960
|
+
# Line-based parser for input that may contain simple quoted fields.
#
# Each line is split on the column separator. A field is accepted here only
# if it has no quote character at all, or is wrapped in exactly one pair of
# quotes with none inside. As soon as a line contains a CR/LF, or a field
# has any other quoting, the scanner is put back with +keep_back+,
# <tt>@need_robust_parsing</tt> is set and the work is handed over to
# +parse_quotable_robust+, whose result is returned.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |raw_line|
    if @skip_lines && skip_line?(raw_line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      fields = []
      quoted_flags = []
    elsif content.include?(@cr) || content.include?(@lf)
      # A line break left inside the line needs the robust parser.
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      fields = content.split(@split_column_separator, -1)
      quoted_flags = []
      fields.each_with_index do |field, index|
        if field.empty?
          quoted_flags << false
          fields[index] = nil
          next
        end

        quote_count = field.count(@quote_character)
        if quote_count.zero?
          quoted_flags << false
        elsif quote_count == 2 &&
              field.start_with?(@quote_character) &&
              field.end_with?(@quote_character)
          quoted_flags << true
          fields[index] = field[1..-2]
        else
          # Anything else (escaped or stray quotes) needs the robust parser.
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
        validate_field_size(fields[index])
      end
    end

    @scanner.keep_drop
    @scanner.keep_start
    @last_line = raw_line
    emit_row(fields, quoted_flags, &block)
  end
  @scanner.keep_drop
end
|
1020
|
+
|
1021
|
+
# Field-by-field parser that handles every quoting case.
#
# Repeatedly reads one column value and then looks at what follows it:
#
# * a column end: the value is appended to the current row;
# * a row end: the row is emitted (an entirely empty line is emitted as an
#   empty row unless +skip_blanks+ is set) and a new row is started;
# * end of input: a pending row, if any, is emitted and parsing stops;
# * anything else: the rest of the line is discarded with
#   +ignore_broken_line+ and MalformedCSVError is raised.
#
# Rows are passed to +emit_row+ together with a parallel array of flags
# telling whether each field was quoted.
def parse_quotable_robust(&block)
  row = []
  quoted_fields = []
  skip_needless_lines
  start_row
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      validate_field_size(value)
    end
    if parse_column_end
      row << value
      quoted_fields << @quoted_column_value
    elsif parse_row_end
      if row.empty? && value.nil?
        emit_row([], [], &block) unless @skip_blanks
      else
        row << value
        quoted_fields << @quoted_column_value
        emit_row(row, quoted_fields, &block)
        row = []
        quoted_fields = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      break if row.empty? && value.nil?
      row << value
      quoted_fields << @quoted_column_value
      emit_row(row, quoted_fields, &block)
      break
    else
      if @quoted_column_value
        # The message is built before the broken line is consumed, because
        # it peeks at the upcoming line end.
        if liberal_parsing? && (new_line = @scanner.check(@line_end))
          message =
            "Illegal end-of-line sequence outside of a quoted field " +
            "<#{new_line.inspect}>"
        else
          message = "Any value after quoted field isn't allowed"
        end
        ignore_broken_line
        raise MalformedCSVError.new(message, @lineno)
      elsif @unquoted_column_value &&
            (new_line = @scanner.scan(@line_end))
        ignore_broken_line
        message = "Unquoted fields do not allow new line " +
                  "<#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      elsif @scanner.rest.start_with?(@quote_character)
        ignore_broken_line
        message = "Illegal quoting"
        raise MalformedCSVError.new(message, @lineno)
      elsif (new_line = @scanner.scan(@line_end))
        ignore_broken_line
        message = "New line must be <#{@row_separator.inspect}> " +
                  "not <#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      else
        # None of the known failure shapes matched: the input is simply not
        # positioned at a column end, a row end or the end of input.
        ignore_broken_line
        message = "Unexpected data: expected a column separator, " +
                  "a row separator or the end of input"
        raise MalformedCSVError.new(message, @lineno)
      end
    end
  end
end
|
1090
|
+
|
1091
|
+
# Reads the next column value from the scanner and returns it, or +nil+
# when neither a quoted nor an unquoted value could be read.
#
# Without liberal parsing, a quoted and an unquoted value are tried in
# turn; <tt>@may_quoted</tt> only decides which is tried first. With
# liberal parsing, a quoted value may be directly followed by unquoted
# text, in which case both parts are glued back together with the quote
# characters restored.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted_part = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted_part

  @scanner.scan_all(@strip_value) if @strip_value
  trailing_part = parse_unquoted_column_value
  return quoted_part unless trailing_part

  if @double_quote_outside_quote
    trailing_part = trailing_part.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing_part if quoted_part.empty?
  end
  @quote_character + quoted_part + @quote_character + trailing_part
end
|
1120
|
+
|
1121
|
+
# Reads an unquoted column value and returns it, or +nil+ if the scanner is
# not positioned at one. Sets <tt>@unquoted_column_value</tt> on success.
#
# When <tt>@first_column_separators</tt> is set, text that merely starts
# like a column separator but is not a complete one (as judged by scanning
# every pattern in <tt>@column_ends</tt>) is kept as part of the value.
# Backslash-escaped quotes are unescaped and the +rstrip+ pattern is
# removed when those options are active.
def parse_unquoted_column_value
  text = @scanner.scan_all(@unquoted_value)
  return nil unless text

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Peek only: the position is restored whatever the outcome.
      @scanner.keep_start
      at_column_end = @column_ends.all? { |pattern| @scanner.scan(pattern) }
      @scanner.keep_back
      break if at_column_end

      separator_part = @scanner.scan_all(@first_column_separators)
      break if separator_part.nil?
      text << separator_part

      value_part = @scanner.scan_all(@unquoted_value)
      break if value_part.nil?
      text << value_part
    end
  end

  text.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  text.gsub!(@rstrip_value, "") if @rstrip_value
  text
end
|
1148
|
+
|
1149
|
+
# Reads a quoted column value and returns its unescaped content, or +nil+
# if the scanner is not positioned at a quote character. Sets
# <tt>@quoted_column_value</tt> on success.
#
# Runs of quote characters are read as a whole: every pair in a run stands
# for one literal quote. An even opening run is therefore a complete value
# made only of quotes, while an odd run opens a field that continues until
# a run with an odd number of quotes closes it. Raises MalformedCSVError
# when the input ends before the field is closed.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  opening_size = opening.size
  return opening[0, (opening_size - 2) / 2] if opening_size.even?

  content = opening[0, opening_size / 2]
  while true
    chunk = @scanner.scan_all(@quoted_value)
    content << chunk if chunk

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      # A backslash escapes a following quote; otherwise it is kept as is.
      content << (@scanner.scan(@escaped_quote) ? @quote_character : @backslash_character)
      next
    end

    quote_run = @scanner.scan_all(@quotes)
    unless quote_run
      ignore_broken_line
      raise MalformedCSVError.new("Unclosed quoted field", @lineno)
    end

    run_size = quote_run.size
    break if run_size == 1

    content << quote_run[0, run_size / 2]
    break if run_size.odd?
  end
  content
end
|
1190
|
+
|
1191
|
+
# Consumes a column separator at the current position. Returns +true+ if
# one was consumed and +false+ otherwise.
#
# The single <tt>@column_end</tt> pattern is tried first. If it fails and
# <tt>@column_ends</tt> is set, its patterns are scanned one after another;
# the attempt is confirmed with +keep_drop+ when all of them match and
# undone with +keep_back+ otherwise.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched = @column_ends.all? { |pattern| @scanner.scan(pattern) }
  matched ? @scanner.keep_drop : @scanner.keep_back
  matched
end
|
1204
|
+
|
1205
|
+
# Consumes a row separator at the current position. Returns +true+ if one
# was consumed and +false+ otherwise.
#
# The single <tt>@row_end</tt> pattern is tried first. If it fails and
# <tt>@row_ends</tt> is set, its patterns are scanned one after another;
# the attempt is confirmed with +keep_drop+ when all of them match and
# undone with +keep_back+ otherwise.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  matched = @row_ends.all? { |pattern| @scanner.scan(pattern) }
  matched ? @scanner.keep_drop : @scanner.keep_back
  matched
end
|
1217
|
+
|
1218
|
+
# Applies the +strip+ setting to +value+ in place and returns +value+.
#
# Nothing is done when stripping is disabled or +value+ is +nil+. If
# <tt>@strip</tt> is a String, every leading and trailing repetition of it
# is removed; for any other truthy setting <tt>String#strip!</tt> is used.
def strip_value(value)
  return value if !@strip || value.nil?

  if @strip.is_a?(String)
    # The bang methods return nil once there is nothing left to remove.
    loop { break unless value.delete_prefix!(@strip) }
    loop { break unless value.delete_suffix!(@strip) }
  else
    value.strip!
  end
  value
end
|
1235
|
+
|
1236
|
+
# Discards the remainder of the current line, including its line end, and
# counts it as one more line in <tt>@lineno</tt>. Returns the new line
# number.
def ignore_broken_line
  [@not_line_end, @line_end].each { |pattern| @scanner.scan_all(pattern) }
  @lineno += 1
end
|
1241
|
+
|
1242
|
+
# Prepares the scanner for a new row.
#
# If a last line is remembered it is forgotten; otherwise the scanner's
# current keep is dropped. In both cases a new keep is started with
# +keep_start+, whose result is returned.
def start_row
  remembered = @last_line
  @last_line = nil if remembered
  @scanner.keep_drop unless remembered
  @scanner.keep_start
end
|
1250
|
+
|
1251
|
+
# Finalizes one parsed row and yields it to the given block.
#
# The line number is incremented first. Without headers, the fields are run
# through the fields converter. With headers, the first row is used to set
# <tt>@headers</tt> (and is only yielded, as a header Row, when
# +return_headers+ is set); later rows are yielded as Row objects built
# from the converted fields. If unconverted fields were requested and the
# result does not already respond to +unconverted_fields+, the raw fields
# are attached to it.
def emit_row(row, quoted_fields, &block)
  @lineno += 1
  raw_row = row

  if !@use_headers
    # convert fields, if needed...
    emitted = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
  elsif @headers.nil?
    @headers = adjust_headers(row, quoted_fields)
    return unless @return_headers

    emitted = Row.new(@headers, row, true)
  else
    emitted = Row.new(@headers,
                      @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields))
  end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !emitted.respond_to?(:unconverted_fields)
    add_unconverted_fields(emitted, raw_row)
  end

  yield(emitted)
end
|
1276
|
+
|
1277
|
+
# Attaches +fields+ to +row+ as its unconverted fields and returns +row+.
#
# A reader method named unconverted_fields() is defined on +row+ itself
# (not on its class), and the instance variable <tt>@unconverted_fields</tt>
# behind it is set to +fields+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval do
    attr_reader :unconverted_fields
  end
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
|
1287
|
+
end
|
1288
|
+
end
|