csv 3.1.9 → 3.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/NEWS.md +361 -0
- data/README.md +3 -6
- data/doc/csv/options/generating/write_headers.rdoc +1 -1
- data/doc/csv/options/parsing/liberal_parsing.rdoc +21 -2
- data/doc/csv/recipes/filtering.rdoc +85 -17
- data/doc/csv/recipes/generating.rdoc +2 -2
- data/doc/csv/recipes/parsing.rdoc +16 -7
- data/lib/csv/core_ext/array.rb +1 -1
- data/lib/csv/core_ext/string.rb +1 -1
- data/lib/csv/fields_converter.rb +16 -4
- data/lib/csv/input_record_separator.rb +18 -0
- data/lib/csv/parser.rb +263 -113
- data/lib/csv/row.rb +23 -1
- data/lib/csv/table.rb +18 -7
- data/lib/csv/version.rb +1 -1
- data/lib/csv/writer.rb +6 -6
- data/lib/csv.rb +535 -188
- metadata +9 -66
- data/lib/csv/delete_suffix.rb +0 -18
- data/lib/csv/match_p.rb +0 -20
data/lib/csv/parser.rb
CHANGED
|
@@ -2,14 +2,10 @@
|
|
|
2
2
|
|
|
3
3
|
require "strscan"
|
|
4
4
|
|
|
5
|
-
require_relative "
|
|
6
|
-
require_relative "match_p"
|
|
5
|
+
require_relative "input_record_separator"
|
|
7
6
|
require_relative "row"
|
|
8
7
|
require_relative "table"
|
|
9
8
|
|
|
10
|
-
using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
|
|
11
|
-
using CSV::MatchP if CSV.const_defined?(:MatchP)
|
|
12
|
-
|
|
13
9
|
class CSV
|
|
14
10
|
# Note: Don't use this class directly. This is an internal class.
|
|
15
11
|
class Parser
|
|
@@ -26,6 +22,10 @@ class CSV
|
|
|
26
22
|
class InvalidEncoding < StandardError
|
|
27
23
|
end
|
|
28
24
|
|
|
25
|
+
# Raised when an unexpected case happens.
|
|
26
|
+
class UnexpectedError < StandardError
|
|
27
|
+
end
|
|
28
|
+
|
|
29
29
|
#
|
|
30
30
|
# CSV::Scanner receives a CSV output, scans it and returns the content.
|
|
31
31
|
# It also controls the life cycle of the object with its methods +keep_start+,
|
|
@@ -77,16 +77,17 @@ class CSV
|
|
|
77
77
|
# +keep_end+, +keep_back+, +keep_drop+.
|
|
78
78
|
#
|
|
79
79
|
# CSV::InputsScanner.scan() tries to match with pattern at the current position.
|
|
80
|
-
# If there's a match, the scanner advances the
|
|
80
|
+
# If there's a match, the scanner advances the "scan pointer" and returns the matched string.
|
|
81
81
|
# Otherwise, the scanner returns nil.
|
|
82
82
|
#
|
|
83
|
-
# CSV::InputsScanner.rest() returns the
|
|
83
|
+
# CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer).
|
|
84
84
|
# If there is no more data (eos? = true), it returns "".
|
|
85
85
|
#
|
|
86
86
|
class InputsScanner
|
|
87
|
-
def initialize(inputs, encoding, chunk_size: 8192)
|
|
87
|
+
def initialize(inputs, encoding, row_separator, chunk_size: 8192)
|
|
88
88
|
@inputs = inputs.dup
|
|
89
89
|
@encoding = encoding
|
|
90
|
+
@row_separator = row_separator
|
|
90
91
|
@chunk_size = chunk_size
|
|
91
92
|
@last_scanner = @inputs.empty?
|
|
92
93
|
@keeps = []
|
|
@@ -94,11 +95,13 @@ class CSV
|
|
|
94
95
|
end
|
|
95
96
|
|
|
96
97
|
def each_line(row_separator)
|
|
98
|
+
return enum_for(__method__, row_separator) unless block_given?
|
|
97
99
|
buffer = nil
|
|
98
100
|
input = @scanner.rest
|
|
99
101
|
position = @scanner.pos
|
|
100
102
|
offset = 0
|
|
101
103
|
n_row_separator_chars = row_separator.size
|
|
104
|
+
# trace(__method__, :start, input)
|
|
102
105
|
while true
|
|
103
106
|
input.each_line(row_separator) do |line|
|
|
104
107
|
@scanner.pos += line.bytesize
|
|
@@ -138,25 +141,29 @@ class CSV
|
|
|
138
141
|
end
|
|
139
142
|
|
|
140
143
|
def scan(pattern)
|
|
144
|
+
# trace(__method__, pattern, :start)
|
|
141
145
|
value = @scanner.scan(pattern)
|
|
146
|
+
# trace(__method__, pattern, :done, :last, value) if @last_scanner
|
|
142
147
|
return value if @last_scanner
|
|
143
148
|
|
|
144
|
-
if value
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
else
|
|
148
|
-
nil
|
|
149
|
-
end
|
|
149
|
+
read_chunk if value and @scanner.eos?
|
|
150
|
+
# trace(__method__, pattern, :done, value)
|
|
151
|
+
value
|
|
150
152
|
end
|
|
151
153
|
|
|
152
154
|
def scan_all(pattern)
|
|
155
|
+
# trace(__method__, pattern, :start)
|
|
153
156
|
value = @scanner.scan(pattern)
|
|
157
|
+
# trace(__method__, pattern, :done, :last, value) if @last_scanner
|
|
154
158
|
return value if @last_scanner
|
|
155
159
|
|
|
160
|
+
# trace(__method__, pattern, :done, :nil) if value.nil?
|
|
156
161
|
return nil if value.nil?
|
|
157
162
|
while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
|
|
163
|
+
# trace(__method__, pattern, :sub, sub_value)
|
|
158
164
|
value << sub_value
|
|
159
165
|
end
|
|
166
|
+
# trace(__method__, pattern, :done, value)
|
|
160
167
|
value
|
|
161
168
|
end
|
|
162
169
|
|
|
@@ -165,76 +172,145 @@ class CSV
|
|
|
165
172
|
end
|
|
166
173
|
|
|
167
174
|
def keep_start
|
|
168
|
-
|
|
175
|
+
# trace(__method__, :start)
|
|
176
|
+
adjust_last_keep
|
|
177
|
+
@keeps.push([@scanner, @scanner.pos, nil])
|
|
178
|
+
# trace(__method__, :done)
|
|
169
179
|
end
|
|
170
180
|
|
|
171
181
|
def keep_end
|
|
172
|
-
|
|
173
|
-
|
|
182
|
+
# trace(__method__, :start)
|
|
183
|
+
scanner, start, buffer = @keeps.pop
|
|
184
|
+
if scanner == @scanner
|
|
185
|
+
keep = @scanner.string.byteslice(start, @scanner.pos - start)
|
|
186
|
+
else
|
|
187
|
+
keep = @scanner.string.byteslice(0, @scanner.pos)
|
|
188
|
+
end
|
|
174
189
|
if buffer
|
|
175
190
|
buffer << keep
|
|
176
191
|
keep = buffer
|
|
177
192
|
end
|
|
193
|
+
# trace(__method__, :done, keep)
|
|
178
194
|
keep
|
|
179
195
|
end
|
|
180
196
|
|
|
181
197
|
def keep_back
|
|
182
|
-
|
|
198
|
+
# trace(__method__, :start)
|
|
199
|
+
scanner, start, buffer = @keeps.pop
|
|
183
200
|
if buffer
|
|
201
|
+
# trace(__method__, :rescan, start, buffer)
|
|
184
202
|
string = @scanner.string
|
|
185
|
-
|
|
203
|
+
if scanner == @scanner
|
|
204
|
+
keep = string.byteslice(start,
|
|
205
|
+
string.bytesize - @scanner.pos - start)
|
|
206
|
+
else
|
|
207
|
+
keep = string
|
|
208
|
+
end
|
|
186
209
|
if keep and not keep.empty?
|
|
187
210
|
@inputs.unshift(StringIO.new(keep))
|
|
188
211
|
@last_scanner = false
|
|
189
212
|
end
|
|
190
213
|
@scanner = StringScanner.new(buffer)
|
|
191
214
|
else
|
|
215
|
+
if @scanner != scanner
|
|
216
|
+
message = "scanners are different but no buffer: "
|
|
217
|
+
message += "#{@scanner.inspect}(#{@scanner.object_id}): "
|
|
218
|
+
message += "#{scanner.inspect}(#{scanner.object_id})"
|
|
219
|
+
raise UnexpectedError, message
|
|
220
|
+
end
|
|
221
|
+
# trace(__method__, :repos, start, buffer)
|
|
192
222
|
@scanner.pos = start
|
|
223
|
+
last_scanner, last_start, last_buffer = @keeps.last
|
|
224
|
+
# Drop the last buffer when the last buffer is the same data
|
|
225
|
+
# in the last keep. If we keep it, we have duplicated data
|
|
226
|
+
# by the next keep_back.
|
|
227
|
+
if last_scanner == @scanner and
|
|
228
|
+
last_buffer and
|
|
229
|
+
last_buffer == last_scanner.string.byteslice(last_start, start)
|
|
230
|
+
@keeps.last[2] = nil
|
|
231
|
+
end
|
|
193
232
|
end
|
|
194
233
|
read_chunk if @scanner.eos?
|
|
195
234
|
end
|
|
196
235
|
|
|
197
236
|
def keep_drop
|
|
198
|
-
@keeps.pop
|
|
237
|
+
_, _, buffer = @keeps.pop
|
|
238
|
+
# trace(__method__, :done, :empty) unless buffer
|
|
239
|
+
return unless buffer
|
|
240
|
+
|
|
241
|
+
last_keep = @keeps.last
|
|
242
|
+
# trace(__method__, :done, :no_last_keep) unless last_keep
|
|
243
|
+
return unless last_keep
|
|
244
|
+
|
|
245
|
+
if last_keep[2]
|
|
246
|
+
last_keep[2] << buffer
|
|
247
|
+
else
|
|
248
|
+
last_keep[2] = buffer
|
|
249
|
+
end
|
|
250
|
+
# trace(__method__, :done)
|
|
199
251
|
end
|
|
200
252
|
|
|
201
253
|
def rest
|
|
202
254
|
@scanner.rest
|
|
203
255
|
end
|
|
204
256
|
|
|
257
|
+
def check(pattern)
|
|
258
|
+
@scanner.check(pattern)
|
|
259
|
+
end
|
|
260
|
+
|
|
205
261
|
private
|
|
206
|
-
def
|
|
207
|
-
|
|
262
|
+
def trace(*args)
|
|
263
|
+
pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
|
|
264
|
+
end
|
|
208
265
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
266
|
+
def adjust_last_keep
|
|
267
|
+
# trace(__method__, :start)
|
|
268
|
+
|
|
269
|
+
keep = @keeps.last
|
|
270
|
+
# trace(__method__, :done, :empty) if keep.nil?
|
|
271
|
+
return if keep.nil?
|
|
272
|
+
|
|
273
|
+
scanner, start, buffer = keep
|
|
274
|
+
string = @scanner.string
|
|
275
|
+
if @scanner != scanner
|
|
276
|
+
start = 0
|
|
277
|
+
end
|
|
278
|
+
if start == 0 and @scanner.eos?
|
|
279
|
+
keep_data = string
|
|
280
|
+
else
|
|
281
|
+
keep_data = string.byteslice(start, @scanner.pos - start)
|
|
282
|
+
end
|
|
283
|
+
if keep_data
|
|
284
|
+
if buffer
|
|
285
|
+
buffer << keep_data
|
|
286
|
+
else
|
|
287
|
+
keep[2] = keep_data.dup
|
|
221
288
|
end
|
|
222
|
-
keep[0] = 0
|
|
223
289
|
end
|
|
224
290
|
|
|
291
|
+
# trace(__method__, :done)
|
|
292
|
+
end
|
|
293
|
+
|
|
294
|
+
def read_chunk
|
|
295
|
+
return false if @last_scanner
|
|
296
|
+
|
|
297
|
+
adjust_last_keep
|
|
298
|
+
|
|
225
299
|
input = @inputs.first
|
|
226
300
|
case input
|
|
227
301
|
when StringIO
|
|
228
302
|
string = input.read
|
|
229
303
|
raise InvalidEncoding unless string.valid_encoding?
|
|
304
|
+
# trace(__method__, :stringio, string)
|
|
230
305
|
@scanner = StringScanner.new(string)
|
|
231
306
|
@inputs.shift
|
|
232
307
|
@last_scanner = @inputs.empty?
|
|
233
308
|
true
|
|
234
309
|
else
|
|
235
|
-
chunk = input.gets(
|
|
310
|
+
chunk = input.gets(@row_separator, @chunk_size)
|
|
236
311
|
if chunk
|
|
237
312
|
raise InvalidEncoding unless chunk.valid_encoding?
|
|
313
|
+
# trace(__method__, :chunk, chunk)
|
|
238
314
|
@scanner = StringScanner.new(chunk)
|
|
239
315
|
if input.respond_to?(:eof?) and input.eof?
|
|
240
316
|
@inputs.shift
|
|
@@ -242,6 +318,7 @@ class CSV
|
|
|
242
318
|
end
|
|
243
319
|
true
|
|
244
320
|
else
|
|
321
|
+
# trace(__method__, :no_chunk)
|
|
245
322
|
@scanner = StringScanner.new("".encode(@encoding))
|
|
246
323
|
@inputs.shift
|
|
247
324
|
@last_scanner = @inputs.empty?
|
|
@@ -276,7 +353,11 @@ class CSV
|
|
|
276
353
|
end
|
|
277
354
|
|
|
278
355
|
def field_size_limit
|
|
279
|
-
@
|
|
356
|
+
@max_field_size&.succ
|
|
357
|
+
end
|
|
358
|
+
|
|
359
|
+
def max_field_size
|
|
360
|
+
@max_field_size
|
|
280
361
|
end
|
|
281
362
|
|
|
282
363
|
def skip_lines
|
|
@@ -328,21 +409,24 @@ class CSV
|
|
|
328
409
|
|
|
329
410
|
begin
|
|
330
411
|
@scanner ||= build_scanner
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
412
|
+
__send__(@parse_method, &block)
|
|
413
|
+
rescue InvalidEncoding
|
|
414
|
+
if @scanner
|
|
415
|
+
ignore_broken_line
|
|
416
|
+
lineno = @lineno
|
|
335
417
|
else
|
|
336
|
-
|
|
418
|
+
lineno = @lineno + 1
|
|
337
419
|
end
|
|
338
|
-
|
|
420
|
+
raise InvalidEncodingError.new(@encoding, lineno)
|
|
421
|
+
rescue UnexpectedError => error
|
|
339
422
|
if @scanner
|
|
340
423
|
ignore_broken_line
|
|
341
424
|
lineno = @lineno
|
|
342
425
|
else
|
|
343
426
|
lineno = @lineno + 1
|
|
344
427
|
end
|
|
345
|
-
message = "
|
|
428
|
+
message = "This should not be happen: #{error.message}: "
|
|
429
|
+
message += "Please report this to https://github.com/ruby/csv/issues"
|
|
346
430
|
raise MalformedCSVError.new(message, lineno)
|
|
347
431
|
end
|
|
348
432
|
end
|
|
@@ -360,6 +444,7 @@ class CSV
|
|
|
360
444
|
prepare_skip_lines
|
|
361
445
|
prepare_strip
|
|
362
446
|
prepare_separators
|
|
447
|
+
validate_strip_and_col_sep_options
|
|
363
448
|
prepare_quoted
|
|
364
449
|
prepare_unquoted
|
|
365
450
|
prepare_line
|
|
@@ -368,7 +453,6 @@ class CSV
|
|
|
368
453
|
end
|
|
369
454
|
|
|
370
455
|
def prepare_variable
|
|
371
|
-
@need_robust_parsing = false
|
|
372
456
|
@encoding = @options[:encoding]
|
|
373
457
|
liberal_parsing = @options[:liberal_parsing]
|
|
374
458
|
if liberal_parsing
|
|
@@ -381,13 +465,12 @@ class CSV
|
|
|
381
465
|
@double_quote_outside_quote = false
|
|
382
466
|
@backslash_quote = false
|
|
383
467
|
end
|
|
384
|
-
@need_robust_parsing = true
|
|
385
468
|
else
|
|
386
469
|
@liberal_parsing = false
|
|
387
470
|
@backslash_quote = false
|
|
388
471
|
end
|
|
389
472
|
@unconverted_fields = @options[:unconverted_fields]
|
|
390
|
-
@
|
|
473
|
+
@max_field_size = @options[:max_field_size]
|
|
391
474
|
@skip_blanks = @options[:skip_blanks]
|
|
392
475
|
@fields_converter = @options[:fields_converter]
|
|
393
476
|
@header_fields_converter = @options[:header_fields_converter]
|
|
@@ -404,7 +487,6 @@ class CSV
|
|
|
404
487
|
message = ":quote_char has to be nil or a single character String"
|
|
405
488
|
raise ArgumentError, message
|
|
406
489
|
end
|
|
407
|
-
@double_quote_character = @quote_character * 2
|
|
408
490
|
@escaped_quote_character = Regexp.escape(@quote_character)
|
|
409
491
|
@escaped_quote = Regexp.new(@escaped_quote_character)
|
|
410
492
|
end
|
|
@@ -464,7 +546,6 @@ class CSV
|
|
|
464
546
|
@rstrip_value = Regexp.new(@escaped_strip +
|
|
465
547
|
"+\\z".encode(@encoding))
|
|
466
548
|
end
|
|
467
|
-
@need_robust_parsing = true
|
|
468
549
|
elsif @strip
|
|
469
550
|
strip_values = " \t\f\v"
|
|
470
551
|
@escaped_strip = strip_values.encode(@encoding)
|
|
@@ -472,16 +553,15 @@ class CSV
|
|
|
472
553
|
@strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
|
|
473
554
|
@rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
|
|
474
555
|
end
|
|
475
|
-
@need_robust_parsing = true
|
|
476
556
|
end
|
|
477
557
|
end
|
|
478
558
|
|
|
479
559
|
begin
|
|
480
560
|
StringScanner.new("x").scan("x")
|
|
481
561
|
rescue TypeError
|
|
482
|
-
|
|
562
|
+
STRING_SCANNER_SCAN_ACCEPT_STRING = false
|
|
483
563
|
else
|
|
484
|
-
|
|
564
|
+
STRING_SCANNER_SCAN_ACCEPT_STRING = true
|
|
485
565
|
end
|
|
486
566
|
|
|
487
567
|
def prepare_separators
|
|
@@ -505,7 +585,7 @@ class CSV
|
|
|
505
585
|
@first_column_separators = Regexp.new(@escaped_first_column_separator +
|
|
506
586
|
"+".encode(@encoding))
|
|
507
587
|
else
|
|
508
|
-
if
|
|
588
|
+
if STRING_SCANNER_SCAN_ACCEPT_STRING
|
|
509
589
|
@column_end = @column_separator
|
|
510
590
|
else
|
|
511
591
|
@column_end = Regexp.new(@escaped_column_separator)
|
|
@@ -526,10 +606,32 @@ class CSV
|
|
|
526
606
|
|
|
527
607
|
@cr = "\r".encode(@encoding)
|
|
528
608
|
@lf = "\n".encode(@encoding)
|
|
529
|
-
@
|
|
609
|
+
@line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
|
|
530
610
|
@not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
|
|
531
611
|
end
|
|
532
612
|
|
|
613
|
+
# This method verifies that there are no (obvious) ambiguities with the
|
|
614
|
+
# provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
|
|
615
|
+
# and +strip+ were both equal to +\t+, then there would be no clear way to
|
|
616
|
+
# parse the input.
|
|
617
|
+
def validate_strip_and_col_sep_options
|
|
618
|
+
return unless @strip
|
|
619
|
+
|
|
620
|
+
if @strip.is_a?(String)
|
|
621
|
+
if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
|
|
622
|
+
raise ArgumentError,
|
|
623
|
+
"The provided strip (#{@escaped_strip}) and " \
|
|
624
|
+
"col_sep (#{@escaped_column_separator}) options are incompatible."
|
|
625
|
+
end
|
|
626
|
+
else
|
|
627
|
+
if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
|
|
628
|
+
raise ArgumentError,
|
|
629
|
+
"The provided strip (true) and " \
|
|
630
|
+
"col_sep (#{@escaped_column_separator}) options are incompatible."
|
|
631
|
+
end
|
|
632
|
+
end
|
|
633
|
+
end
|
|
634
|
+
|
|
533
635
|
def prepare_quoted
|
|
534
636
|
if @quote_character
|
|
535
637
|
@quotes = Regexp.new(@escaped_quote_character +
|
|
@@ -605,7 +707,7 @@ class CSV
|
|
|
605
707
|
# do nothing: ensure will set default
|
|
606
708
|
end
|
|
607
709
|
end
|
|
608
|
-
separator =
|
|
710
|
+
separator = InputRecordSeparator.value if separator == :auto
|
|
609
711
|
end
|
|
610
712
|
separator.to_s.encode(@encoding)
|
|
611
713
|
end
|
|
@@ -655,9 +757,10 @@ class CSV
|
|
|
655
757
|
case headers
|
|
656
758
|
when Array
|
|
657
759
|
@raw_headers = headers
|
|
760
|
+
quoted_fields = FieldsConverter::NO_QUOTED_FIELDS
|
|
658
761
|
@use_headers = true
|
|
659
762
|
when String
|
|
660
|
-
@raw_headers = parse_headers(headers)
|
|
763
|
+
@raw_headers, quoted_fields = parse_headers(headers)
|
|
661
764
|
@use_headers = true
|
|
662
765
|
when nil, false
|
|
663
766
|
@raw_headers = nil
|
|
@@ -667,27 +770,41 @@ class CSV
|
|
|
667
770
|
@use_headers = true
|
|
668
771
|
end
|
|
669
772
|
if @raw_headers
|
|
670
|
-
@headers = adjust_headers(@raw_headers)
|
|
773
|
+
@headers = adjust_headers(@raw_headers, quoted_fields)
|
|
671
774
|
else
|
|
672
775
|
@headers = nil
|
|
673
776
|
end
|
|
674
777
|
end
|
|
675
778
|
|
|
676
779
|
def parse_headers(row)
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
780
|
+
quoted_fields = []
|
|
781
|
+
converter = lambda do |field, info|
|
|
782
|
+
quoted_fields << info.quoted?
|
|
783
|
+
field
|
|
784
|
+
end
|
|
785
|
+
headers = CSV.parse_line(row,
|
|
786
|
+
col_sep: @column_separator,
|
|
787
|
+
row_sep: @row_separator,
|
|
788
|
+
quote_char: @quote_character,
|
|
789
|
+
converters: [converter])
|
|
790
|
+
[headers, quoted_fields]
|
|
681
791
|
end
|
|
682
792
|
|
|
683
|
-
def adjust_headers(headers)
|
|
684
|
-
adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno)
|
|
793
|
+
def adjust_headers(headers, quoted_fields)
|
|
794
|
+
adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields)
|
|
685
795
|
adjusted_headers.each {|h| h.freeze if h.is_a? String}
|
|
686
796
|
adjusted_headers
|
|
687
797
|
end
|
|
688
798
|
|
|
689
799
|
def prepare_parser
|
|
690
800
|
@may_quoted = may_quoted?
|
|
801
|
+
if @quote_character.nil?
|
|
802
|
+
@parse_method = :parse_no_quote
|
|
803
|
+
elsif @liberal_parsing or @strip
|
|
804
|
+
@parse_method = :parse_quotable_robust
|
|
805
|
+
else
|
|
806
|
+
@parse_method = :parse_quotable_loose
|
|
807
|
+
end
|
|
691
808
|
end
|
|
692
809
|
|
|
693
810
|
def may_quoted?
|
|
@@ -704,26 +821,28 @@ class CSV
|
|
|
704
821
|
sample[0, 128].index(@quote_character)
|
|
705
822
|
end
|
|
706
823
|
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
@io = StringIO.new(string, "rb:#{string.encoding}")
|
|
712
|
-
end
|
|
824
|
+
class UnoptimizedStringIO # :nodoc:
|
|
825
|
+
def initialize(string)
|
|
826
|
+
@io = StringIO.new(string, "rb:#{string.encoding}")
|
|
827
|
+
end
|
|
713
828
|
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
829
|
+
def gets(*args)
|
|
830
|
+
@io.gets(*args)
|
|
831
|
+
end
|
|
717
832
|
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
833
|
+
def each_line(*args, &block)
|
|
834
|
+
@io.each_line(*args, &block)
|
|
835
|
+
end
|
|
721
836
|
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
end
|
|
837
|
+
def eof?
|
|
838
|
+
@io.eof?
|
|
725
839
|
end
|
|
840
|
+
end
|
|
726
841
|
|
|
842
|
+
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
|
|
843
|
+
if SCANNER_TEST
|
|
844
|
+
SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
|
|
845
|
+
SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
|
|
727
846
|
def build_scanner
|
|
728
847
|
inputs = @samples.collect do |sample|
|
|
729
848
|
UnoptimizedStringIO.new(sample)
|
|
@@ -733,17 +852,27 @@ class CSV
|
|
|
733
852
|
else
|
|
734
853
|
inputs << @input
|
|
735
854
|
end
|
|
736
|
-
|
|
855
|
+
begin
|
|
856
|
+
chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
|
|
857
|
+
rescue # Ractor::IsolationError
|
|
858
|
+
# Ractor on Ruby 3.0 can't read ENV value.
|
|
859
|
+
chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE
|
|
860
|
+
end
|
|
861
|
+
chunk_size = Integer((chunk_size_value || "1"), 10)
|
|
737
862
|
InputsScanner.new(inputs,
|
|
738
863
|
@encoding,
|
|
739
|
-
|
|
864
|
+
@row_separator,
|
|
865
|
+
chunk_size: chunk_size)
|
|
740
866
|
end
|
|
741
867
|
else
|
|
742
868
|
def build_scanner
|
|
743
869
|
string = nil
|
|
744
870
|
if @samples.empty? and @input.is_a?(StringIO)
|
|
745
871
|
string = @input.read
|
|
746
|
-
elsif @samples.size == 1 and
|
|
872
|
+
elsif @samples.size == 1 and
|
|
873
|
+
@input != ARGF and
|
|
874
|
+
@input.respond_to?(:eof?) and
|
|
875
|
+
@input.eof?
|
|
747
876
|
string = @samples[0]
|
|
748
877
|
end
|
|
749
878
|
if string
|
|
@@ -752,8 +881,7 @@ class CSV
|
|
|
752
881
|
!line.valid_encoding?
|
|
753
882
|
end
|
|
754
883
|
if index
|
|
755
|
-
|
|
756
|
-
raise MalformedCSVError.new(message, @lineno + index + 1)
|
|
884
|
+
raise InvalidEncodingError.new(@encoding, @lineno + index + 1)
|
|
757
885
|
end
|
|
758
886
|
end
|
|
759
887
|
Scanner.new(string)
|
|
@@ -762,7 +890,7 @@ class CSV
|
|
|
762
890
|
StringIO.new(sample)
|
|
763
891
|
end
|
|
764
892
|
inputs << @input
|
|
765
|
-
InputsScanner.new(inputs, @encoding)
|
|
893
|
+
InputsScanner.new(inputs, @encoding, @row_separator)
|
|
766
894
|
end
|
|
767
895
|
end
|
|
768
896
|
end
|
|
@@ -796,6 +924,14 @@ class CSV
|
|
|
796
924
|
end
|
|
797
925
|
end
|
|
798
926
|
|
|
927
|
+
def validate_field_size(field)
|
|
928
|
+
return unless @max_field_size
|
|
929
|
+
return if field.size <= @max_field_size
|
|
930
|
+
ignore_broken_line
|
|
931
|
+
message = "Field size exceeded: #{field.size} > #{@max_field_size}"
|
|
932
|
+
raise MalformedCSVError.new(message, @lineno)
|
|
933
|
+
end
|
|
934
|
+
|
|
799
935
|
def parse_no_quote(&block)
|
|
800
936
|
@scanner.each_line(@row_separator) do |line|
|
|
801
937
|
next if @skip_lines and skip_line?(line)
|
|
@@ -808,6 +944,11 @@ class CSV
|
|
|
808
944
|
else
|
|
809
945
|
line = strip_value(line)
|
|
810
946
|
row = line.split(@split_column_separator, -1)
|
|
947
|
+
if @max_field_size
|
|
948
|
+
row.each do |column|
|
|
949
|
+
validate_field_size(column)
|
|
950
|
+
end
|
|
951
|
+
end
|
|
811
952
|
n_columns = row.size
|
|
812
953
|
i = 0
|
|
813
954
|
while i < n_columns
|
|
@@ -838,31 +979,37 @@ class CSV
|
|
|
838
979
|
next
|
|
839
980
|
end
|
|
840
981
|
row = []
|
|
982
|
+
quoted_fields = FieldsConverter::NO_QUOTED_FIELDS
|
|
841
983
|
elsif line.include?(@cr) or line.include?(@lf)
|
|
842
984
|
@scanner.keep_back
|
|
843
|
-
@
|
|
985
|
+
@parse_method = :parse_quotable_robust
|
|
844
986
|
return parse_quotable_robust(&block)
|
|
845
987
|
else
|
|
846
988
|
row = line.split(@split_column_separator, -1)
|
|
989
|
+
quoted_fields = []
|
|
847
990
|
n_columns = row.size
|
|
848
991
|
i = 0
|
|
849
992
|
while i < n_columns
|
|
850
993
|
column = row[i]
|
|
851
994
|
if column.empty?
|
|
995
|
+
quoted_fields << false
|
|
852
996
|
row[i] = nil
|
|
853
997
|
else
|
|
854
998
|
n_quotes = column.count(@quote_character)
|
|
855
999
|
if n_quotes.zero?
|
|
1000
|
+
quoted_fields << false
|
|
856
1001
|
# no quote
|
|
857
1002
|
elsif n_quotes == 2 and
|
|
858
1003
|
column.start_with?(@quote_character) and
|
|
859
1004
|
column.end_with?(@quote_character)
|
|
1005
|
+
quoted_fields << true
|
|
860
1006
|
row[i] = column[1..-2]
|
|
861
1007
|
else
|
|
862
1008
|
@scanner.keep_back
|
|
863
|
-
@
|
|
1009
|
+
@parse_method = :parse_quotable_robust
|
|
864
1010
|
return parse_quotable_robust(&block)
|
|
865
1011
|
end
|
|
1012
|
+
validate_field_size(row[i])
|
|
866
1013
|
end
|
|
867
1014
|
i += 1
|
|
868
1015
|
end
|
|
@@ -870,13 +1017,14 @@ class CSV
|
|
|
870
1017
|
@scanner.keep_drop
|
|
871
1018
|
@scanner.keep_start
|
|
872
1019
|
@last_line = original_line
|
|
873
|
-
emit_row(row, &block)
|
|
1020
|
+
emit_row(row, quoted_fields, &block)
|
|
874
1021
|
end
|
|
875
1022
|
@scanner.keep_drop
|
|
876
1023
|
end
|
|
877
1024
|
|
|
878
1025
|
def parse_quotable_robust(&block)
|
|
879
1026
|
row = []
|
|
1027
|
+
quoted_fields = []
|
|
880
1028
|
skip_needless_lines
|
|
881
1029
|
start_row
|
|
882
1030
|
while true
|
|
@@ -886,35 +1034,42 @@ class CSV
|
|
|
886
1034
|
value = parse_column_value
|
|
887
1035
|
if value
|
|
888
1036
|
@scanner.scan_all(@strip_value) if @strip_value
|
|
889
|
-
|
|
890
|
-
ignore_broken_line
|
|
891
|
-
raise MalformedCSVError.new("Field size exceeded", @lineno)
|
|
892
|
-
end
|
|
1037
|
+
validate_field_size(value)
|
|
893
1038
|
end
|
|
894
1039
|
if parse_column_end
|
|
895
1040
|
row << value
|
|
1041
|
+
quoted_fields << @quoted_column_value
|
|
896
1042
|
elsif parse_row_end
|
|
897
1043
|
if row.empty? and value.nil?
|
|
898
1044
|
emit_row([], &block) unless @skip_blanks
|
|
899
1045
|
else
|
|
900
1046
|
row << value
|
|
901
|
-
|
|
1047
|
+
quoted_fields << @quoted_column_value
|
|
1048
|
+
emit_row(row, quoted_fields, &block)
|
|
902
1049
|
row = []
|
|
1050
|
+
quoted_fields.clear
|
|
903
1051
|
end
|
|
904
1052
|
skip_needless_lines
|
|
905
1053
|
start_row
|
|
906
1054
|
elsif @scanner.eos?
|
|
907
1055
|
break if row.empty? and value.nil?
|
|
908
1056
|
row << value
|
|
909
|
-
|
|
1057
|
+
quoted_fields << @quoted_column_value
|
|
1058
|
+
emit_row(row, quoted_fields, &block)
|
|
910
1059
|
break
|
|
911
1060
|
else
|
|
912
1061
|
if @quoted_column_value
|
|
1062
|
+
if liberal_parsing? and (new_line = @scanner.check(@line_end))
|
|
1063
|
+
message =
|
|
1064
|
+
"Illegal end-of-line sequence outside of a quoted field " +
|
|
1065
|
+
"<#{new_line.inspect}>"
|
|
1066
|
+
else
|
|
1067
|
+
message = "Any value after quoted field isn't allowed"
|
|
1068
|
+
end
|
|
913
1069
|
ignore_broken_line
|
|
914
|
-
message = "Any value after quoted field isn't allowed"
|
|
915
1070
|
raise MalformedCSVError.new(message, @lineno)
|
|
916
1071
|
elsif @unquoted_column_value and
|
|
917
|
-
(new_line = @scanner.scan(@
|
|
1072
|
+
(new_line = @scanner.scan(@line_end))
|
|
918
1073
|
ignore_broken_line
|
|
919
1074
|
message = "Unquoted fields do not allow new line " +
|
|
920
1075
|
"<#{new_line.inspect}>"
|
|
@@ -923,7 +1078,7 @@ class CSV
|
|
|
923
1078
|
ignore_broken_line
|
|
924
1079
|
message = "Illegal quoting"
|
|
925
1080
|
raise MalformedCSVError.new(message, @lineno)
|
|
926
|
-
elsif (new_line = @scanner.scan(@
|
|
1081
|
+
elsif (new_line = @scanner.scan(@line_end))
|
|
927
1082
|
ignore_broken_line
|
|
928
1083
|
message = "New line must be <#{@row_separator.inspect}> " +
|
|
929
1084
|
"not <#{new_line.inspect}>"
|
|
@@ -1004,7 +1159,7 @@ class CSV
|
|
|
1004
1159
|
if (n_quotes % 2).zero?
|
|
1005
1160
|
quotes[0, (n_quotes - 2) / 2]
|
|
1006
1161
|
else
|
|
1007
|
-
value = quotes[0,
|
|
1162
|
+
value = quotes[0, n_quotes / 2]
|
|
1008
1163
|
while true
|
|
1009
1164
|
quoted_value = @scanner.scan_all(@quoted_value)
|
|
1010
1165
|
value << quoted_value if quoted_value
|
|
@@ -1028,11 +1183,9 @@ class CSV
|
|
|
1028
1183
|
n_quotes = quotes.size
|
|
1029
1184
|
if n_quotes == 1
|
|
1030
1185
|
break
|
|
1031
|
-
elsif (n_quotes % 2) == 1
|
|
1032
|
-
value << quotes[0, (n_quotes - 1) / 2]
|
|
1033
|
-
break
|
|
1034
1186
|
else
|
|
1035
1187
|
value << quotes[0, n_quotes / 2]
|
|
1188
|
+
break if (n_quotes % 2) == 1
|
|
1036
1189
|
end
|
|
1037
1190
|
end
|
|
1038
1191
|
value
|
|
@@ -1068,18 +1221,15 @@ class CSV
|
|
|
1068
1221
|
|
|
1069
1222
|
def strip_value(value)
|
|
1070
1223
|
return value unless @strip
|
|
1071
|
-
return
|
|
1224
|
+
return value if value.nil?
|
|
1072
1225
|
|
|
1073
1226
|
case @strip
|
|
1074
1227
|
when String
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
size -= 1
|
|
1078
|
-
value = value[1, size]
|
|
1228
|
+
while value.delete_prefix!(@strip)
|
|
1229
|
+
# do nothing
|
|
1079
1230
|
end
|
|
1080
|
-
while value.
|
|
1081
|
-
|
|
1082
|
-
value = value[0, size]
|
|
1231
|
+
while value.delete_suffix!(@strip)
|
|
1232
|
+
# do nothing
|
|
1083
1233
|
end
|
|
1084
1234
|
else
|
|
1085
1235
|
value.strip!
|
|
@@ -1089,7 +1239,7 @@ class CSV
|
|
|
1089
1239
|
|
|
1090
1240
|
def ignore_broken_line
|
|
1091
1241
|
@scanner.scan_all(@not_line_end)
|
|
1092
|
-
@scanner.scan_all(@
|
|
1242
|
+
@scanner.scan_all(@line_end)
|
|
1093
1243
|
@lineno += 1
|
|
1094
1244
|
end
|
|
1095
1245
|
|
|
@@ -1102,22 +1252,22 @@ class CSV
|
|
|
1102
1252
|
@scanner.keep_start
|
|
1103
1253
|
end
|
|
1104
1254
|
|
|
1105
|
-
def emit_row(row, &block)
|
|
1255
|
+
def emit_row(row, quoted_fields=FieldsConverter::NO_QUOTED_FIELDS, &block)
|
|
1106
1256
|
@lineno += 1
|
|
1107
1257
|
|
|
1108
1258
|
raw_row = row
|
|
1109
1259
|
if @use_headers
|
|
1110
1260
|
if @headers.nil?
|
|
1111
|
-
@headers = adjust_headers(row)
|
|
1261
|
+
@headers = adjust_headers(row, quoted_fields)
|
|
1112
1262
|
return unless @return_headers
|
|
1113
1263
|
row = Row.new(@headers, row, true)
|
|
1114
1264
|
else
|
|
1115
1265
|
row = Row.new(@headers,
|
|
1116
|
-
@fields_converter.convert(raw_row, @headers, @lineno))
|
|
1266
|
+
@fields_converter.convert(raw_row, @headers, @lineno, quoted_fields))
|
|
1117
1267
|
end
|
|
1118
1268
|
else
|
|
1119
1269
|
# convert fields, if needed...
|
|
1120
|
-
row = @fields_converter.convert(raw_row, nil, @lineno)
|
|
1270
|
+
row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
|
|
1121
1271
|
end
|
|
1122
1272
|
|
|
1123
1273
|
# inject unconverted fields and accessor, if requested...
|