csv 3.1.9 → 3.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/csv/parser.rb CHANGED
@@ -2,14 +2,10 @@
2
2
 
3
3
  require "strscan"
4
4
 
5
- require_relative "delete_suffix"
6
- require_relative "match_p"
5
+ require_relative "input_record_separator"
7
6
  require_relative "row"
8
7
  require_relative "table"
9
8
 
10
- using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
11
- using CSV::MatchP if CSV.const_defined?(:MatchP)
12
-
13
9
  class CSV
14
10
  # Note: Don't use this class directly. This is an internal class.
15
11
  class Parser
@@ -26,6 +22,10 @@ class CSV
26
22
  class InvalidEncoding < StandardError
27
23
  end
28
24
 
25
+ # Raised when an unexpected case happens.
26
+ class UnexpectedError < StandardError
27
+ end
28
+
29
29
  #
30
30
  # CSV::Scanner receives a CSV output, scans it and returns the content.
31
31
  # It also controls the life cycle of the object with its methods +keep_start+,
@@ -77,16 +77,17 @@ class CSV
77
77
  # +keep_end+, +keep_back+, +keep_drop+.
78
78
  #
79
79
  # CSV::InputsScanner.scan() tries to match with pattern at the current position.
80
- # If there's a match, the scanner advances the scan pointer and returns the matched string.
80
+ # If there's a match, the scanner advances the "scan pointer" and returns the matched string.
81
81
  # Otherwise, the scanner returns nil.
82
82
  #
83
- # CSV::InputsScanner.rest() returns the rest of the string (i.e. everything after the scan pointer).
83
+ # CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer).
84
84
  # If there is no more data (eos? = true), it returns "".
85
85
  #
86
86
  class InputsScanner
87
- def initialize(inputs, encoding, chunk_size: 8192)
87
+ def initialize(inputs, encoding, row_separator, chunk_size: 8192)
88
88
  @inputs = inputs.dup
89
89
  @encoding = encoding
90
+ @row_separator = row_separator
90
91
  @chunk_size = chunk_size
91
92
  @last_scanner = @inputs.empty?
92
93
  @keeps = []
@@ -94,11 +95,13 @@ class CSV
94
95
  end
95
96
 
96
97
  def each_line(row_separator)
98
+ return enum_for(__method__, row_separator) unless block_given?
97
99
  buffer = nil
98
100
  input = @scanner.rest
99
101
  position = @scanner.pos
100
102
  offset = 0
101
103
  n_row_separator_chars = row_separator.size
104
+ # trace(__method__, :start, input)
102
105
  while true
103
106
  input.each_line(row_separator) do |line|
104
107
  @scanner.pos += line.bytesize
@@ -138,25 +141,29 @@ class CSV
138
141
  end
139
142
 
140
143
  def scan(pattern)
144
+ # trace(__method__, pattern, :start)
141
145
  value = @scanner.scan(pattern)
146
+ # trace(__method__, pattern, :done, :last, value) if @last_scanner
142
147
  return value if @last_scanner
143
148
 
144
- if value
145
- read_chunk if @scanner.eos?
146
- return value
147
- else
148
- nil
149
- end
149
+ read_chunk if value and @scanner.eos?
150
+ # trace(__method__, pattern, :done, value)
151
+ value
150
152
  end
151
153
 
152
154
  def scan_all(pattern)
155
+ # trace(__method__, pattern, :start)
153
156
  value = @scanner.scan(pattern)
157
+ # trace(__method__, pattern, :done, :last, value) if @last_scanner
154
158
  return value if @last_scanner
155
159
 
160
+ # trace(__method__, pattern, :done, :nil) if value.nil?
156
161
  return nil if value.nil?
157
162
  while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
163
+ # trace(__method__, pattern, :sub, sub_value)
158
164
  value << sub_value
159
165
  end
166
+ # trace(__method__, pattern, :done, value)
160
167
  value
161
168
  end
162
169
 
@@ -165,76 +172,145 @@ class CSV
165
172
  end
166
173
 
167
174
  def keep_start
168
- @keeps.push([@scanner.pos, nil])
175
+ # trace(__method__, :start)
176
+ adjust_last_keep
177
+ @keeps.push([@scanner, @scanner.pos, nil])
178
+ # trace(__method__, :done)
169
179
  end
170
180
 
171
181
  def keep_end
172
- start, buffer = @keeps.pop
173
- keep = @scanner.string.byteslice(start, @scanner.pos - start)
182
+ # trace(__method__, :start)
183
+ scanner, start, buffer = @keeps.pop
184
+ if scanner == @scanner
185
+ keep = @scanner.string.byteslice(start, @scanner.pos - start)
186
+ else
187
+ keep = @scanner.string.byteslice(0, @scanner.pos)
188
+ end
174
189
  if buffer
175
190
  buffer << keep
176
191
  keep = buffer
177
192
  end
193
+ # trace(__method__, :done, keep)
178
194
  keep
179
195
  end
180
196
 
181
197
  def keep_back
182
- start, buffer = @keeps.pop
198
+ # trace(__method__, :start)
199
+ scanner, start, buffer = @keeps.pop
183
200
  if buffer
201
+ # trace(__method__, :rescan, start, buffer)
184
202
  string = @scanner.string
185
- keep = string.byteslice(start, string.bytesize - start)
203
+ if scanner == @scanner
204
+ keep = string.byteslice(start,
205
+ string.bytesize - @scanner.pos - start)
206
+ else
207
+ keep = string
208
+ end
186
209
  if keep and not keep.empty?
187
210
  @inputs.unshift(StringIO.new(keep))
188
211
  @last_scanner = false
189
212
  end
190
213
  @scanner = StringScanner.new(buffer)
191
214
  else
215
+ if @scanner != scanner
216
+ message = "scanners are different but no buffer: "
217
+ message += "#{@scanner.inspect}(#{@scanner.object_id}): "
218
+ message += "#{scanner.inspect}(#{scanner.object_id})"
219
+ raise UnexpectedError, message
220
+ end
221
+ # trace(__method__, :repos, start, buffer)
192
222
  @scanner.pos = start
223
+ last_scanner, last_start, last_buffer = @keeps.last
224
+ # Drop the last buffer when the last buffer is the same data
225
+ # in the last keep. If we keep it, we have duplicated data
226
+ # by the next keep_back.
227
+ if last_scanner == @scanner and
228
+ last_buffer and
229
+ last_buffer == last_scanner.string.byteslice(last_start, start)
230
+ @keeps.last[2] = nil
231
+ end
193
232
  end
194
233
  read_chunk if @scanner.eos?
195
234
  end
196
235
 
197
236
  def keep_drop
198
- @keeps.pop
237
+ _, _, buffer = @keeps.pop
238
+ # trace(__method__, :done, :empty) unless buffer
239
+ return unless buffer
240
+
241
+ last_keep = @keeps.last
242
+ # trace(__method__, :done, :no_last_keep) unless last_keep
243
+ return unless last_keep
244
+
245
+ if last_keep[2]
246
+ last_keep[2] << buffer
247
+ else
248
+ last_keep[2] = buffer
249
+ end
250
+ # trace(__method__, :done)
199
251
  end
200
252
 
201
253
  def rest
202
254
  @scanner.rest
203
255
  end
204
256
 
257
+ def check(pattern)
258
+ @scanner.check(pattern)
259
+ end
260
+
205
261
  private
206
- def read_chunk
207
- return false if @last_scanner
262
+ def trace(*args)
263
+ pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
264
+ end
208
265
 
209
- unless @keeps.empty?
210
- keep = @keeps.last
211
- keep_start = keep[0]
212
- string = @scanner.string
213
- keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
214
- if keep_data
215
- keep_buffer = keep[1]
216
- if keep_buffer
217
- keep_buffer << keep_data
218
- else
219
- keep[1] = keep_data.dup
220
- end
266
+ def adjust_last_keep
267
+ # trace(__method__, :start)
268
+
269
+ keep = @keeps.last
270
+ # trace(__method__, :done, :empty) if keep.nil?
271
+ return if keep.nil?
272
+
273
+ scanner, start, buffer = keep
274
+ string = @scanner.string
275
+ if @scanner != scanner
276
+ start = 0
277
+ end
278
+ if start == 0 and @scanner.eos?
279
+ keep_data = string
280
+ else
281
+ keep_data = string.byteslice(start, @scanner.pos - start)
282
+ end
283
+ if keep_data
284
+ if buffer
285
+ buffer << keep_data
286
+ else
287
+ keep[2] = keep_data.dup
221
288
  end
222
- keep[0] = 0
223
289
  end
224
290
 
291
+ # trace(__method__, :done)
292
+ end
293
+
294
+ def read_chunk
295
+ return false if @last_scanner
296
+
297
+ adjust_last_keep
298
+
225
299
  input = @inputs.first
226
300
  case input
227
301
  when StringIO
228
302
  string = input.read
229
303
  raise InvalidEncoding unless string.valid_encoding?
304
+ # trace(__method__, :stringio, string)
230
305
  @scanner = StringScanner.new(string)
231
306
  @inputs.shift
232
307
  @last_scanner = @inputs.empty?
233
308
  true
234
309
  else
235
- chunk = input.gets(nil, @chunk_size)
310
+ chunk = input.gets(@row_separator, @chunk_size)
236
311
  if chunk
237
312
  raise InvalidEncoding unless chunk.valid_encoding?
313
+ # trace(__method__, :chunk, chunk)
238
314
  @scanner = StringScanner.new(chunk)
239
315
  if input.respond_to?(:eof?) and input.eof?
240
316
  @inputs.shift
@@ -242,6 +318,7 @@ class CSV
242
318
  end
243
319
  true
244
320
  else
321
+ # trace(__method__, :no_chunk)
245
322
  @scanner = StringScanner.new("".encode(@encoding))
246
323
  @inputs.shift
247
324
  @last_scanner = @inputs.empty?
@@ -276,7 +353,11 @@ class CSV
276
353
  end
277
354
 
278
355
  def field_size_limit
279
- @field_size_limit
356
+ @max_field_size&.succ
357
+ end
358
+
359
+ def max_field_size
360
+ @max_field_size
280
361
  end
281
362
 
282
363
  def skip_lines
@@ -328,21 +409,24 @@ class CSV
328
409
 
329
410
  begin
330
411
  @scanner ||= build_scanner
331
- if quote_character.nil?
332
- parse_no_quote(&block)
333
- elsif @need_robust_parsing
334
- parse_quotable_robust(&block)
412
+ __send__(@parse_method, &block)
413
+ rescue InvalidEncoding
414
+ if @scanner
415
+ ignore_broken_line
416
+ lineno = @lineno
335
417
  else
336
- parse_quotable_loose(&block)
418
+ lineno = @lineno + 1
337
419
  end
338
- rescue InvalidEncoding
420
+ raise InvalidEncodingError.new(@encoding, lineno)
421
+ rescue UnexpectedError => error
339
422
  if @scanner
340
423
  ignore_broken_line
341
424
  lineno = @lineno
342
425
  else
343
426
  lineno = @lineno + 1
344
427
  end
345
- message = "Invalid byte sequence in #{@encoding}"
428
+ message = "This should not be happen: #{error.message}: "
429
+ message += "Please report this to https://github.com/ruby/csv/issues"
346
430
  raise MalformedCSVError.new(message, lineno)
347
431
  end
348
432
  end
@@ -360,6 +444,7 @@ class CSV
360
444
  prepare_skip_lines
361
445
  prepare_strip
362
446
  prepare_separators
447
+ validate_strip_and_col_sep_options
363
448
  prepare_quoted
364
449
  prepare_unquoted
365
450
  prepare_line
@@ -368,7 +453,6 @@ class CSV
368
453
  end
369
454
 
370
455
  def prepare_variable
371
- @need_robust_parsing = false
372
456
  @encoding = @options[:encoding]
373
457
  liberal_parsing = @options[:liberal_parsing]
374
458
  if liberal_parsing
@@ -381,13 +465,12 @@ class CSV
381
465
  @double_quote_outside_quote = false
382
466
  @backslash_quote = false
383
467
  end
384
- @need_robust_parsing = true
385
468
  else
386
469
  @liberal_parsing = false
387
470
  @backslash_quote = false
388
471
  end
389
472
  @unconverted_fields = @options[:unconverted_fields]
390
- @field_size_limit = @options[:field_size_limit]
473
+ @max_field_size = @options[:max_field_size]
391
474
  @skip_blanks = @options[:skip_blanks]
392
475
  @fields_converter = @options[:fields_converter]
393
476
  @header_fields_converter = @options[:header_fields_converter]
@@ -404,7 +487,6 @@ class CSV
404
487
  message = ":quote_char has to be nil or a single character String"
405
488
  raise ArgumentError, message
406
489
  end
407
- @double_quote_character = @quote_character * 2
408
490
  @escaped_quote_character = Regexp.escape(@quote_character)
409
491
  @escaped_quote = Regexp.new(@escaped_quote_character)
410
492
  end
@@ -464,7 +546,6 @@ class CSV
464
546
  @rstrip_value = Regexp.new(@escaped_strip +
465
547
  "+\\z".encode(@encoding))
466
548
  end
467
- @need_robust_parsing = true
468
549
  elsif @strip
469
550
  strip_values = " \t\f\v"
470
551
  @escaped_strip = strip_values.encode(@encoding)
@@ -472,16 +553,15 @@ class CSV
472
553
  @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
473
554
  @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
474
555
  end
475
- @need_robust_parsing = true
476
556
  end
477
557
  end
478
558
 
479
559
  begin
480
560
  StringScanner.new("x").scan("x")
481
561
  rescue TypeError
482
- @@string_scanner_scan_accept_string = false
562
+ STRING_SCANNER_SCAN_ACCEPT_STRING = false
483
563
  else
484
- @@string_scanner_scan_accept_string = true
564
+ STRING_SCANNER_SCAN_ACCEPT_STRING = true
485
565
  end
486
566
 
487
567
  def prepare_separators
@@ -505,7 +585,7 @@ class CSV
505
585
  @first_column_separators = Regexp.new(@escaped_first_column_separator +
506
586
  "+".encode(@encoding))
507
587
  else
508
- if @@string_scanner_scan_accept_string
588
+ if STRING_SCANNER_SCAN_ACCEPT_STRING
509
589
  @column_end = @column_separator
510
590
  else
511
591
  @column_end = Regexp.new(@escaped_column_separator)
@@ -526,10 +606,32 @@ class CSV
526
606
 
527
607
  @cr = "\r".encode(@encoding)
528
608
  @lf = "\n".encode(@encoding)
529
- @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
609
+ @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
530
610
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
531
611
  end
532
612
 
613
+ # This method verifies that there are no (obvious) ambiguities with the
614
+ # provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
615
+ # and +strip+ were both equal to +\t+, then there would be no clear way to
616
+ # parse the input.
617
+ def validate_strip_and_col_sep_options
618
+ return unless @strip
619
+
620
+ if @strip.is_a?(String)
621
+ if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
622
+ raise ArgumentError,
623
+ "The provided strip (#{@escaped_strip}) and " \
624
+ "col_sep (#{@escaped_column_separator}) options are incompatible."
625
+ end
626
+ else
627
+ if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
628
+ raise ArgumentError,
629
+ "The provided strip (true) and " \
630
+ "col_sep (#{@escaped_column_separator}) options are incompatible."
631
+ end
632
+ end
633
+ end
634
+
533
635
  def prepare_quoted
534
636
  if @quote_character
535
637
  @quotes = Regexp.new(@escaped_quote_character +
@@ -605,7 +707,7 @@ class CSV
605
707
  # do nothing: ensure will set default
606
708
  end
607
709
  end
608
- separator = $INPUT_RECORD_SEPARATOR if separator == :auto
710
+ separator = InputRecordSeparator.value if separator == :auto
609
711
  end
610
712
  separator.to_s.encode(@encoding)
611
713
  end
@@ -655,9 +757,10 @@ class CSV
655
757
  case headers
656
758
  when Array
657
759
  @raw_headers = headers
760
+ quoted_fields = FieldsConverter::NO_QUOTED_FIELDS
658
761
  @use_headers = true
659
762
  when String
660
- @raw_headers = parse_headers(headers)
763
+ @raw_headers, quoted_fields = parse_headers(headers)
661
764
  @use_headers = true
662
765
  when nil, false
663
766
  @raw_headers = nil
@@ -667,27 +770,41 @@ class CSV
667
770
  @use_headers = true
668
771
  end
669
772
  if @raw_headers
670
- @headers = adjust_headers(@raw_headers)
773
+ @headers = adjust_headers(@raw_headers, quoted_fields)
671
774
  else
672
775
  @headers = nil
673
776
  end
674
777
  end
675
778
 
676
779
  def parse_headers(row)
677
- CSV.parse_line(row,
678
- col_sep: @column_separator,
679
- row_sep: @row_separator,
680
- quote_char: @quote_character)
780
+ quoted_fields = []
781
+ converter = lambda do |field, info|
782
+ quoted_fields << info.quoted?
783
+ field
784
+ end
785
+ headers = CSV.parse_line(row,
786
+ col_sep: @column_separator,
787
+ row_sep: @row_separator,
788
+ quote_char: @quote_character,
789
+ converters: [converter])
790
+ [headers, quoted_fields]
681
791
  end
682
792
 
683
- def adjust_headers(headers)
684
- adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno)
793
+ def adjust_headers(headers, quoted_fields)
794
+ adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields)
685
795
  adjusted_headers.each {|h| h.freeze if h.is_a? String}
686
796
  adjusted_headers
687
797
  end
688
798
 
689
799
  def prepare_parser
690
800
  @may_quoted = may_quoted?
801
+ if @quote_character.nil?
802
+ @parse_method = :parse_no_quote
803
+ elsif @liberal_parsing or @strip
804
+ @parse_method = :parse_quotable_robust
805
+ else
806
+ @parse_method = :parse_quotable_loose
807
+ end
691
808
  end
692
809
 
693
810
  def may_quoted?
@@ -704,26 +821,28 @@ class CSV
704
821
  sample[0, 128].index(@quote_character)
705
822
  end
706
823
 
707
- SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
708
- if SCANNER_TEST
709
- class UnoptimizedStringIO
710
- def initialize(string)
711
- @io = StringIO.new(string, "rb:#{string.encoding}")
712
- end
824
+ class UnoptimizedStringIO # :nodoc:
825
+ def initialize(string)
826
+ @io = StringIO.new(string, "rb:#{string.encoding}")
827
+ end
713
828
 
714
- def gets(*args)
715
- @io.gets(*args)
716
- end
829
+ def gets(*args)
830
+ @io.gets(*args)
831
+ end
717
832
 
718
- def each_line(*args, &block)
719
- @io.each_line(*args, &block)
720
- end
833
+ def each_line(*args, &block)
834
+ @io.each_line(*args, &block)
835
+ end
721
836
 
722
- def eof?
723
- @io.eof?
724
- end
837
+ def eof?
838
+ @io.eof?
725
839
  end
840
+ end
726
841
 
842
+ SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
843
+ if SCANNER_TEST
844
+ SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
845
+ SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
727
846
  def build_scanner
728
847
  inputs = @samples.collect do |sample|
729
848
  UnoptimizedStringIO.new(sample)
@@ -733,17 +852,27 @@ class CSV
733
852
  else
734
853
  inputs << @input
735
854
  end
736
- chunk_size = ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"
855
+ begin
856
+ chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
857
+ rescue # Ractor::IsolationError
858
+ # Ractor on Ruby 3.0 can't read ENV value.
859
+ chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE
860
+ end
861
+ chunk_size = Integer((chunk_size_value || "1"), 10)
737
862
  InputsScanner.new(inputs,
738
863
  @encoding,
739
- chunk_size: Integer(chunk_size, 10))
864
+ @row_separator,
865
+ chunk_size: chunk_size)
740
866
  end
741
867
  else
742
868
  def build_scanner
743
869
  string = nil
744
870
  if @samples.empty? and @input.is_a?(StringIO)
745
871
  string = @input.read
746
- elsif @samples.size == 1 and @input.respond_to?(:eof?) and @input.eof?
872
+ elsif @samples.size == 1 and
873
+ @input != ARGF and
874
+ @input.respond_to?(:eof?) and
875
+ @input.eof?
747
876
  string = @samples[0]
748
877
  end
749
878
  if string
@@ -752,8 +881,7 @@ class CSV
752
881
  !line.valid_encoding?
753
882
  end
754
883
  if index
755
- message = "Invalid byte sequence in #{@encoding}"
756
- raise MalformedCSVError.new(message, @lineno + index + 1)
884
+ raise InvalidEncodingError.new(@encoding, @lineno + index + 1)
757
885
  end
758
886
  end
759
887
  Scanner.new(string)
@@ -762,7 +890,7 @@ class CSV
762
890
  StringIO.new(sample)
763
891
  end
764
892
  inputs << @input
765
- InputsScanner.new(inputs, @encoding)
893
+ InputsScanner.new(inputs, @encoding, @row_separator)
766
894
  end
767
895
  end
768
896
  end
@@ -796,6 +924,14 @@ class CSV
796
924
  end
797
925
  end
798
926
 
927
+ def validate_field_size(field)
928
+ return unless @max_field_size
929
+ return if field.size <= @max_field_size
930
+ ignore_broken_line
931
+ message = "Field size exceeded: #{field.size} > #{@max_field_size}"
932
+ raise MalformedCSVError.new(message, @lineno)
933
+ end
934
+
799
935
  def parse_no_quote(&block)
800
936
  @scanner.each_line(@row_separator) do |line|
801
937
  next if @skip_lines and skip_line?(line)
@@ -808,6 +944,11 @@ class CSV
808
944
  else
809
945
  line = strip_value(line)
810
946
  row = line.split(@split_column_separator, -1)
947
+ if @max_field_size
948
+ row.each do |column|
949
+ validate_field_size(column)
950
+ end
951
+ end
811
952
  n_columns = row.size
812
953
  i = 0
813
954
  while i < n_columns
@@ -838,31 +979,37 @@ class CSV
838
979
  next
839
980
  end
840
981
  row = []
982
+ quoted_fields = FieldsConverter::NO_QUOTED_FIELDS
841
983
  elsif line.include?(@cr) or line.include?(@lf)
842
984
  @scanner.keep_back
843
- @need_robust_parsing = true
985
+ @parse_method = :parse_quotable_robust
844
986
  return parse_quotable_robust(&block)
845
987
  else
846
988
  row = line.split(@split_column_separator, -1)
989
+ quoted_fields = []
847
990
  n_columns = row.size
848
991
  i = 0
849
992
  while i < n_columns
850
993
  column = row[i]
851
994
  if column.empty?
995
+ quoted_fields << false
852
996
  row[i] = nil
853
997
  else
854
998
  n_quotes = column.count(@quote_character)
855
999
  if n_quotes.zero?
1000
+ quoted_fields << false
856
1001
  # no quote
857
1002
  elsif n_quotes == 2 and
858
1003
  column.start_with?(@quote_character) and
859
1004
  column.end_with?(@quote_character)
1005
+ quoted_fields << true
860
1006
  row[i] = column[1..-2]
861
1007
  else
862
1008
  @scanner.keep_back
863
- @need_robust_parsing = true
1009
+ @parse_method = :parse_quotable_robust
864
1010
  return parse_quotable_robust(&block)
865
1011
  end
1012
+ validate_field_size(row[i])
866
1013
  end
867
1014
  i += 1
868
1015
  end
@@ -870,13 +1017,14 @@ class CSV
870
1017
  @scanner.keep_drop
871
1018
  @scanner.keep_start
872
1019
  @last_line = original_line
873
- emit_row(row, &block)
1020
+ emit_row(row, quoted_fields, &block)
874
1021
  end
875
1022
  @scanner.keep_drop
876
1023
  end
877
1024
 
878
1025
  def parse_quotable_robust(&block)
879
1026
  row = []
1027
+ quoted_fields = []
880
1028
  skip_needless_lines
881
1029
  start_row
882
1030
  while true
@@ -886,35 +1034,42 @@ class CSV
886
1034
  value = parse_column_value
887
1035
  if value
888
1036
  @scanner.scan_all(@strip_value) if @strip_value
889
- if @field_size_limit and value.size >= @field_size_limit
890
- ignore_broken_line
891
- raise MalformedCSVError.new("Field size exceeded", @lineno)
892
- end
1037
+ validate_field_size(value)
893
1038
  end
894
1039
  if parse_column_end
895
1040
  row << value
1041
+ quoted_fields << @quoted_column_value
896
1042
  elsif parse_row_end
897
1043
  if row.empty? and value.nil?
898
1044
  emit_row([], &block) unless @skip_blanks
899
1045
  else
900
1046
  row << value
901
- emit_row(row, &block)
1047
+ quoted_fields << @quoted_column_value
1048
+ emit_row(row, quoted_fields, &block)
902
1049
  row = []
1050
+ quoted_fields.clear
903
1051
  end
904
1052
  skip_needless_lines
905
1053
  start_row
906
1054
  elsif @scanner.eos?
907
1055
  break if row.empty? and value.nil?
908
1056
  row << value
909
- emit_row(row, &block)
1057
+ quoted_fields << @quoted_column_value
1058
+ emit_row(row, quoted_fields, &block)
910
1059
  break
911
1060
  else
912
1061
  if @quoted_column_value
1062
+ if liberal_parsing? and (new_line = @scanner.check(@line_end))
1063
+ message =
1064
+ "Illegal end-of-line sequence outside of a quoted field " +
1065
+ "<#{new_line.inspect}>"
1066
+ else
1067
+ message = "Any value after quoted field isn't allowed"
1068
+ end
913
1069
  ignore_broken_line
914
- message = "Any value after quoted field isn't allowed"
915
1070
  raise MalformedCSVError.new(message, @lineno)
916
1071
  elsif @unquoted_column_value and
917
- (new_line = @scanner.scan(@cr_or_lf))
1072
+ (new_line = @scanner.scan(@line_end))
918
1073
  ignore_broken_line
919
1074
  message = "Unquoted fields do not allow new line " +
920
1075
  "<#{new_line.inspect}>"
@@ -923,7 +1078,7 @@ class CSV
923
1078
  ignore_broken_line
924
1079
  message = "Illegal quoting"
925
1080
  raise MalformedCSVError.new(message, @lineno)
926
- elsif (new_line = @scanner.scan(@cr_or_lf))
1081
+ elsif (new_line = @scanner.scan(@line_end))
927
1082
  ignore_broken_line
928
1083
  message = "New line must be <#{@row_separator.inspect}> " +
929
1084
  "not <#{new_line.inspect}>"
@@ -1004,7 +1159,7 @@ class CSV
1004
1159
  if (n_quotes % 2).zero?
1005
1160
  quotes[0, (n_quotes - 2) / 2]
1006
1161
  else
1007
- value = quotes[0, (n_quotes - 1) / 2]
1162
+ value = quotes[0, n_quotes / 2]
1008
1163
  while true
1009
1164
  quoted_value = @scanner.scan_all(@quoted_value)
1010
1165
  value << quoted_value if quoted_value
@@ -1028,11 +1183,9 @@ class CSV
1028
1183
  n_quotes = quotes.size
1029
1184
  if n_quotes == 1
1030
1185
  break
1031
- elsif (n_quotes % 2) == 1
1032
- value << quotes[0, (n_quotes - 1) / 2]
1033
- break
1034
1186
  else
1035
1187
  value << quotes[0, n_quotes / 2]
1188
+ break if (n_quotes % 2) == 1
1036
1189
  end
1037
1190
  end
1038
1191
  value
@@ -1068,18 +1221,15 @@ class CSV
1068
1221
 
1069
1222
  def strip_value(value)
1070
1223
  return value unless @strip
1071
- return nil if value.nil?
1224
+ return value if value.nil?
1072
1225
 
1073
1226
  case @strip
1074
1227
  when String
1075
- size = value.size
1076
- while value.start_with?(@strip)
1077
- size -= 1
1078
- value = value[1, size]
1228
+ while value.delete_prefix!(@strip)
1229
+ # do nothing
1079
1230
  end
1080
- while value.end_with?(@strip)
1081
- size -= 1
1082
- value = value[0, size]
1231
+ while value.delete_suffix!(@strip)
1232
+ # do nothing
1083
1233
  end
1084
1234
  else
1085
1235
  value.strip!
@@ -1089,7 +1239,7 @@ class CSV
1089
1239
 
1090
1240
  def ignore_broken_line
1091
1241
  @scanner.scan_all(@not_line_end)
1092
- @scanner.scan_all(@cr_or_lf)
1242
+ @scanner.scan_all(@line_end)
1093
1243
  @lineno += 1
1094
1244
  end
1095
1245
 
@@ -1102,22 +1252,22 @@ class CSV
1102
1252
  @scanner.keep_start
1103
1253
  end
1104
1254
 
1105
- def emit_row(row, &block)
1255
+ def emit_row(row, quoted_fields=FieldsConverter::NO_QUOTED_FIELDS, &block)
1106
1256
  @lineno += 1
1107
1257
 
1108
1258
  raw_row = row
1109
1259
  if @use_headers
1110
1260
  if @headers.nil?
1111
- @headers = adjust_headers(row)
1261
+ @headers = adjust_headers(row, quoted_fields)
1112
1262
  return unless @return_headers
1113
1263
  row = Row.new(@headers, row, true)
1114
1264
  else
1115
1265
  row = Row.new(@headers,
1116
- @fields_converter.convert(raw_row, @headers, @lineno))
1266
+ @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields))
1117
1267
  end
1118
1268
  else
1119
1269
  # convert fields, if needed...
1120
- row = @fields_converter.convert(raw_row, nil, @lineno)
1270
+ row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
1121
1271
  end
1122
1272
 
1123
1273
  # inject unconverted fields and accessor, if requested...