csv 3.0.0 → 3.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/csv/parser.rb ADDED
@@ -0,0 +1,1092 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "delete_suffix"
6
+ require_relative "match_p"
7
+ require_relative "row"
8
+ require_relative "table"
9
+
10
+ using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
11
+ using CSV::MatchP if CSV.const_defined?(:MatchP)
12
+
13
+ class CSV
14
+ class Parser
15
# Signals that a chunk of input is not valid in its own encoding.
# Raised while reading chunks and translated into MalformedCSVError by #parse.
class InvalidEncoding < StandardError; end
17
+
18
# StringScanner over one complete in-memory string. On top of the normal
# scanning API it maintains a stack of "kept" byte positions so callers can
# later extract the text scanned since a mark (keep_end), rewind to the
# mark (keep_back) or simply forget it (keep_drop).
class Scanner < StringScanner
  # The whole input is already in memory, so a plain +scan+ sees everything.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    # Stack of byte offsets pushed by keep_start.
    @keeps = []
  end

  # Yields every +row_separator+-terminated line of the not-yet-scanned
  # input. The scan position is moved past a line before it is yielded.
  def each_line(row_separator)
    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the latest mark and returns the text between it and the current
  # position. +pos+ is a byte offset, so the span has to be cut with
  # byteslice; character indexing returns the wrong text as soon as the
  # input contains multibyte characters.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the latest mark and rewinds the scan position to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the latest mark without using it.
  def keep_drop
    @keeps.pop
  end
end
52
+
53
# Scanner over a list of inputs that are consumed chunk by chunk. Only the
# current chunk is held in a StringScanner; the "keep" stack carries text
# across chunk boundaries so spans can still be extracted or rewound.
class InputsScanner
  # inputs::     objects to read from, consumed in order (the array is copied).
  #              StringIO inputs are taken whole; anything else is read with
  #              +gets(nil, chunk_size)+.
  # encoding::   encoding used for the empty chunk created at end of input.
  # chunk_size:: limit passed to +gets+ for non-StringIO inputs.
  def initialize(inputs, encoding, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    # Stack of [byte offset in current chunk, text kept from earlier chunks].
    @keeps = []
    read_chunk
  end

  # Yields each +row_separator+-terminated line, joining pieces that are
  # split across chunk boundaries. A trailing piece without separator is
  # yielded last.
  def each_line(row_separator)
    buffer = nil
    input = @scanner.rest
    position = @scanner.pos
    offset = 0
    n_row_separator_chars = row_separator.size
    while true
      input.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if buffer
          # A two character separator may itself be split between two
          # chunks: the buffer ends with its first character and the new
          # line starts with its second one.
          if n_row_separator_chars == 2 and
             buffer.end_with?(row_separator[0]) and
             line.start_with?(row_separator[1])
            buffer << line[0]
            line = line[1..-1]
            position += buffer.bytesize + offset
            @scanner.pos = position
            offset = 0
            yield(buffer)
            buffer = nil
            next if line.empty?
          else
            buffer << line
            line = buffer
            buffer = nil
          end
        end
        if line.end_with?(row_separator)
          position += line.bytesize + offset
          @scanner.pos = position
          offset = 0
          yield(line)
        else
          # Incomplete line: wait for the next piece.
          buffer = line
        end
      end
      break unless read_chunk
      input = @scanner.rest
      position = @scanner.pos
      # The buffered bytes belong to the previous chunk, not to this one.
      offset = -buffer.bytesize if buffer
    end
    yield(buffer) if buffer
  end

  # Scans +pattern+ in the current chunk only. When the match consumed the
  # chunk, the next chunk is loaded so that a following scan has data.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Scans +pattern+ and keeps scanning in the following chunks for as long
  # as each match reaches the end of its chunk.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Remembers the current position in the current chunk.
  def keep_start
    @keeps.push([@scanner.pos, nil])
  end

  # Pops the latest mark and returns everything scanned since it, including
  # text carried over from earlier chunks. Scanner positions are byte
  # offsets, so the current chunk is cut with byteslice (as keep_back and
  # read_chunk do); character indexing breaks on multibyte input.
  def keep_end
    start, buffer = @keeps.pop
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Pops the latest mark and rewinds to it. When the mark lies in an earlier
  # chunk, the kept text becomes the current chunk again and the unread part
  # of the present chunk is pushed back as the next input.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      keep = string.byteslice(start, string.bytesize - start)
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Pops the latest mark without using it.
  def keep_drop
    @keeps.pop
  end

  # Unscanned part of the current chunk only.
  def rest
    @scanner.rest
  end

  private
  # Replaces the current chunk with the next one. Returns true when a new
  # chunk is available and false when all inputs are exhausted. Raises
  # InvalidEncoding when the new chunk is not valid in its encoding.
  def read_chunk
    return false if @last_scanner

    unless @keeps.empty?
      # The current chunk is about to be discarded: move the kept part of it
      # into the innermost mark's buffer and re-base the mark to offset 0.
      keep = @keeps.last
      keep_start = keep[0]
      string = @scanner.string
      keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
      if keep_data
        keep_buffer = keep[1]
        if keep_buffer
          keep_buffer << keep_data
        else
          keep[1] = keep_data.dup
        end
      end
      keep[0] = 0
    end

    input = @inputs.first
    case input
    when StringIO
      string = input.string
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(nil, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: continue with the next one, if any.
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
224
+
225
# Remembers +input+ and +options+ and derives all parser state from the
# options (see #prepare). @samples collects the chunks that are read from
# the input while the row separator is being detected.
def initialize(input, options)
  @samples = []
  @options = options
  @input = input

  prepare
end
232
+
233
# Read-only views of the parser configuration and progress.

# Column separator as prepared from the options.
def column_separator; @column_separator; end

# Row separator as prepared from the options.
def row_separator; @row_separator; end

# Quote character, or nil when quoting is disabled.
def quote_character; @quote_character; end

# Value of the :field_size_limit option.
def field_size_limit; @field_size_limit; end

# Prepared :skip_lines matcher (String, Regexp, other matcher or nil).
def skip_lines; @skip_lines; end

# Value of the :unconverted_fields option.
def unconverted_fields?; @unconverted_fields; end

# Adjusted headers, or nil while they are not known yet.
def headers; @headers; end

# True when headers are in use but have not been read yet, i.e. the next
# emitted row is the header row.
def header_row?
  @use_headers && @headers.nil?
end

# Value of the :return_headers option.
def return_headers?; @return_headers; end

# Value of the :skip_blanks option.
def skip_blanks?; @skip_blanks; end

# Whether liberal parsing is enabled.
def liberal_parsing?; @liberal_parsing; end

# Number of lines handled so far.
def lineno; @lineno; end

# Raw text of the most recently handled line.
def line
  last_line
end
284
+
285
# Parses the input and yields every resulting row. Without a block an
# Enumerator over the rows is returned.
#
# When headers were given in the options and :return_headers is set, the
# header row is yielded first. Invalid bytes in the input are reported as
# MalformedCSVError.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers && @headers && @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    # Without a scanner nothing could be skipped, so the broken line is the
    # one after the last counted line.
    error_lineno =
      if @scanner
        ignore_broken_line
        @lineno
      else
        @lineno + 1
      end
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, error_lineno)
  end
end
316
+
317
# Whether rows are returned with headers (set from the :headers option).
def use_headers?; @use_headers; end
320
+
321
+ private
322
# Derives the whole parser state from @options, one concern per helper.
# The order is significant for several steps:
# * prepare_backslash uses @backslash_quote (prepare_variable) and
#   @escaped_quote_character (prepare_quote_character).
# * prepare_strip checks @quote_character.
# * prepare_quoted and prepare_unquoted use the escaped separators and
#   @escaped_strip computed by the steps before them.
# * prepare_header parses/adjusts headers with the prepared separators and
#   with @lineno from prepare_line.
# * prepare_parser inspects @samples, which is filled while
#   prepare_separators resolves the row separator.
+ def prepare
323
+ prepare_variable
324
+ prepare_quote_character
325
+ prepare_backslash
326
+ prepare_skip_lines
327
+ prepare_strip
328
+ prepare_separators
329
+ prepare_quoted
330
+ prepare_unquoted
331
+ prepare_line
332
+ prepare_header
333
+ prepare_parser
334
+ end
335
+
336
# Copies the simple options into instance variables and decodes the
# :liberal_parsing option, which is either a flag or a Hash with the
# :double_quote_outside_quote and :backslash_quote sub-options. Liberal
# parsing always requires the robust parser.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]

  liberal = @options[:liberal_parsing]
  if liberal
    @liberal_parsing = true
    if liberal.is_a?(Hash)
      @double_quote_outside_quote, @backslash_quote =
        liberal.values_at(:double_quote_outside_quote, :backslash_quote)
    else
      @double_quote_outside_quote = @backslash_quote = false
    end
    @need_robust_parsing = true
  else
    @liberal_parsing = @backslash_quote = false
  end

  @unconverted_fields = @options[:unconverted_fields]
  @field_size_limit = @options[:field_size_limit]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
361
+
362
# Prepares the quote character and the patterns derived from it. A nil
# :quote_character disables quoting; anything else is converted to a String
# in the parser's encoding and must be exactly one character long.
#
# Raises ArgumentError for a quote "character" of any other length.
def prepare_quote_character
  @quote_character = @options[:quote_character]
  if @quote_character
    @quote_character = @quote_character.to_s.encode(@encoding)
    unless @quote_character.length == 1
      raise ArgumentError,
            ":quote_char has to be nil or a single character String"
    end
    @double_quote_character = @quote_character * 2
    @escaped_quote_character = Regexp.escape(@quote_character)
    @escaped_quote = Regexp.new(@escaped_quote_character)
  elsif @quote_character.nil?
    @escaped_quote_character = nil
    @escaped_quote = nil
  else
    # false is not nil: it is stringified like any other value and then
    # rejected by the length check.
    @quote_character = @quote_character.to_s.encode(@encoding)
    raise ArgumentError,
          ":quote_char has to be nil or a single character String"
  end
end
378
+
379
# Prepares the values needed for the :backslash_quote liberal parsing
# option; does nothing when that option is off.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)

  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  if @quote_character.nil?
    @backslash_quote_character = nil
  else
    # This value is used as a String pattern for gsub!, which matches it
    # literally. It therefore has to contain the raw quote character, not
    # the regexp-escaped one: with a quote such as "|" the escaped form
    # carries an extra backslash and would never match.
    @backslash_quote_character = @backslash_character + @quote_character
  end
end
393
+
394
# Validates and stores the :skip_lines option. A String is converted to the
# parser's encoding; nil and Regexp are stored unchanged; any other object
# is accepted as long as it responds to #match.
#
# Raises ArgumentError for an object that does not respond to #match.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  @skip_lines =
    if matcher.is_a?(String)
      matcher.encode(@encoding)
    else
      unless matcher.nil? || matcher.is_a?(Regexp) || matcher.respond_to?(:match)
        raise ArgumentError,
              ":skip_lines has to respond to \#match: #{matcher.inspect}"
      end
      matcher
    end
end
410
+
411
# Prepares the :strip option.
#
# * A String must be exactly one character long; that character is stripped.
# * Any other true value strips blanks.
# * A false value disables stripping.
#
# Sets @escaped_strip (for use inside other patterns), @strip_value (only
# when quoting is enabled) and switches to robust parsing when stripping is
# active. Raises ArgumentError for an empty or multi character String.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  if @strip.is_a?(String)
    case @strip.length
    when 0
      raise ArgumentError, ":strip must not be an empty String"
    when 1
      # ok
    else
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip +
                                "+".encode(@encoding))
    end
    @need_robust_parsing = true
  elsif @strip
    # CR and LF must not be part of the strip set: the robust parser applies
    # @strip_value right after each value, so it would consume the row
    # separator that follows and the end of the row would be missed.
    strip_values = " \t\f\v"
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
    end
    @need_robust_parsing = true
  end
end
440
+
441
# Probe that runs once while the class body is evaluated: does this version
# of StringScanner#scan accept a String pattern, or does it raise TypeError?
# The result is stored in a class variable that is read when the column
# separator pattern is prepared.
+ begin
442
+ StringScanner.new("x").scan("x")
443
+ rescue TypeError
444
+ @@string_scanner_scan_accept_string = false
445
+ else
446
+ @@string_scanner_scan_accept_string = true
447
+ end
448
+
449
# Prepares the column and row separators and every pattern derived from
# them.
#
# Separators of more than one character additionally get one pattern per
# character (@column_ends / @row_ends) and, for columns, a pattern for runs
# of the first separator character, so a separator can still be recognised
# piece by piece.
#
# Raises ArgumentError when the column separator is empty.
def prepare_separators
  column_separator = @options[:column_separator]
  @column_separator = column_separator.to_s.encode(@encoding)
  # Reject an empty separator up front. Otherwise the first-character lookup
  # below yields nil and the failure surfaces as an unrelated TypeError,
  # after the input may already have been sampled for row separator
  # detection.
  if @column_separator.size < 1
    message = ":col_sep must be 1 or more characters: "
    message += column_separator.inspect
    raise ArgumentError, message
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    @column_ends = @column_separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
    @first_column_separators = Regexp.new(@escaped_first_column_separator +
                                          "+".encode(@encoding))
  else
    # Use the plain String when StringScanner#scan supports it.
    if @@string_scanner_scan_accept_string
      @column_end = @column_separator
    else
      @column_end = Regexp.new(@escaped_column_separator)
    end
    @column_ends = nil
    @first_column_separators = nil
  end

  escaped_row_separator = Regexp.escape(@row_separator)
  @row_end = Regexp.new(escaped_row_separator)
  if @row_separator.size > 1
    @row_ends = @row_separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
  else
    @row_ends = nil
  end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
488
+
489
# Prepares the patterns used inside quoted fields and the separator used to
# split a line into columns in the line based parsers.
def prepare_quoted
  if @quote_character
    # A run of quote characters.
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    # Content of a quoted field up to the next quote (or backslash when
    # backslash quoting is enabled).
    no_quoted_values = @escaped_quote_character.dup
    if @backslash_quote
      no_quoted_values << @escaped_backslash_character
    end
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end
  if @escaped_strip
    # For a String :strip, @escaped_strip is one escaped character and can
    # be repeated directly. Otherwise it is a plain list of blank characters
    # that only means "any of these" inside a character class; without the
    # brackets the pattern would require that exact sequence of characters.
    if @strip.is_a?(String)
      strip_pattern = @escaped_strip
    else
      strip_pattern = "[".encode(@encoding) +
                      @escaped_strip +
                      "]".encode(@encoding)
    end
    @split_column_separator = Regexp.new(strip_pattern +
                                         "*".encode(@encoding) +
                                         @escaped_column_separator +
                                         strip_pattern +
                                         "*".encode(@encoding))
  else
    if @column_separator == " ".encode(@encoding)
      # String#split treats a single space String specially (whitespace
      # splitting), so a Regexp is needed to split on exactly one space.
      @split_column_separator = Regexp.new(@escaped_column_separator)
    else
      @split_column_separator = @column_separator
    end
  end
end
515
+
516
# Builds @unquoted_value, the pattern for a run of characters that can
# belong to an unquoted field: anything except CR, LF, the first character
# of the column separator, the quote character (unless parsing liberally)
# and the strip characters. Not needed when quoting is disabled.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  excluded << @escaped_strip if @escaped_strip
  @unquoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
end
531
+
532
# Returns the row separator as a String in the parser's encoding.
#
# For +:auto+ the separator is detected from the data: directly from the
# string of a StringIO, or from chunks read with +gets+ from other inputs.
# Every chunk read this way is remembered in @samples. When nothing can be
# detected, $INPUT_RECORD_SEPARATOR is used.
def resolve_row_separator(separator)
  if separator == :auto
    cr = "\r".encode(@encoding)
    lf = "\n".encode(@encoding)
    if @input.is_a?(StringIO)
      separator = detect_row_separator(@input.string, cr, lf)
    elsif @input.respond_to?(:gets)
      chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
      begin
        while separator == :auto
          # Out of data: probably a single line. The fallback after this
          # loop supplies the default separator.
          sample = @input.gets(nil, chunk_size)
          break unless sample

          # A chunk ending in CR is ambiguous (CR or CR LF): read one more
          # character before deciding.
          sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

          @samples << sample

          separator = detect_row_separator(sample, cr, lf)
        end
      rescue IOError
        # Unreadable input: keep :auto so the fallback below applies.
      end
    end
    separator = $INPUT_RECORD_SEPARATOR if separator == :auto
  end
  separator.to_s.encode(@encoding)
end
569
+
570
# Guesses the row separator used in +sample+.
#
# sample:: text to inspect
# cr, lf:: carriage return and line feed in the encoding of +sample+
#
# Returns +cr + lf+ when the first LF directly follows a CR, +cr+ when a CR
# occurs before the first LF (or there is no LF at all), +lf+ when an LF
# comes first, and +:auto+ when the sample contains neither.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  # Only a CR before the first LF is relevant, so search just that prefix.
  # A CR found this way always precedes the LF, which is why no
  # "LF before CR" case has to be handled below.
  searched = lf_index ? sample[0, lf_index] : sample
  cr_index = searched.index(cr)

  if cr_index
    cr_index + 1 == lf_index ? cr + lf : cr
  elsif lf_index
    lf
  else
    :auto
  end
end
593
+
594
# Resets the line bookkeeping: no line counted, no last line remembered and
# no scanner built yet.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
599
+
600
# Returns the text of the line handled last. When it has not been recorded
# yet and a scanner exists, it is taken from the scanner's current keep
# (which pops that keep) and cached.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
607
+
608
# Interprets the :headers option.
#
# * Array: used as the raw headers.
# * String: parsed as one CSV line to get the raw headers.
# * nil / false: headers are not used.
# * anything else: headers are used and will be taken from the first row.
#
# When raw headers are known, @headers holds their adjusted form; otherwise
# it is nil.
def prepare_header
  @return_headers = @options[:return_headers]

  header_option = @options[:headers]
  @raw_headers =
    case header_option
    when Array then header_option
    when String then parse_headers(header_option)
    end
  # Every value except nil and false turns headers on.
  @use_headers = header_option ? true : false
  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
632
+
633
# Parses a header line given as a String, using this parser's column
# separator, row separator and quote character.
def parse_headers(row)
  header_options = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **header_options)
end
639
+
640
# Runs +headers+ through the header fields converter and freezes every
# String in the result. Returns the converted headers.
def adjust_headers(headers)
  @header_fields_converter.convert(headers, nil, @lineno).tap do |converted|
    converted.each { |header| header.freeze if header.is_a?(String) }
  end
end
645
+
646
# Caches the result of may_quoted?. The cached flag selects whether a
# quoted or an unquoted value is tried first when a column value is parsed.
+ def prepare_parser
647
+ @may_quoted = may_quoted?
648
+ end
649
+
650
# Heuristic: does the beginning of the input contain a quote character?
# Looks at the first 128 characters of the StringIO's string or of the first
# sample. Returns false when quoting is disabled or no sample exists,
# otherwise the index of the first quote character or nil.
def may_quoted?
  return false if @quote_character.nil?

  if @input.is_a?(StringIO)
    head = @input.string
  elsif @samples.empty?
    return false
  else
    head = @samples.first
  end
  head[0, 128].index(@quote_character)
end
661
+
662
# Setting CSV_PARSER_SCANNER_TEST=yes forces the chunked InputsScanner with a
# tiny chunk size for every input.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # StringIO wrapper that is not a StringIO itself, so InputsScanner reads
  # it chunk by chunk instead of taking the whole string at once.
  class UnoptimizedStringIO
    def initialize(string)
      @io = StringIO.new(string)
    end

    def gets(*args)
      @io.gets(*args)
    end

    def each_line(*args, &block)
      @io.each_line(*args, &block)
    end

    def eof?
      @io.eof?
    end
  end

  # Builds an InputsScanner over the samples followed by the input, all of
  # them read in chunks of CSV_PARSER_SCANNER_TEST_CHUNK_SIZE (default 1).
  def build_scanner
    inputs = @samples.map { |sample| UnoptimizedStringIO.new(sample) }
    inputs <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.string)
      else
        @input
      end
    chunk_size = ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"
    InputsScanner.new(inputs,
                      @encoding,
                      chunk_size: Integer(chunk_size, 10))
  end
else
  # Builds the scanner for parsing. When the complete data is available as
  # one string (a StringIO that was never sampled, or a single sample that
  # exhausted the input) the simple Scanner is used, otherwise an
  # InputsScanner over the samples followed by the input.
  #
  # Raises MalformedCSVError, with the number of the first offending line,
  # when the complete string has an invalid byte sequence.
  def build_scanner
    whole =
      if @samples.empty? && @input.is_a?(StringIO)
        @input.string
      elsif @samples.size == 1 && @input.respond_to?(:eof?) && @input.eof?
        @samples[0]
      end

    unless whole
      inputs = @samples.map { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding)
    end

    unless whole.valid_encoding?
      broken_index = whole.lines(@row_separator).index do |line|
        !line.valid_encoding?
      end
      if broken_index
        message = "Invalid byte sequence in #{@encoding}"
        raise MalformedCSVError.new(message, @lineno + broken_index + 1)
      end
    end
    Scanner.new(whole)
  end
end
724
+
725
# Consumes, and counts, every upcoming line that matches :skip_lines. The
# scanner is left at the start of the first line that must be parsed.
def skip_needless_lines
  return unless @skip_lines

  while true
    @scanner.keep_start
    candidate = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    candidate << @row_separator if parse_row_end
    unless skip_line?(candidate)
      # Not skipped: rewind so the line is parsed normally.
      @scanner.keep_back
      return
    end
    @lineno += 1
    @scanner.keep_drop
  end
end
741
+
742
# Tells whether +line+ matches the :skip_lines setting: a String matches by
# inclusion, a Regexp with match?, and any other object is asked through its
# own #match (whose result is returned as is).
def skip_line?(line)
  if @skip_lines.is_a?(String)
    line.include?(@skip_lines)
  elsif @skip_lines.is_a?(Regexp)
    @skip_lines.match?(line)
  else
    @skip_lines.match(line)
  end
end
752
+
753
# Line based parser used when quoting is disabled: every line is one row and
# is simply split at the column separator. Empty columns become nil.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines && skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks
      fields = []
    else
      content = strip_value(content)
      fields = content.split(@split_column_separator, -1)
      fields.map! { |field| field.empty? ? nil : field }
    end
    @last_line = raw_line
    emit_row(fields, &block)
  end
end
776
+
777
# Fast line based parser for data that may contain quotes. It handles only
# the simple cases: no CR/LF inside a line and columns that are either free
# of quotes or wrapped in exactly one pair of quotes. As soon as a line does
# not fit, the scanner is rewound to the start of that line and parsing
# continues with the robust parser.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |raw_line|
    if @skip_lines && skip_line?(raw_line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    content = raw_line.delete_suffix(@row_separator)

    if content.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      fields = []
    elsif content.include?(@cr) || content.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      fields = content.split(@split_column_separator, -1)
      fields.each_with_index do |field, index|
        if field.empty?
          fields[index] = nil
          next
        end

        n_quotes = field.count(@quote_character)
        next if n_quotes.zero? # no quote

        if n_quotes == 2 &&
           field.start_with?(@quote_character) &&
           field.end_with?(@quote_character)
          fields[index] = field[1..-2]
        else
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
      end
    end
    # The line is done: move the keep to the start of the next line.
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = raw_line
    emit_row(fields, &block)
  end
  @scanner.keep_drop
end
831
+
832
# Full parser: reads the input value by value and supports quoted fields
# (including separators and new lines inside quotes), stripping, liberal
# parsing and the field size limit.
#
# Raises MalformedCSVError for data that cannot be parsed; the rest of the
# broken line is consumed before the error is raised.
def parse_quotable_robust(&block)
  fields = []
  skip_needless_lines
  start_row
  # "while true" on purpose: Kernel#loop would rescue a StopIteration coming
  # out of the caller's block.
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      if @field_size_limit && value.size >= @field_size_limit
        ignore_broken_line
        raise MalformedCSVError.new("Field size exceeded", @lineno)
      end
    end

    if parse_column_end
      fields << value
    elsif parse_row_end
      if fields.empty? && value.nil?
        # Blank line.
        emit_row([], &block) unless @skip_blanks
      else
        fields << value
        emit_row(fields, &block)
        fields = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      # Last row without a trailing row separator.
      break if fields.empty? && value.nil?
      fields << value
      emit_row(fields, &block)
      break
    elsif @quoted_column_value
      ignore_broken_line
      message = "Any value after quoted field isn't allowed"
      raise MalformedCSVError.new(message, @lineno)
    elsif @unquoted_column_value &&
          (new_line = @scanner.scan(@cr_or_lf))
      ignore_broken_line
      message = "Unquoted fields do not allow new line <#{new_line.inspect}>"
      raise MalformedCSVError.new(message, @lineno)
    elsif @scanner.rest.start_with?(@quote_character)
      ignore_broken_line
      raise MalformedCSVError.new("Illegal quoting", @lineno)
    elsif (new_line = @scanner.scan(@cr_or_lf))
      ignore_broken_line
      message = "New line must be <#{@row_separator.inspect}> not <#{new_line.inspect}>"
      raise MalformedCSVError.new(message, @lineno)
    else
      ignore_broken_line
      raise MalformedCSVError.new("TODO: Meaningful message",
                                  @lineno)
    end
  end
end
893
+
894
# Parses one column value at the current position and returns it (nil when
# there is no value).
#
# In strict mode a value is either quoted or unquoted; which form is tried
# first depends on @may_quoted. In liberal mode an unquoted tail directly
# after a quoted part is accepted and the quotes are kept in the result.
def parse_column_value
  unless @liberal_parsing
    return parse_quoted_column_value || parse_unquoted_column_value if @may_quoted
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  tail = parse_unquoted_column_value
  return quoted unless tail

  if @double_quote_outside_quote
    tail = tail.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + tail if quoted.empty?
  end
  @quote_character + quoted + @quote_character + tail
end
922
+
923
# Parses an unquoted value at the current position; returns nil when there
# is none. With a multi character column separator, characters that merely
# start like the separator are treated as part of the value.
def parse_unquoted_column_value
  value = @scanner.scan_all(@unquoted_value)
  return nil unless value

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Peek: is the complete column separator next?
      @scanner.keep_start
      at_column_end = @column_ends.all? do |column_end|
        @scanner.scan(column_end)
      end
      @scanner.keep_back
      break if at_column_end

      # Not the whole separator: the separator-like characters and whatever
      # follows them belong to the value.
      separator_like = @scanner.scan_all(@first_column_separators)
      break if separator_like.nil?
      value << separator_like

      more = @scanner.scan_all(@unquoted_value)
      break if more.nil?
      value << more
    end
  end
  value.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  value
end
947
+
948
# Parses a quoted value at the current position and returns its content
# with doubled quotes reduced to single ones; returns nil when the input
# does not start with a quote character.
#
# Quotes are consumed in runs. An even run contains only escaped quotes (as
# the opening run it is a complete value); an odd run additionally opens or
# closes the field.
#
# Raises MalformedCSVError when the closing quote is missing.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  n_opening = opening.size
  return opening[0, (n_opening - 2) / 2] if n_opening.even?

  value = opening[0, (n_opening - 1) / 2]
  while true
    content = @scanner.scan_all(@quoted_value)
    value << content if content

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      # A backslash escapes a following quote, otherwise it is literal.
      value << (@scanner.scan(@escaped_quote) ? @quote_character : @backslash_character)
      next
    end

    run = @scanner.scan_all(@quotes)
    unless run
      ignore_broken_line
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno)
    end

    n_run = run.size
    break if n_run == 1
    if n_run.odd?
      value << run[0, (n_run - 1) / 2]
      break
    end
    value << run[0, n_run / 2]
  end
  value
end
991
+
992
# Consumes a column separator at the current position. Returns true when
# one was consumed and false (leaving the position untouched) otherwise.
# A multi character separator that does not match in one piece is retried
# character by character.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched = @column_ends.all? { |column_end| @scanner.scan(column_end) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
1005
+
1006
# Consumes a row separator at the current position. Returns true when one
# was consumed and false (leaving the position untouched) otherwise.
# A multi character separator that does not match in one piece is retried
# character by character.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  matched = @row_ends.all? { |row_end| @scanner.scan(row_end) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
1018
+
1019
# Applies the :strip option to +value+ and returns the result.
#
# * stripping disabled: +value+ is returned untouched
# * nil value: nil
# * String :strip: every leading and trailing occurrence of that single
#   character is removed
# * any other true :strip: +value+ is stripped of whitespace in place
def strip_value(value)
  return value unless @strip
  return nil if value.nil?

  if @strip.is_a?(String)
    # @strip is one character long, so dropping one character per round
    # removes exactly one occurrence.
    value = value[1..-1] while value.start_with?(@strip)
    value = value[0...-1] while value.end_with?(@strip)
  else
    value.strip!
  end
  value
end
1039
+
1040
# Skips the remainder of the current line (everything up to the next CR or
# LF, then one CR or LF character) and counts the line.
def ignore_broken_line
  [@not_line_end, @cr_or_lf].each { |pattern| @scanner.scan_all(pattern) }
  @lineno += 1
end
1045
+
1046
# Starts a new keep for the row that is about to be parsed. The keep of the
# previous row is dropped unless its text was already recorded in
# @last_line; the recorded text is forgotten either way.
def start_row
  @scanner.keep_drop unless @last_line
  @last_line = nil
  @scanner.keep_start
end
1054
+
1055
# Counts the row, applies header handling and field conversion and yields
# the result.
#
# * Without headers the converted fields are yielded as an Array.
# * With headers that are not known yet, +row+ becomes the header row; it is
#   only yielded when :return_headers is set.
# * With known headers a Row of the converted fields is yielded.
def emit_row(row, &block)
  @lineno += 1

  raw_row = row
  if !@use_headers
    # convert fields, if needed...
    row = @fields_converter.convert(raw_row, nil, @lineno)
  elsif @headers.nil?
    @headers = adjust_headers(row)
    return unless @return_headers
    row = Row.new(@headers, row, true)
  else
    converted = @fields_converter.convert(raw_row, @headers, @lineno)
    row = Row.new(@headers, converted)
  end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
1080
+
1081
# Gives +row+ (and only this object) a reader method named
# unconverted_fields and stores +fields+ in the instance variable behind it.
# Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
1091
+ end
1092
+ end