csv 3.0.0 → 3.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/csv/parser.rb ADDED
@@ -0,0 +1,1092 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "delete_suffix"
6
+ require_relative "match_p"
7
+ require_relative "row"
8
+ require_relative "table"
9
+
10
+ using CSV::DeleteSuffix if CSV.const_defined?(:DeleteSuffix)
11
+ using CSV::MatchP if CSV.const_defined?(:MatchP)
12
+
13
+ class CSV
14
+ class Parser
15
+ class InvalidEncoding < StandardError
16
+ end
17
+
18
# StringScanner specialised for the case where the whole CSV text is already
# available as a single String.
#
# On top of StringScanner it adds:
# * +scan_all+, an alias of +scan+ (InputsScanner needs a distinct
#   implementation; here one String holds everything);
# * +each_line+, which walks the remaining text line by line while keeping
#   the scan pointer in sync;
# * a stack of "keep" marks (+keep_start+ / +keep_end+ / +keep_back+ /
#   +keep_drop+) used to recover or rewind the text scanned since a mark.
class Scanner < StringScanner
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    # Stack of byte offsets pushed by keep_start.
    @keeps = []
  end

  # Yields each remaining line (split on +row_separator+). The scan pointer
  # is moved past the line *before* it is yielded.
  def each_line(row_separator)
    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the latest mark and returns the text scanned since it.
  #
  # +pos+ is a byte offset, so the slice must be taken in bytes as well;
  # a character-indexed slice returns the wrong text as soon as the data
  # contains multibyte characters.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the latest mark and rewinds the scan pointer to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the latest mark without using it.
  def keep_drop
    @keeps.pop
  end
end
52
+
53
# Scanner over a list of inputs that are consumed one chunk at a time.
#
# Each input is either a StringIO (its whole String is used as one chunk) or
# an object responding to +gets(nil, size)+ (read in pieces of at most
# +chunk_size+). Only the current chunk is held in a StringScanner; the
# "keep" stack additionally buffers text that was scanned in earlier chunks
# so that keep_end / keep_back still work across chunk boundaries.
class InputsScanner
  # inputs::     Array of input objects; it is duplicated, not modified.
  # encoding::   Encoding used for the empty placeholder chunk.
  # chunk_size:: Maximum size passed to +gets+ for non-StringIO inputs.
  def initialize(inputs, encoding, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    # Stack of [byte offset in the current chunk, text buffered from
    # previous chunks or nil] pairs.
    @keeps = []
    read_chunk
  end

  # Yields every remaining line, joining lines that are split across chunks.
  def each_line(row_separator)
    # Incomplete line carried over from the previous chunk, if any.
    buffer = nil
    input = @scanner.rest
    position = @scanner.pos
    # Negative size of the carried-over text: those bytes belong to an
    # earlier chunk and must not advance the position in the current one.
    offset = 0
    n_row_separator_chars = row_separator.size
    while true
      input.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if buffer
          if n_row_separator_chars == 2 &&
             buffer.end_with?(row_separator[0]) &&
             line.start_with?(row_separator[1])
            # A two character separator was cut in half by the chunk
            # boundary: complete the buffered line with the second half.
            buffer << line[0]
            line = line[1..-1]
            position += buffer.bytesize + offset
            @scanner.pos = position
            offset = 0
            yield(buffer)
            buffer = nil
            next if line.empty?
          else
            buffer << line
            line = buffer
            buffer = nil
          end
        end
        if line.end_with?(row_separator)
          position += line.bytesize + offset
          @scanner.pos = position
          offset = 0
          yield(line)
        else
          # Not terminated yet: wait for the next chunk.
          buffer = line
        end
      end
      break unless read_chunk
      input = @scanner.rest
      position = @scanner.pos
      offset = -buffer.bytesize if buffer
    end
    yield(buffer) if buffer
  end

  # Scans +pattern+ in the current chunk only. When the match consumes the
  # chunk completely, the next chunk is loaded before returning.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Like #scan, but keeps matching +pattern+ in following chunks for as long
  # as each match reaches the end of its chunk, and returns the joined text.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? && read_chunk && (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Remembers the current position in the current chunk.
  def keep_start
    @keeps.push([@scanner.pos, nil])
  end

  # Pops the latest mark and returns the text scanned since it, including
  # whatever was buffered from earlier chunks.
  #
  # The scanner position is a byte offset, so the current chunk is sliced
  # with +byteslice+ (as keep_back and read_chunk do); a character-indexed
  # slice returns the wrong text for multibyte data.
  def keep_end
    start, buffer = @keeps.pop
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Pops the latest mark and rewinds to it. If the mark lies in an earlier
  # chunk, the rest of the current chunk is pushed back as a new input and
  # the buffered text becomes the current chunk again.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      keep = string.byteslice(start, string.bytesize - start)
      if keep && !keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Pops the latest mark without using it.
  def keep_drop
    @keeps.pop
  end

  # Unscanned part of the current chunk only.
  def rest
    @scanner.rest
  end

  private
  # Replaces the current chunk with the next one. Returns false when there
  # is nothing more to read, true otherwise. Raises InvalidEncoding when a
  # chunk is not valid in its encoding.
  def read_chunk
    return false if @last_scanner

    unless @keeps.empty?
      # Save the kept part of the chunk we are about to discard.
      keep = @keeps.last
      keep_start = keep[0]
      string = @scanner.string
      keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
      if keep_data
        keep_buffer = keep[1]
        if keep_buffer
          keep_buffer << keep_data
        else
          keep[1] = keep_data.dup
        end
      end
      # The mark now refers to the start of the next chunk.
      keep[0] = 0
    end

    input = @inputs.first
    case input
    when StringIO
      string = input.string
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(nil, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) && input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: continue with the next one, if any.
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
224
+
225
# Builds a parser for +input+ configured by the +options+ Hash and runs
# every preparation step immediately.
#
# @samples collects the pieces read from +input+ while the row separator is
# auto-detected, so they can be replayed when the scanner is built.
def initialize(input, options)
  @input, @options = input, options
  @samples = []

  prepare
end
232
+
233
# Column separator, as prepared by prepare_separators.
def column_separator
  @column_separator
end

# Row separator, as prepared by prepare_separators.
def row_separator
  @row_separator
end

# Quote character, or nil when quoting is disabled.
def quote_character
  @quote_character
end

# Value of the :field_size_limit option.
def field_size_limit
  @field_size_limit
end

# Matcher used to skip lines (String, Regexp, other matcher or nil).
def skip_lines
  @skip_lines
end

# Value of the :unconverted_fields option.
def unconverted_fields?
  @unconverted_fields
end

# Adjusted headers, or nil while they are not known yet.
def headers
  @headers
end

# Truthy while headers are in use but have not been determined yet.
def header_row?
  @use_headers && @headers.nil?
end

# Value of the :return_headers option.
def return_headers?
  @return_headers
end

# Value of the :skip_blanks option.
def skip_blanks?
  @skip_blanks
end

# Whether liberal parsing is enabled.
def liberal_parsing?
  @liberal_parsing
end

# Number of lines counted so far.
def lineno
  @lineno
end

# Last line handled by the parser (see last_line).
def line
  last_line
end
284
+
285
# Parses the input and yields every row to the block. Without a block an
# Enumerator over the rows is returned instead.
#
# When headers were given up front and :return_headers is set, the header
# row is yielded first. An InvalidEncoding raised while scanning is
# reported as a MalformedCSVError carrying the line number.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers && @headers && @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    # With a scanner the broken line is consumed (which also counts it);
    # without one the failure happened on the line after the last counted.
    error_lineno =
      if @scanner
        ignore_broken_line
        @lineno
      else
        @lineno + 1
      end
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, error_lineno)
  end
end
316
+
317
# Whether rows are handled with headers, as decided by prepare_header.
def use_headers?
  @use_headers
end
320
+
321
+ private
322
# Runs every preparation step. The order matters: later steps read the
# instance variables set by earlier ones.
def prepare
  %i[
    prepare_variable
    prepare_quote_character
    prepare_backslash
    prepare_skip_lines
    prepare_strip
    prepare_separators
    prepare_quoted
    prepare_unquoted
    prepare_line
    prepare_header
  ].each { |step| send(step) }
  prepare_parser
end
335
+
336
# Copies the simple options into instance variables and decides whether
# liberal parsing (which needs the robust parser) is enabled.
#
# :liberal_parsing may be a Hash carrying :double_quote_outside_quote and
# :backslash_quote; any other truthy value enables liberal parsing with
# both sub-options off.
def prepare_variable
  @encoding = @options[:encoding]
  liberal = @options[:liberal_parsing]
  @liberal_parsing = liberal ? true : false
  @need_robust_parsing = @liberal_parsing
  if liberal.is_a?(Hash)
    @double_quote_outside_quote = liberal[:double_quote_outside_quote]
    @backslash_quote = liberal[:backslash_quote]
  elsif liberal
    @double_quote_outside_quote = false
    @backslash_quote = false
  else
    @backslash_quote = false
  end
  @unconverted_fields = @options[:unconverted_fields]
  @field_size_limit = @options[:field_size_limit]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
361
+
362
# Prepares the quote character and the values derived from it.
#
# A nil :quote_character disables quoting. Otherwise the value is converted
# to a String in the parser encoding and must be exactly one character
# long, else ArgumentError is raised.
def prepare_quote_character
  @quote_character = @options[:quote_character]
  if @quote_character.nil?
    @escaped_quote_character = nil
    @escaped_quote = nil
    return
  end

  @quote_character = @quote_character.to_s.encode(@encoding)
  unless @quote_character.length == 1
    raise ArgumentError,
          ":quote_char has to be nil or a single character String"
  end
  @double_quote_character = @quote_character * 2
  @escaped_quote_character = Regexp.escape(@quote_character)
  @escaped_quote = Regexp.new(@escaped_quote_character)
end
378
+
379
# Prepares the values needed for backslash-escaped quotes. Does nothing
# unless @backslash_quote is set.
#
# @backslash_quote_character is the literal text "backslash + quote". It
# is used as a String pattern for String#gsub!, where regexp escapes are
# not interpreted, so it is built from the raw quote character. Using the
# regexp-escaped form would add a stray backslash, and never match, for
# any quote character that Regexp.escape changes (e.g. "|").
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)

  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  if @quote_character.nil?
    @backslash_quote_character = nil
  else
    @backslash_quote_character = @backslash_character + @quote_character
  end
end
393
+
394
# Validates and stores the :skip_lines option.
#
# A String is converted to the parser encoding; a Regexp or nil is stored
# as is; anything else must respond to #match, else ArgumentError is raised.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  @skip_lines =
    case matcher
    when String
      matcher.encode(@encoding)
    when Regexp, nil
      matcher
    else
      unless matcher.respond_to?(:match)
        raise ArgumentError,
              ":skip_lines has to respond to \#match: #{matcher.inspect}"
      end
      matcher
    end
end
410
+
411
# Prepares the :strip option.
#
# * falsy: nothing is stripped;
# * String: must be exactly one character, which is the character stripped;
# * any other truthy value: " \t\r\n\f\v" are stripped.
#
# Whenever stripping is enabled the robust parser is required. @strip_value
# is only built when a quote character is in use.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  return unless @strip

  if @strip.is_a?(String)
    n_characters = @strip.length
    if n_characters.zero?
      raise ArgumentError, ":strip must not be an empty String"
    elsif n_characters > 1
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip + "+".encode(@encoding))
    end
  else
    strip_values = " \t\r\n\f\v"
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
    end
  end
  @need_robust_parsing = true
end
440
+
441
# Probe once, at load time, whether StringScanner#scan accepts a String
# pattern; it raises TypeError when it does not.
accepts_string_pattern =
  begin
    StringScanner.new("x").scan("x")
    true
  rescue TypeError
    false
  end
@@string_scanner_scan_accept_string = accepts_string_pattern
448
+
449
# Prepares the column and row separators and the patterns derived from
# them.
#
# For separators longer than one character an additional list of
# one-pattern-per-character (@column_ends / @row_ends) is built; it is nil
# for single character separators.
def prepare_separators
  @column_separator = @options[:column_separator].to_s.encode(@encoding)
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    @column_ends = @column_separator.each_char.map do |character|
      Regexp.new(Regexp.escape(character))
    end
    @first_column_separators =
      Regexp.new(@escaped_first_column_separator + "+".encode(@encoding))
  else
    # Use the plain String as pattern when StringScanner#scan supports it.
    @column_end =
      if @@string_scanner_scan_accept_string
        @column_separator
      else
        Regexp.new(@escaped_column_separator)
      end
    @column_ends = nil
    @first_column_separators = nil
  end

  @row_end = Regexp.new(Regexp.escape(@row_separator))
  @row_ends =
    if @row_separator.size > 1
      @row_separator.each_char.map do |character|
        Regexp.new(Regexp.escape(character))
      end
    end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
488
+
489
# Prepares the patterns for quoted values and @split_column_separator, the
# pattern given to String#split by the line based parsers.
#
# @quotes matches a run of quote characters; @quoted_value matches a run of
# characters that are neither a quote nor, with backslash quoting, a
# backslash.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    no_quoted_values = @escaped_quote_character.dup
    no_quoted_values << @escaped_backslash_character if @backslash_quote
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end
  if @escaped_strip
    # @escaped_strip is a list of characters (one escaped character, or
    # the whole default whitespace set), so it has to be wrapped in a
    # character class. Appending "*" directly would only repeat the last
    # character and require the others to appear literally in sequence.
    strip_run = "[".encode(@encoding) +
                @escaped_strip +
                "]*".encode(@encoding)
    @split_column_separator = Regexp.new(strip_run +
                                         @escaped_column_separator +
                                         strip_run)
  elsif @column_separator == " ".encode(@encoding)
    # String#split treats a single space String specially, so split on an
    # equivalent Regexp instead.
    @split_column_separator = Regexp.new(@escaped_column_separator)
  else
    @split_column_separator = @column_separator
  end
end
515
+
516
# Builds @unquoted_value, the pattern matching a run of characters that may
# appear in an unquoted field. Does nothing when quoting is disabled.
#
# Excluded are CR, LF, the first character of the column separator, the
# quote character (unless liberal parsing is on) and the strip characters.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  excluded << @escaped_strip if @escaped_strip
  @unquoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
end
531
+
532
# Returns the row separator to use, as a String in the parser encoding.
#
# Anything but :auto is converted with #to_s. For :auto the separator is
# detected from the input: directly from the String of a StringIO, or from
# samples read with #gets (which are remembered in @samples). When nothing
# can be detected, $INPUT_RECORD_SEPARATOR is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    separator = detect_row_separator(@input.string, cr, lf)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # Running out of data probably means a single line; the default is
        # applied below.
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # A trailing CR may be the first half of CR+LF: read one more
        # character to find out.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample

        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Nothing to do: the default is applied below.
    end
  end
  separator = $INPUT_RECORD_SEPARATOR if separator == :auto
  separator.to_s.encode(@encoding)
end
569
+
570
# Guesses the row separator used in +sample+.
#
# sample:: text to inspect
# cr, lf:: carriage return and line feed in the encoding of +sample+
#
# Returns +cr+ + +lf+, +cr+, +lf+, or :auto when +sample+ contains neither.
#
# CR is only searched before the first LF, so when both are found the CR
# always precedes the LF: the pair is CR+LF if they are adjacent, otherwise
# the lone CR comes first and is the separator.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  searched = lf_index ? sample[0, lf_index] : sample
  cr_index = searched.index(cr)
  if cr_index && lf_index
    cr_index + 1 == lf_index ? cr + lf : cr
  elsif cr_index
    cr
  elsif lf_index
    lf
  else
    :auto
  end
end
593
+
594
# Resets the per-parse state: line counter, last line and scanner.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
599
+
600
# Returns the last line. With a scanner, the text kept since the row
# started is fetched once with keep_end and cached in @last_line.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
607
+
608
# Prepares header handling from the :headers and :return_headers options.
#
# * Array: used as the raw headers;
# * String: parsed as one CSV line to get the raw headers;
# * nil / false: headers are not used;
# * anything else: headers are used and taken from the first row.
def prepare_header
  @return_headers = @options[:return_headers]

  headers = @options[:headers]
  @raw_headers, @use_headers =
    case headers
    when Array
      [headers, true]
    when String
      [parse_headers(headers), true]
    when nil, false
      [nil, false]
    else
      [nil, true]
    end
  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
632
+
633
# Parses +row+ as a single CSV line with this parser's separators and quote
# character, and returns its fields.
def parse_headers(row)
  line_options = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **line_options)
end
639
+
640
# Runs +headers+ through the header fields converter and freezes every
# String in the result. Returns the converted headers.
def adjust_headers(headers)
  converted = @header_fields_converter.convert(headers, nil, @lineno)
  converted.each do |header|
    header.freeze if header.is_a?(String)
  end
  converted
end
645
+
646
# Caches the result of may_quoted? in @may_quoted; parse_column_value uses
# it to decide whether to try a quoted or an unquoted value first.
def prepare_parser
  @may_quoted = may_quoted?
end
649
+
650
# Tells whether the beginning of the input contains a quote character.
#
# Looks at the first 128 characters of the StringIO's String or, for other
# inputs, of the first collected sample. Returns false when quoting is
# disabled or there is no sample; otherwise the index of the first quote
# character (truthy) or nil.
def may_quoted?
  return false if @quote_character.nil?

  if @input.is_a?(StringIO)
    head = @input.string
  elsif @samples.empty?
    return false
  else
    head = @samples.first
  end
  head[0, 128].index(@quote_character)
end
661
+
662
# Setting the CSV_PARSER_SCANNER_TEST environment variable to "yes" forces
# the chunked InputsScanner code path for every input.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # Wrapper that hides the fact that the input is a StringIO, so that
  # InputsScanner reads it chunk by chunk instead of using its String.
  class UnoptimizedStringIO
    def initialize(string)
      @io = StringIO.new(string)
    end

    def gets(*args)
      @io.gets(*args)
    end

    def each_line(*args, &block)
      @io.each_line(*args, &block)
    end

    def eof?
      @io.eof?
    end
  end

  # Test variant: always builds an InputsScanner. The chunk size comes from
  # CSV_PARSER_SCANNER_TEST_CHUNK_SIZE and defaults to 1.
  def build_scanner
    inputs = @samples.map { |sample| UnoptimizedStringIO.new(sample) }
    inputs <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.string)
      else
        @input
      end
    chunk_size = ENV["CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"] || "1"
    InputsScanner.new(inputs,
                      @encoding,
                      chunk_size: Integer(chunk_size, 10))
  end
else
  # Builds the scanner for the input.
  #
  # When the whole text is available as one String (a StringIO without
  # samples, or a single sample that exhausted the input) a Scanner over
  # that String is used; its encoding is validated first so that the
  # MalformedCSVError can name the offending line. Otherwise an
  # InputsScanner over the samples followed by the input is used.
  def build_scanner
    whole_text =
      if @samples.empty? && @input.is_a?(StringIO)
        @input.string
      elsif @samples.size == 1 && @input.respond_to?(:eof?) && @input.eof?
        @samples[0]
      end
    unless whole_text
      inputs = @samples.map { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding)
    end

    unless whole_text.valid_encoding?
      broken_index = whole_text.lines(@row_separator).index do |line|
        !line.valid_encoding?
      end
      if broken_index
        message = "Invalid byte sequence in #{@encoding}"
        raise MalformedCSVError.new(message, @lineno + broken_index + 1)
      end
    end
    Scanner.new(whole_text)
  end
end
724
+
725
# Consumes the leading lines matched by skip_line?, counting each of them,
# and leaves the scanner at the start of the first line that is not
# skipped. Does nothing when no :skip_lines matcher is set.
def skip_needless_lines
  return unless @skip_lines

  while true
    @scanner.keep_start
    candidate = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    candidate << @row_separator if parse_row_end
    unless skip_line?(candidate)
      # Not a line to skip: rewind to where it started.
      @scanner.keep_back
      return
    end
    @lineno += 1
    @scanner.keep_drop
  end
end
741
+
742
# Tells whether +line+ is matched by the :skip_lines matcher: substring
# test for a String, match? for a Regexp, #match for any other matcher.
def skip_line?(line)
  matcher = @skip_lines
  if matcher.is_a?(String)
    line.include?(matcher)
  elsif matcher.is_a?(Regexp)
    matcher.match?(line)
  else
    matcher.match(line)
  end
end
752
+
753
# Line based parser used when quoting is disabled: every line is one row
# and is simply split on the column separator.
#
# Skipped lines are ignored, blank lines become empty rows (or are dropped
# with :skip_blanks) and empty fields become nil.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines && skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks
      row = []
    else
      row = strip_value(content).split(@split_column_separator, -1)
      row.map! { |field| field.empty? ? nil : field }
    end
    @last_line = raw_line
    emit_row(row, &block)
  end
end
776
+
777
# Fast line based parser for input that may contain quotes.
#
# Each line is split on the column separator. Only fields without quotes,
# or fully wrapped in exactly one pair of quotes, are handled here. As soon
# as a line needs more than that (a CR/LF inside the line, any other use of
# quotes) the scanner is rewound to the start of that line and parsing
# continues with parse_quotable_robust for the rest of the input.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |raw_line|
    if @skip_lines && skip_line?(raw_line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    content = raw_line.delete_suffix(@row_separator)

    if content.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      row = []
    elsif content.include?(@cr) || content.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      row = content.split(@split_column_separator, -1)
      row.each_with_index do |column, index|
        if column.empty?
          row[index] = nil
          next
        end

        quote_count = column.count(@quote_character)
        next if quote_count.zero?

        if quote_count == 2 &&
           column.start_with?(@quote_character) &&
           column.end_with?(@quote_character)
          row[index] = column[1..-2]
        else
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
      end
    end
    # Move the mark to the start of the next line before emitting.
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = raw_line
    emit_row(row, &block)
  end
  @scanner.keep_drop
end
831
+
832
# Character level parser that handles everything the loose parser cannot:
# quoted fields with embedded separators or quotes, stripping, liberal
# parsing, and precise error reporting.
#
# It reads one column value at a time and then expects a column end, a row
# end or the end of the input. Anything else is reported as
# MalformedCSVError after the rest of the broken line has been consumed.
def parse_quotable_robust(&block)
  row = []
  skip_needless_lines
  start_row
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    field = parse_column_value
    if field
      @scanner.scan_all(@strip_value) if @strip_value
      if @field_size_limit && field.size >= @field_size_limit
        ignore_broken_line
        raise MalformedCSVError.new("Field size exceeded", @lineno)
      end
    end

    if parse_column_end
      row << field
    elsif parse_row_end
      if row.empty? && field.nil?
        # Blank line.
        emit_row([], &block) unless @skip_blanks
      else
        row << field
        emit_row(row, &block)
        row = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      break if row.empty? && field.nil?
      row << field
      emit_row(row, &block)
      break
    else
      # The conditions are checked in this order on purpose: two of them
      # consume a CR or LF from the scanner.
      message =
        if @quoted_column_value
          "Any value after quoted field isn't allowed"
        elsif @unquoted_column_value &&
              (new_line = @scanner.scan(@cr_or_lf))
          "Unquoted fields do not allow new line <#{new_line.inspect}>"
        elsif @scanner.rest.start_with?(@quote_character)
          "Illegal quoting"
        elsif (new_line = @scanner.scan(@cr_or_lf))
          "New line must be <#{@row_separator.inspect}> " +
            "not <#{new_line.inspect}>"
        else
          "TODO: Meaningful message"
        end
      ignore_broken_line
      raise MalformedCSVError.new(message, @lineno)
    end
  end
end
893
+
894
# Reads the next column value and returns it, or nil when there is none.
#
# Without liberal parsing a value is either quoted or unquoted; @may_quoted
# only decides which form is tried first. With liberal parsing a quoted
# part may be followed by unquoted text, in which case the quotes are kept
# as part of the value.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  trailing = parse_unquoted_column_value
  return quoted unless trailing

  if @double_quote_outside_quote
    trailing = trailing.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing if quoted.empty?
  end
  @quote_character + quoted + @quote_character + trailing
end
922
+
923
# Reads an unquoted column value and returns it, or nil when the scanner is
# not at one. Sets @unquoted_column_value when a value was read.
#
# With a multi character column separator, @unquoted_value stops at the
# first character of the separator. The value then continues for as long
# as what follows is not the complete separator.
def parse_unquoted_column_value
  value = @scanner.scan_all(@unquoted_value)
  return nil unless value

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Look ahead for a complete separator without consuming it.
      @scanner.keep_start
      at_column_end = @column_ends.all? do |column_end|
        @scanner.scan(column_end)
      end
      @scanner.keep_back
      break if at_column_end

      partial_separator = @scanner.scan_all(@first_column_separators)
      break if partial_separator.nil?
      value << partial_separator

      continuation = @scanner.scan_all(@unquoted_value)
      break if continuation.nil?
      value << continuation
    end
  end
  value.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  value
end
947
+
948
# Reads a quoted column value and returns its unquoted content, or nil when
# the scanner is not at a quote. Sets @quoted_column_value when a value was
# read. Raises MalformedCSVError when the closing quote is missing.
#
# Quotes are scanned in runs. In a run, every pair of quotes stands for one
# literal quote; an odd run additionally contains the opening quote (first
# run) or the closing quote (later runs).
def parse_quoted_column_value
  quotes = @scanner.scan_all(@quotes)
  return nil unless quotes

  @quoted_column_value = true
  n_quotes = quotes.size
  # Even run: opening quote, closing quote and (n - 2) / 2 literal quotes.
  return quotes[0, (n_quotes - 2) / 2] if n_quotes.even?

  value = quotes[0, (n_quotes - 1) / 2]
  while true
    content = @scanner.scan_all(@quoted_value)
    value << content if content
    if @backslash_quote && @scanner.scan(@escaped_backslash)
      # Backslash + quote is a literal quote; a lone backslash is kept.
      if @scanner.scan(@escaped_quote)
        value << @quote_character
      else
        value << @backslash_character
      end
      next
    end

    quotes = @scanner.scan_all(@quotes)
    unless quotes
      ignore_broken_line
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno)
    end
    n_quotes = quotes.size
    if n_quotes.odd?
      # Literal quotes (none for a single quote) followed by the closing one.
      value << quotes[0, (n_quotes - 1) / 2]
      break
    end
    value << quotes[0, n_quotes / 2]
  end
  value
end
991
+
992
# Consumes a column separator if the scanner is at one and tells whether it
# did.
#
# The whole separator is tried first. For multi character separators the
# characters are then tried one after another, rewinding if they do not all
# match.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  complete = @column_ends.all? { |column_end| @scanner.scan(column_end) }
  if complete
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  complete
end
1005
+
1006
# Consumes a row separator if the scanner is at one and tells whether it
# did.
#
# The whole separator is tried first. For multi character separators the
# characters are then tried one after another, rewinding if they do not all
# match.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  complete = @row_ends.all? { |row_end| @scanner.scan(row_end) }
  if complete
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  complete
end
1018
+
1019
# Strips +value+ according to the :strip option and returns the result.
#
# Without stripping the value is returned untouched. With a String, leading
# and trailing occurrences of it are removed one character at a time.
# Otherwise surrounding whitespace is removed in place with String#strip!.
def strip_value(value)
  return value unless @strip
  return nil if value.nil?

  if @strip.is_a?(String)
    value = value[1..-1] while value.start_with?(@strip)
    value = value[0...-1] while value.end_with?(@strip)
  else
    value.strip!
  end
  value
end
1039
+
1040
# Skips the rest of the current line: everything up to the next CR or LF,
# then one match of @cr_or_lf. The skipped line is counted in @lineno.
def ignore_broken_line
  [@not_line_end, @cr_or_lf].each do |pattern|
    @scanner.scan_all(pattern)
  end
  @lineno += 1
end
1045
+
1046
# Marks the start of a new row in the scanner.
#
# If the previous row's text was already fetched into @last_line (which
# pops the scanner's mark, see last_line) only the cache is cleared;
# otherwise the previous mark is dropped first.
def start_row
  if @last_line
    @last_line = nil
  else
    @scanner.keep_drop
  end
  @scanner.keep_start
end
1054
+
1055
# Counts, converts and yields one parsed row.
#
# With headers in use, the first row seen while the headers are unknown
# becomes the headers; it is only yielded (as a header Row) when
# :return_headers is set. Later rows are yielded as Row objects built from
# the converted fields. Without headers the converted fields are yielded.
# When :unconverted_fields is set, the raw fields are attached to the
# yielded object unless it already responds to unconverted_fields.
def emit_row(row, &block)
  @lineno += 1

  raw_row = row
  if !@use_headers
    # convert fields, if needed...
    row = @fields_converter.convert(raw_row, nil, @lineno)
  elsif @headers.nil?
    @headers = adjust_headers(row)
    return unless @return_headers
    row = Row.new(@headers, row, true)
  else
    converted = @fields_converter.convert(raw_row, @headers, @lineno)
    row = Row.new(@headers, converted)
  end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
1080
+
1081
# Gives +row+ its own unconverted_fields() reader, defined on the row's
# singleton class only, and stores +fields+ in the
# <tt>@unconverted_fields</tt> instance variable it reads. Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.send(:attr_reader, :unconverted_fields)
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
1091
+ end
1092
+ end