csv 3.0.1 → 3.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
class CSV
  # An ordered pipeline of field converters that is applied to every field
  # of a row.
  #
  # Before the converters run, +nil+ fields are replaced by the configured
  # +:nil_value+ and empty String fields by the configured +:empty_value+.
  # The object is Enumerable over its converters.
  class FieldsConverter
    include Enumerable

    # +options+ is read with the following keys:
    #
    # :nil_value::          replacement for +nil+ fields
    # :empty_value::        replacement for empty String fields
    # :accept_nil::         when true, a +nil+ field stops the pipeline
    # :builtin_converters:: lookup table used by #add_converter for named
    #                       converters
    def initialize(options={})
      @converters = []
      @nil_value = options[:nil_value]
      @empty_value = options[:empty_value]
      @empty_value_is_empty_string = (@empty_value == "")
      @accept_nil = options[:accept_nil]
      @builtin_converters = options[:builtin_converters]
      @need_static_convert = need_static_convert?
    end

    # Appends a converter to the pipeline.
    #
    # Without +name+ the given block is appended as-is.  With +name+ the
    # entry is looked up in the +:builtin_converters+ table; an Array entry
    # is treated as a list of further names and expanded recursively.
    #
    # NOTE(review): a +name+ that is missing from the table appends +nil+,
    # which only fails later inside #convert -- confirm whether callers
    # validate names beforehand.
    def add_converter(name=nil, &converter)
      if name.nil? # custom converter
        @converters << converter
      else # named converter
        combo = @builtin_converters[name]
        case combo
        when Array # combo converter
          combo.each do |sub_name|
            add_converter(sub_name)
          end
        else # individual named converter
          @converters << combo
        end
      end
    end

    # Yields each registered converter in registration order.
    def each(&block)
      @converters.each(&block)
    end

    # Returns true when no converter has been registered.
    def empty?
      @converters.empty?
    end

    # Returns the converted version of +fields+.
    #
    # +fields+ is returned untouched when there is nothing to do (no
    # converters and no nil/empty replacement configured).  Otherwise a new
    # Array is built.  +headers+ (may be +nil+) and +lineno+ are only used
    # to build the FieldInfo passed to converters whose arity is not 1.
    def convert(fields, headers, lineno)
      return fields unless need_convert?

      fields.collect.with_index do |field, index|
        if field.nil?
          field = @nil_value
        elsif field.is_a?(String) && field.empty?
          # Only Strings are tested for emptiness: fields may be arbitrary
          # objects (for example Integers in a user supplied headers Array)
          # that do not respond to #empty?.
          field = @empty_value unless @empty_value_is_empty_string
        end
        @converters.each do |converter|
          break if field.nil? && @accept_nil
          if converter.arity == 1 # straight field converter
            field = converter[field]
          else # FieldInfo converter
            header = headers ? headers[index] : nil
            field = converter[field, FieldInfo.new(index, lineno, header)]
          end
          break unless field.is_a?(String) # short-circuit pipeline for speed
        end
        field # final state of each field, converted or original
      end
    end

    private

    # True when nil or empty fields have to be replaced even if no
    # converter is registered.
    def need_static_convert?
      !(@nil_value.nil? && @empty_value_is_empty_string)
    end

    # True when #convert has any work to do.
    def need_convert?
      @need_static_convert || !@converters.empty?
    end
  end
end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
# This provides String#match? and Regexp#match? for Ruby 2.3.
#
# The refinements are only defined when String#match? is missing.  Unlike a
# bare =~, they return a strict true/false like the built-in match? does.
unless String.method_defined?(:match?)
  class CSV
    module MatchP
      refine String do
        # Returns true when +pattern+ matches the receiver, false otherwise.
        def match?(pattern)
          !(self =~ pattern).nil?
        end
      end

      refine Regexp do
        # Returns true when the receiver matches +string+, false otherwise.
        def match?(string)
          !(self =~ string).nil?
        end
      end
    end
  end
end
@@ -0,0 +1,713 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "match_p"
6
+ require_relative "row"
7
+ require_relative "table"
8
+
9
+ using CSV::MatchP if CSV.const_defined?(:MatchP)
10
+
11
+ class CSV
12
+ class Parser
13
# Raised by the scanners when an input chunk is not valid in its encoding;
# #parse rescues it and re-raises it as a MalformedCSVError.
class InvalidEncoding < StandardError; end
15
+
16
# A StringScanner over one complete String that can remember ("keep")
# positions, so a caller can later rewind to them or extract the text
# scanned since.
#
# Kept positions form a stack; every #keep_start must be balanced by one of
# #keep_end, #keep_back or #keep_drop.
class Scanner < StringScanner
  # The whole input is in memory, so scanning "all" is a plain scan.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    @keeps = []
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Forgets the most recent kept position and returns the text scanned
  # since then.
  def keep_end
    start = @keeps.pop
    # pos is a byte offset, so the slice must be taken in bytes as well;
    # String#[] counts characters and returns the wrong text as soon as
    # the input contains multibyte characters.
    string.byteslice(start, pos - start)
  end

  # Forgets the most recent kept position and rewinds to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Forgets the most recent kept position without moving.
  def keep_drop
    @keeps.pop
  end
end
41
+
42
# A scanner over a list of inputs that are consumed chunk by chunk.
#
# Each input is either a StringIO (its whole string is used as one chunk)
# or an object that responds to gets(nil, limit).  Only one chunk is held
# in a StringScanner at a time; the keep_* methods make it possible to
# rewind or extract text across chunk boundaries.
#
# NOTE(review): when a new chunk is read only the most recent kept position
# is carried over; confirm callers never rely on an outer kept position
# surviving a chunk boundary.
class InputsScanner
  # +inputs+ is copied, so the caller's Array is not modified.  +encoding+
  # is only used to build an empty String once an input is exhausted.
  # +chunk_size+ is the limit passed to +gets+.
  def initialize(inputs, encoding, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    @keeps = []
    read_chunk
  end

  # Scans +pattern+ in the current chunk only.  When the match consumed
  # the rest of the chunk the next chunk is loaded eagerly.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Scans +pattern+ and keeps scanning into the following chunks for as
  # long as the match reaches the end of the current chunk.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  # True when the current chunk is fully consumed.
  def eos?
    @scanner.eos?
  end

  # Remembers the current position.  The second slot later holds the text
  # of already discarded chunks that belongs to this kept region.
  def keep_start
    @keeps.push([@scanner.pos, nil])
  end

  # Forgets the most recent kept position and returns the text scanned
  # since then, including text from earlier chunks.
  def keep_end
    start, buffer = @keeps.pop
    # StringScanner#pos is a byte offset, so slice in bytes.
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Forgets the most recent kept position and rewinds to it.  When the
  # position lies in an earlier chunk, the current chunk is pushed back
  # onto the input list and the buffered text becomes the current chunk.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      keep = string.byteslice(start, string.bytesize - start)
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
  end

  # Forgets the most recent kept position without moving.
  def keep_drop
    @keeps.pop
  end

  # Returns the unscanned rest of the current chunk.
  def rest
    @scanner.rest
  end

  private

  # Replaces the current chunk by the next one.  Returns false when there
  # is nothing left to read, true otherwise.  Raises InvalidEncoding when
  # the new chunk is not valid in its encoding.
  def read_chunk
    return false if @last_scanner

    unless @keeps.empty?
      # Save the kept part of the chunk that is about to be discarded.
      keep = @keeps.last
      keep_start = keep[0]
      string = @scanner.string
      # Byte offsets again: slice in bytes, not characters.
      keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
      if keep_data
        keep_buffer = keep[1]
        if keep_buffer
          keep_buffer << keep_data
        else
          keep[1] = keep_data.dup
        end
      end
      keep[0] = 0
    end

    input = @inputs.first
    case input
    when StringIO
      string = input.string
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(nil, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: continue with the next one, if any.
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
168
+
169
# Stores +input+ and +options+ and precomputes everything #parse needs.
# @samples collects the chunks read from +input+ while the row separator
# is detected, so that they can be parsed again later.
def initialize(input, options)
  @samples = []
  @options = options
  @input = input

  prepare
end
176
+
177
# The column separator String, encoded in the parser's encoding.
def column_separator
  @column_separator
end

# The row separator String, encoded in the parser's encoding.
def row_separator
  @row_separator
end

# The single-character quote String.
def quote_character
  @quote_character
end

# The value of the :field_size_limit option.
def field_size_limit
  @field_size_limit
end

# The configured :skip_lines matcher (String, Regexp, an object that
# responds to #match, or nil).
def skip_lines
  @skip_lines
end

# The value of the :unconverted_fields option.
def unconverted_fields?
  @unconverted_fields
end

# The converted headers, or nil when they are not known (yet).
def headers
  @headers
end

# True while headers are in use but have not been read yet, that is when
# the next emitted row will be taken as the header row.
def header_row?
  @use_headers && @headers.nil?
end

# The value of the :return_headers option.
def return_headers?
  @return_headers
end

# The value of the :skip_blanks option.
def skip_blanks?
  @skip_blanks
end

# The value of the :liberal_parsing option.
def liberal_parsing?
  @liberal_parsing
end

# The number of rows emitted so far.
def lineno
  @lineno
end

# The raw text of the most recently parsed line.
def line
  last_line
end
228
+
229
# Parses the input and yields every row to the block.  Without a block an
# Enumerator over the same rows is returned.
#
# When headers were given up front and :return_headers is set, a header Row
# is yielded first.
#
# Raises MalformedCSVError for a field that reaches the size limit, for
# data that violates the quoting rules and for input that is invalid in
# its encoding.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers && @headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  row = []
  begin
    @scanner = build_scanner
    skip_needless_lines
    start_row
    while true
      @quoted_column_value = false
      @unquoted_column_value = false
      value = parse_column_value
      if value && @field_size_limit && value.size >= @field_size_limit
        raise MalformedCSVError.new("Field size exceeded", @lineno + 1)
      end

      # A column separator follows: the row goes on.
      if parse_column_end
        row << value
        next
      end

      # A row separator follows: the row is complete.
      if parse_row_end
        if row.empty? && value.nil?
          emit_row([], &block) unless @skip_blanks
        else
          row << value
          emit_row(row, &block)
          row = []
        end
        skip_needless_lines
        start_row
        next
      end

      # End of input: flush a pending row, if there is one.
      if @scanner.eos?
        unless row.empty? && value.nil?
          row << value
          emit_row(row, &block)
        end
        return
      end

      # Neither separator nor end of input: the data is malformed.
      message =
        if @quoted_column_value
          "Do not allow except col_sep_split_separator after quoted fields"
        elsif @unquoted_column_value && @scanner.scan(@cr_or_lf)
          "Unquoted fields do not allow \\r or \\n"
        elsif @scanner.rest.start_with?(@quote_character)
          "Illegal quoting"
        else
          "TODO: Meaningful message"
        end
      raise MalformedCSVError.new(message, @lineno + 1)
    end
  rescue InvalidEncoding
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, @lineno + 1)
  end
end
291
+
292
+ private
293
# Runs all preparation steps in order.  The order is significant: the
# regexps need the variables, and the header and parser steps rely on the
# separators and samples set up before them.
def prepare
  %i[prepare_variable prepare_regexp prepare_line prepare_header]
    .each { |step| __send__(step) }
  prepare_parser
end
300
+
301
# Copies the plain options into instance variables.
def prepare_variable
  options = @options
  @encoding           = options[:encoding]
  @liberal_parsing    = options[:liberal_parsing]
  @unconverted_fields = options[:unconverted_fields]
  @field_size_limit   = options[:field_size_limit]
  @skip_blanks        = options[:skip_blanks]
  @fields_converter   = options[:fields_converter]
  @header_fields_converter = options[:header_fields_converter]
end
310
+
311
# Resolves the separators, the quote character and the :skip_lines matcher
# and builds all regexps used while scanning.
#
# Raises ArgumentError when the quote character is not exactly one
# character or when :skip_lines is neither a String, a Regexp, nil nor an
# object that responds to #match.
def prepare_regexp
  @column_separator = @options[:column_separator].to_s.encode(@encoding)
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)
  @quote_character = @options[:quote_character].to_s.encode(@encoding)
  unless @quote_character.length == 1
    raise ArgumentError, ":quote_char has to be a single character String"
  end

  column_separator_source = Regexp.escape(@column_separator)
  row_separator_source = Regexp.escape(@row_separator)
  quote_source = Regexp.escape(@quote_character)

  skip_lines = @options[:skip_lines]
  case skip_lines
  when String
    @skip_lines = skip_lines.encode(@encoding)
  when Regexp, nil
    @skip_lines = skip_lines
  else
    unless skip_lines.respond_to?(:match)
      message =
        ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
      raise ArgumentError, message
    end
    @skip_lines = skip_lines
  end

  # One regexp per character of a multi-character separator (nil for a
  # single-character one).  These let a separator be matched piecewise.
  per_character = lambda do |separator|
    if separator.size > 1
      separator.each_char.collect { |char| Regexp.new(Regexp.escape(char)) }
    end
  end

  @column_end = Regexp.new(column_separator_source)
  @column_ends = per_character.call(@column_separator)
  @row_end = Regexp.new(row_separator_source)
  @row_ends = per_character.call(@row_separator)

  negated_class_open = "[^".encode(@encoding)
  line_breaks_and_close = "\r\n]+".encode(@encoding)
  @quotes = Regexp.new(quote_source + "+".encode(@encoding))
  @quoted_value = Regexp.new(negated_class_open +
                             quote_source +
                             "]+".encode(@encoding))
  # With liberal parsing an unquoted value may contain quote characters.
  if @liberal_parsing
    @unquoted_value = Regexp.new(negated_class_open +
                                 column_separator_source +
                                 line_breaks_and_close)
  else
    @unquoted_value = Regexp.new(negated_class_open +
                                 quote_source +
                                 column_separator_source +
                                 line_breaks_and_close)
  end
  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
373
+
374
# Returns the row separator to use, encoded in the parser's encoding.
#
# Any +separator+ other than :auto is simply converted to a String.  For
# :auto the separator is detected from the input: from the whole string of
# a StringIO, or from chunks read with +gets+ from any other input.  Those
# chunks are remembered in @samples so that they can still be parsed.  When
# nothing can be detected $INPUT_RECORD_SEPARATOR is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    separator = detect_row_separator(@input.string, cr, lf)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # Running out of data most likely means a single line; the
        # default separator is applied below in that case.
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # A trailing CR may be the first half of CR+LF: read one more
        # character to find out.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample

        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Nothing to do: the default separator is applied below.
    end
  end
  separator = $INPUT_RECORD_SEPARATOR if separator == :auto
  separator.to_s.encode(@encoding)
end
411
+
412
# Returns the first line ending found in +sample+: +cr+ + +lf+, +cr+ or
# +lf+.  Returns :auto when +sample+ contains neither +cr+ nor +lf+.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  # Only a CR in front of the first LF can be the first line ending.
  before_lf = lf_index ? sample[0, lf_index] : sample
  cr_index = before_lf.index(cr)

  return cr + lf if cr_index && lf_index && cr_index + 1 == lf_index
  return cr if cr_index
  return lf if lf_index
  :auto
end
435
+
436
# Resets the line bookkeeping: no row emitted, no line cached and no
# scanner built yet.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
441
+
442
# Returns the raw text of the current line.  The text is taken from the
# scanner's kept region on first use and cached in @last_line; without a
# scanner only the cached value is returned.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
449
+
450
# Interprets the :headers option:
#
# * Array        - used as the raw headers
# * String       - parsed as one CSV line to get the raw headers
# * nil or false - headers are not used
# * anything else - headers are used and read from the first row
#
# Raw headers that are known up front are converted right away.
def prepare_header
  @return_headers = @options[:return_headers]

  headers = @options[:headers]
  @raw_headers, @use_headers =
    case headers
    when Array then [headers, true]
    when String then [parse_headers(headers), true]
    when nil, false then [nil, false]
    else [nil, true]
    end
  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
474
+
475
# Parses the header String +row+ as a single CSV line, using the same
# separators and quote character as the parser itself.
def parse_headers(row)
  line_options = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **line_options)
end
481
+
482
# Runs +headers+ through the header converters and freezes every String
# among the results.  Returns the converted headers.
def adjust_headers(headers)
  @header_fields_converter.convert(headers, nil, @lineno).tap do |converted|
    converted.each do |header|
      header.freeze if header.is_a?(String)
    end
  end
end
487
+
488
# Decides once which kind of value #parse_column_value tries first.
def prepare_parser
  @may_quoted = may_quoted?
end

# Truthy when the quote character occurs within the first 128 characters of
# the input (the whole string of a StringIO, otherwise the first sample).
# Returns false when no sample of a non-StringIO input is available.
def may_quoted?
  sample =
    if @input.is_a?(StringIO)
      @input.string
    elsif @samples.empty?
      return false
    else
      @samples.first
    end
  sample[0, 128].index(@quote_character)
end
501
+
502
# Setting the environment variable CSV_PARSER_SCANNER_TEST to "yes" forces
# every input through InputsScanner with a chunk size of 1.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # A StringIO wrapper that is not a StringIO itself, so InputsScanner
  # reads it with +gets+ instead of taking the whole string at once.
  class UnoptimizedStringIO
    def initialize(string)
      @io = StringIO.new(string)
    end

    def gets(*args)
      @io.gets(*args)
    end

    def eof?
      @io.eof?
    end
  end

  # Builds an InputsScanner over the samples followed by the input.
  def build_scanner
    inputs = @samples.collect { |sample| UnoptimizedStringIO.new(sample) }
    inputs <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.string)
      else
        @input
      end
    InputsScanner.new(inputs, @encoding, chunk_size: 1)
  end
else
  # Builds the scanner used by #parse: a plain Scanner when the complete
  # data is available as one String, otherwise an InputsScanner over the
  # samples followed by the input.
  #
  # Raises MalformedCSVError when the complete String is invalid in its
  # encoding.
  def build_scanner
    whole_data =
      if @samples.empty? && @input.is_a?(StringIO)
        @input.string
      elsif @samples.size == 1 && @input.respond_to?(:eof?) && @input.eof?
        @samples[0]
      end

    unless whole_data
      inputs = @samples.collect { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding)
    end

    unless whole_data.valid_encoding?
      message = "Invalid byte sequence in #{@encoding}"
      raise MalformedCSVError.new(message, @lineno + 1)
    end
    Scanner.new(whole_data)
  end
end
552
+
553
# Consumes all leading lines that match the :skip_lines option and leaves
# the scanner at the start of the first line that does not match.
# Does nothing when :skip_lines is not set.
def skip_needless_lines
  return unless @skip_lines

  # Stop at the end of the input: nothing can be consumed there, so a
  # matcher that accepts the empty line would otherwise "skip" it forever.
  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    if skip_line?(line)
      @scanner.keep_drop
    else
      # Not a line to skip: rewind so that it gets parsed as data.
      @scanner.keep_back
      return
    end
  end
end
568
+
569
# Tells whether +line+ matches the :skip_lines option: substring test for
# a String, match? for a Regexp and #match for any other matcher.
def skip_line?(line)
  case @skip_lines
  when String then line.include?(@skip_lines)
  when Regexp then @skip_lines.match?(line)
  else @skip_lines.match(line)
  end
end
579
+
580
# Parses the next column value and returns it, or nil when there is none.
#
# In strict mode a quoted and an unquoted value are tried in the order
# chosen by @may_quoted.  With liberal parsing a quoted value that is
# directly followed by unquoted text is returned as one value, with the
# quote characters put back around the quoted part.
def parse_column_value
  unless @liberal_parsing
    return parse_quoted_column_value || parse_unquoted_column_value if @may_quoted
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted_value = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted_value

  unquoted_value = parse_unquoted_column_value
  return quoted_value unless unquoted_value

  @quote_character + quoted_value + @quote_character + unquoted_value
end
601
+
602
# Scans an unquoted value.  Returns it (nil when there is none) and
# records in @unquoted_column_value that one was found.
def parse_unquoted_column_value
  @scanner.scan_all(@unquoted_value).tap do |value|
    @unquoted_column_value = true if value
  end
end
607
+
608
# Scans a quoted value.  Returns its content with doubled quote characters
# unescaped, or nil when the scanner is not at a quote character.  Records
# in @quoted_column_value that a quoted value was found.
#
# Raises MalformedCSVError when the closing quote is missing.
def parse_quoted_column_value
  quotes = @scanner.scan_all(@quotes)
  return nil unless quotes

  @quoted_column_value = true
  n_quotes = quotes.size
  # An even run is opening quote + escaped pairs + closing quote.
  return quotes[0, (n_quotes - 2) / 2] if n_quotes.even?

  # An odd run is the opening quote followed by escaped pairs.
  value = quotes[0, (n_quotes - 1) / 2]
  while true
    quoted_value = @scanner.scan_all(@quoted_value)
    value << quoted_value if quoted_value
    quotes = @scanner.scan_all(@quotes)
    unless quotes
      raise MalformedCSVError.new("Unclosed quoted field", @lineno + 1)
    end
    n_quotes = quotes.size
    # Every pair stands for one literal quote character; a leftover
    # single quote (odd run) closes the value.
    value << quotes[0, n_quotes / 2]
    break if n_quotes.odd?
  end
  value
end
639
+
640
# Consumes a column separator.  Returns true when one was consumed and
# false (with the scanner position unchanged) otherwise.
#
# A multi-character separator that does not match in one piece is tried
# again character by character.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched = @column_ends.all? { |column_end| @scanner.scan(column_end) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
653
+
654
# Consumes a row separator.  Returns true when one was consumed and false
# (with the scanner position unchanged) otherwise.
#
# A multi-character separator that does not match in one piece is tried
# again character by character.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  matched = @row_ends.all? { |row_end| @scanner.scan(row_end) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
666
+
667
# Starts keeping the text of a new row.
#
# The kept region of the previous row is dropped first, unless #last_line
# already took it (which is the case when @last_line is set).  The cached
# line is cleared either way.
def start_row
  @scanner.keep_drop unless @last_line
  @last_line &&= nil # only a set value is cleared, anything else is kept
  @scanner.keep_start
end
675
+
676
# Counts, converts and yields one parsed +row+.
#
# * Without headers the converted fields are yielded as an Array.
# * With headers that are not known yet, +row+ becomes the headers; it is
#   only yielded (as a header Row) when :return_headers is set.
# * With known headers a Row of the converted fields is yielded.
#
# When :unconverted_fields is set, the raw fields are attached to the
# yielded object unless it already responds to #unconverted_fields.
def emit_row(row, &block)
  @lineno += 1

  raw_row = row
  if @use_headers && @headers.nil?
    # This row is the header row.
    @headers = adjust_headers(row)
    return unless @return_headers
    row = Row.new(@headers, row, true)
  elsif @use_headers
    converted = @fields_converter.convert(raw_row, @headers, @lineno)
    row = Row.new(@headers, converted)
  else
    row = @fields_converter.convert(raw_row, nil, @lineno)
  end

  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
701
+
702
# Gives +row+ (and only this object) a reader method
# <tt>unconverted_fields</tt> that returns +fields+.  The value is stored
# in the instance variable <tt>@unconverted_fields</tt> of +row+.
# Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.send(:attr_reader, :unconverted_fields)
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
712
+ end
713
+ end