csv 3.0.1 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
+ # frozen_string_literal: true
2
+
3
+ class CSV
4
# Holds an ordered list of field converters and applies them to the fields
# of a row, after substituting the configured replacement values for nil
# and empty fields.
class FieldsConverter
  include Enumerable

  # options - Hash read with the keys :nil_value, :empty_value,
  #           :accept_nil and :builtin_converters.
  def initialize(options={})
    @converters = []
    @nil_value = options[:nil_value]
    @empty_value = options[:empty_value]
    @empty_value_is_empty_string = (@empty_value == "")
    @accept_nil = options[:accept_nil]
    @builtin_converters = options[:builtin_converters]
    @need_static_convert = need_static_convert?
  end

  # Appends a converter to the pipeline.
  #
  # name      - key looked up in the :builtin_converters option. When the
  #             looked-up value is an Array, every element is treated as
  #             another name and added recursively.
  # converter - block used as the converter when no name is given.
  def add_converter(name=nil, &converter)
    if name.nil?  # custom converter
      @converters << converter
    else          # named converter
      combo = @builtin_converters[name]
      case combo
      when Array  # combo converter
        combo.each do |sub_name|
          add_converter(sub_name)
        end
      else        # individual named converter
        @converters << combo
      end
    end
  end

  # Yields each registered converter in registration order.
  def each(&block)
    @converters.each(&block)
  end

  # True when no converter has been registered.
  def empty?
    @converters.empty?
  end

  # Returns the converted fields.
  #
  # fields  - Array of raw field values.
  # headers - Array of headers (or nil); headers[index] is handed to
  #           converters that take a FieldInfo argument.
  # lineno  - line number handed to FieldInfo.
  #
  # +fields+ itself is returned, untouched, when there is nothing to do.
  def convert(fields, headers, lineno)
    return fields unless need_convert?

    fields.collect.with_index do |field, index|
      if field.nil?
        field = @nil_value
      elsif field.respond_to?(:empty?) and field.empty?
        # Fields are not guaranteed to be Strings (an Integer has no
        # #empty?), so only ask objects that can answer.
        field = @empty_value unless @empty_value_is_empty_string
      end
      @converters.each do |converter|
        break if field.nil? and @accept_nil
        if converter.arity == 1  # straight field converter
          field = converter[field]
        else                     # FieldInfo converter
          header = headers ? headers[index] : nil
          field = converter[field, FieldInfo.new(index, lineno, header)]
        end
        break unless field.is_a?(String)  # short-circuit pipeline for speed
      end
      field  # final state of each field, converted or original
    end
  end

  private
  # True when nil/empty substitution can change a field.
  def need_static_convert?
    not (@nil_value.nil? and @empty_value_is_empty_string)
  end

  # True when #convert has any work to do.
  def need_convert?
    @need_static_convert or
      (not @converters.empty?)
  end
end
78
+ end
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ # This provides String#match? and Regexp#match? for Ruby 2.3.
4
unless String.method_defined?(:match?)
  class CSV
    # Refinements that add #match? to String and Regexp on Rubies that lack
    # it. The result is coerced to true/false so the methods behave like
    # predicates instead of leaking the Integer/nil result of #=~.
    # NOTE: unlike a native #match?, these still go through #=~.
    module MatchP
      refine String do
        def match?(pattern)
          !(self =~ pattern).nil?
        end
      end

      refine Regexp do
        def match?(string)
          !(self =~ string).nil?
        end
      end
    end
  end
end
@@ -0,0 +1,713 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "match_p"
6
+ require_relative "row"
7
+ require_relative "table"
8
+
9
+ using CSV::MatchP if CSV.const_defined?(:MatchP)
10
+
11
+ class CSV
12
+ class Parser
13
# Raised by InputsScanner when a chunk of input is not valid in its
# encoding; #parse rescues it and raises MalformedCSVError instead.
class InvalidEncoding < StandardError; end
15
+
16
# StringScanner over a single in-memory string, extended with a stack of
# "keep" positions so the text scanned since a mark can be retrieved
# (keep_end), rewound (keep_back) or forgotten (keep_drop).
class Scanner < StringScanner
  # A single string never needs chunk stitching, so scan_all is plain scan.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    @keeps = []
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the most recent mark and returns the text scanned since it.
  def keep_end
    start = @keeps.pop
    # StringScanner#pos is a byte offset, so the slice has to be taken in
    # bytes; String#[] counts characters and returns the wrong text as soon
    # as multibyte characters were scanned.
    string.byteslice(start, pos - start)
  end

  # Pops the most recent mark and rewinds the scan pointer to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the most recent mark without using it.
  def keep_drop
    @keeps.pop
  end
end
41
+
42
# Scanner over a list of inputs that are read chunk by chunk. It offers the
# same scan/keep interface as Scanner, stitching kept text together when a
# mark spans more than one chunk.
class InputsScanner
  # inputs     - Array of inputs; a StringIO is consumed whole through
  #              #string, anything else is read with gets(nil, chunk_size).
  # encoding   - encoding used for the empty chunk created when an input
  #              returns nil.
  # chunk_size - limit passed to gets for each read.
  def initialize(inputs, encoding, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    @keeps = []
    read_chunk
  end

  # Scans +pattern+ in the current chunk; loads the next chunk when the
  # match consumed the rest of the current one.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    if value
      read_chunk if @scanner.eos?
      return value
    else
      nil
    end
  end

  # Like #scan, but keeps matching across chunk boundaries and returns the
  # concatenated match.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Pushes a mark: [byte offset in the current chunk, buffer of text kept
  # from earlier chunks (nil until a chunk boundary is crossed)].
  def keep_start
    @keeps.push([@scanner.pos, nil])
  end

  # Pops the most recent mark and returns everything scanned since it.
  def keep_end
    start, buffer = @keeps.pop
    # Offsets come from StringScanner#pos and are byte offsets, so slice in
    # bytes; String#[] would count characters.
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    if buffer
      buffer << keep
      keep = buffer
    end
    keep
  end

  # Pops the most recent mark and rewinds to it. When the mark lies in an
  # earlier chunk, the rest of the current chunk is pushed back as a new
  # input and scanning restarts on the kept buffer.
  def keep_back
    start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      keep = string.byteslice(start, string.bytesize - start)
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      @scanner.pos = start
    end
  end

  # Pops the most recent mark without using it.
  def keep_drop
    @keeps.pop
  end

  def rest
    @scanner.rest
  end

  private
  # Replaces @scanner with a scanner over the next chunk. Returns false
  # when there is nothing left to read, true otherwise. Raises
  # InvalidEncoding when the chunk is not valid in its encoding.
  def read_chunk
    return false if @last_scanner

    unless @keeps.empty?
      # Save what the innermost mark has covered in the chunk that is about
      # to be replaced, then re-anchor the mark at the start of the new one.
      keep = @keeps.last
      keep_start = keep[0]
      string = @scanner.string
      keep_data = string.byteslice(keep_start, @scanner.pos - keep_start)
      if keep_data
        keep_buffer = keep[1]
        if keep_buffer
          keep_buffer << keep_data
        else
          keep[1] = keep_data.dup
        end
      end
      keep[0] = 0
    end

    input = @inputs.first
    case input
    when StringIO
      string = input.string
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(nil, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
168
+
169
# Stores the input and the options Hash, starts with no buffered samples
# and derives every other piece of parser state through #prepare.
def initialize(input, options)
  @input, @options = input, options
  @samples = []

  prepare
end
176
+
177
# Column separator String, as prepared from the :column_separator option.
def column_separator
  @column_separator
end

# Row separator String, as resolved from the :row_separator option.
def row_separator
  @row_separator
end

# Quote character String, as prepared from the :quote_character option.
def quote_character
  @quote_character
end

# Value of the :field_size_limit option.
def field_size_limit
  @field_size_limit
end

# Matcher prepared from the :skip_lines option (may be nil).
def skip_lines
  @skip_lines
end

# Value of the :unconverted_fields option.
def unconverted_fields?
  @unconverted_fields
end

# Adjusted headers, or nil while they are not known.
def headers
  @headers
end

# Truthy while headers are in use but have not been determined yet.
def header_row?
  @use_headers && @headers.nil?
end

# Value of the :return_headers option.
def return_headers?
  @return_headers
end

# Value of the :skip_blanks option.
def skip_blanks?
  @skip_blanks
end

# Value of the :liberal_parsing option.
def liberal_parsing?
  @liberal_parsing
end

# Number of rows emitted so far.
def lineno
  @lineno
end

# Public name for the private #last_line.
def line
  last_line
end
228
+
229
# Parses the input and yields every row to the block; without a block an
# Enumerator over this method is returned.
#
# When :return_headers is set and headers are already known, a header row
# is yielded first. Raises MalformedCSVError for oversized fields, invalid
# quoting, stray line endings in unquoted fields and invalid byte
# sequences.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers && @headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  fields = []
  begin
    @scanner = build_scanner
    skip_needless_lines
    start_row
    # Deliberately not Kernel#loop: it would swallow a StopIteration raised
    # from the caller's block.
    while true
      @quoted_column_value = false
      @unquoted_column_value = false
      value = parse_column_value
      if value && @field_size_limit && value.size >= @field_size_limit
        raise MalformedCSVError.new("Field size exceeded", @lineno + 1)
      end

      if parse_column_end
        fields << value
      elsif parse_row_end
        if fields.empty? && value.nil?
          # blank line
          emit_row([], &block) unless @skip_blanks
        else
          fields << value
          emit_row(fields, &block)
          fields = []
        end
        skip_needless_lines
        start_row
      elsif @scanner.eos?
        # flush a last row that has no trailing row separator
        unless fields.empty? && value.nil?
          fields << value
          emit_row(fields, &block)
        end
        return
      else
        message =
          if @quoted_column_value
            "Do not allow except col_sep_split_separator " +
              "after quoted fields"
          elsif @unquoted_column_value && @scanner.scan(@cr_or_lf)
            "Unquoted fields do not allow \\r or \\n"
          elsif @scanner.rest.start_with?(@quote_character)
            "Illegal quoting"
          else
            "TODO: Meaningful message"
          end
        raise MalformedCSVError.new(message, @lineno + 1)
      end
    end
  rescue InvalidEncoding
    message = "Invalid byte sequence in #{@encoding}"
    raise MalformedCSVError.new(message, @lineno + 1)
  end
end
291
+
292
+ private
293
# Runs the setup steps in their fixed order.
def prepare
  [
    :prepare_variable,
    :prepare_regexp,
    :prepare_line,
    :prepare_header,
    :prepare_parser,
  ].each { |step| send(step) }
end
300
+
301
# Copies the plain option values into instance variables of the same name.
def prepare_variable
  [
    :encoding,
    :liberal_parsing,
    :unconverted_fields,
    :field_size_limit,
    :skip_blanks,
    :fields_converter,
    :header_fields_converter,
  ].each do |name|
    instance_variable_set(:"@#{name}", @options[name])
  end
end
310
+
311
# Encodes the separators and the quote character, validates :skip_lines
# and builds every Regexp the parser scans with.
#
# Raises ArgumentError when the quote character is not exactly one
# character, or when :skip_lines is neither a String, a Regexp, nil nor an
# object responding to #match.
def prepare_regexp
  @column_separator = @options[:column_separator].to_s.encode(@encoding)
  resolved_row_separator = resolve_row_separator(@options[:row_separator])
  @row_separator = resolved_row_separator.encode(@encoding)
  @quote_character = @options[:quote_character].to_s.encode(@encoding)
  unless @quote_character.length == 1
    raise ArgumentError, ":quote_char has to be a single character String"
  end

  column_separator_source = Regexp.escape(@column_separator)
  row_separator_source = Regexp.escape(@row_separator)
  quote_source = Regexp.escape(@quote_character)

  skip_lines = @options[:skip_lines]
  if skip_lines.is_a?(String)
    @skip_lines = skip_lines.encode(@encoding)
  elsif skip_lines.nil? || skip_lines.is_a?(Regexp) ||
        skip_lines.respond_to?(:match)
    @skip_lines = skip_lines
  else
    message =
      ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
    raise ArgumentError, message
  end

  # For a separator longer than one character: one Regexp per character,
  # used as a fallback by parse_column_end / parse_row_end. nil otherwise.
  per_character = lambda do |separator|
    next nil unless separator.size > 1
    separator.each_char.collect do |char|
      Regexp.new(Regexp.escape(char))
    end
  end

  @column_end = Regexp.new(column_separator_source)
  @column_ends = per_character.call(@column_separator)
  @row_end = Regexp.new(row_separator_source)
  @row_ends = per_character.call(@row_separator)

  @quotes = Regexp.new(quote_source + "+".encode(@encoding))
  @quoted_value = Regexp.new("[^".encode(@encoding) +
                             quote_source +
                             "]+".encode(@encoding))

  # Outside liberal parsing the quote character also ends an unquoted value.
  unquoted_source = "[^".encode(@encoding)
  unquoted_source += quote_source unless @liberal_parsing
  unquoted_source += column_separator_source
  unquoted_source += "\r\n]+".encode(@encoding)
  @unquoted_value = Regexp.new(unquoted_source)

  @cr_or_lf = Regexp.new("[\r\n]".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
373
+
374
# Returns the row separator as a String in the parser's encoding.
#
# Anything other than :auto is converted with #to_s. For :auto the
# separator is detected from the input: from the whole string of a
# StringIO, or from chunks read with #gets (each chunk read is remembered
# in @samples). $INPUT_RECORD_SEPARATOR is used when detection finds
# nothing.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    separator = detect_row_separator(@input.string, cr, lf)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # Running out of data leaves :auto in place; the fallback below
        # then applies.
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # A chunk ending in CR may be the first half of CR LF, so read one
        # more piece before deciding.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample

        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Keep :auto; the fallback below then applies.
    end
  end
  separator = $INPUT_RECORD_SEPARATOR if separator == :auto
  separator.to_s.encode(@encoding)
end
411
+
412
# Guesses the row separator from +sample+.
#
# Only a CR found before the first LF is considered. Returns cr + lf when
# that CR immediately precedes the LF, cr when a CR comes earlier (or there
# is no LF at all), lf when only an LF is found, and :auto when the sample
# contains neither.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  searched = lf_index ? sample[0, lf_index] : sample
  cr_index = searched.index(cr)

  return (lf_index ? lf : :auto) unless cr_index
  return cr unless lf_index

  # cr_index is always smaller than lf_index here.
  cr_index + 1 == lf_index ? cr + lf : cr
end
435
+
436
# Resets the row counter and clears the cached line and the scanner.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
441
+
442
# Returns the cached line; when a scanner exists and nothing is cached
# yet, the text kept by the scanner is taken (keep_end) and cached.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
449
+
450
# Interprets the :headers option.
#
# An Array is used as the raw headers, a String is parsed into them, nil
# and false turn headers off, and any other value turns headers on without
# supplying them. Raw headers, when present, are adjusted at once.
def prepare_header
  @return_headers = @options[:return_headers]

  headers = @options[:headers]
  @raw_headers, @use_headers =
    case headers
    when Array
      [headers, true]
    when String
      [parse_headers(headers), true]
    when nil, false
      [nil, false]
    else
      [nil, true]
    end

  @headers = @raw_headers ? adjust_headers(@raw_headers) : nil
end
474
+
475
# Parses a header String into fields with CSV.parse_line, using this
# parser's own column separator, row separator and quote character.
def parse_headers(row)
  dialect = {
    col_sep: @column_separator,
    row_sep: @row_separator,
    quote_char: @quote_character,
  }
  CSV.parse_line(row, **dialect)
end
481
+
482
# Runs +headers+ through the header fields converter, freezes every String
# in the result and returns the converted Array.
def adjust_headers(headers)
  converted = @header_fields_converter.convert(headers, nil, @lineno)
  converted.each do |header|
    next unless header.is_a?(String)
    header.freeze
  end
  converted
end
487
+
488
# Caches the quoting hint used by parse_column_value.
def prepare_parser
  @may_quoted = may_quoted?
end

# Looks for the quote character in the first 128 characters of the
# available data: the whole string of a StringIO input, otherwise the
# first buffered sample. Returns false when no sample is available, else
# the index of the quote character, or nil when it is not found.
def may_quoted?
  sample =
    if @input.is_a?(StringIO)
      @input.string
    elsif @samples.empty?
      return false
    else
      @samples.first
    end
  sample[0, 128].index(@quote_character)
end
501
+
502
# With CSV_PARSER_SCANNER_TEST=yes in the environment, every input goes
# through InputsScanner with a chunk size of 1, and StringIO inputs are
# wrapped so InputsScanner cannot take their whole string at once.
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  # Minimal wrapper exposing only #gets and #eof? of a StringIO.
  class UnoptimizedStringIO
    def initialize(string)
      @io = StringIO.new(string)
    end

    def gets(*args)
      @io.gets(*args)
    end

    def eof?
      @io.eof?
    end
  end

  # Builds a chunked scanner over the buffered samples plus the input.
  def build_scanner
    inputs = @samples.collect { |sample| UnoptimizedStringIO.new(sample) }
    inputs <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.string)
      else
        @input
      end
    InputsScanner.new(inputs, @encoding, chunk_size: 1)
  end
else
  # Builds the scanner for #parse. A plain Scanner is used when all data is
  # already in one string (an unsampled StringIO, or a single sample of an
  # input at EOF); otherwise an InputsScanner over the samples followed by
  # the input. Raises MalformedCSVError when the single string is not valid
  # in its encoding.
  def build_scanner
    string =
      if @samples.empty? && @input.is_a?(StringIO)
        @input.string
      elsif @samples.size == 1 && @input.respond_to?(:eof?) && @input.eof?
        @samples[0]
      end

    unless string
      inputs = @samples.collect { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding)
    end

    unless string.valid_encoding?
      message = "Invalid byte sequence in #{@encoding}"
      raise MalformedCSVError.new(message, @lineno + 1)
    end
    Scanner.new(string)
  end
end
552
+
553
# Consumes leading lines that match :skip_lines and leaves the scanner at
# the start of the first line that does not match. Does nothing when no
# :skip_lines matcher is configured.
def skip_needless_lines
  return unless @skip_lines

  # Stop at end of input: there an empty "line" is produced without
  # consuming anything, so a matcher that accepts the empty string would
  # otherwise keep this loop running forever.
  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    if skip_line?(line)
      @scanner.keep_drop
    else
      # not a line to skip: rewind to its start and hand it to the parser
      @scanner.keep_back
      return
    end
  end
end
568
+
569
# Tests +line+ against the :skip_lines matcher: substring test for a
# String, #match? for a Regexp, #match for anything else.
def skip_line?(line)
  matcher = @skip_lines
  if matcher.is_a?(String)
    line.include?(matcher)
  elsif matcher.is_a?(Regexp)
    matcher.match?(line)
  else
    matcher.match(line)
  end
end
579
+
580
# Parses one column value and returns it (nil when nothing was parsed).
#
# Outside liberal parsing, the quoted or the unquoted parser is tried
# first depending on the @may_quoted hint. With liberal parsing, a quoted
# value directly followed by unquoted text is returned with its quote
# characters put back around the quoted part.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  trailing = parse_unquoted_column_value
  return quoted unless trailing

  @quote_character + quoted + @quote_character + trailing
end
601
+
602
# Scans an unquoted value and returns it (nil when there is none); a
# successful scan is recorded in @unquoted_column_value.
def parse_unquoted_column_value
  @scanner.scan_all(@unquoted_value).tap do |scanned|
    @unquoted_column_value = true if scanned
  end
end
607
+
608
# Parses a quoted value and returns its content with doubled quote
# characters collapsed to one. Returns nil when the scanner is not at a
# quote character. Raises MalformedCSVError when the field is never closed.
def parse_quoted_column_value
  run = @scanner.scan_all(@quotes)
  return nil unless run

  @quoted_column_value = true
  run_size = run.size
  # An even run opens and closes the field at once; everything between the
  # outer pair is escaped quotes.
  return run[0, (run_size - 2) / 2] if run_size.even?

  # An odd run opens the field; the rest of the run is escaped quotes.
  content = run[0, (run_size - 1) / 2]
  while true
    text = @scanner.scan_all(@quoted_value)
    content << text if text
    run = @scanner.scan_all(@quotes)
    unless run
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno + 1)
    end
    run_size = run.size
    break if run_size == 1
    if run_size.odd?
      # escaped quotes followed by the closing quote
      content << run[0, (run_size - 1) / 2]
      break
    end
    # only escaped quotes; the field goes on
    content << run[0, run_size / 2]
  end
  content
end
639
+
640
# Consumes a column separator; returns true when one was found.
#
# When the whole-separator pattern does not match and per-character
# patterns exist, those are tried one after another, and the scanner is
# rewound unless all of them match.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched = @column_ends.all? { |pattern| @scanner.scan(pattern) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
653
+
654
# Consumes a row separator; returns true when one was found.
#
# When the whole-separator pattern does not match and per-character
# patterns exist, those are tried one after another, and the scanner is
# rewound unless all of them match.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  matched = @row_ends.all? { |pattern| @scanner.scan(pattern) }
  if matched
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched
end
666
+
667
# Begins tracking a new row's raw text. The previous row's mark is dropped
# from the scanner unless last_line already consumed it (in which case only
# the cached line is cleared); then a fresh mark is set.
def start_row
  if @last_line
    @last_line = nil
  else
    @scanner.keep_drop
  end
  @scanner.keep_start
end
675
+
676
# Counts the row, converts it and yields it to the block.
#
# Without headers the converted fields are yielded. With headers, the
# first row seen while headers are unknown becomes the headers and is only
# yielded (as a header Row) when :return_headers is set; later rows are
# yielded as Rows of converted fields. The raw fields are attached when
# :unconverted_fields is set.
def emit_row(row, &block)
  @lineno += 1

  raw_row = row
  if !@use_headers
    # plain Array of converted fields
    row = @fields_converter.convert(raw_row, nil, @lineno)
  elsif @headers.nil?
    # this row supplies the headers
    @headers = adjust_headers(row)
    return unless @return_headers
    row = Row.new(@headers, row, true)
  else
    converted = @fields_converter.convert(raw_row, @headers, @lineno)
    row = Row.new(@headers, converted)
  end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
701
+
702
+ # This method injects an instance variable <tt>unconverted_fields</tt> into
703
+ # +row+ and an accessor method for +row+ called unconverted_fields(). The
704
+ # variable is set to the contents of +fields+.
705
# Defines an unconverted_fields reader on +row+'s singleton class, stores
# +fields+ in the matching instance variable and returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.send(:attr_reader, :unconverted_fields)
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
712
+ end
713
+ end