csv 3.0.0 → 3.2.8

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/NEWS.md +882 -0
  3. data/README.md +6 -3
  4. data/doc/csv/arguments/io.rdoc +5 -0
  5. data/doc/csv/options/common/col_sep.rdoc +57 -0
  6. data/doc/csv/options/common/quote_char.rdoc +42 -0
  7. data/doc/csv/options/common/row_sep.rdoc +91 -0
  8. data/doc/csv/options/generating/force_quotes.rdoc +17 -0
  9. data/doc/csv/options/generating/quote_empty.rdoc +12 -0
  10. data/doc/csv/options/generating/write_converters.rdoc +25 -0
  11. data/doc/csv/options/generating/write_empty_value.rdoc +15 -0
  12. data/doc/csv/options/generating/write_headers.rdoc +29 -0
  13. data/doc/csv/options/generating/write_nil_value.rdoc +14 -0
  14. data/doc/csv/options/parsing/converters.rdoc +46 -0
  15. data/doc/csv/options/parsing/empty_value.rdoc +13 -0
  16. data/doc/csv/options/parsing/field_size_limit.rdoc +39 -0
  17. data/doc/csv/options/parsing/header_converters.rdoc +43 -0
  18. data/doc/csv/options/parsing/headers.rdoc +63 -0
  19. data/doc/csv/options/parsing/liberal_parsing.rdoc +38 -0
  20. data/doc/csv/options/parsing/nil_value.rdoc +12 -0
  21. data/doc/csv/options/parsing/return_headers.rdoc +22 -0
  22. data/doc/csv/options/parsing/skip_blanks.rdoc +31 -0
  23. data/doc/csv/options/parsing/skip_lines.rdoc +37 -0
  24. data/doc/csv/options/parsing/strip.rdoc +15 -0
  25. data/doc/csv/options/parsing/unconverted_fields.rdoc +27 -0
  26. data/doc/csv/recipes/filtering.rdoc +158 -0
  27. data/doc/csv/recipes/generating.rdoc +298 -0
  28. data/doc/csv/recipes/parsing.rdoc +545 -0
  29. data/doc/csv/recipes/recipes.rdoc +6 -0
  30. data/lib/csv/core_ext/array.rb +1 -1
  31. data/lib/csv/core_ext/string.rb +1 -1
  32. data/lib/csv/fields_converter.rb +89 -0
  33. data/lib/csv/input_record_separator.rb +18 -0
  34. data/lib/csv/parser.rb +1288 -0
  35. data/lib/csv/row.rb +505 -136
  36. data/lib/csv/table.rb +791 -114
  37. data/lib/csv/version.rb +1 -1
  38. data/lib/csv/writer.rb +210 -0
  39. data/lib/csv.rb +2433 -1329
  40. metadata +66 -13
  41. data/news.md +0 -123
data/lib/csv/parser.rb ADDED
@@ -0,0 +1,1288 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "input_record_separator"
6
+ require_relative "row"
7
+ require_relative "table"
8
+
9
+ class CSV
10
+ # Note: Don't use this class directly. This is an internal class.
11
+ class Parser
12
+ #
13
+ # A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
14
+ # or String object being read from or written to. Your data is never transcoded
15
+ # (unless you ask Ruby to transcode it for you) and will literally be parsed in
16
+ # the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
17
+ # Encoding of your data. This is accomplished by transcoding the parser itself
18
+ # into your Encoding.
19
+ #
20
+
21
# Raised when text read from the input is not valid in its encoding.
# Parser#parse rescues it and re-raises it as InvalidEncodingError.
class InvalidEncoding < StandardError; end
24
+
25
# Raised when the parser reaches a state that should not be possible.
# Parser#parse rescues it and re-raises it as MalformedCSVError.
class UnexpectedError < StandardError; end
28
+
29
#
# CSV::Parser::Scanner scans CSV text that is fully available as a single
# String and returns the matched content.
#
# It also tracks "keep" positions with +keep_start+, +keep_end+,
# +keep_back+ and +keep_drop+, so that a caller can remember a position and
# later extract the text scanned since then, rewind to it, or forget it.
#
# It inherits from StringScanner (the strscan library), which provides
# lexical scanning operations on a String. For more information, see:
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
#
class Scanner < StringScanner
  # Same interface as InputsScanner#scan_all; here a single #scan already
  # sees the whole input.
  alias_method :scan_all, :scan

  def initialize(*args)
    super
    # Stack of byte positions pushed by #keep_start.
    @keeps = []
  end

  # Yields each line of the unscanned rest, split on +row_separator+, and
  # advances the scan pointer past every line before yielding it.
  # Returns an Enumerator when no block is given (as
  # InputsScanner#each_line does).
  def each_line(row_separator)
    return enum_for(__method__, row_separator) unless block_given?

    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current position.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the last kept position and returns the bytes scanned since then.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the last kept position and rewinds the scan pointer to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the last kept position without using it.
  def keep_drop
    @keeps.pop
  end
end
73
+
74
#
# CSV::Parser::InputsScanner scans a list of inputs one chunk at a time.
# It receives the inputs (StringIO objects or objects responding to
# +gets+), the encoding, the row separator and the chunk size.
#
# Like Scanner, it tracks "keep" positions with +keep_start+, +keep_end+,
# +keep_back+ and +keep_drop+; kept text that spans several chunks is
# accumulated in a per-keep buffer.
#
# #scan tries to match the pattern at the current position. If there is a
# match, the scan pointer advances and the matched string is returned.
# Otherwise nil is returned.
#
# #rest returns the rest of the current chunk (everything after the scan
# pointer).
#
class InputsScanner
  def initialize(inputs, encoding, row_separator, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @row_separator = row_separator
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    # Each entry is [scanner, start position, buffer or nil].
    @keeps = []
    read_chunk
  end

  # Yields every line (split on +row_separator+) from the current position
  # to the end of all inputs. A line that is cut by a chunk boundary is
  # held in +pending+ until the rest of it has been read.
  def each_line(row_separator)
    return enum_for(__method__, row_separator) unless block_given?

    pending = nil
    input = @scanner.rest
    position = @scanner.pos
    offset = 0
    separator_length = row_separator.size
    # NOTE: deliberately not Kernel#loop, which would rescue a
    # StopIteration raised by the yielded block.
    while true
      input.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if pending
          if separator_length == 2 &&
             pending.end_with?(row_separator[0]) &&
             line.start_with?(row_separator[1])
            # A two character separator was split across two chunks:
            # complete the pending line with its second character.
            pending << line[0]
            line = line[1..-1]
            position += pending.bytesize + offset
            @scanner.pos = position
            offset = 0
            yield(pending)
            pending = nil
            next if line.empty?
          else
            pending << line
            line = pending
            pending = nil
          end
        end
        if line.end_with?(row_separator)
          position += line.bytesize + offset
          @scanner.pos = position
          offset = 0
          yield(line)
        else
          pending = line
        end
      end
      break unless read_chunk

      input = @scanner.rest
      position = @scanner.pos
      # The pending bytes belong to the previous chunk, not to this one.
      offset = -pending.bytesize if pending
    end
    yield(pending) if pending
  end

  # Matches +pattern+ at the current position of the current chunk and
  # reads the next chunk when the match consumed the chunk.
  def scan(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner

    read_chunk if value && @scanner.eos?
    value
  end

  # Like #scan, but keeps matching +pattern+ in the following chunks while
  # each match reaches the end of its chunk.
  def scan_all(pattern)
    value = @scanner.scan(pattern)
    return value if @last_scanner || value.nil?

    while @scanner.eos? && read_chunk && (sub_value = @scanner.scan(pattern))
      value << sub_value
    end
    value
  end

  def eos?
    @scanner.eos?
  end

  # Remembers the current position of the current chunk.
  def keep_start
    adjust_last_keep
    @keeps.push([@scanner, @scanner.pos, nil])
  end

  # Pops the last keep and returns the text scanned since it was started.
  def keep_end
    scanner, start, buffer = @keeps.pop
    # The keep was started in an earlier chunk: everything scanned in the
    # current chunk belongs to it.
    start = 0 unless scanner == @scanner
    keep = @scanner.string.byteslice(start, @scanner.pos - start)
    return keep unless buffer

    buffer << keep
  end

  # Pops the last keep and rewinds to where it was started.
  def keep_back
    scanner, start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      # NOTE(review): the length passed to byteslice is
      # "bytesize - pos - start"; kept exactly as in the original.
      keep =
        if scanner == @scanner
          string.byteslice(start, string.bytesize - @scanner.pos - start)
        else
          string
        end
      unless keep.nil? || keep.empty?
        # Push the current chunk back so that it is read again later.
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      if @scanner != scanner
        message = "scanners are different but no buffer: " \
                  "#{@scanner.inspect}(#{@scanner.object_id}): " \
                  "#{scanner.inspect}(#{scanner.object_id})"
        raise UnexpectedError, message
      end
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Pops the last keep; its buffer (if any) is handed to the enclosing
  # keep so that the enclosing keep still covers that text.
  def keep_drop
    _, _, buffer = @keeps.pop
    return unless buffer

    last_keep = @keeps.last
    return unless last_keep

    if last_keep[2]
      last_keep[2] << buffer
    else
      last_keep[2] = buffer
    end
  end

  def rest
    @scanner.rest
  end

  def check(pattern)
    @scanner.check(pattern)
  end

  private
  # Debug helper: prints the arguments and the scanner state.
  def trace(*args)
    pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
  end

  # Saves what the innermost keep has scanned so far into its buffer.
  # Called before the current chunk is replaced and before a nested keep
  # is started.
  def adjust_last_keep
    keep = @keeps.last
    return if keep.nil?

    scanner, start, buffer = keep
    string = @scanner.string
    start = 0 if @scanner != scanner
    keep_data =
      if start == 0 && @scanner.eos?
        string
      else
        string.byteslice(start, @scanner.pos - start)
      end
    return unless keep_data

    if buffer
      buffer << keep_data
    else
      keep[2] = keep_data.dup
    end
  end

  # Replaces the current chunk by the next one. Returns true when a chunk
  # was read and false when all inputs are exhausted.
  def read_chunk
    return false if @last_scanner

    adjust_last_keep

    input = @inputs.first
    case input
    when StringIO
      # In-memory input: take all of it at once.
      string = input.read
      raise InvalidEncoding unless string.valid_encoding?
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(@row_separator, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) && input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        return true
      end

      # This input is exhausted: go on with the next one, if any.
      @scanner = StringScanner.new("".encode(@encoding))
      @inputs.shift
      @last_scanner = @inputs.empty?
      @last_scanner ? false : read_chunk
    end
  end
end
325
+
326
# Remembers +input+ and +options+ and derives all parser settings from
# the options (see #prepare).
#
# @samples collects the chunks that are read from +input+ while the row
# separator is detected, so that they can be scanned again later.
def initialize(input, options)
  @input, @options = input, options
  @samples = []

  prepare
end
333
+
334
# The column separator in the parser's encoding.
def column_separator
  @column_separator
end

# The row separator in the parser's encoding.
def row_separator
  @row_separator
end

# The quote character in the parser's encoding, or nil when quoting is
# disabled.
def quote_character
  @quote_character
end

# The maximum field size plus one, or nil when no maximum is set.
def field_size_limit
  @max_field_size&.succ
end

# The :max_field_size option.
def max_field_size
  @max_field_size
end

# The :skip_lines option: a String, a Regexp, an object that responds to
# +match+, or nil.
def skip_lines
  @skip_lines
end

# The :unconverted_fields option.
def unconverted_fields?
  @unconverted_fields
end

# The converted headers, or nil when they are not known.
def headers
  @headers
end

# Whether headers are in use but not known yet.
def header_row?
  @use_headers && @headers.nil?
end

# The :return_headers option.
def return_headers?
  @return_headers
end

# The :skip_blanks option.
def skip_blanks?
  @skip_blanks
end

# Whether liberal parsing is enabled.
def liberal_parsing?
  @liberal_parsing
end

# The current line number counter.
def lineno
  @lineno
end

# The last line read (see #last_line).
def line
  last_line
end
389
+
390
# Parses the input and yields every row to the block. Returns an
# Enumerator when no block is given.
#
# When headers were given by the :headers option and :return_headers is
# set, the header row is yielded first.
#
# Raises InvalidEncodingError for invalid input and MalformedCSVError when
# an internal inconsistency (UnexpectedError) is detected.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers && @headers && @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding
    # Without a scanner the failure happened while building it, so no line
    # has been counted yet.
    if @scanner
      ignore_broken_line
      lineno = @lineno
    else
      lineno = @lineno + 1
    end
    raise InvalidEncodingError.new(@encoding, lineno)
  rescue UnexpectedError => error
    if @scanner
      ignore_broken_line
      lineno = @lineno
    else
      lineno = @lineno + 1
    end
    message = "This should not be happen: #{error.message}: " \
              "Please report this to https://github.com/ruby/csv/issues"
    raise MalformedCSVError.new(message, lineno)
  end
end
430
+
431
# Whether the :headers option enabled header handling.
def use_headers?
  @use_headers
end
434
+
435
+ private
436
# Derives all parser settings from the options. The order matters: later
# steps read instance variables that earlier steps set.
def prepare
  prepare_variable                    # @encoding and the plain option flags
  prepare_quote_character             # needs @encoding
  prepare_backslash                   # needs the quote character
  prepare_skip_lines
  prepare_strip                       # needs the quote character
  prepare_separators                  # may fill @samples while detecting the row separator
  validate_strip_and_col_sep_options  # needs strip and the column separator
  prepare_quoted
  prepare_unquoted
  prepare_line
  prepare_header
  prepare_parser                      # looks for a quote character in the input sample
end
451
+
452
# Copies the plain options into instance variables.
#
# :liberal_parsing may be true or a Hash with the keys
# :double_quote_outside_quote and :backslash_quote. Liberal parsing always
# needs the robust parser.
#
# All liberal parsing flags are always assigned, also when liberal parsing
# is off, so that no instance variable is left undefined.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]

  liberal_parsing = @options[:liberal_parsing]
  @liberal_parsing = liberal_parsing ? true : false
  @double_quote_outside_quote = false
  @backslash_quote = false
  if liberal_parsing
    if liberal_parsing.is_a?(Hash)
      @double_quote_outside_quote =
        liberal_parsing[:double_quote_outside_quote]
      @backslash_quote = liberal_parsing[:backslash_quote]
    end
    @need_robust_parsing = true
  end

  @unconverted_fields = @options[:unconverted_fields]
  @max_field_size = @options[:max_field_size]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
477
+
478
# Sets the quote character and the Regexp that matches it.
#
# The :quote_character option has to be nil (no quoting) or convert to a
# single character String; anything else raises ArgumentError.
def prepare_quote_character
  @quote_character = @options[:quote_character]
  if @quote_character.nil?
    @escaped_quote_character = nil
    @escaped_quote = nil
    return
  end

  @quote_character = @quote_character.to_s.encode(@encoding)
  unless @quote_character.length == 1
    raise ArgumentError,
          ":quote_char has to be nil or a single character String"
  end
  @escaped_quote_character = Regexp.escape(@quote_character)
  @escaped_quote = Regexp.new(@escaped_quote_character)
end
493
+
494
# Prepares the backslash related values. They are only needed, and only
# set, when liberal parsing with :backslash_quote is enabled.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)
  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  # A backslash followed by the (Regexp escaped) quote character.
  @backslash_quote_character =
    if @quote_character.nil?
      nil
    else
      @backslash_character + @escaped_quote_character
    end
end
508
+
509
# Sets @skip_lines from the :skip_lines option.
#
# A String is converted to the parser's encoding; a Regexp or nil is used
# as is; anything else has to respond to +match+, otherwise ArgumentError
# is raised.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  case matcher
  when String
    matcher = matcher.encode(@encoding)
  when Regexp, nil
    # usable as is
  else
    unless matcher.respond_to?(:match)
      raise ArgumentError,
            ":skip_lines has to respond to \#match: #{matcher.inspect}"
    end
  end
  @skip_lines = matcher
end
525
+
526
# Prepares the values for the :strip option.
#
# :strip may be a single character String (only that character is
# stripped) or any other true value (space, tab, form feed and vertical
# tab are stripped). Stripping always needs the robust parser.
#
# Raises ArgumentError for an empty String and for a String of two or
# more characters.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  @rstrip_value = nil
  return unless @strip

  if @strip.is_a?(String)
    raise ArgumentError, ":strip must not be an empty String" if @strip.empty?
    if @strip.length > 1
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end

    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    if @quote_character
      @strip_value = Regexp.new(@escaped_strip + "+".encode(@encoding))
      @rstrip_value = Regexp.new(@escaped_strip + "+\\z".encode(@encoding))
    end
  else
    strip_values = " \t\f\v"
    # Not Regexp escaped: these are the raw characters to strip.
    @escaped_strip = strip_values.encode(@encoding)
    if @quote_character
      @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
      @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
    end
  end
  @need_robust_parsing = true
end
559
+
560
# Whether StringScanner#scan accepts a String pattern: it is probed once
# and a TypeError means that only a Regexp is accepted.
STRING_SCANNER_SCAN_ACCEPT_STRING =
  begin
    StringScanner.new("x").scan("x")
    true
  rescue TypeError
    false
  end
567
+
568
# Prepares the column and row separators and the patterns that match
# them.
#
# Raises ArgumentError when the column separator is empty.
def prepare_separators
  column_separator = @options[:column_separator]
  @column_separator = column_separator.to_s.encode(@encoding)
  if @column_separator.empty?
    raise ArgumentError,
          ":col_sep must be 1 or more characters: #{column_separator.inspect}"
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    # One pattern per character of the separator.
    @column_ends = @column_separator.each_char.map do |char|
      Regexp.new(Regexp.escape(char))
    end
    @first_column_separators =
      Regexp.new(@escaped_first_column_separator + "+".encode(@encoding))
  else
    # A plain String is used when StringScanner#scan accepts one.
    @column_end =
      if STRING_SCANNER_SCAN_ACCEPT_STRING
        @column_separator
      else
        Regexp.new(@escaped_column_separator)
      end
    @column_ends = nil
    @first_column_separators = nil
  end

  @row_end = Regexp.new(Regexp.escape(@row_separator))
  @row_ends =
    if @row_separator.size > 1
      @row_separator.each_char.map do |char|
        Regexp.new(Regexp.escape(char))
      end
    end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
613
+
614
# Verifies that there are no (obvious) ambiguities between the +col_sep+
# and +strip+ options. For example, if +col_sep+ and +strip+ were both
# equal to +\t+, there would be no clear way to parse the input.
#
# Raises ArgumentError when the column separator starts or ends with a
# character that would be stripped.
def validate_strip_and_col_sep_options
  return unless @strip

  if @strip.is_a?(String)
    conflict = @column_separator.start_with?(@strip) ||
               @column_separator.end_with?(@strip)
    strip_label = @escaped_strip
  else
    edge = Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z")
    conflict = edge.match?(@column_separator)
    strip_label = "true"
  end
  return unless conflict

  raise ArgumentError,
        "The provided strip (#{strip_label}) and " \
        "col_sep (#{@escaped_column_separator}) options are incompatible."
end
635
+
636
# Prepares the patterns for quoted values and the pattern that splits a
# line into columns.
#
# @quotes matches a run of quote characters and @quoted_value matches a
# run of characters that need no special handling inside quotes.
#
# @split_column_separator is what String#split is called with: the column
# separator together with the surrounding characters to strip, if any.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    no_quoted_values = @escaped_quote_character.dup
    no_quoted_values << @escaped_backslash_character if @backslash_quote
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end

  if @escaped_strip
    # For a String :strip, @escaped_strip is one escaped character. For
    # any other true :strip it is the raw list of characters to strip, so
    # it has to be wrapped in a character class: otherwise the pattern
    # would only match those characters in exactly that sequence.
    strip_pattern =
      if @strip.is_a?(String)
        @escaped_strip
      else
        "[".encode(@encoding) + @escaped_strip + "]".encode(@encoding)
      end
    @split_column_separator = Regexp.new(strip_pattern +
                                         "*".encode(@encoding) +
                                         @escaped_column_separator +
                                         strip_pattern +
                                         "*".encode(@encoding))
  elsif @column_separator == " ".encode(@encoding)
    # String#split treats a single space String specially, so a Regexp is
    # needed to split on exactly one space.
    @split_column_separator = Regexp.new(@escaped_column_separator)
  else
    @split_column_separator = @column_separator
  end
end
662
+
663
# Prepares @unquoted_value, the pattern for a run of characters of an
# unquoted value: anything except CR, LF, the first character of the
# column separator and, unless liberal parsing is enabled, the quote
# character.
#
# Nothing is prepared when quoting is disabled.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  pattern = "[^".encode(@encoding) + excluded + "]+".encode(@encoding)
  @unquoted_value = Regexp.new(pattern)
end
675
+
676
# Returns the row separator in the parser's encoding.
#
# For +:auto+ the separator is detected from the input: a StringIO is
# read completely and then rewound to its previous position; any other
# input that responds to +gets+ is sampled chunk by chunk, and every
# sample is remembered in @samples. When nothing is detected,
# InputRecordSeparator.value is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    pos = @input.pos
    separator = detect_row_separator(@input.read, cr, lf)
    @input.seek(pos)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # If we run out of data, it's probably a single line:
        # the default is applied after the loop.
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # Extend the sample if we're unsure of the line ending.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample
        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Do nothing: the default is applied below.
    end
  end
  separator = InputRecordSeparator.value if separator == :auto
  separator.to_s.encode(@encoding)
end
715
+
716
# Detects the row separator that is used in +sample+.
#
# Returns +cr+ + +lf+ when the first LF directly follows a CR, +cr+ when
# a CR is found before the first LF (or there is no LF at all), +lf+ when
# only an LF is found and +:auto+ when neither is found.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  # Only look for a CR before the first LF.
  cr_index = (lf_index ? sample[0, lf_index] : sample).index(cr)

  unless cr_index
    return lf_index ? lf : :auto
  end
  return cr unless lf_index

  # cr_index is always smaller than lf_index here.
  cr_index + 1 == lf_index ? cr + lf : cr
end
739
+
740
# Resets the line state: no line has been read and there is no scanner
# yet.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
745
+
746
# Returns the last line read. When it is not known yet, it is taken from
# the scanner's current keep (if there is a scanner) and remembered.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
753
+
754
# Prepares the header state from the :headers and :return_headers
# options.
#
# :headers may be an Array of headers, a String with a CSV header line,
# nil/false (no headers) or any other value (headers are in use but not
# given by the option, so @raw_headers and @headers stay nil here).
def prepare_header
  @return_headers = @options[:return_headers]

  headers = @options[:headers]
  @raw_headers, quoted_fields =
    case headers
    when Array
      [headers, [false] * headers.size]
    when String
      parse_headers(headers)
    else
      [nil, nil]
    end
  # Only nil and false disable headers.
  @use_headers = headers ? true : false
  @headers = @raw_headers ? adjust_headers(@raw_headers, quoted_fields) : nil
end
779
+
780
# Parses a CSV header line with the parser's separators and quote
# character.
#
# Returns the fields together with an Array that tells, for each field,
# whether it was quoted.
def parse_headers(row)
  quoted_fields = []
  # A converter is used only to observe whether each field was quoted;
  # the field itself is passed through unchanged.
  record_quoted = lambda do |field, info|
    quoted_fields << info.quoted?
    field
  end
  fields = CSV.parse_line(row,
                          col_sep: @column_separator,
                          row_sep: @row_separator,
                          quote_char: @quote_character,
                          converters: [record_quoted])
  [fields, quoted_fields]
end
793
+
794
# Converts +headers+ with the header fields converter and freezes every
# converted header that is a String. Returns the converted headers.
def adjust_headers(headers, quoted_fields)
  converted = @header_fields_converter.convert(headers,
                                               nil,
                                               @lineno,
                                               quoted_fields)
  converted.each do |header|
    header.freeze if header.is_a?(String)
  end
  converted
end
799
+
800
# Remembers whether the start of the input contains a quote character
# (see #may_quoted?).
def prepare_parser
  @may_quoted = may_quoted?
end
803
+
804
# Tells whether the input probably contains quoted values: the first 128
# characters of the input sample are searched for the quote character.
#
# Returns false when quoting is disabled or no sample is available;
# otherwise the index of the first quote character, or nil.
def may_quoted?
  return false if @quote_character.nil?

  sample =
    if @input.is_a?(StringIO)
      # Read everything and restore the position afterwards.
      pos = @input.pos
      @input.read.tap { @input.seek(pos) }
    else
      return false if @samples.empty?
      @samples.first
    end
  sample[0, 128].index(@quote_character)
end
817
+
818
# Wraps a String in an object that is not a StringIO and only offers
# +gets+, +each_line+ and +eof?+, all delegated to an internal StringIO
# opened in binary read mode with the String's encoding.
class UnoptimizedStringIO # :nodoc:
  def initialize(string)
    mode = "rb:#{string.encoding}"
    @io = StringIO.new(string, mode)
  end

  def gets(*arguments)
    @io.gets(*arguments)
  end

  def each_line(*arguments, &block)
    @io.each_line(*arguments, &block)
  end

  def eof?
    @io.eof?
  end
end
835
+
836
# When the environment variable CSV_PARSER_SCANNER_TEST is "yes", the
# chunked InputsScanner is always used, with a chunk size taken from
# CSV_PARSER_SCANNER_TEST_CHUNK_SIZE (default: 1).
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
  SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]

  # Builds an InputsScanner over the samples and the input. Strings are
  # wrapped in UnoptimizedStringIO so that they are read in chunks too.
  def build_scanner
    inputs = @samples.map { |sample| UnoptimizedStringIO.new(sample) }
    inputs <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.read)
      else
        @input
      end
    chunk_size_value =
      begin
        ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
      rescue # Ractor::IsolationError
        # Ractor on Ruby 3.0 can't read ENV value.
        SCANNER_TEST_CHUNK_SIZE_VALUE
      end
    chunk_size = Integer((chunk_size_value || "1"), 10)
    InputsScanner.new(inputs,
                      @encoding,
                      @row_separator,
                      chunk_size: chunk_size)
  end
else
  # Builds the scanner for the input.
  #
  # When the whole input is available as one String (a StringIO that has
  # not been sampled, or a single sample of an input that is already at
  # its end), a Scanner over that String is used. Otherwise an
  # InputsScanner reads the samples and then the rest of the input.
  #
  # Raises InvalidEncodingError with the number of the first invalid line
  # when the String is not valid in its encoding.
  def build_scanner
    string = nil
    if @samples.empty? && @input.is_a?(StringIO)
      string = @input.read
    elsif @samples.size == 1 &&
          @input != ARGF &&
          @input.respond_to?(:eof?) &&
          @input.eof?
      string = @samples[0]
    end

    unless string
      inputs = @samples.map { |sample| StringIO.new(sample) }
      inputs << @input
      return InputsScanner.new(inputs, @encoding, @row_separator)
    end

    unless string.valid_encoding?
      index = string.lines(@row_separator).index do |line|
        !line.valid_encoding?
      end
      raise InvalidEncodingError.new(@encoding, @lineno + index + 1) if index
    end
    Scanner.new(string)
  end
end
891
+
892
# Skips the following lines as long as they match :skip_lines, counting
# each skipped line in @lineno. The scanner is put back to the start of
# the first line that is not skipped.
def skip_needless_lines
  return unless @skip_lines

  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    unless skip_line?(line)
      @scanner.keep_back
      return
    end

    @lineno += 1
    @scanner.keep_drop
  end
end
908
+
909
# Tells whether +line+ (without its trailing row separator) matches
# :skip_lines: a String has to be included in the line, a Regexp has to
# match it, and for any other object the result of its +match+ is
# returned.
def skip_line?(line)
  content = line.delete_suffix(@row_separator)
  matcher = @skip_lines
  case matcher
  when String then content.include?(matcher)
  when Regexp then matcher.match?(content)
  else matcher.match(content)
  end
end
920
+
921
# Raises MalformedCSVError when +field+ is longer than :max_field_size.
# The rest of the broken line is skipped before the error is raised.
def validate_field_size(field)
  return unless @max_field_size && field.size > @max_field_size

  ignore_broken_line
  message = "Field size exceeded: #{field.size} > #{@max_field_size}"
  raise MalformedCSVError.new(message, @lineno)
end
928
+
929
# Parses the input when quoting is disabled: every line is simply split
# on the column separator.
#
# Lines that match :skip_lines are skipped, as are empty lines when
# :skip_blanks is set. Empty fields become nil.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |line|
    next if @skip_lines && skip_line?(line)

    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      next if @skip_blanks
      row = []
      quoted_fields = []
    else
      line = strip_value(line)
      row = line.split(@split_column_separator, -1)
      quoted_fields = Array.new(row.size, false)
      row.each { |column| validate_field_size(column) } if @max_field_size
      row.map! { |column| column.empty? ? nil : column }
    end
    @last_line = original_line
    emit_row(row, quoted_fields, &block)
  end
end
959
+
960
# Fast path for input that may contain quotes: every line is split on the
# column separator and only the simplest form of quoting (a field that is
# entirely wrapped in one pair of quotes) is handled here.
#
# As soon as a line needs more than that (a CR or LF inside the line, or
# any other use of the quote character), the scanner is put back to the
# start of that line and the rest is parsed by #parse_quotable_robust.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |line|
    if @skip_lines && skip_line?(line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end

    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      row = []
      quoted_fields = []
    elsif line.include?(@cr) || line.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      row = line.split(@split_column_separator, -1)
      quoted_fields = []
      row.each_with_index do |column, i|
        if column.empty?
          quoted_fields << false
          row[i] = nil
          next
        end

        n_quotes = column.count(@quote_character)
        if n_quotes.zero?
          quoted_fields << false
        elsif n_quotes == 2 &&
              column.start_with?(@quote_character) &&
              column.end_with?(@quote_character)
          quoted_fields << true
          row[i] = column[1..-2]
        else
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
        validate_field_size(row[i])
      end
    end
    # Move the keep to the start of the next line.
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = original_line
    emit_row(row, quoted_fields, &block)
  end
  @scanner.keep_drop
end
1020
+
1021
# Parses the input column by column. This is the parser that handles
# everything: quoted values, liberal parsing, stripping and multi
# character separators.
#
# After each column value, a column end, a row end or the end of the
# input is expected; anything else raises MalformedCSVError after the
# rest of the broken line has been skipped.
def parse_quotable_robust(&block)
  row = []
  quoted_fields = []
  skip_needless_lines
  start_row
  # NOTE: deliberately not Kernel#loop, which would rescue a StopIteration
  # raised by the block that receives the rows.
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      validate_field_size(value)
    end

    if parse_column_end
      row << value
      quoted_fields << @quoted_column_value
      next
    end

    if parse_row_end
      if row.empty? && value.nil?
        # A blank line.
        emit_row([], [], &block) unless @skip_blanks
      else
        row << value
        quoted_fields << @quoted_column_value
        emit_row(row, quoted_fields, &block)
        row = []
        quoted_fields = []
      end
      skip_needless_lines
      start_row
      next
    end

    if @scanner.eos?
      # The last row has no row separator.
      unless row.empty? && value.nil?
        row << value
        quoted_fields << @quoted_column_value
        emit_row(row, quoted_fields, &block)
      end
      break
    end

    # Neither a column end, a row end nor the end of the input: the line
    # is malformed.
    if @quoted_column_value
      if liberal_parsing? && (new_line = @scanner.check(@line_end))
        message =
          "Illegal end-of-line sequence outside of a quoted field " \
          "<#{new_line.inspect}>"
      else
        message = "Any value after quoted field isn't allowed"
      end
    elsif @unquoted_column_value &&
          (new_line = @scanner.scan(@line_end))
      message = "Unquoted fields do not allow new line " \
                "<#{new_line.inspect}>"
    elsif @scanner.rest.start_with?(@quote_character)
      message = "Illegal quoting"
    elsif (new_line = @scanner.scan(@line_end))
      message = "New line must be <#{@row_separator.inspect}> " \
                "not <#{new_line.inspect}>"
    else
      message = "TODO: Meaningful message"
    end
    ignore_broken_line
    raise MalformedCSVError.new(message, @lineno)
  end
end
1090
+
1091
# Reads the value of the next column and returns it (nil when neither
# helper produced a value).
#
# Outside liberal parsing, the quoted and unquoted helpers are tried in
# an order chosen by @may_quoted. In liberal parsing a quoted value may
# be followed directly by unquoted text; the two parts are then joined
# back together with the quote characters re-inserted around the quoted
# part.
def parse_column_value
  unless @liberal_parsing
    return parse_quoted_column_value || parse_unquoted_column_value if @may_quoted
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted_part = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted_part

  # Consume input matching @strip_value between the quoted part and
  # whatever follows it.
  @scanner.scan_all(@strip_value) if @strip_value
  trailing_part = parse_unquoted_column_value
  return quoted_part unless trailing_part

  if @double_quote_outside_quote
    # Collapse doubled quote characters found in the unquoted tail.
    trailing_part = trailing_part.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case: the quoted part was empty, keep a single quote.
    return @quote_character + trailing_part if quoted_part.empty?
  end
  @quote_character + quoted_part + @quote_character + trailing_part
end
1120
+
1121
# Scans an unquoted column value from @scanner.
#
# Returns nil (leaving @unquoted_column_value untouched) when nothing
# matches @unquoted_value; otherwise sets @unquoted_column_value to true
# and returns the scanned text.
def parse_unquoted_column_value
  text = @scanner.scan_all(@unquoted_value)
  return unless text

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Look ahead for a complete column end without consuming it.
      # NOTE(review): presumably keep_start marks the current position and
      # keep_back rewinds to that mark -- confirm against the scanner class.
      @scanner.keep_start
      at_column_end = @column_ends.all? { |piece| @scanner.scan(piece) }
      @scanner.keep_back
      break if at_column_end

      # Not a complete column end: what matches @first_column_separators
      # here is kept as part of the value.
      partial_separator = @scanner.scan_all(@first_column_separators)
      break if partial_separator.nil?
      text << partial_separator

      continuation = @scanner.scan_all(@unquoted_value)
      break if continuation.nil?
      text << continuation
    end
  end

  text.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  text.gsub!(@rstrip_value, "") if @rstrip_value
  text
end
1148
+
1149
# Scans a quoted column value from @scanner.
#
# Returns nil (leaving @quoted_column_value untouched) when the input
# does not start with a run matching @quotes; otherwise sets
# @quoted_column_value to true and returns the unescaped content.
#
# Raises MalformedCSVError ("Unclosed quoted field") after calling
# ignore_broken_line when no closing quote run is found.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return unless opening

  @quoted_column_value = true
  quote_count = opening.size
  # An even-length run opens and closes the field at once: apart from the
  # two delimiters, each remaining pair stands for one literal quote.
  return opening[0, (quote_count - 2) / 2] if quote_count.even?

  # Odd-length run: one quote opens the field, every remaining pair is a
  # literal quote at the start of the content.
  content = opening[0, quote_count / 2]
  while true
    chunk = @scanner.scan_all(@quoted_value)
    content << chunk if chunk

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      # A backslash followed by a quote yields the quote character; any
      # other backslash is kept as a backslash character.
      content << (@scanner.scan(@escaped_quote) ? @quote_character : @backslash_character)
      next
    end

    closing = @scanner.scan_all(@quotes)
    unless closing
      ignore_broken_line
      raise MalformedCSVError.new("Unclosed quoted field", @lineno)
    end

    quote_count = closing.size
    break if quote_count == 1
    # Each pair inside the run is one literal quote; an odd run also
    # closes the field.
    content << closing[0, quote_count / 2]
    break if quote_count.odd?
  end
  content
end
1190
+
1191
# Consumes a column end from @scanner and reports whether one was found.
#
# First tries @column_end in one go. If that fails and @column_ends is
# set, each of its patterns must match in sequence; on failure the
# scanner is sent keep_back.
# NOTE(review): keep_back presumably rewinds to the keep_start mark and
# keep_drop discards it -- confirm against the scanner class.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  complete = @column_ends.all? { |piece| @scanner.scan(piece) }
  complete ? @scanner.keep_drop : @scanner.keep_back
  complete
end
1204
+
1205
# Consumes a row end from @scanner and reports whether one was found.
#
# First tries @row_end in one go. If that fails and @row_ends is set,
# each of its patterns must match in sequence; on failure the scanner is
# sent keep_back.
# NOTE(review): keep_back presumably rewinds to the keep_start mark and
# keep_drop discards it -- confirm against the scanner class.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  complete = @row_ends.all? { |piece| @scanner.scan(piece) }
  complete ? @scanner.keep_drop : @scanner.keep_back
  complete
end
1217
+
1218
# Strips +value+ in place according to @strip and returns it.
#
# Nothing happens when @strip is unset or +value+ is nil. When @strip is
# a String, every leading and trailing repetition of that string is
# removed; for any other @strip, String#strip! is applied.
def strip_value(value)
  return value if !@strip || value.nil?

  if String === @strip
    # delete_prefix!/delete_suffix! return nil once nothing was removed,
    # which ends each loop.
    loop { break unless value.delete_prefix!(@strip) }
    loop { break unless value.delete_suffix!(@strip) }
  else
    value.strip!
  end
  value
end
1235
+
1236
# Skips the remainder of the current line: consumes input matching
# @not_line_end, then input matching @line_end, and finally increments
# @lineno (the incremented value is the return value).
def ignore_broken_line
  [@not_line_end, @line_end].each { |pattern| @scanner.scan_all(pattern) }
  @lineno += 1
end
1241
+
1242
# Prepares @scanner for a new row.
#
# When @last_line is set it is cleared; otherwise the scanner is sent
# keep_drop. In both cases keep_start is then called.
# NOTE(review): keep_drop presumably discards the previous row's mark and
# keep_start places a new one -- confirm against the scanner class.
def start_row
  had_last_line = @last_line
  @last_line = nil if had_last_line
  @scanner.keep_drop unless had_last_line
  @scanner.keep_start
end
1250
+
1251
# Finishes one parsed row and yields it to the given block.
#
# row           - the parsed fields of the row.
# quoted_fields - per-field values passed through to adjust_headers and
#                 @fields_converter.convert.
#
# Increments @lineno first. With @use_headers and no @headers yet, the
# row becomes the headers (via adjust_headers) and is only yielded, as a
# header Row, when @return_headers is set. Otherwise the fields go
# through @fields_converter and are yielded as a Row (with headers) or as
# the converted result (without headers).
def emit_row(row, quoted_fields, &block)
  @lineno += 1
  source_fields = row

  if !@use_headers
    emitted = @fields_converter.convert(source_fields, nil, @lineno, quoted_fields)
  elsif @headers.nil?
    # First row while headers are pending: it defines the headers.
    @headers = adjust_headers(source_fields, quoted_fields)
    return unless @return_headers
    emitted = Row.new(@headers, source_fields, true)
  else
    converted = @fields_converter.convert(source_fields, @headers, @lineno, quoted_fields)
    emitted = Row.new(@headers, converted)
  end

  # Attach the unconverted fields and their accessor when requested and
  # the emitted object does not already respond to the accessor.
  needs_accessor = @unconverted_fields && !emitted.respond_to?(:unconverted_fields)
  add_unconverted_fields(emitted, source_fields) if needs_accessor

  yield(emitted)
end
1276
+
1277
+ # This method injects an instance variable <tt>unconverted_fields</tt> into
1278
+ # +row+ and an accessor method for +row+ called unconverted_fields(). The
1279
+ # variable is set to the contents of +fields+.
1280
def add_unconverted_fields(row, fields)
  # Define the reader on this one object only (its singleton class), so
  # other instances of the same class are unaffected.
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
1287
+ end
1288
+ end