csv 3.0.0 → 3.2.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/NEWS.md +882 -0
  3. data/README.md +6 -3
  4. data/doc/csv/arguments/io.rdoc +5 -0
  5. data/doc/csv/options/common/col_sep.rdoc +57 -0
  6. data/doc/csv/options/common/quote_char.rdoc +42 -0
  7. data/doc/csv/options/common/row_sep.rdoc +91 -0
  8. data/doc/csv/options/generating/force_quotes.rdoc +17 -0
  9. data/doc/csv/options/generating/quote_empty.rdoc +12 -0
  10. data/doc/csv/options/generating/write_converters.rdoc +25 -0
  11. data/doc/csv/options/generating/write_empty_value.rdoc +15 -0
  12. data/doc/csv/options/generating/write_headers.rdoc +29 -0
  13. data/doc/csv/options/generating/write_nil_value.rdoc +14 -0
  14. data/doc/csv/options/parsing/converters.rdoc +46 -0
  15. data/doc/csv/options/parsing/empty_value.rdoc +13 -0
  16. data/doc/csv/options/parsing/field_size_limit.rdoc +39 -0
  17. data/doc/csv/options/parsing/header_converters.rdoc +43 -0
  18. data/doc/csv/options/parsing/headers.rdoc +63 -0
  19. data/doc/csv/options/parsing/liberal_parsing.rdoc +38 -0
  20. data/doc/csv/options/parsing/nil_value.rdoc +12 -0
  21. data/doc/csv/options/parsing/return_headers.rdoc +22 -0
  22. data/doc/csv/options/parsing/skip_blanks.rdoc +31 -0
  23. data/doc/csv/options/parsing/skip_lines.rdoc +37 -0
  24. data/doc/csv/options/parsing/strip.rdoc +15 -0
  25. data/doc/csv/options/parsing/unconverted_fields.rdoc +27 -0
  26. data/doc/csv/recipes/filtering.rdoc +158 -0
  27. data/doc/csv/recipes/generating.rdoc +298 -0
  28. data/doc/csv/recipes/parsing.rdoc +545 -0
  29. data/doc/csv/recipes/recipes.rdoc +6 -0
  30. data/lib/csv/core_ext/array.rb +1 -1
  31. data/lib/csv/core_ext/string.rb +1 -1
  32. data/lib/csv/fields_converter.rb +89 -0
  33. data/lib/csv/input_record_separator.rb +18 -0
  34. data/lib/csv/parser.rb +1288 -0
  35. data/lib/csv/row.rb +505 -136
  36. data/lib/csv/table.rb +791 -114
  37. data/lib/csv/version.rb +1 -1
  38. data/lib/csv/writer.rb +210 -0
  39. data/lib/csv.rb +2433 -1329
  40. metadata +66 -13
  41. data/news.md +0 -123
data/lib/csv/parser.rb ADDED
@@ -0,0 +1,1288 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "input_record_separator"
6
+ require_relative "row"
7
+ require_relative "table"
8
+
9
+ class CSV
10
+ # Note: Don't use this class directly. This is an internal class.
11
+ class Parser
12
+ #
13
+ # A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
14
+ # or String object being read from or written to. Your data is never transcoded
15
+ # (unless you ask Ruby to transcode it for you) and will literally be parsed in
16
+ # the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
17
+ # Encoding of your data. This is accomplished by transcoding the parser itself
18
+ # into your Encoding.
19
+ #
20
+
21
# Raised when input data is not valid in its encoding.
class InvalidEncoding < StandardError; end
24
+
25
# Raised when the parser reaches a state that should never happen.
class UnexpectedError < StandardError; end
28
+
29
#
# Scanner scans CSV text that is held in a single String.
#
# It is a StringScanner (from the strscan library) extended with:
#
# * +scan_all+: an alias of +scan+, so that callers can use the same
#   method names as with InputsScanner.
# * +each_line+: iterates the unread text line by line while advancing the
#   scan pointer.
# * +keep_start+, +keep_end+, +keep_back+, +keep_drop+: a stack of saved
#   byte positions. +keep_start+ pushes the current position; each of the
#   other three pops one position and, respectively, returns the text
#   scanned since then, rewinds to it, or just discards it.
#
# For more information about StringScanner, please visit:
# https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
#
class Scanner < StringScanner
  alias_method :scan_all, :scan

  # Accepts the same arguments as StringScanner.new.
  def initialize(*args)
    super
    @keeps = []
  end

  # Yields each line of the unread text, split on +row_separator+. The scan
  # pointer is moved past a line before that line is yielded.
  #
  # Returns an Enumerator when no block is given, like
  # InputsScanner#each_line does.
  def each_line(row_separator)
    return enum_for(__method__, row_separator) unless block_given?

    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current byte position.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the most recently remembered position and returns the text between
  # it and the current position.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the most recently remembered position and rewinds to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the most recently remembered position without using it.
  def keep_drop
    @keeps.pop
  end
end
73
+
74
#
# InputsScanner scans CSV text that comes from a list of inputs which are
# read one chunk at a time.
#
# A StringIO input is read whole. Any other input is read with
# <tt>gets(row_separator, chunk_size)</tt>. Each chunk is wrapped in its own
# StringScanner; when the current chunk is used up, the next one is loaded.
#
# +scan+, +scan_all+, +check+, +rest+ and +eos?+ work on the current chunk:
#
# * +scan+ tries to match the pattern at the current position. On a match
#   the scan pointer advances and the matched String is returned; otherwise
#   +nil+ is returned.
# * +scan_all+ does the same but keeps matching across chunk boundaries and
#   returns the concatenated match.
# * +rest+ returns everything after the scan pointer in the current chunk
#   ("" when there is none).
#
# +keep_start+, +keep_end+, +keep_back+ and +keep_drop+ maintain a stack of
# saved positions. Each entry is <tt>[scanner, start, buffer]</tt>, where
# +buffer+ collects kept text from chunks that have already been replaced.
#
class InputsScanner
  def initialize(inputs, encoding, row_separator, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @row_separator = row_separator
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    @keeps = []
    read_chunk
  end

  # Yields each line (split on +row_separator+) of the unread data, loading
  # further chunks as needed. A line that is cut by a chunk boundary is held
  # back and completed from the next chunk before it is yielded.
  #
  # Returns an Enumerator when no block is given.
  def each_line(row_separator)
    return enum_for(:each_line, row_separator) unless block_given?

    pending = nil # incomplete line carried over to the next piece of input
    text = @scanner.rest
    cursor = @scanner.pos
    carried = 0 # minus the bytes of +pending+ that came from older chunks
    separator_size = row_separator.size
    while true
      text.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if pending
          if separator_size == 2 &&
             pending.end_with?(row_separator[0]) &&
             line.start_with?(row_separator[1])
            # A two character separator was cut in half by a chunk boundary.
            pending << line[0]
            line = line[1..-1]
            cursor += pending.bytesize + carried
            @scanner.pos = cursor
            carried = 0
            yield(pending)
            pending = nil
            next if line.empty?
          else
            pending << line
            line = pending
            pending = nil
          end
        end
        if line.end_with?(row_separator)
          cursor += line.bytesize + carried
          @scanner.pos = cursor
          carried = 0
          yield(line)
        else
          pending = line
        end
      end
      break unless read_chunk
      text = @scanner.rest
      cursor = @scanner.pos
      carried = -pending.bytesize if pending
    end
    yield(pending) if pending
  end

  # Matches +pattern+ at the current position of the current chunk. When the
  # match uses up the chunk, the next chunk is loaded.
  def scan(pattern)
    matched = @scanner.scan(pattern)
    return matched if @last_scanner

    read_chunk if matched && @scanner.eos?
    matched
  end

  # Like +scan+, but when a match reaches the end of a chunk it continues
  # matching in the following chunks and appends what they match.
  def scan_all(pattern)
    matched = @scanner.scan(pattern)
    return matched if @last_scanner
    return nil if matched.nil?

    while @scanner.eos? && read_chunk && (more = @scanner.scan(pattern))
      matched << more
    end
    matched
  end

  def eos?
    @scanner.eos?
  end

  # Pushes the current position onto the keep stack. Text kept so far by the
  # previous entry is first copied into that entry's buffer.
  def keep_start
    adjust_last_keep
    @keeps.push([@scanner, @scanner.pos, nil])
  end

  # Pops the newest keep entry and returns the text scanned since its
  # +keep_start+, including any text buffered from earlier chunks.
  def keep_end
    origin, start, buffer = @keeps.pop
    # If the chunk changed since keep_start, everything consumed in the
    # current chunk belongs to the keep.
    from = origin == @scanner ? start : 0
    tail = @scanner.string.byteslice(from, @scanner.pos - from)
    return tail unless buffer

    buffer << tail
  end

  # Pops the newest keep entry and rewinds to where it was started.
  #
  # Without buffered text this is a plain reposition inside the current
  # chunk. With buffered text the buffer becomes the current chunk, and the
  # chunk that was current is pushed back to the front of the inputs.
  def keep_back
    origin, start, buffer = @keeps.pop
    if buffer
      string = @scanner.string
      if origin == @scanner
        requeued = string.byteslice(start,
                                    string.bytesize - @scanner.pos - start)
      else
        requeued = string
      end
      if requeued && !requeued.empty?
        @inputs.unshift(StringIO.new(requeued))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      if @scanner != origin
        message = "scanners are different but no buffer: " \
                  "#{@scanner.inspect}(#{@scanner.object_id}): " \
                  "#{origin.inspect}(#{origin.object_id})"
        raise UnexpectedError, message
      end
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Pops the newest keep entry. Text it had buffered is handed over to the
  # entry below it, if there is one.
  def keep_drop
    dropped = @keeps.pop
    buffer = dropped && dropped[2]
    return unless buffer

    outer = @keeps.last
    return unless outer

    if outer[2]
      outer[2] << buffer
    else
      outer[2] = buffer
    end
  end

  def rest
    @scanner.rest
  end

  def check(pattern)
    @scanner.check(pattern)
  end

  private
  # Debugging aid: pretty-prints +args+ together with the scanner state.
  def trace(*args)
    pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
  end

  # Appends the text that the newest keep entry covers in the current chunk
  # to that entry's buffer. Called before a nested keep is started and
  # before the current chunk is replaced.
  def adjust_last_keep
    current = @keeps.last
    return if current.nil?

    origin, start, buffer = current
    start = 0 if @scanner != origin
    string = @scanner.string
    consumed =
      if start == 0 && @scanner.eos?
        string
      else
        string.byteslice(start, @scanner.pos - start)
      end
    return unless consumed

    if buffer
      buffer << consumed
    else
      current[2] = consumed.dup
    end
  end

  # Loads the next chunk into a new StringScanner.
  #
  # Returns +true+ when a chunk was loaded and +false+ when there is nothing
  # left to read. Raises InvalidEncoding when the data read is not valid in
  # its encoding.
  def read_chunk
    return false if @last_scanner

    adjust_last_keep

    source = @inputs.first
    if StringIO === source
      content = source.read
      raise InvalidEncoding unless content.valid_encoding?
      @scanner = StringScanner.new(content)
      finish_current_input
      return true
    end

    chunk = source.gets(@row_separator, @chunk_size)
    if chunk
      raise InvalidEncoding unless chunk.valid_encoding?
      @scanner = StringScanner.new(chunk)
      finish_current_input if source.respond_to?(:eof?) && source.eof?
      return true
    end

    # The current input is exhausted: continue with the next one, if any.
    @scanner = StringScanner.new("".encode(@encoding))
    finish_current_input
    return false if @last_scanner

    read_chunk
  end

  # Removes the current input and records whether any input is left.
  def finish_current_input
    @inputs.shift
    @last_scanner = @inputs.empty?
  end
end
325
+
326
# Stores +input+ and the +options+ Hash, then derives all parser settings
# from the options (see +prepare+).
#
# <tt>@samples</tt> collects the pieces of +input+ that are read ahead while
# the row separator is being detected.
def initialize(input, options)
  @input, @options = input, options
  @samples = []

  prepare
end
333
+
334
# The column separator in use.
def column_separator
  @column_separator
end

# The row separator in use.
def row_separator
  @row_separator
end

# The quote character in use; +nil+ when quoting is disabled.
def quote_character
  @quote_character
end

# The maximum field size plus one, or +nil+ when no maximum is set.
def field_size_limit
  @max_field_size.succ unless @max_field_size.nil?
end

# The value of the +:max_field_size+ option.
def max_field_size
  @max_field_size
end

# The matcher for lines to skip, or +nil+ when no lines are skipped.
def skip_lines
  @skip_lines
end

# The value of the +:unconverted_fields+ option.
def unconverted_fields?
  @unconverted_fields
end

# The converted headers, or +nil+ when there are none (yet).
def headers
  @headers
end
365
+
366
# Truthy when headers are in use but none have been set yet
# (<tt>@headers</tt> is still +nil+).
def header_row?
  @use_headers && @headers.nil?
end

# The value of the +:return_headers+ option.
def return_headers?
  @return_headers
end

# The value of the +:skip_blanks+ option.
def skip_blanks?
  @skip_blanks
end

# +true+ when liberal parsing is enabled, +false+ otherwise.
def liberal_parsing?
  @liberal_parsing
end

# The current line number.
def lineno
  @lineno
end

# The last line read; delegates to +last_line+.
def line
  last_line
end
389
+
390
# Parses the input and yields each row to the block. Returns an Enumerator
# when no block is given.
#
# When headers were given up front and +:return_headers+ is set, the header
# row is yielded first.
#
# The scanner is built on first use. The strategy is chosen as follows:
# +parse_no_quote+ when there is no quote character, +parse_quotable_robust+
# when robust parsing is needed, and +parse_quotable_loose+ otherwise.
#
# Internal InvalidEncoding and UnexpectedError failures are re-raised as
# InvalidEncodingError and MalformedCSVError with a line number.
def parse(&block)
  return to_enum(:parse) unless block_given?

  if @return_headers && @headers && @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding, UnexpectedError => error
    # Without a scanner the failure happened while building it, i.e. before
    # the next line was counted.
    if @scanner
      ignore_broken_line
      broken_lineno = @lineno
    else
      broken_lineno = @lineno + 1
    end
    if error.is_a?(InvalidEncoding)
      raise InvalidEncodingError.new(@encoding, broken_lineno)
    end

    message = "This should not be happen: #{error.message}: " \
              "Please report this to https://github.com/ruby/csv/issues"
    raise MalformedCSVError.new(message, broken_lineno)
  end
end
430
+
431
# +true+ when headers are in use, +false+ otherwise.
def use_headers?
  @use_headers
end
434
+
435
+ private
436
# Runs every preparation step, in order, to derive the parser state from
# the options. Later steps read instance variables set by earlier ones, so
# the order matters.
def prepare
  %i[
    prepare_variable
    prepare_quote_character
    prepare_backslash
    prepare_skip_lines
    prepare_strip
    prepare_separators
    validate_strip_and_col_sep_options
    prepare_quoted
    prepare_unquoted
    prepare_line
    prepare_header
    prepare_parser
  ].each { |step| __send__(step) }
end
451
+
452
# Copies the simple options into instance variables and interprets
# +:liberal_parsing+, which may be a Hash of flags or just truthy/falsy.
# Liberal parsing always requires the robust parsing strategy.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]
  liberal = @options[:liberal_parsing]
  case liberal
  when nil, false
    @liberal_parsing = false
    @backslash_quote = false
  when Hash
    @liberal_parsing = true
    @double_quote_outside_quote = liberal[:double_quote_outside_quote]
    @backslash_quote = liberal[:backslash_quote]
    @need_robust_parsing = true
  else
    @liberal_parsing = true
    @double_quote_outside_quote = false
    @backslash_quote = false
    @need_robust_parsing = true
  end
  @unconverted_fields = @options[:unconverted_fields]
  @max_field_size = @options[:max_field_size]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
477
+
478
# Reads the +:quote_character+ option.
#
# +nil+ disables quoting. Anything else is converted with +to_s+, encoded
# to the parser's encoding, and must be exactly one character long;
# otherwise ArgumentError is raised.
def prepare_quote_character
  quote = @options[:quote_character]
  if quote.nil?
    @quote_character = nil
    @escaped_quote_character = nil
    @escaped_quote = nil
  else
    @quote_character = quote.to_s.encode(@encoding)
    unless @quote_character.length == 1
      raise ArgumentError,
            ":quote_char has to be nil or a single character String"
    end
    @escaped_quote_character = Regexp.escape(@quote_character)
    @escaped_quote = Regexp.new(@escaped_quote_character)
  end
end
493
+
494
# Prepares the values needed for backslash-escaped quotes. Does nothing
# unless the +backslash_quote+ flag of liberal parsing is set.
#
# Sets:
# * <tt>@backslash_character</tt>: a backslash in the parser's encoding.
# * <tt>@escaped_backslash_character</tt> / <tt>@escaped_backslash</tt>:
#   its Regexp source and Regexp.
# * <tt>@backslash_quote_character</tt>: a backslash followed by the quote
#   character, or +nil+ when there is no quote character.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)

  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  if @quote_character.nil?
    @backslash_quote_character = nil
  else
    # This value is used as a literal String pattern for String#gsub!, so it
    # must be built from the raw quote character. The Regexp-escaped form
    # would never match when the quote character is a regexp metacharacter
    # such as "|".
    @backslash_quote_character = @backslash_character + @quote_character
  end
end
508
+
509
# Reads the +:skip_lines+ option.
#
# A String is encoded to the parser's encoding. A Regexp, +nil+, or any
# object that responds to +match+ is stored unchanged. Anything else raises
# ArgumentError.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  unless matcher.nil? ||
         matcher.is_a?(String) ||
         matcher.is_a?(Regexp) ||
         matcher.respond_to?(:match)
    raise ArgumentError,
          ":skip_lines has to respond to \#match: #{matcher.inspect}"
  end
  @skip_lines = matcher.is_a?(String) ? matcher.encode(@encoding) : matcher
end
525
+
526
# Reads the +:strip+ option.
#
# * A String must be exactly one character (ArgumentError otherwise); that
#   character is what gets stripped.
# * Any other truthy value strips space, tab, form feed and vertical tab.
# * A falsy value disables stripping.
#
# <tt>@escaped_strip</tt> holds the strip character(s) for use in patterns.
# <tt>@strip_value</tt> and <tt>@rstrip_value</tt> are built only when a
# quote character is set. Stripping requires the robust parsing strategy.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  @rstrip_value = nil
  return unless @strip

  if @strip.is_a?(String)
    if @strip.empty?
      raise ArgumentError, ":strip must not be an empty String"
    end
    if @strip.length > 1
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    build_pattern = lambda do |suffix|
      Regexp.new(@escaped_strip + suffix.encode(@encoding))
    end
  else
    whitespace = " \t\f\v"
    @escaped_strip = whitespace.encode(@encoding)
    build_pattern = lambda do |suffix|
      Regexp.new("[#{whitespace}]#{suffix}".encode(@encoding))
    end
  end
  if @quote_character
    @strip_value = build_pattern.call("+")
    @rstrip_value = build_pattern.call("+\\z")
  end
  @need_robust_parsing = true
end
559
+
560
# Whether StringScanner#scan accepts a String pattern on this Ruby.
# Probed once at load time: scanning with a String raises TypeError where
# it is not supported.
STRING_SCANNER_SCAN_ACCEPT_STRING =
  begin
    StringScanner.new("x").scan("x")
    true
  rescue TypeError
    false
  end
567
+
568
# Reads the column and row separators and builds the patterns derived from
# them.
#
# The column separator is converted with +to_s+, encoded, and must not be
# empty (ArgumentError otherwise). The row separator is resolved with
# +resolve_row_separator+.
#
# For a separator longer than one character, a list with one Regexp per
# character is built as well (<tt>@column_ends</tt>, <tt>@row_ends</tt>).
# For a one character column separator, <tt>@column_end</tt> is the plain
# String when StringScanner#scan accepts Strings.
def prepare_separators
  raw_column_separator = @options[:column_separator]
  @column_separator = raw_column_separator.to_s.encode(@encoding)
  if @column_separator.empty?
    raise ArgumentError,
          ":col_sep must be 1 or more characters: " +
          raw_column_separator.inspect
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  per_character_patterns = lambda do |separator|
    separator.each_char.map { |char| Regexp.new(Regexp.escape(char)) }
  end

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    @column_ends = per_character_patterns.call(@column_separator)
    @first_column_separators = Regexp.new(@escaped_first_column_separator +
                                          "+".encode(@encoding))
  else
    @column_end =
      if STRING_SCANNER_SCAN_ACCEPT_STRING
        @column_separator
      else
        Regexp.new(@escaped_column_separator)
      end
    @column_ends = nil
    @first_column_separators = nil
  end

  @row_end = Regexp.new(Regexp.escape(@row_separator))
  @row_ends =
    if @row_separator.size > 1
      per_character_patterns.call(@row_separator)
    end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
613
+
614
# Rejects (obviously) ambiguous combinations of the +col_sep+ and +strip+
# options: the column separator must neither start nor end with a character
# that would be stripped. For example, if +col_sep+ and +strip+ were both
# +\t+, there would be no clear way to parse the input.
#
# Raises ArgumentError on a conflict.
def validate_strip_and_col_sep_options
  return unless @strip

  if @strip.is_a?(String)
    conflicting = @column_separator.start_with?(@strip) ||
                  @column_separator.end_with?(@strip)
    strip_label = @escaped_strip
  else
    edge_pattern = Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z")
    conflicting = edge_pattern.match?(@column_separator)
    strip_label = "true"
  end
  return unless conflicting

  raise ArgumentError,
        "The provided strip (#{strip_label}) and " \
        "col_sep (#{@escaped_column_separator}) options are incompatible."
end
635
+
636
# Builds the patterns for quoted fields and the separator used to split a
# whole line into columns.
#
# With a quote character:
# * <tt>@quotes</tt> matches a run of quote characters.
# * <tt>@quoted_value</tt> matches a run of characters that are neither the
#   quote character nor (with backslash quoting) a backslash.
#
# <tt>@split_column_separator</tt> is what lines are split on:
# * With stripping, a Regexp matching the column separator together with
#   any strip characters around it.
# * Otherwise the column separator itself. A single space is turned into a
#   Regexp because String#split treats a " " String as "split on runs of
#   whitespace".
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character +
                         "+".encode(@encoding))
    no_quoted_values = @escaped_quote_character.dup
    if @backslash_quote
      no_quoted_values << @escaped_backslash_character
    end
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               no_quoted_values +
                               "]+".encode(@encoding))
  end
  if @escaped_strip
    if @strip.is_a?(String)
      # A single escaped character: "*" can follow it directly.
      strip_unit = @escaped_strip
    else
      # A set of several characters: it has to be wrapped in a character
      # class, otherwise "*" would only apply to the last character and the
      # others would be required literally.
      strip_unit = "[".encode(@encoding) +
                   @escaped_strip +
                   "]".encode(@encoding)
    end
    strip_run = strip_unit + "*".encode(@encoding)
    @split_column_separator = Regexp.new(strip_run +
                                         @escaped_column_separator +
                                         strip_run)
  elsif @column_separator == " ".encode(@encoding)
    @split_column_separator = Regexp.new(@escaped_column_separator)
  else
    @split_column_separator = @column_separator
  end
end
662
+
663
# Builds <tt>@unquoted_value</tt>, the pattern for a run of characters that
# can belong to an unquoted field: anything except CR, LF, the first
# character of the column separator and, unless liberal parsing is on, the
# quote character.
#
# Does nothing when there is no quote character.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding)
  excluded << @escaped_first_column_separator
  excluded << @escaped_quote_character unless @liberal_parsing
  @unquoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
end
675
+
676
# Returns the row separator to use, as a String in the parser's encoding.
#
# Any +separator+ other than +:auto+ is simply converted with +to_s+ and
# encoded. For +:auto+ the separator is detected from the input:
#
# * A StringIO is read completely and then moved back to where it was.
# * Any other input that responds to +gets+ is sampled chunk by chunk until
#   a line ending is found. The samples are remembered in
#   <tt>@samples</tt>.
#
# If nothing can be detected, InputRecordSeparator.value is used.
def resolve_row_separator(separator)
  return separator.to_s.encode(@encoding) unless separator == :auto

  cr = "\r".encode(@encoding)
  lf = "\n".encode(@encoding)
  if @input.is_a?(StringIO)
    original_position = @input.pos
    separator = detect_row_separator(@input.read, cr, lf)
    @input.seek(original_position)
  elsif @input.respond_to?(:gets)
    chunk_size = @input.is_a?(File) ? 32 * 1024 : 1024
    begin
      while separator == :auto
        # If we run out of data, it's probably a single line: fall through
        # to the default below.
        sample = @input.gets(nil, chunk_size)
        break unless sample

        # Extend the sample if we're unsure of the line ending: a trailing
        # CR may be the first half of CR LF.
        sample << (@input.gets(nil, 1) || "") if sample.end_with?(cr)

        @samples << sample

        separator = detect_row_separator(sample, cr, lf)
      end
    rescue IOError
      # Do nothing: the default below is used.
    end
  end
  separator = InputRecordSeparator.value if separator == :auto
  separator.to_s.encode(@encoding)
end
715
+
716
# Guesses the row separator from +sample+ by looking at its first line
# ending. +cr+ and +lf+ are the CR and LF characters in the sample's
# encoding.
#
# Returns:
# * <tt>cr + lf</tt> when the first LF is directly preceded by a CR,
# * +cr+ when a CR occurs (before the first LF, if there is one),
# * +lf+ when an LF occurs with no CR before it,
# * +:auto+ when the sample contains neither.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  # Only a CR in front of the first LF matters, so a found CR always comes
  # before the LF.
  before_first_lf = lf_index ? sample[0, lf_index] : sample
  cr_index = before_first_lf.index(cr)

  if cr_index.nil?
    lf_index ? lf : :auto
  elsif lf_index && cr_index + 1 == lf_index
    cr + lf
  else
    cr
  end
end
739
+
740
# Resets the line state: no line counted, no last line, no scanner yet.
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
745
+
746
# Returns the last line. When it has not been recorded yet and a scanner
# exists, it is taken from the scanner's current keep (+keep_end+) and
# remembered.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
753
+
754
# Reads the +:return_headers+ and +:headers+ options.
#
# * An Array is used as the raw headers; none of them counts as quoted.
# * A String is parsed as one CSV line to get the raw headers.
# * +nil+ or +false+ turns headers off.
# * Any other value turns headers on without setting them here.
#
# When raw headers are available they are converted into
# <tt>@headers</tt>; otherwise <tt>@headers</tt> is +nil+.
def prepare_header
  @return_headers = @options[:return_headers]

  spec = @options[:headers]
  quoted_fields = nil
  case spec
  when Array
    @raw_headers = spec
    quoted_fields = Array.new(spec.size, false)
  when String
    @raw_headers, quoted_fields = parse_headers(spec)
  else
    @raw_headers = nil
  end
  @use_headers = spec ? true : false

  @headers =
    if @raw_headers
      adjust_headers(@raw_headers, quoted_fields)
    end
end
779
+
780
# Parses the String +row+ as a single CSV line, using this parser's
# separators and quote character.
#
# Returns <tt>[headers, quoted_fields]</tt>, where +quoted_fields+ tells for
# each field whether it was quoted.
def parse_headers(row)
  quoted_fields = []
  # A pass-through converter that is only used to see each field's info.
  record_quoting = lambda do |field, info|
    quoted_fields << info.quoted?
    field
  end
  parsed = CSV.parse_line(row,
                          col_sep: @column_separator,
                          row_sep: @row_separator,
                          quote_char: @quote_character,
                          converters: [record_quoting])
  [parsed, quoted_fields]
end
793
+
794
# Runs +headers+ through the header fields converter and freezes every
# resulting String. Returns the converted headers.
def adjust_headers(headers, quoted_fields)
  converted = @header_fields_converter.convert(headers,
                                               nil,
                                               @lineno,
                                               quoted_fields)
  converted.each do |header|
    header.freeze if String === header
  end
  converted
end
799
+
800
# Caches the result of +may_quoted?+ in <tt>@may_quoted</tt>.
def prepare_parser
  @may_quoted = may_quoted?
end
803
+
804
# Tells whether the quote character occurs in the first 128 characters of
# the input.
#
# For a StringIO the whole input is read and the position restored
# afterwards. For any other input only the first collected sample is
# inspected.
#
# Returns +false+ when there is no quote character or no sample; otherwise
# the index of the first quote character (truthy) or +nil+.
def may_quoted?
  return false if @quote_character.nil?

  unless @input.is_a?(StringIO)
    return false if @samples.empty?

    return @samples.first[0, 128].index(@quote_character)
  end

  original_position = @input.pos
  content = @input.read
  @input.seek(original_position)
  content[0, 128].index(@quote_character)
end
817
+
818
# Wraps a String in a read-only binary-mode StringIO that keeps the
# String's encoding, and exposes only +gets+, +each_line+ and +eof?+.
# Because it is not a StringIO itself, code that special-cases StringIO
# inputs treats it like any other input.
class UnoptimizedStringIO # :nodoc:
  def initialize(string)
    @string_io = StringIO.new(string, "rb:#{string.encoding}")
  end

  def gets(*arguments)
    @string_io.gets(*arguments)
  end

  def each_line(*arguments, &block)
    @string_io.each_line(*arguments, &block)
  end

  def eof?
    @string_io.eof?
  end
end
835
+
836
# When the CSV_PARSER_SCANNER_TEST environment variable is "yes", parsing
# always goes through InputsScanner with a small chunk size (taken from
# CSV_PARSER_SCANNER_TEST_CHUNK_SIZE, default 1).
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
  SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]

  # Builds an InputsScanner over the collected samples followed by the
  # input. Strings are wrapped in UnoptimizedStringIO so that they are read
  # chunk by chunk.
  def build_scanner
    sources = @samples.map do |sample|
      UnoptimizedStringIO.new(sample)
    end
    sources <<
      if @input.is_a?(StringIO)
        UnoptimizedStringIO.new(@input.read)
      else
        @input
      end
    begin
      configured_size = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
    rescue # Ractor::IsolationError
      # Ractor on Ruby 3.0 can't read ENV value.
      configured_size = SCANNER_TEST_CHUNK_SIZE_VALUE
    end
    InputsScanner.new(sources,
                      @encoding,
                      @row_separator,
                      chunk_size: Integer((configured_size || "1"), 10))
  end
else
  # Builds the scanner for the input.
  #
  # When the complete data is available as one String (a StringIO input
  # with no samples taken, or a single sample of an input that is already
  # at its end), a Scanner over that String is returned, after checking its
  # encoding. Otherwise an InputsScanner over the samples followed by the
  # input is returned.
  #
  # Raises InvalidEncodingError for the first line with an invalid
  # encoding.
  def build_scanner
    whole =
      if @samples.empty? && @input.is_a?(StringIO)
        @input.read
      elsif @samples.size == 1 &&
            @input != ARGF &&
            @input.respond_to?(:eof?) &&
            @input.eof?
        @samples[0]
      end

    unless whole
      sources = @samples.map do |sample|
        StringIO.new(sample)
      end
      sources << @input
      return InputsScanner.new(sources, @encoding, @row_separator)
    end

    unless whole.valid_encoding?
      broken_index = whole.lines(@row_separator).index do |line|
        !line.valid_encoding?
      end
      if broken_index
        raise InvalidEncodingError.new(@encoding, @lineno + broken_index + 1)
      end
    end
    Scanner.new(whole)
  end
end
891
+
892
# Consumes the following lines for as long as they are lines to skip,
# counting each of them in <tt>@lineno</tt>. The first line that is not
# skipped is left unread. Does nothing when no skip matcher is set.
def skip_needless_lines
  return unless @skip_lines

  while !@scanner.eos?
    @scanner.keep_start
    candidate = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    candidate << @row_separator if parse_row_end
    unless skip_line?(candidate)
      # Not a line to skip: rewind to its beginning and stop.
      @scanner.keep_back
      return
    end
    @lineno += 1
    @scanner.keep_drop
  end
end
908
+
909
# Tells whether +line+ (with or without its row separator) is to be
# skipped. The row separator is removed before matching.
#
# A String matcher skips lines that contain it, a Regexp those it matches;
# any other matcher is asked through its +match+ method.
def skip_line?(line)
  content = line.delete_suffix(@row_separator)
  if @skip_lines.is_a?(String)
    content.include?(@skip_lines)
  elsif @skip_lines.is_a?(Regexp)
    @skip_lines.match?(content)
  else
    @skip_lines.match(content)
  end
end
920
+
921
# Raises MalformedCSVError when +field+ is longer than the configured
# maximum field size. The rest of the broken line is discarded first.
# Does nothing when no maximum is set.
def validate_field_size(field)
  return unless @max_field_size

  size = field.size
  return if size <= @max_field_size

  ignore_broken_line
  raise MalformedCSVError.new("Field size exceeded: #{size} > #{@max_field_size}",
                              @lineno)
end
928
+
929
# Parses input that has no quote character: every line is one row and is
# simply split on the column separator.
#
# Lines to skip are ignored, as are empty lines when +skip_blanks+ is set
# (an empty line otherwise gives an empty row). Empty fields become +nil+.
# The field size is validated when a maximum is set.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines && skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks

      row = []
      quoted_fields = []
    else
      row = strip_value(content).split(@split_column_separator, -1)
      quoted_fields = Array.new(row.size, false)
      row.each { |column| validate_field_size(column) } if @max_field_size
      row.map! { |column| column.empty? ? nil : column }
    end
    @last_line = raw_line
    emit_row(row, quoted_fields, &block)
  end
end
959
+
960
# Fast path for input that may contain quotes: every line is split on the
# column separator, and a field may only be quoted in the simplest way
# (exactly two quote characters, one at each end).
#
# As soon as a line needs more than that (a CR or LF inside the line, or
# any other use of the quote character), the scanner is rewound to the
# beginning of that line and parsing continues with
# +parse_quotable_robust+.
#
# A keep is open at the start of every line so that the line can be
# rewound, and so that the last line can be retrieved from the scanner.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |raw_line|
    if @skip_lines && skip_line?(raw_line)
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    content = raw_line.delete_suffix(@row_separator)

    if content.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      row = []
      quoted_fields = []
    elsif content.include?(@cr) || content.include?(@lf)
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      row = content.split(@split_column_separator, -1)
      quoted_fields = []
      row.each_with_index do |column, index|
        if column.empty?
          quoted_fields << false
          row[index] = nil
          next
        end

        n_quotes = column.count(@quote_character)
        if n_quotes.zero?
          # no quote
          quoted_fields << false
        elsif n_quotes == 2 &&
              column.start_with?(@quote_character) &&
              column.end_with?(@quote_character)
          quoted_fields << true
          row[index] = column[1..-2]
        else
          @scanner.keep_back
          @need_robust_parsing = true
          return parse_quotable_robust(&block)
        end
        validate_field_size(row[index])
      end
    end
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = raw_line
    emit_row(row, quoted_fields, &block)
  end
  @scanner.keep_drop
end
1020
+
1021
# Full parser: reads the input field by field, so that quoted fields may
# contain column separators, row separators and escaped quotes.
#
# After each field it expects, in this order of preference, a column
# separator (the row continues), a row separator (the row is emitted), or
# the end of the input (the last row is emitted). Anything else is a
# syntax error and raises MalformedCSVError after the rest of the broken
# line has been discarded.
#
# A row separator directly at the start of a row is a blank line; it gives
# an empty row unless +skip_blanks+ is set.
def parse_quotable_robust(&block)
  row = []
  quoted_fields = []
  skip_needless_lines
  start_row
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    field = parse_column_value
    if field
      @scanner.scan_all(@strip_value) if @strip_value
      validate_field_size(field)
    end

    if parse_column_end
      row << field
      quoted_fields << @quoted_column_value
    elsif parse_row_end
      if row.empty? && field.nil?
        emit_row([], [], &block) unless @skip_blanks
      else
        row << field
        quoted_fields << @quoted_column_value
        emit_row(row, quoted_fields, &block)
        row = []
        quoted_fields = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      break if row.empty? && field.nil?

      row << field
      quoted_fields << @quoted_column_value
      emit_row(row, quoted_fields, &block)
      break
    else
      message =
        if @quoted_column_value
          if liberal_parsing? && (new_line = @scanner.check(@line_end))
            "Illegal end-of-line sequence outside of a quoted field " \
            "<#{new_line.inspect}>"
          else
            "Any value after quoted field isn't allowed"
          end
        elsif @unquoted_column_value &&
              (new_line = @scanner.scan(@line_end))
          "Unquoted fields do not allow new line " \
          "<#{new_line.inspect}>"
        elsif @scanner.rest.start_with?(@quote_character)
          "Illegal quoting"
        elsif (new_line = @scanner.scan(@line_end))
          "New line must be <#{@row_separator.inspect}> " \
          "not <#{new_line.inspect}>"
        else
          "TODO: Meaningful message"
        end
      ignore_broken_line
      raise MalformedCSVError.new(message, @lineno)
    end
  end
end
1090
+
1091
# Reads the next column value by combining +parse_quoted_column_value+ and
# +parse_unquoted_column_value+, and returns whatever they produce.
#
# Without liberal parsing one parser is tried and the other is used as a
# fallback; @may_quoted decides which one goes first.
#
# With liberal parsing a quoted value may be followed by unquoted text. In
# that case the two parts are joined again, with the quote characters put
# back around the quoted part.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted_part = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted_part

  @scanner.scan_all(@strip_value) if @strip_value
  trailing_part = parse_unquoted_column_value
  return quoted_part unless trailing_part

  if @double_quote_outside_quote
    # Doubled quote characters in the unquoted tail collapse to one.
    trailing_part = trailing_part.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing_part if quoted_part.empty?
  end
  @quote_character + quoted_part + @quote_character + trailing_part
end
1120
+
1121
# Scans an unquoted column value from the scanner.
#
# Returns nil, leaving @unquoted_column_value untouched, when nothing
# matches @unquoted_value at the current position. Otherwise sets
# @unquoted_column_value to true and returns the scanned String.
#
# When @first_column_separators is set, text matching it is kept as part of
# the value as long as the complete @column_ends sequence does not match at
# that position. Afterwards the backslash-quote replacement (if
# @backslash_quote) and the @rstrip_value removal (if set) are applied to
# the value in place.
def parse_unquoted_column_value
  field = @scanner.scan_all(@unquoted_value)
  return nil unless field

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Look ahead for a complete column end, then rewind: the column end
      # itself is not consumed here.
      @scanner.keep_start
      at_column_end = @column_ends.all? { |column_end| @scanner.scan(column_end) }
      @scanner.keep_back
      break if at_column_end

      partial_separator = @scanner.scan_all(@first_column_separators)
      break if partial_separator.nil?
      field << partial_separator

      continuation = @scanner.scan_all(@unquoted_value)
      break if continuation.nil?
      field << continuation
    end
  end

  if @backslash_quote
    field.gsub!(@backslash_quote_character, @quote_character)
  end
  field.gsub!(@rstrip_value, "") if @rstrip_value
  field
end
1148
+
1149
# Scans a quoted column value from the scanner.
#
# Returns nil, leaving @quoted_column_value untouched, when the input does
# not start with a run matching @quotes. Otherwise sets
# @quoted_column_value to true and returns the field content as a String.
#
# The length of each run of quote characters decides how it is read:
# * an even-length opening run opens and closes the field at once; it
#   yields (length - 2) / 2 literal quote characters;
# * an odd-length opening run yields length / 2 literal quote characters
#   and leaves the field open;
# * inside the field, a run of length 1 closes it; a longer run appends
#   length / 2 literal quote characters and closes the field only when its
#   length is odd.
#
# With @backslash_quote set, a match of @escaped_backslash followed by a
# match of @escaped_quote appends @quote_character; without the second
# match @backslash_character is appended instead.
#
# Raises MalformedCSVError ("Unclosed quoted field") when no closing run of
# quote characters is found.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  opening_size = opening.size
  return opening[0, (opening_size - 2) / 2] if opening_size.even?

  field = opening[0, opening_size / 2]
  while true
    chunk = @scanner.scan_all(@quoted_value)
    field << chunk if chunk

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      if @scanner.scan(@escaped_quote)
        field << @quote_character
      else
        field << @backslash_character
      end
      next
    end

    quote_run = @scanner.scan_all(@quotes)
    unless quote_run
      ignore_broken_line
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno)
    end

    run_size = quote_run.size
    field << quote_run[0, run_size / 2] unless run_size == 1
    break if run_size.odd?
  end
  field
end
1190
+
1191
# Tries to consume a column end at the scanner's current position.
#
# @column_end is tried first. If it does not match and @column_ends is set,
# every pattern in @column_ends must match in sequence; on a partial match
# the scanner is rewound to where it started.
#
# Returns true when a column end was consumed, false otherwise.
def parse_column_end
  if @scanner.scan(@column_end)
    true
  elsif @column_ends
    @scanner.keep_start
    consumed = @column_ends.all? { |part| @scanner.scan(part) }
    # Commit the consumed input on success, rewind on failure.
    consumed ? @scanner.keep_drop : @scanner.keep_back
    consumed
  else
    false
  end
end
1204
+
1205
# Tries to consume a row end at the scanner's current position.
#
# @row_end is tried first. If it does not match and @row_ends is set, every
# pattern in @row_ends must match in sequence; on a partial match the
# scanner is rewound to where it started.
#
# Returns true when a row end was consumed, false otherwise.
def parse_row_end
  if @scanner.scan(@row_end)
    true
  elsif @row_ends
    @scanner.keep_start
    consumed = @row_ends.all? { |part| @scanner.scan(part) }
    # Commit the consumed input on success, rewind on failure.
    consumed ? @scanner.keep_drop : @scanner.keep_back
    consumed
  else
    false
  end
end
1217
+
1218
# Strips +value+ according to @strip and returns it.
#
# * @strip falsy, or +value+ nil: +value+ is returned unchanged.
# * @strip a String: every leading and trailing repetition of that string
#   is removed.
# * any other truthy @strip: surrounding whitespace is removed with
#   String#strip!.
#
# The stripping is done in place, so the caller's String is modified.
def strip_value(value)
  return value if !@strip || value.nil?

  unless String === @strip
    value.strip!
    return value
  end

  # delete_prefix!/delete_suffix! return nil once nothing was removed, which
  # ends the corresponding loop.
  nil while value.delete_prefix!(@strip)
  nil while value.delete_suffix!(@strip)
  value
end
1235
+
1236
# Skips what is left of the current line: first the text matching
# @not_line_end, then the text matching @line_end. @lineno is incremented
# afterwards and its new value is returned.
def ignore_broken_line
  [@not_line_end, @line_end].each do |pattern|
    @scanner.scan_all(pattern)
  end
  @lineno += 1
end
1241
+
1242
# Prepares the scanner for a new row.
#
# When @last_line is set it is cleared and the scanner's current keep is
# left alone; otherwise the current keep is dropped with +keep_drop+. In
# both cases a fresh keep is started with +keep_start+, whose result is
# returned.
def start_row
  pending_line = @last_line
  @last_line = nil if pending_line
  @scanner.keep_drop unless pending_line
  @scanner.keep_start
end
1250
+
1251
# Counts one more line in @lineno, converts +row+ and yields the result to
# the given block.
#
# * Without @use_headers the fields are run through @fields_converter and
#   the converted row is yielded.
# * With @use_headers and no @headers yet, +row+ becomes the header row via
#   +adjust_headers+. Nothing is yielded (and nil is returned) unless
#   @return_headers is set, in which case a Row built with +true+ as its
#   third argument is yielded (presumably marking it as a header row;
#   confirm against Row).
# * With @use_headers and known @headers, the converted fields are wrapped
#   in a Row.
#
# +quoted_fields+ is passed on to +adjust_headers+ and to the converter.
# When @unconverted_fields is set and the resulting row does not already
# respond to +unconverted_fields+, the original +row+ is attached to it with
# +add_unconverted_fields+.
def emit_row(row, quoted_fields, &block)
  @lineno += 1

  raw_row = row
  if !@use_headers
    # convert fields, if needed...
    row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
  elsif @headers.nil?
    @headers = adjust_headers(row, quoted_fields)
    return unless @return_headers
    row = Row.new(@headers, row, true)
  else
    row = Row.new(
      @headers,
      @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields)
    )
  end

  # inject unconverted fields and accessor, if requested...
  needs_accessor =
    @unconverted_fields && !row.respond_to?(:unconverted_fields)
  add_unconverted_fields(row, raw_row) if needs_accessor

  yield(row)
end
1276
+
1277
# Attaches the raw +fields+ to +row+: an <tt>unconverted_fields</tt> reader
# is defined on +row+'s singleton class only, and the instance variable
# <tt>@unconverted_fields</tt> behind it is set to +fields+.
#
# Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
1287
+ end
1288
+ end