csv 1.0.2 → 3.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/NEWS.md +868 -0
  3. data/README.md +6 -3
  4. data/doc/csv/arguments/io.rdoc +5 -0
  5. data/doc/csv/options/common/col_sep.rdoc +57 -0
  6. data/doc/csv/options/common/quote_char.rdoc +42 -0
  7. data/doc/csv/options/common/row_sep.rdoc +91 -0
  8. data/doc/csv/options/generating/force_quotes.rdoc +17 -0
  9. data/doc/csv/options/generating/quote_empty.rdoc +12 -0
  10. data/doc/csv/options/generating/write_converters.rdoc +25 -0
  11. data/doc/csv/options/generating/write_empty_value.rdoc +15 -0
  12. data/doc/csv/options/generating/write_headers.rdoc +29 -0
  13. data/doc/csv/options/generating/write_nil_value.rdoc +14 -0
  14. data/doc/csv/options/parsing/converters.rdoc +46 -0
  15. data/doc/csv/options/parsing/empty_value.rdoc +13 -0
  16. data/doc/csv/options/parsing/field_size_limit.rdoc +39 -0
  17. data/doc/csv/options/parsing/header_converters.rdoc +43 -0
  18. data/doc/csv/options/parsing/headers.rdoc +63 -0
  19. data/doc/csv/options/parsing/liberal_parsing.rdoc +38 -0
  20. data/doc/csv/options/parsing/nil_value.rdoc +12 -0
  21. data/doc/csv/options/parsing/return_headers.rdoc +22 -0
  22. data/doc/csv/options/parsing/skip_blanks.rdoc +31 -0
  23. data/doc/csv/options/parsing/skip_lines.rdoc +37 -0
  24. data/doc/csv/options/parsing/strip.rdoc +15 -0
  25. data/doc/csv/options/parsing/unconverted_fields.rdoc +27 -0
  26. data/doc/csv/recipes/filtering.rdoc +158 -0
  27. data/doc/csv/recipes/generating.rdoc +298 -0
  28. data/doc/csv/recipes/parsing.rdoc +545 -0
  29. data/doc/csv/recipes/recipes.rdoc +6 -0
  30. data/lib/csv/core_ext/array.rb +1 -1
  31. data/lib/csv/core_ext/string.rb +1 -1
  32. data/lib/csv/fields_converter.rb +89 -0
  33. data/lib/csv/input_record_separator.rb +18 -0
  34. data/lib/csv/parser.rb +1290 -0
  35. data/lib/csv/row.rb +505 -136
  36. data/lib/csv/table.rb +791 -114
  37. data/lib/csv/version.rb +1 -1
  38. data/lib/csv/writer.rb +210 -0
  39. data/lib/csv.rb +2432 -1329
  40. metadata +66 -13
  41. data/news.md +0 -112
data/lib/csv/parser.rb ADDED
@@ -0,0 +1,1290 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "input_record_separator"
6
+ require_relative "row"
7
+ require_relative "table"
8
+
9
+ class CSV
10
+ # Note: Don't use this class directly. This is an internal class.
11
+ class Parser
12
+ #
13
+ # A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
14
+ # or String object being read from or written to. Your data is never transcoded
15
+ # (unless you ask Ruby to transcode it for you) and will literally be parsed in
16
+ # the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
17
+ # Encoding of your data. This is accomplished by transcoding the parser itself
18
+ # into your Encoding.
19
+ #
20
+
21
+ # Raised when encoding is invalid.
22
+ class InvalidEncoding < StandardError
23
+ end
24
+
25
+ # Raised when an unexpected case happens.
26
+ class UnexpectedError < StandardError
27
+ end
28
+
29
+ #
30
+ # CSV::Scanner receives a CSV output, scans it and returns the content.
31
+ # It also controls the life cycle of the object with its methods +keep_start+,
32
+ # +keep_end+, +keep_back+, +keep_drop+.
33
+ #
34
+ # Uses StringScanner (the official strscan gem). Strscan provides lexical
35
+ # scanning operations on a String. We inherit its object and take advantage
36
+ # of its methods. For more information, please visit:
37
+ # https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
38
+ #
39
class Scanner < StringScanner
  # Expose StringScanner#scan under the +scan_all+ name as well, so this
  # scanner answers the same +scan_all+ call that InputsScanner defines.
  alias_method :scan_all, :scan

  # Accepts the same arguments as StringScanner.new and starts with an
  # empty stack of kept positions.
  def initialize(*args)
    super
    @keeps = []
  end

  # Yields each +row_separator+ terminated line of the unscanned rest of
  # the string, moving the scan pointer past a line before yielding it.
  #
  # Returns an Enumerator when no block is given, which is what
  # InputsScanner#each_line does too.
  def each_line(row_separator)
    return enum_for(__method__, row_separator) unless block_given?

    position = pos
    rest.each_line(row_separator) do |line|
      position += line.bytesize
      self.pos = position
      yield(line)
    end
  end

  # Remembers the current byte position on the keep stack.
  def keep_start
    @keeps.push(pos)
  end

  # Pops the most recent kept position and returns the bytes scanned
  # since then.
  def keep_end
    start = @keeps.pop
    string.byteslice(start, pos - start)
  end

  # Pops the most recent kept position and rewinds the scan pointer to it.
  def keep_back
    self.pos = @keeps.pop
  end

  # Pops the most recent kept position without using it.
  def keep_drop
    @keeps.pop
  end
end
73
+
74
+ #
75
+ # CSV::InputsScanner receives IO inputs, encoding and the chunk_size.
76
+ # It also controls the life cycle of the object with its methods +keep_start+,
77
+ # +keep_end+, +keep_back+, +keep_drop+.
78
+ #
79
+ # CSV::InputsScanner.scan() tries to match with pattern at the current position.
80
+ # If there's a match, the scanner advances the "scan pointer" and returns the matched string.
81
+ # Otherwise, the scanner returns nil.
82
+ #
83
+ # CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer).
84
+ # If there is no more data (eos? = true), it returns "".
85
+ #
86
class InputsScanner
  # inputs::        list of input objects; the list is duplicated here, so
  #                 the caller's Array itself is not shifted/unshifted.
  # encoding::      used to build the empty String that backs the scanner
  #                 once an input is exhausted.
  # row_separator:: passed to +gets+ when a chunk is read.
  # chunk_size::    size limit passed to +gets+ when a chunk is read.
  #
  # Reads the first chunk right away so that @scanner is set.
  def initialize(inputs, encoding, row_separator, chunk_size: 8192)
    @inputs = inputs.dup
    @encoding = encoding
    @row_separator = row_separator
    @chunk_size = chunk_size
    @last_scanner = @inputs.empty?
    # Each entry is [scanner, start position, buffer or nil].
    @keeps = []
    read_chunk
  end

  # Yields each +row_separator+ terminated line, reading further chunks
  # as needed. Returns an Enumerator when no block is given.
  def each_line(row_separator)
    return enum_for(__method__, row_separator) unless block_given?
    # Holds a trailing piece that did not end with the row separator yet.
    buffer = nil
    input = @scanner.rest
    position = @scanner.pos
    offset = 0
    n_row_separator_chars = row_separator.size
    # trace(__method__, :start, input)
    while true
      input.each_line(row_separator) do |line|
        @scanner.pos += line.bytesize
        if buffer
          if n_row_separator_chars == 2 and
            buffer.end_with?(row_separator[0]) and
            line.start_with?(row_separator[1])
            # A 2 character row separator was split between the buffered
            # piece and this piece: complete the buffered line with the
            # second separator character and yield it.
            buffer << line[0]
            line = line[1..-1]
            position += buffer.bytesize + offset
            @scanner.pos = position
            offset = 0
            yield(buffer)
            buffer = nil
            next if line.empty?
          else
            # Continue the buffered piece with this piece.
            buffer << line
            line = buffer
            buffer = nil
          end
        end
        if line.end_with?(row_separator)
          position += line.bytesize + offset
          @scanner.pos = position
          offset = 0
          yield(line)
        else
          # Incomplete line: keep it until more data has been seen.
          buffer = line
        end
      end
      break unless read_chunk
      input = @scanner.rest
      position = @scanner.pos
      # The buffered bytes came from an earlier chunk, so they must not
      # be counted when the position in the new chunk is advanced.
      offset = -buffer.bytesize if buffer
    end
    # Whatever is left has no trailing row separator.
    yield(buffer) if buffer
  end

  # Scans +pattern+ at the current position of the current chunk and
  # returns the match (or nil). When the match consumed the chunk and more
  # input may follow, the next chunk is loaded.
  def scan(pattern)
    # trace(__method__, pattern, :start)
    value = @scanner.scan(pattern)
    # trace(__method__, pattern, :done, :last, value) if @last_scanner
    return value if @last_scanner

    read_chunk if value and @scanner.eos?
    # trace(__method__, pattern, :done, value)
    value
  end

  # Like #scan, but while a match ends exactly at the end of a chunk the
  # next chunk is read and scanned with the same pattern, and the matches
  # are concatenated.
  def scan_all(pattern)
    # trace(__method__, pattern, :start)
    value = @scanner.scan(pattern)
    # trace(__method__, pattern, :done, :last, value) if @last_scanner
    return value if @last_scanner

    # trace(__method__, pattern, :done, :nil) if value.nil?
    return nil if value.nil?
    while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
      # trace(__method__, pattern, :sub, sub_value)
      value << sub_value
    end
    # trace(__method__, pattern, :done, value)
    value
  end

  # True when the current chunk has been fully scanned.
  def eos?
    @scanner.eos?
  end

  # Pushes a new keep for the current scanner and position. The data
  # scanned so far for the previous keep (if any) is saved into its
  # buffer first.
  def keep_start
    # trace(__method__, :start)
    adjust_last_keep
    @keeps.push([@scanner, @scanner.pos, nil])
    # trace(__method__, :done)
  end

  # Pops the last keep and returns the data scanned since it was started:
  # the buffered data (if any) followed by the part taken from the
  # current chunk.
  def keep_end
    # trace(__method__, :start)
    scanner, start, buffer = @keeps.pop
    if scanner == @scanner
      keep = @scanner.string.byteslice(start, @scanner.pos - start)
    else
      # The keep was started on another chunk: everything scanned in the
      # current chunk belongs to it.
      keep = @scanner.string.byteslice(0, @scanner.pos)
    end
    if buffer
      buffer << keep
      keep = buffer
    end
    # trace(__method__, :done, keep)
    keep
  end

  # Pops the last keep and rewinds to where it was started. Without a
  # buffer this is a plain reposition. With a buffer, the buffered data
  # becomes the new current chunk and data of the current chunk is pushed
  # back to the front of the inputs as a StringIO.
  #
  # Raises UnexpectedError when the keep belongs to another chunk but has
  # no buffer.
  def keep_back
    # trace(__method__, :start)
    scanner, start, buffer = @keeps.pop
    if buffer
      # trace(__method__, :rescan, start, buffer)
      string = @scanner.string
      if scanner == @scanner
        # NOTE(review): the length subtracts both @scanner.pos and start
        # from the byte size - confirm this is the intended slice.
        keep = string.byteslice(start,
                                string.bytesize - @scanner.pos - start)
      else
        keep = string
      end
      if keep and not keep.empty?
        @inputs.unshift(StringIO.new(keep))
        @last_scanner = false
      end
      @scanner = StringScanner.new(buffer)
    else
      if @scanner != scanner
        message = "scanners are different but no buffer: "
        message += "#{@scanner.inspect}(#{@scanner.object_id}): "
        message += "#{scanner.inspect}(#{scanner.object_id})"
        raise UnexpectedError, message
      end
      # trace(__method__, :repos, start, buffer)
      @scanner.pos = start
    end
    read_chunk if @scanner.eos?
  end

  # Pops the last keep. If it had buffered data and there is an outer
  # keep, the buffered data is handed over to the outer keep.
  def keep_drop
    _, _, buffer = @keeps.pop
    # trace(__method__, :done, :empty) unless buffer
    return unless buffer

    last_keep = @keeps.last
    # trace(__method__, :done, :no_last_keep) unless last_keep
    return unless last_keep

    if last_keep[2]
      last_keep[2] << buffer
    else
      last_keep[2] = buffer
    end
    # trace(__method__, :done)
  end

  # Returns the unscanned rest of the current chunk only.
  def rest
    @scanner.rest
  end

  # Matches +pattern+ against the current chunk without advancing.
  def check(pattern)
    @scanner.check(pattern)
  end

  private
  # Debug helper used by the commented out trace calls in this class.
  def trace(*args)
    pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
  end

  # Appends the data that the current scanner holds for the last keep to
  # that keep's buffer (creating the buffer if needed). Does nothing when
  # there is no keep.
  def adjust_last_keep
    # trace(__method__, :start)

    keep = @keeps.last
    # trace(__method__, :done, :empty) if keep.nil?
    return if keep.nil?

    scanner, start, buffer = keep
    string = @scanner.string
    if @scanner != scanner
      # The keep was started on an earlier chunk, so the whole scanned
      # part of the current chunk belongs to it.
      start = 0
    end
    if start == 0 and @scanner.eos?
      keep_data = string
    else
      keep_data = string.byteslice(start, @scanner.pos - start)
    end
    if keep_data
      if buffer
        buffer << keep_data
      else
        keep[2] = keep_data.dup
      end
    end

    # trace(__method__, :done)
  end

  # Replaces @scanner with a scanner over the next chunk of input.
  #
  # Returns false when the last input was already reached or when no more
  # data could be read from any remaining input, true otherwise.
  # Raises InvalidEncoding when the data read is not valid in its
  # encoding.
  def read_chunk
    return false if @last_scanner

    # Save what the current chunk contributes to the pending keep before
    # the chunk is replaced.
    adjust_last_keep

    input = @inputs.first
    case input
    when StringIO
      # A StringIO is read in one go and is then done.
      string = input.read
      raise InvalidEncoding unless string.valid_encoding?
      # trace(__method__, :stringio, string)
      @scanner = StringScanner.new(string)
      @inputs.shift
      @last_scanner = @inputs.empty?
      true
    else
      chunk = input.gets(@row_separator, @chunk_size)
      if chunk
        raise InvalidEncoding unless chunk.valid_encoding?
        # trace(__method__, :chunk, chunk)
        @scanner = StringScanner.new(chunk)
        if input.respond_to?(:eof?) and input.eof?
          @inputs.shift
          @last_scanner = @inputs.empty?
        end
        true
      else
        # This input is exhausted: scan an empty String and move on to
        # the next input, if any.
        # trace(__method__, :no_chunk)
        @scanner = StringScanner.new("".encode(@encoding))
        @inputs.shift
        @last_scanner = @inputs.empty?
        if @last_scanner
          false
        else
          read_chunk
        end
      end
    end
  end
end
325
+
326
# input::   the object to parse from (stored as is).
# options:: Hash of parser options read by the prepare_* methods.
#
# All derived state is set up by #prepare before this returns.
def initialize(input, options)
  @input = input
  @options = options
  # Data read from the input while detecting the row separator.
  @samples = []

  prepare
end
333
+
334
# The column separator String set up by prepare_separators
# (the :column_separator option, encoded in the parser's encoding).
def column_separator
  @column_separator
end
337
+
338
# The row separator String set up by prepare_separators
# (resolved from the :row_separator option).
def row_separator
  @row_separator
end
341
+
342
# The quote character set up by prepare_quote_character, or nil when
# the :quote_character option is nil.
def quote_character
  @quote_character
end
345
+
346
# The maximum field size plus one, or nil when no maximum field size
# is set.
def field_size_limit
  @max_field_size&.succ
end
349
+
350
# The value of the :max_field_size option (nil when not given).
def max_field_size
  @max_field_size
end
353
+
354
# The line skipping matcher set up by prepare_skip_lines: a String,
# a Regexp, another object that responds to #match, or nil.
def skip_lines
  @skip_lines
end
357
+
358
# The value of the :unconverted_fields option.
def unconverted_fields?
  @unconverted_fields
end
361
+
362
# The converted headers, or nil while no headers are known
# (see prepare_header).
def headers
  @headers
end
365
+
366
# Truthy when headers are in use but have not been set yet.
def header_row?
  @use_headers and @headers.nil?
end
369
+
370
# The value of the :return_headers option.
def return_headers?
  @return_headers
end
373
+
374
# The value of the :skip_blanks option.
def skip_blanks?
  @skip_blanks
end
377
+
378
# true when the :liberal_parsing option is truthy, false otherwise
# (see prepare_variable).
def liberal_parsing?
  @liberal_parsing
end
381
+
382
# The parser's current line counter; prepare_line resets it to 0.
def lineno
  @lineno
end
385
+
386
# Public name for the private #last_line.
def line
  last_line
end
389
+
390
# Parses the input and yields every row to the block; returns an
# Enumerator when no block is given.
#
# When headers were given up front and :return_headers is set, the header
# row is yielded first. Then one of the three parsing strategies runs:
# no quote character, robust, or loose.
#
# InvalidEncoding and UnexpectedError raised while parsing are turned into
# MalformedCSVError carrying the line number.
def parse(&block)
  return to_enum(__method__) unless block_given?

  if @return_headers and @headers and @raw_headers
    header_row = Row.new(@headers, @raw_headers, true)
    header_row = add_unconverted_fields(header_row, []) if @unconverted_fields
    yield header_row
  end

  begin
    @scanner ||= build_scanner
    if quote_character.nil?
      parse_no_quote(&block)
    elsif @need_robust_parsing
      parse_quotable_robust(&block)
    else
      parse_quotable_loose(&block)
    end
  rescue InvalidEncoding, UnexpectedError => error
    # Without a scanner the failure happened while building it, so the
    # broken line is the one after the last counted line.
    if @scanner
      ignore_broken_line
      broken_lineno = @lineno
    else
      broken_lineno = @lineno + 1
    end
    message =
      if error.is_a?(InvalidEncoding)
        "Invalid byte sequence in #{@encoding}"
      else
        "This should not be happen: #{error.message}: " +
          "Please report this to https://github.com/ruby/csv/issues"
      end
    raise MalformedCSVError.new(message, broken_lineno)
  end
end
431
+
432
# true unless the :headers option is nil or false (see prepare_header).
def use_headers?
  @use_headers
end
435
+
436
+ private
437
+ # A set of tasks to prepare the file in order to parse it
438
# Runs every prepare_*/validate_* step once, in this order. The order
# matters: later steps read instance variables set by earlier ones
# (for example prepare_quoted reads what prepare_strip and
# prepare_separators set up).
def prepare
  prepare_variable
  prepare_quote_character
  prepare_backslash
  prepare_skip_lines
  prepare_strip
  prepare_separators
  validate_strip_and_col_sep_options
  prepare_quoted
  prepare_unquoted
  prepare_line
  prepare_header
  prepare_parser
end
452
+
453
# Copies the plain options into instance variables and decodes
# :liberal_parsing, which may be a Hash carrying the
# :double_quote_outside_quote and :backslash_quote sub-options.
# Any truthy :liberal_parsing forces robust parsing.
def prepare_variable
  @need_robust_parsing = false
  @encoding = @options[:encoding]

  liberal = @options[:liberal_parsing]
  @liberal_parsing = liberal ? true : false
  @backslash_quote = false
  if liberal
    if liberal.is_a?(Hash)
      @double_quote_outside_quote = liberal[:double_quote_outside_quote]
      @backslash_quote = liberal[:backslash_quote]
    else
      @double_quote_outside_quote = false
    end
    @need_robust_parsing = true
  end

  @unconverted_fields = @options[:unconverted_fields]
  @max_field_size = @options[:max_field_size]
  @skip_blanks = @options[:skip_blanks]
  @fields_converter = @options[:fields_converter]
  @header_fields_converter = @options[:header_fields_converter]
end
478
+
479
# Sets up @quote_character together with its Regexp-escaped String and
# Regexp forms. A nil :quote_character disables quoting: all three are
# nil then.
#
# Raises ArgumentError unless the quote character is exactly one
# character long.
def prepare_quote_character
  @quote_character = @options[:quote_character]
  if @quote_character
    @quote_character = @quote_character.to_s.encode(@encoding)
    unless @quote_character.length == 1
      raise ArgumentError,
            ":quote_char has to be nil or a single character String"
    end
    @escaped_quote_character = Regexp.escape(@quote_character)
    @escaped_quote = Regexp.new(@escaped_quote_character)
  elsif @quote_character.nil?
    @escaped_quote_character = nil
    @escaped_quote = nil
  else
    # false is not nil, so like any other value it goes through to_s
    # and the length check.
    @quote_character = @quote_character.to_s.encode(@encoding)
    unless @quote_character.length == 1
      raise ArgumentError,
            ":quote_char has to be nil or a single character String"
    end
    @escaped_quote_character = Regexp.escape(@quote_character)
    @escaped_quote = Regexp.new(@escaped_quote_character)
  end
end
494
+
495
# Sets up the backslash related values. Does nothing unless backslash
# quoting was enabled by prepare_variable.
#
# @backslash_quote_character is the backslash followed by the escaped
# quote character, or nil when there is no quote character.
def prepare_backslash
  return unless @backslash_quote

  @backslash_character = "\\".encode(@encoding)
  @escaped_backslash_character = Regexp.escape(@backslash_character)
  @escaped_backslash = Regexp.new(@escaped_backslash_character)
  @backslash_quote_character =
    @quote_character.nil? ? nil : @backslash_character + @escaped_quote_character
end
509
+
510
# Normalizes the :skip_lines option into @skip_lines. A String is encoded
# in the parser's encoding; a Regexp, nil, or any object that responds to
# #match is stored as is.
#
# Raises ArgumentError for any other value.
def prepare_skip_lines
  matcher = @options[:skip_lines]
  @skip_lines =
    if matcher.is_a?(String)
      matcher.encode(@encoding)
    else
      unless matcher.nil? or matcher.is_a?(Regexp) or matcher.respond_to?(:match)
        raise ArgumentError,
              ":skip_lines has to respond to \#match: #{matcher.inspect}"
      end
      matcher
    end
end
526
+
527
# Sets up stripping from the :strip option.
#
# * falsy: nothing is stripped; all strip values stay nil.
# * a one character String: that character is stripped. @escaped_strip is
#   its Regexp-escaped form.
# * any other truthy value: space, tab, form feed and vertical tab are
#   stripped. @escaped_strip then holds those four characters as they are
#   (it is not a character class by itself).
#
# The strip Regexps are only built when there is a quote character.
# Stripping always forces robust parsing.
#
# Raises ArgumentError for an empty String or a String longer than one
# character.
def prepare_strip
  @strip = @options[:strip]
  @escaped_strip = nil
  @strip_value = nil
  @rstrip_value = nil
  return unless @strip

  if @strip.is_a?(String)
    raise ArgumentError, ":strip must not be an empty String" if @strip.empty?
    unless @strip.length == 1
      raise ArgumentError, ":strip doesn't support 2 or more characters yet"
    end
    @strip = @strip.encode(@encoding)
    @escaped_strip = Regexp.escape(@strip)
    build_pattern = lambda do |suffix|
      Regexp.new(@escaped_strip + suffix.encode(@encoding))
    end
  else
    strip_values = " \t\f\v"
    @escaped_strip = strip_values.encode(@encoding)
    build_pattern = lambda do |suffix|
      Regexp.new("[#{strip_values}]#{suffix}".encode(@encoding))
    end
  end

  if @quote_character
    @strip_value = build_pattern.call("+")
    @rstrip_value = build_pattern.call("+\\z")
  end
  @need_robust_parsing = true
end
560
+
561
# Whether StringScanner#scan accepts a String pattern in the running
# strscan: probing with a String raises TypeError when it does not.
STRING_SCANNER_SCAN_ACCEPT_STRING =
  begin
    StringScanner.new("x").scan("x")
    true
  rescue TypeError
    false
  end
568
+
569
# Sets up the column and row separators and the patterns derived from
# them.
#
# For a multi character column separator, @column_ends holds one Regexp
# per character and @first_column_separators matches a run of its first
# character; both are nil for a single character separator. @row_ends is
# the per character list for a multi character row separator, else nil.
#
# Raises ArgumentError when the column separator is empty. That check
# happens before the row separator is resolved.
def prepare_separators
  given_column_separator = @options[:column_separator]
  @column_separator = given_column_separator.to_s.encode(@encoding)
  if @column_separator.size < 1
    raise ArgumentError,
          ":col_sep must be 1 or more characters: " +
            given_column_separator.inspect
  end
  @row_separator =
    resolve_row_separator(@options[:row_separator]).encode(@encoding)

  @escaped_column_separator = Regexp.escape(@column_separator)
  @escaped_first_column_separator = Regexp.escape(@column_separator[0])
  if @column_separator.size > 1
    @column_end = Regexp.new(@escaped_column_separator)
    @column_ends = @column_separator.each_char.map do |char|
      Regexp.new(Regexp.escape(char))
    end
    @first_column_separators =
      Regexp.new(@escaped_first_column_separator + "+".encode(@encoding))
  else
    # Scan with the plain String when strscan supports it.
    @column_end =
      if STRING_SCANNER_SCAN_ACCEPT_STRING
        @column_separator
      else
        Regexp.new(@escaped_column_separator)
      end
    @column_ends = nil
    @first_column_separators = nil
  end

  @row_end = Regexp.new(Regexp.escape(@row_separator))
  @row_ends =
    if @row_separator.size > 1
      @row_separator.each_char.map { |char| Regexp.new(Regexp.escape(char)) }
    end

  @cr = "\r".encode(@encoding)
  @lf = "\n".encode(@encoding)
  @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
  @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
end
614
+
615
+ # This method verifies that there are no (obvious) ambiguities with the
616
+ # provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
617
+ # and +strip+ were both equal to +\t+, then there would be no clear way to
618
+ # parse the input.
619
# Raises ArgumentError when the column separator starts or ends with
# something that stripping would remove: the strip character for a String
# +strip+, or one of the characters in @escaped_strip otherwise.
# Does nothing when stripping is off.
def validate_strip_and_col_sep_options
  return unless @strip

  if @strip.is_a?(String)
    ambiguous = @column_separator.start_with?(@strip) ||
                @column_separator.end_with?(@strip)
    strip_label = @escaped_strip
  else
    edge_pattern = Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z")
    ambiguous = edge_pattern.match?(@column_separator)
    strip_label = "true"
  end
  return unless ambiguous

  raise ArgumentError,
        "The provided strip (#{strip_label}) and " \
        "col_sep (#{@escaped_column_separator}) options are incompatible."
end
636
+
637
# Builds the patterns for quoted values (only when there is a quote
# character) and the separator used to split a whole line into columns.
#
# @quotes::       a run of quote characters.
# @quoted_value:: a run of characters that are neither the quote
#                 character nor, with backslash quoting, the backslash.
# @split_column_separator::
#   with stripping, a Regexp of the column separator surrounded by
#   optional @escaped_strip; a Regexp for a single space separator;
#   otherwise the column separator String itself.
def prepare_quoted
  if @quote_character
    @quotes = Regexp.new(@escaped_quote_character + "+".encode(@encoding))
    excluded =
      if @backslash_quote
        @escaped_quote_character + @escaped_backslash_character
      else
        @escaped_quote_character
      end
    @quoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
  end

  @split_column_separator =
    if @escaped_strip
      # NOTE(review): for a non-String strip, @escaped_strip is the raw
      # list of strip characters, so "*" here applies to its last
      # character only - confirm that is intended.
      Regexp.new(@escaped_strip +
                 "*".encode(@encoding) +
                 @escaped_column_separator +
                 @escaped_strip +
                 "*".encode(@encoding))
    elsif @column_separator == " ".encode(@encoding)
      Regexp.new(@escaped_column_separator)
    else
      @column_separator
    end
end
663
+
664
# Builds @unquoted_value: a run of characters that are not CR, LF or the
# first character of the column separator, and - unless liberal parsing
# is on - not the quote character either.
# Does nothing when there is no quote character.
def prepare_unquoted
  return if @quote_character.nil?

  excluded = "\r\n".encode(@encoding) + @escaped_first_column_separator
  excluded += @escaped_quote_character unless @liberal_parsing
  @unquoted_value = Regexp.new("[^".encode(@encoding) +
                               excluded +
                               "]+".encode(@encoding))
end
676
+
677
# Returns the row separator as a String in the parser's encoding.
#
# A separator other than :auto is returned as is (after +to_s+). For :auto
# the input is inspected:
#
# * a StringIO is read completely and then rewound to where it was;
# * any other input that responds to +gets+ is read in chunks until a
#   line ending is found. The chunks are collected in @samples; this
#   method does not rewind such an input.
#
# When nothing can be detected, InputRecordSeparator.value is used.
def resolve_row_separator(separator)
  if separator == :auto
    cr = "\r".encode(@encoding)
    lf = "\n".encode(@encoding)
    if @input.is_a?(StringIO)
      pos = @input.pos
      separator = detect_row_separator(@input.read, cr, lf)
      @input.seek(pos)
    elsif @input.respond_to?(:gets)
      if @input.is_a?(File)
        chunk_size = 32 * 1024
      else
        chunk_size = 1024
      end
      begin
        while separator == :auto
          #
          # if we run out of data, it's probably a single line
          # (the fallback after this loop will set the default value)
          #
          break unless sample = @input.gets(nil, chunk_size)

          # extend sample if we're unsure of the line ending:
          # a trailing CR may be the first half of CR LF
          if sample.end_with?(cr)
            sample << (@input.gets(nil, 1) || "")
          end

          @samples << sample

          separator = detect_row_separator(sample, cr, lf)
        end
      rescue IOError
        # do nothing: the fallback below will set the default
      end
    end
    separator = InputRecordSeparator.value if separator == :auto
  end
  separator.to_s.encode(@encoding)
end
716
+
717
# Guesses the row separator from the first line ending in +sample+.
#
# sample:: the text to inspect.
# cr, lf:: the carriage return and line feed Strings to look for.
#
# Only a CR that appears before the first LF is considered. Returns
# <tt>cr + lf</tt> when that CR is directly followed by the first LF,
# +cr+ or +lf+ when only one kind is decisive, and :auto when the sample
# contains neither.
def detect_row_separator(sample, cr, lf)
  lf_index = sample.index(lf)
  unless lf_index
    return sample.index(cr) ? cr : :auto
  end

  cr_index = sample[0, lf_index].index(cr)
  return lf unless cr_index
  return cr + lf if cr_index + 1 == lf_index

  cr_index < lf_index ? cr : lf
end
740
+
741
# Resets the line bookkeeping: no lines counted, no last line, and no
# scanner yet (#parse builds it on first use).
def prepare_line
  @lineno = 0
  @last_line = @scanner = nil
end
746
+
747
# Returns the last line. Without a scanner this is whatever @last_line
# holds. With a scanner, a missing @last_line is taken from the scanner's
# current keep (which pops that keep) and remembered.
def last_line
  return @last_line unless @scanner

  @last_line ||= @scanner.keep_end
end
754
+
755
# Sets up header handling from the :headers and :return_headers options.
#
# * Array: used as the raw headers; no field counts as quoted.
# * String: parsed as one CSV row to get the raw headers.
# * nil or false: headers are not used.
# * anything else: headers are used but are not known yet.
#
# Known raw headers are converted with adjust_headers; otherwise
# @headers is nil.
def prepare_header
  @return_headers = @options[:return_headers]

  given = @options[:headers]
  quoted_fields = nil
  if given.is_a?(Array)
    @raw_headers = given
    quoted_fields = [false] * @raw_headers.size
  elsif given.is_a?(String)
    @raw_headers, quoted_fields = parse_headers(given)
  else
    @raw_headers = nil
  end
  @use_headers = given ? true : false

  @headers = @raw_headers ? adjust_headers(@raw_headers, quoted_fields) : nil
end
780
+
781
# Parses +row+ (a header line given as a String) with this parser's
# separators and quote character.
#
# Returns <tt>[headers, quoted_fields]</tt>, where +quoted_fields+ has
# one <tt>quoted?</tt> flag per field, recorded by a converter that leaves
# the field values untouched.
def parse_headers(row)
  quoted_flags = []
  recorder = lambda do |field, field_info|
    quoted_flags << field_info.quoted?
    field
  end
  parsed = CSV.parse_line(row,
                          col_sep: @column_separator,
                          row_sep: @row_separator,
                          quote_char: @quote_character,
                          converters: [recorder])
  [parsed, quoted_flags]
end
794
+
795
# Runs +headers+ through the header fields converter and freezes every
# String among the results. Returns the converted headers.
def adjust_headers(headers, quoted_fields)
  converted =
    @header_fields_converter.convert(headers, nil, @lineno, quoted_fields)
  converted.each do |header|
    header.freeze if header.is_a?(String)
  end
end
800
+
801
# Remembers whether the input looks like it contains quotes; see
# may_quoted? for how that is decided. parse_column_value uses the
# result to choose which kind of value to try first.
def prepare_parser
  @may_quoted = may_quoted?
end
804
+
805
# Tells whether the quote character appears within the first 128
# characters of the data available so far.
#
# Returns false when there is no quote character or (for inputs other
# than StringIO) when no sample was collected; otherwise the index of the
# first quote character, or nil when there is none.
#
# A StringIO input is read from its current position and then put back
# there; for other inputs the first collected sample is inspected.
def may_quoted?
  return false if @quote_character.nil?

  if @input.is_a?(StringIO)
    saved_position = @input.pos
    sample = @input.read
    @input.seek(saved_position)
  else
    return false if @samples.empty?
    sample = @samples.first
  end
  head = sample[0, 128]
  head.index(@quote_character)
end
818
+
819
# Wraps a String in a StringIO but exposes only #gets, #each_line and
# #eof?. As it is not a StringIO itself, code that checks for StringIO
# treats it like any other input.
class UnoptimizedStringIO # :nodoc:
  def initialize(string)
    @io = StringIO.new(string, "rb:#{string.encoding}")
  end

  # Both readers hand their arguments straight to the wrapped StringIO.
  [:gets, :each_line].each do |reader|
    define_method(reader) do |*args, &block|
      @io.public_send(reader, *args, &block)
    end
  end

  def eof?
    @io.eof?
  end
end
836
+
837
# Set the CSV_PARSER_SCANNER_TEST environment variable to "yes" to always
# go through InputsScanner, with a chunk size taken from
# CSV_PARSER_SCANNER_TEST_CHUNK_SIZE (1 when that is not set).
SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
if SCANNER_TEST
  SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
  # Captured at load time as a fallback for when ENV can't be read later.
  SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
  # Test variant: always builds an InputsScanner. The collected samples
  # and a StringIO input are wrapped in UnoptimizedStringIO.
  def build_scanner
    inputs = @samples.collect do |sample|
      UnoptimizedStringIO.new(sample)
    end
    if @input.is_a?(StringIO)
      inputs << UnoptimizedStringIO.new(@input.read)
    else
      inputs << @input
    end
    begin
      chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
    rescue # Ractor::IsolationError
      # Ractor on Ruby 3.0 can't read ENV value.
      chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE
    end
    chunk_size = Integer((chunk_size_value || "1"), 10)
    InputsScanner.new(inputs,
                      @encoding,
                      @row_separator,
                      chunk_size: chunk_size)
  end
else
  # Regular variant: uses a plain Scanner when the whole data is available
  # as one String, and an InputsScanner over the samples followed by the
  # input otherwise.
  #
  # Raises MalformedCSVError when the whole-String data has an invalid
  # byte sequence, reporting the first line that is invalid.
  def build_scanner
    string = nil
    if @samples.empty? and @input.is_a?(StringIO)
      string = @input.read
    elsif @samples.size == 1 and
          @input != ARGF and
          @input.respond_to?(:eof?) and
          @input.eof?
      # The single sample already holds everything the input had.
      string = @samples[0]
    end
    if string
      unless string.valid_encoding?
        index = string.lines(@row_separator).index do |line|
          !line.valid_encoding?
        end
        if index
          message = "Invalid byte sequence in #{@encoding}"
          raise MalformedCSVError.new(message, @lineno + index + 1)
        end
      end
      Scanner.new(string)
    else
      inputs = @samples.collect do |sample|
        StringIO.new(sample)
      end
      inputs << @input
      InputsScanner.new(inputs, @encoding, @row_separator)
    end
  end
end
893
+
894
# Consumes leading lines that match the :skip_lines setting, counting
# each of them in @lineno. Stops at the first line that is not to be
# skipped and rewinds the scanner to the start of that line.
# Does nothing when no :skip_lines setting is in effect.
def skip_needless_lines
  return unless @skip_lines

  until @scanner.eos?
    @scanner.keep_start
    line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
    line << @row_separator if parse_row_end
    unless skip_line?(line)
      @scanner.keep_back
      return
    end
    @lineno += 1
    @scanner.keep_drop
  end
end
910
+
911
# Tells whether +line+ is to be skipped. A trailing row separator is
# ignored for the decision.
#
# A String setting matches when the line includes it; a Regexp is tested
# with +match?+; any other setting is asked through its +match+ method
# and that result is returned as is.
def skip_line?(line)
  content = line.delete_suffix(@row_separator)
  return content.include?(@skip_lines) if @skip_lines.is_a?(String)
  return @skip_lines.match?(content) if @skip_lines.is_a?(Regexp)

  @skip_lines.match(content)
end
922
+
923
# Raises MalformedCSVError when +field+ is larger than the maximum field
# size; the broken line is skipped first. Does nothing when no maximum
# field size is set.
def validate_field_size(field)
  return unless @max_field_size

  unless field.size <= @max_field_size
    ignore_broken_line
    raise MalformedCSVError.new(
      "Field size exceeded: #{field.size} > #{@max_field_size}",
      @lineno
    )
  end
end
930
+
931
# Parsing strategy for when there is no quote character: every line is
# one row and is simply split on the column separator.
#
# Lines matching :skip_lines are ignored, blank lines too when
# :skip_blanks is set (otherwise they produce an empty row). Empty fields
# become nil; field sizes are validated before that.
def parse_no_quote(&block)
  @scanner.each_line(@row_separator) do |raw_line|
    next if @skip_lines and skip_line?(raw_line)

    content = raw_line.delete_suffix(@row_separator)
    if content.empty?
      next if @skip_blanks
      row = []
      quoted_fields = []
    else
      content = strip_value(content)
      row = content.split(@split_column_separator, -1)
      quoted_fields = [false] * row.size
      if @max_field_size
        row.each { |column| validate_field_size(column) }
      end
      row.map! { |column| column.empty? ? nil : column }
    end
    @last_line = raw_line
    emit_row(row, quoted_fields, &block)
  end
end
961
+
962
# Fast parsing strategy for input that may contain quotes: each line is
# split on the column separator and a field is accepted only when it has
# no quote at all or is exactly one quoted value without inner quotes.
#
# As soon as a line needs more than that (a CR or LF inside the line, or
# any other use of quotes), the scanner is rewound to the start of that
# line and parsing continues with parse_quotable_robust.
#
# A keep is held from the start of the current line the whole time so
# that this rewind is possible.
def parse_quotable_loose(&block)
  @scanner.keep_start
  @scanner.each_line(@row_separator) do |line|
    if @skip_lines and skip_line?(line)
      # Restart the keep at the beginning of the next line.
      @scanner.keep_drop
      @scanner.keep_start
      next
    end
    original_line = line
    line = line.delete_suffix(@row_separator)

    if line.empty?
      if @skip_blanks
        @scanner.keep_drop
        @scanner.keep_start
        next
      end
      row = []
      quoted_fields = []
    elsif line.include?(@cr) or line.include?(@lf)
      # A line break character inside the line: let the robust parser
      # handle this line and everything after it.
      @scanner.keep_back
      @need_robust_parsing = true
      return parse_quotable_robust(&block)
    else
      row = line.split(@split_column_separator, -1)
      quoted_fields = []
      n_columns = row.size
      i = 0
      while i < n_columns
        column = row[i]
        if column.empty?
          quoted_fields << false
          row[i] = nil
        else
          n_quotes = column.count(@quote_character)
          if n_quotes.zero?
            quoted_fields << false
            # no quote
          elsif n_quotes == 2 and
                column.start_with?(@quote_character) and
                column.end_with?(@quote_character)
            # A simple quoted value: drop the surrounding quotes.
            quoted_fields << true
            row[i] = column[1..-2]
          else
            # Anything else needs the robust parser.
            @scanner.keep_back
            @need_robust_parsing = true
            return parse_quotable_robust(&block)
          end
          validate_field_size(row[i])
        end
        i += 1
      end
    end
    # The line is done: restart the keep at the next line before the row
    # is handed out.
    @scanner.keep_drop
    @scanner.keep_start
    @last_line = original_line
    emit_row(row, quoted_fields, &block)
  end
  @scanner.keep_drop
end
1022
+
1023
# Full parsing strategy: reads one column value at a time and then
# decides from what follows it whether the column, the row or the input
# ended.
#
# Raises MalformedCSVError when none of those follows a value; the
# message depends on what was found instead.
def parse_quotable_robust(&block)
  row = []
  quoted_fields = []
  skip_needless_lines
  start_row
  while true
    # parse_column_value and its helpers record which kind of value was
    # read in these two flags.
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      validate_field_size(value)
    end
    if parse_column_end
      # More columns follow in this row.
      row << value
      quoted_fields << @quoted_column_value
    elsif parse_row_end
      if row.empty? and value.nil?
        # Blank line.
        emit_row([], [], &block) unless @skip_blanks
      else
        row << value
        quoted_fields << @quoted_column_value
        emit_row(row, quoted_fields, &block)
        row = []
        quoted_fields = []
      end
      skip_needless_lines
      start_row
    elsif @scanner.eos?
      # End of input: emit the pending row unless nothing was read for it.
      break if row.empty? and value.nil?
      row << value
      quoted_fields << @quoted_column_value
      emit_row(row, quoted_fields, &block)
      break
    else
      # Neither a column end, a row end nor the end of input follows the
      # value: the data is malformed.
      if @quoted_column_value
        if liberal_parsing? and (new_line = @scanner.check(@line_end))
          message =
            "Illegal end-of-line sequence outside of a quoted field " +
            "<#{new_line.inspect}>"
        else
          message = "Any value after quoted field isn't allowed"
        end
        ignore_broken_line
        raise MalformedCSVError.new(message, @lineno)
      elsif @unquoted_column_value and
            (new_line = @scanner.scan(@line_end))
        ignore_broken_line
        message = "Unquoted fields do not allow new line " +
                  "<#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      elsif @scanner.rest.start_with?(@quote_character)
        ignore_broken_line
        message = "Illegal quoting"
        raise MalformedCSVError.new(message, @lineno)
      elsif (new_line = @scanner.scan(@line_end))
        # A line ending that is not the configured row separator.
        ignore_broken_line
        message = "New line must be <#{@row_separator.inspect}> " +
                  "not <#{new_line.inspect}>"
        raise MalformedCSVError.new(message, @lineno)
      else
        ignore_broken_line
        raise MalformedCSVError.new("TODO: Meaningful message",
                                    @lineno)
      end
    end
  end
end
1092
+
1093
# Reads one column value at the scanner position and returns it, or +nil+
# when neither a quoted nor an unquoted value is found.
#
# Without <tt>@liberal_parsing</tt> the quoted and unquoted readers are simply
# tried in turn; <tt>@may_quoted</tt> decides which one goes first.
#
# With <tt>@liberal_parsing</tt> a quoted value may be followed directly by
# unquoted text. In that case the quote characters are put back around the
# quoted part and the unquoted text is appended, so the field keeps its
# literal form.
def parse_column_value
  unless @liberal_parsing
    if @may_quoted
      return parse_quoted_column_value || parse_unquoted_column_value
    end
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted_part = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted_part

  @scanner.scan_all(@strip_value) if @strip_value
  trailing_part = parse_unquoted_column_value
  return quoted_part unless trailing_part

  if @double_quote_outside_quote
    # Collapse doubled quote characters found outside the quoted part.
    trailing_part = trailing_part.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing_part if quoted_part.empty?
  end
  @quote_character + quoted_part + @quote_character + trailing_part
end
1122
+
1123
# Scans an unquoted column value at the scanner position.
#
# Returns +nil+ (leaving <tt>@unquoted_column_value</tt> untouched) when
# <tt>@unquoted_value</tt> does not match. Otherwise sets
# <tt>@unquoted_column_value</tt> and returns the scanned text.
#
# When <tt>@first_column_separators</tt> is set, text matched by it is kept as
# part of the value unless the complete <tt>@column_ends</tt> sequence starts
# at that position (presumably the multi-character separator case - confirm
# against where these patterns are built).
#
# Finally, backslash-escaped quotes are unescaped when
# <tt>@backslash_quote</tt> is set, and anything matching
# <tt>@rstrip_value</tt> is removed.
def parse_unquoted_column_value
  text = @scanner.scan_all(@unquoted_value)
  return nil unless text

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Look ahead for the whole column end sequence, then rewind.
      @scanner.keep_start
      at_column_end = @column_ends.all? { |pattern| @scanner.scan(pattern) }
      @scanner.keep_back
      break if at_column_end

      separator_part = @scanner.scan_all(@first_column_separators)
      break if separator_part.nil?
      text << separator_part

      more_text = @scanner.scan_all(@unquoted_value)
      break if more_text.nil?
      text << more_text
    end
  end

  if @backslash_quote
    text.gsub!(@backslash_quote_character, @quote_character)
  end
  text.gsub!(@rstrip_value, "") if @rstrip_value
  text
end
1150
+
1151
# Scans a quoted column value at the scanner position.
#
# Returns +nil+ when the input does not start with a run matching
# <tt>@quotes</tt>. Otherwise sets <tt>@quoted_column_value</tt> and returns
# the unescaped content of the field.
#
# Quote characters arrive in runs. Half of a run (rounded down) are literal
# quotes produced by doubled quote characters; an odd run additionally opens
# or closes the field.
#
# Raises MalformedCSVError ("Unclosed quoted field") when the input ends
# before the closing quote; the broken line is skipped first.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  opening_size = opening.size
  # An even run holds the opening quote, escaped pairs and the closing quote.
  return opening[0, (opening_size - 2) / 2] if opening_size.even?

  content = opening[0, opening_size / 2]
  while true
    chunk = @scanner.scan_all(@quoted_value)
    content << chunk if chunk

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      # A backslash escapes a following quote; otherwise it is kept literally.
      content << (@scanner.scan(@escaped_quote) ? @quote_character : @backslash_character)
      next
    end

    run = @scanner.scan_all(@quotes)
    unless run
      ignore_broken_line
      message = "Unclosed quoted field"
      raise MalformedCSVError.new(message, @lineno)
    end

    run_size = run.size
    break if run_size == 1

    content << run[0, run_size / 2]
    break if run_size.odd?
  end
  content
end
1192
+
1193
# Consumes a column end at the scanner position.
#
# <tt>@column_end</tt> is tried first. If it does not match and
# <tt>@column_ends</tt> is set, every pattern in that list must match in
# sequence; on a partial match the scanner is moved back to where it started.
#
# Returns +true+ when a column end was consumed, +false+ otherwise.
def parse_column_end
  return true if @scanner.scan(@column_end)
  return false unless @column_ends

  @scanner.keep_start
  matched_every_part = @column_ends.all? { |part| @scanner.scan(part) }
  if matched_every_part
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched_every_part
end
1206
+
1207
# Consumes a row end at the scanner position.
#
# <tt>@row_end</tt> is tried first. If it does not match and
# <tt>@row_ends</tt> is set, every pattern in that list must match in
# sequence; on a partial match the scanner is moved back to where it started.
#
# Returns +true+ when a row end was consumed, +false+ otherwise.
def parse_row_end
  return true if @scanner.scan(@row_end)
  return false unless @row_ends

  @scanner.keep_start
  matched_every_part = @row_ends.all? { |part| @scanner.scan(part) }
  if matched_every_part
    @scanner.keep_drop
  else
    @scanner.keep_back
  end
  matched_every_part
end
1219
+
1220
# Strips +value+ in place according to <tt>@strip</tt> and returns it.
#
# * <tt>@strip</tt> unset, or +value+ is +nil+: +value+ is returned untouched.
# * <tt>@strip</tt> is a String: every leading and trailing repetition of
#   that string is removed.
# * anything else: String#strip! is applied.
def strip_value(value)
  return value if !@strip || value.nil?

  if @strip.is_a?(String)
    # delete_prefix!/delete_suffix! return nil once nothing is left to remove.
    nil while value.delete_prefix!(@strip)
    nil while value.delete_suffix!(@strip)
  else
    value.strip!
  end
  value
end
1237
+
1238
# Skips what is left of the current line: first whatever matches
# <tt>@not_line_end</tt>, then whatever matches <tt>@line_end</tt>. The line
# counter <tt>@lineno</tt> is then advanced by one; its new value is returned.
def ignore_broken_line
  [@not_line_end, @line_end].each { |pattern| @scanner.scan_all(pattern) }
  @lineno += 1
end
1243
+
1244
# Prepares the scanner for a new row.
#
# When <tt>@last_line</tt> is set it is cleared and the scanner's current keep
# mark is left alone; otherwise the current keep mark is dropped. In both
# cases a fresh keep mark is then started at the current position.
def start_row
  @last_line ? (@last_line = nil) : @scanner.keep_drop
  @scanner.keep_start
end
1252
+
1253
# Finishes one parsed row and yields it to the caller's block.
#
# +row+ is the list of raw field values and +quoted_fields+ is passed along
# to the header adjustment and the field converter (the robust parser fills
# it with one <tt>@quoted_column_value</tt> flag per field).
#
# <tt>@lineno</tt> is advanced first. Then:
#
# * without <tt>@use_headers</tt> the fields are run through
#   <tt>@fields_converter</tt> and the result is yielded;
# * with headers in use and none read yet, the row becomes the header row;
#   it is yielded (as a header Row) only when <tt>@return_headers</tt> is set;
# * with headers already known, the converted fields are wrapped in a Row.
#
# When <tt>@unconverted_fields</tt> is set, the raw fields are attached to the
# yielded object unless it already responds to +unconverted_fields+.
def emit_row(row, quoted_fields, &block)
  @lineno += 1

  raw_row = row
  if !@use_headers
    # convert fields, if needed...
    row = @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
  elsif @headers.nil?
    # The first row seen supplies the headers.
    @headers = adjust_headers(row, quoted_fields)
    return unless @return_headers
    row = Row.new(@headers, row, true)
  else
    converted = @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields)
    row = Row.new(@headers, converted)
  end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !row.respond_to?(:unconverted_fields)
    add_unconverted_fields(row, raw_row)
  end

  yield(row)
end
1278
+
1279
# Attaches the raw +fields+ to +row+: a reader method
# <tt>unconverted_fields</tt> is defined on +row+'s singleton class only, and
# the instance variable <tt>@unconverted_fields</tt> behind it is set to
# +fields+. Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
1289
+ end
1290
+ end