csv 1.0.2 → 3.2.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (41) hide show
  1. checksums.yaml +4 -4
  2. data/NEWS.md +868 -0
  3. data/README.md +6 -3
  4. data/doc/csv/arguments/io.rdoc +5 -0
  5. data/doc/csv/options/common/col_sep.rdoc +57 -0
  6. data/doc/csv/options/common/quote_char.rdoc +42 -0
  7. data/doc/csv/options/common/row_sep.rdoc +91 -0
  8. data/doc/csv/options/generating/force_quotes.rdoc +17 -0
  9. data/doc/csv/options/generating/quote_empty.rdoc +12 -0
  10. data/doc/csv/options/generating/write_converters.rdoc +25 -0
  11. data/doc/csv/options/generating/write_empty_value.rdoc +15 -0
  12. data/doc/csv/options/generating/write_headers.rdoc +29 -0
  13. data/doc/csv/options/generating/write_nil_value.rdoc +14 -0
  14. data/doc/csv/options/parsing/converters.rdoc +46 -0
  15. data/doc/csv/options/parsing/empty_value.rdoc +13 -0
  16. data/doc/csv/options/parsing/field_size_limit.rdoc +39 -0
  17. data/doc/csv/options/parsing/header_converters.rdoc +43 -0
  18. data/doc/csv/options/parsing/headers.rdoc +63 -0
  19. data/doc/csv/options/parsing/liberal_parsing.rdoc +38 -0
  20. data/doc/csv/options/parsing/nil_value.rdoc +12 -0
  21. data/doc/csv/options/parsing/return_headers.rdoc +22 -0
  22. data/doc/csv/options/parsing/skip_blanks.rdoc +31 -0
  23. data/doc/csv/options/parsing/skip_lines.rdoc +37 -0
  24. data/doc/csv/options/parsing/strip.rdoc +15 -0
  25. data/doc/csv/options/parsing/unconverted_fields.rdoc +27 -0
  26. data/doc/csv/recipes/filtering.rdoc +158 -0
  27. data/doc/csv/recipes/generating.rdoc +298 -0
  28. data/doc/csv/recipes/parsing.rdoc +545 -0
  29. data/doc/csv/recipes/recipes.rdoc +6 -0
  30. data/lib/csv/core_ext/array.rb +1 -1
  31. data/lib/csv/core_ext/string.rb +1 -1
  32. data/lib/csv/fields_converter.rb +89 -0
  33. data/lib/csv/input_record_separator.rb +18 -0
  34. data/lib/csv/parser.rb +1290 -0
  35. data/lib/csv/row.rb +505 -136
  36. data/lib/csv/table.rb +791 -114
  37. data/lib/csv/version.rb +1 -1
  38. data/lib/csv/writer.rb +210 -0
  39. data/lib/csv.rb +2432 -1329
  40. metadata +66 -13
  41. data/news.md +0 -112
data/lib/csv/parser.rb ADDED
@@ -0,0 +1,1290 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "strscan"
4
+
5
+ require_relative "input_record_separator"
6
+ require_relative "row"
7
+ require_relative "table"
8
+
9
+ class CSV
10
+ # Note: Don't use this class directly. This is an internal class.
11
+ class Parser
12
+ #
13
+ # A CSV::Parser is m17n aware. The parser works in the Encoding of the IO
14
+ # or String object being read from or written to. Your data is never transcoded
15
+ # (unless you ask Ruby to transcode it for you) and will literally be parsed in
16
+ # the Encoding it is in. Thus CSV will return Arrays or Rows of Strings in the
17
+ # Encoding of your data. This is accomplished by transcoding the parser itself
18
+ # into your Encoding.
19
+ #
20
+
21
+ # Raised when encoding is invalid.
22
+ class InvalidEncoding < StandardError
23
+ end
24
+
25
+ # Raised when an unexpected case happens.
26
+ class UnexpectedError < StandardError
27
+ end
28
+
29
+ #
30
+ # CSV::Scanner receives a CSV output, scans it and returns the content.
31
+ # It also controls the life cycle of the object with its methods +keep_start+,
32
+ # +keep_end+, +keep_back+, +keep_drop+.
33
+ #
34
+ # Uses StringScanner (the official strscan gem). Strscan provides lexical
35
+ # scanning operations on a String. We inherit from it and take advantage
36
+ # of its methods. For more information, please visit:
37
+ # https://ruby-doc.org/stdlib-2.6.1/libdoc/strscan/rdoc/StringScanner.html
38
+ #
39
+ class Scanner < StringScanner
40
+ alias_method :scan_all, :scan
41
+
42
+ def initialize(*args)
43
+ super
44
+ @keeps = []
45
+ end
46
+
47
+ def each_line(row_separator)
48
+ position = pos
49
+ rest.each_line(row_separator) do |line|
50
+ position += line.bytesize
51
+ self.pos = position
52
+ yield(line)
53
+ end
54
+ end
55
+
56
+ def keep_start
57
+ @keeps.push(pos)
58
+ end
59
+
60
+ def keep_end
61
+ start = @keeps.pop
62
+ string.byteslice(start, pos - start)
63
+ end
64
+
65
+ def keep_back
66
+ self.pos = @keeps.pop
67
+ end
68
+
69
+ def keep_drop
70
+ @keeps.pop
71
+ end
72
+ end
73
+
74
+ #
75
+ # CSV::InputsScanner receives IO inputs, encoding and the chunk_size.
76
+ # It also controls the life cycle of the object with its methods +keep_start+,
77
+ # +keep_end+, +keep_back+, +keep_drop+.
78
+ #
79
+ # CSV::InputsScanner.scan() tries to match with pattern at the current position.
80
+ # If there's a match, the scanner advances the "scan pointer" and returns the matched string.
81
+ # Otherwise, the scanner returns nil.
82
+ #
83
+ # CSV::InputsScanner.rest() returns the "rest" of the string (i.e. everything after the scan pointer).
84
+ # If there is no more data (eos? = true), it returns "".
85
+ #
86
+ class InputsScanner
87
+ def initialize(inputs, encoding, row_separator, chunk_size: 8192)
88
+ @inputs = inputs.dup
89
+ @encoding = encoding
90
+ @row_separator = row_separator
91
+ @chunk_size = chunk_size
92
+ @last_scanner = @inputs.empty?
93
+ @keeps = []
94
+ read_chunk
95
+ end
96
+
97
+ def each_line(row_separator)
98
+ return enum_for(__method__, row_separator) unless block_given?
99
+ buffer = nil
100
+ input = @scanner.rest
101
+ position = @scanner.pos
102
+ offset = 0
103
+ n_row_separator_chars = row_separator.size
104
+ # trace(__method__, :start, input)
105
+ while true
106
+ input.each_line(row_separator) do |line|
107
+ @scanner.pos += line.bytesize
108
+ if buffer
109
+ if n_row_separator_chars == 2 and
110
+ buffer.end_with?(row_separator[0]) and
111
+ line.start_with?(row_separator[1])
112
+ buffer << line[0]
113
+ line = line[1..-1]
114
+ position += buffer.bytesize + offset
115
+ @scanner.pos = position
116
+ offset = 0
117
+ yield(buffer)
118
+ buffer = nil
119
+ next if line.empty?
120
+ else
121
+ buffer << line
122
+ line = buffer
123
+ buffer = nil
124
+ end
125
+ end
126
+ if line.end_with?(row_separator)
127
+ position += line.bytesize + offset
128
+ @scanner.pos = position
129
+ offset = 0
130
+ yield(line)
131
+ else
132
+ buffer = line
133
+ end
134
+ end
135
+ break unless read_chunk
136
+ input = @scanner.rest
137
+ position = @scanner.pos
138
+ offset = -buffer.bytesize if buffer
139
+ end
140
+ yield(buffer) if buffer
141
+ end
142
+
143
+ def scan(pattern)
144
+ # trace(__method__, pattern, :start)
145
+ value = @scanner.scan(pattern)
146
+ # trace(__method__, pattern, :done, :last, value) if @last_scanner
147
+ return value if @last_scanner
148
+
149
+ read_chunk if value and @scanner.eos?
150
+ # trace(__method__, pattern, :done, value)
151
+ value
152
+ end
153
+
154
+ def scan_all(pattern)
155
+ # trace(__method__, pattern, :start)
156
+ value = @scanner.scan(pattern)
157
+ # trace(__method__, pattern, :done, :last, value) if @last_scanner
158
+ return value if @last_scanner
159
+
160
+ # trace(__method__, pattern, :done, :nil) if value.nil?
161
+ return nil if value.nil?
162
+ while @scanner.eos? and read_chunk and (sub_value = @scanner.scan(pattern))
163
+ # trace(__method__, pattern, :sub, sub_value)
164
+ value << sub_value
165
+ end
166
+ # trace(__method__, pattern, :done, value)
167
+ value
168
+ end
169
+
170
+ def eos?
171
+ @scanner.eos?
172
+ end
173
+
174
+ def keep_start
175
+ # trace(__method__, :start)
176
+ adjust_last_keep
177
+ @keeps.push([@scanner, @scanner.pos, nil])
178
+ # trace(__method__, :done)
179
+ end
180
+
181
+ def keep_end
182
+ # trace(__method__, :start)
183
+ scanner, start, buffer = @keeps.pop
184
+ if scanner == @scanner
185
+ keep = @scanner.string.byteslice(start, @scanner.pos - start)
186
+ else
187
+ keep = @scanner.string.byteslice(0, @scanner.pos)
188
+ end
189
+ if buffer
190
+ buffer << keep
191
+ keep = buffer
192
+ end
193
+ # trace(__method__, :done, keep)
194
+ keep
195
+ end
196
+
197
+ def keep_back
198
+ # trace(__method__, :start)
199
+ scanner, start, buffer = @keeps.pop
200
+ if buffer
201
+ # trace(__method__, :rescan, start, buffer)
202
+ string = @scanner.string
203
+ if scanner == @scanner
204
+ keep = string.byteslice(start,
205
+ string.bytesize - @scanner.pos - start)
206
+ else
207
+ keep = string
208
+ end
209
+ if keep and not keep.empty?
210
+ @inputs.unshift(StringIO.new(keep))
211
+ @last_scanner = false
212
+ end
213
+ @scanner = StringScanner.new(buffer)
214
+ else
215
+ if @scanner != scanner
216
+ message = "scanners are different but no buffer: "
217
+ message += "#{@scanner.inspect}(#{@scanner.object_id}): "
218
+ message += "#{scanner.inspect}(#{scanner.object_id})"
219
+ raise UnexpectedError, message
220
+ end
221
+ # trace(__method__, :repos, start, buffer)
222
+ @scanner.pos = start
223
+ end
224
+ read_chunk if @scanner.eos?
225
+ end
226
+
227
+ def keep_drop
228
+ _, _, buffer = @keeps.pop
229
+ # trace(__method__, :done, :empty) unless buffer
230
+ return unless buffer
231
+
232
+ last_keep = @keeps.last
233
+ # trace(__method__, :done, :no_last_keep) unless last_keep
234
+ return unless last_keep
235
+
236
+ if last_keep[2]
237
+ last_keep[2] << buffer
238
+ else
239
+ last_keep[2] = buffer
240
+ end
241
+ # trace(__method__, :done)
242
+ end
243
+
244
+ def rest
245
+ @scanner.rest
246
+ end
247
+
248
+ def check(pattern)
249
+ @scanner.check(pattern)
250
+ end
251
+
252
+ private
253
+ def trace(*args)
254
+ pp([*args, @scanner, @scanner&.string, @scanner&.pos, @keeps])
255
+ end
256
+
257
+ def adjust_last_keep
258
+ # trace(__method__, :start)
259
+
260
+ keep = @keeps.last
261
+ # trace(__method__, :done, :empty) if keep.nil?
262
+ return if keep.nil?
263
+
264
+ scanner, start, buffer = keep
265
+ string = @scanner.string
266
+ if @scanner != scanner
267
+ start = 0
268
+ end
269
+ if start == 0 and @scanner.eos?
270
+ keep_data = string
271
+ else
272
+ keep_data = string.byteslice(start, @scanner.pos - start)
273
+ end
274
+ if keep_data
275
+ if buffer
276
+ buffer << keep_data
277
+ else
278
+ keep[2] = keep_data.dup
279
+ end
280
+ end
281
+
282
+ # trace(__method__, :done)
283
+ end
284
+
285
+ def read_chunk
286
+ return false if @last_scanner
287
+
288
+ adjust_last_keep
289
+
290
+ input = @inputs.first
291
+ case input
292
+ when StringIO
293
+ string = input.read
294
+ raise InvalidEncoding unless string.valid_encoding?
295
+ # trace(__method__, :stringio, string)
296
+ @scanner = StringScanner.new(string)
297
+ @inputs.shift
298
+ @last_scanner = @inputs.empty?
299
+ true
300
+ else
301
+ chunk = input.gets(@row_separator, @chunk_size)
302
+ if chunk
303
+ raise InvalidEncoding unless chunk.valid_encoding?
304
+ # trace(__method__, :chunk, chunk)
305
+ @scanner = StringScanner.new(chunk)
306
+ if input.respond_to?(:eof?) and input.eof?
307
+ @inputs.shift
308
+ @last_scanner = @inputs.empty?
309
+ end
310
+ true
311
+ else
312
+ # trace(__method__, :no_chunk)
313
+ @scanner = StringScanner.new("".encode(@encoding))
314
+ @inputs.shift
315
+ @last_scanner = @inputs.empty?
316
+ if @last_scanner
317
+ false
318
+ else
319
+ read_chunk
320
+ end
321
+ end
322
+ end
323
+ end
324
+ end
325
+
326
+ def initialize(input, options)
327
+ @input = input
328
+ @options = options
329
+ @samples = []
330
+
331
+ prepare
332
+ end
333
+
334
+ def column_separator
335
+ @column_separator
336
+ end
337
+
338
+ def row_separator
339
+ @row_separator
340
+ end
341
+
342
+ def quote_character
343
+ @quote_character
344
+ end
345
+
346
+ def field_size_limit
347
+ @max_field_size&.succ
348
+ end
349
+
350
+ def max_field_size
351
+ @max_field_size
352
+ end
353
+
354
+ def skip_lines
355
+ @skip_lines
356
+ end
357
+
358
+ def unconverted_fields?
359
+ @unconverted_fields
360
+ end
361
+
362
+ def headers
363
+ @headers
364
+ end
365
+
366
+ def header_row?
367
+ @use_headers and @headers.nil?
368
+ end
369
+
370
+ def return_headers?
371
+ @return_headers
372
+ end
373
+
374
+ def skip_blanks?
375
+ @skip_blanks
376
+ end
377
+
378
+ def liberal_parsing?
379
+ @liberal_parsing
380
+ end
381
+
382
+ def lineno
383
+ @lineno
384
+ end
385
+
386
+ def line
387
+ last_line
388
+ end
389
+
390
+ def parse(&block)
391
+ return to_enum(__method__) unless block_given?
392
+
393
+ if @return_headers and @headers and @raw_headers
394
+ headers = Row.new(@headers, @raw_headers, true)
395
+ if @unconverted_fields
396
+ headers = add_unconverted_fields(headers, [])
397
+ end
398
+ yield headers
399
+ end
400
+
401
+ begin
402
+ @scanner ||= build_scanner
403
+ if quote_character.nil?
404
+ parse_no_quote(&block)
405
+ elsif @need_robust_parsing
406
+ parse_quotable_robust(&block)
407
+ else
408
+ parse_quotable_loose(&block)
409
+ end
410
+ rescue InvalidEncoding
411
+ if @scanner
412
+ ignore_broken_line
413
+ lineno = @lineno
414
+ else
415
+ lineno = @lineno + 1
416
+ end
417
+ message = "Invalid byte sequence in #{@encoding}"
418
+ raise MalformedCSVError.new(message, lineno)
419
+ rescue UnexpectedError => error
420
+ if @scanner
421
+ ignore_broken_line
422
+ lineno = @lineno
423
+ else
424
+ lineno = @lineno + 1
425
+ end
426
+ message = "This should not be happen: #{error.message}: "
427
+ message += "Please report this to https://github.com/ruby/csv/issues"
428
+ raise MalformedCSVError.new(message, lineno)
429
+ end
430
+ end
431
+
432
+ def use_headers?
433
+ @use_headers
434
+ end
435
+
436
+ private
437
+ # A set of tasks to prepare the file in order to parse it
438
+ def prepare
439
+ prepare_variable
440
+ prepare_quote_character
441
+ prepare_backslash
442
+ prepare_skip_lines
443
+ prepare_strip
444
+ prepare_separators
445
+ validate_strip_and_col_sep_options
446
+ prepare_quoted
447
+ prepare_unquoted
448
+ prepare_line
449
+ prepare_header
450
+ prepare_parser
451
+ end
452
+
453
+ def prepare_variable
454
+ @need_robust_parsing = false
455
+ @encoding = @options[:encoding]
456
+ liberal_parsing = @options[:liberal_parsing]
457
+ if liberal_parsing
458
+ @liberal_parsing = true
459
+ if liberal_parsing.is_a?(Hash)
460
+ @double_quote_outside_quote =
461
+ liberal_parsing[:double_quote_outside_quote]
462
+ @backslash_quote = liberal_parsing[:backslash_quote]
463
+ else
464
+ @double_quote_outside_quote = false
465
+ @backslash_quote = false
466
+ end
467
+ @need_robust_parsing = true
468
+ else
469
+ @liberal_parsing = false
470
+ @backslash_quote = false
471
+ end
472
+ @unconverted_fields = @options[:unconverted_fields]
473
+ @max_field_size = @options[:max_field_size]
474
+ @skip_blanks = @options[:skip_blanks]
475
+ @fields_converter = @options[:fields_converter]
476
+ @header_fields_converter = @options[:header_fields_converter]
477
+ end
478
+
479
+ def prepare_quote_character
480
+ @quote_character = @options[:quote_character]
481
+ if @quote_character.nil?
482
+ @escaped_quote_character = nil
483
+ @escaped_quote = nil
484
+ else
485
+ @quote_character = @quote_character.to_s.encode(@encoding)
486
+ if @quote_character.length != 1
487
+ message = ":quote_char has to be nil or a single character String"
488
+ raise ArgumentError, message
489
+ end
490
+ @escaped_quote_character = Regexp.escape(@quote_character)
491
+ @escaped_quote = Regexp.new(@escaped_quote_character)
492
+ end
493
+ end
494
+
495
+ def prepare_backslash
496
+ return unless @backslash_quote
497
+
498
+ @backslash_character = "\\".encode(@encoding)
499
+
500
+ @escaped_backslash_character = Regexp.escape(@backslash_character)
501
+ @escaped_backslash = Regexp.new(@escaped_backslash_character)
502
+ if @quote_character.nil?
503
+ @backslash_quote_character = nil
504
+ else
505
+ @backslash_quote_character =
506
+ @backslash_character + @escaped_quote_character
507
+ end
508
+ end
509
+
510
+ def prepare_skip_lines
511
+ skip_lines = @options[:skip_lines]
512
+ case skip_lines
513
+ when String
514
+ @skip_lines = skip_lines.encode(@encoding)
515
+ when Regexp, nil
516
+ @skip_lines = skip_lines
517
+ else
518
+ unless skip_lines.respond_to?(:match)
519
+ message =
520
+ ":skip_lines has to respond to \#match: #{skip_lines.inspect}"
521
+ raise ArgumentError, message
522
+ end
523
+ @skip_lines = skip_lines
524
+ end
525
+ end
526
+
527
+ def prepare_strip
528
+ @strip = @options[:strip]
529
+ @escaped_strip = nil
530
+ @strip_value = nil
531
+ @rstrip_value = nil
532
+ if @strip.is_a?(String)
533
+ case @strip.length
534
+ when 0
535
+ raise ArgumentError, ":strip must not be an empty String"
536
+ when 1
537
+ # ok
538
+ else
539
+ raise ArgumentError, ":strip doesn't support 2 or more characters yet"
540
+ end
541
+ @strip = @strip.encode(@encoding)
542
+ @escaped_strip = Regexp.escape(@strip)
543
+ if @quote_character
544
+ @strip_value = Regexp.new(@escaped_strip +
545
+ "+".encode(@encoding))
546
+ @rstrip_value = Regexp.new(@escaped_strip +
547
+ "+\\z".encode(@encoding))
548
+ end
549
+ @need_robust_parsing = true
550
+ elsif @strip
551
+ strip_values = " \t\f\v"
552
+ @escaped_strip = strip_values.encode(@encoding)
553
+ if @quote_character
554
+ @strip_value = Regexp.new("[#{strip_values}]+".encode(@encoding))
555
+ @rstrip_value = Regexp.new("[#{strip_values}]+\\z".encode(@encoding))
556
+ end
557
+ @need_robust_parsing = true
558
+ end
559
+ end
560
+
561
+ begin
562
+ StringScanner.new("x").scan("x")
563
+ rescue TypeError
564
+ STRING_SCANNER_SCAN_ACCEPT_STRING = false
565
+ else
566
+ STRING_SCANNER_SCAN_ACCEPT_STRING = true
567
+ end
568
+
569
+ def prepare_separators
570
+ column_separator = @options[:column_separator]
571
+ @column_separator = column_separator.to_s.encode(@encoding)
572
+ if @column_separator.size < 1
573
+ message = ":col_sep must be 1 or more characters: "
574
+ message += column_separator.inspect
575
+ raise ArgumentError, message
576
+ end
577
+ @row_separator =
578
+ resolve_row_separator(@options[:row_separator]).encode(@encoding)
579
+
580
+ @escaped_column_separator = Regexp.escape(@column_separator)
581
+ @escaped_first_column_separator = Regexp.escape(@column_separator[0])
582
+ if @column_separator.size > 1
583
+ @column_end = Regexp.new(@escaped_column_separator)
584
+ @column_ends = @column_separator.each_char.collect do |char|
585
+ Regexp.new(Regexp.escape(char))
586
+ end
587
+ @first_column_separators = Regexp.new(@escaped_first_column_separator +
588
+ "+".encode(@encoding))
589
+ else
590
+ if STRING_SCANNER_SCAN_ACCEPT_STRING
591
+ @column_end = @column_separator
592
+ else
593
+ @column_end = Regexp.new(@escaped_column_separator)
594
+ end
595
+ @column_ends = nil
596
+ @first_column_separators = nil
597
+ end
598
+
599
+ escaped_row_separator = Regexp.escape(@row_separator)
600
+ @row_end = Regexp.new(escaped_row_separator)
601
+ if @row_separator.size > 1
602
+ @row_ends = @row_separator.each_char.collect do |char|
603
+ Regexp.new(Regexp.escape(char))
604
+ end
605
+ else
606
+ @row_ends = nil
607
+ end
608
+
609
+ @cr = "\r".encode(@encoding)
610
+ @lf = "\n".encode(@encoding)
611
+ @line_end = Regexp.new("\r\n|\n|\r".encode(@encoding))
612
+ @not_line_end = Regexp.new("[^\r\n]+".encode(@encoding))
613
+ end
614
+
615
+ # This method verifies that there are no (obvious) ambiguities with the
616
+ # provided +col_sep+ and +strip+ parsing options. For example, if +col_sep+
617
+ # and +strip+ were both equal to +\t+, then there would be no clear way to
618
+ # parse the input.
619
+ def validate_strip_and_col_sep_options
620
+ return unless @strip
621
+
622
+ if @strip.is_a?(String)
623
+ if @column_separator.start_with?(@strip) || @column_separator.end_with?(@strip)
624
+ raise ArgumentError,
625
+ "The provided strip (#{@escaped_strip}) and " \
626
+ "col_sep (#{@escaped_column_separator}) options are incompatible."
627
+ end
628
+ else
629
+ if Regexp.new("\\A[#{@escaped_strip}]|[#{@escaped_strip}]\\z").match?(@column_separator)
630
+ raise ArgumentError,
631
+ "The provided strip (true) and " \
632
+ "col_sep (#{@escaped_column_separator}) options are incompatible."
633
+ end
634
+ end
635
+ end
636
+
637
+ def prepare_quoted
638
+ if @quote_character
639
+ @quotes = Regexp.new(@escaped_quote_character +
640
+ "+".encode(@encoding))
641
+ no_quoted_values = @escaped_quote_character.dup
642
+ if @backslash_quote
643
+ no_quoted_values << @escaped_backslash_character
644
+ end
645
+ @quoted_value = Regexp.new("[^".encode(@encoding) +
646
+ no_quoted_values +
647
+ "]+".encode(@encoding))
648
+ end
649
+ if @escaped_strip
650
+ @split_column_separator = Regexp.new(@escaped_strip +
651
+ "*".encode(@encoding) +
652
+ @escaped_column_separator +
653
+ @escaped_strip +
654
+ "*".encode(@encoding))
655
+ else
656
+ if @column_separator == " ".encode(@encoding)
657
+ @split_column_separator = Regexp.new(@escaped_column_separator)
658
+ else
659
+ @split_column_separator = @column_separator
660
+ end
661
+ end
662
+ end
663
+
664
+ def prepare_unquoted
665
+ return if @quote_character.nil?
666
+
667
+ no_unquoted_values = "\r\n".encode(@encoding)
668
+ no_unquoted_values << @escaped_first_column_separator
669
+ unless @liberal_parsing
670
+ no_unquoted_values << @escaped_quote_character
671
+ end
672
+ @unquoted_value = Regexp.new("[^".encode(@encoding) +
673
+ no_unquoted_values +
674
+ "]+".encode(@encoding))
675
+ end
676
+
677
+ def resolve_row_separator(separator)
678
+ if separator == :auto
679
+ cr = "\r".encode(@encoding)
680
+ lf = "\n".encode(@encoding)
681
+ if @input.is_a?(StringIO)
682
+ pos = @input.pos
683
+ separator = detect_row_separator(@input.read, cr, lf)
684
+ @input.seek(pos)
685
+ elsif @input.respond_to?(:gets)
686
+ if @input.is_a?(File)
687
+ chunk_size = 32 * 1024
688
+ else
689
+ chunk_size = 1024
690
+ end
691
+ begin
692
+ while separator == :auto
693
+ #
694
+ # if we run out of data, it's probably a single line
695
+ # (the default value is set after the begin/rescue block below)
696
+ #
697
+ break unless sample = @input.gets(nil, chunk_size)
698
+
699
+ # extend sample if we're unsure of the line ending
700
+ if sample.end_with?(cr)
701
+ sample << (@input.gets(nil, 1) || "")
702
+ end
703
+
704
+ @samples << sample
705
+
706
+ separator = detect_row_separator(sample, cr, lf)
707
+ end
708
+ rescue IOError
709
+ # do nothing: the default is set after this begin/rescue block
710
+ end
711
+ end
712
+ separator = InputRecordSeparator.value if separator == :auto
713
+ end
714
+ separator.to_s.encode(@encoding)
715
+ end
716
+
717
+ def detect_row_separator(sample, cr, lf)
718
+ lf_index = sample.index(lf)
719
+ if lf_index
720
+ cr_index = sample[0, lf_index].index(cr)
721
+ else
722
+ cr_index = sample.index(cr)
723
+ end
724
+ if cr_index and lf_index
725
+ if cr_index + 1 == lf_index
726
+ cr + lf
727
+ elsif cr_index < lf_index
728
+ cr
729
+ else
730
+ lf
731
+ end
732
+ elsif cr_index
733
+ cr
734
+ elsif lf_index
735
+ lf
736
+ else
737
+ :auto
738
+ end
739
+ end
740
+
741
+ def prepare_line
742
+ @lineno = 0
743
+ @last_line = nil
744
+ @scanner = nil
745
+ end
746
+
747
+ def last_line
748
+ if @scanner
749
+ @last_line ||= @scanner.keep_end
750
+ else
751
+ @last_line
752
+ end
753
+ end
754
+
755
+ def prepare_header
756
+ @return_headers = @options[:return_headers]
757
+
758
+ headers = @options[:headers]
759
+ case headers
760
+ when Array
761
+ @raw_headers = headers
762
+ quoted_fields = [false] * @raw_headers.size
763
+ @use_headers = true
764
+ when String
765
+ @raw_headers, quoted_fields = parse_headers(headers)
766
+ @use_headers = true
767
+ when nil, false
768
+ @raw_headers = nil
769
+ @use_headers = false
770
+ else
771
+ @raw_headers = nil
772
+ @use_headers = true
773
+ end
774
+ if @raw_headers
775
+ @headers = adjust_headers(@raw_headers, quoted_fields)
776
+ else
777
+ @headers = nil
778
+ end
779
+ end
780
+
781
+ def parse_headers(row)
782
+ quoted_fields = []
783
+ converter = lambda do |field, info|
784
+ quoted_fields << info.quoted?
785
+ field
786
+ end
787
+ headers = CSV.parse_line(row,
788
+ col_sep: @column_separator,
789
+ row_sep: @row_separator,
790
+ quote_char: @quote_character,
791
+ converters: [converter])
792
+ [headers, quoted_fields]
793
+ end
794
+
795
+ def adjust_headers(headers, quoted_fields)
796
+ adjusted_headers = @header_fields_converter.convert(headers, nil, @lineno, quoted_fields)
797
+ adjusted_headers.each {|h| h.freeze if h.is_a? String}
798
+ adjusted_headers
799
+ end
800
+
801
+ def prepare_parser
802
+ @may_quoted = may_quoted?
803
+ end
804
+
805
+ def may_quoted?
806
+ return false if @quote_character.nil?
807
+
808
+ if @input.is_a?(StringIO)
809
+ pos = @input.pos
810
+ sample = @input.read
811
+ @input.seek(pos)
812
+ else
813
+ return false if @samples.empty?
814
+ sample = @samples.first
815
+ end
816
+ sample[0, 128].index(@quote_character)
817
+ end
818
+
819
+ class UnoptimizedStringIO # :nodoc:
820
+ def initialize(string)
821
+ @io = StringIO.new(string, "rb:#{string.encoding}")
822
+ end
823
+
824
+ def gets(*args)
825
+ @io.gets(*args)
826
+ end
827
+
828
+ def each_line(*args, &block)
829
+ @io.each_line(*args, &block)
830
+ end
831
+
832
+ def eof?
833
+ @io.eof?
834
+ end
835
+ end
836
+
837
+ SCANNER_TEST = (ENV["CSV_PARSER_SCANNER_TEST"] == "yes")
838
+ if SCANNER_TEST
839
+ SCANNER_TEST_CHUNK_SIZE_NAME = "CSV_PARSER_SCANNER_TEST_CHUNK_SIZE"
840
+ SCANNER_TEST_CHUNK_SIZE_VALUE = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
841
+ def build_scanner
842
+ inputs = @samples.collect do |sample|
843
+ UnoptimizedStringIO.new(sample)
844
+ end
845
+ if @input.is_a?(StringIO)
846
+ inputs << UnoptimizedStringIO.new(@input.read)
847
+ else
848
+ inputs << @input
849
+ end
850
+ begin
851
+ chunk_size_value = ENV[SCANNER_TEST_CHUNK_SIZE_NAME]
852
+ rescue # Ractor::IsolationError
853
+ # Ractor on Ruby 3.0 can't read ENV value.
854
+ chunk_size_value = SCANNER_TEST_CHUNK_SIZE_VALUE
855
+ end
856
+ chunk_size = Integer((chunk_size_value || "1"), 10)
857
+ InputsScanner.new(inputs,
858
+ @encoding,
859
+ @row_separator,
860
+ chunk_size: chunk_size)
861
+ end
862
+ else
863
+ def build_scanner
864
+ string = nil
865
+ if @samples.empty? and @input.is_a?(StringIO)
866
+ string = @input.read
867
+ elsif @samples.size == 1 and
868
+ @input != ARGF and
869
+ @input.respond_to?(:eof?) and
870
+ @input.eof?
871
+ string = @samples[0]
872
+ end
873
+ if string
874
+ unless string.valid_encoding?
875
+ index = string.lines(@row_separator).index do |line|
876
+ !line.valid_encoding?
877
+ end
878
+ if index
879
+ message = "Invalid byte sequence in #{@encoding}"
880
+ raise MalformedCSVError.new(message, @lineno + index + 1)
881
+ end
882
+ end
883
+ Scanner.new(string)
884
+ else
885
+ inputs = @samples.collect do |sample|
886
+ StringIO.new(sample)
887
+ end
888
+ inputs << @input
889
+ InputsScanner.new(inputs, @encoding, @row_separator)
890
+ end
891
+ end
892
+ end
893
+
894
+ def skip_needless_lines
895
+ return unless @skip_lines
896
+
897
+ until @scanner.eos?
898
+ @scanner.keep_start
899
+ line = @scanner.scan_all(@not_line_end) || "".encode(@encoding)
900
+ line << @row_separator if parse_row_end
901
+ if skip_line?(line)
902
+ @lineno += 1
903
+ @scanner.keep_drop
904
+ else
905
+ @scanner.keep_back
906
+ return
907
+ end
908
+ end
909
+ end
910
+
911
+ def skip_line?(line)
912
+ line = line.delete_suffix(@row_separator)
913
+ case @skip_lines
914
+ when String
915
+ line.include?(@skip_lines)
916
+ when Regexp
917
+ @skip_lines.match?(line)
918
+ else
919
+ @skip_lines.match(line)
920
+ end
921
+ end
922
+
923
+ def validate_field_size(field)
924
+ return unless @max_field_size
925
+ return if field.size <= @max_field_size
926
+ ignore_broken_line
927
+ message = "Field size exceeded: #{field.size} > #{@max_field_size}"
928
+ raise MalformedCSVError.new(message, @lineno)
929
+ end
930
+
931
+ def parse_no_quote(&block)
932
+ @scanner.each_line(@row_separator) do |line|
933
+ next if @skip_lines and skip_line?(line)
934
+ original_line = line
935
+ line = line.delete_suffix(@row_separator)
936
+
937
+ if line.empty?
938
+ next if @skip_blanks
939
+ row = []
940
+ quoted_fields = []
941
+ else
942
+ line = strip_value(line)
943
+ row = line.split(@split_column_separator, -1)
944
+ quoted_fields = [false] * row.size
945
+ if @max_field_size
946
+ row.each do |column|
947
+ validate_field_size(column)
948
+ end
949
+ end
950
+ n_columns = row.size
951
+ i = 0
952
+ while i < n_columns
953
+ row[i] = nil if row[i].empty?
954
+ i += 1
955
+ end
956
+ end
957
+ @last_line = original_line
958
+ emit_row(row, quoted_fields, &block)
959
+ end
960
+ end
961
+
962
+ def parse_quotable_loose(&block)
963
+ @scanner.keep_start
964
+ @scanner.each_line(@row_separator) do |line|
965
+ if @skip_lines and skip_line?(line)
966
+ @scanner.keep_drop
967
+ @scanner.keep_start
968
+ next
969
+ end
970
+ original_line = line
971
+ line = line.delete_suffix(@row_separator)
972
+
973
+ if line.empty?
974
+ if @skip_blanks
975
+ @scanner.keep_drop
976
+ @scanner.keep_start
977
+ next
978
+ end
979
+ row = []
980
+ quoted_fields = []
981
+ elsif line.include?(@cr) or line.include?(@lf)
982
+ @scanner.keep_back
983
+ @need_robust_parsing = true
984
+ return parse_quotable_robust(&block)
985
+ else
986
+ row = line.split(@split_column_separator, -1)
987
+ quoted_fields = []
988
+ n_columns = row.size
989
+ i = 0
990
+ while i < n_columns
991
+ column = row[i]
992
+ if column.empty?
993
+ quoted_fields << false
994
+ row[i] = nil
995
+ else
996
+ n_quotes = column.count(@quote_character)
997
+ if n_quotes.zero?
998
+ quoted_fields << false
999
+ # no quote
1000
+ elsif n_quotes == 2 and
1001
+ column.start_with?(@quote_character) and
1002
+ column.end_with?(@quote_character)
1003
+ quoted_fields << true
1004
+ row[i] = column[1..-2]
1005
+ else
1006
+ @scanner.keep_back
1007
+ @need_robust_parsing = true
1008
+ return parse_quotable_robust(&block)
1009
+ end
1010
+ validate_field_size(row[i])
1011
+ end
1012
+ i += 1
1013
+ end
1014
+ end
1015
+ @scanner.keep_drop
1016
+ @scanner.keep_start
1017
+ @last_line = original_line
1018
+ emit_row(row, quoted_fields, &block)
1019
+ end
1020
+ @scanner.keep_drop
1021
+ end
1022
+
1023
# Parses the remaining input one field at a time and hands every completed
# row to emit_row together with +block+.
#
# For each field the per-field flags @quoted_column_value and
# @unquoted_column_value are reset, optional @strip_value padding is
# skipped, and the value is read with parse_column_value. What follows the
# value decides the next step:
#
# * a column end  -> the value is appended to the current row;
# * a row end     -> the row is emitted (a row with no fields and a nil
#   value is emitted as an empty row unless @skip_blanks is set) and a new
#   row is started;
# * end of input  -> any pending row is emitted and parsing stops;
# * anything else -> the broken line is skipped and MalformedCSVError is
#   raised with a message describing what was found.
def parse_quotable_robust(&block)
  fields = []
  quoted_flags = []
  skip_needless_lines
  start_row
  # Deliberately `while true` and not Kernel#loop: Kernel#loop rescues
  # StopIteration, which would hide one raised while a row is being emitted.
  while true
    @quoted_column_value = false
    @unquoted_column_value = false
    @scanner.scan_all(@strip_value) if @strip_value
    value = parse_column_value
    if value
      @scanner.scan_all(@strip_value) if @strip_value
      validate_field_size(value)
    end

    if parse_column_end
      fields << value
      quoted_flags << @quoted_column_value
      next
    end

    if parse_row_end
      if fields.empty? && value.nil?
        emit_row([], [], &block) unless @skip_blanks
      else
        fields << value
        quoted_flags << @quoted_column_value
        emit_row(fields, quoted_flags, &block)
        fields = []
        quoted_flags = []
      end
      skip_needless_lines
      start_row
      next
    end

    if @scanner.eos?
      unless fields.empty? && value.nil?
        fields << value
        quoted_flags << @quoted_column_value
        emit_row(fields, quoted_flags, &block)
      end
      break
    end

    # Nothing valid follows the value. The message is chosen first because
    # the checks below inspect (and in two cases consume) scanner input that
    # ignore_broken_line is about to discard.
    message =
      if @quoted_column_value
        if liberal_parsing? && (new_line = @scanner.check(@line_end))
          "Illegal end-of-line sequence outside of a quoted field " +
            "<#{new_line.inspect}>"
        else
          "Any value after quoted field isn't allowed"
        end
      elsif @unquoted_column_value && (new_line = @scanner.scan(@line_end))
        "Unquoted fields do not allow new line " +
          "<#{new_line.inspect}>"
      elsif @scanner.rest.start_with?(@quote_character)
        "Illegal quoting"
      elsif (new_line = @scanner.scan(@line_end))
        "New line must be <#{@row_separator.inspect}> " +
          "not <#{new_line.inspect}>"
      else
        # Placeholder text kept as is.
        "TODO: Meaningful message"
      end
    ignore_broken_line
    raise MalformedCSVError.new(message, @lineno)
  end
end
1092
+
1093
# Reads one field value and returns it, or nil when neither sub-parser
# produced a value.
#
# Without @liberal_parsing the two sub-parsers are simply tried in order:
# quoted first when @may_quoted is set, unquoted first otherwise.
#
# With @liberal_parsing a quoted value may be followed directly by unquoted
# text. In that case the quote characters are put back around the quoted
# part and the unquoted text is appended. When @double_quote_outside_quote
# is set, doubled quote characters in the trailing text are collapsed, and
# an empty quoted part contributes a single quote character.
def parse_column_value
  unless @liberal_parsing
    return parse_quoted_column_value || parse_unquoted_column_value if @may_quoted
    return parse_unquoted_column_value || parse_quoted_column_value
  end

  quoted = parse_quoted_column_value
  return parse_unquoted_column_value unless quoted

  @scanner.scan_all(@strip_value) if @strip_value
  trailing = parse_unquoted_column_value
  return quoted unless trailing

  if @double_quote_outside_quote
    trailing = trailing.gsub(@quote_character * 2, @quote_character)
    # %Q{""...} case
    return @quote_character + trailing if quoted.empty?
  end
  @quote_character + quoted + @quote_character + trailing
end
1122
+
1123
# Reads an unquoted field value. Returns nil when @unquoted_value does not
# match at the current position; otherwise sets @unquoted_column_value and
# returns the value.
#
# When @first_column_separators is set, text matching it is treated as part
# of the value unless every pattern in @column_ends matches in sequence at
# that position. That look-ahead is always rewound, so the column end
# itself is left in the scanner.
#
# Finally, @backslash_quote_character is replaced by @quote_character when
# @backslash_quote is set, and text matching @rstrip_value is removed when
# @rstrip_value is set.
def parse_unquoted_column_value
  field = @scanner.scan_all(@unquoted_value)
  return nil unless field

  @unquoted_column_value = true
  if @first_column_separators
    while true
      # Look ahead for a complete column end, then rewind unconditionally.
      @scanner.keep_start
      at_column_end = @column_ends.all? { |part| @scanner.scan(part) }
      @scanner.keep_back
      break if at_column_end

      partial_separator = @scanner.scan_all(@first_column_separators)
      break if partial_separator.nil?
      field << partial_separator

      continuation = @scanner.scan_all(@unquoted_value)
      break if continuation.nil?
      field << continuation
    end
  end

  field.gsub!(@backslash_quote_character, @quote_character) if @backslash_quote
  field.gsub!(@rstrip_value, "") if @rstrip_value
  field
end
1150
+
1151
# Reads a quoted field value. Returns nil when no run of quote characters
# (@quotes) starts at the current position; otherwise sets
# @quoted_column_value and returns the unescaped content.
#
# A run of quote characters is read in one go. An even-length opening run
# is a complete field made only of escaped quotes. An odd-length run opens
# the field; content is then gathered until an odd-length run closes it,
# with each pair of quotes contributing one literal quote character.
#
# When @backslash_quote is set, a match of @escaped_backslash followed by
# @escaped_quote yields @quote_character, and @escaped_backslash on its own
# yields @backslash_character.
#
# Raises MalformedCSVError ("Unclosed quoted field") when no closing run
# is found; the broken line is skipped first.
def parse_quoted_column_value
  opening = @scanner.scan_all(@quotes)
  return nil unless opening

  @quoted_column_value = true
  opening_size = opening.size
  # Even run: one pair opens and closes, the remaining pairs are literals.
  return opening[0, (opening_size - 2) / 2] if (opening_size % 2).zero?

  field = opening[0, opening_size / 2]
  closed = false
  until closed
    chunk = @scanner.scan_all(@quoted_value)
    field << chunk if chunk

    if @backslash_quote && @scanner.scan(@escaped_backslash)
      field << (@scanner.scan(@escaped_quote) ? @quote_character : @backslash_character)
      next
    end

    run = @scanner.scan_all(@quotes)
    unless run
      ignore_broken_line
      raise MalformedCSVError.new("Unclosed quoted field", @lineno)
    end

    run_size = run.size
    field << run[0, run_size / 2] unless run_size == 1
    # An odd run ends with the closing quote; an even run is all escapes.
    closed = (run_size % 2) == 1
  end
  field
end
1192
+
1193
# Consumes a column end at the current scanner position.
#
# Returns true when @column_end matches, or when @column_ends is set and
# every pattern in it matches in sequence. Returns false otherwise; a
# partial match of @column_ends is rewound so nothing is consumed.
def parse_column_end
  if @scanner.scan(@column_end)
    true
  elsif @column_ends
    @scanner.keep_start
    matched_all = @column_ends.all? { |part| @scanner.scan(part) }
    # Commit the consumed input on success, rewind it on failure.
    matched_all ? @scanner.keep_drop : @scanner.keep_back
    matched_all
  else
    false
  end
end
1206
+
1207
# Consumes a row end at the current scanner position.
#
# Returns true when @row_end matches, or when @row_ends is set and every
# pattern in it matches in sequence. Returns false otherwise; a partial
# match of @row_ends is rewound so nothing is consumed.
def parse_row_end
  if @scanner.scan(@row_end)
    true
  elsif @row_ends
    @scanner.keep_start
    matched_all = @row_ends.all? { |part| @scanner.scan(part) }
    # Commit the consumed input on success, rewind it on failure.
    matched_all ? @scanner.keep_drop : @scanner.keep_back
    matched_all
  else
    false
  end
end
1219
+
1220
# Strips +value+ in place according to @strip and returns it.
#
# * @strip is false or nil, or +value+ is nil: +value+ is returned as is.
# * @strip is a String: every leading and trailing occurrence of that
#   string is removed.
# * any other truthy @strip: String#strip! is applied.
def strip_value(value)
  return value if !@strip || value.nil?

  if @strip.is_a?(String)
    # The bang methods return nil once there is nothing left to remove.
    loop { break unless value.delete_prefix!(@strip) }
    loop { break unless value.delete_suffix!(@strip) }
  else
    value.strip!
  end
  value
end
1237
+
1238
# Skips the rest of the current line: first the input matching
# @not_line_end, then the input matching @line_end. Afterwards @lineno is
# incremented, and the new value is returned.
def ignore_broken_line
  [@not_line_end, @line_end].each { |pattern| @scanner.scan_all(pattern) }
  @lineno += 1
end
1243
+
1244
# Marks the start of a new row in the scanner.
#
# When @last_line is set it is cleared; otherwise the scanner's current
# keep mark is dropped. In both cases a new keep mark is then started, and
# the result of that call is returned.
def start_row
  pending_last_line = @last_line
  @last_line = nil if pending_last_line
  @scanner.keep_drop unless pending_last_line
  @scanner.keep_start
end
1252
+
1253
# Finishes one parsed row and yields it to the given block.
#
# @lineno is incremented first. Then:
#
# * without @use_headers, the fields are passed through @fields_converter;
# * with @use_headers and no @headers yet, this row becomes the headers
#   (via adjust_headers). Nothing is yielded unless @return_headers is set,
#   in which case a header Row is built from the raw fields;
# * with @use_headers and existing @headers, the converted fields are
#   wrapped in a Row.
#
# When @unconverted_fields is set and the result does not already respond
# to unconverted_fields, the raw fields are attached to it. Returns the
# value of the block, or nil when the header row is swallowed.
def emit_row(row, quoted_fields, &block)
  @lineno += 1
  raw_row = row

  result =
    if !@use_headers
      # convert fields, if needed...
      @fields_converter.convert(raw_row, nil, @lineno, quoted_fields)
    elsif @headers.nil?
      @headers = adjust_headers(raw_row, quoted_fields)
      return unless @return_headers
      Row.new(@headers, raw_row, true)
    else
      Row.new(@headers,
              @fields_converter.convert(raw_row, @headers, @lineno, quoted_fields))
    end

  # inject unconverted fields and accessor, if requested...
  if @unconverted_fields && !result.respond_to?(:unconverted_fields)
    add_unconverted_fields(result, raw_row)
  end

  yield(result)
end
1278
+
1279
# Attaches +fields+ to +row+ as its unconverted fields.
#
# A reader method <tt>unconverted_fields</tt> is defined on the singleton
# class of +row+ only, and the instance variable behind it is set to
# +fields+. Returns +row+.
def add_unconverted_fields(row, fields)
  row.singleton_class.class_eval { attr_reader :unconverted_fields }
  row.instance_variable_set(:@unconverted_fields, fields)
  row
end
1289
+ end
1290
+ end