smarter_csv 1.16.4 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +10 -1
  3. data/CHANGELOG.md +54 -0
  4. data/Gemfile +10 -5
  5. data/README.md +98 -14
  6. data/TO_DO.md +109 -0
  7. data/docs/_introduction.md +1 -0
  8. data/docs/bad_row_quarantine.md +2 -1
  9. data/docs/basic_read_api.md +6 -1
  10. data/docs/basic_write_api.md +30 -0
  11. data/docs/batch_processing.md +25 -0
  12. data/docs/column_selection.md +1 -0
  13. data/docs/data_transformations.md +1 -0
  14. data/docs/examples.md +126 -0
  15. data/docs/header_transformations.md +23 -0
  16. data/docs/header_validations.md +1 -0
  17. data/docs/history.md +1 -0
  18. data/docs/instrumentation.md +2 -1
  19. data/docs/migrating_from_csv.md +1 -0
  20. data/docs/options.md +20 -18
  21. data/docs/parsing_strategy.md +1 -0
  22. data/docs/real_world_csv.md +51 -1
  23. data/docs/releases/1.16.0/performance_notes.md +15 -15
  24. data/docs/releases/1.17.0/benchmarks.md +121 -0
  25. data/docs/releases/1.17.0/changes.md +161 -0
  26. data/docs/releases/1.17.0/performance_notes.md +126 -0
  27. data/docs/row_col_sep.md +21 -1
  28. data/docs/ruby_csv_pitfalls.md +1 -0
  29. data/docs/value_converters.md +24 -0
  30. data/docs/warnings.md +141 -0
  31. data/ext/smarter_csv/smarter_csv.c +98 -32
  32. data/images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg +106 -0
  33. data/images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg +181 -0
  34. data/images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg +179 -0
  35. data/lib/smarter_csv/auto_detection.rb +215 -30
  36. data/lib/smarter_csv/file_io.rb +2 -2
  37. data/lib/smarter_csv/hash_transformations.rb +29 -13
  38. data/lib/smarter_csv/parser.rb +42 -33
  39. data/lib/smarter_csv/peekable_io.rb +453 -0
  40. data/lib/smarter_csv/reader.rb +119 -23
  41. data/lib/smarter_csv/reader_options.rb +61 -1
  42. data/lib/smarter_csv/version.rb +1 -1
  43. data/lib/smarter_csv.rb +40 -12
  44. metadata +12 -5
  45. data/TO_DO_v2.md +0 -14
  46. data/ext/smarter_csv/Makefile +0 -270
@@ -30,8 +30,7 @@ module SmarterCSV
30
30
 
31
31
  if options[:acceleration] && has_acceleration
32
32
  # :nocov:
33
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
34
- [elements, elements.size]
33
+ parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
35
34
  # :nocov:
36
35
  else
37
36
  # puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
@@ -53,8 +52,7 @@ module SmarterCSV
53
52
  unless line.include?('\\')
54
53
  if options[:acceleration] && has_acceleration
55
54
  # :nocov:
56
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
57
- return [elements, elements.size]
55
+ return parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
58
56
  # :nocov:
59
57
  else
60
58
  return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
@@ -67,8 +65,7 @@ module SmarterCSV
67
65
  # Try backslash-escape interpretation first
68
66
  if options[:acceleration] && has_acceleration
69
67
  # :nocov:
70
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
71
- [elements, elements.size]
68
+ parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
72
69
  # :nocov:
73
70
  else
74
71
  parse_csv_line_ruby(line, @quote_escaping_backslash, header_size, has_quotes)
@@ -77,8 +74,7 @@ module SmarterCSV
77
74
  # Backslash raised a hard error — fall back to RFC 4180 immediately
78
75
  if options[:acceleration] && has_acceleration
79
76
  # :nocov:
80
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
81
- return [elements, elements.size]
77
+ return parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
82
78
  # :nocov:
83
79
  else
84
80
  return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
@@ -89,8 +85,7 @@ module SmarterCSV
89
85
  if result[1] == -1
90
86
  rfc_result = if options[:acceleration] && has_acceleration
91
87
  # :nocov:
92
- elements = parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
93
- [elements, elements.size]
88
+ parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
94
89
  # :nocov:
95
90
  else
96
91
  parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
@@ -205,8 +200,9 @@ module SmarterCSV
205
200
 
206
201
  remove_empty = options[:remove_empty_values]
207
202
  hash = {}
208
- fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
203
+ fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
209
204
  next if remove_empty && v.empty?
205
+
210
206
  hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = v
211
207
  end
212
208
 
@@ -280,11 +276,12 @@ module SmarterCSV
280
276
 
281
277
  col_sep = options[:col_sep]
282
278
  strip = options[:strip_whitespace]
279
+ quote = @quote_char
283
280
 
284
281
  # Ensure has_quotes is set correctly (callers via parse/parse_line_to_hash
285
282
  # always pass this, but direct callers may not)
286
283
  # rubocop:disable Style/OrAssignment
287
- has_quotes = line.include?(options[:quote_char]) unless has_quotes
284
+ has_quotes = line.include?(quote) unless has_quotes
288
285
  # rubocop:enable Style/OrAssignment
289
286
 
290
287
  # Optimization #7: when line has no quotes, use String#split (C-implemented)
@@ -299,14 +296,16 @@ module SmarterCSV
299
296
 
300
297
  elements = line.split(col_sep, -1) # -1 preserves trailing empty fields
301
298
  elements = elements[0, header_size] if header_size
302
- elements.map!(&:strip) if strip
299
+ # split returns fresh, mutable strings — strip them in place (strip! allocates
300
+ # nothing when there's no leading/trailing whitespace, which is the common case)
301
+ elements.each(&:strip!) if strip
303
302
  return [elements, elements.size]
304
303
  end
305
304
 
306
305
  # Quoted-line path: character-by-character parsing required
307
306
  line_size = line.size
308
307
  col_sep_size = col_sep.size
309
- quote = options[:quote_char]
308
+ doubled_quotes = @doubled_quote_chars
310
309
  elements = []
311
310
  start = 0
312
311
  i = 0
@@ -384,14 +383,20 @@ module SmarterCSV
384
383
  field_len = i - start
385
384
  if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(i - 1) == quote_byte
386
385
  field = line.byteslice(start + 1, field_len - 2)
387
- field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
388
- field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
389
- elements << field
386
+ # Tighter guard: only walk the field with gsub! when a doubled quote pair
387
+ # actually exists. include?(doubled_quotes) is a single memmem scan; cheaper
388
+ # than gsub!'s full walk when no doubled pair is present.
389
+ field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
390
+ field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
390
391
  else
391
392
  field = line.byteslice(start, field_len)
392
393
  field = cleanup_quotes(field, quote)
393
- elements << (strip ? field.strip : field) # cleanup_quotes may return frozen EMPTY_STRING
394
+ # cleanup_quotes may return the frozen EMPTY_STRING; all non-empty returns are
395
+ # fresh mutable byteslices/substrings. So strip! in place (no allocation), guarded
396
+ # by !field.empty? to skip the frozen-empty case (which needs no stripping anyway).
397
+ field.strip! if strip && !field.empty?
394
398
  end
399
+ elements << field
395
400
  i += 1
396
401
  start = i
397
402
  backslash_count = 0
@@ -450,14 +455,16 @@ module SmarterCSV
450
455
  field_len = bytesize - start
451
456
  if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(bytesize - 1) == quote_byte
452
457
  field = line.byteslice(start + 1, field_len - 2)
453
- field.gsub!(doubled_quote(quote), quote)
458
+ field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
454
459
  field.strip! if strip
455
- elements << field
456
460
  else
457
461
  field = line.byteslice(start, field_len)
458
462
  field = cleanup_quotes(field, quote)
459
- elements << (strip ? field.strip : field)
463
+ # cleanup_quotes may return the frozen EMPTY_STRING; non-empty returns are fresh
464
+ # mutable byteslices — strip! in place, guarded by !field.empty? for the frozen case.
465
+ field.strip! if strip && !field.empty?
460
466
  end
467
+ elements << field
461
468
  end
462
469
  else
463
470
  # Multi-char col_sep: use substring comparison (original path)
@@ -489,14 +496,16 @@ module SmarterCSV
489
496
  field_len = i - start
490
497
  if field_len >= 2 && line[start] == quote && line[i - 1] == quote
491
498
  field = line[start + 1...i - 1]
492
- field.gsub!(doubled_quote(quote), quote) if field.include?(quote)
499
+ field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
493
500
  field.strip! if strip
494
- elements << field
495
501
  else
496
502
  field = line[start...i]
497
503
  field = cleanup_quotes(field, quote)
498
- elements << (strip ? field.strip : field)
504
+ # cleanup_quotes may return the frozen EMPTY_STRING; non-empty returns are fresh
505
+ # mutable substrings — strip! in place, guarded by !field.empty? for the frozen case.
506
+ field.strip! if strip && !field.empty?
499
507
  end
508
+ elements << field
500
509
  i += col_sep_size
501
510
  start = i
502
511
  backslash_count = 0
@@ -553,14 +562,16 @@ module SmarterCSV
553
562
  field_len = line_size - start
554
563
  if field_len >= 2 && line[start] == quote && line[line_size - 1] == quote
555
564
  field = line[start + 1..line_size - 2]
556
- field.gsub!(doubled_quote(quote), quote)
565
+ field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
557
566
  field.strip! if strip
558
- elements << field
559
567
  else
560
568
  field = line[start..-1]
561
569
  field = cleanup_quotes(field, quote)
562
- elements << (strip ? field.strip : field)
570
+ # cleanup_quotes may return the frozen EMPTY_STRING; non-empty returns are fresh
571
+ # mutable substrings — strip! in place, guarded by !field.empty? for the frozen case.
572
+ field.strip! if strip && !field.empty?
563
573
  end
574
+ elements << field
564
575
  end
565
576
  end
566
577
 
@@ -571,19 +582,17 @@ module SmarterCSV
571
582
  return nil if field.nil?
572
583
  return EMPTY_STRING if field.empty?
573
584
 
585
+ doubled_quotes = @doubled_quote_chars
586
+
574
587
  # Remove surrounding quotes if present
575
588
  if field.start_with?(quote) && field.end_with?(quote)
576
589
  field = field[1..-2]
577
590
  end
578
591
 
579
- # Replace double quotes with a single quote
580
- field.gsub!(doubled_quote(quote), quote)
592
+ # Replace double quotes with a single quote (skip the gsub walk when no doubled pair exists)
593
+ field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
581
594
 
582
595
  field
583
596
  end
584
-
585
- def doubled_quote(quote)
586
- @doubled_quote ||= (quote * 2).to_s.freeze
587
- end
588
597
  end
589
598
  end
@@ -0,0 +1,453 @@
1
+ # frozen_string_literal: true
2
+
3
+ module SmarterCSV
4
+ # PeekableIO wraps any IO-like object and buffers the first chunk of bytes
5
+ # so that auto-detection (row_sep, col_sep) can call rewind without requiring
6
+ # the underlying source to be seekable.
7
+ #
8
+ # Works transparently with files, StringIO, pipes, STDIN, Zlib streams, and
9
+ # any other IO-like object that responds to read.
10
+ #
11
+ # Lifecycle:
12
+ # 1. peek(n) — reads up to n bytes from the underlying IO into the buffer
13
+ # 2. rewind — resets @peek_pos to 0 (replays buffer, never seeks underlying IO)
14
+ # 3. gets/read/each_char — drain the buffer first, then read from @io in
15
+ # @buffer_size chunks, appending each to @peek_buf so that a subsequent
16
+ # rewind can replay the full stream from position 0.
17
+ # 4. rewind — resets @peek_pos to 0; does NOT freeze. Detection may rewind
18
+ # multiple times (once per pass) and must keep accumulating between passes.
19
+ # 5. freeze_buffer! — called once after all detection passes are done. After
20
+ # this point reads beyond the buffer delegate directly to @io without growing
21
+ # @peek_buf. @peek_buf is kept alive (never nilled) so rewind can replay.
22
+ #
23
+ class PeekableIO
24
+ # 16KB is enough for separator detection on any real-world CSV header.
25
+ # Matches one EBS gp3 I/O block and one Apple Silicon VM page exactly.
26
+ DEFAULT_PEEK_SIZE = 16_384
27
+
28
+ # Lower bound for a sane peek buffer chunk size. Below this the buffer is
29
+ # too small to be useful even on local SSD (one VM page on x86 is 4 KB).
30
+ MIN_BUFFER_SIZE = 4_096
31
+
32
+ # Upper bound for the peek buffer chunk size. Equal to
33
+ # AutoDetection::MAX_AUTO_ROW_SEP_CHARS — beyond this, bytes are unused by
34
+ # auto-detection and only delay parse start by pre-loading bytes that
35
+ # would have been read during parsing anyway.
36
+ MAX_BUFFER_SIZE = SmarterCSV::AutoDetection::MAX_AUTO_ROW_SEP_CHARS
37
+
38
+ def initialize(io, options, buffer_size: DEFAULT_PEEK_SIZE)
39
+ @io = io
40
+ @buffer_size = buffer_size
41
+ @options = options # live reference — options[:row_sep] is the default sep for gets/readline
42
+ @peek_buf = nil # nil = buffer not yet filled
43
+ @peek_pos = 0
44
+ @emit_encoding = nil # encoding of strings returned by @io.read — set on first peek
45
+ @buffer_frozen = false # true after freeze_buffer!: buffer stops growing, detection phase is over
46
+ end
47
+
48
+ # Read up to n bytes into the buffer and return them.
49
+ # Called once before auto-detection begins.
50
+ #
51
+ # Works for any IO source — files, StringIO, pipes, Zlib streams, etc.
52
+ # The BOM (if any) is stripped immediately so all downstream code is clean.
53
+ # For transcoded streams (e.g. r:iso-8859-1:utf-8), the raw bytes are
54
+ # converted to the internal encoding in-place; @emit_encoding records the
55
+ # final encoding so read-out can re-tag strings correctly.
56
+ def peek(n = @buffer_size)
57
+ # Idempotent: a second peek call returns the existing buffer without reading
58
+ # more from @io. Calling peek twice would otherwise overwrite the buffer and
59
+ # silently drop any unconsumed bytes from the first peek.
60
+ return @peek_buf.dup.force_encoding(@emit_encoding || Encoding::ASCII_8BIT) if @peek_buf
61
+
62
+ # read(n) fetches raw bytes as ASCII-8BIT regardless of the file's declared
63
+ # encoding — this is what we want because it works even for files that begin
64
+ # with non-UTF-8 BOMs (\xFF\xFE etc.) that would cause gets(nil,n) on a
65
+ # r:utf-8 handle to stop after the first invalid byte.
66
+ chunk = @io.read(n)
67
+ if chunk && !chunk.empty?
68
+ raw = strip_bom(chunk.b)
69
+ # The buffer always holds raw bytes in the external encoding (ASCII-8BIT tagged).
70
+ # Transcoding (ext → int) is the caller's responsibility — it happens externally
71
+ # when consuming data, not here during storage.
72
+ @emit_encoding = external_encoding
73
+ # Ensure the buffer ends on a complete codepoint boundary.
74
+ # align_to_char_boundary reads single bytes from @io until the buffer is valid
75
+ # in @emit_encoding, guarded by MAX_ALIGN_BYTES to avoid infinite loops on
76
+ # malformed input. Skipped when encoding is unknown (nil) or single-byte.
77
+ raw = align_to_char_boundary(raw) if @emit_encoding
78
+ @peek_buf = raw
79
+ @peek_pos = 0
80
+ end
81
+ # Return the full buffered content (BOM-stripped + char-aligned) rather than
82
+ # the original chunk so callers see what was actually consumed.
83
+ @peek_buf ? @peek_buf.dup.force_encoding(@emit_encoding || Encoding::ASCII_8BIT) : chunk
84
+ end
85
+
86
+ # Returns the next line up to and including sep.
87
+ # Hot path: @peek_buf is nil (never peeked) or exhausted — delegate directly to @io.
88
+ # The buffer is never nilled out by read methods so that rewind always works during
89
+ # the auto-detection phase. @peek_pos advancing past bytesize is the exhaustion signal.
90
+ #
91
+ # NOTE: sep must be a String. gets(nil) — which reads until EOF in Ruby IO — is not
92
+ # supported; smarter_csv always passes an explicit row separator string.
93
+ # The default is @options[:row_sep] (resolved after auto-detection), never $/.
94
+ #
95
+ # NOTE: we don't support **kwargs because smarter_csv does not use them.
96
+ #
97
+ # NOTE: the limit parameter (Ruby IO#gets(sep, limit)) is intentionally omitted.
98
+ # PeekableIO is internal to SmarterCSV and no caller passes a limit. If this class
99
+ # were ever extracted into a stand-alone library, limit support would be required
100
+ # to fully comply with the IO#gets contract.
101
+ def gets(sep = @options[:row_sep])
102
+ raise ArgumentError, "PeekableIO#gets does not support gets(nil) — pass an explicit separator string" if sep.nil?
103
+ return @io.gets(sep) if @peek_buf.nil?
104
+
105
+ # Buffer frozen (post auto-detection): delegate once buffer is exhausted — no more accumulation.
106
+ # Must still apply encoding tagging and maybe_transcode so callers see consistent encodings.
107
+ if @buffer_frozen && buffer_exhausted?
108
+ line = @io.gets(sep)
109
+ return nil if line.nil?
110
+
111
+ int = internal_encoding
112
+ # Real IO objects opened with a transcoding pair (e.g. r:iso-8859-1:utf-8) already transcode
113
+ # on read — the returned string is already in the internal encoding. Return it as-is.
114
+ # For wrapper objects (e.g. EncodedBytesIO) that declare encodings but don't transcode on
115
+ # read, the returned string will still be in ASCII-8BIT — fall through to tag + transcode.
116
+ return line if int && line.encoding == int
117
+
118
+ out_enc = @emit_encoding || external_encoding
119
+ # Needed for the single-encoding case (int == nil) when the source declares an
120
+ # external_encoding but returns ASCII-8BIT from #gets (wrapper IOs: EncodedBytesIO,
121
+ # pipes, STDIN, decompression streams). maybe_transcode is a no-op when int is nil,
122
+ # so this is the only step that tags the line in the correct external encoding —
123
+ # otherwise reader.rb#enforce_utf8_encoding would misread the bytes as UTF-8.
124
+ # Redundant on the transcoding-pair path (maybe_transcode force_encodes there too),
125
+ # but the guard keeps it cheap. Covered by peekable_io_spec.rb frozen-exhausted
126
+ # single-encoding test.
127
+ line = line.force_encoding(out_enc) if out_enc && line.encoding != out_enc
128
+ return maybe_transcode(line)
129
+ end
130
+
131
+ # Compute the output encoding once — used by both the detection and frozen paths.
132
+ # For sources with no declared encoding (nil) we fall back to ASCII_8BIT rather
133
+ # than assuming UTF-8 — the caller gets the raw bytes and can re-tag as needed.
134
+ out_enc = @emit_encoding || external_encoding
135
+
136
+ # ---------------------------------------------------------------------------
137
+ # Auto-Detection phase (buffer not yet frozen):
138
+ # Extend the buffer in @buffer_size chunks until the separator is found
139
+ # or EOF. No straddle detection needed — the extension absorbs any boundary.
140
+ # @peek_pos never advances until we have a complete line, so the search always
141
+ # covers the full unread portion of the ever-growing buffer.
142
+ # ---------------------------------------------------------------------------
143
+ unless @buffer_frozen
144
+ loop do
145
+ rest = @peek_buf.byteslice(@peek_pos..-1)
146
+ rest.force_encoding(out_enc || Encoding::ASCII_8BIT)
147
+ # NOTE: rest.b.index(sep.b) is the Ruby 2.6 compatible equivalent of rest.byteindex(sep)
148
+ idx = rest.b.index(sep.b)
149
+ if idx
150
+ line = rest.byteslice(0, idx + sep.bytesize)
151
+ @peek_pos += line.bytesize
152
+ return maybe_transcode(line)
153
+ end
154
+ # Separator not found — fetch another chunk and search again.
155
+ break unless extend_buffer!
156
+ end
157
+ # EOF: return remaining bytes as final line, or nil if nothing left.
158
+ rest = @peek_buf.byteslice(@peek_pos..-1)
159
+ return nil if rest.empty?
160
+
161
+ @peek_pos = @peek_buf.bytesize
162
+ return maybe_transcode(rest.force_encoding(out_enc || Encoding::ASCII_8BIT))
163
+ end
164
+
165
+ # ---------------------------------------------------------------------------
166
+ # Frozen phase (processing): buffer has fixed content.
167
+ # Search within the buffer; handle the separator straddling the buffer/@io
168
+ # boundary for multi-byte separators (e.g. \r\n split across the edge).
169
+ # ---------------------------------------------------------------------------
170
+ rest = @peek_buf.byteslice(@peek_pos..-1)
171
+ rest.force_encoding(out_enc || Encoding::ASCII_8BIT)
172
+ # Use byteindex + byteslice — the buffer stores raw bytes and @peek_pos is a
173
+ # byte offset. Separators are always ASCII, so byteindex is correct regardless
174
+ # of the encoding tag.
175
+ # NOTE: rest.b.index(sep.b) is the Ruby 2.6 compatible equivalent of rest.byteindex(sep)
176
+ idx = rest.b.index(sep.b)
177
+ if idx
178
+ line = rest.byteslice(0, idx + sep.bytesize)
179
+ @peek_pos += line.bytesize
180
+ maybe_transcode(line)
181
+ else
182
+ @peek_pos = @peek_buf.bytesize # mark exhausted, keep buffer alive for rewind
183
+
184
+ # Detect multi-byte separator (e.g. \r\n) split at the buffer boundary —
185
+ # \r is the last byte of @peek_buf, \n is the first byte of @io.
186
+ # byteindex found nothing because the separator straddles the boundary.
187
+ # Check if the buffer tail matches any prefix of sep and read ahead to confirm.
188
+ # For non-seekable IO: on a non-match the already-read bytes are prepended
189
+ # to the remainder so no data is lost.
190
+ if sep.bytesize > 1
191
+ (sep.bytesize - 1).downto(1) do |prefix_len|
192
+ next unless rest.b.end_with?(sep.b.byteslice(0, prefix_len))
193
+
194
+ tail_needed = sep.b.byteslice(prefix_len..-1)
195
+ peeked = @io.read(tail_needed.bytesize)
196
+
197
+ if peeked.nil?
198
+ combined = rest.b # EOF — nothing new to read
199
+ elsif peeked.b == tail_needed
200
+ combined = rest.b + tail_needed # separator confirmed
201
+ else
202
+ # peeked bytes are content, not separator completion.
203
+ # But peeked itself may end with a prefix of sep (e.g. peeked="\r"
204
+ # when sep="\r\n"), meaning @io could begin with sep's tail ("\n").
205
+ # Calling @io.gets(sep) from here would over-read past that boundary.
206
+ # Instead, recursively check for a nested straddle in peeked.
207
+ content = peeked.b
208
+ nested_handled = false
209
+ (sep.bytesize - 1).downto(1) do |n|
210
+ next unless content.end_with?(sep.b.byteslice(0, n))
211
+
212
+ confirmed_tail = @io.read(sep.bytesize - n)
213
+ if confirmed_tail.nil?
214
+ # EOF — nothing more to read; content stays as-is
215
+ elsif confirmed_tail.b == sep.b.byteslice(n..-1)
216
+ content += confirmed_tail.b # separator confirmed
217
+ else
218
+ remainder = @io.gets(sep)
219
+ content = content + confirmed_tail.b + (remainder ? remainder.b : ''.b)
220
+ end
221
+ nested_handled = true
222
+ break
223
+ end
224
+ unless nested_handled
225
+ remainder = @io.gets(sep)
226
+ content += (remainder ? remainder.b : ''.b)
227
+ end
228
+ combined = rest.b + content
229
+ end
230
+ return maybe_transcode(out_enc ? combined.force_encoding(out_enc) : combined)
231
+ end
232
+ end
233
+
234
+ remainder = @io.gets(sep)
235
+ combined = rest.b + (remainder ? remainder.b : ''.b)
236
+ maybe_transcode(out_enc ? combined.force_encoding(out_enc) : combined)
237
+ end
238
+ end
239
+
240
+ # Unlike gets, readline raises EOFError at end of file rather than returning nil.
241
+ # Defaults to @options[:row_sep], never $/.
242
+ def readline(sep = @options[:row_sep])
243
+ line = gets(sep)
244
+ raise EOFError, "end of file reached" if line.nil?
245
+
246
+ line
247
+ end
248
+
249
+ def read(n = nil)
250
+ # Delegate to @io only when (a) we never peeked, or (b) the buffer is
251
+ # frozen and fully replayed. During auto-detection (not frozen) the
252
+ # buffer must be extended even when @peek_pos has caught up to its end,
253
+ # otherwise bytes read from @io are not appended to @peek_buf and a
254
+ # subsequent rewind_buffer would lose them.
255
+ return @io.read(n) if @peek_buf.nil?
256
+ return @io.read(n) if @buffer_frozen && buffer_exhausted?
257
+
258
+ buffered = @peek_buf.byteslice(@peek_pos..-1)
259
+ out_enc = @emit_encoding || Encoding::ASCII_8BIT
260
+
261
+ # All paths use binary concatenation then re-tag to avoid encoding mismatches.
262
+ if n.nil?
263
+ @peek_pos = @peek_buf.bytesize # consume all buffered bytes
264
+ rest_from_io = @io.read
265
+ appended = rest_from_io ? rest_from_io.b : ''.b
266
+ @peek_buf << appended unless @buffer_frozen
267
+ combined = buffered + appended
268
+ maybe_transcode(combined.force_encoding(out_enc))
269
+ elsif n == 0
270
+ String.new.force_encoding(out_enc) # read(0) must not advance @peek_pos
271
+ elsif buffered.bytesize >= n
272
+ @peek_pos += n # advance exactly n, not the whole buffer
273
+ maybe_transcode(buffered.byteslice(0, n).force_encoding(out_enc))
274
+ else
275
+ @peek_pos = @peek_buf.bytesize # consume all buffered bytes
276
+ rest_from_io = @io.read(n - buffered.bytesize)
277
+ appended = rest_from_io ? rest_from_io.b : ''.b
278
+ @peek_buf << appended unless @buffer_frozen
279
+ combined = buffered + appended
280
+ maybe_transcode(combined.force_encoding(out_enc))
281
+ end
282
+ end
283
+
284
+ def each_char(&block)
285
+ return enum_for(:each_char) unless block_given?
286
+ # Same guard as read(): only delegate when never peeked, or when frozen
287
+ # and fully replayed. Otherwise we must extend the buffer so rewind_buffer
288
+ # can replay the bytes during the parsing phase.
289
+ return @io.each_char(&block) if @peek_buf.nil?
290
+ return @io.each_char(&block) if @buffer_frozen && buffer_exhausted?
291
+
292
+ rest = @peek_buf.byteslice(@peek_pos..-1)
293
+ rest.force_encoding(@emit_encoding || external_encoding || Encoding::ASCII_8BIT)
294
+ rest = maybe_transcode(rest) || rest
295
+ rest.each_char(&block)
296
+ @peek_pos = @peek_buf.bytesize # mark exhausted, keep buffer alive for rewind
297
+
298
+ # Read remaining @io in chunks — avoids O(n²) string concatenation from
299
+ # appending one byte at a time. Row-sep detection only needs ASCII chars
300
+ # (\n, \r) so codepoint boundaries at chunk edges are inconsequential.
301
+ until @io.eof?
302
+ chunk = @io.read(@buffer_size)
303
+ break unless chunk
304
+
305
+ @peek_buf << chunk.b unless @buffer_frozen
306
+ chunk.force_encoding(@emit_encoding || external_encoding || Encoding::ASCII_8BIT)
307
+ (maybe_transcode(chunk) || chunk).each_char(&block)
308
+ end
309
+ end
310
+
311
+ def eof?
312
+ return @io.eof? if buffer_exhausted?
313
+
314
+ false # still have unread bytes in peek buffer
315
+ end
316
+
317
+ # Resets to the start of the peek buffer — never touches the underlying IO.
318
+ # Since auto-detection happens at the very beginning, the buffer IS byte 0.
319
+ # Works identically for files, StringIO, pipes, and any other source.
320
+ #
321
+ # Does NOT freeze the buffer — detection may call rewind_buffer multiple times
322
+ # (once per pass) and must continue accumulating bytes beyond the initial
323
+ # peek chunk. Call freeze_buffer! explicitly when detection is complete.
324
+ def rewind_buffer
325
+ @peek_pos = 0
326
+ end
327
+
328
+ def rewind
329
+ raise NoMethodError, "use rewind_buffer instead of rewind — PeekableIO does not seek the underlying IO"
330
+ end
331
+
332
+ # Freeze the buffer: signals that auto-detection is complete and normal
333
+ # processing is beginning. After this point, reads that go beyond the
334
+ # buffered bytes delegate directly to @io without growing @peek_buf further.
335
+ def freeze_buffer!
336
+ @buffer_frozen = true
337
+ end
338
+
339
+ def close
340
+ @io.close if @io.respond_to?(:close)
341
+ end
342
+
343
+ def external_encoding
344
+ @io.respond_to?(:external_encoding) ? @io.external_encoding : nil
345
+ end
346
+
347
+ def internal_encoding
348
+ @io.respond_to?(:internal_encoding) ? @io.internal_encoding : nil
349
+ end
350
+
351
+ private
352
+
353
+ def buffer_exhausted?
354
+ @peek_buf.nil? || @peek_pos >= @peek_buf.bytesize
355
+ end
356
+
357
+ # Append one @buffer_size chunk from @io to @peek_buf.
358
+ # Returns true if bytes were added, false if @io was already at EOF.
359
+ def extend_buffer!
360
+ chunk = @io.read(@buffer_size)
361
+ return false unless chunk && !chunk.empty?
362
+
363
+ @peek_buf << chunk.b
364
+ true
365
+ end
366
+
367
+ # Strip any BOM from the start of the raw (ASCII_8BIT-tagged) buffer bytes.
368
+ # Doing this once here means all downstream code — auto-detection, the C
369
+ # extension parser, remove_bom in file_io.rb — never sees BOM bytes.
370
+ # Patterns ordered longest-first so UTF-32 is matched before UTF-16.
371
+ BOM_PATTERNS = [
372
+ "\x00\x00\xFE\xFF".b, # UTF-32 BE
373
+ "\xFF\xFE\x00\x00".b, # UTF-32 LE
374
+ "\xEF\xBB\xBF".b, # UTF-8
375
+ "\xFE\xFF".b, # UTF-16 BE
376
+ "\xFF\xFE".b, # UTF-16 LE
377
+ ].freeze
378
+
379
+ def strip_bom(raw)
380
+ BOM_PATTERNS.each do |bom|
381
+ return raw.byteslice(bom.bytesize..-1) if raw.start_with?(bom)
382
+ end
383
+ raw
384
+ end
385
+
386
+ # Read up to MAX_ALIGN_BYTES extra bytes from @io until the buffer ends on a
387
+ # complete codepoint boundary in @emit_encoding.
388
+ #
389
+ # For single-byte encodings (ISO-8859-1, ASCII) valid_encoding? is true
390
+ # immediately, so no extra reads occur.
391
+ #
392
+ # Bounded to MAX_ALIGN_BYTES (4) to guard against malformed files: a corrupt
393
+ # byte anywhere in the first peek chunk makes valid_encoding? permanently false.
394
+ # Without the cap the loop would read the entire remaining file one byte at a
395
+ # time before giving up. 4 bytes covers the largest codepoint in any Ruby-supported
396
+ # variable-width encoding (UTF-8 max 4, UTF-32 4, UTF-16 surrogate pairs 4,
397
+ # EUC-JP 3, Shift-JIS 2, GB18030 4).
398
+ MAX_ALIGN_BYTES = 4
399
+
400
+ def align_to_char_boundary(raw)
401
+ MAX_ALIGN_BYTES.times do
402
+ probe = raw.dup.force_encoding(@emit_encoding)
403
+ return raw if probe.valid_encoding?
404
+
405
+ extra = @io.read(1)
406
+ break unless extra # EOF mid-codepoint — malformed input, stop here
407
+
408
+ raw += extra.b
409
+ end
410
+ raw
411
+ end
412
+
413
+ # Apply external→internal transcoding to a string returned from the buffer.
414
+ # The buffer stores raw bytes in the external encoding (@emit_encoding).
415
+ # When the underlying IO was opened with a transcoding pair (e.g. r:iso-8859-1:utf-8),
416
+ # callers expect strings in the internal encoding — the same as IO#gets returns.
417
+ # No-op when there is no transcoding pair or no declared encoding.
418
+ def maybe_transcode(str)
419
+ return str unless str
420
+
421
+ int = internal_encoding
422
+ return str unless int && @emit_encoding && int != @emit_encoding
423
+
424
+ str.force_encoding(@emit_encoding).encode(int, invalid: :replace, undef: :replace)
425
+ end
426
+
427
+ # Allow-list of @io methods safe to expose via method_missing.
428
+ #
429
+ # PeekableIO is an internal SmarterCSV utility; reader.rb is its only caller.
430
+ # Every method SmarterCSV uses on a PeekableIO is either defined explicitly on
431
+ # this class (peek, gets, read, each_char, readline, eof?, close, rewind_buffer,
432
+ # freeze_buffer!, external_encoding, internal_encoding) or is on this list.
433
+ #
434
+ # Any other call — seek, pos=, lineno=, ungetc, ungetbyte, readpartial, sysread,
435
+ # readlines, each_line, etc. — raises NoMethodError. That surfaces a future
436
+ # maintainer's mistake loudly rather than silently desyncing @peek_pos from @io
437
+ # and breaking replay-after-rewind_buffer.
438
+ #
439
+ # Extending this list is a deliberate contract change: add a method only when a
440
+ # real caller inside SmarterCSV needs it.
441
+ ALLOWED_METHODS = %i[encoding].freeze
442
+
443
+ def respond_to_missing?(method, include_private = false)
444
+ (ALLOWED_METHODS.include?(method) && @io.respond_to?(method, include_private)) || super
445
+ end
446
+
447
+ def method_missing(method, *args, &block)
448
+ return super unless ALLOWED_METHODS.include?(method) && @io.respond_to?(method)
449
+
450
+ @io.send(method, *args, &block)
451
+ end
452
+ end
453
+ end