smarter_csv 1.16.3 → 1.17.0.pre5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,8 +8,6 @@ module SmarterCSV
8
8
  # Otherwise guesses column separator from contents.
9
9
  # Raises exception if none is found.
10
10
  def guess_column_separator(filehandle, options)
11
- skip_lines(filehandle, options)
12
-
13
11
  delimiters = [',', "\t", ';', ':', '|']
14
12
 
15
13
  line = nil
@@ -29,11 +27,13 @@ module SmarterCSV
29
27
  candidates[d] += non_quoted_text.scan(d).count
30
28
  end
31
29
  end
32
- rewind(filehandle)
30
+ # No lines were read at all — empty file or stream.
31
+ # Return a safe default and let process_headers raise EmptyFileError.
32
+ return ',' if line.nil?
33
33
 
34
34
  if candidates.values.max == 0
35
35
  # if the header only contains word characters and whitespace, assume comma separator
36
- return ',' if line && line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
36
+ return ',' if line.chomp(options[:row_sep]) =~ /^[\w\s]+$/
37
37
 
38
38
  raise SmarterCSV::NoColSepDetected
39
39
  end
@@ -41,38 +41,79 @@ module SmarterCSV
41
41
  candidates.key(candidates.values.max)
42
42
  end
43
43
 
44
- # limitation: this currently reads the whole file in before making a decision
44
# Hard cap on total bytes scanned looking for a decisive row separator.
# Kept as a constant for now; we can promote it to an option if a real
# use case appears.
MAX_AUTO_SCAN = 65_536

# Guess the row separator ("\n", "\r\n", or "\r") by counting occurrences
# in the unquoted portion of the stream.
#
# Reads chunks of options[:auto_row_sep_chars] bytes (at least 64) and grows
# the scan buffer up to MAX_AUTO_SCAN bytes while no candidate has a clear
# majority (count > sum of the others).
#
# When the buffer ends exactly on "\r", one extra byte is read so a lone
# "\r" is never mistaken for the first half of "\r\n".
#
# The scan works on raw bytes: every chunk is appended as binary. The three
# candidates are plain ASCII, so byte-level counting is sufficient, and it
# keeps the scan from raising when a chunk boundary falls inside a multibyte
# character or when chunks arrive with different encoding tags.
#
# filehandle - any object responding to read(n); it is NOT rewound here.
# options    - Hash; reads :quote_char, :auto_row_sep_chars and :verbose.
#              A nil or empty :quote_char disables quoted-section stripping.
#
# Returns the winning separator String. Falls back to "\n" (and records a
# warning unless verbose: :quiet) when:
#   * no known separator is found within MAX_AUTO_SCAN bytes — e.g. a file
#     that uses an exotic separator like "\u2028"; or
#   * a tie between candidates persists across MAX_AUTO_SCAN bytes.
# An empty stream also returns "\n", without a warning.
#
# The fallback preserves 14 years of permissive behavior; the warning lets
# infrastructure code (logs, captured stderr) surface the ambiguity.
def guess_line_ending(filehandle, options)
  # Pattern for one quoted section (opening quote up to the next quote).
  # Built from binary bytes so it can be applied to the binary scan buffer;
  # MULTILINE lets "." span the line breaks we want to ignore inside quotes.
  quote = options[:quote_char].to_s.b
  quoted_re = nil
  unless quote.empty?
    q = Regexp.escape(quote)
    quoted_re = Regexp.new("#{q}.*?#{q}".b, Regexp::MULTILINE)
  end

  chunk_size = [options[:auto_row_sep_chars].to_i, 64].max
  buf = String.new # binary accumulator for everything read so far
  crlf = lf = cr = 0

  loop do
    part = filehandle.read(chunk_size)
    break if part.nil? || part.empty?

    buf << part.b

    if buf.end_with?("\r")
      extra = filehandle.read(1)
      buf << extra.b if extra
    end

    # Recount over the whole buffer: a quoted section may only be closed by
    # a later chunk, which changes what counts as unquoted text.
    unquoted = quoted_re ? buf.gsub(quoted_re, '') : buf
    crlf = unquoted.scan("\r\n").size
    lf = unquoted.count("\n") - crlf
    cr = unquoted.count("\r") - crlf

    # Clear majority: winner strictly greater than the sum of the others.
    return "\r\n" if crlf > lf + cr
    return "\n" if lf > crlf + cr
    return "\r" if cr > crlf + lf

    break if buf.bytesize >= MAX_AUTO_SCAN
  end

  # Empty stream: return a harmless fallback; downstream raises EmptyFileError.
  return "\n" if buf.empty?

  unless options[:verbose] == :quiet
    if crlf == 0 && lf == 0 && cr == 0
      record_warning(type: :row_sep, code: :no_row_sep_found, severity: :error) do
        "no row separator found in first #{buf.bytesize} bytes; " \
          "defaulting to \"\\n\". Pass row_sep: explicitly if this is wrong."
      end
    else
      record_warning(type: :row_sep, code: :no_clear_row_sep, severity: :error) do
        "no clear row separator in first #{buf.bytesize} bytes " \
          "(saw #{lf}×\"\\n\", #{crlf}×\"\\r\\n\", #{cr}×\"\\r\"); defaulting to \"\\n\". " \
          "Pass row_sep: explicitly if this is wrong."
      end
    end
  end
  "\n"
end
77
118
  end
78
119
  end
@@ -20,10 +20,10 @@ module SmarterCSV
20
20
  end
21
21
  end
22
22
 
23
# Zeroes both line counters, then asks the handle to replay its peek buffer
# from the start. Returns whatever filehandle.rewind_buffer returns.
#
# filehandle - object responding to rewind_buffer (PeekableIO#rewind_buffer,
#              which is not the same thing as IO#rewind).
def rewind_buffer(filehandle)
  @file_line_count = @csv_line_count = 0
  filehandle.rewind_buffer
end
28
28
 
29
29
  private
@@ -0,0 +1,432 @@
1
# frozen_string_literal: true

module SmarterCSV
  # PeekableIO wraps any IO-like object and buffers the first chunk of bytes
  # so that auto-detection (row_sep, col_sep) can replay the start of the
  # stream without requiring the underlying source to be seekable.
  #
  # Works transparently with files, StringIO, pipes, STDIN, Zlib streams, and
  # any other IO-like object that responds to read.
  #
  # Lifecycle:
  #   1. peek(n) — reads up to n bytes from the underlying IO into the buffer.
  #   2. gets/read/each_char — drain the buffer first, then read from @io.
  #      While the buffer is not frozen every byte taken from @io is appended
  #      to @peek_buf and @peek_pos is moved past it, so the read position and
  #      the buffer never disagree and a later rewind_buffer replays everything
  #      consumed so far.
  #   3. rewind_buffer — resets @peek_pos to 0; never seeks the underlying IO
  #      and does NOT freeze. Detection may rewind several times (once per
  #      pass) and keeps accumulating between passes.
  #   4. freeze_buffer! — called once after all detection passes are done.
  #      After this point reads beyond the buffer delegate directly to @io
  #      without growing @peek_buf. @peek_buf is kept alive (never nilled).
  #
  # The buffer always holds raw bytes (ASCII-8BIT tagged) in the external
  # encoding; strings are tagged / transcoded on the way out.
  class PeekableIO
    # 16KB is enough for separator detection on any real-world CSV header.
    DEFAULT_PEEK_SIZE = 16_384

    # BOMs stripped from the start of the first chunk.
    # Ordered longest-first so UTF-32 is matched before UTF-16.
    BOM_PATTERNS = [
      "\x00\x00\xFE\xFF".b, # UTF-32 BE
      "\xFF\xFE\x00\x00".b, # UTF-32 LE
      "\xEF\xBB\xBF".b,     # UTF-8
      "\xFE\xFF".b,         # UTF-16 BE
      "\xFF\xFE".b,         # UTF-16 LE
    ].freeze

    # Upper bound on the single-byte reads align_to_char_boundary may issue.
    # A corrupt byte anywhere in the first chunk makes valid_encoding?
    # permanently false; without the cap the loop would read the remaining
    # file one byte at a time. 4 bytes covers the largest codepoint in the
    # variable-width encodings we expect (UTF-8, UTF-16 surrogate pairs,
    # UTF-32, EUC-JP, Shift-JIS, GB18030).
    MAX_ALIGN_BYTES = 4

    # Allow-list of @io methods exposed via method_missing. Anything else
    # (seek, pos=, ungetc, readlines, ...) raises NoMethodError so that a call
    # which could desync @peek_pos from @io fails loudly. Extend it only when
    # a real caller inside SmarterCSV needs the method.
    ALLOWED_METHODS = %i[encoding].freeze

    # io          - IO-like source; must respond to read (gets/eof?/each_char
    #               are used on the delegating paths).
    # options     - Hash kept by reference; options[:row_sep] is the default
    #               separator for gets/readline.
    # buffer_size - bytes fetched per peek / buffer extension.
    def initialize(io, options, buffer_size: DEFAULT_PEEK_SIZE)
      @io = io
      @buffer_size = buffer_size
      @options = options
      @peek_buf = nil        # nil = buffer not yet filled
      @peek_pos = 0          # byte offset of the next unread byte in @peek_buf
      @emit_encoding = nil   # external encoding recorded on first peek
      @buffer_frozen = false # true after freeze_buffer!: buffer stops growing
    end

    # Read up to n bytes into the buffer and return a copy of the buffered
    # content (BOM-stripped and extended to a codepoint boundary), tagged with
    # the external encoding (ASCII-8BIT when none is declared).
    #
    # Idempotent: once the buffer exists, further calls return the current
    # buffer without touching @io. When @io yields nothing (nil or empty) the
    # buffer stays unfilled and that value is returned as-is.
    def peek(n = @buffer_size)
      return buffer_snapshot if @peek_buf

      # read(n) returns raw bytes, so this works even when the file starts
      # with a BOM that is invalid in the declared encoding.
      chunk = @io.read(n)
      return chunk if chunk.nil? || chunk.empty?

      raw = strip_bom(chunk.b)
      @emit_encoding = external_encoding
      raw = align_to_char_boundary(raw) if @emit_encoding
      @peek_buf = raw
      @peek_pos = 0
      buffer_snapshot
    end

    # Returns the next line up to and including sep, or nil at end of input.
    #
    # sep must be a String: gets(nil) is not supported and raises
    # ArgumentError. The default is @options[:row_sep], never $/. The limit
    # argument and keyword arguments of IO#gets are intentionally not
    # supported because no SmarterCSV caller uses them.
    #
    # Paths:
    #   * never peeked              — delegate to @io.gets(sep) untouched.
    #   * frozen, buffer exhausted  — delegate to @io, then tag / transcode.
    #   * detection (not frozen)    — grow the buffer until sep or EOF.
    #   * frozen, bytes left        — serve from the buffer; if the line runs
    #                                 past the buffer, finish it from @io.
    def gets(sep = @options[:row_sep])
      raise ArgumentError, "PeekableIO#gets does not support gets(nil) — pass an explicit separator string" if sep.nil?
      return @io.gets(sep) if @peek_buf.nil?
      return gets_from_io(sep) if @buffer_frozen && buffer_exhausted?

      # Sources with no declared encoding get raw ASCII-8BIT strings.
      tag = @emit_encoding || external_encoding || Encoding::ASCII_8BIT
      @buffer_frozen ? gets_across_boundary(sep, tag) : gets_growing_buffer(sep, tag)
    end

    # Like gets, but raises EOFError at end of input instead of returning nil.
    # Defaults to @options[:row_sep], never $/.
    def readline(sep = @options[:row_sep])
      line = gets(sep)
      raise EOFError, "end of file reached" if line.nil?

      line
    end

    # Read n bytes (or everything when n is nil), buffered bytes first.
    #
    # With the buffer exhausted (or never filled) the result of @io.read(n) is
    # returned unchanged. Otherwise the result is tagged with the external
    # encoding and transcoded like gets. read(0) returns an empty string and
    # does not advance.
    #
    # During detection every byte taken from @io is recorded in the buffer and
    # the read position moves past it, so consecutive reads never return the
    # same bytes twice and rewind_buffer can replay all of them.
    def read(n = nil)
      if buffer_exhausted?
        fresh = @io.read(n)
        record_for_replay(fresh) if @peek_buf && !@buffer_frozen
        return fresh
      end

      out_enc = @emit_encoding || Encoding::ASCII_8BIT
      return String.new.force_encoding(out_enc) if n == 0

      available = @peek_buf.bytesize - @peek_pos
      if n && available >= n
        slice = @peek_buf.byteslice(@peek_pos, n)
        @peek_pos += n # advance exactly n, not the whole buffer
        return maybe_transcode(slice.force_encoding(out_enc))
      end

      # The buffer cannot satisfy the request alone: take what is left, then
      # top up from @io. Binary concatenation first, re-tag afterwards.
      buffered = @peek_buf.byteslice(@peek_pos..-1)
      @peek_pos = @peek_buf.bytesize
      from_io = n ? @io.read(n - available) : @io.read
      appended = from_io ? from_io.b : ''.b
      record_for_replay(appended) unless @buffer_frozen
      maybe_transcode((buffered + appended).force_encoding(out_enc))
    end

    # Yield every remaining character: unread buffered bytes first, then @io
    # in @buffer_size chunks. Returns an Enumerator when no block is given.
    #
    # Chunked reads avoid per-character appends to the buffer. A chunk edge
    # may split a multibyte character; callers that only look for ASCII
    # characters such as "\n" / "\r" are unaffected by that.
    def each_char(&block)
      return enum_for(:each_char) unless block_given?
      return @io.each_char(&block) if @peek_buf.nil? || (@buffer_frozen && buffer_exhausted?)

      tag = @emit_encoding || external_encoding || Encoding::ASCII_8BIT
      pending = @peek_buf.byteslice(@peek_pos..-1).force_encoding(tag)
      maybe_transcode(pending).each_char(&block)
      @peek_pos = @peek_buf.bytesize # mark exhausted, keep buffer alive for rewind_buffer

      # read(n) returns nil at EOF, so eof? is not required from the source.
      while (chunk = @io.read(@buffer_size))
        break if chunk.empty?

        record_for_replay(chunk) unless @buffer_frozen
        chunk.force_encoding(tag)
        maybe_transcode(chunk).each_char(&block)
      end
    end

    # False while unread bytes remain in the buffer; otherwise @io.eof?.
    def eof?
      buffer_exhausted? ? @io.eof? : false
    end

    # Reset the read position to the start of the buffer. Never touches the
    # underlying IO and does not freeze the buffer.
    def rewind_buffer
      @peek_pos = 0
    end

    # Always raises: PeekableIO cannot seek the underlying IO, and a silent
    # rewind would desync buffer and source. Use rewind_buffer.
    def rewind
      raise NoMethodError, "use rewind_buffer instead of rewind — PeekableIO does not seek the underlying IO"
    end

    # Signal that auto-detection is complete: from now on reads that go
    # beyond the buffered bytes delegate to @io without growing @peek_buf.
    def freeze_buffer!
      @buffer_frozen = true
    end

    # Close the underlying IO when it supports close.
    def close
      @io.close if @io.respond_to?(:close)
    end

    # External encoding declared by the underlying IO, or nil.
    def external_encoding
      @io.respond_to?(:external_encoding) ? @io.external_encoding : nil
    end

    # Internal (transcoding target) encoding declared by the underlying IO, or nil.
    def internal_encoding
      @io.respond_to?(:internal_encoding) ? @io.internal_encoding : nil
    end

    private

    # True when the buffer was never filled or every buffered byte was consumed.
    def buffer_exhausted?
      @peek_buf.nil? || @peek_pos >= @peek_buf.bytesize
    end

    # Copy of the whole buffer, tagged for the caller.
    def buffer_snapshot
      @peek_buf.dup.force_encoding(@emit_encoding || Encoding::ASCII_8BIT)
    end

    # Append bytes just consumed from @io to the replay buffer and move the
    # read position past them. No-op for nil / empty input.
    def record_for_replay(bytes)
      return if bytes.nil? || bytes.empty?

      @peek_buf << bytes.b
      @peek_pos = @peek_buf.bytesize
    end

    # Append one @buffer_size chunk from @io to @peek_buf without moving the
    # read position. Returns true if bytes were added, false at EOF.
    def extend_buffer!
      chunk = @io.read(@buffer_size)
      return false unless chunk && !chunk.empty?

      @peek_buf << chunk.b
      true
    end

    # Consume len bytes from the buffer and return them tagged / transcoded.
    def take_buffered(len, tag)
      line = @peek_buf.byteslice(@peek_pos, len).force_encoding(tag)
      @peek_pos += len
      maybe_transcode(line)
    end

    # gets for the frozen phase once the buffer is used up: plain delegation
    # plus encoding fix-up so callers see the same encodings as on the
    # buffered paths.
    def gets_from_io(sep)
      line = @io.gets(sep)
      return nil if line.nil?

      # A real IO opened with a transcoding pair already returns strings in
      # the internal encoding — hand those back untouched.
      int = internal_encoding
      return line if int && line.encoding == int

      # Wrapper sources may declare an encoding yet return ASCII-8BIT; tag
      # the line so it is not misread downstream.
      out_enc = @emit_encoding || external_encoding
      line = line.force_encoding(out_enc) if out_enc && line.encoding != out_enc
      maybe_transcode(line)
    end

    # gets for the detection phase: extend the buffer in @buffer_size chunks
    # until sep shows up or @io hits EOF. @peek_pos only moves once a complete
    # line is available, so the search always covers all unread bytes and a
    # separator split across two chunks is found after the extension.
    def gets_growing_buffer(sep, tag)
      sep_bytes = sep.b
      loop do
        # @peek_buf is binary, so the index is a byte offset.
        hit = @peek_buf.index(sep_bytes, @peek_pos)
        return take_buffered(hit + sep_bytes.bytesize - @peek_pos, tag) if hit

        break unless extend_buffer!
      end

      # EOF: the remaining bytes form the final line; nil if nothing is left.
      remaining = @peek_buf.bytesize - @peek_pos
      remaining.zero? ? nil : take_buffered(remaining, tag)
    end

    # gets for the frozen phase while buffered bytes remain. If sep is inside
    # the buffer the line comes from there; otherwise the line is completed
    # from @io, taking care of a multi-byte separator that straddles the
    # buffer/@io edge (e.g. "\r" is the last buffered byte, "\n" the first
    # byte of @io).
    def gets_across_boundary(sep, tag)
      sep_bytes = sep.b
      hit = @peek_buf.index(sep_bytes, @peek_pos)
      return take_buffered(hit + sep_bytes.bytesize - @peek_pos, tag) if hit

      raw = @peek_buf.byteslice(@peek_pos..-1)
      @peek_pos = @peek_buf.bytesize # mark exhausted, keep buffer alive for rewind_buffer
      io_tail = nil

      loop do
        overlap = separator_overlap(raw, sep_bytes)
        if overlap.zero?
          # No partial separator at the tail: @io can finish the line itself.
          io_tail = @io.gets(sep)
          break
        end

        # The tail could be the start of sep — read exactly the bytes that
        # would complete it. Calling @io.gets(sep) here instead would skip
        # past a separator whose first bytes were already consumed.
        more = @io.read(sep_bytes.bytesize - overlap)
        break if more.nil? || more.empty? # EOF: what we have is the last line

        raw << more.b
        break if raw.end_with?(sep_bytes) # separator confirmed
        # Otherwise the bytes were content; they may themselves end in a
        # partial separator, so check again.
      end

      join_boundary_line(raw, io_tail, tag)
    end

    # Length of the longest proper prefix of sep_bytes that bytes ends with
    # (0 when there is none, always 0 for single-byte separators).
    def separator_overlap(bytes, sep_bytes)
      (sep_bytes.bytesize - 1).downto(1) do |len|
        return len if bytes.end_with?(sep_bytes.byteslice(0, len))
      end
      0
    end

    # Combine the raw bytes collected at the buffer edge with the tail
    # returned by @io.gets (nil when none was read).
    def join_boundary_line(raw, io_tail, tag)
      int = internal_encoding
      if io_tail && int && @emit_encoding && int != @emit_encoding && io_tail.encoding == int
        # @io already transcoded the tail; only the raw head still needs it.
        return maybe_transcode(raw.force_encoding(@emit_encoding)) + io_tail
      end

      combined = io_tail ? raw << io_tail.b : raw
      maybe_transcode(combined.force_encoding(tag))
    end

    # Strip a leading BOM from the raw (ASCII-8BIT) bytes so that no code
    # reading through this wrapper ever sees BOM bytes.
    def strip_bom(raw)
      BOM_PATTERNS.each do |bom|
        return raw.byteslice(bom.bytesize..-1) if raw.start_with?(bom)
      end
      raw
    end

    # Read up to MAX_ALIGN_BYTES single bytes from @io until raw is valid in
    # @emit_encoding, i.e. ends on a complete codepoint. Returns raw
    # immediately when it is already valid; stops at EOF.
    def align_to_char_boundary(raw)
      MAX_ALIGN_BYTES.times do
        probe = raw.dup.force_encoding(@emit_encoding)
        return raw if probe.valid_encoding?

        extra = @io.read(1)
        break unless extra # EOF mid-codepoint — malformed input, stop here

        raw += extra.b
      end
      raw
    end

    # Apply external→internal transcoding to a string taken from the buffer.
    # No-op (returns str itself) when str is nil, when there is no internal
    # encoding, when no external encoding was recorded, or when both are the
    # same. Invalid and undefined sequences are replaced rather than raised.
    def maybe_transcode(str)
      return str unless str

      int = internal_encoding
      return str unless int && @emit_encoding && int != @emit_encoding

      str.force_encoding(@emit_encoding).encode(int, invalid: :replace, undef: :replace)
    end

    def respond_to_missing?(method, include_private = false)
      (ALLOWED_METHODS.include?(method) && @io.respond_to?(method, include_private)) || super
    end

    # Forward allow-listed methods to @io; everything else is a NoMethodError.
    def method_missing(method, *args, &block)
      return super unless ALLOWED_METHODS.include?(method) && @io.respond_to?(method)

      @io.send(method, *args, &block)
    end
  end
end