smarter_csv 1.16.4 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -1
- data/CHANGELOG.md +54 -0
- data/Gemfile +10 -5
- data/README.md +98 -14
- data/TO_DO.md +109 -0
- data/docs/_introduction.md +1 -0
- data/docs/bad_row_quarantine.md +2 -1
- data/docs/basic_read_api.md +6 -1
- data/docs/basic_write_api.md +30 -0
- data/docs/batch_processing.md +25 -0
- data/docs/column_selection.md +1 -0
- data/docs/data_transformations.md +1 -0
- data/docs/examples.md +126 -0
- data/docs/header_transformations.md +23 -0
- data/docs/header_validations.md +1 -0
- data/docs/history.md +1 -0
- data/docs/instrumentation.md +2 -1
- data/docs/migrating_from_csv.md +1 -0
- data/docs/options.md +20 -18
- data/docs/parsing_strategy.md +1 -0
- data/docs/real_world_csv.md +51 -1
- data/docs/releases/1.16.0/performance_notes.md +15 -15
- data/docs/releases/1.17.0/benchmarks.md +121 -0
- data/docs/releases/1.17.0/changes.md +161 -0
- data/docs/releases/1.17.0/performance_notes.md +126 -0
- data/docs/row_col_sep.md +21 -1
- data/docs/ruby_csv_pitfalls.md +1 -0
- data/docs/value_converters.md +24 -0
- data/docs/warnings.md +141 -0
- data/ext/smarter_csv/smarter_csv.c +98 -32
- data/images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg +106 -0
- data/images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg +181 -0
- data/images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg +179 -0
- data/lib/smarter_csv/auto_detection.rb +215 -30
- data/lib/smarter_csv/file_io.rb +2 -2
- data/lib/smarter_csv/hash_transformations.rb +29 -13
- data/lib/smarter_csv/parser.rb +42 -33
- data/lib/smarter_csv/peekable_io.rb +453 -0
- data/lib/smarter_csv/reader.rb +119 -23
- data/lib/smarter_csv/reader_options.rb +61 -1
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +40 -12
- metadata +12 -5
- data/TO_DO_v2.md +0 -14
- data/ext/smarter_csv/Makefile +0 -270
data/lib/smarter_csv/parser.rb
CHANGED
|
@@ -30,8 +30,7 @@ module SmarterCSV
|
|
|
30
30
|
|
|
31
31
|
if options[:acceleration] && has_acceleration
|
|
32
32
|
# :nocov:
|
|
33
|
-
|
|
34
|
-
[elements, elements.size]
|
|
33
|
+
parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], options[:quote_escaping] == :backslash, options[:quote_boundary] == :standard, options[:row_sep])
|
|
35
34
|
# :nocov:
|
|
36
35
|
else
|
|
37
36
|
# puts "WARNING: SmarterCSV is using un-accelerated parsing of lines. Check options[:acceleration]"
|
|
@@ -53,8 +52,7 @@ module SmarterCSV
|
|
|
53
52
|
unless line.include?('\\')
|
|
54
53
|
if options[:acceleration] && has_acceleration
|
|
55
54
|
# :nocov:
|
|
56
|
-
|
|
57
|
-
return [elements, elements.size]
|
|
55
|
+
return parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
58
56
|
# :nocov:
|
|
59
57
|
else
|
|
60
58
|
return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
@@ -67,8 +65,7 @@ module SmarterCSV
|
|
|
67
65
|
# Try backslash-escape interpretation first
|
|
68
66
|
if options[:acceleration] && has_acceleration
|
|
69
67
|
# :nocov:
|
|
70
|
-
|
|
71
|
-
[elements, elements.size]
|
|
68
|
+
parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], true, options[:quote_boundary] == :standard, options[:row_sep])
|
|
72
69
|
# :nocov:
|
|
73
70
|
else
|
|
74
71
|
parse_csv_line_ruby(line, @quote_escaping_backslash, header_size, has_quotes)
|
|
@@ -77,8 +74,7 @@ module SmarterCSV
|
|
|
77
74
|
# Backslash raised a hard error — fall back to RFC 4180 immediately
|
|
78
75
|
if options[:acceleration] && has_acceleration
|
|
79
76
|
# :nocov:
|
|
80
|
-
|
|
81
|
-
return [elements, elements.size]
|
|
77
|
+
return parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
82
78
|
# :nocov:
|
|
83
79
|
else
|
|
84
80
|
return parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
@@ -89,8 +85,7 @@ module SmarterCSV
|
|
|
89
85
|
if result[1] == -1
|
|
90
86
|
rfc_result = if options[:acceleration] && has_acceleration
|
|
91
87
|
# :nocov:
|
|
92
|
-
|
|
93
|
-
[elements, elements.size]
|
|
88
|
+
parse_csv_line_c(line, options[:col_sep], options[:quote_char], header_size, has_quotes, options[:strip_whitespace], false, options[:quote_boundary] == :standard, options[:row_sep])
|
|
94
89
|
# :nocov:
|
|
95
90
|
else
|
|
96
91
|
parse_csv_line_ruby(line, @quote_escaping_double, header_size, has_quotes)
|
|
@@ -205,8 +200,9 @@ module SmarterCSV
|
|
|
205
200
|
|
|
206
201
|
remove_empty = options[:remove_empty_values]
|
|
207
202
|
hash = {}
|
|
208
|
-
fields.each_with_index do |v, i|
|
|
203
|
+
fields.each_with_index do |v, i| # C-level iteration, faster than Ruby while counter loop
|
|
209
204
|
next if remove_empty && v.empty?
|
|
205
|
+
|
|
210
206
|
hash[i < headers.size ? headers[i] : :"#{prefix}#{i + 1}"] = v
|
|
211
207
|
end
|
|
212
208
|
|
|
@@ -280,11 +276,12 @@ module SmarterCSV
|
|
|
280
276
|
|
|
281
277
|
col_sep = options[:col_sep]
|
|
282
278
|
strip = options[:strip_whitespace]
|
|
279
|
+
quote = @quote_char
|
|
283
280
|
|
|
284
281
|
# Ensure has_quotes is set correctly (callers via parse/parse_line_to_hash
|
|
285
282
|
# always pass this, but direct callers may not)
|
|
286
283
|
# rubocop:disable Style/OrAssignment
|
|
287
|
-
has_quotes = line.include?(
|
|
284
|
+
has_quotes = line.include?(quote) unless has_quotes
|
|
288
285
|
# rubocop:enable Style/OrAssignment
|
|
289
286
|
|
|
290
287
|
# Optimization #7: when line has no quotes, use String#split (C-implemented)
|
|
@@ -299,14 +296,16 @@ module SmarterCSV
|
|
|
299
296
|
|
|
300
297
|
elements = line.split(col_sep, -1) # -1 preserves trailing empty fields
|
|
301
298
|
elements = elements[0, header_size] if header_size
|
|
302
|
-
|
|
299
|
+
# split returns fresh, mutable strings — strip them in place (strip! allocates
|
|
300
|
+
# nothing when there's no leading/trailing whitespace, which is the common case)
|
|
301
|
+
elements.each(&:strip!) if strip
|
|
303
302
|
return [elements, elements.size]
|
|
304
303
|
end
|
|
305
304
|
|
|
306
305
|
# Quoted-line path: character-by-character parsing required
|
|
307
306
|
line_size = line.size
|
|
308
307
|
col_sep_size = col_sep.size
|
|
309
|
-
|
|
308
|
+
doubled_quotes = @doubled_quote_chars
|
|
310
309
|
elements = []
|
|
311
310
|
start = 0
|
|
312
311
|
i = 0
|
|
@@ -384,14 +383,20 @@ module SmarterCSV
|
|
|
384
383
|
field_len = i - start
|
|
385
384
|
if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(i - 1) == quote_byte
|
|
386
385
|
field = line.byteslice(start + 1, field_len - 2)
|
|
387
|
-
field
|
|
388
|
-
|
|
389
|
-
|
|
386
|
+
# Tighter guard: only walk the field with gsub! when a doubled quote pair
|
|
387
|
+
# actually exists. include?(doubled_quotes) is a single memmem scan; cheaper
|
|
388
|
+
# than gsub!'s full walk when no doubled pair is present.
|
|
389
|
+
field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
|
|
390
|
+
field.strip! if strip # in-place: no extra allocation; safe on fresh byteslice
|
|
390
391
|
else
|
|
391
392
|
field = line.byteslice(start, field_len)
|
|
392
393
|
field = cleanup_quotes(field, quote)
|
|
393
|
-
|
|
394
|
+
# cleanup_quotes may return the frozen EMPTY_STRING; all non-empty returns are
|
|
395
|
+
# fresh mutable byteslices/substrings. So strip! in place (no allocation), guarded
|
|
396
|
+
# by !field.empty? to skip the frozen-empty case (which needs no stripping anyway).
|
|
397
|
+
field.strip! if strip && !field.empty?
|
|
394
398
|
end
|
|
399
|
+
elements << field
|
|
395
400
|
i += 1
|
|
396
401
|
start = i
|
|
397
402
|
backslash_count = 0
|
|
@@ -450,14 +455,16 @@ module SmarterCSV
|
|
|
450
455
|
field_len = bytesize - start
|
|
451
456
|
if field_len >= 2 && line.getbyte(start) == quote_byte && line.getbyte(bytesize - 1) == quote_byte
|
|
452
457
|
field = line.byteslice(start + 1, field_len - 2)
|
|
453
|
-
field.gsub!(
|
|
458
|
+
field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
|
|
454
459
|
field.strip! if strip
|
|
455
|
-
elements << field
|
|
456
460
|
else
|
|
457
461
|
field = line.byteslice(start, field_len)
|
|
458
462
|
field = cleanup_quotes(field, quote)
|
|
459
|
-
|
|
463
|
+
# cleanup_quotes may return the frozen EMPTY_STRING; non-empty returns are fresh
|
|
464
|
+
# mutable byteslices — strip! in place, guarded by !field.empty? for the frozen case.
|
|
465
|
+
field.strip! if strip && !field.empty?
|
|
460
466
|
end
|
|
467
|
+
elements << field
|
|
461
468
|
end
|
|
462
469
|
else
|
|
463
470
|
# Multi-char col_sep: use substring comparison (original path)
|
|
@@ -489,14 +496,16 @@ module SmarterCSV
|
|
|
489
496
|
field_len = i - start
|
|
490
497
|
if field_len >= 2 && line[start] == quote && line[i - 1] == quote
|
|
491
498
|
field = line[start + 1...i - 1]
|
|
492
|
-
field.gsub!(
|
|
499
|
+
field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
|
|
493
500
|
field.strip! if strip
|
|
494
|
-
elements << field
|
|
495
501
|
else
|
|
496
502
|
field = line[start...i]
|
|
497
503
|
field = cleanup_quotes(field, quote)
|
|
498
|
-
|
|
504
|
+
# cleanup_quotes may return the frozen EMPTY_STRING; non-empty returns are fresh
|
|
505
|
+
# mutable substrings — strip! in place, guarded by !field.empty? for the frozen case.
|
|
506
|
+
field.strip! if strip && !field.empty?
|
|
499
507
|
end
|
|
508
|
+
elements << field
|
|
500
509
|
i += col_sep_size
|
|
501
510
|
start = i
|
|
502
511
|
backslash_count = 0
|
|
@@ -553,14 +562,16 @@ module SmarterCSV
|
|
|
553
562
|
field_len = line_size - start
|
|
554
563
|
if field_len >= 2 && line[start] == quote && line[line_size - 1] == quote
|
|
555
564
|
field = line[start + 1..line_size - 2]
|
|
556
|
-
field.gsub!(
|
|
565
|
+
field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
|
|
557
566
|
field.strip! if strip
|
|
558
|
-
elements << field
|
|
559
567
|
else
|
|
560
568
|
field = line[start..-1]
|
|
561
569
|
field = cleanup_quotes(field, quote)
|
|
562
|
-
|
|
570
|
+
# cleanup_quotes may return the frozen EMPTY_STRING; non-empty returns are fresh
|
|
571
|
+
# mutable substrings — strip! in place, guarded by !field.empty? for the frozen case.
|
|
572
|
+
field.strip! if strip && !field.empty?
|
|
563
573
|
end
|
|
574
|
+
elements << field
|
|
564
575
|
end
|
|
565
576
|
end
|
|
566
577
|
|
|
@@ -571,19 +582,17 @@ module SmarterCSV
|
|
|
571
582
|
return nil if field.nil?
|
|
572
583
|
return EMPTY_STRING if field.empty?
|
|
573
584
|
|
|
585
|
+
doubled_quotes = @doubled_quote_chars
|
|
586
|
+
|
|
574
587
|
# Remove surrounding quotes if present
|
|
575
588
|
if field.start_with?(quote) && field.end_with?(quote)
|
|
576
589
|
field = field[1..-2]
|
|
577
590
|
end
|
|
578
591
|
|
|
579
|
-
# Replace double quotes with a single quote
|
|
580
|
-
field.gsub!(
|
|
592
|
+
# Replace double quotes with a single quote (skip the gsub walk when no doubled pair exists)
|
|
593
|
+
field.gsub!(doubled_quotes, quote) if field.include?(doubled_quotes)
|
|
581
594
|
|
|
582
595
|
field
|
|
583
596
|
end
|
|
584
|
-
|
|
585
|
-
def doubled_quote(quote)
|
|
586
|
-
@doubled_quote ||= (quote * 2).to_s.freeze
|
|
587
|
-
end
|
|
588
597
|
end
|
|
589
598
|
end
|
|
@@ -0,0 +1,453 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SmarterCSV
|
|
4
|
+
# PeekableIO wraps any IO-like object and buffers the first chunk of bytes
|
|
5
|
+
# so that auto-detection (row_sep, col_sep) can call rewind_buffer without requiring
|
|
6
|
+
# the underlying source to be seekable.
|
|
7
|
+
#
|
|
8
|
+
# Works transparently with files, StringIO, pipes, STDIN, Zlib streams, and
|
|
9
|
+
# any other IO-like object that responds to read.
|
|
10
|
+
#
|
|
11
|
+
# Lifecycle:
|
|
12
|
+
# 1. peek(n) — reads up to n bytes from the underlying IO into the buffer
|
|
13
|
+
# 2. rewind_buffer — resets @peek_pos to 0 (replays buffer, never seeks underlying IO)
|
|
14
|
+
# 3. gets/read/each_char — drain the buffer first, then read from @io in
|
|
15
|
+
# @buffer_size chunks, appending each to @peek_buf so that a subsequent
|
|
16
|
+
# rewind_buffer can replay the full stream from position 0.
|
|
17
|
+
# 4. rewind_buffer — resets @peek_pos to 0; does NOT freeze. Detection may rewind
|
|
18
|
+
# multiple times (once per pass) and must keep accumulating between passes.
|
|
19
|
+
# 5. freeze_buffer! — called once after all detection passes are done. After
|
|
20
|
+
# this point reads beyond the buffer delegate directly to @io without growing
|
|
21
|
+
# @peek_buf. @peek_buf is kept alive (never nilled) so rewind_buffer can replay.
|
|
22
|
+
#
|
|
23
|
+
class PeekableIO
|
|
24
|
+
# 16KB is enough for separator detection on any real-world CSV header.
|
|
25
|
+
# Matches one EBS gp3 I/O block and one Apple Silicon VM page exactly.
|
|
26
|
+
DEFAULT_PEEK_SIZE = 16_384
|
|
27
|
+
|
|
28
|
+
# Lower bound for a sane peek buffer chunk size. Below this the buffer is
|
|
29
|
+
# too small to be useful even on local SSD (one VM page on x86 is 4 KB).
|
|
30
|
+
MIN_BUFFER_SIZE = 4_096
|
|
31
|
+
|
|
32
|
+
# Upper bound for the peek buffer chunk size. Equal to
|
|
33
|
+
# AutoDetection::MAX_AUTO_ROW_SEP_CHARS — beyond this, bytes are unused by
|
|
34
|
+
# auto-detection and only delay parse start by pre-loading bytes that
|
|
35
|
+
# would have been read during parsing anyway.
|
|
36
|
+
MAX_BUFFER_SIZE = SmarterCSV::AutoDetection::MAX_AUTO_ROW_SEP_CHARS
|
|
37
|
+
|
|
38
|
+
def initialize(io, options, buffer_size: DEFAULT_PEEK_SIZE)
|
|
39
|
+
@io = io
|
|
40
|
+
@buffer_size = buffer_size
|
|
41
|
+
@options = options # live reference — options[:row_sep] is the default sep for gets/readline
|
|
42
|
+
@peek_buf = nil # nil = buffer not yet filled
|
|
43
|
+
@peek_pos = 0
|
|
44
|
+
@emit_encoding = nil # encoding of strings returned by @io.read — set on first peek
|
|
45
|
+
@buffer_frozen = false # true after freeze_buffer!: buffer stops growing, detection phase is over
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
# Read up to n bytes into the buffer and return them.
|
|
49
|
+
# Called once before auto-detection begins.
|
|
50
|
+
#
|
|
51
|
+
# Works for any IO source — files, StringIO, pipes, Zlib streams, etc.
|
|
52
|
+
# The BOM (if any) is stripped immediately so all downstream code is clean.
|
|
53
|
+
# For transcoded streams (e.g. r:iso-8859-1:utf-8), the buffer keeps the raw
|
|
54
|
+
# bytes in the external encoding; @emit_encoding records that encoding, and
|
|
55
|
+
# gets/read/each_char transcode to the internal encoding when returning data.
|
|
56
|
+
def peek(n = @buffer_size)
|
|
57
|
+
# Idempotent: a second peek call returns the existing buffer without reading
|
|
58
|
+
# more from @io. Calling peek twice would otherwise overwrite the buffer and
|
|
59
|
+
# silently drop any unconsumed bytes from the first peek.
|
|
60
|
+
return @peek_buf.dup.force_encoding(@emit_encoding || Encoding::ASCII_8BIT) if @peek_buf
|
|
61
|
+
|
|
62
|
+
# read(n) fetches raw bytes as ASCII-8BIT regardless of the file's declared
|
|
63
|
+
# encoding — this is what we want because it works even for files that begin
|
|
64
|
+
# with non-UTF-8 BOMs (\xFF\xFE etc.) that would cause gets(nil,n) on a
|
|
65
|
+
# r:utf-8 handle to stop after the first invalid byte.
|
|
66
|
+
chunk = @io.read(n)
|
|
67
|
+
if chunk && !chunk.empty?
|
|
68
|
+
raw = strip_bom(chunk.b)
|
|
69
|
+
# The buffer always holds raw bytes in the external encoding (ASCII-8BIT tagged).
|
|
70
|
+
# Transcoding (ext → int) is the caller's responsibility — it happens externally
|
|
71
|
+
# when consuming data, not here during storage.
|
|
72
|
+
@emit_encoding = external_encoding
|
|
73
|
+
# Ensure the buffer ends on a complete codepoint boundary.
|
|
74
|
+
# align_to_char_boundary reads single bytes from @io until the buffer is valid
|
|
75
|
+
# in @emit_encoding, guarded by MAX_ALIGN_BYTES to avoid infinite loops on
|
|
76
|
+
# malformed input. Skipped when encoding is unknown (nil) or single-byte.
|
|
77
|
+
raw = align_to_char_boundary(raw) if @emit_encoding
|
|
78
|
+
@peek_buf = raw
|
|
79
|
+
@peek_pos = 0
|
|
80
|
+
end
|
|
81
|
+
# Return the full buffered content (BOM-stripped + char-aligned) rather than
|
|
82
|
+
# the original chunk so callers see what was actually consumed.
|
|
83
|
+
@peek_buf ? @peek_buf.dup.force_encoding(@emit_encoding || Encoding::ASCII_8BIT) : chunk
|
|
84
|
+
end
|
|
85
|
+
|
|
86
|
+
# Returns the next line up to and including sep.
|
|
87
|
+
# Hot path: @peek_buf is nil (never peeked), or the buffer is frozen and exhausted — delegate directly to @io.
|
|
88
|
+
# The buffer is never nilled out by read methods so that rewind always works during
|
|
89
|
+
# the auto-detection phase. @peek_pos advancing past bytesize is the exhaustion signal.
|
|
90
|
+
#
|
|
91
|
+
# NOTE: sep must be a String. gets(nil) — which reads until EOF in Ruby IO — is not
|
|
92
|
+
# supported; smarter_csv always passes an explicit row separator string.
|
|
93
|
+
# The default is @options[:row_sep] (resolved after auto-detection), never $/.
|
|
94
|
+
#
|
|
95
|
+
# NOTE: we don't support **kwargs because smarter_csv does not use them.
|
|
96
|
+
#
|
|
97
|
+
# NOTE: the limit parameter (Ruby IO#gets(sep, limit)) is intentionally omitted.
|
|
98
|
+
# PeekableIO is internal to SmarterCSV and no caller passes a limit. If this class
|
|
99
|
+
# were ever extracted into a stand-alone library, limit support would be required
|
|
100
|
+
# to fully comply with the IO#gets contract.
|
|
101
|
+
def gets(sep = @options[:row_sep])
|
|
102
|
+
raise ArgumentError, "PeekableIO#gets does not support gets(nil) — pass an explicit separator string" if sep.nil?
|
|
103
|
+
return @io.gets(sep) if @peek_buf.nil?
|
|
104
|
+
|
|
105
|
+
# Buffer frozen (post auto-detection): delegate once buffer is exhausted — no more accumulation.
|
|
106
|
+
# Must still apply encoding tagging and maybe_transcode so callers see consistent encodings.
|
|
107
|
+
if @buffer_frozen && buffer_exhausted?
|
|
108
|
+
line = @io.gets(sep)
|
|
109
|
+
return nil if line.nil?
|
|
110
|
+
|
|
111
|
+
int = internal_encoding
|
|
112
|
+
# Real IO objects opened with a transcoding pair (e.g. r:iso-8859-1:utf-8) already transcode
|
|
113
|
+
# on read — the returned string is already in the internal encoding. Return it as-is.
|
|
114
|
+
# For wrapper objects (e.g. EncodedBytesIO) that declare encodings but don't transcode on
|
|
115
|
+
# read, the returned string will still be in ASCII-8BIT — fall through to tag + transcode.
|
|
116
|
+
return line if int && line.encoding == int
|
|
117
|
+
|
|
118
|
+
out_enc = @emit_encoding || external_encoding
|
|
119
|
+
# Needed for the single-encoding case (int == nil) when the source declares an
|
|
120
|
+
# external_encoding but returns ASCII-8BIT from #gets (wrapper IOs: EncodedBytesIO,
|
|
121
|
+
# pipes, STDIN, decompression streams). maybe_transcode is a no-op when int is nil,
|
|
122
|
+
# so this is the only step that tags the line in the correct external encoding —
|
|
123
|
+
# otherwise reader.rb#enforce_utf8_encoding would misread the bytes as UTF-8.
|
|
124
|
+
# Redundant on the transcoding-pair path (maybe_transcode force_encodes there too),
|
|
125
|
+
# but the guard keeps it cheap. Covered by peekable_io_spec.rb frozen-exhausted
|
|
126
|
+
# single-encoding test.
|
|
127
|
+
line = line.force_encoding(out_enc) if out_enc && line.encoding != out_enc
|
|
128
|
+
return maybe_transcode(line)
|
|
129
|
+
end
|
|
130
|
+
|
|
131
|
+
# Compute the output encoding once — used by both the detection and frozen paths.
|
|
132
|
+
# For sources with no declared encoding (nil) we fall back to ASCII_8BIT rather
|
|
133
|
+
# than assuming UTF-8 — the caller gets the raw bytes and can re-tag as needed.
|
|
134
|
+
out_enc = @emit_encoding || external_encoding
|
|
135
|
+
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
# Auto-Detection phase (buffer not yet frozen):
|
|
138
|
+
# Extend the buffer in @buffer_size chunks until the separator is found
|
|
139
|
+
# or EOF. No straddle detection needed — the extension absorbs any boundary.
|
|
140
|
+
# @peek_pos never advances until we have a complete line, so the search always
|
|
141
|
+
# covers the full unread portion of the ever-growing buffer.
|
|
142
|
+
# ---------------------------------------------------------------------------
|
|
143
|
+
unless @buffer_frozen
|
|
144
|
+
loop do
|
|
145
|
+
rest = @peek_buf.byteslice(@peek_pos..-1)
|
|
146
|
+
rest.force_encoding(out_enc || Encoding::ASCII_8BIT)
|
|
147
|
+
# NOTE: rest.b.index(sep.b) is the Ruby 2.6 compatible equivalent of rest.byteindex(sep)
|
|
148
|
+
idx = rest.b.index(sep.b)
|
|
149
|
+
if idx
|
|
150
|
+
line = rest.byteslice(0, idx + sep.bytesize)
|
|
151
|
+
@peek_pos += line.bytesize
|
|
152
|
+
return maybe_transcode(line)
|
|
153
|
+
end
|
|
154
|
+
# Separator not found — fetch another chunk and search again.
|
|
155
|
+
break unless extend_buffer!
|
|
156
|
+
end
|
|
157
|
+
# EOF: return remaining bytes as final line, or nil if nothing left.
|
|
158
|
+
rest = @peek_buf.byteslice(@peek_pos..-1)
|
|
159
|
+
return nil if rest.empty?
|
|
160
|
+
|
|
161
|
+
@peek_pos = @peek_buf.bytesize
|
|
162
|
+
return maybe_transcode(rest.force_encoding(out_enc || Encoding::ASCII_8BIT))
|
|
163
|
+
end
|
|
164
|
+
|
|
165
|
+
# ---------------------------------------------------------------------------
|
|
166
|
+
# Frozen phase (processing): buffer has fixed content.
|
|
167
|
+
# Search within the buffer; handle the separator straddling the buffer/@io
|
|
168
|
+
# boundary for multi-byte separators (e.g. \r\n split across the edge).
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
rest = @peek_buf.byteslice(@peek_pos..-1)
|
|
171
|
+
rest.force_encoding(out_enc || Encoding::ASCII_8BIT)
|
|
172
|
+
# Use byteindex + byteslice — the buffer stores raw bytes and @peek_pos is a
|
|
173
|
+
# byte offset. Separators are always ASCII, so byteindex is correct regardless
|
|
174
|
+
# of the encoding tag.
|
|
175
|
+
# NOTE: rest.b.index(sep.b) is the Ruby 2.6 compatible equivalent of rest.byteindex(sep)
|
|
176
|
+
idx = rest.b.index(sep.b)
|
|
177
|
+
if idx
|
|
178
|
+
line = rest.byteslice(0, idx + sep.bytesize)
|
|
179
|
+
@peek_pos += line.bytesize
|
|
180
|
+
maybe_transcode(line)
|
|
181
|
+
else
|
|
182
|
+
@peek_pos = @peek_buf.bytesize # mark exhausted, keep buffer alive for rewind
|
|
183
|
+
|
|
184
|
+
# Detect multi-byte separator (e.g. \r\n) split at the buffer boundary —
|
|
185
|
+
# \r is the last byte of @peek_buf, \n is the first byte of @io.
|
|
186
|
+
# byteindex found nothing because the separator straddles the boundary.
|
|
187
|
+
# Check if the buffer tail matches any prefix of sep and read ahead to confirm.
|
|
188
|
+
# For non-seekable IO: on a non-match the already-read bytes are prepended
|
|
189
|
+
# to the remainder so no data is lost.
|
|
190
|
+
if sep.bytesize > 1
|
|
191
|
+
(sep.bytesize - 1).downto(1) do |prefix_len|
|
|
192
|
+
next unless rest.b.end_with?(sep.b.byteslice(0, prefix_len))
|
|
193
|
+
|
|
194
|
+
tail_needed = sep.b.byteslice(prefix_len..-1)
|
|
195
|
+
peeked = @io.read(tail_needed.bytesize)
|
|
196
|
+
|
|
197
|
+
if peeked.nil?
|
|
198
|
+
combined = rest.b # EOF — nothing new to read
|
|
199
|
+
elsif peeked.b == tail_needed
|
|
200
|
+
combined = rest.b + tail_needed # separator confirmed
|
|
201
|
+
else
|
|
202
|
+
# peeked bytes are content, not separator completion.
|
|
203
|
+
# But peeked itself may end with a prefix of sep (e.g. peeked="\r"
|
|
204
|
+
# when sep="\r\n"), meaning @io could begin with sep's tail ("\n").
|
|
205
|
+
# Calling @io.gets(sep) from here would over-read past that boundary.
|
|
206
|
+
# Instead, recursively check for a nested straddle in peeked.
|
|
207
|
+
content = peeked.b
|
|
208
|
+
nested_handled = false
|
|
209
|
+
(sep.bytesize - 1).downto(1) do |n|
|
|
210
|
+
next unless content.end_with?(sep.b.byteslice(0, n))
|
|
211
|
+
|
|
212
|
+
confirmed_tail = @io.read(sep.bytesize - n)
|
|
213
|
+
if confirmed_tail.nil?
|
|
214
|
+
# EOF — nothing more to read; content stays as-is
|
|
215
|
+
elsif confirmed_tail.b == sep.b.byteslice(n..-1)
|
|
216
|
+
content += confirmed_tail.b # separator confirmed
|
|
217
|
+
else
|
|
218
|
+
remainder = @io.gets(sep)
|
|
219
|
+
content = content + confirmed_tail.b + (remainder ? remainder.b : ''.b)
|
|
220
|
+
end
|
|
221
|
+
nested_handled = true
|
|
222
|
+
break
|
|
223
|
+
end
|
|
224
|
+
unless nested_handled
|
|
225
|
+
remainder = @io.gets(sep)
|
|
226
|
+
content += (remainder ? remainder.b : ''.b)
|
|
227
|
+
end
|
|
228
|
+
combined = rest.b + content
|
|
229
|
+
end
|
|
230
|
+
return maybe_transcode(out_enc ? combined.force_encoding(out_enc) : combined)
|
|
231
|
+
end
|
|
232
|
+
end
|
|
233
|
+
|
|
234
|
+
remainder = @io.gets(sep)
|
|
235
|
+
combined = rest.b + (remainder ? remainder.b : ''.b)
|
|
236
|
+
maybe_transcode(out_enc ? combined.force_encoding(out_enc) : combined)
|
|
237
|
+
end
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
# Unlike gets, readline raises EOFError at end of file rather than returning nil.
|
|
241
|
+
# Defaults to @options[:row_sep], never $/.
|
|
242
|
+
def readline(sep = @options[:row_sep])
|
|
243
|
+
line = gets(sep)
|
|
244
|
+
raise EOFError, "end of file reached" if line.nil?
|
|
245
|
+
|
|
246
|
+
line
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
def read(n = nil)
|
|
250
|
+
# Delegate to @io only when (a) we never peeked, or (b) the buffer is
|
|
251
|
+
# frozen and fully replayed. During auto-detection (not frozen) the
|
|
252
|
+
# buffer must be extended even when @peek_pos has caught up to its end,
|
|
253
|
+
# otherwise bytes read from @io are not appended to @peek_buf and a
|
|
254
|
+
# subsequent rewind_buffer would lose them.
|
|
255
|
+
return @io.read(n) if @peek_buf.nil?
|
|
256
|
+
return @io.read(n) if @buffer_frozen && buffer_exhausted?
|
|
257
|
+
|
|
258
|
+
buffered = @peek_buf.byteslice(@peek_pos..-1)
|
|
259
|
+
out_enc = @emit_encoding || Encoding::ASCII_8BIT
|
|
260
|
+
|
|
261
|
+
# All paths use binary concatenation then re-tag to avoid encoding mismatches.
|
|
262
|
+
if n.nil?
|
|
263
|
+
@peek_pos = @peek_buf.bytesize # consume all buffered bytes
|
|
264
|
+
rest_from_io = @io.read
|
|
265
|
+
appended = rest_from_io ? rest_from_io.b : ''.b
|
|
266
|
+
@peek_buf << appended unless @buffer_frozen
|
|
267
|
+
combined = buffered + appended
|
|
268
|
+
maybe_transcode(combined.force_encoding(out_enc))
|
|
269
|
+
elsif n == 0
|
|
270
|
+
String.new.force_encoding(out_enc) # read(0) must not advance @peek_pos
|
|
271
|
+
elsif buffered.bytesize >= n
|
|
272
|
+
@peek_pos += n # advance exactly n, not the whole buffer
|
|
273
|
+
maybe_transcode(buffered.byteslice(0, n).force_encoding(out_enc))
|
|
274
|
+
else
|
|
275
|
+
@peek_pos = @peek_buf.bytesize # consume all buffered bytes
|
|
276
|
+
rest_from_io = @io.read(n - buffered.bytesize)
|
|
277
|
+
appended = rest_from_io ? rest_from_io.b : ''.b
|
|
278
|
+
@peek_buf << appended unless @buffer_frozen
|
|
279
|
+
combined = buffered + appended
|
|
280
|
+
maybe_transcode(combined.force_encoding(out_enc))
|
|
281
|
+
end
|
|
282
|
+
end
|
|
283
|
+
|
|
284
|
+
def each_char(&block)
|
|
285
|
+
return enum_for(:each_char) unless block_given?
|
|
286
|
+
# Same guard as read(): only delegate when never peeked, or when frozen
|
|
287
|
+
# and fully replayed. Otherwise we must extend the buffer so rewind_buffer
|
|
288
|
+
# can replay the bytes during the parsing phase.
|
|
289
|
+
return @io.each_char(&block) if @peek_buf.nil?
|
|
290
|
+
return @io.each_char(&block) if @buffer_frozen && buffer_exhausted?
|
|
291
|
+
|
|
292
|
+
rest = @peek_buf.byteslice(@peek_pos..-1)
|
|
293
|
+
rest.force_encoding(@emit_encoding || external_encoding || Encoding::ASCII_8BIT)
|
|
294
|
+
rest = maybe_transcode(rest) || rest
|
|
295
|
+
rest.each_char(&block)
|
|
296
|
+
@peek_pos = @peek_buf.bytesize # mark exhausted, keep buffer alive for rewind
|
|
297
|
+
|
|
298
|
+
# Read remaining @io in chunks — avoids O(n²) string concatenation from
|
|
299
|
+
# appending one byte at a time. Row-sep detection only needs ASCII chars
|
|
300
|
+
# (\n, \r) so codepoint boundaries at chunk edges are inconsequential.
|
|
301
|
+
until @io.eof?
|
|
302
|
+
chunk = @io.read(@buffer_size)
|
|
303
|
+
break unless chunk
|
|
304
|
+
|
|
305
|
+
@peek_buf << chunk.b unless @buffer_frozen
|
|
306
|
+
chunk.force_encoding(@emit_encoding || external_encoding || Encoding::ASCII_8BIT)
|
|
307
|
+
(maybe_transcode(chunk) || chunk).each_char(&block)
|
|
308
|
+
end
|
|
309
|
+
end
|
|
310
|
+
|
|
311
|
+
def eof?
|
|
312
|
+
return @io.eof? if buffer_exhausted?
|
|
313
|
+
|
|
314
|
+
false # still have unread bytes in peek buffer
|
|
315
|
+
end
|
|
316
|
+
|
|
317
|
+
# Resets to the start of the peek buffer — never touches the underlying IO.
|
|
318
|
+
# Since auto-detection happens at the very beginning, the buffer IS byte 0.
|
|
319
|
+
# Works identically for files, StringIO, pipes, and any other source.
|
|
320
|
+
#
|
|
321
|
+
# Does NOT freeze the buffer — detection may call rewind_buffer multiple times
|
|
322
|
+
# (once per pass) and must continue accumulating bytes beyond the initial
|
|
323
|
+
# peek chunk. Call freeze_buffer! explicitly when detection is complete.
|
|
324
|
+
def rewind_buffer
|
|
325
|
+
@peek_pos = 0
|
|
326
|
+
end
|
|
327
|
+
|
|
328
|
+
def rewind
|
|
329
|
+
raise NoMethodError, "use rewind_buffer instead of rewind — PeekableIO does not seek the underlying IO"
|
|
330
|
+
end
|
|
331
|
+
|
|
332
|
+
# Freeze the buffer: signals that auto-detection is complete and normal
|
|
333
|
+
# processing is beginning. After this point, reads that go beyond the
|
|
334
|
+
# buffered bytes delegate directly to @io without growing @peek_buf further.
|
|
335
|
+
def freeze_buffer!
|
|
336
|
+
@buffer_frozen = true
|
|
337
|
+
end
|
|
338
|
+
|
|
339
|
+
def close
|
|
340
|
+
@io.close if @io.respond_to?(:close)
|
|
341
|
+
end
|
|
342
|
+
|
|
343
|
+
def external_encoding
|
|
344
|
+
@io.respond_to?(:external_encoding) ? @io.external_encoding : nil
|
|
345
|
+
end
|
|
346
|
+
|
|
347
|
+
def internal_encoding
|
|
348
|
+
@io.respond_to?(:internal_encoding) ? @io.internal_encoding : nil
|
|
349
|
+
end
|
|
350
|
+
|
|
351
|
+
private
|
|
352
|
+
|
|
353
|
+
def buffer_exhausted?
|
|
354
|
+
@peek_buf.nil? || @peek_pos >= @peek_buf.bytesize
|
|
355
|
+
end
|
|
356
|
+
|
|
357
|
+
# Append one @buffer_size chunk from @io to @peek_buf.
|
|
358
|
+
# Returns true if bytes were added, false if @io was already at EOF.
|
|
359
|
+
def extend_buffer!
|
|
360
|
+
chunk = @io.read(@buffer_size)
|
|
361
|
+
return false unless chunk && !chunk.empty?
|
|
362
|
+
|
|
363
|
+
@peek_buf << chunk.b
|
|
364
|
+
true
|
|
365
|
+
end
|
|
366
|
+
|
|
367
|
+
# Strip any BOM from the start of the raw (ASCII_8BIT-tagged) buffer bytes.
|
|
368
|
+
# Doing this once here means all downstream code — auto-detection, the C
|
|
369
|
+
# extension parser, remove_bom in file_io.rb — never sees BOM bytes.
|
|
370
|
+
# Patterns ordered longest-first so UTF-32 is matched before UTF-16.
|
|
371
|
+
BOM_PATTERNS = [
|
|
372
|
+
"\x00\x00\xFE\xFF".b, # UTF-32 BE
|
|
373
|
+
"\xFF\xFE\x00\x00".b, # UTF-32 LE
|
|
374
|
+
"\xEF\xBB\xBF".b, # UTF-8
|
|
375
|
+
"\xFE\xFF".b, # UTF-16 BE
|
|
376
|
+
"\xFF\xFE".b, # UTF-16 LE
|
|
377
|
+
].freeze
|
|
378
|
+
|
|
379
|
+
def strip_bom(raw)
|
|
380
|
+
BOM_PATTERNS.each do |bom|
|
|
381
|
+
return raw.byteslice(bom.bytesize..-1) if raw.start_with?(bom)
|
|
382
|
+
end
|
|
383
|
+
raw
|
|
384
|
+
end
|
|
385
|
+
|
|
386
|
+
# Read up to MAX_ALIGN_BYTES extra bytes from @io until the buffer ends on a
|
|
387
|
+
# complete codepoint boundary in @emit_encoding.
|
|
388
|
+
#
|
|
389
|
+
# For single-byte encodings (ISO-8859-1, ASCII) valid_encoding? is true
|
|
390
|
+
# immediately, so no extra reads occur.
|
|
391
|
+
#
|
|
392
|
+
# Bounded to MAX_ALIGN_BYTES (4) to guard against malformed files: a corrupt
|
|
393
|
+
# byte anywhere in the first peek chunk makes valid_encoding? permanently false.
|
|
394
|
+
# Without the cap the loop would read the entire remaining file one byte at a
|
|
395
|
+
# time before giving up. 4 bytes covers the largest codepoint in any Ruby-supported
|
|
396
|
+
# variable-width encoding (UTF-8 max 4, UTF-32 4, UTF-16 surrogate pairs 4,
|
|
397
|
+
# EUC-JP 3, Shift-JIS 2, GB18030 4).
|
|
398
|
+
MAX_ALIGN_BYTES = 4
|
|
399
|
+
|
|
400
|
+
def align_to_char_boundary(raw)
|
|
401
|
+
MAX_ALIGN_BYTES.times do
|
|
402
|
+
probe = raw.dup.force_encoding(@emit_encoding)
|
|
403
|
+
return raw if probe.valid_encoding?
|
|
404
|
+
|
|
405
|
+
extra = @io.read(1)
|
|
406
|
+
break unless extra # EOF mid-codepoint — malformed input, stop here
|
|
407
|
+
|
|
408
|
+
raw += extra.b
|
|
409
|
+
end
|
|
410
|
+
raw
|
|
411
|
+
end
|
|
412
|
+
|
|
413
|
+
# Apply external→internal transcoding to a string returned from the buffer.
|
|
414
|
+
# The buffer stores raw bytes in the external encoding (@emit_encoding).
|
|
415
|
+
# When the underlying IO was opened with a transcoding pair (e.g. r:iso-8859-1:utf-8),
|
|
416
|
+
# callers expect strings in the internal encoding — the same as IO#gets returns.
|
|
417
|
+
# No-op when there is no transcoding pair or no declared encoding.
|
|
418
|
+
def maybe_transcode(str)
|
|
419
|
+
return str unless str
|
|
420
|
+
|
|
421
|
+
int = internal_encoding
|
|
422
|
+
return str unless int && @emit_encoding && int != @emit_encoding
|
|
423
|
+
|
|
424
|
+
str.force_encoding(@emit_encoding).encode(int, invalid: :replace, undef: :replace)
|
|
425
|
+
end
|
|
426
|
+
|
|
427
|
+
# Allow-list of @io methods safe to expose via method_missing.
|
|
428
|
+
#
|
|
429
|
+
# PeekableIO is an internal SmarterCSV utility; reader.rb is its only caller.
|
|
430
|
+
# Every method SmarterCSV uses on a PeekableIO is either defined explicitly on
|
|
431
|
+
# this class (peek, gets, read, each_char, readline, eof?, close, rewind_buffer,
|
|
432
|
+
# freeze_buffer!, external_encoding, internal_encoding) or is on this list.
|
|
433
|
+
#
|
|
434
|
+
# Any other call — seek, pos=, lineno=, ungetc, ungetbyte, readpartial, sysread,
|
|
435
|
+
# readlines, each_line, etc. — raises NoMethodError. That surfaces a future
|
|
436
|
+
# maintainer's mistake loudly rather than silently desyncing @peek_pos from @io
|
|
437
|
+
# and breaking replay-after-rewind_buffer.
|
|
438
|
+
#
|
|
439
|
+
# Extending this list is a deliberate contract change: add a method only when a
|
|
440
|
+
# real caller inside SmarterCSV needs it.
|
|
441
|
+
ALLOWED_METHODS = %i[encoding].freeze
|
|
442
|
+
|
|
443
|
+
def respond_to_missing?(method, include_private = false)
|
|
444
|
+
(ALLOWED_METHODS.include?(method) && @io.respond_to?(method, include_private)) || super
|
|
445
|
+
end
|
|
446
|
+
|
|
447
|
+
def method_missing(method, *args, &block)
|
|
448
|
+
return super unless ALLOWED_METHODS.include?(method) && @io.respond_to?(method)
|
|
449
|
+
|
|
450
|
+
@io.send(method, *args, &block)
|
|
451
|
+
end
|
|
452
|
+
end
|
|
453
|
+
end
|