smarter_csv 1.16.4 → 1.17.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +10 -1
- data/CHANGELOG.md +54 -0
- data/Gemfile +10 -5
- data/README.md +98 -14
- data/TO_DO.md +109 -0
- data/docs/_introduction.md +1 -0
- data/docs/bad_row_quarantine.md +2 -1
- data/docs/basic_read_api.md +6 -1
- data/docs/basic_write_api.md +30 -0
- data/docs/batch_processing.md +25 -0
- data/docs/column_selection.md +1 -0
- data/docs/data_transformations.md +1 -0
- data/docs/examples.md +126 -0
- data/docs/header_transformations.md +23 -0
- data/docs/header_validations.md +1 -0
- data/docs/history.md +1 -0
- data/docs/instrumentation.md +2 -1
- data/docs/migrating_from_csv.md +1 -0
- data/docs/options.md +20 -18
- data/docs/parsing_strategy.md +1 -0
- data/docs/real_world_csv.md +51 -1
- data/docs/releases/1.16.0/performance_notes.md +15 -15
- data/docs/releases/1.17.0/benchmarks.md +121 -0
- data/docs/releases/1.17.0/changes.md +161 -0
- data/docs/releases/1.17.0/performance_notes.md +126 -0
- data/docs/row_col_sep.md +21 -1
- data/docs/ruby_csv_pitfalls.md +1 -0
- data/docs/value_converters.md +24 -0
- data/docs/warnings.md +141 -0
- data/ext/smarter_csv/smarter_csv.c +98 -32
- data/images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg +106 -0
- data/images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg +181 -0
- data/images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg +179 -0
- data/lib/smarter_csv/auto_detection.rb +215 -30
- data/lib/smarter_csv/file_io.rb +2 -2
- data/lib/smarter_csv/hash_transformations.rb +29 -13
- data/lib/smarter_csv/parser.rb +42 -33
- data/lib/smarter_csv/peekable_io.rb +453 -0
- data/lib/smarter_csv/reader.rb +119 -23
- data/lib/smarter_csv/reader_options.rb +61 -1
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +40 -12
- metadata +12 -5
- data/TO_DO_v2.md +0 -14
- data/ext/smarter_csv/Makefile +0 -270
data/lib/smarter_csv/reader.rb
CHANGED
|
@@ -7,7 +7,8 @@ module SmarterCSV
|
|
|
7
7
|
include Enumerable
|
|
8
8
|
|
|
9
9
|
# Default chunk size used by each_chunk when chunk_size is not explicitly set.
|
|
10
|
-
# A warning is
|
|
10
|
+
# A warning is recorded (and emitted via Rails.logger or Kernel#warn) so users
|
|
11
|
+
# know to configure it explicitly.
|
|
11
12
|
DEFAULT_CHUNK_SIZE = 100
|
|
12
13
|
|
|
13
14
|
include ::SmarterCSV::Reader::Options
|
|
@@ -21,7 +22,7 @@ module SmarterCSV
|
|
|
21
22
|
|
|
22
23
|
attr_reader :input, :options
|
|
23
24
|
attr_reader :csv_line_count, :chunk_count, :file_line_count
|
|
24
|
-
attr_reader :enforce_utf8, :has_rails, :has_acceleration
|
|
25
|
+
attr_reader :enforce_utf8, :has_rails, :has_rails_logger, :has_acceleration
|
|
25
26
|
attr_reader :errors, :warnings, :headers, :raw_header, :result
|
|
26
27
|
|
|
27
28
|
def self.default_options
|
|
@@ -30,7 +31,9 @@ module SmarterCSV
|
|
|
30
31
|
|
|
31
32
|
# rubocop:disable Naming/MethodName
|
|
32
33
|
def headerA
|
|
33
|
-
|
|
34
|
+
record_warning(type: :deprecation, code: :header_a_method) do
|
|
35
|
+
"Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
|
|
36
|
+
end
|
|
34
37
|
@headerA
|
|
35
38
|
end
|
|
36
39
|
# rubocop:enable Naming/MethodName
|
|
@@ -39,6 +42,7 @@ module SmarterCSV
|
|
|
39
42
|
def initialize(input, given_options = {})
|
|
40
43
|
@input = input
|
|
41
44
|
@has_rails = !!defined?(Rails)
|
|
45
|
+
@has_rails_logger = defined?(::Rails) && ::Rails.respond_to?(:logger) && !::Rails.logger.nil?
|
|
42
46
|
@csv_line_count = 0
|
|
43
47
|
@chunk_count = 0
|
|
44
48
|
@errors = {}
|
|
@@ -47,9 +51,13 @@ module SmarterCSV
|
|
|
47
51
|
@headers = nil
|
|
48
52
|
@raw_header = nil # header as it appears in the file
|
|
49
53
|
@result = []
|
|
50
|
-
@warnings =
|
|
54
|
+
@warnings = []
|
|
55
|
+
@warnings_by_key = {}
|
|
51
56
|
@enforce_utf8 = false # only set to true if needed (after options parsing)
|
|
52
57
|
@options = process_options(given_options)
|
|
58
|
+
# Cache quote_char as an ivar — stable for the Reader's lifetime; avoids per-row/per-line hash lookups.
|
|
59
|
+
@quote_char = @options[:quote_char]
|
|
60
|
+
@doubled_quote_chars = @quote_char * 2
|
|
53
61
|
# true if it is compiled with acceleration
|
|
54
62
|
@has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
|
|
55
63
|
end
|
|
@@ -87,7 +95,11 @@ module SmarterCSV
|
|
|
87
95
|
|
|
88
96
|
chunk_size = @options[:chunk_size]
|
|
89
97
|
if chunk_size.nil?
|
|
90
|
-
|
|
98
|
+
unless @options[:verbose] == :quiet
|
|
99
|
+
record_warning(type: :config, code: :chunk_size_default) do
|
|
100
|
+
"chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
|
|
101
|
+
end
|
|
102
|
+
end
|
|
91
103
|
chunk_size = DEFAULT_CHUNK_SIZE
|
|
92
104
|
end
|
|
93
105
|
unless chunk_size.is_a?(Integer) && chunk_size >= 1
|
|
@@ -106,23 +118,65 @@ module SmarterCSV
|
|
|
106
118
|
end
|
|
107
119
|
end
|
|
108
120
|
|
|
109
|
-
def process(&block)
|
|
121
|
+
def process(&block)
|
|
110
122
|
@enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
|
|
111
123
|
@verbose = options[:verbose]
|
|
112
124
|
|
|
113
125
|
begin
|
|
114
|
-
fh = input.
|
|
126
|
+
fh = input.is_a?(String) ? File.open(input, "r:#{options[:file_encoding]}") : input
|
|
127
|
+
|
|
128
|
+
# Rewindable inputs (File, Tempfile, StringIO, Zlib::GzipReader, ...) use
|
|
129
|
+
# native rewind for auto-detection — no wrapper overhead in the hot loop.
|
|
130
|
+
# Non-rewindable streams (pipes, STDIN, custom non-seekable IOs) go through
|
|
131
|
+
# PeekableIO which buffers the first chunk so detection can replay without
|
|
132
|
+
# seeking the underlying source.
|
|
133
|
+
has_rewind = seekable?(fh)
|
|
134
|
+
|
|
135
|
+
unless has_rewind
|
|
136
|
+
# buffer_size has been validated and clamped by reader_options.rb to be in
|
|
137
|
+
# [MIN_BUFFER_SIZE, MAX_BUFFER_SIZE], with a cross-validation bump if it was
|
|
138
|
+
# below auto_row_sep_chars. Use it directly.
|
|
139
|
+
fh = SmarterCSV::PeekableIO.new(fh, options, buffer_size: options[:buffer_size])
|
|
140
|
+
end
|
|
115
141
|
|
|
116
142
|
if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
|
|
117
|
-
|
|
143
|
+
unless options[:verbose] == :quiet
|
|
144
|
+
record_warning(type: :encoding, code: :utf8_missing_binary_mode) do
|
|
145
|
+
'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
|
|
146
|
+
end
|
|
147
|
+
end
|
|
118
148
|
end
|
|
119
149
|
|
|
120
|
-
#
|
|
121
|
-
|
|
122
|
-
#
|
|
123
|
-
|
|
150
|
+
# Auto-detection. Two orchestrations, same detection functions:
|
|
151
|
+
# has_rewind=true → native fh.rewind between passes; BOM is stripped by
|
|
152
|
+
# next_line_with_counts on the first real line.
|
|
153
|
+
# has_rewind=false → PeekableIO buffers the first chunk; peek strips BOM,
|
|
154
|
+
# rewind_buffer replays, freeze_buffer! locks the buffer.
|
|
155
|
+
if options[:row_sep]&.to_sym == :auto || options[:col_sep]&.to_sym == :auto
|
|
156
|
+
if has_rewind
|
|
157
|
+
options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
|
|
158
|
+
fh.rewind
|
|
159
|
+
@file_line_count = 0
|
|
160
|
+
@csv_line_count = 0
|
|
161
|
+
# skip_lines feeds clean data lines to guess_column_separator. When col_sep is
|
|
162
|
+
# explicit, it's wasted work — the bytes are consumed and rewound. Guard it.
|
|
163
|
+
skip_lines(fh, options) if options[:skip_lines] && options[:col_sep]&.to_sym == :auto
|
|
164
|
+
options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
|
|
165
|
+
fh.rewind
|
|
166
|
+
@file_line_count = 0
|
|
167
|
+
@csv_line_count = 0
|
|
168
|
+
else
|
|
169
|
+
fh.peek
|
|
170
|
+
options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
|
|
171
|
+
rewind_buffer(fh)
|
|
172
|
+
skip_lines(fh, options) if options[:skip_lines] && options[:col_sep]&.to_sym == :auto
|
|
173
|
+
options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
|
|
174
|
+
fh.freeze_buffer!
|
|
175
|
+
rewind_buffer(fh)
|
|
176
|
+
end
|
|
177
|
+
end
|
|
124
178
|
|
|
125
|
-
skip_lines(fh, options)
|
|
179
|
+
skip_lines(fh, options) if options[:skip_lines] # skip comments
|
|
126
180
|
|
|
127
181
|
# NOTE: we are no longer using header_size
|
|
128
182
|
@headers, _header_size = process_headers(fh, options)
|
|
@@ -195,8 +249,6 @@ module SmarterCSV
|
|
|
195
249
|
@delete_nil_keys = !!options[:key_mapping]
|
|
196
250
|
@delete_empty_keys = !!options[:key_mapping] || @headers.include?(:"")
|
|
197
251
|
|
|
198
|
-
# Cache quote_char as an ivar for the stitch-loop memchr guard (avoids hash lookup per continuation line).
|
|
199
|
-
@quote_char = options[:quote_char]
|
|
200
252
|
# Cache field_size_limit as an ivar (nil when unset → one nil-check per row, no method calls).
|
|
201
253
|
@field_size_limit = options[:field_size_limit]
|
|
202
254
|
|
|
@@ -221,9 +273,9 @@ module SmarterCSV
|
|
|
221
273
|
|
|
222
274
|
if on_start
|
|
223
275
|
input_meta = if @input.is_a?(String)
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
276
|
+
{ input: @input, file_size: (File.size(@input) rescue nil) }
|
|
277
|
+
else
|
|
278
|
+
{ input: @input.class.name, file_size: nil }
|
|
227
279
|
end
|
|
228
280
|
on_start.call(input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
|
|
229
281
|
end
|
|
@@ -254,7 +306,7 @@ module SmarterCSV
|
|
|
254
306
|
hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
|
|
255
307
|
end
|
|
256
308
|
else
|
|
257
|
-
has_quotes = line.include?(
|
|
309
|
+
has_quotes = line.include?(@quote_char)
|
|
258
310
|
hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
|
|
259
311
|
if @quote_escaping_auto && data_size == -1 && line.include?('\\')
|
|
260
312
|
hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
|
|
@@ -515,6 +567,50 @@ module SmarterCSV
|
|
|
515
567
|
|
|
516
568
|
private
|
|
517
569
|
|
|
570
|
+
# Records a warning into the histogram and emits it to the warning sink.
|
|
571
|
+
# `@warnings` is an Array of unique (type, code) records with a `count` field.
|
|
572
|
+
# `@warnings_by_key` is a dedup map keyed by `[type, code]` — key shape must
|
|
573
|
+
# stay fixed to keep both structures bounded by distinct codes, not by calls.
|
|
574
|
+
# The block form avoids string allocation on dedup-hit: on a repeat call we
|
|
575
|
+
# increment count and return without yielding the block.
|
|
576
|
+
# `dedup: false` bypasses the dedup map (still appends a new record per call);
|
|
577
|
+
# reserve for per-occurrence warnings where each call carries distinct info.
|
|
578
|
+
# `severity:` controls the Rails.logger level (`:debug`/`:info`/`:warn`/`:error`/`:fatal`);
|
|
579
|
+
# the non-Rails fallback is always `Kernel#warn` regardless of severity.
|
|
580
|
+
# `type` is purely a semantic grouping for callers iterating reader.warnings.
|
|
581
|
+
def record_warning(type:, code:, severity: :warn, dedup: true)
|
|
582
|
+
key = [type, code]
|
|
583
|
+
if dedup && (existing = @warnings_by_key[key])
|
|
584
|
+
existing[:count] += 1
|
|
585
|
+
return
|
|
586
|
+
end
|
|
587
|
+
|
|
588
|
+
message = yield
|
|
589
|
+
record = { type: type, code: code, severity: severity, message: message, count: 1 }
|
|
590
|
+
@warnings << record
|
|
591
|
+
@warnings_by_key[key] = record if dedup
|
|
592
|
+
|
|
593
|
+
if @has_rails_logger
|
|
594
|
+
::Rails.logger.public_send(severity, "SmarterCSV: #{message}")
|
|
595
|
+
else
|
|
596
|
+
warn "SmarterCSV: #{message}"
|
|
597
|
+
end
|
|
598
|
+
end
|
|
599
|
+
|
|
600
|
+
# True when the IO is genuinely seekable — i.e. rewind will succeed at the kernel
|
|
601
|
+
# level, not just the Ruby method table. IO.pipe readers respond to :rewind but
|
|
602
|
+
# raise at the kernel layer when called; probing #pos surfaces that at decision time.
|
|
603
|
+
# Rescue SystemCallError (parent of all Errno::*) because the exact subclass varies
|
|
604
|
+
# by Ruby implementation: MRI raises Errno::ESPIPE, JRuby raises Errno::EPIPE.
|
|
605
|
+
def seekable?(io)
|
|
606
|
+
return false unless io.respond_to?(:rewind)
|
|
607
|
+
|
|
608
|
+
io.pos if io.respond_to?(:pos)
|
|
609
|
+
true
|
|
610
|
+
rescue SystemCallError, IOError, NotImplementedError
|
|
611
|
+
false
|
|
612
|
+
end
|
|
613
|
+
|
|
518
614
|
# Determine if a line has unbalanced quotes requiring multiline stitching.
|
|
519
615
|
# For :auto mode, uses dual counting to avoid false multiline detection.
|
|
520
616
|
# For :standard quote_boundary mode, uses a full state machine so that
|
|
@@ -663,9 +759,9 @@ module SmarterCSV
|
|
|
663
759
|
elsif !in_quotes
|
|
664
760
|
# Non-quote character: track whether field has started
|
|
665
761
|
if strip # NOTE(review): two direct == comparisons were noted as faster than Array#include? in this hot loop, but the check below uses Array#include? — confirm which is intended
|
|
666
|
-
field_started = true unless
|
|
667
|
-
|
|
668
|
-
|
|
762
|
+
field_started = true unless [' ', "\t"].include?(line[i])
|
|
763
|
+
else
|
|
764
|
+
field_started = true
|
|
669
765
|
end
|
|
670
766
|
end
|
|
671
767
|
i += 1
|
|
@@ -679,7 +775,7 @@ module SmarterCSV
|
|
|
679
775
|
|
|
680
776
|
# SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
|
|
681
777
|
# and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
|
|
682
|
-
BLANK_RE = /\A
|
|
778
|
+
BLANK_RE = /\A[[:space:]]*\z/.freeze # Unicode whitespace, same as Rails' String#blank?
|
|
683
779
|
|
|
684
780
|
# Optimization #5: fast-path empty string and nil checks before regex
|
|
685
781
|
def blank?(value)
|
|
@@ -5,8 +5,13 @@ module SmarterCSV
|
|
|
5
5
|
module Options
|
|
6
6
|
DEFAULT_OPTIONS = {
|
|
7
7
|
acceleration: true, # whether the user wants to use acceleration or not
|
|
8
|
-
auto_row_sep_chars:
|
|
8
|
+
auto_row_sep_chars: SmarterCSV::AutoDetection::DEFAULT_AUTO_ROW_SEP_CHARS, # initial scan chunk size (default 4096).
|
|
9
|
+
# Validated against [MIN, MAX]_AUTO_ROW_SEP_CHARS in reader_options.rb.
|
|
10
|
+
# Doubling escalation in guess_line_ending caps at MAX_AUTO_ROW_SEP_CHARS.
|
|
9
11
|
bad_row_limit: nil,
|
|
12
|
+
buffer_size: SmarterCSV::PeekableIO::DEFAULT_PEEK_SIZE, # peek buffer chunk size for non-seekable inputs.
|
|
13
|
+
# Validated: nil/0 → use default; clamped to [MIN_BUFFER_SIZE, MAX_BUFFER_SIZE];
|
|
14
|
+
# bumped if < auto_row_sep_chars (see validation in reader_options.rb).
|
|
10
15
|
chunk_size: nil,
|
|
11
16
|
col_sep: :auto, # was: ',',
|
|
12
17
|
collect_raw_lines: true,
|
|
@@ -118,12 +123,14 @@ module SmarterCSV
|
|
|
118
123
|
values = Array(@options[:only_headers])
|
|
119
124
|
bad = values.reject { |v| v.is_a?(Symbol) || v.is_a?(String) }
|
|
120
125
|
raise SmarterCSV::ValidationError, "headers: { only: } elements must be String or Symbol, got: #{bad.map(&:class).uniq.inspect}" if bad.any?
|
|
126
|
+
|
|
121
127
|
@options[:only_headers] = values.map(&:to_sym)
|
|
122
128
|
end
|
|
123
129
|
if @options[:except_headers]
|
|
124
130
|
values = Array(@options[:except_headers])
|
|
125
131
|
bad = values.reject { |v| v.is_a?(Symbol) || v.is_a?(String) }
|
|
126
132
|
raise SmarterCSV::ValidationError, "headers: { except: } elements must be String or Symbol, got: #{bad.map(&:class).uniq.inspect}" if bad.any?
|
|
133
|
+
|
|
127
134
|
@options[:except_headers] = values.map(&:to_sym)
|
|
128
135
|
end
|
|
129
136
|
|
|
@@ -187,6 +194,59 @@ module SmarterCSV
|
|
|
187
194
|
unless %i[legacy standard].include?(options[:quote_boundary])
|
|
188
195
|
errors << "invalid quote_boundary: must be :legacy or :standard"
|
|
189
196
|
end
|
|
197
|
+
arc = options[:auto_row_sep_chars]
|
|
198
|
+
min_arc = SmarterCSV::AutoDetection::MIN_AUTO_ROW_SEP_CHARS
|
|
199
|
+
max_arc = SmarterCSV::AutoDetection::MAX_AUTO_ROW_SEP_CHARS
|
|
200
|
+
default_arc = DEFAULT_OPTIONS[:auto_row_sep_chars]
|
|
201
|
+
|
|
202
|
+
if arc.is_a?(Integer)
|
|
203
|
+
if arc < min_arc
|
|
204
|
+
warn "WARNING: auto_row_sep_chars value #{arc.inspect} is below minimum (#{min_arc}); using default (#{default_arc})" unless options[:verbose] == :quiet
|
|
205
|
+
options[:auto_row_sep_chars] = default_arc
|
|
206
|
+
elsif arc > max_arc
|
|
207
|
+
warn "WARNING: auto_row_sep_chars value #{arc.inspect} exceeds maximum (#{max_arc}); clamping to #{max_arc}" unless options[:verbose] == :quiet
|
|
208
|
+
options[:auto_row_sep_chars] = max_arc
|
|
209
|
+
end
|
|
210
|
+
else
|
|
211
|
+
warn "WARNING: invalid auto_row_sep_chars value #{arc.inspect} — must be an Integer; using default (#{default_arc})" unless options[:verbose] == :quiet
|
|
212
|
+
options[:auto_row_sep_chars] = default_arc
|
|
213
|
+
end
|
|
214
|
+
# buffer_size validation:
|
|
215
|
+
# nil / 0 → unset, use default
|
|
216
|
+
# non-Integer → warn, use default
|
|
217
|
+
# < MIN_BUFFER_SIZE → warn, clamp to MIN_BUFFER_SIZE
|
|
218
|
+
# > MAX_BUFFER_SIZE → warn, clamp to MAX_BUFFER_SIZE
|
|
219
|
+
# Cross-validation (after the above):
|
|
220
|
+
# < auto_row_sep_chars → warn, bump to max(2 * buffer_size, MIN_AUTO_ROW_SEP_CHARS), capped at MAX_BUFFER_SIZE
|
|
221
|
+
bs = options[:buffer_size]
|
|
222
|
+
default_bs = DEFAULT_OPTIONS[:buffer_size]
|
|
223
|
+
min_bs = SmarterCSV::PeekableIO::MIN_BUFFER_SIZE
|
|
224
|
+
max_bs = SmarterCSV::PeekableIO::MAX_BUFFER_SIZE
|
|
225
|
+
quiet = options[:verbose] == :quiet
|
|
226
|
+
|
|
227
|
+
if bs.nil? || bs == 0
|
|
228
|
+
options[:buffer_size] = default_bs
|
|
229
|
+
elsif !bs.is_a?(Integer)
|
|
230
|
+
warn "WARNING: invalid buffer_size value #{bs.inspect} — must be an Integer; using default (#{default_bs})" unless quiet
|
|
231
|
+
options[:buffer_size] = default_bs
|
|
232
|
+
elsif bs < min_bs
|
|
233
|
+
warn "WARNING: buffer_size #{bs} is below minimum (#{min_bs}); clamping to #{min_bs}" unless quiet
|
|
234
|
+
options[:buffer_size] = min_bs
|
|
235
|
+
elsif bs > max_bs
|
|
236
|
+
warn "WARNING: buffer_size #{bs} exceeds maximum (#{max_bs}); clamping to #{max_bs}" unless quiet
|
|
237
|
+
options[:buffer_size] = max_bs
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
# Cross-validation: ensure buffer_size is reasonably sized relative to auto_row_sep_chars.
|
|
241
|
+
# Gentle bump (does not strictly enforce buffer_size >= auto_row_sep_chars — PeekableIO
|
|
242
|
+
# handles smaller buffers via multiple reads — but ensures at least one default scan window fits).
|
|
243
|
+
arc = options[:auto_row_sep_chars]
|
|
244
|
+
if arc.is_a?(Integer) && options[:buffer_size] < arc
|
|
245
|
+
bumped = [2 * options[:buffer_size], SmarterCSV::AutoDetection::MIN_AUTO_ROW_SEP_CHARS].max
|
|
246
|
+
bumped = [bumped, max_bs].min # Clamp bumped value to not exceed MAX_BUFFER_SIZE
|
|
247
|
+
warn "WARNING: buffer_size (#{options[:buffer_size]}) < auto_row_sep_chars (#{arc}); bumping buffer_size to #{bumped}" unless quiet
|
|
248
|
+
options[:buffer_size] = bumped
|
|
249
|
+
end
|
|
190
250
|
fsl = options[:field_size_limit]
|
|
191
251
|
unless fsl.nil? || (fsl.is_a?(Integer) && fsl > 0)
|
|
192
252
|
errors << "invalid field_size_limit: must be nil or a positive Integer (got #{fsl.inspect})"
|
data/lib/smarter_csv/version.rb
CHANGED
data/lib/smarter_csv.rb
CHANGED
|
@@ -5,9 +5,10 @@ require "smarter_csv/version"
|
|
|
5
5
|
require "smarter_csv/errors"
|
|
6
6
|
|
|
7
7
|
require "smarter_csv/file_io"
|
|
8
|
+
require "smarter_csv/auto_detection" # MAX_AUTO_ROW_SEP_CHARS is the canonical 64KB cap; loaded first so peekable_io.rb and reader_options.rb can reference it
|
|
9
|
+
require "smarter_csv/peekable_io"
|
|
8
10
|
require "smarter_csv/reader_options"
|
|
9
11
|
require "smarter_csv/writer_options"
|
|
10
|
-
require "smarter_csv/auto_detection"
|
|
11
12
|
require 'smarter_csv/header_transformations'
|
|
12
13
|
require 'smarter_csv/header_validations'
|
|
13
14
|
require "smarter_csv/headers"
|
|
@@ -66,24 +67,30 @@ module SmarterCSV
|
|
|
66
67
|
# reader = SmarterCSV::Reader.new(input, options)
|
|
67
68
|
# reader.process # with or without block
|
|
68
69
|
#
|
|
69
|
-
# After calling any of the class-level methods, errors from the last run
|
|
70
|
+
# After calling any of the class-level methods, errors and warnings from the last run
|
|
71
|
+
# are available via:
|
|
70
72
|
#
|
|
71
|
-
# SmarterCSV.errors
|
|
73
|
+
# SmarterCSV.errors # => { bad_row_count: 2, bad_rows: [...] }
|
|
74
|
+
# SmarterCSV.warnings # => [ { type:, code:, severity:, message:, count: }, ... ]
|
|
72
75
|
#
|
|
73
|
-
#
|
|
74
|
-
#
|
|
75
|
-
# multi-threaded environments (Puma, Sidekiq).
|
|
76
|
-
# are retained per thread.
|
|
76
|
+
# These expose the same reader.errors / reader.warnings without requiring access to the
|
|
77
|
+
# Reader instance. Both are cleared at the start of each call and stored per-thread, so
|
|
78
|
+
# this is safe in multi-threaded environments (Puma, Sidekiq). Only the most recent
|
|
79
|
+
# call's errors and warnings are retained per thread.
|
|
77
80
|
#
|
|
78
81
|
def self.process(input, given_options = {}, &block)
|
|
79
82
|
Thread.current[:current_thread_recent_errors] = {}
|
|
83
|
+
Thread.current[:current_thread_recent_warnings] = []
|
|
80
84
|
reader = Reader.new(input, given_options)
|
|
81
85
|
reader.process(&block)
|
|
82
86
|
ensure
|
|
83
87
|
# Preserve partial error state when processing raises mid-stream
|
|
84
88
|
# (e.g. TooManyBadRows, or a user block raising). `reader` is nil if
|
|
85
89
|
# Reader.new itself raised before the local was assigned.
|
|
86
|
-
|
|
90
|
+
if reader
|
|
91
|
+
Thread.current[:current_thread_recent_errors] = reader.errors
|
|
92
|
+
Thread.current[:current_thread_recent_warnings] = reader.warnings
|
|
93
|
+
end
|
|
87
94
|
end
|
|
88
95
|
|
|
89
96
|
# Convenience method for parsing a CSV string directly.
|
|
@@ -111,10 +118,14 @@ module SmarterCSV
|
|
|
111
118
|
# SmarterCSV.each("data.csv").lazy.map { |h| h[:name] }.first(10)
|
|
112
119
|
def self.each(input, options = {}, &block)
|
|
113
120
|
Thread.current[:current_thread_recent_errors] = {}
|
|
121
|
+
Thread.current[:current_thread_recent_warnings] = []
|
|
114
122
|
reader = Reader.new(input, options)
|
|
115
123
|
reader.each(&block)
|
|
116
124
|
ensure
|
|
117
|
-
|
|
125
|
+
if reader
|
|
126
|
+
Thread.current[:current_thread_recent_errors] = reader.errors
|
|
127
|
+
Thread.current[:current_thread_recent_warnings] = reader.warnings
|
|
128
|
+
end
|
|
118
129
|
end
|
|
119
130
|
|
|
120
131
|
# Yields each chunk as Array<Hash> plus its 0-based chunk index.
|
|
@@ -128,10 +139,14 @@ module SmarterCSV
|
|
|
128
139
|
# SmarterCSV.each_chunk("data.csv", chunk_size: 100).with_index { |chunk, i| ... }
|
|
129
140
|
def self.each_chunk(input, options = {}, &block)
|
|
130
141
|
Thread.current[:current_thread_recent_errors] = {}
|
|
142
|
+
Thread.current[:current_thread_recent_warnings] = []
|
|
131
143
|
reader = Reader.new(input, options)
|
|
132
144
|
reader.each_chunk(&block)
|
|
133
145
|
ensure
|
|
134
|
-
|
|
146
|
+
if reader
|
|
147
|
+
Thread.current[:current_thread_recent_errors] = reader.errors
|
|
148
|
+
Thread.current[:current_thread_recent_warnings] = reader.warnings
|
|
149
|
+
end
|
|
135
150
|
end
|
|
136
151
|
|
|
137
152
|
# Returns the errors from the most recent call to .process, .parse, .each, or .each_chunk
|
|
@@ -149,6 +164,21 @@ module SmarterCSV
|
|
|
149
164
|
Thread.current[:current_thread_recent_errors] || {}
|
|
150
165
|
end
|
|
151
166
|
|
|
167
|
+
# Returns the warnings from the most recent call to .process, .parse, .each, or .each_chunk
|
|
168
|
+
# on the current thread. Cleared at the start of each new call.
|
|
169
|
+
#
|
|
170
|
+
# Each warning is a Hash: { type:, code:, severity:, message:, count: }.
|
|
171
|
+
# Repeated warnings of the same (type, code) are deduped — `count` tracks
|
|
172
|
+
# the number of occurrences.
|
|
173
|
+
#
|
|
174
|
+
# Example:
|
|
175
|
+
# SmarterCSV.process('data.csv')
|
|
176
|
+
# SmarterCSV.warnings.each { |w| logger.warn("[#{w[:type]}/#{w[:code]}] #{w[:message]} (×#{w[:count]})") }
|
|
177
|
+
#
|
|
178
|
+
def self.warnings
|
|
179
|
+
Thread.current[:current_thread_recent_warnings] || []
|
|
180
|
+
end
|
|
181
|
+
|
|
152
182
|
# Convenience method for generating CSV files, IO objects, or in-memory strings.
|
|
153
183
|
#
|
|
154
184
|
# When called WITHOUT a first argument, generates CSV in memory and returns it as a String.
|
|
@@ -184,7 +214,6 @@ module SmarterCSV
|
|
|
184
214
|
# end
|
|
185
215
|
# end
|
|
186
216
|
#
|
|
187
|
-
# rubocop:disable Lint/UnusedMethodArgument
|
|
188
217
|
def self.generate(file_path_or_io = nil, options = {}, &block)
|
|
189
218
|
raise ArgumentError, "SmarterCSV.generate requires a block" unless block_given?
|
|
190
219
|
|
|
@@ -213,5 +242,4 @@ module SmarterCSV
|
|
|
213
242
|
end
|
|
214
243
|
end
|
|
215
244
|
end
|
|
216
|
-
# rubocop:enable Lint/UnusedMethodArgument
|
|
217
245
|
end
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: smarter_csv
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.
|
|
4
|
+
version: 1.17.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Tilo Sloboda
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2026-
|
|
10
|
+
date: 2026-05-14 00:00:00.000000000 Z
|
|
11
11
|
dependencies: []
|
|
12
12
|
description: |
|
|
13
13
|
SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
|
|
@@ -39,7 +39,7 @@ files:
|
|
|
39
39
|
- LICENSE.txt
|
|
40
40
|
- README.md
|
|
41
41
|
- Rakefile
|
|
42
|
-
-
|
|
42
|
+
- TO_DO.md
|
|
43
43
|
- docs/_introduction.md
|
|
44
44
|
- docs/bad_row_quarantine.md
|
|
45
45
|
- docs/basic_read_api.md
|
|
@@ -59,10 +59,13 @@ files:
|
|
|
59
59
|
- docs/releases/1.16.0/benchmarks.md
|
|
60
60
|
- docs/releases/1.16.0/changes.md
|
|
61
61
|
- docs/releases/1.16.0/performance_notes.md
|
|
62
|
+
- docs/releases/1.17.0/benchmarks.md
|
|
63
|
+
- docs/releases/1.17.0/changes.md
|
|
64
|
+
- docs/releases/1.17.0/performance_notes.md
|
|
62
65
|
- docs/row_col_sep.md
|
|
63
66
|
- docs/ruby_csv_pitfalls.md
|
|
64
67
|
- docs/value_converters.md
|
|
65
|
-
-
|
|
68
|
+
- docs/warnings.md
|
|
66
69
|
- ext/smarter_csv/extconf.rb
|
|
67
70
|
- ext/smarter_csv/smarter_csv.c
|
|
68
71
|
- images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png
|
|
@@ -71,6 +74,9 @@ files:
|
|
|
71
74
|
- images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg
|
|
72
75
|
- images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png
|
|
73
76
|
- images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg
|
|
77
|
+
- images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg
|
|
78
|
+
- images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg
|
|
79
|
+
- images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg
|
|
74
80
|
- lib/smarter_csv.rb
|
|
75
81
|
- lib/smarter_csv/auto_detection.rb
|
|
76
82
|
- lib/smarter_csv/errors.rb
|
|
@@ -80,6 +86,7 @@ files:
|
|
|
80
86
|
- lib/smarter_csv/header_validations.rb
|
|
81
87
|
- lib/smarter_csv/headers.rb
|
|
82
88
|
- lib/smarter_csv/parser.rb
|
|
89
|
+
- lib/smarter_csv/peekable_io.rb
|
|
83
90
|
- lib/smarter_csv/reader.rb
|
|
84
91
|
- lib/smarter_csv/reader_options.rb
|
|
85
92
|
- lib/smarter_csv/version.rb
|
|
@@ -110,7 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
110
117
|
- !ruby/object:Gem::Version
|
|
111
118
|
version: '0'
|
|
112
119
|
requirements: []
|
|
113
|
-
rubygems_version: 4.0.
|
|
120
|
+
rubygems_version: 4.0.11
|
|
114
121
|
specification_version: 4
|
|
115
122
|
summary: Fastest end-to-end CSV ingestion for Ruby with smart defaults and Rails-ready
|
|
116
123
|
hash output
|
data/TO_DO_v2.md
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
# SmarterCSV v2.0 TO DO List
|
|
2
|
-
|
|
3
|
-
* add enumerable to speed up parallel processing [issue #66](https://github.com/tilo/smarter_csv/issues/66), [issue #32](https://github.com/tilo/smarter_csv/issues/32)
|
|
4
|
-
* use Procs for validations and transformations [issue #118](https://github.com/tilo/smarter_csv/issues/118)
|
|
5
|
-
* make @errors and @warnings work [issue #118](https://github.com/tilo/smarter_csv/issues/118)
|
|
6
|
-
* skip file opening, allow reading from CSV string, e.g. reading from S3 file [issue #120](https://github.com/tilo/smarter_csv/issues/120).
|
|
7
|
-
Or stream large file from S3 (linked in the issue)
|
|
8
|
-
* Collect all Errors, before surfacing them. Avoid throwing an exception on the first error [issue #133](https://github.com/tilo/smarter_csv/issues/133)
|
|
9
|
-
* Don't call rewind on filehandle
|
|
10
|
-
* [2.0 BUG] :convert_values_to_numeric_unless_leading_zeros drops leading zeros [issue #151](https://github.com/tilo/smarter_csv/issues/151)
|
|
11
|
-
* [2.0 BUG] convert_to_float saves Proc as @@convert_to_integer [issue #157](https://github.com/tilo/smarter_csv/issues/157)
|
|
12
|
-
* Provide an example for custom Procs for hash_transformations in the docs [issue #174](https://github.com/tilo/smarter_csv/issues/174)
|
|
13
|
-
* Replace remove_empty_values: false [issue #213](https://github.com/tilo/smarter_csv/issues/213)
|
|
14
|
-
|