smarter_csv 1.16.4 → 1.17.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. checksums.yaml +4 -4
  2. data/.rubocop.yml +10 -1
  3. data/CHANGELOG.md +54 -0
  4. data/Gemfile +10 -5
  5. data/README.md +98 -14
  6. data/TO_DO.md +109 -0
  7. data/docs/_introduction.md +1 -0
  8. data/docs/bad_row_quarantine.md +2 -1
  9. data/docs/basic_read_api.md +6 -1
  10. data/docs/basic_write_api.md +30 -0
  11. data/docs/batch_processing.md +25 -0
  12. data/docs/column_selection.md +1 -0
  13. data/docs/data_transformations.md +1 -0
  14. data/docs/examples.md +126 -0
  15. data/docs/header_transformations.md +23 -0
  16. data/docs/header_validations.md +1 -0
  17. data/docs/history.md +1 -0
  18. data/docs/instrumentation.md +2 -1
  19. data/docs/migrating_from_csv.md +1 -0
  20. data/docs/options.md +20 -18
  21. data/docs/parsing_strategy.md +1 -0
  22. data/docs/real_world_csv.md +51 -1
  23. data/docs/releases/1.16.0/performance_notes.md +15 -15
  24. data/docs/releases/1.17.0/benchmarks.md +121 -0
  25. data/docs/releases/1.17.0/changes.md +161 -0
  26. data/docs/releases/1.17.0/performance_notes.md +126 -0
  27. data/docs/row_col_sep.md +21 -1
  28. data/docs/ruby_csv_pitfalls.md +1 -0
  29. data/docs/value_converters.md +24 -0
  30. data/docs/warnings.md +141 -0
  31. data/ext/smarter_csv/smarter_csv.c +98 -32
  32. data/images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg +106 -0
  33. data/images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg +181 -0
  34. data/images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg +179 -0
  35. data/lib/smarter_csv/auto_detection.rb +215 -30
  36. data/lib/smarter_csv/file_io.rb +2 -2
  37. data/lib/smarter_csv/hash_transformations.rb +29 -13
  38. data/lib/smarter_csv/parser.rb +42 -33
  39. data/lib/smarter_csv/peekable_io.rb +453 -0
  40. data/lib/smarter_csv/reader.rb +119 -23
  41. data/lib/smarter_csv/reader_options.rb +61 -1
  42. data/lib/smarter_csv/version.rb +1 -1
  43. data/lib/smarter_csv.rb +40 -12
  44. metadata +12 -5
  45. data/TO_DO_v2.md +0 -14
  46. data/ext/smarter_csv/Makefile +0 -270
@@ -7,7 +7,8 @@ module SmarterCSV
7
7
  include Enumerable
8
8
 
9
9
  # Default chunk size used by each_chunk when chunk_size is not explicitly set.
10
- # A warning is emitted to STDERR so users know to configure it explicitly.
10
+ # A warning is recorded (and emitted via Rails.logger or Kernel#warn) so users
11
+ # know to configure it explicitly.
11
12
  DEFAULT_CHUNK_SIZE = 100
12
13
 
13
14
  include ::SmarterCSV::Reader::Options
@@ -21,7 +22,7 @@ module SmarterCSV
21
22
 
22
23
  attr_reader :input, :options
23
24
  attr_reader :csv_line_count, :chunk_count, :file_line_count
24
- attr_reader :enforce_utf8, :has_rails, :has_acceleration
25
+ attr_reader :enforce_utf8, :has_rails, :has_rails_logger, :has_acceleration
25
26
  attr_reader :errors, :warnings, :headers, :raw_header, :result
26
27
 
27
28
  def self.default_options
@@ -30,7 +31,9 @@ module SmarterCSV
30
31
 
31
32
  # rubocop:disable Naming/MethodName
32
33
  def headerA
33
- warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
34
+ record_warning(type: :deprecation, code: :header_a_method) do
35
+ "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
36
+ end
34
37
  @headerA
35
38
  end
36
39
  # rubocop:enable Naming/MethodName
@@ -39,6 +42,7 @@ module SmarterCSV
39
42
  def initialize(input, given_options = {})
40
43
  @input = input
41
44
  @has_rails = !!defined?(Rails)
45
+ @has_rails_logger = defined?(::Rails) && ::Rails.respond_to?(:logger) && !::Rails.logger.nil?
42
46
  @csv_line_count = 0
43
47
  @chunk_count = 0
44
48
  @errors = {}
@@ -47,9 +51,13 @@ module SmarterCSV
47
51
  @headers = nil
48
52
  @raw_header = nil # header as it appears in the file
49
53
  @result = []
50
- @warnings = {}
54
+ @warnings = []
55
+ @warnings_by_key = {}
51
56
  @enforce_utf8 = false # only set to true if needed (after options parsing)
52
57
  @options = process_options(given_options)
58
+ # Cache quote_char as an ivar — stable for the Reader's lifetime; avoids per-row/per-line hash lookups.
59
+ @quote_char = @options[:quote_char]
60
+ @doubled_quote_chars = @quote_char * 2
53
61
  # true if it is compiled with acceleration
54
62
  @has_acceleration = !!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)
55
63
  end
@@ -87,7 +95,11 @@ module SmarterCSV
87
95
 
88
96
  chunk_size = @options[:chunk_size]
89
97
  if chunk_size.nil?
90
- warn "SmarterCSV: chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning." unless @options[:verbose] == :quiet
98
+ unless @options[:verbose] == :quiet
99
+ record_warning(type: :config, code: :chunk_size_default) do
100
+ "chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
101
+ end
102
+ end
91
103
  chunk_size = DEFAULT_CHUNK_SIZE
92
104
  end
93
105
  unless chunk_size.is_a?(Integer) && chunk_size >= 1
@@ -106,23 +118,65 @@ module SmarterCSV
106
118
  end
107
119
  end
108
120
 
109
- def process(&block) # rubocop:disable Lint/UnusedMethodArgument
121
+ def process(&block)
110
122
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
111
123
  @verbose = options[:verbose]
112
124
 
113
125
  begin
114
- fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
126
+ fh = input.is_a?(String) ? File.open(input, "r:#{options[:file_encoding]}") : input
127
+
128
+ # Rewindable inputs (File, Tempfile, StringIO, Zlib::GzipReader, ...) use
129
+ # native rewind for auto-detection — no wrapper overhead in the hot loop.
130
+ # Non-rewindable streams (pipes, STDIN, custom non-seekable IOs) go through
131
+ # PeekableIO which buffers the first chunk so detection can replay without
132
+ # seeking the underlying source.
133
+ has_rewind = seekable?(fh)
134
+
135
+ unless has_rewind
136
+ # buffer_size has been validated and clamped by reader_options.rb to be in
137
+ # [MIN_BUFFER_SIZE, MAX_BUFFER_SIZE], with a cross-validation bump if it was
138
+ # below auto_row_sep_chars. Use it directly.
139
+ fh = SmarterCSV::PeekableIO.new(fh, options, buffer_size: options[:buffer_size])
140
+ end
115
141
 
116
142
  if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
117
- warn 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' unless options[:verbose] == :quiet
143
+ unless options[:verbose] == :quiet
144
+ record_warning(type: :encoding, code: :utf8_missing_binary_mode) do
145
+ 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
146
+ end
147
+ end
118
148
  end
119
149
 
120
- # auto-detect the row separator
121
- options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
122
- # attempt to auto-detect column separator
123
- options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
150
+ # Auto-detection. Two orchestrations, same detection functions:
151
+ # has_rewind=true → native fh.rewind between passes; BOM is stripped by
152
+ # next_line_with_counts on the first real line.
153
+ # has_rewind=false → PeekableIO buffers the first chunk; peek strips BOM,
154
+ # rewind_buffer replays, freeze_buffer! locks the buffer.
155
+ if options[:row_sep]&.to_sym == :auto || options[:col_sep]&.to_sym == :auto
156
+ if has_rewind
157
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
158
+ fh.rewind
159
+ @file_line_count = 0
160
+ @csv_line_count = 0
161
+ # skip_lines feeds clean data lines to guess_column_separator. When col_sep is
162
+ # explicit, it's wasted work — the bytes are consumed and rewound. Guard it.
163
+ skip_lines(fh, options) if options[:skip_lines] && options[:col_sep]&.to_sym == :auto
164
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
165
+ fh.rewind
166
+ @file_line_count = 0
167
+ @csv_line_count = 0
168
+ else
169
+ fh.peek
170
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
171
+ rewind_buffer(fh)
172
+ skip_lines(fh, options) if options[:skip_lines] && options[:col_sep]&.to_sym == :auto
173
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
174
+ fh.freeze_buffer!
175
+ rewind_buffer(fh)
176
+ end
177
+ end
124
178
 
125
- skip_lines(fh, options)
179
+ skip_lines(fh, options) if options[:skip_lines] # skip comments
126
180
 
127
181
  # NOTE: we are no longer using header_size
128
182
  @headers, _header_size = process_headers(fh, options)
@@ -195,8 +249,6 @@ module SmarterCSV
195
249
  @delete_nil_keys = !!options[:key_mapping]
196
250
  @delete_empty_keys = !!options[:key_mapping] || @headers.include?(:"")
197
251
 
198
- # Cache quote_char as an ivar for the stitch-loop memchr guard (avoids hash lookup per continuation line).
199
- @quote_char = options[:quote_char]
200
252
  # Cache field_size_limit as an ivar (nil when unset → one nil-check per row, no method calls).
201
253
  @field_size_limit = options[:field_size_limit]
202
254
 
@@ -221,9 +273,9 @@ module SmarterCSV
221
273
 
222
274
  if on_start
223
275
  input_meta = if @input.is_a?(String)
224
- { input: @input, file_size: (File.size(@input) rescue nil) }
225
- else
226
- { input: @input.class.name, file_size: nil }
276
+ { input: @input, file_size: (File.size(@input) rescue nil) }
277
+ else
278
+ { input: @input.class.name, file_size: nil }
227
279
  end
228
280
  on_start.call(input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
229
281
  end
@@ -254,7 +306,7 @@ module SmarterCSV
254
306
  hash, data_size = parse_line_to_hash_ctx_c(line, @parse_ctx_double)
255
307
  end
256
308
  else
257
- has_quotes = line.include?(options[:quote_char])
309
+ has_quotes = line.include?(@quote_char)
258
310
  hash, data_size = parse_line_to_hash_ruby(line, @headers, @hot_path_options, has_quotes)
259
311
  if @quote_escaping_auto && data_size == -1 && line.include?('\\')
260
312
  hash, data_size = parse_line_to_hash_ruby(line, @headers, @quote_escaping_double, has_quotes)
@@ -515,6 +567,50 @@ module SmarterCSV
515
567
 
516
568
  private
517
569
 
570
+ # Records a warning into the histogram and emits it to the warning sink.
571
+ # `@warnings` is an Array of unique (type, code) records with a `count` field.
572
+ # `@warnings_by_key` is a dedup map keyed by `[type, code]` — key shape must
573
+ # stay fixed to keep both structures bounded by distinct codes, not by calls.
574
+ # The block form avoids string allocation on dedup-hit: on a repeat call we
575
+ # increment count and return without yielding the block.
576
+ # `dedup: false` bypasses the dedup map (still appends a new record per call);
577
+ # reserve for per-occurrence warnings where each call carries distinct info.
578
+ # `severity:` controls the Rails.logger level (`:debug`/`:info`/`:warn`/`:error`/`:fatal`);
579
+ # the non-Rails fallback is always `Kernel#warn` regardless of severity.
580
+ # `type` is purely a semantic grouping for callers iterating reader.warnings.
581
+ def record_warning(type:, code:, severity: :warn, dedup: true)
582
+ key = [type, code]
583
+ if dedup && (existing = @warnings_by_key[key])
584
+ existing[:count] += 1
585
+ return
586
+ end
587
+
588
+ message = yield
589
+ record = { type: type, code: code, severity: severity, message: message, count: 1 }
590
+ @warnings << record
591
+ @warnings_by_key[key] = record if dedup
592
+
593
+ if @has_rails_logger
594
+ ::Rails.logger.public_send(severity, "SmarterCSV: #{message}")
595
+ else
596
+ warn "SmarterCSV: #{message}"
597
+ end
598
+ end
599
+
600
+ # True when the IO is genuinely seekable — i.e. rewind will succeed at the kernel
601
+ # level, not just the Ruby method table. IO.pipe readers respond to :rewind but
602
+ # raise at the kernel layer when called; probing #pos surfaces that at decision time.
603
+ # Rescue SystemCallError (parent of all Errno::*) because the exact subclass varies
604
+ # by Ruby implementation: MRI raises Errno::ESPIPE, jruby raises Errno::EPIPE.
605
+ def seekable?(io)
606
+ return false unless io.respond_to?(:rewind)
607
+
608
+ io.pos if io.respond_to?(:pos)
609
+ true
610
+ rescue SystemCallError, IOError, NotImplementedError
611
+ false
612
+ end
613
+
518
614
  # Determine if a line has unbalanced quotes requiring multiline stitching.
519
615
  # For :auto mode, uses dual counting to avoid false multiline detection.
520
616
  # For :standard quote_boundary mode, uses a full state machine so that
@@ -663,9 +759,9 @@ module SmarterCSV
663
759
  elsif !in_quotes
664
760
  # Non-quote character: track whether field has started
665
761
  if strip # -- two direct == comparisons are faster than Array#include? in this hot loop
666
- field_started = true unless line[i] == ' ' || line[i] == "\t"
667
- else
668
- field_started = true
762
+ field_started = true unless [' ', "\t"].include?(line[i])
763
+ else
764
+ field_started = true
669
765
  end
670
766
  end
671
767
  i += 1
@@ -679,7 +775,7 @@ module SmarterCSV
679
775
 
680
776
  # SEE: https://github.com/rails/rails/blob/32015b6f369adc839c4f0955f2d9dce50c0b6123/activesupport/lib/active_support/core_ext/object/blank.rb#L121
681
777
  # and in the future we might also include UTF-8 space characters: https://www.compart.com/en/unicode/category/Zs
682
- BLANK_RE = /\A\s*\z/.freeze
778
+ BLANK_RE = /\A[[:space:]]*\z/.freeze # Unicode whitespace, same as Rails' String#blank?
683
779
 
684
780
  # Optimization #5: fast-path empty string and nil checks before regex
685
781
  def blank?(value)
@@ -5,8 +5,13 @@ module SmarterCSV
5
5
  module Options
6
6
  DEFAULT_OPTIONS = {
7
7
  acceleration: true, # if user wants to use acceleration or not
8
- auto_row_sep_chars: 500,
8
+ auto_row_sep_chars: SmarterCSV::AutoDetection::DEFAULT_AUTO_ROW_SEP_CHARS, # initial scan chunk size (default 4096).
9
+ # Validated against [MIN, MAX]_AUTO_ROW_SEP_CHARS in reader_options.rb.
10
+ # Doubling escalation in guess_line_ending caps at MAX_AUTO_ROW_SEP_CHARS.
9
11
  bad_row_limit: nil,
12
+ buffer_size: SmarterCSV::PeekableIO::DEFAULT_PEEK_SIZE, # peek buffer chunk size for non-seekable inputs.
13
+ # Validated: nil/0 → use default; clamped to [MIN_BUFFER_SIZE, MAX_BUFFER_SIZE];
14
+ # bumped if < auto_row_sep_chars (see validation in reader_options.rb).
10
15
  chunk_size: nil,
11
16
  col_sep: :auto, # was: ',',
12
17
  collect_raw_lines: true,
@@ -118,12 +123,14 @@ module SmarterCSV
118
123
  values = Array(@options[:only_headers])
119
124
  bad = values.reject { |v| v.is_a?(Symbol) || v.is_a?(String) }
120
125
  raise SmarterCSV::ValidationError, "headers: { only: } elements must be String or Symbol, got: #{bad.map(&:class).uniq.inspect}" if bad.any?
126
+
121
127
  @options[:only_headers] = values.map(&:to_sym)
122
128
  end
123
129
  if @options[:except_headers]
124
130
  values = Array(@options[:except_headers])
125
131
  bad = values.reject { |v| v.is_a?(Symbol) || v.is_a?(String) }
126
132
  raise SmarterCSV::ValidationError, "headers: { except: } elements must be String or Symbol, got: #{bad.map(&:class).uniq.inspect}" if bad.any?
133
+
127
134
  @options[:except_headers] = values.map(&:to_sym)
128
135
  end
129
136
 
@@ -187,6 +194,59 @@ module SmarterCSV
187
194
  unless %i[legacy standard].include?(options[:quote_boundary])
188
195
  errors << "invalid quote_boundary: must be :legacy or :standard"
189
196
  end
197
+ arc = options[:auto_row_sep_chars]
198
+ min_arc = SmarterCSV::AutoDetection::MIN_AUTO_ROW_SEP_CHARS
199
+ max_arc = SmarterCSV::AutoDetection::MAX_AUTO_ROW_SEP_CHARS
200
+ default_arc = DEFAULT_OPTIONS[:auto_row_sep_chars]
201
+
202
+ if arc.is_a?(Integer)
203
+ if arc < min_arc
204
+ warn "WARNING: auto_row_sep_chars value #{arc.inspect} is below minimum (#{min_arc}); using default (#{default_arc})" unless options[:verbose] == :quiet
205
+ options[:auto_row_sep_chars] = default_arc
206
+ elsif arc > max_arc
207
+ warn "WARNING: auto_row_sep_chars value #{arc.inspect} exceeds maximum (#{max_arc}); clamping to #{max_arc}" unless options[:verbose] == :quiet
208
+ options[:auto_row_sep_chars] = max_arc
209
+ end
210
+ else
211
+ warn "WARNING: invalid auto_row_sep_chars value #{arc.inspect} — must be an Integer; using default (#{default_arc})" unless options[:verbose] == :quiet
212
+ options[:auto_row_sep_chars] = default_arc
213
+ end
214
+ # buffer_size validation:
215
+ # nil / 0 → unset, use default
216
+ # non-Integer → warn, use default
217
+ # < MIN_BUFFER_SIZE → warn, clamp to MIN_BUFFER_SIZE
218
+ # > MAX_BUFFER_SIZE → warn, clamp to MAX_BUFFER_SIZE
219
+ # Cross-validation (after the above):
220
+ # < auto_row_sep_chars → warn, bump to max(2 * buffer_size, MIN_AUTO_ROW_SEP_CHARS), capped at MAX_BUFFER_SIZE
221
+ bs = options[:buffer_size]
222
+ default_bs = DEFAULT_OPTIONS[:buffer_size]
223
+ min_bs = SmarterCSV::PeekableIO::MIN_BUFFER_SIZE
224
+ max_bs = SmarterCSV::PeekableIO::MAX_BUFFER_SIZE
225
+ quiet = options[:verbose] == :quiet
226
+
227
+ if bs.nil? || bs == 0
228
+ options[:buffer_size] = default_bs
229
+ elsif !bs.is_a?(Integer)
230
+ warn "WARNING: invalid buffer_size value #{bs.inspect} — must be an Integer; using default (#{default_bs})" unless quiet
231
+ options[:buffer_size] = default_bs
232
+ elsif bs < min_bs
233
+ warn "WARNING: buffer_size #{bs} is below minimum (#{min_bs}); clamping to #{min_bs}" unless quiet
234
+ options[:buffer_size] = min_bs
235
+ elsif bs > max_bs
236
+ warn "WARNING: buffer_size #{bs} exceeds maximum (#{max_bs}); clamping to #{max_bs}" unless quiet
237
+ options[:buffer_size] = max_bs
238
+ end
239
+
240
+ # Cross-validation: ensure buffer_size is reasonably sized relative to auto_row_sep_chars.
241
+ # Gentle bump (does not strictly enforce buffer_size >= auto_row_sep_chars — PeekableIO
242
+ # handles smaller buffers via multiple reads — but ensures at least one default scan window fits).
243
+ arc = options[:auto_row_sep_chars]
244
+ if arc.is_a?(Integer) && options[:buffer_size] < arc
245
+ bumped = [2 * options[:buffer_size], SmarterCSV::AutoDetection::MIN_AUTO_ROW_SEP_CHARS].max
246
+ bumped = [bumped, max_bs].min # Clamp bumped value to not exceed MAX_BUFFER_SIZE
247
+ warn "WARNING: buffer_size (#{options[:buffer_size]}) < auto_row_sep_chars (#{arc}); bumping buffer_size to #{bumped}" unless quiet
248
+ options[:buffer_size] = bumped
249
+ end
190
250
  fsl = options[:field_size_limit]
191
251
  unless fsl.nil? || (fsl.is_a?(Integer) && fsl > 0)
192
252
  errors << "invalid field_size_limit: must be nil or a positive Integer (got #{fsl.inspect})"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.16.4"
4
+ VERSION = "1.17.0"
5
5
  end
data/lib/smarter_csv.rb CHANGED
@@ -5,9 +5,10 @@ require "smarter_csv/version"
5
5
  require "smarter_csv/errors"
6
6
 
7
7
  require "smarter_csv/file_io"
8
+ require "smarter_csv/auto_detection" # MAX_AUTO_ROW_SEP_CHARS is the canonical 64KB cap; loaded first so peekable_io.rb and reader_options.rb can reference it
9
+ require "smarter_csv/peekable_io"
8
10
  require "smarter_csv/reader_options"
9
11
  require "smarter_csv/writer_options"
10
- require "smarter_csv/auto_detection"
11
12
  require 'smarter_csv/header_transformations'
12
13
  require 'smarter_csv/header_validations'
13
14
  require "smarter_csv/headers"
@@ -66,24 +67,30 @@ module SmarterCSV
66
67
  # reader = SmarterCSV::Reader.new(input, options)
67
68
  # reader.process # with or without block
68
69
  #
69
- # After calling any of the class-level methods, errors from the last run are available via:
70
+ # After calling any of the class-level methods, errors and warnings from the last run
71
+ # are available via:
70
72
  #
71
- # SmarterCSV.errors # => { bad_row_count: 2, bad_rows: [...] }
73
+ # SmarterCSV.errors # => { bad_row_count: 2, bad_rows: [...] }
74
+ # SmarterCSV.warnings # => [ { type:, code:, severity:, message:, count: }, ... ]
72
75
  #
73
- # This exposes the same reader.errors hash without requiring access to the Reader instance.
74
- # Errors are cleared at the start of each call and stored per-thread, so this is safe in
75
- # multi-threaded environments (Puma, Sidekiq). Note: only the most recent call's errors
76
- # are retained per thread.
76
+ # These expose the same reader.errors / reader.warnings without requiring access to the
77
+ # Reader instance. Both are cleared at the start of each call and stored per-thread, so
78
+ # this is safe in multi-threaded environments (Puma, Sidekiq). Only the most recent
79
+ # call's errors and warnings are retained per thread.
77
80
  #
78
81
  def self.process(input, given_options = {}, &block)
79
82
  Thread.current[:current_thread_recent_errors] = {}
83
+ Thread.current[:current_thread_recent_warnings] = []
80
84
  reader = Reader.new(input, given_options)
81
85
  reader.process(&block)
82
86
  ensure
83
87
  # Preserve partial error state when processing raises mid-stream
84
88
  # (e.g. TooManyBadRows, or a user block raising). `reader` is nil if
85
89
  # Reader.new itself raised before the local was assigned.
86
- Thread.current[:current_thread_recent_errors] = reader.errors if reader
90
+ if reader
91
+ Thread.current[:current_thread_recent_errors] = reader.errors
92
+ Thread.current[:current_thread_recent_warnings] = reader.warnings
93
+ end
87
94
  end
88
95
 
89
96
  # Convenience method for parsing a CSV string directly.
@@ -111,10 +118,14 @@ module SmarterCSV
111
118
  # SmarterCSV.each("data.csv").lazy.map { |h| h[:name] }.first(10)
112
119
  def self.each(input, options = {}, &block)
113
120
  Thread.current[:current_thread_recent_errors] = {}
121
+ Thread.current[:current_thread_recent_warnings] = []
114
122
  reader = Reader.new(input, options)
115
123
  reader.each(&block)
116
124
  ensure
117
- Thread.current[:current_thread_recent_errors] = reader.errors if reader
125
+ if reader
126
+ Thread.current[:current_thread_recent_errors] = reader.errors
127
+ Thread.current[:current_thread_recent_warnings] = reader.warnings
128
+ end
118
129
  end
119
130
 
120
131
  # Yields each chunk as Array<Hash> plus its 0-based chunk index.
@@ -128,10 +139,14 @@ module SmarterCSV
128
139
  # SmarterCSV.each_chunk("data.csv", chunk_size: 100).with_index { |chunk, i| ... }
129
140
  def self.each_chunk(input, options = {}, &block)
130
141
  Thread.current[:current_thread_recent_errors] = {}
142
+ Thread.current[:current_thread_recent_warnings] = []
131
143
  reader = Reader.new(input, options)
132
144
  reader.each_chunk(&block)
133
145
  ensure
134
- Thread.current[:current_thread_recent_errors] = reader.errors if reader
146
+ if reader
147
+ Thread.current[:current_thread_recent_errors] = reader.errors
148
+ Thread.current[:current_thread_recent_warnings] = reader.warnings
149
+ end
135
150
  end
136
151
 
137
152
  # Returns the errors from the most recent call to .process, .parse, .each, or .each_chunk
@@ -149,6 +164,21 @@ module SmarterCSV
149
164
  Thread.current[:current_thread_recent_errors] || {}
150
165
  end
151
166
 
167
+ # Returns the warnings from the most recent call to .process, .parse, .each, or .each_chunk
168
+ # on the current thread. Cleared at the start of each new call.
169
+ #
170
+ # Each warning is a Hash: { type:, code:, severity:, message:, count: }.
171
+ # Repeated warnings of the same (type, code) are deduped — `count` tracks
172
+ # the number of occurrences.
173
+ #
174
+ # Example:
175
+ # SmarterCSV.process('data.csv')
176
+ # SmarterCSV.warnings.each { |w| logger.warn("[#{w[:type]}/#{w[:code]}] #{w[:message]} (×#{w[:count]})") }
177
+ #
178
+ def self.warnings
179
+ Thread.current[:current_thread_recent_warnings] || []
180
+ end
181
+
152
182
  # Convenience method for generating CSV files, IO objects, or in-memory strings.
153
183
  #
154
184
  # When called WITHOUT a first argument, generates CSV in memory and returns it as a String.
@@ -184,7 +214,6 @@ module SmarterCSV
184
214
  # end
185
215
  # end
186
216
  #
187
- # rubocop:disable Lint/UnusedMethodArgument
188
217
  def self.generate(file_path_or_io = nil, options = {}, &block)
189
218
  raise ArgumentError, "SmarterCSV.generate requires a block" unless block_given?
190
219
 
@@ -213,5 +242,4 @@ module SmarterCSV
213
242
  end
214
243
  end
215
244
  end
216
- # rubocop:enable Lint/UnusedMethodArgument
217
245
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.4
4
+ version: 1.17.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2026-04-21 00:00:00.000000000 Z
10
+ date: 2026-05-14 00:00:00.000000000 Z
11
11
  dependencies: []
12
12
  description: |
13
13
  SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
@@ -39,7 +39,7 @@ files:
39
39
  - LICENSE.txt
40
40
  - README.md
41
41
  - Rakefile
42
- - TO_DO_v2.md
42
+ - TO_DO.md
43
43
  - docs/_introduction.md
44
44
  - docs/bad_row_quarantine.md
45
45
  - docs/basic_read_api.md
@@ -59,10 +59,13 @@ files:
59
59
  - docs/releases/1.16.0/benchmarks.md
60
60
  - docs/releases/1.16.0/changes.md
61
61
  - docs/releases/1.16.0/performance_notes.md
62
+ - docs/releases/1.17.0/benchmarks.md
63
+ - docs/releases/1.17.0/changes.md
64
+ - docs/releases/1.17.0/performance_notes.md
62
65
  - docs/row_col_sep.md
63
66
  - docs/ruby_csv_pitfalls.md
64
67
  - docs/value_converters.md
65
- - ext/smarter_csv/Makefile
68
+ - docs/warnings.md
66
69
  - ext/smarter_csv/extconf.rb
67
70
  - ext/smarter_csv/smarter_csv.c
68
71
  - images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png
@@ -71,6 +74,9 @@ files:
71
74
  - images/SmarterCSV_1.16.0_vs_previous_C-speedup.svg
72
75
  - images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.png
73
76
  - images/SmarterCSV_1.16.0_vs_previous_Rb-speedup.svg
77
+ - images/SmarterCSV_1.17.0_vs_RubyCSV_3.3.5_speedup.svg
78
+ - images/SmarterCSV_1.17.0_vs_previous_C-speedup.svg
79
+ - images/SmarterCSV_1.17.0_vs_previous_Rb-speedup.svg
74
80
  - lib/smarter_csv.rb
75
81
  - lib/smarter_csv/auto_detection.rb
76
82
  - lib/smarter_csv/errors.rb
@@ -80,6 +86,7 @@ files:
80
86
  - lib/smarter_csv/header_validations.rb
81
87
  - lib/smarter_csv/headers.rb
82
88
  - lib/smarter_csv/parser.rb
89
+ - lib/smarter_csv/peekable_io.rb
83
90
  - lib/smarter_csv/reader.rb
84
91
  - lib/smarter_csv/reader_options.rb
85
92
  - lib/smarter_csv/version.rb
@@ -110,7 +117,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
110
117
  - !ruby/object:Gem::Version
111
118
  version: '0'
112
119
  requirements: []
113
- rubygems_version: 4.0.6
120
+ rubygems_version: 4.0.11
114
121
  specification_version: 4
115
122
  summary: Fastest end-to-end CSV ingestion for Ruby with smart defaults and Rails-ready
116
123
  hash output
data/TO_DO_v2.md DELETED
@@ -1,14 +0,0 @@
1
- # SmarterCSV v2.0 TO DO List
2
-
3
- * add enumerable to speed up parallel processing [issue #66](https://github.com/tilo/smarter_csv/issues/66), [issue #32](https://github.com/tilo/smarter_csv/issues/32)
4
- * use Procs for validations and transformatoins [issue #118](https://github.com/tilo/smarter_csv/issues/118)
5
- * make @errors and @warnings work [issue #118](https://github.com/tilo/smarter_csv/issues/118)
6
- * skip file opening, allow reading from CSV string, e.g. reading from S3 file [issue #120](https://github.com/tilo/smarter_csv/issues/120).
7
- Or stream large file from S3 (linked in the issue)
8
- * Collect all Errors, before surfacing them. Avoid throwing an exception on the first error [issue #133](https://github.com/tilo/smarter_csv/issues/133)
9
- * Don't call rewind on filehandle
10
- * [2.0 BUG] :convert_values_to_numeric_unless_leading_zeros drops leading zeros [issue #151](https://github.com/tilo/smarter_csv/issues/151)
11
- * [2.0 BUG] convert_to_float saves Proc as @@convert_to_integer [issue #157](https://github.com/tilo/smarter_csv/issues/157)
12
- * Provide an example for custom Procs for hash_transformations in the docs [issue #174](https://github.com/tilo/smarter_csv/issues/174)
13
- * Replace remove_empty_values: false [issue #213](https://github.com/tilo/smarter_csv/issues/213)
14
-