smarter_csv 1.16.6 → 1.17.0.pre5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,8 @@ module SmarterCSV
7
7
  include Enumerable
8
8
 
9
9
  # Default chunk size used by each_chunk when chunk_size is not explicitly set.
10
- # A warning is emitted to STDERR so users know to configure it explicitly.
10
+ # A warning is recorded (and emitted via Rails.logger or Kernel#warn) so users
11
+ # know to configure it explicitly.
11
12
  DEFAULT_CHUNK_SIZE = 100
12
13
 
13
14
  include ::SmarterCSV::Reader::Options
@@ -21,7 +22,7 @@ module SmarterCSV
21
22
 
22
23
  attr_reader :input, :options
23
24
  attr_reader :csv_line_count, :chunk_count, :file_line_count
24
- attr_reader :enforce_utf8, :has_rails, :has_acceleration
25
+ attr_reader :enforce_utf8, :has_rails, :has_rails_logger, :has_acceleration
25
26
  attr_reader :errors, :warnings, :headers, :raw_header, :result
26
27
 
27
28
  def self.default_options
@@ -30,7 +31,9 @@ module SmarterCSV
30
31
 
31
32
  # rubocop:disable Naming/MethodName
32
33
  def headerA
33
- warn "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
34
+ record_warning(type: :deprecation, code: :header_a_method) do
35
+ "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
36
+ end
34
37
  @headerA
35
38
  end
36
39
  # rubocop:enable Naming/MethodName
@@ -39,6 +42,7 @@ module SmarterCSV
39
42
  def initialize(input, given_options = {})
40
43
  @input = input
41
44
  @has_rails = !!defined?(Rails)
45
+ @has_rails_logger = defined?(::Rails) && ::Rails.respond_to?(:logger) && !::Rails.logger.nil?
42
46
  @csv_line_count = 0
43
47
  @chunk_count = 0
44
48
  @errors = {}
@@ -47,7 +51,8 @@ module SmarterCSV
47
51
  @headers = nil
48
52
  @raw_header = nil # header as it appears in the file
49
53
  @result = []
50
- @warnings = {}
54
+ @warnings = []
55
+ @warnings_by_key = {}
51
56
  @enforce_utf8 = false # only set to true if needed (after options parsing)
52
57
  @options = process_options(given_options)
53
58
  # true if it is compiled with acceleration
@@ -87,7 +92,11 @@ module SmarterCSV
87
92
 
88
93
  chunk_size = @options[:chunk_size]
89
94
  if chunk_size.nil?
90
- warn "SmarterCSV: chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning." unless @options[:verbose] == :quiet
95
+ unless @options[:verbose] == :quiet
96
+ record_warning(type: :config, code: :chunk_size_default) do
97
+ "chunk_size not set, defaulting to #{DEFAULT_CHUNK_SIZE}. Set chunk_size explicitly to suppress this warning."
98
+ end
99
+ end
91
100
  chunk_size = DEFAULT_CHUNK_SIZE
92
101
  end
93
102
  unless chunk_size.is_a?(Integer) && chunk_size >= 1
@@ -106,23 +115,72 @@ module SmarterCSV
106
115
  end
107
116
  end
108
117
 
109
- def process(&block) # rubocop:disable Lint/UnusedMethodArgument
118
+ def process(&block)
110
119
  @enforce_utf8 = options[:force_utf8] || options[:file_encoding] !~ /utf-8/i
111
120
  @verbose = options[:verbose]
112
121
 
113
122
  begin
114
- fh = input.respond_to?(:readline) ? input : File.open(input, "r:#{options[:file_encoding]}")
123
+ fh = input.is_a?(String) ? File.open(input, "r:#{options[:file_encoding]}") : input
124
+
125
+ # Rewindable inputs (File, Tempfile, StringIO, Zlib::GzipReader, ...) use
126
+ # native rewind for auto-detection — no wrapper overhead in the hot loop.
127
+ # Non-rewindable streams (pipes, STDIN, custom non-seekable IOs) go through
128
+ # PeekableIO which buffers the first chunk so detection can replay without
129
+ # seeking the underlying source.
130
+ has_rewind = seekable?(fh)
131
+
132
+ unless has_rewind
133
+ # buffer_size can be passed directly; otherwise it scales from auto_row_sep_chars.
134
+ # 2× auto_row_sep_chars ensures the first peek covers the full row_sep scan with
135
+ # room for col_sep detection. Falls back to DEFAULT_PEEK_SIZE when auto_row_sep_chars
136
+ # is 0 (scan whole file).
137
+ buf_size = if options[:buffer_size]
138
+ options[:buffer_size].to_i
139
+ else
140
+ auto_row_sep_chars = options[:auto_row_sep_chars].to_i
141
+ auto_row_sep_chars > 0 ? 2 * auto_row_sep_chars : SmarterCSV::PeekableIO::DEFAULT_PEEK_SIZE
142
+ end
143
+ fh = SmarterCSV::PeekableIO.new(fh, options, buffer_size: buf_size)
144
+ end
115
145
 
116
146
  if (options[:force_utf8] || options[:file_encoding] =~ /utf-8/i) && (fh.respond_to?(:external_encoding) && fh.external_encoding != Encoding.find('UTF-8') || fh.respond_to?(:encoding) && fh.encoding != Encoding.find('UTF-8'))
117
- warn 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".' unless options[:verbose] == :quiet
147
+ unless options[:verbose] == :quiet
148
+ record_warning(type: :encoding, code: :utf8_missing_binary_mode) do
149
+ 'WARNING: you are trying to process UTF-8 input, but did not open the input with "b:utf-8" option. See README file "NOTES about File Encodings".'
150
+ end
151
+ end
118
152
  end
119
153
 
120
- # auto-detect the row separator
121
- options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
122
- # attempt to auto-detect column separator
123
- options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
154
+ # Auto-detection. Two orchestrations, same detection functions:
155
+ # has_rewind=true → native fh.rewind between passes; BOM is stripped by
156
+ # next_line_with_counts on the first real line.
157
+ # has_rewind=false → PeekableIO buffers the first chunk; peek strips BOM,
158
+ # rewind_buffer replays, freeze_buffer! locks the buffer.
159
+ if options[:row_sep]&.to_sym == :auto || options[:col_sep]&.to_sym == :auto
160
+ if has_rewind
161
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
162
+ fh.rewind
163
+ @file_line_count = 0
164
+ @csv_line_count = 0
165
+ # skip_lines feeds clean data lines to guess_column_separator. When col_sep is
166
+ # explicit, it's wasted work — the bytes are consumed and rewound. Guard it.
167
+ skip_lines(fh, options) if options[:skip_lines] && options[:col_sep]&.to_sym == :auto
168
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
169
+ fh.rewind
170
+ @file_line_count = 0
171
+ @csv_line_count = 0
172
+ else
173
+ fh.peek
174
+ options[:row_sep] = guess_line_ending(fh, options) if options[:row_sep]&.to_sym == :auto
175
+ rewind_buffer(fh)
176
+ skip_lines(fh, options) if options[:skip_lines] && options[:col_sep]&.to_sym == :auto
177
+ options[:col_sep] = guess_column_separator(fh, options) if options[:col_sep]&.to_sym == :auto
178
+ fh.freeze_buffer!
179
+ rewind_buffer(fh)
180
+ end
181
+ end
124
182
 
125
- skip_lines(fh, options)
183
+ skip_lines(fh, options) if options[:skip_lines] # skip comments
126
184
 
127
185
  # NOTE: we are no longer using header_size
128
186
  @headers, _header_size = process_headers(fh, options)
@@ -221,9 +279,9 @@ module SmarterCSV
221
279
 
222
280
  if on_start
223
281
  input_meta = if @input.is_a?(String)
224
- { input: @input, file_size: (File.size(@input) rescue nil) }
225
- else
226
- { input: @input.class.name, file_size: nil }
282
+ { input: @input, file_size: (File.size(@input) rescue nil) }
283
+ else
284
+ { input: @input.class.name, file_size: nil }
227
285
  end
228
286
  on_start.call(input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
229
287
  end
@@ -515,6 +573,50 @@ module SmarterCSV
515
573
 
516
574
  private
517
575
 
576
+ # Records a warning into the histogram and emits it to the warning sink.
577
+ # `@warnings` is an Array of unique (type, code) records with a `count` field.
578
+ # `@warnings_by_key` is a dedup map keyed by `[type, code]` — key shape must
579
+ # stay fixed to keep both structures bounded by distinct codes, not by calls.
580
+ # The block form avoids string allocation on dedup-hit: on a repeat call we
581
+ # increment count and return without yielding the block.
582
+ # `dedup: false` bypasses the dedup map (still appends a new record per call);
583
+ # reserve for per-occurrence warnings where each call carries distinct info.
584
+ # `severity:` controls the Rails.logger level (`:debug`/`:info`/`:warn`/`:error`/`:fatal`);
585
+ # the non-Rails fallback is always `Kernel#warn` regardless of severity.
586
+ # `type` is purely a semantic grouping for callers iterating reader.warnings.
587
+ def record_warning(type:, code:, severity: :warn, dedup: true)
588
+ key = [type, code]
589
+ if dedup && (existing = @warnings_by_key[key])
590
+ existing[:count] += 1
591
+ return
592
+ end
593
+
594
+ message = yield
595
+ record = { type: type, code: code, severity: severity, message: message, count: 1 }
596
+ @warnings << record
597
+ @warnings_by_key[key] = record if dedup
598
+
599
+ if @has_rails_logger
600
+ ::Rails.logger.public_send(severity, "SmarterCSV: #{message}")
601
+ else
602
+ warn "SmarterCSV: #{message}"
603
+ end
604
+ end
605
+
606
+ # True when the IO is genuinely seekable — i.e. rewind will succeed at the kernel
607
+ # level, not just the Ruby method table. IO.pipe readers respond to :rewind but
608
+ # raise at the kernel layer when called; probing #pos surfaces that at decision time.
609
+ # Rescue SystemCallError (parent of all Errno::*) because the exact subclass varies
610
+ # by Ruby implementation: MRI raises Errno::ESPIPE, jruby raises Errno::EPIPE.
611
+ def seekable?(io)
612
+ return false unless io.respond_to?(:rewind)
613
+
614
+ io.pos if io.respond_to?(:pos)
615
+ true
616
+ rescue SystemCallError, IOError, NotImplementedError
617
+ false
618
+ end
619
+
518
620
  # Determine if a line has unbalanced quotes requiring multiline stitching.
519
621
  # For :auto mode, uses dual counting to avoid false multiline detection.
520
622
  # For :standard quote_boundary mode, uses a full state machine so that
@@ -663,9 +765,9 @@ module SmarterCSV
663
765
  elsif !in_quotes
664
766
  # Non-quote character: track whether field has started
665
767
  if strip # -- two direct == comparisons are faster than Array#include? in this hot loop
666
- field_started = true unless line[i] == ' ' || line[i] == "\t"
667
- else
668
- field_started = true
768
+ field_started = true unless [' ', "\t"].include?(line[i])
769
+ else
770
+ field_started = true
669
771
  end
670
772
  end
671
773
  i += 1
@@ -5,7 +5,7 @@ module SmarterCSV
5
5
  module Options
6
6
  DEFAULT_OPTIONS = {
7
7
  acceleration: true, # if user wants to use accelleration or not
8
- auto_row_sep_chars: 500,
8
+ auto_row_sep_chars: 8_192,
9
9
  bad_row_limit: nil,
10
10
  chunk_size: nil,
11
11
  col_sep: :auto, # was: ',',
@@ -187,6 +187,19 @@ module SmarterCSV
187
187
  unless %i[legacy standard].include?(options[:quote_boundary])
188
188
  errors << "invalid quote_boundary: must be :legacy or :standard"
189
189
  end
190
+ arc = options[:auto_row_sep_chars]
191
+ unless arc.is_a?(Integer) && arc >= DEFAULT_OPTIONS[:auto_row_sep_chars]
192
+ warn "WARNING: invalid auto_row_sep_chars value #{arc.inspect} — must be an Integer >= #{DEFAULT_OPTIONS[:auto_row_sep_chars]}; using default (#{DEFAULT_OPTIONS[:auto_row_sep_chars]})" unless options[:verbose] == :quiet
193
+ options[:auto_row_sep_chars] = DEFAULT_OPTIONS[:auto_row_sep_chars]
194
+ end
195
+ # buffer_size is an internal option used by tests to exercise small-buffer boundary conditions.
196
+ # It is purposely not part of the public API and purposely not listed in DEFAULT_OPTIONS.
197
+ if options.key?(:buffer_size)
198
+ bs = options[:buffer_size]
199
+ unless bs.is_a?(Integer) && bs > 0
200
+ errors << "invalid buffer_size: must be a positive Integer (got #{bs.inspect})"
201
+ end
202
+ end
190
203
  fsl = options[:field_size_limit]
191
204
  unless fsl.nil? || (fsl.is_a?(Integer) && fsl > 0)
192
205
  errors << "invalid field_size_limit: must be nil or a positive Integer (got #{fsl.inspect})"
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.16.6"
4
+ VERSION = "1.17.0.pre5"
5
5
  end
data/lib/smarter_csv.rb CHANGED
@@ -5,6 +5,7 @@ require "smarter_csv/version"
5
5
  require "smarter_csv/errors"
6
6
 
7
7
  require "smarter_csv/file_io"
8
+ require "smarter_csv/peekable_io"
8
9
  require "smarter_csv/reader_options"
9
10
  require "smarter_csv/writer_options"
10
11
  require "smarter_csv/auto_detection"
@@ -66,24 +67,30 @@ module SmarterCSV
66
67
  # reader = SmarterCSV::Reader.new(input, options)
67
68
  # reader.process # with or without block
68
69
  #
69
- # After calling any of the class-level methods, errors from the last run are available via:
70
+ # After calling any of the class-level methods, errors and warnings from the last run
71
+ # are available via:
70
72
  #
71
- # SmarterCSV.errors # => { bad_row_count: 2, bad_rows: [...] }
73
+ # SmarterCSV.errors # => { bad_row_count: 2, bad_rows: [...] }
74
+ # SmarterCSV.warnings # => [ { type:, code:, severity:, message:, count: }, ... ]
72
75
  #
73
- # This exposes the same reader.errors hash without requiring access to the Reader instance.
74
- # Errors are cleared at the start of each call and stored per-thread, so this is safe in
75
- # multi-threaded environments (Puma, Sidekiq). Note: only the most recent call's errors
76
- # are retained per thread.
76
+ # These expose the same reader.errors / reader.warnings without requiring access to the
77
+ # Reader instance. Both are cleared at the start of each call and stored per-thread, so
78
+ # this is safe in multi-threaded environments (Puma, Sidekiq). Only the most recent
79
+ # call's errors and warnings are retained per thread.
77
80
  #
78
81
  def self.process(input, given_options = {}, &block)
79
82
  Thread.current[:current_thread_recent_errors] = {}
83
+ Thread.current[:current_thread_recent_warnings] = []
80
84
  reader = Reader.new(input, given_options)
81
85
  reader.process(&block)
82
86
  ensure
83
87
  # Preserve partial error state when processing raises mid-stream
84
88
  # (e.g. TooManyBadRows, or a user block raising). `reader` is nil if
85
89
  # Reader.new itself raised before the local was assigned.
86
- Thread.current[:current_thread_recent_errors] = reader.errors if reader
90
+ if reader
91
+ Thread.current[:current_thread_recent_errors] = reader.errors
92
+ Thread.current[:current_thread_recent_warnings] = reader.warnings
93
+ end
87
94
  end
88
95
 
89
96
  # Convenience method for parsing a CSV string directly.
@@ -111,10 +118,14 @@ module SmarterCSV
111
118
  # SmarterCSV.each("data.csv").lazy.map { |h| h[:name] }.first(10)
112
119
  def self.each(input, options = {}, &block)
113
120
  Thread.current[:current_thread_recent_errors] = {}
121
+ Thread.current[:current_thread_recent_warnings] = []
114
122
  reader = Reader.new(input, options)
115
123
  reader.each(&block)
116
124
  ensure
117
- Thread.current[:current_thread_recent_errors] = reader.errors if reader
125
+ if reader
126
+ Thread.current[:current_thread_recent_errors] = reader.errors
127
+ Thread.current[:current_thread_recent_warnings] = reader.warnings
128
+ end
118
129
  end
119
130
 
120
131
  # Yields each chunk as Array<Hash> plus its 0-based chunk index.
@@ -128,10 +139,14 @@ module SmarterCSV
128
139
  # SmarterCSV.each_chunk("data.csv", chunk_size: 100).with_index { |chunk, i| ... }
129
140
  def self.each_chunk(input, options = {}, &block)
130
141
  Thread.current[:current_thread_recent_errors] = {}
142
+ Thread.current[:current_thread_recent_warnings] = []
131
143
  reader = Reader.new(input, options)
132
144
  reader.each_chunk(&block)
133
145
  ensure
134
- Thread.current[:current_thread_recent_errors] = reader.errors if reader
146
+ if reader
147
+ Thread.current[:current_thread_recent_errors] = reader.errors
148
+ Thread.current[:current_thread_recent_warnings] = reader.warnings
149
+ end
135
150
  end
136
151
 
137
152
  # Returns the errors from the most recent call to .process, .parse, .each, or .each_chunk
@@ -149,6 +164,21 @@ module SmarterCSV
149
164
  Thread.current[:current_thread_recent_errors] || {}
150
165
  end
151
166
 
167
+ # Returns the warnings from the most recent call to .process, .parse, .each, or .each_chunk
168
+ # on the current thread. Cleared at the start of each new call.
169
+ #
170
+ # Each warning is a Hash: { type:, code:, severity:, message:, count: }.
171
+ # Repeated warnings of the same (type, code) are deduped — `count` tracks
172
+ # the number of occurrences.
173
+ #
174
+ # Example:
175
+ # SmarterCSV.process('data.csv')
176
+ # SmarterCSV.warnings.each { |w| logger.warn("[#{w[:type]}/#{w[:code]}] #{w[:message]} (×#{w[:count]})") }
177
+ #
178
+ def self.warnings
179
+ Thread.current[:current_thread_recent_warnings] || []
180
+ end
181
+
152
182
  # Convenience method for generating CSV files, IO objects, or in-memory strings.
153
183
  #
154
184
  # When called WITHOUT a first argument, generates CSV in memory and returns it as a String.
@@ -184,7 +214,6 @@ module SmarterCSV
184
214
  # end
185
215
  # end
186
216
  #
187
- # rubocop:disable Lint/UnusedMethodArgument
188
217
  def self.generate(file_path_or_io = nil, options = {}, &block)
189
218
  raise ArgumentError, "SmarterCSV.generate requires a block" unless block_given?
190
219
 
@@ -213,5 +242,4 @@ module SmarterCSV
213
242
  end
214
243
  end
215
244
  end
216
- # rubocop:enable Lint/UnusedMethodArgument
217
245
  end
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.16.6
4
+ version: 1.17.0.pre5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2026-05-21 00:00:00.000000000 Z
10
+ date: 2026-04-28 00:00:00.000000000 Z
11
11
  dependencies: []
12
12
  description: |
13
13
  SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
@@ -62,6 +62,8 @@ files:
62
62
  - docs/row_col_sep.md
63
63
  - docs/ruby_csv_pitfalls.md
64
64
  - docs/value_converters.md
65
+ - docs/warnings.md
66
+ - ext/smarter_csv/Makefile
65
67
  - ext/smarter_csv/extconf.rb
66
68
  - ext/smarter_csv/smarter_csv.c
67
69
  - images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png
@@ -79,6 +81,7 @@ files:
79
81
  - lib/smarter_csv/header_validations.rb
80
82
  - lib/smarter_csv/headers.rb
81
83
  - lib/smarter_csv/parser.rb
84
+ - lib/smarter_csv/peekable_io.rb
82
85
  - lib/smarter_csv/reader.rb
83
86
  - lib/smarter_csv/reader_options.rb
84
87
  - lib/smarter_csv/version.rb
@@ -109,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
112
  - !ruby/object:Gem::Version
110
113
  version: '0'
111
114
  requirements: []
112
- rubygems_version: 4.0.11
115
+ rubygems_version: 4.0.6
113
116
  specification_version: 4
114
117
  summary: Fastest end-to-end CSV ingestion for Ruby with smart defaults and Rails-ready
115
118
  hash output