smarter_csv 1.17.3 → 1.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,10 +3,11 @@
3
3
  module SmarterCSV
4
4
  module HashTransformations
5
5
  # Frozen regex constants for performance (avoid recompilation on every value)
6
- NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?\z/.freeze
6
+ NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\z/.freeze
7
7
  # FLOAT_REGEX = /\A[+-]?\d+\.\d+\z/.freeze
8
8
  # INTEGER_REGEX = /\A[+-]?\d+\z/.freeze
9
9
  ZERO_REGEX = /\A[+-]?0+(?:\.0+)?\z/.freeze # could be +0.0
10
+ EXPONENT_CHARS = %w[e E].freeze # mantissa scan stops here in significant_digits
10
11
 
11
12
  # First-byte values that can begin a numeric literal — used to skip the numeric
12
13
  # regexes for values that obviously aren't numbers (e.g. city names).
@@ -70,7 +71,13 @@ module SmarterCSV
70
71
  first_byte = v.getbyte(0)
71
72
  if first_byte && ((first_byte >= ZERO_BYTE && first_byte <= NINE_BYTE) || first_byte == MINUS_BYTE || first_byte == PLUS_BYTE)
72
73
  if NUMERIC_REGEX.match?(v)
73
- hash[k] = v.include?('.') ? v.to_f : v.to_i
74
+ # A value with a '.' or an exponent is a decimal → honor decimal_precision;
75
+ # otherwise it's an integer.
76
+ hash[k] = if v.include?('.') || v.include?('e') || v.include?('E')
77
+ convert_decimal(v, options[:decimal_precision])
78
+ else
79
+ v.to_i
80
+ end
74
81
  end
75
82
  end
76
83
  end
@@ -121,6 +128,48 @@ module SmarterCSV
121
128
 
122
129
  protected
123
130
 
131
+ # Convert a decimal string (has a '.' or an exponent) to a numeric, honoring
132
+ # decimal_precision: :float -> Float, :bigdecimal -> BigDecimal, :auto -> Float unless
133
+ # the value carries more than 16 significant digits (then BigDecimal, no precision loss).
134
+ def convert_decimal(str, decimal_precision)
135
+ case decimal_precision
136
+ when :float
137
+ str.to_f
138
+ when :bigdecimal
139
+ BigDecimal(str)
140
+ else # :auto
141
+ # A float token always has a '.' or 'e', so a token of <= 17 bytes holds at most
142
+ # 16 digits and therefore <= 16 significant digits — skip the per-char scan and go
143
+ # straight to Float (the common case: coordinates, sensor readings, prices). Only
144
+ # longer tokens can reach the BigDecimal threshold, so pay for the scan only then.
145
+ if str.bytesize > 17 && significant_digits(str) > 16
146
+ BigDecimal(str)
147
+ else
148
+ str.to_f
149
+ end
150
+ end
151
+ end
152
+
153
+ # Count significant mantissa digits (leading zeros excluded, trailing and fraction
154
+ # digits included, exponent excluded). Matches the C path's fj_sig_digits / Oj's dec_cnt
155
+ # so :auto picks Float vs BigDecimal identically on both paths.
156
+ def significant_digits(str)
157
+ cnt = 0
158
+ started = false
159
+ str.each_char do |c|
160
+ break if EXPONENT_CHARS.include?(c)
161
+ next unless c >= '0' && c <= '9'
162
+
163
+ if started
164
+ cnt += 1
165
+ elsif c != '0'
166
+ started = true
167
+ cnt = 1
168
+ end
169
+ end
170
+ cnt
171
+ end
172
+
124
173
  # acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
125
174
  def limit_execution_for_only_or_except(options, option_name, key)
126
175
  if options[option_name].is_a?(Hash)
@@ -32,13 +32,14 @@ module SmarterCSV
32
32
  # rubocop:disable Naming/MethodName
33
33
  def headerA
34
34
  record_warning(type: :deprecation, code: :header_a_method) do
35
- "Deprecarion Warning: 'headerA' will be removed in future versions. Use 'headders'"
35
+ "Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
36
36
  end
37
37
  @headerA
38
38
  end
39
39
  # rubocop:enable Naming/MethodName
40
40
 
41
- # first parameter: filename or input object which responds to readline method
41
+ # first parameter: a path (String or Pathname) to open, or an already-open readable IO
42
+ # (anything responding to #gets — File, StringIO, Tempfile, Zlib::GzipReader, pipes, ...)
42
43
  def initialize(input, given_options = {})
43
44
  @input = input
44
45
  @has_rails = !!defined?(Rails)
@@ -123,7 +124,14 @@ module SmarterCSV
123
124
  @verbose = options[:verbose]
124
125
 
125
126
  begin
126
- fh = input.is_a?(String) ? File.open(input, "r:#{options[:file_encoding]}") : input
127
+ # Decide whether `input` is an already-open, readable stream or a path we must open.
128
+ # The reader reads lines via #gets (see file_io.rb and PeekableIO), so a public #gets
129
+ # is exactly what we need: real IOs (File, StringIO, Tempfile, Zlib::GzipReader, pipes,
130
+ # custom non-seekable streams) expose it, while path-like inputs (String, Pathname) do
131
+ # not — their only #gets is the private Kernel#gets. 1.17.0 narrowed this to
132
+ # input.is_a?(String), which sent Pathname down the IO branch and then called its
133
+ # private Kernel#gets, raising "private method 'gets' called" (issue #337).
134
+ fh = input.respond_to?(:gets) ? input : File.open(input, "r:#{options[:file_encoding]}")
127
135
 
128
136
  # Rewindable inputs (File, Tempfile, StringIO, Zlib::GzipReader, ...) use
129
137
  # native rewind for auto-detection — no wrapper overhead in the hot loop.
@@ -272,10 +280,14 @@ module SmarterCSV
272
280
  start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) if on_start || on_complete
273
281
 
274
282
  if on_start
275
- input_meta = if @input.is_a?(String)
276
- { input: @input, file_size: (File.size(@input) rescue nil) }
277
- else
283
+ # Same path-vs-IO distinction as the File.open above: an already-open IO responds
284
+ # to #gets and we can't know its on-disk size, so we report its class name. A
285
+ # path-like input (String, or a Pathname via #to_path) gets its path and file size.
286
+ input_meta = if @input.respond_to?(:gets)
278
287
  { input: @input.class.name, file_size: nil }
288
+ else
289
+ path = @input.respond_to?(:to_path) ? @input.to_path : @input
290
+ { input: path, file_size: (File.size(path) rescue nil) }
279
291
  end
280
292
  on_start.call(input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
281
293
  end
@@ -17,6 +17,7 @@ module SmarterCSV
17
17
  collect_raw_lines: true,
18
18
  comment_regexp: nil, # was: /\A#/,
19
19
  convert_values_to_numeric: true,
20
+ decimal_precision: :auto, # :auto (Float, but BigDecimal above 16 significant digits), :float, or :bigdecimal
20
21
  downcase_header: true,
21
22
  duplicate_header_suffix: '', # was: nil,
22
23
  field_size_limit: nil, # Integer (bytes) or nil for no limit. Raises FieldSizeLimitExceeded if any
@@ -58,6 +59,12 @@ module SmarterCSV
58
59
  with_line_numbers: false,
59
60
  }.freeze
60
61
 
62
+ # Options whose canonical value is one of a fixed set of symbols. A string form
63
+ # (e.g. "backslash" from options round-tripped through JSON or YAML) is coerced to
64
+ # the matching symbol. Non-string values (a callable for on_bad_row, true/false for
65
+ # legacy verbose) pass through untouched.
66
+ SYMBOL_VALUE_OPTIONS = %i[quote_escaping quote_boundary missing_headers on_bad_row verbose decimal_precision].freeze
67
+
61
68
  # NOTE: this is not called when "parse" methods are tested by themselves
62
69
  def process_options(given_options = {})
63
70
  # Debug output before merge — check raw verbose value (true or :debug)
@@ -77,6 +84,10 @@ module SmarterCSV
77
84
 
78
85
  @options = DEFAULT_OPTIONS.dup.merge!(given_options)
79
86
 
87
+ # Symbol/string interchangeability: accept either form for every option whose
88
+ # value is a symbol or a string. Done once here, before any value is read below.
89
+ normalize_option_value_types!(@options)
90
+
80
91
  # Normalize verbose to a symbol — done once here, stored back into @options.
81
92
  # All subsequent checks are free symbol comparisons; no re-evaluation needed.
82
93
  # :quiet — suppress all warnings and notices (good for production)
@@ -194,6 +205,9 @@ module SmarterCSV
194
205
  unless %i[legacy standard].include?(options[:quote_boundary])
195
206
  errors << "invalid quote_boundary: must be :legacy or :standard"
196
207
  end
208
+ unless %i[auto float bigdecimal].include?(options[:decimal_precision])
209
+ errors << "invalid decimal_precision: must be :auto, :float, or :bigdecimal"
210
+ end
197
211
  arc = options[:auto_row_sep_chars]
198
212
  min_arc = SmarterCSV::AutoDetection::MIN_AUTO_ROW_SEP_CHARS
199
213
  max_arc = SmarterCSV::AutoDetection::MAX_AUTO_ROW_SEP_CHARS
@@ -268,6 +282,16 @@ module SmarterCSV
268
282
  raise SmarterCSV::ValidationError, errors.inspect if errors.any?
269
283
  end
270
284
 
285
+ # Accept either a symbol or a string for every option whose value is one or the
286
+ # other, so callers limited to strings (JSON/YAML) behave the same as those passing
287
+ # symbols, and vice versa. Validation of the resulting value happens later.
288
+ def normalize_option_value_types!(options)
289
+ SYMBOL_VALUE_OPTIONS.each do |key|
290
+ v = options[key]
291
+ options[key] = v.to_sym if v.is_a?(String)
292
+ end
293
+ end
294
+
271
295
  def option_valid?(str)
272
296
  return true if str.is_a?(Symbol) && str == :auto
273
297
  return true if str.is_a?(String) && !str.empty?
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.17.3"
4
+ VERSION = "1.18.0"
5
5
  end
data/lib/smarter_csv.rb CHANGED
@@ -1,6 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  require 'stringio'
4
+ require 'bigdecimal' # for decimal_precision: :auto / :bigdecimal
4
5
  require "smarter_csv/version"
5
6
  require "smarter_csv/errors"
6
7
 
data/smarter_csv.gemspec CHANGED
@@ -40,6 +40,9 @@ Gem::Specification.new do |spec|
40
40
 
41
41
  spec.required_ruby_version = ">= 2.6.0"
42
42
 
43
+ # bigdecimal is no longer a default gem on Ruby 3.4+; needed for decimal_precision: :auto / :bigdecimal
44
+ spec.add_dependency "bigdecimal"
45
+
43
46
  # Specify which files should be added to the gem when it is released.
44
47
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
45
48
  spec.files = Dir.chdir(__dir__) do
metadata CHANGED
@@ -1,14 +1,28 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.17.3
4
+ version: 1.18.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2026-05-27 00:00:00.000000000 Z
11
- dependencies: []
10
+ date: 2026-06-19 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: bigdecimal
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '0'
12
26
  description: |
13
27
  SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
14
28
  fastest end-to-end ingestion — not just parsing. It returns ready-to-use
@@ -71,6 +85,10 @@ files:
71
85
  - docs/warnings.md
72
86
  - ext/smarter_csv/extconf.rb
73
87
  - ext/smarter_csv/smarter_csv.c
88
+ - ext/smarter_csv/vendor/LICENSE-fast_float-MIT
89
+ - ext/smarter_csv/vendor/eisel_lemire.h
90
+ - ext/smarter_csv/vendor/eisel_lemire.md
91
+ - ext/smarter_csv/vendor/eisel_lemire_powers.h
74
92
  - images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png
75
93
  - images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg
76
94
  - images/SmarterCSV_1.16.0_vs_previous_C-speedup.png
@@ -122,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
122
140
  - !ruby/object:Gem::Version
123
141
  version: '0'
124
142
  requirements: []
125
- rubygems_version: 4.0.11
143
+ rubygems_version: 3.6.9
126
144
  specification_version: 4
127
145
  summary: Fastest end-to-end CSV ingestion for Ruby with smart defaults and Rails-ready
128
146
  hash output