smarter_csv 1.17.3 → 1.18.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +50 -1
- data/CONTRIBUTORS.md +2 -1
- data/README.md +7 -2
- data/docs/data_transformations.md +33 -0
- data/docs/migrating_from_csv.md +18 -0
- data/docs/options.md +2 -1
- data/docs/upgrade_wizard.html +14 -10
- data/ext/smarter_csv/smarter_csv.c +204 -32
- data/ext/smarter_csv/vendor/LICENSE-fast_float-MIT +27 -0
- data/ext/smarter_csv/vendor/eisel_lemire.h +117 -0
- data/ext/smarter_csv/vendor/eisel_lemire.md +29 -0
- data/ext/smarter_csv/vendor/eisel_lemire_powers.h +663 -0
- data/lib/smarter_csv/hash_transformations.rb +51 -2
- data/lib/smarter_csv/reader.rb +18 -6
- data/lib/smarter_csv/reader_options.rb +24 -0
- data/lib/smarter_csv/version.rb +1 -1
- data/lib/smarter_csv.rb +1 -0
- data/smarter_csv.gemspec +3 -0
- metadata +22 -4
|
@@ -3,10 +3,11 @@
|
|
|
3
3
|
module SmarterCSV
|
|
4
4
|
module HashTransformations
|
|
5
5
|
# Frozen regex constants for performance (avoid recompilation on every value)
|
|
6
|
-
NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?\z/.freeze
|
|
6
|
+
NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?\z/.freeze
|
|
7
7
|
# FLOAT_REGEX = /\A[+-]?\d+\.\d+\z/.freeze
|
|
8
8
|
# INTEGER_REGEX = /\A[+-]?\d+\z/.freeze
|
|
9
9
|
ZERO_REGEX = /\A[+-]?0+(?:\.0+)?\z/.freeze # could be +0.0
|
|
10
|
+
EXPONENT_CHARS = %w[e E].freeze # mantissa scan stops here in significant_digits
|
|
10
11
|
|
|
11
12
|
# First-byte values that can begin a numeric literal — used to skip the numeric
|
|
12
13
|
# regexes for values that obviously aren't numbers (e.g. city names).
|
|
@@ -70,7 +71,13 @@ module SmarterCSV
|
|
|
70
71
|
first_byte = v.getbyte(0)
|
|
71
72
|
if first_byte && ((first_byte >= ZERO_BYTE && first_byte <= NINE_BYTE) || first_byte == MINUS_BYTE || first_byte == PLUS_BYTE)
|
|
72
73
|
if NUMERIC_REGEX.match?(v)
|
|
73
|
-
|
|
74
|
+
# A value with a '.' or an exponent is a decimal → honor decimal_precision;
|
|
75
|
+
# otherwise it's an integer.
|
|
76
|
+
hash[k] = if v.include?('.') || v.include?('e') || v.include?('E')
|
|
77
|
+
convert_decimal(v, options[:decimal_precision])
|
|
78
|
+
else
|
|
79
|
+
v.to_i
|
|
80
|
+
end
|
|
74
81
|
end
|
|
75
82
|
end
|
|
76
83
|
end
|
|
@@ -121,6 +128,48 @@ module SmarterCSV
|
|
|
121
128
|
|
|
122
129
|
protected
|
|
123
130
|
|
|
131
|
+
# Turn a decimal token (one containing a '.' and/or an exponent) into a number,
# as directed by the decimal_precision option:
#
#   :float      -> always a Float (String#to_f)
#   :bigdecimal -> always a BigDecimal
#   otherwise   -> :auto: a Float, unless the mantissa carries more than 16
#                  significant digits; then a BigDecimal, so no precision is lost.
#
# str               - the numeric token as a String
# decimal_precision - :float, :bigdecimal, or :auto
#
# Returns a Float or a BigDecimal.
def convert_decimal(str, decimal_precision)
  return str.to_f if decimal_precision == :float
  return BigDecimal(str) if decimal_precision == :bigdecimal

  # :auto. A decimal token always contains at least one non-digit byte ('.', 'e'
  # or 'E'), so a token of 17 bytes or fewer holds at most 16 digits and can never
  # exceed 16 significant digits. Checking the byte length first lets the common
  # short values (coordinates, sensor readings, prices) skip the digit scan;
  # only longer tokens pay for it.
  exceeds_float_precision = str.bytesize > 17 && significant_digits(str) > 16
  exceeds_float_precision ? BigDecimal(str) : str.to_f
end
|
|
152
|
+
|
|
153
|
+
# Count the significant digits in the mantissa of a numeric token.
#
# Leading zeros are not counted. Once the first non-zero digit has been seen,
# every following digit counts, zeros included, on either side of the decimal
# point. The sign and the decimal point are skipped, and the scan stops at the
# exponent marker ('e' / 'E'), so exponent digits never count.
#
# Per the original note on this method, the count is meant to match the C path's
# fj_sig_digits / Oj's dec_cnt, so that :auto picks Float vs BigDecimal
# identically on both paths.
#
# The scan works on bytes rather than characters: it allocates no per-character
# Strings, and the bytes of interest are all ASCII.
#
# str - the numeric token as a String
#
# Returns the Integer number of significant mantissa digits (0 for an all-zero
# or digit-free mantissa).
def significant_digits(str)
  count = 0
  str.each_byte do |byte|
    break if byte == 0x65 || byte == 0x45 # 'e' / 'E': the mantissa ends here
    next unless byte >= 0x30 && byte <= 0x39 # not '0'..'9': sign or decimal point

    # A zero seen before any non-zero digit is a leading zero and is not significant.
    count += 1 unless count.zero? && byte == 0x30
  end
  count
end
|
|
172
|
+
|
|
124
173
|
# acts as a road-block to limit processing when iterating over all k/v pairs of a CSV-hash:
|
|
125
174
|
def limit_execution_for_only_or_except(options, option_name, key)
|
|
126
175
|
if options[option_name].is_a?(Hash)
|
data/lib/smarter_csv/reader.rb
CHANGED
|
@@ -32,13 +32,14 @@ module SmarterCSV
|
|
|
32
32
|
# rubocop:disable Naming/MethodName
|
|
33
33
|
def headerA
|
|
34
34
|
record_warning(type: :deprecation, code: :header_a_method) do
|
|
35
|
-
"
|
|
35
|
+
"Deprecation Warning: 'headerA' will be removed in future versions. Use 'headers'"
|
|
36
36
|
end
|
|
37
37
|
@headerA
|
|
38
38
|
end
|
|
39
39
|
# rubocop:enable Naming/MethodName
|
|
40
40
|
|
|
41
|
-
# first parameter:
|
|
41
|
+
# first parameter: a path (String or Pathname) to open, or an already-open readable IO
|
|
42
|
+
# (anything responding to #gets — File, StringIO, Tempfile, Zlib::GzipReader, pipes, ...)
|
|
42
43
|
def initialize(input, given_options = {})
|
|
43
44
|
@input = input
|
|
44
45
|
@has_rails = !!defined?(Rails)
|
|
@@ -123,7 +124,14 @@ module SmarterCSV
|
|
|
123
124
|
@verbose = options[:verbose]
|
|
124
125
|
|
|
125
126
|
begin
|
|
126
|
-
|
|
127
|
+
# Decide whether `input` is an already-open, readable stream or a path we must open.
|
|
128
|
+
# The reader reads lines via #gets (see file_io.rb and PeekableIO), so a public #gets
|
|
129
|
+
# is exactly what we need: real IOs (File, StringIO, Tempfile, Zlib::GzipReader, pipes,
|
|
130
|
+
# custom non-seekable streams) expose it, while path-like inputs (String, Pathname) do
|
|
131
|
+
# not — their only #gets is the private Kernel#gets. 1.17.0 narrowed this to
|
|
132
|
+
# input.is_a?(String), which sent Pathname down the IO branch and then called its
|
|
133
|
+
# private Kernel#gets, raising "private method 'gets' called" (issue #337).
|
|
134
|
+
fh = input.respond_to?(:gets) ? input : File.open(input, "r:#{options[:file_encoding]}")
|
|
127
135
|
|
|
128
136
|
# Rewindable inputs (File, Tempfile, StringIO, Zlib::GzipReader, ...) use
|
|
129
137
|
# native rewind for auto-detection — no wrapper overhead in the hot loop.
|
|
@@ -272,10 +280,14 @@ module SmarterCSV
|
|
|
272
280
|
start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC) if on_start || on_complete
|
|
273
281
|
|
|
274
282
|
if on_start
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
283
|
+
# Same path-vs-IO distinction as the File.open above: an already-open IO responds
|
|
284
|
+
# to #gets and we can't know its on-disk size, so we report its class name. A
|
|
285
|
+
# path-like input (String, or a Pathname via #to_path) gets its path and file size.
|
|
286
|
+
input_meta = if @input.respond_to?(:gets)
|
|
278
287
|
{ input: @input.class.name, file_size: nil }
|
|
288
|
+
else
|
|
289
|
+
path = @input.respond_to?(:to_path) ? @input.to_path : @input
|
|
290
|
+
{ input: path, file_size: (File.size(path) rescue nil) }
|
|
279
291
|
end
|
|
280
292
|
on_start.call(input_meta.merge(col_sep: options[:col_sep], row_sep: options[:row_sep]))
|
|
281
293
|
end
|
|
@@ -17,6 +17,7 @@ module SmarterCSV
|
|
|
17
17
|
collect_raw_lines: true,
|
|
18
18
|
comment_regexp: nil, # was: /\A#/,
|
|
19
19
|
convert_values_to_numeric: true,
|
|
20
|
+
decimal_precision: :auto, # :auto (Float, but BigDecimal above 16 significant digits), :float, or :bigdecimal
|
|
20
21
|
downcase_header: true,
|
|
21
22
|
duplicate_header_suffix: '', # was: nil,
|
|
22
23
|
field_size_limit: nil, # Integer (bytes) or nil for no limit. Raises FieldSizeLimitExceeded if any
|
|
@@ -58,6 +59,12 @@ module SmarterCSV
|
|
|
58
59
|
with_line_numbers: false,
|
|
59
60
|
}.freeze
|
|
60
61
|
|
|
62
|
+
# Options whose canonical value is one of a fixed set of symbols. A string form
|
|
63
|
+
# (e.g. "backslash" from options round-tripped through JSON or YAML) is coerced to
|
|
64
|
+
# the matching symbol. Non-string values (a callable for on_bad_row, true/false for
|
|
65
|
+
# legacy verbose) pass through untouched.
|
|
66
|
+
SYMBOL_VALUE_OPTIONS = %i[quote_escaping quote_boundary missing_headers on_bad_row verbose decimal_precision].freeze
|
|
67
|
+
|
|
61
68
|
# NOTE: this is not called when "parse" methods are tested by themselves
|
|
62
69
|
def process_options(given_options = {})
|
|
63
70
|
# Debug output before merge — check raw verbose value (true or :debug)
|
|
@@ -77,6 +84,10 @@ module SmarterCSV
|
|
|
77
84
|
|
|
78
85
|
@options = DEFAULT_OPTIONS.dup.merge!(given_options)
|
|
79
86
|
|
|
87
|
+
# Symbol/string interchangeability: accept either form for every option whose
|
|
88
|
+
# value is a symbol or a string. Done once here, before any value is read below.
|
|
89
|
+
normalize_option_value_types!(@options)
|
|
90
|
+
|
|
80
91
|
# Normalize verbose to a symbol — done once here, stored back into @options.
|
|
81
92
|
# All subsequent checks are free symbol comparisons; no re-evaluation needed.
|
|
82
93
|
# :quiet — suppress all warnings and notices (good for production)
|
|
@@ -194,6 +205,9 @@ module SmarterCSV
|
|
|
194
205
|
unless %i[legacy standard].include?(options[:quote_boundary])
|
|
195
206
|
errors << "invalid quote_boundary: must be :legacy or :standard"
|
|
196
207
|
end
|
|
208
|
+
unless %i[auto float bigdecimal].include?(options[:decimal_precision])
|
|
209
|
+
errors << "invalid decimal_precision: must be :auto, :float, or :bigdecimal"
|
|
210
|
+
end
|
|
197
211
|
arc = options[:auto_row_sep_chars]
|
|
198
212
|
min_arc = SmarterCSV::AutoDetection::MIN_AUTO_ROW_SEP_CHARS
|
|
199
213
|
max_arc = SmarterCSV::AutoDetection::MAX_AUTO_ROW_SEP_CHARS
|
|
@@ -268,6 +282,16 @@ module SmarterCSV
|
|
|
268
282
|
raise SmarterCSV::ValidationError, errors.inspect if errors.any?
|
|
269
283
|
end
|
|
270
284
|
|
|
285
|
+
# Coerce String values to Symbols for every option listed in SYMBOL_VALUE_OPTIONS,
# so callers that can only supply strings (e.g. options loaded from JSON or YAML)
# get the same behavior as callers passing symbols. Mutates the given hash in
# place. Values that are not Strings (symbols, callables, true/false, nil) are
# left exactly as they are. Whether the resulting value is valid is checked
# later, not here.
def normalize_option_value_types!(options)
  SYMBOL_VALUE_OPTIONS.each do |option_name|
    current = options[option_name]
    next unless current.is_a?(String)

    options[option_name] = current.to_sym
  end
end
|
|
294
|
+
|
|
271
295
|
def option_valid?(str)
|
|
272
296
|
return true if str.is_a?(Symbol) && str == :auto
|
|
273
297
|
return true if str.is_a?(String) && !str.empty?
|
data/lib/smarter_csv/version.rb
CHANGED
data/lib/smarter_csv.rb
CHANGED
data/smarter_csv.gemspec
CHANGED
|
@@ -40,6 +40,9 @@ Gem::Specification.new do |spec|
|
|
|
40
40
|
|
|
41
41
|
spec.required_ruby_version = ">= 2.6.0"
|
|
42
42
|
|
|
43
|
+
# bigdecimal is no longer a default gem on Ruby 3.4+; needed for decimal_precision: :auto / :bigdecimal
|
|
44
|
+
spec.add_dependency "bigdecimal"
|
|
45
|
+
|
|
43
46
|
# Specify which files should be added to the gem when it is released.
|
|
44
47
|
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
|
45
48
|
spec.files = Dir.chdir(__dir__) do
|
metadata
CHANGED
|
@@ -1,14 +1,28 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: smarter_csv
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.17.3
|
|
4
|
+
version: 1.18.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Tilo Sloboda
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2026-
|
|
11
|
-
dependencies:
|
|
10
|
+
date: 2026-06-19 00:00:00.000000000 Z
|
|
11
|
+
dependencies:
|
|
12
|
+
- !ruby/object:Gem::Dependency
|
|
13
|
+
name: bigdecimal
|
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
|
15
|
+
requirements:
|
|
16
|
+
- - ">="
|
|
17
|
+
- !ruby/object:Gem::Version
|
|
18
|
+
version: '0'
|
|
19
|
+
type: :runtime
|
|
20
|
+
prerelease: false
|
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
|
22
|
+
requirements:
|
|
23
|
+
- - ">="
|
|
24
|
+
- !ruby/object:Gem::Version
|
|
25
|
+
version: '0'
|
|
12
26
|
description: |
|
|
13
27
|
SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
|
|
14
28
|
fastest end-to-end ingestion — not just parsing. It returns ready-to-use
|
|
@@ -71,6 +85,10 @@ files:
|
|
|
71
85
|
- docs/warnings.md
|
|
72
86
|
- ext/smarter_csv/extconf.rb
|
|
73
87
|
- ext/smarter_csv/smarter_csv.c
|
|
88
|
+
- ext/smarter_csv/vendor/LICENSE-fast_float-MIT
|
|
89
|
+
- ext/smarter_csv/vendor/eisel_lemire.h
|
|
90
|
+
- ext/smarter_csv/vendor/eisel_lemire.md
|
|
91
|
+
- ext/smarter_csv/vendor/eisel_lemire_powers.h
|
|
74
92
|
- images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.png
|
|
75
93
|
- images/SmarterCSV_1.16.0_vs_RubyCSV_3.3.5_speedup.svg
|
|
76
94
|
- images/SmarterCSV_1.16.0_vs_previous_C-speedup.png
|
|
@@ -122,7 +140,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
122
140
|
- !ruby/object:Gem::Version
|
|
123
141
|
version: '0'
|
|
124
142
|
requirements: []
|
|
125
|
-
rubygems_version:
|
|
143
|
+
rubygems_version: 3.6.9
|
|
126
144
|
specification_version: 4
|
|
127
145
|
summary: Fastest end-to-end CSV ingestion for Ruby with smart defaults and Rails-ready
|
|
128
146
|
hash output
|