smarter_csv 1.17.1 → 1.17.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +246 -63
- data/CONTRIBUTORS.md +2 -1
- data/README.md +6 -3
- data/UPGRADING.md +251 -0
- data/docs/.nojekyll +0 -0
- data/docs/upgrade_path.json +175 -0
- data/docs/upgrade_wizard.html +498 -0
- data/ext/smarter_csv/smarter_csv.c +248 -323
- data/lib/smarter_csv/parser.rb +40 -12
- data/lib/smarter_csv/version.rb +1 -1
- data/smarter_csv.gemspec +7 -5
- metadata +8 -3
- data/TO_DO.md +0 -109
data/lib/smarter_csv/parser.rb
CHANGED
|
@@ -195,8 +195,10 @@ module SmarterCSV
|
|
|
195
195
|
return [nil, n] if all_blank
|
|
196
196
|
end
|
|
197
197
|
|
|
198
|
-
#
|
|
199
|
-
|
|
198
|
+
# In-place strip! — allocation-free when there's no surrounding whitespace
|
|
199
|
+
# (matches the sister site in parse_csv_line_ruby; completes the
|
|
200
|
+
# "strip -> strip!" sweep documented in the 1.17.0 commit notes).
|
|
201
|
+
fields.each(&:strip!) if strip
|
|
200
202
|
|
|
201
203
|
remove_empty = options[:remove_empty_values]
|
|
202
204
|
hash = {}
|
|
@@ -410,15 +412,28 @@ module SmarterCSV
|
|
|
410
412
|
if !allow_escaped_quotes || backslash_count % 2 == 0
|
|
411
413
|
if quote_boundary_standard
|
|
412
414
|
if in_quotes
|
|
413
|
-
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
414
415
|
next_i = i + 1
|
|
415
|
-
if next_i
|
|
416
|
-
|
|
417
|
-
|
|
416
|
+
if next_i + 1 < bytesize && line.getbyte(next_i) == quote_byte
|
|
417
|
+
# RFC doubled quote inside a quoted field ("" → ").
|
|
418
|
+
# Give this precedence over the closing-quote check, but only
|
|
419
|
+
# when another byte follows the doubled pair.
|
|
420
|
+
#
|
|
421
|
+
# Compatibility note: we intentionally do NOT force terminal
|
|
422
|
+
# "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
423
|
+
# behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
424
|
+
# the final quote may still close the field instead of turning the
|
|
425
|
+
# row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
426
|
+
# precedence for ..."",... (more content follows), but we keep the
|
|
427
|
+
# historical leniency for terminal ..."".
|
|
428
|
+
i = next_i
|
|
429
|
+
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
430
|
+
elsif next_i >= bytesize ||
|
|
431
|
+
line.getbyte(next_i) == col_sep_byte ||
|
|
432
|
+
(row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
|
|
418
433
|
in_quotes = false
|
|
419
434
|
field_started = true
|
|
420
435
|
end
|
|
421
|
-
# else: quote inside quoted field → literal
|
|
436
|
+
# else: quote inside quoted field → literal
|
|
422
437
|
elsif !field_started # at field boundary: open quoted field
|
|
423
438
|
in_quotes = true
|
|
424
439
|
field_started = true
|
|
@@ -519,15 +534,28 @@ module SmarterCSV
|
|
|
519
534
|
if !allow_escaped_quotes || backslash_count % 2 == 0
|
|
520
535
|
if quote_boundary_standard
|
|
521
536
|
if in_quotes
|
|
522
|
-
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
523
537
|
next_i = i + 1
|
|
524
|
-
if next_i
|
|
525
|
-
|
|
526
|
-
|
|
538
|
+
if next_i + 1 < line_size && line[next_i] == quote
|
|
539
|
+
# RFC doubled quote inside a quoted field ("" → ").
|
|
540
|
+
# Give this precedence over the closing-quote check, but only
|
|
541
|
+
# when another character follows the doubled pair.
|
|
542
|
+
#
|
|
543
|
+
# Compatibility note: we intentionally do NOT force terminal
|
|
544
|
+
# "" to be consumed here. SmarterCSV has a long-standing lenient
|
|
545
|
+
# behavior for malformed tails like ...\"" in :double_quotes mode:
|
|
546
|
+
# the final quote may still close the field instead of turning the
|
|
547
|
+
# row into an unclosed-quote error. Issue #334 needs doubled-quote
|
|
548
|
+
# precedence for ..."",... (more content follows), but we keep the
|
|
549
|
+
# historical leniency for terminal ..."".
|
|
550
|
+
i = next_i
|
|
551
|
+
# closing quote: only valid if followed by col_sep, row_sep, or end of line
|
|
552
|
+
elsif next_i >= line_size ||
|
|
553
|
+
line[next_i...next_i + col_sep_size] == col_sep ||
|
|
554
|
+
(row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
|
|
527
555
|
in_quotes = false
|
|
528
556
|
field_started = true
|
|
529
557
|
end
|
|
530
|
-
# else: quote inside quoted field → literal
|
|
558
|
+
# else: quote inside quoted field → literal
|
|
531
559
|
elsif !field_started # at field boundary: open quoted field
|
|
532
560
|
in_quotes = true
|
|
533
561
|
field_started = true
|
data/lib/smarter_csv/version.rb
CHANGED
data/smarter_csv.gemspec
CHANGED
|
@@ -30,11 +30,13 @@ Gem::Specification.new do |spec|
|
|
|
30
30
|
spec.homepage = "https://github.com/tilo/smarter_csv"
|
|
31
31
|
spec.license = 'MIT'
|
|
32
32
|
|
|
33
|
-
spec.metadata["homepage_uri"]
|
|
34
|
-
spec.metadata["source_code_uri"]
|
|
35
|
-
spec.metadata["changelog_uri"]
|
|
36
|
-
spec.metadata["documentation_uri"]
|
|
37
|
-
spec.metadata["bug_tracker_uri"]
|
|
33
|
+
spec.metadata["homepage_uri"] = spec.homepage
|
|
34
|
+
spec.metadata["source_code_uri"] = spec.homepage
|
|
35
|
+
spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
|
|
36
|
+
spec.metadata["documentation_uri"] = "https://github.com/tilo/smarter_csv/tree/main/docs"
|
|
37
|
+
spec.metadata["bug_tracker_uri"] = "https://github.com/tilo/smarter_csv/issues"
|
|
38
|
+
spec.metadata["upgrade_uri"] = "https://github.com/tilo/smarter_csv/blob/main/UPGRADING.md"
|
|
39
|
+
spec.metadata["upgrade_wizard_uri"] = "https://tilo.github.io/smarter_csv/upgrade_wizard.html"
|
|
38
40
|
|
|
39
41
|
spec.required_ruby_version = ">= 2.6.0"
|
|
40
42
|
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: smarter_csv
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 1.17.1
|
|
4
|
+
version: 1.17.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Tilo Sloboda
|
|
8
8
|
bindir: bin
|
|
9
9
|
cert_chain: []
|
|
10
|
-
date: 2026-05-
|
|
10
|
+
date: 2026-05-27 00:00:00.000000000 Z
|
|
11
11
|
dependencies: []
|
|
12
12
|
description: |
|
|
13
13
|
SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
|
|
@@ -39,7 +39,8 @@ files:
|
|
|
39
39
|
- LICENSE.txt
|
|
40
40
|
- README.md
|
|
41
41
|
- Rakefile
|
|
42
|
-
- TO_DO.md
|
|
42
|
+
- UPGRADING.md
|
|
43
|
+
- docs/.nojekyll
|
|
43
44
|
- docs/_introduction.md
|
|
44
45
|
- docs/bad_row_quarantine.md
|
|
45
46
|
- docs/basic_read_api.md
|
|
@@ -64,6 +65,8 @@ files:
|
|
|
64
65
|
- docs/releases/1.17.0/performance_notes.md
|
|
65
66
|
- docs/row_col_sep.md
|
|
66
67
|
- docs/ruby_csv_pitfalls.md
|
|
68
|
+
- docs/upgrade_path.json
|
|
69
|
+
- docs/upgrade_wizard.html
|
|
67
70
|
- docs/value_converters.md
|
|
68
71
|
- docs/warnings.md
|
|
69
72
|
- ext/smarter_csv/extconf.rb
|
|
@@ -102,6 +105,8 @@ metadata:
|
|
|
102
105
|
changelog_uri: https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md
|
|
103
106
|
documentation_uri: https://github.com/tilo/smarter_csv/tree/main/docs
|
|
104
107
|
bug_tracker_uri: https://github.com/tilo/smarter_csv/issues
|
|
108
|
+
upgrade_uri: https://github.com/tilo/smarter_csv/blob/main/UPGRADING.md
|
|
109
|
+
upgrade_wizard_uri: https://tilo.github.io/smarter_csv/upgrade_wizard.html
|
|
105
110
|
rdoc_options: []
|
|
106
111
|
require_paths:
|
|
107
112
|
- lib
|
data/TO_DO.md
DELETED
|
@@ -1,109 +0,0 @@
|
|
|
1
|
-
# SmarterCSV v2.0 TO DO List
|
|
2
|
-
|
|
3
|
-
DONE:
|
|
4
|
-
[X] Don't call rewind on filehandle
|
|
5
|
-
[X] use Procs for validations and transformations [issue #118](https://github.com/tilo/smarter_csv/issues/118)
|
|
6
|
-
[X] skip file opening, allow reading from CSV string, e.g. reading from S3 file [issue #120](https://github.com/tilo/smarter_csv/issues/120). Or stream large file from S3 (linked in the issue)
|
|
7
|
-
[X] [2.0 BUG] convert_to_float saves Proc as @@convert_to_integer [issue #157](https://github.com/tilo/smarter_csv/issues/157)
|
|
8
|
-
[X] add enumerable to speed up parallel processing [issue #66](https://github.com/tilo/smarter_csv/issues/66), [issue #32](https://github.com/tilo/smarter_csv/issues/32)
|
|
9
|
-
[X] Provide an example for custom Procs for hash_transformations in the docs [issue #174](https://github.com/tilo/smarter_csv/issues/174)
|
|
10
|
-
[X] Collect all Errors, before surfacing them. Avoid throwing an exception on the first error [issue #133](https://github.com/tilo/smarter_csv/issues/133)
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
Partially Done:
|
|
14
|
-
[ ] make @errors and @warnings work [issue #118](https://github.com/tilo/smarter_csv/issues/118)
|
|
15
|
-
|
|
16
|
-
Still TO DO:
|
|
17
|
-
[ ] Replace remove_empty_values: false [issue #213](https://github.com/tilo/smarter_csv/issues/213)
|
|
18
|
-
|
|
19
|
-
Arguably by design (e.g. exclude these columns from conversion and have them returned as a string)
|
|
20
|
-
[ ] [2.0 BUG] :convert_values_to_numeric_unless_leading_zeros drops leading zeros [issue #151](https://github.com/tilo/smarter_csv/issues/151)
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
## Numeric conversion: align the Ruby fallback path with the C path (permissive)
|
|
24
|
-
|
|
25
|
-
Context: `convert_values_to_numeric` runs in two places that currently DISAGREE on edge cases:
|
|
26
|
-
- C path (`acceleration: true`, the default): `ext/smarter_csv/smarter_csv.c#try_numeric_conversion`
|
|
27
|
-
uses `strtol`/`strtod` (base 10; float branch only entered when the field contains a `.`).
|
|
28
|
-
- Ruby fallback (`acceleration: false`): `lib/smarter_csv/hash_transformations.rb` uses the
|
|
29
|
-
strict regex `NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?\z/` plus `to_i` / `to_f`.
|
|
30
|
-
|
|
31
|
-
Divergence (verified empirically):
|
|
32
|
-
| value | C path | Ruby fallback |
|
|
33
|
-
|-----------|------------------|-------------------|
|
|
34
|
-
| ".5" | 0.5 (Float) | ".5" (String) |
|
|
35
|
-
| "3." | 3.0 (Float) | "3." (String) |
|
|
36
|
-
| "1.5e3" | 1500.0 (Float) | "1.5e3" (String) |
|
|
37
|
-
| "1.0e10" | 10000000000.0 | "1.0e10" (String) |
|
|
38
|
-
|
|
39
|
-
Decision: the C path's permissive behavior (corner cases + scientific notation) is the intended
|
|
40
|
-
contract. Fix = make the Ruby fallback match the C path. Do NOT tighten the C path.
|
|
41
|
-
|
|
42
|
-
Ruby-side changes (in `hash_transformations.rb`):
|
|
43
|
-
1. Swap NUMERIC_REGEX for a permissive one:
|
|
44
|
-
/\A[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?\z/
|
|
45
|
-
matches 1, 1., 1.5, .5, 1e3, 1.5e3, -3.14e-2, etc.; still rejects ".", "e3", "1.2.3",
|
|
46
|
-
"1_000", "0x1F".
|
|
47
|
-
2. Add `DOT_BYTE = '.'.ord` (46) and include it in the first-byte fast-reject's allowed set
|
|
48
|
-
(the C pre-check already allows a leading `.`; without this, ".5" gets rejected on byte 0).
|
|
49
|
-
3. Int-vs-float decision: `(v.include?('.') || v.include?('e') || v.include?('E')) ? v.to_f : v.to_i`
|
|
50
|
-
(currently only checks for `.`).
|
|
51
|
-
|
|
52
|
-
Stays a string on BOTH paths (no change needed, but worth characterization tests — there are
|
|
53
|
-
currently NONE):
|
|
54
|
-
- "010" => 10 (NOT octal 8 — both paths use base-10 conversion: String#to_i / strtol(.,10).
|
|
55
|
-
A switch to Kernel#Integer() would break this. Lock it down with a test.)
|
|
56
|
-
- "0x1F", "0b101", "0o17" => string (radix prefixes not honored by base-10 conversion)
|
|
57
|
-
- "1_000" => string (underscores)
|
|
58
|
-
- "1,200.00", "1.300,00" => string (thousands sep / decimal comma — strtod stops at the
|
|
59
|
-
separator → not fully consumed; regex rejects. This is the only safe behavior; "1,200" is
|
|
60
|
-
genuinely ambiguous. Locale-specific number formats are the caller's job via value_converters.)
|
|
61
|
-
|
|
62
|
-
NOT doing: locale sniffing (read LC_NUMERIC at init and adjust the regexes). Rejected because
|
|
63
|
-
the machine locale tells you nothing about the file's number format, it breaks reproducibility
|
|
64
|
-
(same code + same file → different results on a US vs EU box), and `,` can't be both col_sep and
|
|
65
|
-
decimal separator anyway. Note `strtod` IS locale-sensitive (LC_NUMERIC) but it's dormant — Ruby
|
|
66
|
-
runs in the C/POSIX locale; don't deliberately activate it.
|
|
67
|
-
|
|
68
|
-
When done: parity tests (`[true, false].each`) for the now-consistent set (.5, 3., 1.5e3, 1e3)
|
|
69
|
-
plus characterization tests for the stays-a-string set above; CHANGELOG line noting the Ruby
|
|
70
|
-
fallback's numeric conversion now accepts scientific notation and bare-dot forms, matching the
|
|
71
|
-
accelerated path. Behavior change affects `acceleration: false` users only — and aligns them with
|
|
72
|
-
the default.
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
## Warn once when the C extension didn't load on a platform that supports it
|
|
76
|
-
|
|
77
|
-
Context: `acceleration: true` is the default. When the C extension fails to build / isn't loaded,
|
|
78
|
-
SmarterCSV silently falls back to the Ruby parser — graceful degradation by design (so the gem
|
|
79
|
-
keeps working for users with broken toolchains, JRuby, TruffleRuby, etc.). Today there is no
|
|
80
|
-
signal to the user that they're not getting the C path; their CSV parsing is just slower than
|
|
81
|
-
they might have expected.
|
|
82
|
-
|
|
83
|
-
Idea: emit a one-time warning when:
|
|
84
|
-
* the C extension is NOT loaded — `!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)`, AND
|
|
85
|
-
* the platform is one where it *should* be available — `RUBY_ENGINE == 'ruby'` (MRI / CRuby).
|
|
86
|
-
JRuby and TruffleRuby don't load CRuby C extensions natively; nothing for the user to do.
|
|
87
|
-
|
|
88
|
-
Where to fire:
|
|
89
|
-
* NOT at `require 'smarter_csv'` time — Rails.logger typically isn't set up yet, so any
|
|
90
|
-
"route through the warnings system" code would just fall through to `Kernel#warn` anyway,
|
|
91
|
-
and the warning would land in stderr instead of the Rails log where ops would see it.
|
|
92
|
-
* At first `Reader.new` / `SmarterCSV.process` call — Rails has booted, the existing
|
|
93
|
-
routing-through-Rails.logger-or-Kernel#warn infra works, and the existing deduped warnings
|
|
94
|
-
histogram means it fires once per process regardless of how many parse calls.
|
|
95
|
-
|
|
96
|
-
Implementation sketch:
|
|
97
|
-
* Add a new warning code (e.g. `:c_extension_unavailable`) alongside the existing ones
|
|
98
|
-
(`:chunk_size_default`, `:header_a_method`, `:utf8_missing_binary_mode`, ...).
|
|
99
|
-
* Severity `:warn`. Suppressible via the existing `verbose: :quiet`.
|
|
100
|
-
* Message points at the fix — e.g. "C acceleration extension not loaded on this Ruby; using
|
|
101
|
-
Ruby parser. To enable acceleration, reinstall with `gem pristine smarter_csv` and check
|
|
102
|
-
the build log." Plus a link/pointer to a troubleshooting section in the docs.
|
|
103
|
-
|
|
104
|
-
Bonus: add a public predicate `SmarterCSV.acceleration_available?` returning
|
|
105
|
-
`Parser.respond_to?(:parse_csv_line_c)`. Zero noise, useful for scripts / CI / future spec
|
|
106
|
-
files that want to branch on the environment fact rather than guess.
|
|
107
|
-
|
|
108
|
-
NOT doing: a banner at `require` time (every Rails app would print it at boot, too noisy);
|
|
109
|
-
warning when `acceleration: false` was explicitly chosen (the user knows what they're doing).
|