smarter_csv 1.17.1 → 1.17.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -195,8 +195,10 @@ module SmarterCSV
195
195
  return [nil, n] if all_blank
196
196
  end
197
197
 
198
- # Batch-strip using C-level map! — faster than per-element strip inside the loop
199
- fields.map!(&:strip) if strip
198
+ # In-place strip! — allocation-free when there's no surrounding whitespace
199
+ # (matches the sister site in parse_csv_line_ruby; completes the
200
+ # "strip -> strip!" sweep documented in the 1.17.0 commit notes).
201
+ fields.each(&:strip!) if strip
200
202
 
201
203
  remove_empty = options[:remove_empty_values]
202
204
  hash = {}
@@ -410,15 +412,28 @@ module SmarterCSV
410
412
  if !allow_escaped_quotes || backslash_count % 2 == 0
411
413
  if quote_boundary_standard
412
414
  if in_quotes
413
- # closing quote: only valid if followed by col_sep, row_sep, or end of line
414
415
  next_i = i + 1
415
- if next_i >= bytesize ||
416
- line.getbyte(next_i) == col_sep_byte ||
417
- (row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
416
+ if next_i + 1 < bytesize && line.getbyte(next_i) == quote_byte
417
+ # RFC doubled quote inside a quoted field ("" → ").
418
+ # Give this precedence over the closing-quote check, but only
419
+ # when another byte follows the doubled pair.
420
+ #
421
+ # Compatibility note: we intentionally do NOT force terminal
422
+ # "" to be consumed here. SmarterCSV has a long-standing lenient
423
+ # behavior for malformed tails like ...\"" in :double_quotes mode:
424
+ # the final quote may still close the field instead of turning the
425
+ # row into an unclosed-quote error. Issue #334 needs doubled-quote
426
+ # precedence for ..."",... (more content follows), but we keep the
427
+ # historical leniency for terminal ..."".
428
+ i = next_i
429
+ # closing quote: only valid if followed by col_sep, row_sep, or end of line
430
+ elsif next_i >= bytesize ||
431
+ line.getbyte(next_i) == col_sep_byte ||
432
+ (row_sep_bytesize > 0 && line.byteslice(next_i, row_sep_bytesize) == row_sep)
418
433
  in_quotes = false
419
434
  field_started = true
420
435
  end
421
- # else: quote inside quoted field → literal (handles "" doubling)
436
+ # else: quote inside quoted field → literal
422
437
  elsif !field_started # at field boundary: open quoted field
423
438
  in_quotes = true
424
439
  field_started = true
@@ -519,15 +534,28 @@ module SmarterCSV
519
534
  if !allow_escaped_quotes || backslash_count % 2 == 0
520
535
  if quote_boundary_standard
521
536
  if in_quotes
522
- # closing quote: only valid if followed by col_sep, row_sep, or end of line
523
537
  next_i = i + 1
524
- if next_i >= line_size ||
525
- line[next_i...next_i + col_sep_size] == col_sep ||
526
- (row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
538
+ if next_i + 1 < line_size && line[next_i] == quote
539
+ # RFC doubled quote inside a quoted field ("" → ").
540
+ # Give this precedence over the closing-quote check, but only
541
+ # when another character follows the doubled pair.
542
+ #
543
+ # Compatibility note: we intentionally do NOT force terminal
544
+ # "" to be consumed here. SmarterCSV has a long-standing lenient
545
+ # behavior for malformed tails like ...\"" in :double_quotes mode:
546
+ # the final quote may still close the field instead of turning the
547
+ # row into an unclosed-quote error. Issue #334 needs doubled-quote
548
+ # precedence for ..."",... (more content follows), but we keep the
549
+ # historical leniency for terminal ..."".
550
+ i = next_i
551
+ # closing quote: only valid if followed by col_sep, row_sep, or end of line
552
+ elsif next_i >= line_size ||
553
+ line[next_i...next_i + col_sep_size] == col_sep ||
554
+ (row_sep_size > 0 && line[next_i...next_i + row_sep_size] == row_sep)
527
555
  in_quotes = false
528
556
  field_started = true
529
557
  end
530
- # else: quote inside quoted field → literal (handles "" doubling)
558
+ # else: quote inside quoted field → literal
531
559
  elsif !field_started # at field boundary: open quoted field
532
560
  in_quotes = true
533
561
  field_started = true
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module SmarterCSV
4
- VERSION = "1.17.1"
4
+ VERSION = "1.17.3"
5
5
  end
data/smarter_csv.gemspec CHANGED
@@ -30,11 +30,13 @@ Gem::Specification.new do |spec|
30
30
  spec.homepage = "https://github.com/tilo/smarter_csv"
31
31
  spec.license = 'MIT'
32
32
 
33
- spec.metadata["homepage_uri"] = spec.homepage
34
- spec.metadata["source_code_uri"] = spec.homepage
35
- spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
36
- spec.metadata["documentation_uri"] = "https://github.com/tilo/smarter_csv/tree/main/docs"
37
- spec.metadata["bug_tracker_uri"] = "https://github.com/tilo/smarter_csv/issues"
33
+ spec.metadata["homepage_uri"] = spec.homepage
34
+ spec.metadata["source_code_uri"] = spec.homepage
35
+ spec.metadata["changelog_uri"] = "https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md"
36
+ spec.metadata["documentation_uri"] = "https://github.com/tilo/smarter_csv/tree/main/docs"
37
+ spec.metadata["bug_tracker_uri"] = "https://github.com/tilo/smarter_csv/issues"
38
+ spec.metadata["upgrade_uri"] = "https://github.com/tilo/smarter_csv/blob/main/UPGRADING.md"
39
+ spec.metadata["upgrade_wizard_uri"] = "https://tilo.github.io/smarter_csv/upgrade_wizard.html"
38
40
 
39
41
  spec.required_ruby_version = ">= 2.6.0"
40
42
 
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: smarter_csv
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.17.1
4
+ version: 1.17.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Tilo Sloboda
8
8
  bindir: bin
9
9
  cert_chain: []
10
- date: 2026-05-17 00:00:00.000000000 Z
10
+ date: 2026-05-27 00:00:00.000000000 Z
11
11
  dependencies: []
12
12
  description: |
13
13
  SmarterCSV is a high-performance CSV reader and writer for Ruby focused on
@@ -39,7 +39,8 @@ files:
39
39
  - LICENSE.txt
40
40
  - README.md
41
41
  - Rakefile
42
- - TO_DO.md
42
+ - UPGRADING.md
43
+ - docs/.nojekyll
43
44
  - docs/_introduction.md
44
45
  - docs/bad_row_quarantine.md
45
46
  - docs/basic_read_api.md
@@ -64,6 +65,8 @@ files:
64
65
  - docs/releases/1.17.0/performance_notes.md
65
66
  - docs/row_col_sep.md
66
67
  - docs/ruby_csv_pitfalls.md
68
+ - docs/upgrade_path.json
69
+ - docs/upgrade_wizard.html
67
70
  - docs/value_converters.md
68
71
  - docs/warnings.md
69
72
  - ext/smarter_csv/extconf.rb
@@ -102,6 +105,8 @@ metadata:
102
105
  changelog_uri: https://github.com/tilo/smarter_csv/blob/main/CHANGELOG.md
103
106
  documentation_uri: https://github.com/tilo/smarter_csv/tree/main/docs
104
107
  bug_tracker_uri: https://github.com/tilo/smarter_csv/issues
108
+ upgrade_uri: https://github.com/tilo/smarter_csv/blob/main/UPGRADING.md
109
+ upgrade_wizard_uri: https://tilo.github.io/smarter_csv/upgrade_wizard.html
105
110
  rdoc_options: []
106
111
  require_paths:
107
112
  - lib
data/TO_DO.md DELETED
@@ -1,109 +0,0 @@
1
- # SmarterCSV v2.0 TO DO List
2
-
3
- DONE:
4
- [X] Don't call rewind on filehandle
5
- [X] use Procs for validations and transformations [issue #118](https://github.com/tilo/smarter_csv/issues/118)
6
- [X] skip file opening, allow reading from CSV string, e.g. reading from S3 file [issue #120](https://github.com/tilo/smarter_csv/issues/120). Or stream large file from S3 (linked in the issue)
7
- [X] [2.0 BUG] convert_to_float saves Proc as @@convert_to_integer [issue #157](https://github.com/tilo/smarter_csv/issues/157)
8
- [X] add enumerable to speed up parallel processing [issue #66](https://github.com/tilo/smarter_csv/issues/66), [issue #32](https://github.com/tilo/smarter_csv/issues/32)
9
- [X] Provide an example for custom Procs for hash_transformations in the docs [issue #174](https://github.com/tilo/smarter_csv/issues/174)
10
- [X] Collect all Errors, before surfacing them. Avoid throwing an exception on the first error [issue #133](https://github.com/tilo/smarter_csv/issues/133)
11
-
12
-
13
- Partially Done:
14
- [ ] make @errors and @warnings work [issue #118](https://github.com/tilo/smarter_csv/issues/118)
15
-
16
- Still TO DO:
17
- [ ] Replace remove_empty_values: false [issue #213](https://github.com/tilo/smarter_csv/issues/213)
18
-
19
- Arguably by design (e.g. exclude these columns from conversion and have them returned as a string)
20
- [ ] [2.0 BUG] :convert_values_to_numeric_unless_leading_zeros drops leading zeros [issue #151](https://github.com/tilo/smarter_csv/issues/151)
21
-
22
-
23
- ## Numeric conversion: align the Ruby fallback path with the C path (permissive)
24
-
25
- Context: `convert_values_to_numeric` runs in two places that currently DISAGREE on edge cases:
26
- - C path (`acceleration: true`, the default): `ext/smarter_csv/smarter_csv.c#try_numeric_conversion`
27
- uses `strtol`/`strtod` (base 10; float branch only entered when the field contains a `.`).
28
- - Ruby fallback (`acceleration: false`): `lib/smarter_csv/hash_transformations.rb` uses the
29
- strict regex `NUMERIC_REGEX = /\A[+-]?\d+(?:\.\d+)?\z/` plus `to_i` / `to_f`.
30
-
31
- Divergence (verified empirically):
32
- | value | C path | Ruby fallback |
33
- |-----------|------------------|-------------------|
34
- | ".5" | 0.5 (Float) | ".5" (String) |
35
- | "3." | 3.0 (Float) | "3." (String) |
36
- | "1.5e3" | 1500.0 (Float) | "1.5e3" (String) |
37
- | "1.0e10" | 10000000000.0 | "1.0e10" (String) |
38
-
39
- Decision: the C path's permissive behavior (corner cases + scientific notation) is the intended
40
- contract. Fix = make the Ruby fallback match the C path. Do NOT tighten the C path.
41
-
42
- Ruby-side changes (in `hash_transformations.rb`):
43
- 1. Swap NUMERIC_REGEX for a permissive one:
44
- /\A[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?\z/
45
- matches 1, 1., 1.5, .5, 1e3, 1.5e3, -3.14e-2, etc.; still rejects ".", "e3", "1.2.3",
46
- "1_000", "0x1F".
47
- 2. Add `DOT_BYTE = '.'.ord` (46) and include it in the first-byte fast-reject's allowed set
48
- (the C pre-check already allows a leading `.`; without this, ".5" gets rejected on byte 0).
49
- 3. Int-vs-float decision: `(v.include?('.') || v.include?('e') || v.include?('E')) ? v.to_f : v.to_i`
50
- (currently only checks for `.`).
51
-
52
- Stays a string on BOTH paths (no change needed, but worth characterization tests — there are
53
- currently NONE):
54
- - "010" => 10 (NOT octal 8 — both paths use base-10 conversion: String#to_i / strtol(.,10).
55
- A switch to Kernel#Integer() would break this. Lock it down with a test.)
56
- - "0x1F", "0b101", "0o17" => string (radix prefixes not honored by base-10 conversion)
57
- - "1_000" => string (underscores)
58
- - "1,200.00", "1.300,00" => string (thousands sep / decimal comma — strtod stops at the
59
- separator → not fully consumed; regex rejects. This is the only safe behavior; "1,200" is
60
- genuinely ambiguous. Locale-specific number formats are the caller's job via value_converters.)
61
-
62
- NOT doing: locale sniffing (read LC_NUMERIC at init and adjust the regexes). Rejected because
63
- the machine locale tells you nothing about the file's number format, it breaks reproducibility
64
- (same code + same file → different results on a US vs EU box), and `,` can't be both col_sep and
65
- decimal separator anyway. Note `strtod` IS locale-sensitive (LC_NUMERIC) but it's dormant — Ruby
66
- runs in the C/POSIX locale; don't deliberately activate it.
67
-
68
- When done: parity tests (`[true, false].each`) for the now-consistent set (.5, 3., 1.5e3, 1e3)
69
- plus characterization tests for the stays-a-string set above; CHANGELOG line noting the Ruby
70
- fallback's numeric conversion now accepts scientific notation and bare-dot forms, matching the
71
- accelerated path. Behavior change affects `acceleration: false` users only — and aligns them with
72
- the default.
73
-
74
-
75
- ## Warn once when the C extension didn't load on a platform that supports it
76
-
77
- Context: `acceleration: true` is the default. When the C extension fails to build / isn't loaded,
78
- SmarterCSV silently falls back to the Ruby parser — graceful degradation by design (so the gem
79
- keeps working for users with broken toolchains, JRuby, TruffleRuby, etc.). Today there is no
80
- signal to the user that they're not getting the C path; their CSV parsing is just slower than
81
- they might have expected.
82
-
83
- Idea: emit a one-time warning when:
84
- * the C extension is NOT loaded — `!SmarterCSV::Parser.respond_to?(:parse_csv_line_c)`, AND
85
- * the platform is one where it *should* be available — `RUBY_ENGINE == 'ruby'` (MRI / CRuby).
86
- JRuby and TruffleRuby don't load CRuby C extensions natively; nothing for the user to do.
87
-
88
- Where to fire:
89
- * NOT at `require 'smarter_csv'` time — Rails.logger typically isn't set up yet, so any
90
- "route through the warnings system" code would just fall through to `Kernel#warn` anyway,
91
- and the warning would land in stderr instead of the Rails log where ops would see it.
92
- * At first `Reader.new` / `SmarterCSV.process` call — Rails has booted, the existing
93
- routing-through-Rails.logger-or-Kernel#warn infra works, and the existing deduped warnings
94
- histogram means it fires once per process regardless of how many parse calls.
95
-
96
- Implementation sketch:
97
- * Add a new warning code (e.g. `:c_extension_unavailable`) alongside the existing ones
98
- (`:chunk_size_default`, `:header_a_method`, `:utf8_missing_binary_mode`, ...).
99
- * Severity `:warn`. Suppressible via the existing `verbose: :quiet`.
100
- * Message points at the fix — e.g. "C acceleration extension not loaded on this Ruby; using
101
- Ruby parser. To enable acceleration, reinstall with `gem pristine smarter_csv` and check
102
- the build log." Plus a link/pointer to a troubleshooting section in the docs.
103
-
104
- Bonus: add a public predicate `SmarterCSV.acceleration_available?` returning
105
- `Parser.respond_to?(:parse_csv_line_c)`. Zero noise, useful for scripts / CI / future spec
106
- files that want to branch on the environment fact rather than guess.
107
-
108
- NOT doing: a banner at `require` time (every Rails app would print it at boot, too noisy);
109
- warning when `acceleration: false` was explicitly chosen (the user knows what they're doing).