parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,632 @@
1
+ use std::collections::{HashMap, HashSet};
2
+ use std::os::raw::{c_char, c_long};
3
+ use std::str::FromStr;
4
+ use std::sync::{Mutex, OnceLock};
5
+
6
+ use magnus::value::{BoxValue, ReprValue};
7
+ use magnus::{RString, Ruby, Value};
8
+
9
/// Default cap on how many distinct strings the [`StringStorageMode::Shared`]
/// strategy will leak before returning frozen owned copies (overridable via the
/// shared budget).
///
/// `Shared` hands Ruby a zero-copy view into Rust-owned bytes, which requires
/// those bytes to live for the entire (unbounded) lifetime of the Ruby string,
/// i.e. `'static`. We obtain `'static` by leaking one copy per distinct value
/// into a process-wide registry. The registry is shared by all reads, so
/// repeated `each_row`/`each_column` calls reuse the same leaked values. The
/// requested budget bounds how many values the current read may return this way
/// and how many new process-wide leaks that read may admit; hard process
/// ceilings below bound the registry even when callers request larger budgets.
pub const DEFAULT_SHARED_MAX_ENTRIES: usize = 8192;

/// Default cap on the size of an individual string [`StringStorageMode::Shared`]
/// will leak (overridable per read). Longer values are returned as a frozen
/// owned copy rather than leaked, so a column of large blobs cannot blow the
/// leak budget. `Shared` targets short, repeated, low-cardinality strings (enums,
/// categories, codes); large values gain little from zero-copy and would
/// dominate the leak, so they opt out.
pub const DEFAULT_SHARED_MAX_VALUE_BYTES: usize = 4096;

/// Hard process-wide entry ceiling for `:shared`, regardless of user-supplied
/// budgets. This keeps a single large requested budget from making the leak
/// table unbounded. The default budget is still much smaller, but callers can
/// explicitly request more up to this ceiling.
const SHARED_PROCESS_MAX_ENTRIES: usize = 65_536;

/// Hard process-wide byte ceiling for leaked `:shared` string buffers. This
/// bounds the data plane independently from hash-table overhead and from any
/// single caller's requested budget.
const SHARED_PROCESS_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Entry-count bound of the per-read cache for shared values that could not
/// use the process registry. The cache avoids repeating the global lock for
/// known process-cap fallbacks while staying bounded for high-cardinality data.
const SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX: usize = 8192;
/// Bound on the total string bytes (sum of value lengths) retained by that same
/// per-read fallback cache; once reached, further fallbacks are not remembered.
const SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX: usize = 4 * 1024 * 1024;

/// Cache size bound for `:intern` *values*. Low-cardinality columns (the case
/// `:intern` targets) fit well within this, making their repeats allocation
/// free; higher-cardinality values past the bound become frozen owned copies
/// rather than adding more entries to Ruby's immortal intern table.
const INTERN_VALUE_CACHE_ENTRY_COUNT_MAX: usize = 8192;

/// Cache size bound for hash *keys* (struct field names). Field-name cardinality
/// is fixed by the schema and small; the bound is a defensive ceiling.
const KEY_CACHE_ENTRY_COUNT_MAX: usize = 4096;
57
+
58
/// How a Rust string value is materialized as a Ruby `String` when reading.
///
/// The choice trades per-value allocation against memory ownership:
/// - [`Copy`](Self::Copy) is always safe and produces independent, mutable strings.
/// - [`Intern`](Self::Intern) deduplicates equal values through Ruby's frozen
///   string table (Ruby owns the bytes); repeats reuse one immortal object.
/// - [`Shared`](Self::Shared) avoids the byte copy entirely by viewing leaked
///   `'static` Rust bytes; bounded per read and by process-wide ceilings (see
///   [`StringStorageConfig`]).
///
/// The textual names accepted by `FromStr` and produced by `Display` are
/// `"copy"`, `"intern"` and `"shared"`.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum StringStorageMode {
    /// Allocate a fresh, mutable Ruby `String` (one allocation + byte copy) per
    /// value. This is the default and matches historical behavior.
    #[default]
    Copy,
    /// Deduplicate equal values through Ruby's interned (frozen) string table up
    /// to a bounded per-read cache. Values after that bound become frozen owned
    /// copies, so high-cardinality columns cannot keep growing Ruby's immortal
    /// intern table. Note: a transient copy still happens per value (even on a
    /// dedup hit), so this is not a per-value throughput win over `Copy`; it
    /// lowers retained footprint and GC pressure for low-cardinality /
    /// repeat-heavy columns.
    Intern,
    /// Zero byte-copy: equal values share leaked `'static` Rust bytes via a
    /// frozen static Ruby string. Strings are always frozen in this mode. Best
    /// for short, repeated, low-cardinality values. The shared budget bounds
    /// per-read entry count, per-value byte size, and new process leak admission;
    /// values past any of these bounds are returned as a frozen owned copy. See
    /// [`StringStorageConfig`].
    Shared,
}
89
+
90
/// The reader's string-materialization configuration: the [`StringStorageMode`]
/// plus the budget that bounds [`StringStorageMode::Shared`] values for this read
/// and new process-wide leak admission. The budget is ignored by `Copy` and
/// `Intern`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct StringStorageConfig {
    /// Which materialization strategy string values use.
    pub mode: StringStorageMode,
    /// `Shared` only: maximum number of distinct values one read may return as
    /// zero-copy strings. Defaults to [`DEFAULT_SHARED_MAX_ENTRIES`].
    pub shared_max_entries: usize,
    /// `Shared` only: maximum byte length of a value that may be leaked; longer
    /// values are copied. Defaults to [`DEFAULT_SHARED_MAX_VALUE_BYTES`].
    pub shared_max_value_bytes: usize,
}
100
+
101
+ impl Default for StringStorageConfig {
102
+ fn default() -> Self {
103
+ Self {
104
+ mode: StringStorageMode::default(),
105
+ shared_max_entries: DEFAULT_SHARED_MAX_ENTRIES,
106
+ shared_max_value_bytes: DEFAULT_SHARED_MAX_VALUE_BYTES,
107
+ }
108
+ }
109
+ }
110
+
111
+ impl StringStorageConfig {
112
+ /// A config for `mode` with the default shared budget.
113
+ pub fn from_mode(mode: StringStorageMode) -> Self {
114
+ Self {
115
+ mode,
116
+ ..Self::default()
117
+ }
118
+ }
119
+ }
120
+
121
+ impl FromStr for StringStorageMode {
122
+ type Err = String;
123
+
124
+ fn from_str(value: &str) -> Result<Self, Self::Err> {
125
+ match value {
126
+ "copy" => Ok(StringStorageMode::Copy),
127
+ "intern" => Ok(StringStorageMode::Intern),
128
+ "shared" => Ok(StringStorageMode::Shared),
129
+ other => Err(format!(
130
+ "Invalid string_storage: {} (expected :copy, :intern, or :shared)",
131
+ other
132
+ )),
133
+ }
134
+ }
135
+ }
136
+
137
+ impl std::fmt::Display for StringStorageMode {
138
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
139
+ let name = match self {
140
+ StringStorageMode::Copy => "copy",
141
+ StringStorageMode::Intern => "intern",
142
+ StringStorageMode::Shared => "shared",
143
+ };
144
+ f.write_str(name)
145
+ }
146
+ }
147
+
148
/// Process-wide registry of leaked `'static` string bytes.
///
/// The registry performs no Ruby calls and is protected by a small mutex. Reads
/// only take the lock while checking/inserting a string; the hot Ruby object
/// creation path happens after the leaked slice is returned.
#[derive(Debug)]
struct SharedLeakRegistry {
    /// Every string leaked so far, keyed by content so equal values resolve to
    /// the same `'static` slice.
    entries: HashSet<&'static str>,
    /// Running total of bytes charged for leaked entries; each entry is charged
    /// its length plus one for the trailing NUL.
    leaked_bytes: usize,
}
158
+
159
+ impl SharedLeakRegistry {
160
+ fn new() -> Self {
161
+ Self {
162
+ entries: HashSet::new(),
163
+ leaked_bytes: 0,
164
+ }
165
+ }
166
+
167
+ fn intern(
168
+ &mut self,
169
+ s: &str,
170
+ requested_max_entries: usize,
171
+ requested_max_value_bytes: usize,
172
+ ) -> Option<&'static str> {
173
+ if let Some(&existing) = self.entries.get(s) {
174
+ return Some(existing);
175
+ }
176
+
177
+ let entry_limit = requested_max_entries.min(SHARED_PROCESS_MAX_ENTRIES);
178
+ let requested_byte_limit =
179
+ requested_max_entries.saturating_mul(requested_max_value_bytes.saturating_add(1));
180
+ let byte_limit = requested_byte_limit.min(SHARED_PROCESS_MAX_BYTES);
181
+ let entry_bytes = s.len().checked_add(1)?;
182
+
183
+ if self.entries.len() >= entry_limit {
184
+ return None;
185
+ }
186
+ if self.leaked_bytes.saturating_add(entry_bytes) > byte_limit {
187
+ return None;
188
+ }
189
+
190
+ let leaked = leak_nul_terminated(s);
191
+ self.entries.insert(leaked);
192
+ self.leaked_bytes += entry_bytes;
193
+
194
+ debug_assert!(self.entries.len() <= SHARED_PROCESS_MAX_ENTRIES);
195
+ debug_assert!(self.leaked_bytes <= SHARED_PROCESS_MAX_BYTES);
196
+ Some(leaked)
197
+ }
198
+ }
199
+
200
+ fn shared_leak_registry() -> &'static Mutex<SharedLeakRegistry> {
201
+ static REGISTRY: OnceLock<Mutex<SharedLeakRegistry>> = OnceLock::new();
202
+ REGISTRY.get_or_init(|| Mutex::new(SharedLeakRegistry::new()))
203
+ }
204
+
205
+ fn lock_shared_leak_registry() -> std::sync::MutexGuard<'static, SharedLeakRegistry> {
206
+ shared_leak_registry()
207
+ .lock()
208
+ .unwrap_or_else(|poisoned| poisoned.into_inner())
209
+ }
210
+
211
/// Deduplicating, bounded interner of leaked `'static` string bytes.
///
/// Each distinct string is leaked at most once process-wide (so equal values
/// reuse the same `'static` slice). Each read still enforces its own entry and
/// value-size budget before using the process registry; a larger earlier read
/// cannot make a later smaller read return out-of-budget zero-copy strings.
/// Values outside those bounds are returned by the caller as frozen owned copies.
#[derive(Debug)]
pub struct SharedLeakInterner {
    /// Leaked slices this read has already handed out; bounded by `max_entries`.
    entries: HashSet<&'static str>,
    /// Values this read already learned must be copied (per-read entry budget
    /// exhausted or the process registry refused them), so repeats skip the
    /// registry lock.
    fallbacks: HashSet<Box<str>>,
    /// Sum of the byte lengths of the strings held in `fallbacks`.
    fallback_bytes: usize,
    /// Per-read cap on the number of distinct zero-copy values.
    max_entries: usize,
    /// Per-read cap on the byte length of a value eligible for leaking.
    max_value_bytes: usize,
}
226
+
227
+ impl SharedLeakInterner {
228
+ /// Both limits must be positive; callers parse them from positive Integers.
229
+ fn new(max_entries: usize, max_value_bytes: usize) -> Self {
230
+ debug_assert!(max_entries > 0);
231
+ debug_assert!(max_value_bytes > 0);
232
+ Self {
233
+ entries: HashSet::new(),
234
+ fallbacks: HashSet::new(),
235
+ fallback_bytes: 0,
236
+ max_entries,
237
+ max_value_bytes,
238
+ }
239
+ }
240
+
241
+ /// Return a `'static` view of `s`, leaking one NUL-terminated copy when the
242
+ /// current read and process registry both have room, or `None` when either
243
+ /// bound is reached (caller then copies).
244
+ fn intern(&mut self, s: &str) -> Option<&'static str> {
245
+ if let Some(&existing) = self.entries.get(s) {
246
+ return Some(existing);
247
+ }
248
+ if self.fallbacks.contains(s) {
249
+ return None;
250
+ }
251
+
252
+ if s.len() > self.max_value_bytes {
253
+ return None;
254
+ }
255
+
256
+ if self.entries.len() >= self.max_entries {
257
+ self.remember_fallback(s);
258
+ return None;
259
+ }
260
+
261
+ match lock_shared_leak_registry().intern(s, self.max_entries, self.max_value_bytes) {
262
+ Some(leaked) => {
263
+ self.entries.insert(leaked);
264
+ debug_assert!(self.entries.len() <= self.max_entries);
265
+ Some(leaked)
266
+ }
267
+ None => {
268
+ self.remember_fallback(s);
269
+ None
270
+ }
271
+ }
272
+ }
273
+
274
+ fn remember_fallback(&mut self, s: &str) {
275
+ if s.len() > self.max_value_bytes {
276
+ return;
277
+ }
278
+ if self.fallbacks.len() >= SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX {
279
+ return;
280
+ }
281
+ if self.fallback_bytes.saturating_add(s.len()) > SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX {
282
+ return;
283
+ }
284
+ if self.fallbacks.insert(Box::from(s)) {
285
+ self.fallback_bytes += s.len();
286
+ }
287
+ debug_assert!(self.fallbacks.len() <= SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX);
288
+ debug_assert!(self.fallback_bytes <= SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX);
289
+ }
290
+ }
291
+
292
/// Leak a NUL-terminated copy of `s` and return a `'static` view covering only
/// the content bytes (the trailing NUL sits just past the end of the slice).
///
/// The terminator is required by [`static_ruby_string`], which hands the buffer
/// to Ruby's static-string constructor and relies on `ptr[len] == '\0'`. The
/// boxed buffer is never freed, which is what makes the returned reference
/// genuinely `'static`.
fn leak_nul_terminated(s: &str) -> &'static str {
    let content_len = s.len();
    // Content followed by exactly one NUL byte.
    let buffer: Vec<u8> = [s.as_bytes(), b"\0".as_slice()].concat();
    let leaked: &'static [u8] = Box::leak(buffer.into_boxed_slice());
    let content = &leaked[..content_len];
    // SAFETY: `content` is a byte-for-byte copy of `s`, which is valid UTF-8;
    // the appended NUL is outside the returned slice.
    unsafe { std::str::from_utf8_unchecked(content) }
}
308
+
309
/// Per-read string output: how string *values* are materialized (per the
/// configured mode) plus a cache that always interns hash *keys* (struct field
/// names). One `StringStorage` is created per `each_row`/`each_column`
/// invocation; its local caches are not shared across calls. In `Shared` mode,
/// value bytes are coordinated through the process-wide leak registry above.
#[derive(Debug)]
pub struct StringStorage {
    /// Strategy (and its per-mode state) used for string values.
    values: ValueStrategy,
    /// Intern cache used for hash keys, independent of the value mode.
    keys: InternCache,
}
319
+
320
+ impl StringStorage {
321
+ pub fn new(config: StringStorageConfig) -> Self {
322
+ Self {
323
+ values: ValueStrategy::new(config),
324
+ keys: InternCache::new(KEY_CACHE_ENTRY_COUNT_MAX),
325
+ }
326
+ }
327
+
328
+ /// Materialize a string *value* per the configured mode. The result is
329
+ /// frozen for `Intern`/`Shared` and mutable for `Copy`.
330
+ pub fn ruby_string(&mut self, ruby: &Ruby, s: &str) -> Value {
331
+ self.values.ruby_string(ruby, s)
332
+ }
333
+
334
+ /// Materialize a hash *key* (a struct field name). Keys are always interned
335
+ /// and reused regardless of the value mode, because field names are a small
336
+ /// set repeated on every row.
337
+ pub fn ruby_key(&mut self, ruby: &Ruby, name: &str) -> Value {
338
+ self.keys.intern_key(ruby, name)
339
+ }
340
+ }
341
+
342
/// The value-materialization strategy (mode plus any per-mode state).
#[derive(Debug)]
enum ValueStrategy {
    /// Fresh mutable Ruby string per value; no state.
    Copy,
    /// Interned values, deduplicated through a bounded per-read cache.
    Intern(InternCache),
    /// Zero-copy views of leaked bytes, budgeted by the per-read interner.
    Shared(SharedLeakInterner),
}
349
+
350
+ impl ValueStrategy {
351
+ fn new(config: StringStorageConfig) -> Self {
352
+ match config.mode {
353
+ StringStorageMode::Copy => ValueStrategy::Copy,
354
+ StringStorageMode::Intern => {
355
+ ValueStrategy::Intern(InternCache::new(INTERN_VALUE_CACHE_ENTRY_COUNT_MAX))
356
+ }
357
+ StringStorageMode::Shared => ValueStrategy::Shared(SharedLeakInterner::new(
358
+ config.shared_max_entries,
359
+ config.shared_max_value_bytes,
360
+ )),
361
+ }
362
+ }
363
+
364
+ fn ruby_string(&mut self, ruby: &Ruby, s: &str) -> Value {
365
+ match self {
366
+ ValueStrategy::Copy => ruby.str_new(s).as_value(),
367
+ ValueStrategy::Intern(cache) => cache
368
+ .intern_cached(ruby, s)
369
+ .unwrap_or_else(|| frozen_copy(ruby, s)),
370
+ ValueStrategy::Shared(interner) => match interner.intern(s) {
371
+ Some(leaked) => unsafe { static_ruby_string(ruby, leaked) },
372
+ // Past a leak bound: a frozen owned copy, so `Shared` results are
373
+ // uniformly frozen and no extra memory is leaked.
374
+ None => frozen_copy(ruby, s),
375
+ },
376
+ }
377
+ }
378
+ }
379
+
380
/// Caches the interned Ruby string for each distinct content value, so a
/// repeated string is interned once and then returned with no further Ruby
/// allocation. Used both for hash keys (struct field names) and for `:intern`
/// values.
///
/// Each cached value is held in a [`BoxValue`], which registers the string with
/// Ruby's GC via `rb_gc_register_address`. That is required for correctness: a
/// plain `RString` stored in this Rust-heap map is invisible to the GC, and
/// `GC.compact` would relocate the interned string and leave the cached handle
/// dangling. `BoxValue` keeps the handle at a stable address that the GC updates
/// on compaction.
///
/// Bounded: at most `capacity` distinct values are cached. Value callers fall
/// back to frozen owned copies after that; key callers continue interning after
/// the cache because key cardinality is fixed by the schema and key identity is
/// part of the public read contract.
#[derive(Debug)]
struct InternCache {
    /// String content mapped to its GC-registered interned Ruby string.
    cache: HashMap<Box<str>, BoxValue<RString>>,
    /// Maximum number of distinct values kept in `cache`.
    capacity: usize,
}
401
+
402
+ impl InternCache {
403
+ fn new(capacity: usize) -> Self {
404
+ Self {
405
+ cache: HashMap::new(),
406
+ capacity,
407
+ }
408
+ }
409
+
410
+ fn intern_cached(&mut self, ruby: &Ruby, s: &str) -> Option<Value> {
411
+ if let Some(boxed) = self.cache.get(s) {
412
+ return Some(boxed.as_value());
413
+ }
414
+ if self.cache.len() >= self.capacity {
415
+ return None;
416
+ }
417
+ let interned = ruby.str_new(s).to_interned_str();
418
+ self.cache.insert(Box::from(s), BoxValue::new(interned));
419
+ debug_assert!(self.cache.len() <= self.capacity);
420
+ Some(interned.as_value())
421
+ }
422
+
423
+ fn intern_key(&mut self, ruby: &Ruby, s: &str) -> Value {
424
+ self.intern_cached(ruby, s).unwrap_or_else(|| {
425
+ let interned = ruby.str_new(s).to_interned_str();
426
+ interned.as_value()
427
+ })
428
+ }
429
+ }
430
+
431
+ /// Build a frozen, owned Ruby `String` (a normal copy that is then frozen).
432
+ fn frozen_copy(ruby: &Ruby, s: &str) -> Value {
433
+ let string = ruby.str_new(s);
434
+ string.freeze();
435
+ string.as_value()
436
+ }
437
+
438
+ /// Build a frozen Ruby `String` that points directly at `bytes` without copying.
439
+ ///
440
+ /// # Safety
441
+ /// `bytes` must remain valid and immutable for the entire lifetime of the
442
+ /// returned Ruby string. Because Ruby code may retain the string for an
443
+ /// unbounded time, this requires `bytes: &'static str`. The backing buffer must
444
+ /// additionally be NUL-terminated at `bytes.as_ptr()[bytes.len()]`, which Ruby's
445
+ /// static-string constructor requires; [`leak_nul_terminated`] guarantees this.
446
+ /// The returned string is frozen so the shared, immutable backing is never
447
+ /// mutated in place.
448
+ unsafe fn static_ruby_string(ruby: &Ruby, bytes: &'static str) -> Value {
449
+ // The static-string constructor reads bytes[len] expecting a NUL; check the
450
+ // byte we are about to rely on rather than trusting the caller's comment.
451
+ debug_assert_eq!(
452
+ *bytes.as_ptr().add(bytes.len()),
453
+ 0,
454
+ "static_ruby_string requires a NUL terminator at bytes[len]"
455
+ );
456
+ let string = ruby.str_new_lit(bytes.as_ptr() as *const c_char, bytes.len() as c_long);
457
+ string.freeze();
458
+ string.as_value()
459
+ }
460
+
461
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Mutex, OnceLock};

    /// Lock that serializes every test touching the process-wide registry.
    fn shared_leak_test_lock() -> &'static Mutex<()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(Mutex::default)
    }

    /// Forget every registered entry and zero the byte counter.
    fn reset_shared_leak_registry() {
        let mut guard = lock_shared_leak_registry();
        guard.leaked_bytes = 0;
        guard.entries.clear();
    }

    /// Number of entries currently held by the process-wide registry.
    fn shared_leak_registry_len() -> usize {
        let guard = lock_shared_leak_registry();
        guard.entries.len()
    }

    /// Run `test` with exclusive access to a freshly reset registry, resetting
    /// it again afterwards. A poisoned test lock is recovered, not propagated.
    fn with_clean_shared_leak_registry(test: impl FnOnce()) {
        let _serialized = match shared_leak_test_lock().lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        reset_shared_leak_registry();
        test();
        reset_shared_leak_registry();
    }

    #[test]
    fn interns_distinct_values_once_and_reuses_pointer() {
        with_clean_shared_leak_registry(|| {
            let mut interner =
                SharedLeakInterner::new(DEFAULT_SHARED_MAX_ENTRIES, DEFAULT_SHARED_MAX_VALUE_BYTES);

            let original = interner.intern("repeat").unwrap();
            let repeated = interner.intern("repeat").unwrap();
            let distinct = interner.intern("different").unwrap();

            // The repeat resolves to the very same leaked allocation.
            assert_eq!(original.as_ptr(), repeated.as_ptr());
            assert_eq!(original, "repeat");
            // A different value lives elsewhere and has the right content.
            assert_ne!(original.as_ptr(), distinct.as_ptr());
            assert_eq!(distinct, "different");
            assert_eq!(shared_leak_registry_len(), 2);
        });
    }

    #[test]
    fn leak_is_bounded_and_falls_back_past_the_cap() {
        with_clean_shared_leak_registry(|| {
            let max_entries = 4;
            let mut interner = SharedLeakInterner::new(max_entries, DEFAULT_SHARED_MAX_VALUE_BYTES);

            (0..max_entries).for_each(|index| {
                assert!(interner.intern(&format!("value-{index}")).is_some());
            });
            assert_eq!(shared_leak_registry_len(), max_entries);

            // One more distinct value exceeds the cap: nothing is leaked and
            // the caller has to copy.
            assert!(interner.intern("over-the-bound").is_none());
            assert_eq!(shared_leak_registry_len(), max_entries);

            // Values interned before the cap was hit keep resolving.
            assert!(interner.intern("value-0").is_some());
            assert_eq!(shared_leak_registry_len(), max_entries);
        });
    }

    #[test]
    fn oversized_values_are_not_leaked() {
        with_clean_shared_leak_registry(|| {
            let mut interner =
                SharedLeakInterner::new(DEFAULT_SHARED_MAX_ENTRIES, DEFAULT_SHARED_MAX_VALUE_BYTES);

            let exactly_at_limit = "x".repeat(DEFAULT_SHARED_MAX_VALUE_BYTES);
            let one_past_limit = "x".repeat(DEFAULT_SHARED_MAX_VALUE_BYTES + 1);

            assert!(interner.intern(&exactly_at_limit).is_some());
            assert!(interner.intern(&one_past_limit).is_none());
            // The registry holds the in-bound value and nothing else.
            assert_eq!(shared_leak_registry_len(), 1);
            assert!(
                interner.fallbacks.is_empty(),
                "oversized fallbacks must not be retained in the per-read cache"
            );
            assert_eq!(interner.fallback_bytes, 0);
        });
    }

    #[test]
    fn fallback_cache_retained_bytes_are_bounded() {
        with_clean_shared_leak_registry(|| {
            // Fill the one-entry budget so every later value must fall back.
            let mut filler = SharedLeakInterner::new(1, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(filler.intern("already-leaked").is_some());

            let mut reader = SharedLeakInterner::new(1, DEFAULT_SHARED_MAX_VALUE_BYTES);
            let suffix = "x".repeat(1024);
            let attempts = SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX * 2;
            for index in 0..attempts {
                assert!(reader.intern(&format!("{index:08}-{suffix}")).is_none());
            }

            assert!(reader.fallbacks.len() <= SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX);
            assert!(reader.fallback_bytes <= SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX);
        });
    }

    #[test]
    fn shared_leak_budget_is_process_wide_for_matching_budget() {
        with_clean_shared_leak_registry(|| {
            let mut reader_one = SharedLeakInterner::new(4, DEFAULT_SHARED_MAX_VALUE_BYTES);
            (0..4).for_each(|index| {
                assert!(reader_one.intern(&format!("reader-one-{index}")).is_some());
            });

            let mut reader_two = SharedLeakInterner::new(4, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(reader_two.intern("reader-one-0").is_some());
            assert!(
                reader_two.intern("reader-two-new").is_none(),
                "the shared leak budget must not reset for each reader"
            );
        });
    }

    #[test]
    fn current_read_value_bound_applies_to_registry_hits() {
        with_clean_shared_leak_registry(|| {
            let value = "larger-than-second-budget";
            let exact_fit = value.len();

            let mut generous = SharedLeakInterner::new(4, exact_fit);
            assert!(generous.intern(value).is_some());
            assert_eq!(shared_leak_registry_len(), 1);

            // A later read with a tighter value bound must not get the value
            // zero-copy even though the registry already holds it.
            let mut strict = SharedLeakInterner::new(4, exact_fit - 1);
            assert!(strict.intern(value).is_none());
            assert_eq!(strict.entries.len(), 0);
            assert_eq!(shared_leak_registry_len(), 1);
        });
    }

    #[test]
    fn current_read_entry_bound_applies_to_registry_hits() {
        with_clean_shared_leak_registry(|| {
            let mut generous = SharedLeakInterner::new(4, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(generous.intern("already-leaked-one").is_some());
            assert!(generous.intern("already-leaked-two").is_some());
            assert_eq!(shared_leak_registry_len(), 2);

            // A later read limited to one entry gets only one registry hit.
            let mut strict = SharedLeakInterner::new(1, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(strict.intern("already-leaked-one").is_some());
            assert!(strict.intern("already-leaked-two").is_none());
            assert_eq!(strict.entries.len(), 1);
            assert_eq!(shared_leak_registry_len(), 2);
        });
    }

    #[test]
    fn mode_parses_and_round_trips() {
        let every_mode = [
            StringStorageMode::Copy,
            StringStorageMode::Intern,
            StringStorageMode::Shared,
        ];
        for mode in every_mode {
            let rendered = mode.to_string();
            assert_eq!(StringStorageMode::from_str(&rendered).unwrap(), mode);
        }
        assert_eq!(StringStorageMode::default(), StringStorageMode::Copy);
        assert!(StringStorageMode::from_str("nonsense").is_err());
    }
}