parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
use std::collections::{HashMap, HashSet};
|
|
2
|
+
use std::os::raw::{c_char, c_long};
|
|
3
|
+
use std::str::FromStr;
|
|
4
|
+
use std::sync::{Mutex, OnceLock};
|
|
5
|
+
|
|
6
|
+
use magnus::value::{BoxValue, ReprValue};
|
|
7
|
+
use magnus::{RString, Ruby, Value};
|
|
8
|
+
|
|
9
|
+
/// Default number of distinct strings a [`StringStorageMode::Shared`] read may
/// leak before it starts returning frozen owned copies instead. Callers can
/// override it through the shared budget.
///
/// `Shared` gives Ruby a zero-copy view into bytes owned by Rust. Ruby may keep
/// such a string for an unbounded time, so the bytes have to be `'static`; that
/// is achieved by leaking one copy per distinct value into a process-wide
/// registry. All reads share that registry, so repeated
/// `each_row`/`each_column` calls reuse values that were already leaked. The
/// requested budget limits both how many values the current read may return
/// this way and how many new process-wide leaks that read may admit. The hard
/// process ceilings declared below cap the registry even when a caller asks
/// for a larger budget.
pub const DEFAULT_SHARED_MAX_ENTRIES: usize = 8_192;

/// Default per-value size limit (in bytes) for [`StringStorageMode::Shared`];
/// overridable per read. A longer value is returned as a frozen owned copy and
/// is never leaked, so a column of large blobs cannot exhaust the leak budget.
/// `Shared` is aimed at short, repeated, low-cardinality strings (enums,
/// categories, codes); large values gain little from zero-copy and would
/// dominate the leaked memory.
pub const DEFAULT_SHARED_MAX_VALUE_BYTES: usize = 4_096;

/// Hard process-wide limit on the number of `:shared` registry entries,
/// applied no matter what budget a caller supplies. It prevents one large
/// requested budget from making the leak table unbounded. The default budget
/// is much smaller; callers may explicitly request more, up to this ceiling.
const SHARED_PROCESS_MAX_ENTRIES: usize = 65_536;

/// Hard process-wide limit on the total bytes of leaked `:shared` string
/// buffers (64 MiB). It bounds the leaked data independently of hash-table
/// overhead and of any single caller's requested budget.
const SHARED_PROCESS_MAX_BYTES: usize = 64 * 1024 * 1024;

/// Entry bound of the per-read cache of shared values that could not use the
/// process registry. The cache lets a read skip the global lock for values
/// already known to fall back, while staying bounded for high-cardinality data.
const SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX: usize = 8_192;
/// Retained-bytes bound (4 MiB) of the same per-read fallback cache.
const SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX: usize = 4 * 1024 * 1024;

/// Maximum number of distinct `:intern` *values* cached per read.
/// Low-cardinality columns (what `:intern` targets) fit well inside this bound,
/// which makes their repeats allocation free. Values past the bound become
/// frozen owned copies instead of adding more entries to Ruby's immortal
/// intern table.
const INTERN_VALUE_CACHE_ENTRY_COUNT_MAX: usize = 8_192;

/// Maximum number of distinct hash *keys* (struct field names) cached per
/// read. Field-name cardinality is fixed by the schema and small, so this is
/// only a defensive ceiling.
const KEY_CACHE_ENTRY_COUNT_MAX: usize = 4_096;
|
|
57
|
+
|
|
58
|
+
/// Strategy used to turn a Rust string value into a Ruby `String` on read.
///
/// Each mode trades per-value allocation against who owns the bytes:
/// - [`Copy`](Self::Copy): always safe; every value is an independent, mutable
///   string.
/// - [`Intern`](Self::Intern): equal values are deduplicated through Ruby's
///   frozen string table (Ruby owns the bytes), so repeats reuse one immortal
///   object.
/// - [`Shared`](Self::Shared): no byte copy at all; the Ruby string views
///   leaked `'static` Rust bytes. Bounded per read and by process-wide
///   ceilings (see [`StringStorageConfig`]).
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum StringStorageMode {
    /// One fresh, mutable Ruby `String` per value (one allocation plus a byte
    /// copy). This is the default and matches historical behavior.
    #[default]
    Copy,
    /// Equal values are deduplicated through Ruby's interned (frozen) string
    /// table, up to a bounded per-read cache. Values past that bound become
    /// frozen owned copies, so a high-cardinality column cannot keep growing
    /// Ruby's immortal intern table. A transient copy is still made for every
    /// value (even on a dedup hit), so this is not a per-value throughput win
    /// over `Copy`; the benefit is a lower retained footprint and less GC
    /// pressure for low-cardinality / repeat-heavy columns.
    Intern,
    /// Zero byte-copy: equal values share leaked `'static` Rust bytes through
    /// a frozen static Ruby string. Strings are always frozen in this mode.
    /// Best for short, repeated, low-cardinality values. The shared budget
    /// bounds the per-read entry count, the per-value byte size, and the
    /// admission of new process-wide leaks; a value past any of these bounds
    /// is returned as a frozen owned copy. See [`StringStorageConfig`].
    Shared,
}
|
|
89
|
+
|
|
90
|
+
/// The reader's string-materialization configuration: the [`StringStorageMode`]
|
|
91
|
+
/// plus the budget that bounds [`StringStorageMode::Shared`] values for this read
|
|
92
|
+
/// and new process-wide leak admission. The budget is ignored by `Copy` and
|
|
93
|
+
/// `Intern`.
|
|
94
|
+
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
|
|
95
|
+
pub struct StringStorageConfig {
|
|
96
|
+
pub mode: StringStorageMode,
|
|
97
|
+
pub shared_max_entries: usize,
|
|
98
|
+
pub shared_max_value_bytes: usize,
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
impl Default for StringStorageConfig {
|
|
102
|
+
fn default() -> Self {
|
|
103
|
+
Self {
|
|
104
|
+
mode: StringStorageMode::default(),
|
|
105
|
+
shared_max_entries: DEFAULT_SHARED_MAX_ENTRIES,
|
|
106
|
+
shared_max_value_bytes: DEFAULT_SHARED_MAX_VALUE_BYTES,
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
impl StringStorageConfig {
|
|
112
|
+
/// A config for `mode` with the default shared budget.
|
|
113
|
+
pub fn from_mode(mode: StringStorageMode) -> Self {
|
|
114
|
+
Self {
|
|
115
|
+
mode,
|
|
116
|
+
..Self::default()
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
impl FromStr for StringStorageMode {
|
|
122
|
+
type Err = String;
|
|
123
|
+
|
|
124
|
+
fn from_str(value: &str) -> Result<Self, Self::Err> {
|
|
125
|
+
match value {
|
|
126
|
+
"copy" => Ok(StringStorageMode::Copy),
|
|
127
|
+
"intern" => Ok(StringStorageMode::Intern),
|
|
128
|
+
"shared" => Ok(StringStorageMode::Shared),
|
|
129
|
+
other => Err(format!(
|
|
130
|
+
"Invalid string_storage: {} (expected :copy, :intern, or :shared)",
|
|
131
|
+
other
|
|
132
|
+
)),
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
impl std::fmt::Display for StringStorageMode {
|
|
138
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
139
|
+
let name = match self {
|
|
140
|
+
StringStorageMode::Copy => "copy",
|
|
141
|
+
StringStorageMode::Intern => "intern",
|
|
142
|
+
StringStorageMode::Shared => "shared",
|
|
143
|
+
};
|
|
144
|
+
f.write_str(name)
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Process-wide table of leaked `'static` string bytes.
///
/// The registry itself makes no Ruby calls and lives behind a small mutex.
/// Ruby object creation happens only after the leaked slice has been handed
/// back to the caller.
#[derive(Debug)]
struct SharedLeakRegistry {
    /// Every distinct value leaked so far, each stored once.
    entries: HashSet<&'static str>,
    /// Total bytes accounted to the leaked buffers (content plus the trailing
    /// NUL of each entry).
    leaked_bytes: usize,
}
|
|
158
|
+
|
|
159
|
+
impl SharedLeakRegistry {
|
|
160
|
+
fn new() -> Self {
|
|
161
|
+
Self {
|
|
162
|
+
entries: HashSet::new(),
|
|
163
|
+
leaked_bytes: 0,
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
fn intern(
|
|
168
|
+
&mut self,
|
|
169
|
+
s: &str,
|
|
170
|
+
requested_max_entries: usize,
|
|
171
|
+
requested_max_value_bytes: usize,
|
|
172
|
+
) -> Option<&'static str> {
|
|
173
|
+
if let Some(&existing) = self.entries.get(s) {
|
|
174
|
+
return Some(existing);
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
let entry_limit = requested_max_entries.min(SHARED_PROCESS_MAX_ENTRIES);
|
|
178
|
+
let requested_byte_limit =
|
|
179
|
+
requested_max_entries.saturating_mul(requested_max_value_bytes.saturating_add(1));
|
|
180
|
+
let byte_limit = requested_byte_limit.min(SHARED_PROCESS_MAX_BYTES);
|
|
181
|
+
let entry_bytes = s.len().checked_add(1)?;
|
|
182
|
+
|
|
183
|
+
if self.entries.len() >= entry_limit {
|
|
184
|
+
return None;
|
|
185
|
+
}
|
|
186
|
+
if self.leaked_bytes.saturating_add(entry_bytes) > byte_limit {
|
|
187
|
+
return None;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
let leaked = leak_nul_terminated(s);
|
|
191
|
+
self.entries.insert(leaked);
|
|
192
|
+
self.leaked_bytes += entry_bytes;
|
|
193
|
+
|
|
194
|
+
debug_assert!(self.entries.len() <= SHARED_PROCESS_MAX_ENTRIES);
|
|
195
|
+
debug_assert!(self.leaked_bytes <= SHARED_PROCESS_MAX_BYTES);
|
|
196
|
+
Some(leaked)
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
fn shared_leak_registry() -> &'static Mutex<SharedLeakRegistry> {
|
|
201
|
+
static REGISTRY: OnceLock<Mutex<SharedLeakRegistry>> = OnceLock::new();
|
|
202
|
+
REGISTRY.get_or_init(|| Mutex::new(SharedLeakRegistry::new()))
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
fn lock_shared_leak_registry() -> std::sync::MutexGuard<'static, SharedLeakRegistry> {
|
|
206
|
+
shared_leak_registry()
|
|
207
|
+
.lock()
|
|
208
|
+
.unwrap_or_else(|poisoned| poisoned.into_inner())
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
/// Bounded, deduplicating per-read front end to the leaked `'static` strings.
///
/// A distinct string is leaked at most once for the whole process, so equal
/// values share one `'static` slice. Every read nevertheless applies its own
/// entry and value-size budget before it consults the process registry: an
/// earlier read with a larger budget cannot cause a later, smaller read to
/// hand out zero-copy strings beyond its budget. Values outside the bounds are
/// returned by the caller as frozen owned copies.
#[derive(Debug)]
pub struct SharedLeakInterner {
    /// Leaked values this read has already resolved.
    entries: HashSet<&'static str>,
    /// Values this read already knows must fall back to a copy.
    fallbacks: HashSet<Box<str>>,
    /// Sum of the byte lengths of the strings held in `fallbacks`.
    fallback_bytes: usize,
    /// This read's budget for distinct zero-copy values.
    max_entries: usize,
    /// This read's maximum byte length for a zero-copy value.
    max_value_bytes: usize,
}
|
|
226
|
+
|
|
227
|
+
impl SharedLeakInterner {
|
|
228
|
+
/// Both limits must be positive; callers parse them from positive Integers.
|
|
229
|
+
fn new(max_entries: usize, max_value_bytes: usize) -> Self {
|
|
230
|
+
debug_assert!(max_entries > 0);
|
|
231
|
+
debug_assert!(max_value_bytes > 0);
|
|
232
|
+
Self {
|
|
233
|
+
entries: HashSet::new(),
|
|
234
|
+
fallbacks: HashSet::new(),
|
|
235
|
+
fallback_bytes: 0,
|
|
236
|
+
max_entries,
|
|
237
|
+
max_value_bytes,
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/// Return a `'static` view of `s`, leaking one NUL-terminated copy when the
|
|
242
|
+
/// current read and process registry both have room, or `None` when either
|
|
243
|
+
/// bound is reached (caller then copies).
|
|
244
|
+
fn intern(&mut self, s: &str) -> Option<&'static str> {
|
|
245
|
+
if let Some(&existing) = self.entries.get(s) {
|
|
246
|
+
return Some(existing);
|
|
247
|
+
}
|
|
248
|
+
if self.fallbacks.contains(s) {
|
|
249
|
+
return None;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if s.len() > self.max_value_bytes {
|
|
253
|
+
return None;
|
|
254
|
+
}
|
|
255
|
+
|
|
256
|
+
if self.entries.len() >= self.max_entries {
|
|
257
|
+
self.remember_fallback(s);
|
|
258
|
+
return None;
|
|
259
|
+
}
|
|
260
|
+
|
|
261
|
+
match lock_shared_leak_registry().intern(s, self.max_entries, self.max_value_bytes) {
|
|
262
|
+
Some(leaked) => {
|
|
263
|
+
self.entries.insert(leaked);
|
|
264
|
+
debug_assert!(self.entries.len() <= self.max_entries);
|
|
265
|
+
Some(leaked)
|
|
266
|
+
}
|
|
267
|
+
None => {
|
|
268
|
+
self.remember_fallback(s);
|
|
269
|
+
None
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
fn remember_fallback(&mut self, s: &str) {
|
|
275
|
+
if s.len() > self.max_value_bytes {
|
|
276
|
+
return;
|
|
277
|
+
}
|
|
278
|
+
if self.fallbacks.len() >= SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX {
|
|
279
|
+
return;
|
|
280
|
+
}
|
|
281
|
+
if self.fallback_bytes.saturating_add(s.len()) > SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX {
|
|
282
|
+
return;
|
|
283
|
+
}
|
|
284
|
+
if self.fallbacks.insert(Box::from(s)) {
|
|
285
|
+
self.fallback_bytes += s.len();
|
|
286
|
+
}
|
|
287
|
+
debug_assert!(self.fallbacks.len() <= SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX);
|
|
288
|
+
debug_assert!(self.fallback_bytes <= SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX);
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/// Leak a copy of `s` followed by one NUL byte, and return a `'static` view of
/// just the content (the NUL is not part of the returned slice).
///
/// The terminator is mandatory: `rb_utf8_str_new_static` (used by
/// [`StringStorage::to_ruby_string`] for `Shared`) builds a string that points
/// at this buffer and relies on Ruby's invariant that `ptr[len] == '\0'`. The
/// boxed bytes are never freed, so the returned reference is genuinely
/// `'static`.
fn leak_nul_terminated(s: &str) -> &'static str {
    let content_len = s.len();
    let buffer: Box<[u8]> = [s.as_bytes(), &[0u8]].concat().into_boxed_slice();
    let leaked: &'static [u8] = Box::leak(buffer);
    let content = &leaked[..content_len];
    // SAFETY: `content` is a byte-for-byte copy of `s`, which is valid UTF-8;
    // only the trailing NUL is left out of the returned slice.
    unsafe { std::str::from_utf8_unchecked(content) }
}
|
|
308
|
+
|
|
309
|
+
/// Per-read string output: how string *values* are materialized (per the
|
|
310
|
+
/// configured mode) plus a cache that always interns hash *keys* (struct field
|
|
311
|
+
/// names). One `StringStorage` is created per `each_row`/`each_column`
|
|
312
|
+
/// invocation; its local caches are not shared across calls. In `Shared` mode,
|
|
313
|
+
/// value bytes are coordinated through the process-wide leak registry above.
|
|
314
|
+
#[derive(Debug)]
|
|
315
|
+
pub struct StringStorage {
|
|
316
|
+
values: ValueStrategy,
|
|
317
|
+
keys: InternCache,
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
impl StringStorage {
|
|
321
|
+
pub fn new(config: StringStorageConfig) -> Self {
|
|
322
|
+
Self {
|
|
323
|
+
values: ValueStrategy::new(config),
|
|
324
|
+
keys: InternCache::new(KEY_CACHE_ENTRY_COUNT_MAX),
|
|
325
|
+
}
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
/// Materialize a string *value* per the configured mode. The result is
|
|
329
|
+
/// frozen for `Intern`/`Shared` and mutable for `Copy`.
|
|
330
|
+
pub fn ruby_string(&mut self, ruby: &Ruby, s: &str) -> Value {
|
|
331
|
+
self.values.ruby_string(ruby, s)
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
/// Materialize a hash *key* (a struct field name). Keys are always interned
|
|
335
|
+
/// and reused regardless of the value mode, because field names are a small
|
|
336
|
+
/// set repeated on every row.
|
|
337
|
+
pub fn ruby_key(&mut self, ruby: &Ruby, name: &str) -> Value {
|
|
338
|
+
self.keys.intern_key(ruby, name)
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
|
|
342
|
+
/// The value-materialization strategy (mode plus any per-mode state).
|
|
343
|
+
#[derive(Debug)]
|
|
344
|
+
enum ValueStrategy {
|
|
345
|
+
Copy,
|
|
346
|
+
Intern(InternCache),
|
|
347
|
+
Shared(SharedLeakInterner),
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
impl ValueStrategy {
|
|
351
|
+
fn new(config: StringStorageConfig) -> Self {
|
|
352
|
+
match config.mode {
|
|
353
|
+
StringStorageMode::Copy => ValueStrategy::Copy,
|
|
354
|
+
StringStorageMode::Intern => {
|
|
355
|
+
ValueStrategy::Intern(InternCache::new(INTERN_VALUE_CACHE_ENTRY_COUNT_MAX))
|
|
356
|
+
}
|
|
357
|
+
StringStorageMode::Shared => ValueStrategy::Shared(SharedLeakInterner::new(
|
|
358
|
+
config.shared_max_entries,
|
|
359
|
+
config.shared_max_value_bytes,
|
|
360
|
+
)),
|
|
361
|
+
}
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
fn ruby_string(&mut self, ruby: &Ruby, s: &str) -> Value {
|
|
365
|
+
match self {
|
|
366
|
+
ValueStrategy::Copy => ruby.str_new(s).as_value(),
|
|
367
|
+
ValueStrategy::Intern(cache) => cache
|
|
368
|
+
.intern_cached(ruby, s)
|
|
369
|
+
.unwrap_or_else(|| frozen_copy(ruby, s)),
|
|
370
|
+
ValueStrategy::Shared(interner) => match interner.intern(s) {
|
|
371
|
+
Some(leaked) => unsafe { static_ruby_string(ruby, leaked) },
|
|
372
|
+
// Past a leak bound: a frozen owned copy, so `Shared` results are
|
|
373
|
+
// uniformly frozen and no extra memory is leaked.
|
|
374
|
+
None => frozen_copy(ruby, s),
|
|
375
|
+
},
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
/// Caches the interned Ruby string for each distinct content value, so a
|
|
381
|
+
/// repeated string is interned once and then returned with no further Ruby
|
|
382
|
+
/// allocation. Used both for hash keys (struct field names) and for `:intern`
|
|
383
|
+
/// values.
|
|
384
|
+
///
|
|
385
|
+
/// Each cached value is held in a [`BoxValue`], which registers the string with
|
|
386
|
+
/// Ruby's GC via `rb_gc_register_address`. That is required for correctness: a
|
|
387
|
+
/// plain `RString` stored in this Rust-heap map is invisible to the GC, and
|
|
388
|
+
/// `GC.compact` would relocate the interned string and leave the cached handle
|
|
389
|
+
/// dangling. `BoxValue` keeps the handle at a stable address that the GC updates
|
|
390
|
+
/// on compaction.
|
|
391
|
+
///
|
|
392
|
+
/// Bounded: at most `capacity` distinct values are cached. Value callers fall
|
|
393
|
+
/// back to frozen owned copies after that; key callers continue interning after
|
|
394
|
+
/// the cache because key cardinality is fixed by the schema and key identity is
|
|
395
|
+
/// part of the public read contract.
|
|
396
|
+
#[derive(Debug)]
|
|
397
|
+
struct InternCache {
|
|
398
|
+
cache: HashMap<Box<str>, BoxValue<RString>>,
|
|
399
|
+
capacity: usize,
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
impl InternCache {
|
|
403
|
+
fn new(capacity: usize) -> Self {
|
|
404
|
+
Self {
|
|
405
|
+
cache: HashMap::new(),
|
|
406
|
+
capacity,
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
fn intern_cached(&mut self, ruby: &Ruby, s: &str) -> Option<Value> {
|
|
411
|
+
if let Some(boxed) = self.cache.get(s) {
|
|
412
|
+
return Some(boxed.as_value());
|
|
413
|
+
}
|
|
414
|
+
if self.cache.len() >= self.capacity {
|
|
415
|
+
return None;
|
|
416
|
+
}
|
|
417
|
+
let interned = ruby.str_new(s).to_interned_str();
|
|
418
|
+
self.cache.insert(Box::from(s), BoxValue::new(interned));
|
|
419
|
+
debug_assert!(self.cache.len() <= self.capacity);
|
|
420
|
+
Some(interned.as_value())
|
|
421
|
+
}
|
|
422
|
+
|
|
423
|
+
fn intern_key(&mut self, ruby: &Ruby, s: &str) -> Value {
|
|
424
|
+
self.intern_cached(ruby, s).unwrap_or_else(|| {
|
|
425
|
+
let interned = ruby.str_new(s).to_interned_str();
|
|
426
|
+
interned.as_value()
|
|
427
|
+
})
|
|
428
|
+
}
|
|
429
|
+
}
|
|
430
|
+
|
|
431
|
+
/// Build a frozen, owned Ruby `String` (a normal copy that is then frozen).
|
|
432
|
+
fn frozen_copy(ruby: &Ruby, s: &str) -> Value {
|
|
433
|
+
let string = ruby.str_new(s);
|
|
434
|
+
string.freeze();
|
|
435
|
+
string.as_value()
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
/// Build a frozen Ruby `String` that points directly at `bytes` without copying.
|
|
439
|
+
///
|
|
440
|
+
/// # Safety
|
|
441
|
+
/// `bytes` must remain valid and immutable for the entire lifetime of the
|
|
442
|
+
/// returned Ruby string. Because Ruby code may retain the string for an
|
|
443
|
+
/// unbounded time, this requires `bytes: &'static str`. The backing buffer must
|
|
444
|
+
/// additionally be NUL-terminated at `bytes.as_ptr()[bytes.len()]`, which Ruby's
|
|
445
|
+
/// static-string constructor requires; [`leak_nul_terminated`] guarantees this.
|
|
446
|
+
/// The returned string is frozen so the shared, immutable backing is never
|
|
447
|
+
/// mutated in place.
|
|
448
|
+
unsafe fn static_ruby_string(ruby: &Ruby, bytes: &'static str) -> Value {
|
|
449
|
+
// The static-string constructor reads bytes[len] expecting a NUL; check the
|
|
450
|
+
// byte we are about to rely on rather than trusting the caller's comment.
|
|
451
|
+
debug_assert_eq!(
|
|
452
|
+
*bytes.as_ptr().add(bytes.len()),
|
|
453
|
+
0,
|
|
454
|
+
"static_ruby_string requires a NUL terminator at bytes[len]"
|
|
455
|
+
);
|
|
456
|
+
let string = ruby.str_new_lit(bytes.as_ptr() as *const c_char, bytes.len() as c_long);
|
|
457
|
+
string.freeze();
|
|
458
|
+
string.as_value()
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::{Mutex, OnceLock};

    /// Lock that serializes every test touching the process-wide registry;
    /// the test harness runs tests on several threads.
    fn shared_leak_test_lock() -> &'static Mutex<()> {
        static LOCK: OnceLock<Mutex<()>> = OnceLock::new();
        LOCK.get_or_init(Mutex::default)
    }

    /// Forget all registry entries and accounting. Buffers leaked earlier stay
    /// allocated; only the bookkeeping is cleared.
    fn reset_shared_leak_registry() {
        let mut registry = lock_shared_leak_registry();
        registry.leaked_bytes = 0;
        registry.entries.clear();
    }

    /// Number of values currently held by the process-wide registry.
    fn shared_leak_registry_len() -> usize {
        let registry = lock_shared_leak_registry();
        registry.entries.len()
    }

    /// Run `test` alone (with respect to other registry tests) against a
    /// registry that is empty before and after the call.
    fn with_clean_shared_leak_registry(test: impl FnOnce()) {
        let _serialized = match shared_leak_test_lock().lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        reset_shared_leak_registry();
        test();
        reset_shared_leak_registry();
    }

    /// Interner using the default per-read budget.
    fn default_budget_interner() -> SharedLeakInterner {
        SharedLeakInterner::new(DEFAULT_SHARED_MAX_ENTRIES, DEFAULT_SHARED_MAX_VALUE_BYTES)
    }

    #[test]
    fn interns_distinct_values_once_and_reuses_pointer() {
        with_clean_shared_leak_registry(|| {
            let mut interner = default_budget_interner();

            let first = interner.intern("repeat").unwrap();
            let second = interner.intern("repeat").unwrap();
            let other = interner.intern("different").unwrap();

            // The repeated value resolves to the very same leaked buffer.
            assert_eq!(first.as_ptr(), second.as_ptr());
            assert_eq!(first, "repeat");
            // A different value lives in its own buffer with its own content.
            assert_ne!(first.as_ptr(), other.as_ptr());
            assert_eq!(other, "different");
            assert_eq!(shared_leak_registry_len(), 2);
        });
    }

    #[test]
    fn leak_is_bounded_and_falls_back_past_the_cap() {
        with_clean_shared_leak_registry(|| {
            let max_entries = 4;
            let mut interner = SharedLeakInterner::new(max_entries, DEFAULT_SHARED_MAX_VALUE_BYTES);

            for index in 0..max_entries {
                let value = format!("value-{index}");
                assert!(interner.intern(&value).is_some());
            }
            assert_eq!(shared_leak_registry_len(), max_entries);

            // One more distinct value exceeds the cap: not leaked, caller copies.
            assert!(interner.intern("over-the-bound").is_none());
            assert_eq!(shared_leak_registry_len(), max_entries);

            // Values admitted earlier keep resolving after the cap was reached.
            assert!(interner.intern("value-0").is_some());
            assert_eq!(shared_leak_registry_len(), max_entries);
        });
    }

    #[test]
    fn oversized_values_are_not_leaked() {
        with_clean_shared_leak_registry(|| {
            let mut interner = default_budget_interner();

            let at_limit = "x".repeat(DEFAULT_SHARED_MAX_VALUE_BYTES);
            let over_limit = "x".repeat(DEFAULT_SHARED_MAX_VALUE_BYTES + 1);

            assert!(interner.intern(&at_limit).is_some());
            assert!(interner.intern(&over_limit).is_none());
            // The registry holds the in-bound value only.
            assert_eq!(shared_leak_registry_len(), 1);
            assert!(
                interner.fallbacks.is_empty(),
                "oversized fallbacks must not be retained in the per-read cache"
            );
            assert_eq!(interner.fallback_bytes, 0);
        });
    }

    #[test]
    fn fallback_cache_retained_bytes_are_bounded() {
        with_clean_shared_leak_registry(|| {
            // Fill the single-entry budget so every later value falls back.
            let mut first = SharedLeakInterner::new(1, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(first.intern("already-leaked").is_some());

            let mut second = SharedLeakInterner::new(1, DEFAULT_SHARED_MAX_VALUE_BYTES);
            let suffix = "x".repeat(1024);
            let attempts = SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX * 2;
            for index in 0..attempts {
                let value = format!("{index:08}-{suffix}");
                assert!(second.intern(&value).is_none());
            }

            assert!(second.fallbacks.len() <= SHARED_FALLBACK_CACHE_ENTRY_COUNT_MAX);
            assert!(second.fallback_bytes <= SHARED_FALLBACK_CACHE_RETAINED_BYTES_MAX);
        });
    }

    #[test]
    fn shared_leak_budget_is_process_wide_for_matching_budget() {
        with_clean_shared_leak_registry(|| {
            let mut first = SharedLeakInterner::new(4, DEFAULT_SHARED_MAX_VALUE_BYTES);
            for index in 0..4 {
                let value = format!("reader-one-{index}");
                assert!(first.intern(&value).is_some());
            }

            // A second reader with the same budget can reuse existing leaks
            // but cannot add a new one.
            let mut second = SharedLeakInterner::new(4, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(second.intern("reader-one-0").is_some());
            assert!(
                second.intern("reader-two-new").is_none(),
                "the shared leak budget must not reset for each reader"
            );
        });
    }

    #[test]
    fn current_read_value_bound_applies_to_registry_hits() {
        with_clean_shared_leak_registry(|| {
            let value = "larger-than-second-budget";
            let exact_fit = value.len();

            let mut first = SharedLeakInterner::new(4, exact_fit);
            assert!(first.intern(value).is_some());
            assert_eq!(shared_leak_registry_len(), 1);

            // The value is registered, but too long for the second read.
            let mut second = SharedLeakInterner::new(4, exact_fit - 1);
            assert!(second.intern(value).is_none());
            assert_eq!(second.entries.len(), 0);
            assert_eq!(shared_leak_registry_len(), 1);
        });
    }

    #[test]
    fn current_read_entry_bound_applies_to_registry_hits() {
        with_clean_shared_leak_registry(|| {
            let mut first = SharedLeakInterner::new(4, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(first.intern("already-leaked-one").is_some());
            assert!(first.intern("already-leaked-two").is_some());
            assert_eq!(shared_leak_registry_len(), 2);

            // Both values are registered, but the second read may use only one.
            let mut second = SharedLeakInterner::new(1, DEFAULT_SHARED_MAX_VALUE_BYTES);
            assert!(second.intern("already-leaked-one").is_some());
            assert!(second.intern("already-leaked-two").is_none());
            assert_eq!(second.entries.len(), 1);
            assert_eq!(shared_leak_registry_len(), 2);
        });
    }

    #[test]
    fn mode_parses_and_round_trips() {
        let all_modes = [
            StringStorageMode::Copy,
            StringStorageMode::Intern,
            StringStorageMode::Shared,
        ];
        for mode in all_modes {
            let rendered = mode.to_string();
            assert_eq!(StringStorageMode::from_str(&rendered).unwrap(), mode);
        }
        assert_eq!(StringStorageMode::default(), StringStorageMode::Copy);
        assert!(StringStorageMode::from_str("nonsense").is_err());
    }
}
|