parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
use std::collections::HashSet;
|
|
2
|
+
use triomphe::Arc;
|
|
3
|
+
|
|
4
|
+
/// Number of distinct strings kept for reuse when the caller passes
/// `string_cache: true` without an explicit size.
///
/// The limit is an intentional memory bound that favours safety over hit rate:
/// a high-cardinality column cannot make the cache grow forever. When the cache
/// is full, new distinct strings are simply not retained, while values that are
/// already cached keep sharing their storage. Callers may supply their own limit
/// with `string_cache: <Integer>`. Statistics report the cumulative miss count
/// instead of claiming an exact distinct cardinality once the bounded cache has
/// filled up.
pub const DEFAULT_STRING_CACHE_CAPACITY: usize = 100;

/// Upper bound applied to any capacity requested through `string_cache:`.
///
/// Every retained entry costs hash-table metadata plus one shared string
/// allocation, so this ceiling stops a caller-provided capacity from turning
/// into an eager, effectively unbounded allocation.
pub const STRING_CACHE_CAPACITY_MAX: usize = 64 * 1024;

/// Largest value, in bytes, that may be retained as a cache entry.
///
/// Values above this size are still written correctly; they are just not kept
/// around for reuse by later rows.
pub const STRING_CACHE_VALUE_BYTES_MAX: usize = 4 * 1024;

/// Budget for the sum of retained UTF-8 string bytes (16 MiB).
///
/// Hash-table metadata is not counted here; it is bounded separately through
/// `STRING_CACHE_CAPACITY_MAX`.
pub const STRING_CACHE_RETAINED_BYTES_MAX: usize = 16 << 20;
|
|
25
|
+
|
|
26
|
+
/// A cache for reusing string storage to reduce memory usage
|
|
27
|
+
/// when there are many repeated strings
|
|
28
|
+
#[derive(Debug)]
|
|
29
|
+
pub struct StringCache {
|
|
30
|
+
capacity: usize,
|
|
31
|
+
entries: HashSet<Arc<str>>,
|
|
32
|
+
retained_bytes: usize,
|
|
33
|
+
hits: usize,
|
|
34
|
+
misses: usize,
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
impl StringCache {
|
|
38
|
+
/// Create a new string cache that retains at most `capacity` distinct
|
|
39
|
+
/// strings. The caller only constructs a cache when caching is enabled; a
|
|
40
|
+
/// disabled cache is represented by not creating one at all.
|
|
41
|
+
pub fn new(capacity: usize) -> Self {
|
|
42
|
+
debug_assert!(capacity > 0);
|
|
43
|
+
let capacity = capacity.min(STRING_CACHE_CAPACITY_MAX);
|
|
44
|
+
|
|
45
|
+
Self {
|
|
46
|
+
capacity,
|
|
47
|
+
entries: HashSet::with_capacity(capacity),
|
|
48
|
+
retained_bytes: 0,
|
|
49
|
+
hits: 0,
|
|
50
|
+
misses: 0,
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/// Intern a string, returning shared storage for repeated values.
|
|
55
|
+
pub fn intern(&mut self, s: String) -> Arc<str> {
|
|
56
|
+
debug_assert!(self.entries.len() <= self.capacity);
|
|
57
|
+
|
|
58
|
+
if let Some(interned) = self.entries.get(s.as_str()) {
|
|
59
|
+
self.hits += 1;
|
|
60
|
+
return Arc::clone(interned);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
let interned = Arc::<str>::from(s);
|
|
64
|
+
self.misses += 1;
|
|
65
|
+
|
|
66
|
+
let retained_bytes_next = self.retained_bytes.saturating_add(interned.len());
|
|
67
|
+
if self.entries.len() < self.capacity
|
|
68
|
+
&& interned.len() <= STRING_CACHE_VALUE_BYTES_MAX
|
|
69
|
+
&& retained_bytes_next <= STRING_CACHE_RETAINED_BYTES_MAX
|
|
70
|
+
{
|
|
71
|
+
self.retained_bytes = retained_bytes_next;
|
|
72
|
+
self.entries.insert(Arc::clone(&interned));
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
debug_assert!(self.entries.len() <= self.capacity);
|
|
76
|
+
debug_assert!(self.retained_bytes <= STRING_CACHE_RETAINED_BYTES_MAX);
|
|
77
|
+
interned
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/// Get cache statistics
|
|
81
|
+
pub fn stats(&self) -> CacheStats {
|
|
82
|
+
debug_assert!(self.entries.len() <= self.capacity);
|
|
83
|
+
|
|
84
|
+
CacheStats {
|
|
85
|
+
size: self.entries.len(),
|
|
86
|
+
hits: self.hits,
|
|
87
|
+
misses: self.misses,
|
|
88
|
+
hit_rate: if self.hits + self.misses > 0 {
|
|
89
|
+
self.hits as f64 / (self.hits + self.misses) as f64
|
|
90
|
+
} else {
|
|
91
|
+
0.0
|
|
92
|
+
},
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/// Clear the cache
|
|
97
|
+
pub fn clear(&mut self) {
|
|
98
|
+
self.entries.clear();
|
|
99
|
+
self.retained_bytes = 0;
|
|
100
|
+
self.hits = 0;
|
|
101
|
+
self.misses = 0;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/// Point-in-time statistics reported by a string cache.
///
/// This is plain data, so it is cheap to copy and can be compared directly,
/// which keeps snapshots easy to store, diff and assert on.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CacheStats {
    /// Number of strings currently retained by the cache.
    pub size: usize,
    /// Number of lookups answered from the cache.
    pub hits: usize,
    /// Number of lookups that were not answered from the cache.
    pub misses: usize,
    /// Fraction of lookups that were hits; `0.0` when there were no lookups.
    pub hit_rate: f64,
}
|
|
112
|
+
|
|
113
|
+
#[cfg(test)]
|
|
114
|
+
#[path = "./string_cache_test.rs"]
|
|
115
|
+
mod tests;
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
use super::{
|
|
2
|
+
StringCache, DEFAULT_STRING_CACHE_CAPACITY, STRING_CACHE_CAPACITY_MAX,
|
|
3
|
+
STRING_CACHE_RETAINED_BYTES_MAX, STRING_CACHE_VALUE_BYTES_MAX,
|
|
4
|
+
};
|
|
5
|
+
use triomphe::Arc;
|
|
6
|
+
|
|
7
|
+
/// Interning the same text twice must hand back the same allocation and record
/// exactly one miss followed by one hit.
#[test]
fn cache_reuses_storage_and_counts_hits() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    let original = cache.intern(String::from("repeat"));
    let reused = cache.intern(String::from("repeat"));
    assert!(Arc::ptr_eq(&original, &reused));

    let snapshot = cache.stats();
    assert_eq!(snapshot.size, 1);
    assert_eq!(snapshot.hits, 1);
    assert_eq!(snapshot.misses, 1);
    assert_eq!(snapshot.hit_rate, 0.5);
}
|
|
22
|
+
|
|
23
|
+
/// Two caches never share storage or counters, even for identical text.
#[test]
fn cache_stats_are_instance_local() {
    let mut cache_a = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);
    let mut cache_b = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    let a_initial = cache_a.intern(String::from("shared"));
    let a_repeat = cache_a.intern(String::from("shared"));
    let b_initial = cache_b.intern(String::from("shared"));

    // Same cache: shared storage. Different caches: independent storage.
    assert!(Arc::ptr_eq(&a_initial, &a_repeat));
    assert!(!Arc::ptr_eq(&a_initial, &b_initial));

    let stats_a = cache_a.stats();
    assert_eq!(stats_a.size, 1);
    assert_eq!(stats_a.hits, 1);
    assert_eq!(stats_a.misses, 1);

    let stats_b = cache_b.stats();
    assert_eq!(stats_b.size, 1);
    assert_eq!(stats_b.hits, 0);
    assert_eq!(stats_b.misses, 1);
}
|
|
45
|
+
|
|
46
|
+
/// Once the cache holds `DEFAULT_STRING_CACHE_CAPACITY` entries, further
/// distinct values are returned but not retained, so repeating them misses.
#[test]
fn cache_retention_is_bounded() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    // Fill the cache exactly to its capacity with distinct values.
    (0..DEFAULT_STRING_CACHE_CAPACITY).for_each(|index| {
        cache.intern(format!("value-{index}"));
    });

    let overflow_first = cache.intern(String::from("outside-bound"));
    let overflow_second = cache.intern(String::from("outside-bound"));
    assert!(!Arc::ptr_eq(&overflow_first, &overflow_second));

    let snapshot = cache.stats();
    assert_eq!(snapshot.size, DEFAULT_STRING_CACHE_CAPACITY);
    assert_eq!(snapshot.hits, 0);
    assert_eq!(snapshot.misses, DEFAULT_STRING_CACHE_CAPACITY + 2);
}
|
|
64
|
+
|
|
65
|
+
/// A requested capacity above the ceiling is clamped to the ceiling.
#[test]
fn cache_capacity_has_a_hard_ceiling() {
    let requested = STRING_CACHE_CAPACITY_MAX + 1;
    let cache = StringCache::new(requested);

    assert_eq!(cache.capacity, STRING_CACHE_CAPACITY_MAX);
}
|
|
71
|
+
|
|
72
|
+
/// A value one byte over the per-value limit is returned on every call but is
/// never stored, so it never hits and never consumes the byte budget.
#[test]
fn oversized_values_are_not_retained() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);
    let oversized = "x".repeat(STRING_CACHE_VALUE_BYTES_MAX + 1);

    let initial = cache.intern(oversized.clone());
    let repeat = cache.intern(oversized);
    let snapshot = cache.stats();

    assert!(!Arc::ptr_eq(&initial, &repeat));
    assert_eq!(snapshot.size, 0);
    assert_eq!(snapshot.hits, 0);
    assert_eq!(snapshot.misses, 2);
    assert_eq!(cache.retained_bytes, 0);
}
|
|
87
|
+
|
|
88
|
+
/// Filling the byte budget exactly with maximum-size values leaves no room for
/// another entry, even though the entry-count capacity is not exhausted.
#[test]
fn retained_bytes_are_bounded() {
    let mut cache = StringCache::new(STRING_CACHE_CAPACITY_MAX);
    let value_len = STRING_CACHE_VALUE_BYTES_MAX;
    let entry_limit = STRING_CACHE_RETAINED_BYTES_MAX / value_len;

    // Each value is an 8-digit index, a dash, and padding: `value_len` bytes total.
    let padding = "x".repeat(value_len - 9);
    for index in 0..entry_limit {
        cache.intern(format!("{index:08}-{padding}"));
    }

    assert_eq!(cache.retained_bytes, STRING_CACHE_RETAINED_BYTES_MAX);

    let rejected_first = cache.intern("y".repeat(value_len));
    let rejected_second = cache.intern("y".repeat(value_len));
    let snapshot = cache.stats();

    assert!(!Arc::ptr_eq(&rejected_first, &rejected_second));
    assert_eq!(snapshot.size, entry_limit);
    assert_eq!(cache.retained_bytes, STRING_CACHE_RETAINED_BYTES_MAX);
}
|
|
108
|
+
|
|
109
|
+
/// `clear` empties the cache and zeroes every reported statistic.
#[test]
fn clear_removes_entries_and_resets_counts() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    // One miss followed by one hit, so there is state to wipe.
    for _ in 0..2 {
        cache.intern(String::from("repeat"));
    }
    cache.clear();

    let snapshot = cache.stats();
    assert_eq!(snapshot.size, 0);
    assert_eq!(snapshot.hits, 0);
    assert_eq!(snapshot.misses, 0);
    assert_eq!(snapshot.hit_rate, 0.0);
}
|