parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,115 @@
1
+ use std::collections::HashSet;
2
+ use triomphe::Arc;
3
+
4
/// Number of distinct strings kept for reuse by default (`string_cache: true`).
///
/// The limit is an intentional memory bound that favours safety over hit rate, so
/// a high-cardinality column cannot grow the cache indefinitely. When the cache is
/// full, new distinct strings are simply not retained, while values cached earlier
/// keep sharing their storage. Callers may pick another limit with
/// `string_cache: <Integer>`. Statistics report the cumulative miss count instead
/// of claiming an exact distinct-value cardinality once the bounded cache is full.
pub const DEFAULT_STRING_CACHE_CAPACITY: usize = 100;

/// Largest capacity accepted for `string_cache:`.
///
/// Every retained entry costs hash-table metadata plus one shared string
/// allocation; capping the capacity stops a caller-supplied value from turning
/// into an eager, unbounded allocation.
pub const STRING_CACHE_CAPACITY_MAX: usize = 64 * 1024;

/// Largest value, in bytes, that may be retained in the cache.
///
/// Bigger values are still written correctly; they are just not kept for reuse by
/// later rows.
pub const STRING_CACHE_VALUE_BYTES_MAX: usize = 4 * 1024;

/// Upper bound on the total UTF-8 bytes of retained string contents (16 MiB).
///
/// Hash-table metadata is not counted here; it is bounded separately through
/// `STRING_CACHE_CAPACITY_MAX`.
pub const STRING_CACHE_RETAINED_BYTES_MAX: usize = 16 << 20;
25
+
26
+ /// A cache for reusing string storage to reduce memory usage
27
+ /// when there are many repeated strings
28
+ #[derive(Debug)]
29
+ pub struct StringCache {
30
+ capacity: usize,
31
+ entries: HashSet<Arc<str>>,
32
+ retained_bytes: usize,
33
+ hits: usize,
34
+ misses: usize,
35
+ }
36
+
37
+ impl StringCache {
38
+ /// Create a new string cache that retains at most `capacity` distinct
39
+ /// strings. The caller only constructs a cache when caching is enabled; a
40
+ /// disabled cache is represented by not creating one at all.
41
+ pub fn new(capacity: usize) -> Self {
42
+ debug_assert!(capacity > 0);
43
+ let capacity = capacity.min(STRING_CACHE_CAPACITY_MAX);
44
+
45
+ Self {
46
+ capacity,
47
+ entries: HashSet::with_capacity(capacity),
48
+ retained_bytes: 0,
49
+ hits: 0,
50
+ misses: 0,
51
+ }
52
+ }
53
+
54
+ /// Intern a string, returning shared storage for repeated values.
55
+ pub fn intern(&mut self, s: String) -> Arc<str> {
56
+ debug_assert!(self.entries.len() <= self.capacity);
57
+
58
+ if let Some(interned) = self.entries.get(s.as_str()) {
59
+ self.hits += 1;
60
+ return Arc::clone(interned);
61
+ }
62
+
63
+ let interned = Arc::<str>::from(s);
64
+ self.misses += 1;
65
+
66
+ let retained_bytes_next = self.retained_bytes.saturating_add(interned.len());
67
+ if self.entries.len() < self.capacity
68
+ && interned.len() <= STRING_CACHE_VALUE_BYTES_MAX
69
+ && retained_bytes_next <= STRING_CACHE_RETAINED_BYTES_MAX
70
+ {
71
+ self.retained_bytes = retained_bytes_next;
72
+ self.entries.insert(Arc::clone(&interned));
73
+ }
74
+
75
+ debug_assert!(self.entries.len() <= self.capacity);
76
+ debug_assert!(self.retained_bytes <= STRING_CACHE_RETAINED_BYTES_MAX);
77
+ interned
78
+ }
79
+
80
+ /// Get cache statistics
81
+ pub fn stats(&self) -> CacheStats {
82
+ debug_assert!(self.entries.len() <= self.capacity);
83
+
84
+ CacheStats {
85
+ size: self.entries.len(),
86
+ hits: self.hits,
87
+ misses: self.misses,
88
+ hit_rate: if self.hits + self.misses > 0 {
89
+ self.hits as f64 / (self.hits + self.misses) as f64
90
+ } else {
91
+ 0.0
92
+ },
93
+ }
94
+ }
95
+
96
+ /// Clear the cache
97
+ pub fn clear(&mut self) {
98
+ self.entries.clear();
99
+ self.retained_bytes = 0;
100
+ self.hits = 0;
101
+ self.misses = 0;
102
+ }
103
+ }
104
+
105
/// Snapshot of a `StringCache`'s counters, as produced by `StringCache::stats`.
///
/// The type is plain data, so it is cheap to copy and can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct CacheStats {
    /// Number of distinct strings retained when the snapshot was taken.
    pub size: usize,
    /// Number of `intern` calls answered from the cache.
    pub hits: usize,
    /// Number of `intern` calls that were not answered from the cache.
    pub misses: usize,
    /// `hits / (hits + misses)`, or `0.0` when no call has been recorded.
    pub hit_rate: f64,
}
112
+
113
+ #[cfg(test)]
114
+ #[path = "./string_cache_test.rs"]
115
+ mod tests;
@@ -0,0 +1,122 @@
1
+ use super::{
2
+ StringCache, DEFAULT_STRING_CACHE_CAPACITY, STRING_CACHE_CAPACITY_MAX,
3
+ STRING_CACHE_RETAINED_BYTES_MAX, STRING_CACHE_VALUE_BYTES_MAX,
4
+ };
5
+ use triomphe::Arc;
6
+
7
/// Interning the same text twice must return one shared allocation and record a
/// miss for the first sighting followed by a hit for the repeat.
#[test]
fn cache_reuses_storage_and_counts_hits() {
    let mut interner = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    let handles: Vec<_> = (0..2)
        .map(|_| interner.intern(String::from("repeat")))
        .collect();
    assert!(Arc::ptr_eq(&handles[0], &handles[1]));

    let snapshot = interner.stats();
    assert_eq!((snapshot.size, snapshot.hits, snapshot.misses), (1, 1, 1));
    assert_eq!(snapshot.hit_rate, 0.5);
}
22
+
23
/// Two caches never share storage or counters, even when they see identical text.
#[test]
fn cache_stats_are_instance_local() {
    let mut cache_a = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);
    let mut cache_b = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    let from_a = cache_a.intern(String::from("shared"));
    let from_a_repeat = cache_a.intern(String::from("shared"));
    let from_b = cache_b.intern(String::from("shared"));

    // Same cache: shared storage. Different cache: independent storage.
    assert!(Arc::ptr_eq(&from_a, &from_a_repeat));
    assert!(!Arc::ptr_eq(&from_a, &from_b));

    let stats_a = cache_a.stats();
    assert_eq!((stats_a.size, stats_a.hits, stats_a.misses), (1, 1, 1));

    let stats_b = cache_b.stats();
    assert_eq!((stats_b.size, stats_b.hits, stats_b.misses), (1, 0, 1));
}
45
+
46
/// Once the cache holds `capacity` entries, further distinct values are handed
/// out but not retained, so repeating one of them yields separate allocations.
#[test]
fn cache_retention_is_bounded() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    // Fill the cache to its entry limit with distinct values.
    (0..DEFAULT_STRING_CACHE_CAPACITY).for_each(|n| {
        cache.intern(format!("value-{n}"));
    });

    let overflow_a = cache.intern(String::from("outside-bound"));
    let overflow_b = cache.intern(String::from("outside-bound"));
    assert!(!Arc::ptr_eq(&overflow_a, &overflow_b));

    let snapshot = cache.stats();
    assert_eq!(snapshot.size, DEFAULT_STRING_CACHE_CAPACITY);
    assert_eq!(snapshot.hits, 0);
    assert_eq!(snapshot.misses, DEFAULT_STRING_CACHE_CAPACITY + 2);
}
64
+
65
/// A requested capacity above the ceiling is clamped down to the ceiling.
#[test]
fn cache_capacity_has_a_hard_ceiling() {
    let requested = STRING_CACHE_CAPACITY_MAX + 1;
    let effective = StringCache::new(requested).capacity;

    assert_eq!(effective, STRING_CACHE_CAPACITY_MAX);
}
71
+
72
/// A value one byte over the per-value limit is returned but never cached: both
/// calls miss, nothing is stored, and no bytes are accounted as retained.
#[test]
fn oversized_values_are_not_retained() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);
    let oversized: String = std::iter::repeat('x')
        .take(STRING_CACHE_VALUE_BYTES_MAX + 1)
        .collect();

    let first = cache.intern(oversized.clone());
    let second = cache.intern(oversized);
    let snapshot = cache.stats();

    assert!(!Arc::ptr_eq(&first, &second));
    assert_eq!((snapshot.size, snapshot.hits, snapshot.misses), (0, 0, 2));
    assert_eq!(cache.retained_bytes, 0);
}
87
+
88
/// Filling the cache with maximum-size values up to the total-byte budget leaves
/// no room for another value, even though the entry-count limit is not reached.
#[test]
fn retained_bytes_are_bounded() {
    let mut cache = StringCache::new(STRING_CACHE_CAPACITY_MAX);
    let entry_len = STRING_CACHE_VALUE_BYTES_MAX;
    let entries_to_fill = STRING_CACHE_RETAINED_BYTES_MAX / entry_len;

    // Each value is an 8-digit index, a dash, and padding: exactly `entry_len` bytes.
    let padding = "x".repeat(entry_len - 9);
    for n in 0..entries_to_fill {
        cache.intern(format!("{n:08}-{padding}"));
    }
    assert_eq!(cache.retained_bytes, STRING_CACHE_RETAINED_BYTES_MAX);

    let extra_a = cache.intern("y".repeat(entry_len));
    let extra_b = cache.intern("y".repeat(entry_len));
    let snapshot = cache.stats();

    assert!(!Arc::ptr_eq(&extra_a, &extra_b));
    assert_eq!(snapshot.size, entries_to_fill);
    assert_eq!(cache.retained_bytes, STRING_CACHE_RETAINED_BYTES_MAX);
}
108
+
109
/// `clear` drops every retained entry and zeroes the hit/miss counters, which in
/// turn brings the reported hit rate back to `0.0`.
#[test]
fn clear_removes_entries_and_resets_counts() {
    let mut cache = StringCache::new(DEFAULT_STRING_CACHE_CAPACITY);

    // One miss followed by one hit, so there is state to wipe.
    for _ in 0..2 {
        cache.intern(String::from("repeat"));
    }
    cache.clear();

    let snapshot = cache.stats();
    assert_eq!((snapshot.size, snapshot.hits, snapshot.misses), (0, 0, 0));
    assert_eq!(snapshot.hit_rate, 0.0);
}