parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67):
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,220 @@
1
+ use bytes::Bytes;
2
+ use indexmap::IndexMap;
3
+ use num::BigInt;
4
+ use triomphe::Arc;
5
+ use uuid::Uuid;
6
+
7
+ #[derive(Debug, Clone, PartialEq, Eq)]
8
+ pub enum ParquetValue {
9
+ // Numeric types
10
+ Int8(i8),
11
+ Int16(i16),
12
+ Int32(i32),
13
+ Int64(i64),
14
+ UInt8(u8),
15
+ UInt16(u16),
16
+ UInt32(u32),
17
+ UInt64(u64),
18
+ Float16(ordered_float::OrderedFloat<f32>), // f16 converted to f32
19
+ Float32(ordered_float::OrderedFloat<f32>),
20
+ Float64(ordered_float::OrderedFloat<f64>),
21
+
22
+ // Basic types
23
+ Boolean(bool),
24
+ String(Arc<str>),
25
+ Bytes(Bytes),
26
+ Uuid(Uuid),
27
+
28
+ // Date/Time types
29
+ Date32(i32), // Days since epoch
30
+ Date64(i64), // Milliseconds since epoch
31
+
32
+ // Decimal types
33
+ Decimal128(i128, i8), // value, scale
34
+ Decimal256(BigInt, i8), // Using BigInt instead of arrow_buffer::i256 for pure Rust
35
+
36
+ // Timestamp types - all store microseconds since epoch with optional timezone
37
+ TimestampSecond(i64, Option<Arc<str>>),
38
+ TimestampMillis(i64, Option<Arc<str>>),
39
+ TimestampMicros(i64, Option<Arc<str>>),
40
+ TimestampNanos(i64, Option<Arc<str>>),
41
+
42
+ // Time types
43
+ TimeMillis(i32), // Time of day in milliseconds since midnight
44
+ TimeMicros(i64), // Time of day in microseconds since midnight
45
+ TimeNanos(i64), // Time of day in nanoseconds since midnight
46
+
47
+ // Complex types
48
+ List(Vec<ParquetValue>),
49
+ Map(Vec<(ParquetValue, ParquetValue)>), // Using Vec of tuples for deterministic ordering
50
+ Record(IndexMap<Arc<str>, ParquetValue>), // For struct/record types, preserves field order
51
+
52
+ // Null value
53
+ Null,
54
+ }
55
+
56
+ impl std::hash::Hash for ParquetValue {
57
+ fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
58
+ std::mem::discriminant(self).hash(state);
59
+ match self {
60
+ ParquetValue::Int8(i) => i.hash(state),
61
+ ParquetValue::Int16(i) => i.hash(state),
62
+ ParquetValue::Int32(i) => i.hash(state),
63
+ ParquetValue::Int64(i) => i.hash(state),
64
+ ParquetValue::UInt8(i) => i.hash(state),
65
+ ParquetValue::UInt16(i) => i.hash(state),
66
+ ParquetValue::UInt32(i) => i.hash(state),
67
+ ParquetValue::UInt64(i) => i.hash(state),
68
+ ParquetValue::Float16(f) => f.hash(state),
69
+ ParquetValue::Float32(f) => f.hash(state),
70
+ ParquetValue::Float64(f) => f.hash(state),
71
+ ParquetValue::Boolean(b) => b.hash(state),
72
+ ParquetValue::String(s) => s.hash(state),
73
+ ParquetValue::Bytes(b) => b.hash(state),
74
+ ParquetValue::Uuid(u) => u.hash(state),
75
+ ParquetValue::Date32(d) => d.hash(state),
76
+ ParquetValue::Date64(d) => d.hash(state),
77
+ ParquetValue::Decimal128(d, scale) => {
78
+ d.hash(state);
79
+ scale.hash(state);
80
+ }
81
+ ParquetValue::Decimal256(d, scale) => {
82
+ d.hash(state);
83
+ scale.hash(state);
84
+ }
85
+ ParquetValue::TimestampSecond(ts, tz) => {
86
+ ts.hash(state);
87
+ tz.hash(state);
88
+ }
89
+ ParquetValue::TimestampMillis(ts, tz) => {
90
+ ts.hash(state);
91
+ tz.hash(state);
92
+ }
93
+ ParquetValue::TimestampMicros(ts, tz) => {
94
+ ts.hash(state);
95
+ tz.hash(state);
96
+ }
97
+ ParquetValue::TimestampNanos(ts, tz) => {
98
+ ts.hash(state);
99
+ tz.hash(state);
100
+ }
101
+ ParquetValue::TimeMillis(t) => t.hash(state),
102
+ ParquetValue::TimeMicros(t) => t.hash(state),
103
+ ParquetValue::TimeNanos(t) => t.hash(state),
104
+ ParquetValue::List(l) => l.hash(state),
105
+ ParquetValue::Map(m) => m.hash(state),
106
+ ParquetValue::Record(r) => {
107
+ r.len().hash(state);
108
+ let mut entries = r.iter().collect::<Vec<_>>();
109
+ entries.sort_by(|(left_key, _), (right_key, _)| {
110
+ left_key.as_ref().cmp(right_key.as_ref())
111
+ });
112
+ for (k, v) in entries {
113
+ k.hash(state);
114
+ v.hash(state);
115
+ }
116
+ }
117
+ ParquetValue::Null => 0_i32.hash(state),
118
+ }
119
+ }
120
+ }
121
+
122
+ impl ParquetValue {
123
+ /// Check if the value is null
124
+ pub fn is_null(&self) -> bool {
125
+ matches!(self, ParquetValue::Null)
126
+ }
127
+
128
+ /// Get the type name of the value
129
+ pub fn type_name(&self) -> &'static str {
130
+ match self {
131
+ ParquetValue::Int8(_) => "Int8",
132
+ ParquetValue::Int16(_) => "Int16",
133
+ ParquetValue::Int32(_) => "Int32",
134
+ ParquetValue::Int64(_) => "Int64",
135
+ ParquetValue::UInt8(_) => "UInt8",
136
+ ParquetValue::UInt16(_) => "UInt16",
137
+ ParquetValue::UInt32(_) => "UInt32",
138
+ ParquetValue::UInt64(_) => "UInt64",
139
+ ParquetValue::Float16(_) => "Float16",
140
+ ParquetValue::Float32(_) => "Float32",
141
+ ParquetValue::Float64(_) => "Float64",
142
+ ParquetValue::Boolean(_) => "Boolean",
143
+ ParquetValue::String(_) => "String",
144
+ ParquetValue::Bytes(_) => "Bytes",
145
+ ParquetValue::Uuid(_) => "Uuid",
146
+ ParquetValue::Date32(_) => "Date32",
147
+ ParquetValue::Date64(_) => "Date64",
148
+ ParquetValue::Decimal128(_, _) => "Decimal128",
149
+ ParquetValue::Decimal256(_, _) => "Decimal256",
150
+ ParquetValue::TimestampSecond(_, _) => "TimestampSecond",
151
+ ParquetValue::TimestampMillis(_, _) => "TimestampMillis",
152
+ ParquetValue::TimestampMicros(_, _) => "TimestampMicros",
153
+ ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
154
+ ParquetValue::TimeMillis(_) => "TimeMillis",
155
+ ParquetValue::TimeMicros(_) => "TimeMicros",
156
+ ParquetValue::TimeNanos(_) => "TimeNanos",
157
+ ParquetValue::List(_) => "List",
158
+ ParquetValue::Map(_) => "Map",
159
+ ParquetValue::Record(_) => "Record",
160
+ ParquetValue::Null => "Null",
161
+ }
162
+ }
163
+ }
164
+
165
#[cfg(test)]
mod tests {
    use super::*;
    use ordered_float::OrderedFloat;

    /// A scalar equals an identically built one, is not null, and reports
    /// its variant name.
    #[test]
    fn test_value_creation() {
        let answer = ParquetValue::Int32(42);

        assert_eq!(answer, ParquetValue::Int32(42));
        assert!(!answer.is_null());
        assert_eq!(answer.type_name(), "Int32");
    }

    /// `Null` is recognised by `is_null` and named "Null".
    #[test]
    fn test_null_value() {
        let nothing = ParquetValue::Null;

        assert!(nothing.is_null());
        assert_eq!(nothing.type_name(), "Null");
    }

    /// Two `Float32` values built from the same literal compare equal.
    #[test]
    fn test_float_equality() {
        let build = || ParquetValue::Float32(OrderedFloat(3.5));

        assert_eq!(build(), build());
    }

    /// Nested variants report their own names regardless of contents.
    #[test]
    fn test_complex_types() {
        let numbers = ParquetValue::List((1..=3).map(ParquetValue::Int32).collect());
        assert_eq!(numbers.type_name(), "List");

        let entry = (
            ParquetValue::String(Arc::from("key")),
            ParquetValue::Int32(42),
        );
        let lookup = ParquetValue::Map(vec![entry]);
        assert_eq!(lookup.type_name(), "Map");
    }

    /// Values inserted into a `HashSet` are found again via equal values,
    /// and a different value is not.
    #[test]
    fn test_hash_consistency() {
        use std::collections::HashSet;

        let seen: HashSet<ParquetValue> = vec![
            ParquetValue::Int32(42),
            ParquetValue::String(Arc::from("hello")),
        ]
        .into_iter()
        .collect();

        assert!(seen.contains(&ParquetValue::Int32(42)));
        assert!(seen.contains(&ParquetValue::String(Arc::from("hello"))));
        assert!(!seen.contains(&ParquetValue::Int32(43)));
    }
}