parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,308 @@
1
+ //! Test utilities for parquet-core
2
+
3
+ #[cfg(test)]
4
+ pub mod test {
5
+ use crate::{ParquetValue, PrimitiveType, Schema, SchemaBuilder, SchemaNode};
6
+ use indexmap::IndexMap;
7
+ use ordered_float::OrderedFloat;
8
+ use triomphe::Arc;
9
+
10
+ /// Create a simple schema for testing
11
+ pub fn sample_schema() -> Schema {
12
+ SchemaBuilder::new()
13
+ .with_root(SchemaNode::Struct {
14
+ name: "root".to_string(),
15
+ nullable: false,
16
+ fields: vec![
17
+ SchemaNode::Primitive {
18
+ name: "id".to_string(),
19
+ primitive_type: PrimitiveType::Int64,
20
+ nullable: false,
21
+ format: None,
22
+ },
23
+ SchemaNode::Primitive {
24
+ name: "name".to_string(),
25
+ primitive_type: PrimitiveType::String,
26
+ nullable: true,
27
+ format: None,
28
+ },
29
+ SchemaNode::Primitive {
30
+ name: "age".to_string(),
31
+ primitive_type: PrimitiveType::Int32,
32
+ nullable: true,
33
+ format: None,
34
+ },
35
+ SchemaNode::Primitive {
36
+ name: "salary".to_string(),
37
+ primitive_type: PrimitiveType::Float64,
38
+ nullable: true,
39
+ format: None,
40
+ },
41
+ ],
42
+ })
43
+ .build()
44
+ .unwrap()
45
+ }
46
+
47
+ /// Create a complex schema with nested types
48
+ pub fn complex_schema() -> Schema {
49
+ SchemaBuilder::new()
50
+ .with_root(SchemaNode::Struct {
51
+ name: "root".to_string(),
52
+ nullable: false,
53
+ fields: vec![
54
+ SchemaNode::Primitive {
55
+ name: "id".to_string(),
56
+ primitive_type: PrimitiveType::Int64,
57
+ nullable: false,
58
+ format: None,
59
+ },
60
+ SchemaNode::Struct {
61
+ name: "person".to_string(),
62
+ nullable: true,
63
+ fields: vec![
64
+ SchemaNode::Primitive {
65
+ name: "name".to_string(),
66
+ primitive_type: PrimitiveType::String,
67
+ nullable: false,
68
+ format: None,
69
+ },
70
+ SchemaNode::Primitive {
71
+ name: "age".to_string(),
72
+ primitive_type: PrimitiveType::Int32,
73
+ nullable: true,
74
+ format: None,
75
+ },
76
+ ],
77
+ },
78
+ SchemaNode::List {
79
+ name: "scores".to_string(),
80
+ nullable: true,
81
+ item: Box::new(SchemaNode::Primitive {
82
+ name: "item".to_string(),
83
+ primitive_type: PrimitiveType::Float32,
84
+ nullable: false,
85
+ format: None,
86
+ }),
87
+ },
88
+ ],
89
+ })
90
+ .build()
91
+ .unwrap()
92
+ }
93
+
94
+ /// Create sample row values matching the simple schema
95
+ pub fn sample_values() -> Vec<ParquetValue> {
96
+ vec![
97
+ ParquetValue::Int64(1),
98
+ ParquetValue::String(Arc::from("Alice")),
99
+ ParquetValue::Int32(30),
100
+ ParquetValue::Float64(OrderedFloat(75000.0)),
101
+ ]
102
+ }
103
+
104
+ /// Create multiple sample rows
105
+ pub fn sample_rows(count: usize) -> Vec<Vec<ParquetValue>> {
106
+ (0..count)
107
+ .map(|i| {
108
+ vec![
109
+ ParquetValue::Int64(i as i64),
110
+ ParquetValue::String(Arc::from(format!("Person{}", i))),
111
+ ParquetValue::Int32((20 + i % 50) as i32),
112
+ ParquetValue::Float64(OrderedFloat(50000.0 + (i as f64 * 1000.0))),
113
+ ]
114
+ })
115
+ .collect()
116
+ }
117
+
118
+ /// Create sample values with nulls
119
+ pub fn sample_values_with_nulls() -> Vec<ParquetValue> {
120
+ vec![
121
+ ParquetValue::Int64(2),
122
+ ParquetValue::Null,
123
+ ParquetValue::Int32(25),
124
+ ParquetValue::Null,
125
+ ]
126
+ }
127
+
128
+ /// Create complex values matching the complex schema
129
+ pub fn complex_values() -> Vec<ParquetValue> {
130
+ let mut person = IndexMap::new();
131
+ person.insert(Arc::from("name"), ParquetValue::String(Arc::from("Bob")));
132
+ person.insert(Arc::from("age"), ParquetValue::Int32(35));
133
+
134
+ vec![
135
+ ParquetValue::Int64(1),
136
+ ParquetValue::Record(person),
137
+ ParquetValue::List(vec![
138
+ ParquetValue::Float32(OrderedFloat(90.5)),
139
+ ParquetValue::Float32(OrderedFloat(87.3)),
140
+ ParquetValue::Float32(OrderedFloat(92.1)),
141
+ ]),
142
+ ]
143
+ }
144
+
145
+ /// Test data for all primitive types
146
+ pub fn all_primitive_values() -> Vec<(PrimitiveType, ParquetValue)> {
147
+ vec![
148
+ (PrimitiveType::Boolean, ParquetValue::Boolean(true)),
149
+ (PrimitiveType::Int8, ParquetValue::Int8(42)),
150
+ (PrimitiveType::Int16, ParquetValue::Int16(1000)),
151
+ (PrimitiveType::Int32, ParquetValue::Int32(100000)),
152
+ (PrimitiveType::Int64, ParquetValue::Int64(1000000000)),
153
+ (PrimitiveType::UInt8, ParquetValue::UInt8(200)),
154
+ (PrimitiveType::UInt16, ParquetValue::UInt16(50000)),
155
+ (PrimitiveType::UInt32, ParquetValue::UInt32(3000000000)),
156
+ (PrimitiveType::UInt64, ParquetValue::UInt64(10000000000)),
157
+ (
158
+ PrimitiveType::Float32,
159
+ ParquetValue::Float32(OrderedFloat(3.75)),
160
+ ),
161
+ (
162
+ PrimitiveType::Float64,
163
+ ParquetValue::Float64(OrderedFloat(2.625)),
164
+ ),
165
+ (
166
+ PrimitiveType::String,
167
+ ParquetValue::String(Arc::from("test string")),
168
+ ),
169
+ (
170
+ PrimitiveType::Binary,
171
+ ParquetValue::Bytes(bytes::Bytes::from(vec![0x01, 0x02, 0x03])),
172
+ ),
173
+ (PrimitiveType::Date32, ParquetValue::Date32(18628)), // 2021-01-01
174
+ (
175
+ PrimitiveType::TimeMillis,
176
+ ParquetValue::TimeMillis(43200000),
177
+ ), // 12:00:00
178
+ (
179
+ PrimitiveType::TimeMicros,
180
+ ParquetValue::TimeMicros(43200000000),
181
+ ), // 12:00:00
182
+ (
183
+ PrimitiveType::TimestampMillis(None),
184
+ ParquetValue::TimestampMillis(1609459200000, None),
185
+ ), // 2021-01-01 00:00:00
186
+ (
187
+ PrimitiveType::TimestampMicros(None),
188
+ ParquetValue::TimestampMicros(1609459200000000, None),
189
+ ), // 2021-01-01 00:00:00
190
+ (
191
+ PrimitiveType::Decimal128(10, 2),
192
+ ParquetValue::Decimal128(12345, 2),
193
+ ), // 123.45
194
+ ]
195
+ }
196
+
197
+ /// Create a temporary file path for testing
198
+ pub fn temp_file_path() -> String {
199
+ format!("/tmp/parquet_test_{}.parquet", uuid::Uuid::new_v4())
200
+ }
201
+
202
+ /// Compare two ParquetValues for equality, handling floating point comparison
203
+ pub fn values_equal(a: &ParquetValue, b: &ParquetValue) -> bool {
204
+ match (a, b) {
205
+ (ParquetValue::Float32(OrderedFloat(a)), ParquetValue::Float32(OrderedFloat(b))) => {
206
+ (a - b).abs() < f32::EPSILON
207
+ }
208
+ (ParquetValue::Float64(OrderedFloat(a)), ParquetValue::Float64(OrderedFloat(b))) => {
209
+ (a - b).abs() < f64::EPSILON
210
+ }
211
+ (ParquetValue::List(a), ParquetValue::List(b)) => {
212
+ a.len() == b.len() && a.iter().zip(b.iter()).all(|(a, b)| values_equal(a, b))
213
+ }
214
+ (ParquetValue::Map(a), ParquetValue::Map(b)) => {
215
+ a.len() == b.len()
216
+ && a.iter()
217
+ .zip(b.iter())
218
+ .all(|((k1, v1), (k2, v2))| values_equal(k1, k2) && values_equal(v1, v2))
219
+ }
220
+ (ParquetValue::Record(a), ParquetValue::Record(b)) => {
221
+ a.len() == b.len()
222
+ && a.iter()
223
+ .all(|(k, v)| b.get(k).map_or(false, |v2| values_equal(v, v2)))
224
+ }
225
+ _ => a == b,
226
+ }
227
+ }
228
+
229
+ /// Assert that two vectors of ParquetValues are equal
230
+ pub fn assert_values_equal(expected: &[ParquetValue], actual: &[ParquetValue]) {
231
+ assert_eq!(
232
+ expected.len(),
233
+ actual.len(),
234
+ "Value vectors have different lengths: expected {}, got {}",
235
+ expected.len(),
236
+ actual.len()
237
+ );
238
+
239
+ for (i, (e, a)) in expected.iter().zip(actual.iter()).enumerate() {
240
+ assert!(
241
+ values_equal(e, a),
242
+ "Values at index {} are not equal:\nExpected: {:?}\nActual: {:?}",
243
+ i,
244
+ e,
245
+ a
246
+ );
247
+ }
248
+ }
249
+ }
250
+
251
/// Sanity checks for the fixtures and comparison helpers in `test`.
#[cfg(test)]
mod test_utils_tests {
    use super::test::*;
    use crate::{ParquetValue, SchemaNode};
    use ordered_float::OrderedFloat;

    /// The simple schema is a struct named `root` with the four expected
    /// columns in order.
    #[test]
    fn test_sample_schema() {
        let schema = sample_schema();
        assert_eq!(schema.root.name(), "root");

        let fields = match &schema.root {
            SchemaNode::Struct { fields, .. } => fields,
            _ => panic!("Expected struct schema"),
        };

        assert_eq!(fields.len(), 4);
        let expected_names = ["id", "name", "age", "salary"];
        for (field, expected) in fields.iter().zip(expected_names.iter()) {
            assert_eq!(field.name(), *expected);
        }
    }

    /// The sample row has four values, starting with the id and the name.
    #[test]
    fn test_sample_values() {
        let row = sample_values();
        assert_eq!(row.len(), 4);

        assert!(matches!(row[0], ParquetValue::Int64(1)));
        assert!(matches!(&row[1], ParquetValue::String(s) if s.as_ref() == "Alice"));
    }

    /// `values_equal` accepts identical values and near-identical floats,
    /// and rejects different values.
    #[test]
    fn test_values_equal() {
        // Exact equality.
        let forty_two = ParquetValue::Int32(42);
        assert!(values_equal(&forty_two, &ParquetValue::Int32(42)));

        // Floating point equality within the tolerance.
        let one = ParquetValue::Float32(OrderedFloat(1.0));
        let almost_one = ParquetValue::Float32(OrderedFloat(1.0 + f32::EPSILON / 2.0));
        assert!(values_equal(&one, &almost_one));

        // List equality.
        let make_list = || ParquetValue::List(vec![ParquetValue::Int32(1), ParquetValue::Int32(2)]);
        assert!(values_equal(&make_list(), &make_list()));

        // Inequality.
        assert!(!values_equal(&forty_two, &ParquetValue::Int32(43)));
    }
}
@@ -0,0 +1,5 @@
1
+ //! Traits for abstracting Parquet operations
2
+
3
+ pub mod schema;
4
+
5
+ pub use schema::SchemaInspector;
@@ -0,0 +1,190 @@
1
+ use crate::SchemaNode;
2
+
3
+ /// Trait for schema introspection
4
+ ///
5
+ /// This trait provides methods for examining and querying schemas
6
+ /// without modifying them.
7
+ pub trait SchemaInspector {
8
+ /// Get the total number of fields (including nested)
9
+ fn field_count(&self) -> usize;
10
+
11
+ /// Get field by path (e.g., "address.city")
12
+ fn get_field_by_path(&self, path: &str) -> Option<&SchemaNode>;
13
+
14
+ /// Check if schema contains a specific field
15
+ fn has_field(&self, name: &str) -> bool;
16
+
17
+ /// Get all field paths in the schema
18
+ fn all_field_paths(&self) -> Vec<String>;
19
+ }
20
+
21
+ impl SchemaInspector for crate::Schema {
22
+ fn field_count(&self) -> usize {
23
+ count_fields(&self.root)
24
+ }
25
+
26
+ fn get_field_by_path(&self, path: &str) -> Option<&SchemaNode> {
27
+ get_field_by_path(&self.root, path)
28
+ }
29
+
30
+ fn has_field(&self, name: &str) -> bool {
31
+ self.get_field_by_path(name).is_some()
32
+ }
33
+
34
+ fn all_field_paths(&self) -> Vec<String> {
35
+ let mut paths = Vec::new();
36
+ collect_field_paths(&self.root, String::new(), &mut paths);
37
+ paths
38
+ }
39
+ }
40
+
41
+ // Helper functions for schema inspection
42
+ fn count_fields(node: &SchemaNode) -> usize {
43
+ match node {
44
+ SchemaNode::Struct { fields, .. } => 1 + fields.iter().map(count_fields).sum::<usize>(),
45
+ SchemaNode::List { item, .. } => 1 + count_fields(item),
46
+ SchemaNode::Map { key, value, .. } => 1 + count_fields(key) + count_fields(value),
47
+ SchemaNode::Primitive { .. } => 1,
48
+ }
49
+ }
50
+
51
+ fn get_field_by_path<'a>(node: &'a SchemaNode, path: &str) -> Option<&'a SchemaNode> {
52
+ if path.is_empty() {
53
+ return None;
54
+ }
55
+
56
+ let mut parts: Vec<&str> = path.split('.').collect();
57
+ // Strip a leading segment equal to the root's own name (so a path may carry
58
+ // the root name as a prefix or omit it), but not when the root actually has a
59
+ // child of that name — there the segment refers to the child, not the root.
60
+ if parts.first().copied() == Some(node.name()) && !has_child_named(node, node.name()) {
61
+ parts.remove(0);
62
+ }
63
+ get_field_by_path_parts(node, &parts)
64
+ }
65
+
66
+ fn has_child_named(node: &SchemaNode, name: &str) -> bool {
67
+ matches!(node, SchemaNode::Struct { fields, .. } if fields.iter().any(|f| f.name() == name))
68
+ }
69
+
70
+ fn get_field_by_path_parts<'a>(node: &'a SchemaNode, parts: &[&str]) -> Option<&'a SchemaNode> {
71
+ if parts.is_empty() {
72
+ return Some(node);
73
+ }
74
+
75
+ let first = parts[0];
76
+ let rest = &parts[1..];
77
+
78
+ match node {
79
+ SchemaNode::Struct { fields, .. } => fields
80
+ .iter()
81
+ .find(|f| f.name() == first)
82
+ .and_then(|f| get_field_by_path_parts(f, rest)),
83
+ SchemaNode::List { item, .. } if first == "item" || first == item.name() => {
84
+ get_field_by_path_parts(item, rest)
85
+ }
86
+ SchemaNode::Map { key, value, .. } => match first {
87
+ name if name == "key" || name == key.name() => get_field_by_path_parts(key, rest),
88
+ name if name == "value" || name == value.name() => get_field_by_path_parts(value, rest),
89
+ _ => None,
90
+ },
91
+ _ => None,
92
+ }
93
+ }
94
+
95
+ fn collect_field_paths(node: &SchemaNode, prefix: String, paths: &mut Vec<String>) {
96
+ let current_path = if prefix.is_empty() {
97
+ node.name().to_string()
98
+ } else {
99
+ format!("{}.{}", prefix, node.name())
100
+ };
101
+
102
+ paths.push(current_path.clone());
103
+
104
+ match node {
105
+ SchemaNode::Struct { fields, .. } => {
106
+ for field in fields {
107
+ collect_field_paths(field, current_path.clone(), paths);
108
+ }
109
+ }
110
+ SchemaNode::List { item, .. } => {
111
+ collect_field_paths(item, current_path, paths);
112
+ }
113
+ SchemaNode::Map { key, value, .. } => {
114
+ collect_field_paths(key, current_path.clone(), paths);
115
+ collect_field_paths(value, current_path, paths);
116
+ }
117
+ SchemaNode::Primitive { .. } => {}
118
+ }
119
+ }
120
+
121
/// Tests for the `SchemaInspector` implementation on `Schema`.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{PrimitiveType, SchemaBuilder as CoreSchemaBuilder};

    /// Shorthand for a primitive node without a format.
    fn primitive(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// Wrap `fields` in a non-nullable root struct named "root" and build it.
    fn schema_with_root_fields(fields: Vec<SchemaNode>) -> crate::Schema {
        CoreSchemaBuilder::new()
            .with_root(SchemaNode::Struct {
                name: "root".to_string(),
                nullable: false,
                fields,
            })
            .build()
            .unwrap()
    }

    #[test]
    fn test_schema_inspector() {
        let address = SchemaNode::Struct {
            name: "address".to_string(),
            nullable: true,
            fields: vec![primitive("city", PrimitiveType::String, true)],
        };
        let schema = schema_with_root_fields(vec![
            primitive("id", PrimitiveType::Int64, false),
            address,
        ]);

        // Field count: root, id, address, city.
        assert_eq!(schema.field_count(), 4);

        // Field lookup, for both top-level and nested names.
        for present in ["id", "address", "address.city"].iter() {
            assert!(schema.has_field(present));
        }
        assert!(!schema.has_field("missing"));

        // Lookup by path returns the nested node itself.
        let city = schema.get_field_by_path("address.city").unwrap();
        assert_eq!(city.name(), "city");
    }

    #[test]
    fn leading_root_segment_resolves_to_child_when_root_has_such_a_child() {
        // The root struct has a child that shares the root's own name, so a
        // leading "root" segment must select that child instead of being
        // stripped as a reference to the root.
        let schema = schema_with_root_fields(vec![primitive(
            "root",
            PrimitiveType::Int64,
            false,
        )]);

        let resolved = schema.get_field_by_path("root").unwrap();
        assert!(matches!(resolved, SchemaNode::Primitive { .. }));
        assert_eq!(resolved.name(), "root");
    }
}