parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,452 @@
1
+ use std::collections::HashSet;
2
+ use triomphe::Arc;
3
+
4
/// Upper bound on the precision accepted for `Decimal128` schema fields.
const DECIMAL128_MAX_PRECISION: u8 = 38;
/// Upper bound on the precision accepted for `Decimal256` schema fields.
const DECIMAL256_MAX_PRECISION: u8 = 76;
6
+
7
+ /// Core schema representation for Parquet files
8
+ #[derive(Debug, Clone, PartialEq)]
9
+ pub struct Schema {
10
+ pub root: SchemaNode,
11
+ }
12
+
13
+ impl Schema {
14
+ pub fn validate(&self) -> Result<(), String> {
15
+ validate_root(&self.root)
16
+ }
17
+ }
18
+
19
+ /// Represents a node in the Parquet schema tree
20
+ #[derive(Debug, Clone, PartialEq)]
21
+ pub enum SchemaNode {
22
+ /// A struct with named fields
23
+ Struct {
24
+ name: String,
25
+ nullable: bool,
26
+ fields: Vec<SchemaNode>,
27
+ },
28
+ /// A list containing items of a single type
29
+ List {
30
+ name: String,
31
+ nullable: bool,
32
+ item: Box<SchemaNode>,
33
+ },
34
+ /// A map with key-value pairs
35
+ Map {
36
+ name: String,
37
+ nullable: bool,
38
+ key: Box<SchemaNode>,
39
+ value: Box<SchemaNode>,
40
+ },
41
+ /// A primitive/leaf type
42
+ Primitive {
43
+ name: String,
44
+ primitive_type: PrimitiveType,
45
+ nullable: bool,
46
+ format: Option<String>,
47
+ },
48
+ }
49
+
50
+ /// Primitive data types supported by Parquet
51
+ #[derive(Debug, Clone, PartialEq, Eq, Hash)]
52
+ pub enum PrimitiveType {
53
+ // Integer types
54
+ Int8,
55
+ Int16,
56
+ Int32,
57
+ Int64,
58
+ UInt8,
59
+ UInt16,
60
+ UInt32,
61
+ UInt64,
62
+
63
+ // Floating point types
64
+ Float32,
65
+ Float64,
66
+
67
+ // Decimal types (precision, scale)
68
+ Decimal128(u8, i8),
69
+ Decimal256(u8, i8),
70
+
71
+ // Other basic types
72
+ Boolean,
73
+ String,
74
+ Binary,
75
+
76
+ // Date/Time types
77
+ Date32,
78
+ Date64,
79
+ TimestampSecond(Option<Arc<str>>),
80
+ TimestampMillis(Option<Arc<str>>),
81
+ TimestampMicros(Option<Arc<str>>),
82
+ TimestampNanos(Option<Arc<str>>),
83
+ TimeMillis,
84
+ TimeMicros,
85
+ TimeNanos,
86
+
87
+ // Fixed-length byte array
88
+ FixedLenByteArray(i32),
89
+ }
90
+
91
/// Describes how many values a Parquet field may hold.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Repetition {
    /// Exactly one value is present.
    Required,
    /// Zero or one value is present.
    Optional,
    /// Any number of values (including none) is present.
    Repeated,
}
101
+
102
+ impl SchemaNode {
103
+ /// Get the name of this schema node
104
+ pub fn name(&self) -> &str {
105
+ match self {
106
+ SchemaNode::Struct { name, .. } => name,
107
+ SchemaNode::List { name, .. } => name,
108
+ SchemaNode::Map { name, .. } => name,
109
+ SchemaNode::Primitive { name, .. } => name,
110
+ }
111
+ }
112
+
113
+ /// Check if this node is nullable
114
+ pub fn is_nullable(&self) -> bool {
115
+ match self {
116
+ SchemaNode::Struct { nullable, .. } => *nullable,
117
+ SchemaNode::List { nullable, .. } => *nullable,
118
+ SchemaNode::Map { nullable, .. } => *nullable,
119
+ SchemaNode::Primitive { nullable, .. } => *nullable,
120
+ }
121
+ }
122
+
123
+ /// Get the repetition level based on nullability
124
+ pub fn repetition(&self) -> Repetition {
125
+ if self.is_nullable() {
126
+ Repetition::Optional
127
+ } else {
128
+ Repetition::Required
129
+ }
130
+ }
131
+ }
132
+
133
+ impl PrimitiveType {
134
+ /// Get the logical type name for display
135
+ pub fn type_name(&self) -> &'static str {
136
+ match self {
137
+ PrimitiveType::Int8 => "Int8",
138
+ PrimitiveType::Int16 => "Int16",
139
+ PrimitiveType::Int32 => "Int32",
140
+ PrimitiveType::Int64 => "Int64",
141
+ PrimitiveType::UInt8 => "UInt8",
142
+ PrimitiveType::UInt16 => "UInt16",
143
+ PrimitiveType::UInt32 => "UInt32",
144
+ PrimitiveType::UInt64 => "UInt64",
145
+ PrimitiveType::Float32 => "Float32",
146
+ PrimitiveType::Float64 => "Float64",
147
+ PrimitiveType::Decimal128(_, _) => "Decimal128",
148
+ PrimitiveType::Decimal256(_, _) => "Decimal256",
149
+ PrimitiveType::Boolean => "Boolean",
150
+ PrimitiveType::String => "String",
151
+ PrimitiveType::Binary => "Binary",
152
+ PrimitiveType::Date32 => "Date32",
153
+ PrimitiveType::Date64 => "Date64",
154
+ PrimitiveType::TimestampSecond(_) => "TimestampSecond",
155
+ PrimitiveType::TimestampMillis(_) => "TimestampMillis",
156
+ PrimitiveType::TimestampMicros(_) => "TimestampMicros",
157
+ PrimitiveType::TimestampNanos(_) => "TimestampNanos",
158
+ PrimitiveType::TimeMillis => "TimeMillis",
159
+ PrimitiveType::TimeMicros => "TimeMicros",
160
+ PrimitiveType::TimeNanos => "TimeNanos",
161
+ PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
162
+ }
163
+ }
164
+
165
+ /// Check if this type requires a format specifier
166
+ pub fn requires_format(&self) -> bool {
167
+ matches!(
168
+ self,
169
+ PrimitiveType::Date32
170
+ | PrimitiveType::Date64
171
+ | PrimitiveType::TimestampSecond(_)
172
+ | PrimitiveType::TimestampMillis(_)
173
+ | PrimitiveType::TimestampMicros(_)
174
+ | PrimitiveType::TimestampNanos(_)
175
+ | PrimitiveType::TimeMillis
176
+ | PrimitiveType::TimeMicros
177
+ | PrimitiveType::TimeNanos
178
+ )
179
+ }
180
+ }
181
+
182
+ /// Builder for creating schemas
183
+ pub struct SchemaBuilder {
184
+ root: Option<SchemaNode>,
185
+ }
186
+
187
+ impl SchemaBuilder {
188
+ pub fn new() -> Self {
189
+ Self { root: None }
190
+ }
191
+
192
+ pub fn with_root(mut self, root: SchemaNode) -> Self {
193
+ self.root = Some(root);
194
+ self
195
+ }
196
+
197
+ pub fn build(self) -> Result<Schema, String> {
198
+ match self.root {
199
+ Some(root) => {
200
+ validate_root(&root)?;
201
+ Ok(Schema { root })
202
+ }
203
+ None => Err("Schema must have a root node".to_string()),
204
+ }
205
+ }
206
+ }
207
+
208
+ impl Default for SchemaBuilder {
209
+ fn default() -> Self {
210
+ Self::new()
211
+ }
212
+ }
213
+
214
+ fn validate_root(root: &SchemaNode) -> Result<(), String> {
215
+ match root {
216
+ SchemaNode::Struct { name, fields, .. } => {
217
+ if fields.is_empty() {
218
+ return Err(format!(
219
+ "Root struct '{}' must contain at least one field",
220
+ name
221
+ ));
222
+ }
223
+ validate_unique_field_names(fields, name)?;
224
+ for field in fields {
225
+ validate_schema_node(field, name)?;
226
+ }
227
+ Ok(())
228
+ }
229
+ _ => Err("Root schema node must be a struct".to_string()),
230
+ }
231
+ }
232
+
233
+ fn validate_schema_node(node: &SchemaNode, parent_path: &str) -> Result<(), String> {
234
+ let path = format!("{}.{}", parent_path, node.name());
235
+ match node {
236
+ SchemaNode::Struct { fields, .. } => {
237
+ if fields.is_empty() {
238
+ return Err(format!(
239
+ "Struct field '{}' must contain at least one field",
240
+ path
241
+ ));
242
+ }
243
+ validate_unique_field_names(fields, &path)?;
244
+ for field in fields {
245
+ validate_schema_node(field, &path)?;
246
+ }
247
+ }
248
+ SchemaNode::List { item, .. } => {
249
+ validate_schema_node(item, &path)?;
250
+ }
251
+ SchemaNode::Map { key, value, .. } => {
252
+ if key.is_nullable() {
253
+ return Err(format!(
254
+ "Map key field '{}.{}' must be required",
255
+ path,
256
+ key.name()
257
+ ));
258
+ }
259
+ validate_schema_node(key, &path)?;
260
+ validate_schema_node(value, &path)?;
261
+ }
262
+ SchemaNode::Primitive {
263
+ primitive_type,
264
+ format,
265
+ ..
266
+ } => {
267
+ validate_primitive_type(primitive_type, format.as_deref(), &path)?;
268
+ }
269
+ }
270
+ Ok(())
271
+ }
272
+
273
+ fn validate_unique_field_names(fields: &[SchemaNode], path: &str) -> Result<(), String> {
274
+ let mut names = HashSet::with_capacity(fields.len());
275
+ for field in fields {
276
+ let name = field.name();
277
+ if !names.insert(name) {
278
+ return Err(format!(
279
+ "Struct field '{}' contains duplicate field '{}'",
280
+ path, name
281
+ ));
282
+ }
283
+ }
284
+ Ok(())
285
+ }
286
+
287
+ fn validate_primitive_type(
288
+ primitive_type: &PrimitiveType,
289
+ format: Option<&str>,
290
+ path: &str,
291
+ ) -> Result<(), String> {
292
+ match primitive_type {
293
+ PrimitiveType::Decimal128(precision, scale) => validate_decimal_type(
294
+ "Decimal128",
295
+ *precision,
296
+ *scale,
297
+ DECIMAL128_MAX_PRECISION,
298
+ path,
299
+ )?,
300
+ PrimitiveType::Decimal256(precision, scale) => validate_decimal_type(
301
+ "Decimal256",
302
+ *precision,
303
+ *scale,
304
+ DECIMAL256_MAX_PRECISION,
305
+ path,
306
+ )?,
307
+ PrimitiveType::FixedLenByteArray(length) => {
308
+ if *length <= 0 {
309
+ return Err(format!(
310
+ "FixedLenByteArray field '{}' must have a positive length",
311
+ path
312
+ ));
313
+ }
314
+ if format == Some("uuid") && *length != 16 {
315
+ return Err(format!(
316
+ "UUID field '{}' must use FixedLenByteArray(16)",
317
+ path
318
+ ));
319
+ }
320
+ }
321
+ _ => {
322
+ if format == Some("uuid") {
323
+ return Err(format!(
324
+ "UUID field '{}' must use FixedLenByteArray(16)",
325
+ path
326
+ ));
327
+ }
328
+ }
329
+ }
330
+ Ok(())
331
+ }
332
+
333
/// Checks a decimal's precision and scale against the limits of its type.
///
/// * `type_name` – label used at the start of error messages.
/// * `precision` / `scale` – the values being validated.
/// * `max_precision` – largest precision allowed for this decimal type.
/// * `path` – dotted path of the field, used in error messages.
///
/// # Errors
///
/// Checked in order, and the first failure is returned: zero precision,
/// precision above `max_precision`, negative scale, scale above precision.
fn validate_decimal_type(
    type_name: &str,
    precision: u8,
    scale: i8,
    max_precision: u8,
    path: &str,
) -> Result<(), String> {
    let problem = if precision == 0 {
        Some(format!(
            "{} field '{}' precision must be at least 1",
            type_name, path
        ))
    } else if precision > max_precision {
        Some(format!(
            "{} field '{}' precision {} exceeds maximum precision {}",
            type_name, path, precision, max_precision
        ))
    } else if scale < 0 {
        Some(format!(
            "{} field '{}' scale must be non-negative",
            type_name, path
        ))
    } else if i16::from(scale) > i16::from(precision) {
        // Both values are widened so the comparison needs no narrowing cast;
        // scale is known to be non-negative at this point.
        Some(format!(
            "{} field '{}' scale {} cannot exceed precision {}",
            type_name, path, scale, precision
        ))
    } else {
        None
    };

    match problem {
        Some(message) => Err(message),
        None => Ok(()),
    }
}
366
+
367
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a primitive leaf with no format string for the fixtures below.
    fn leaf(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    #[test]
    fn test_schema_creation() {
        let root = SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                leaf("id", PrimitiveType::Int64, false),
                leaf("name", PrimitiveType::String, true),
            ],
        };
        let schema = SchemaBuilder::new().with_root(root).build().unwrap();

        assert_eq!(schema.root.name(), "root");
        assert!(!schema.root.is_nullable());
    }

    #[test]
    fn test_primitive_types() {
        assert_eq!(PrimitiveType::Decimal128(10, 2).type_name(), "Decimal128");
        assert!(PrimitiveType::TimestampMicros(None).requires_format());
        assert!(!PrimitiveType::Int32.requires_format());
    }

    #[test]
    fn test_nested_schema() {
        let list_node = SchemaNode::List {
            name: "items".to_string(),
            nullable: true,
            item: Box::new(leaf("item", PrimitiveType::String, false)),
        };

        assert_eq!(list_node.name(), "items");
        assert!(list_node.is_nullable());
        assert_eq!(list_node.repetition(), Repetition::Optional);
    }

    #[test]
    fn test_map_schema() {
        let map_node = SchemaNode::Map {
            name: "metadata".to_string(),
            nullable: false,
            key: Box::new(leaf("key", PrimitiveType::String, false)),
            value: Box::new(leaf("value", PrimitiveType::String, true)),
        };

        assert_eq!(map_node.name(), "metadata");
        assert!(!map_node.is_nullable());
        assert_eq!(map_node.repetition(), Repetition::Required);
    }
}