parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,884 @@
1
+ use magnus::value::ReprValue;
2
+ use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
3
+ use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
4
+
5
+ use crate::utils::parse_string_or_symbol;
6
+ use crate::RubyAdapterError;
7
+
8
/// Stateless converter from Ruby schema definitions (hashes and type
/// strings/symbols) to Parquet schema nodes.
///
/// The type carries no data, so it derives `Copy` and `Debug` to make it cheap
/// to pass around and printable in diagnostics.
#[derive(Debug, Clone, Copy)]
pub struct RubySchemaBuilder;
10
+
11
+ impl RubySchemaBuilder {
12
+ pub fn new() -> Self {
13
+ Self
14
+ }
15
+
16
+ /// Parse a Ruby schema definition (hash) into a SchemaNode
17
+ fn parse_schema_node(
18
+ &self,
19
+ name: String,
20
+ schema_def: Value,
21
+ ) -> Result<SchemaNode, RubyAdapterError> {
22
+ // If it's a Hash, parse it as a complex type
23
+ if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
24
+ return self.parse_hash_schema_node(name, hash);
25
+ }
26
+
27
+ // Otherwise, try to parse as a simple type symbol
28
+ if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
29
+ // Check if it's a complex type with angle brackets
30
+ if type_str.contains('<') {
31
+ return self.parse_complex_type_string(name, type_str.to_string(), true);
32
+ }
33
+
34
+ let primitive_type =
35
+ self.parse_primitive_type(type_str.to_string(), None, None, None)?;
36
+ return Ok(SchemaNode::Primitive {
37
+ name,
38
+ primitive_type,
39
+ nullable: true, // Default to nullable for simple types
40
+ format: None,
41
+ });
42
+ }
43
+
44
+ Err(RubyAdapterError::InvalidInput(format!(
45
+ "Expected Hash or Symbol for schema definition, got {}",
46
+ schema_def.class()
47
+ )))
48
+ }
49
+
50
+ /// Parse a Ruby hash schema node
51
+ fn parse_hash_schema_node(
52
+ &self,
53
+ name: String,
54
+ hash: RHash,
55
+ ) -> Result<SchemaNode, RubyAdapterError> {
56
+ let ruby = Ruby::get().map_err(|e| RubyAdapterError::Ruby(e.to_string()))?;
57
+ // Get the type field
58
+ let type_sym: Value = hash
59
+ .fetch::<_, Value>(ruby.to_symbol("type"))
60
+ .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
61
+
62
+ let type_str = type_sym.to_r_string()?.to_string()?;
63
+
64
+ // Get nullable field (default to true)
65
+ let nullable = hash
66
+ .fetch::<_, Value>(ruby.to_symbol("nullable"))
67
+ .ok()
68
+ .and_then(|v| <bool as TryConvert>::try_convert(v).ok())
69
+ .unwrap_or(true);
70
+
71
+ // Get format field if present
72
+ let format = hash
73
+ .fetch::<_, Value>(ruby.to_symbol("format"))
74
+ .ok()
75
+ .and_then(|v| <String as TryConvert>::try_convert(v).ok());
76
+
77
+ match type_str.to_string().as_str() {
78
+ "struct" => {
79
+ let fields_array: RArray = hash
80
+ .fetch(ruby.to_symbol("fields"))
81
+ .map_err(|e| ParquetError::Schema(format!("Struct missing 'fields': {}", e)))?;
82
+
83
+ let mut fields = Vec::new();
84
+ for field_value in fields_array.into_iter() {
85
+ let field_hash: RHash = <RHash as TryConvert>::try_convert(field_value)
86
+ .map_err(|e: MagnusError| {
87
+ ParquetError::Schema(format!("Invalid field definition: {}", e))
88
+ })?;
89
+
90
+ let _field_name: String =
91
+ field_hash.fetch(ruby.to_symbol("name")).map_err(|e| {
92
+ ParquetError::Schema(format!("Field missing 'name': {}", e))
93
+ })?;
94
+
95
+ let field_node = self.parse_field_definition(field_hash)?;
96
+ fields.push(field_node);
97
+ }
98
+
99
+ Ok(SchemaNode::Struct {
100
+ name,
101
+ nullable,
102
+ fields,
103
+ })
104
+ }
105
+
106
+ "list" => {
107
+ let item_def = hash
108
+ .fetch::<_, Value>(ruby.to_symbol("item"))
109
+ .map_err(|e| ParquetError::Schema(format!("List missing 'item': {}", e)))?;
110
+
111
+ let item_name = format!("{}_item", name);
112
+ let item_node = self.parse_schema_node(item_name, item_def)?;
113
+
114
+ Ok(SchemaNode::List {
115
+ name,
116
+ nullable,
117
+ item: Box::new(item_node),
118
+ })
119
+ }
120
+
121
+ "map" => {
122
+ // Parse key definition. Parquet requires map keys to be
123
+ // required (non-nullable), so enforce that invariant here
124
+ // regardless of what the key hash specifies. This matches the
125
+ // schema DSL (lib/parquet/schema.rb) and the `map<...>` string
126
+ // form, both of which already build required keys.
127
+ let key_def = hash
128
+ .fetch::<_, Value>(ruby.to_symbol("key"))
129
+ .map_err(|e| ParquetError::Schema(format!("Map missing 'key': {}", e)))?;
130
+ let key_node = into_required(self.parse_schema_node("key".to_string(), key_def)?);
131
+
132
+ // Parse value definition
133
+ let value_def = hash
134
+ .fetch::<_, Value>(ruby.to_symbol("value"))
135
+ .map_err(|e| ParquetError::Schema(format!("Map missing 'value': {}", e)))?;
136
+ let value_node = self.parse_schema_node("value".to_string(), value_def)?;
137
+
138
+ Ok(SchemaNode::Map {
139
+ name,
140
+ nullable,
141
+ key: Box::new(key_node),
142
+ value: Box::new(value_node),
143
+ })
144
+ }
145
+
146
+ // Check if it's a complex type with angle brackets
147
+ type_str if type_str.contains('<') => {
148
+ self.parse_complex_type_string(name, type_str.to_string(), nullable)
149
+ }
150
+
151
+ // Primitive types
152
+ primitive_type => {
153
+ if format.as_deref() == Some("uuid") {
154
+ return Ok(SchemaNode::Primitive {
155
+ name,
156
+ primitive_type: PrimitiveType::FixedLenByteArray(16),
157
+ nullable,
158
+ format,
159
+ });
160
+ }
161
+
162
+ // Get precision and scale for decimal types
163
+ let precision = hash
164
+ .fetch::<_, Value>(ruby.to_symbol("precision"))
165
+ .ok()
166
+ .and_then(|v| <u8 as TryConvert>::try_convert(v).ok());
167
+
168
+ let scale = hash
169
+ .fetch::<_, Value>(ruby.to_symbol("scale"))
170
+ .ok()
171
+ .and_then(|v| <i8 as TryConvert>::try_convert(v).ok());
172
+
173
+ // Handle timezone for timestamp types
174
+ // Support both new has_timezone (preferred) and legacy timezone parameters
175
+ let timezone =
176
+ if let Ok(has_tz) = hash.fetch::<_, Value>(ruby.to_symbol("has_timezone")) {
177
+ // New approach: has_timezone boolean
178
+ if let Ok(has_timezone) = <bool as TryConvert>::try_convert(has_tz) {
179
+ if has_timezone {
180
+ Some("UTC".to_string()) // Presence means UTC storage
181
+ } else {
182
+ None // Absence means local/unzoned storage
183
+ }
184
+ } else {
185
+ None
186
+ }
187
+ } else {
188
+ hash.fetch::<_, Value>(ruby.to_symbol("timezone"))
189
+ .ok()
190
+ .map(|_| "UTC".to_string()) // Any value -> UTC
191
+ };
192
+
193
+ let primitive = self.parse_primitive_type(
194
+ primitive_type.to_string(),
195
+ precision,
196
+ scale,
197
+ timezone,
198
+ )?;
199
+
200
+ Ok(SchemaNode::Primitive {
201
+ name,
202
+ primitive_type: primitive,
203
+ nullable,
204
+ format,
205
+ })
206
+ }
207
+ }
208
+ }
209
+
210
+ /// Parse a complex type string like "list<string>" or "map<string,int32>"
211
+ fn parse_complex_type_string(
212
+ &self,
213
+ name: String,
214
+ type_str: String,
215
+ nullable: bool,
216
+ ) -> Result<SchemaNode, RubyAdapterError> {
217
+ if type_str.starts_with("list<") && type_str.ends_with('>') {
218
+ let inner_type = &type_str[5..type_str.len() - 1];
219
+ let item_name = format!("{}_item", name);
220
+
221
+ // Create a simple type node for the item
222
+ let item_node = if inner_type.contains('<') {
223
+ // Nested complex type
224
+ self.parse_complex_type_string(item_name, inner_type.to_string(), true)?
225
+ } else {
226
+ // Simple primitive type
227
+ SchemaNode::Primitive {
228
+ name: item_name,
229
+ primitive_type: self.parse_primitive_type(
230
+ inner_type.to_string(),
231
+ None,
232
+ None,
233
+ None,
234
+ )?,
235
+ nullable: true,
236
+ format: None,
237
+ }
238
+ };
239
+
240
+ Ok(SchemaNode::List {
241
+ name,
242
+ nullable,
243
+ item: Box::new(item_node),
244
+ })
245
+ } else if type_str.starts_with("map<") && type_str.ends_with('>') {
246
+ let inner = &type_str[4..type_str.len() - 1];
247
+ let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
248
+ if parts.len() != 2 {
249
+ return Err(RubyAdapterError::InvalidInput(format!(
250
+ "Invalid map type: {}",
251
+ type_str
252
+ )));
253
+ }
254
+
255
+ let key_type = self.parse_primitive_type(parts[0].to_string(), None, None, None)?;
256
+ let value_type = self.parse_primitive_type(parts[1].to_string(), None, None, None)?;
257
+
258
+ Ok(SchemaNode::Map {
259
+ name,
260
+ nullable,
261
+ key: Box::new(SchemaNode::Primitive {
262
+ name: "key".to_string(),
263
+ primitive_type: key_type,
264
+ nullable: false,
265
+ format: None,
266
+ }),
267
+ value: Box::new(SchemaNode::Primitive {
268
+ name: "value".to_string(),
269
+ primitive_type: value_type,
270
+ nullable: true,
271
+ format: None,
272
+ }),
273
+ })
274
+ } else {
275
+ Err(RubyAdapterError::InvalidInput(format!(
276
+ "Unknown complex type: {}",
277
+ type_str
278
+ )))
279
+ }
280
+ }
281
+
282
+ /// Parse a field definition from a Ruby hash
283
+ fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
284
+ let ruby = Ruby::get().map_err(|e| RubyAdapterError::Ruby(e.to_string()))?;
285
+ let name: String = field_hash
286
+ .fetch(ruby.to_symbol("name"))
287
+ .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
288
+
289
+ // Check if there's a 'type' field - if so, parse as full definition
290
+ if let Ok(_type_value) = field_hash.fetch::<_, Value>(ruby.to_symbol("type")) {
291
+ // This is a full field definition
292
+ self.parse_schema_node(name, field_hash.as_value())
293
+ } else {
294
+ // This might be a simplified definition - look for known field patterns
295
+ Err(RubyAdapterError::InvalidInput(format!(
296
+ "Field '{}' missing 'type' definition",
297
+ name
298
+ )))
299
+ }
300
+ }
301
+
302
+ /// Parse a primitive type string to PrimitiveType enum
303
+ fn parse_primitive_type(
304
+ &self,
305
+ type_str: String,
306
+ precision: Option<u8>,
307
+ scale: Option<i8>,
308
+ timezone: Option<String>,
309
+ ) -> Result<PrimitiveType, RubyAdapterError> {
310
+ // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
311
+ if type_str.starts_with("decimal(") && type_str.ends_with(')') {
312
+ let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
313
+ let parts: Vec<&str> = params.split(',').map(|s| s.trim()).collect();
314
+ if parts.len() == 2 {
315
+ let p = parts[0].parse::<u8>().map_err(|_| {
316
+ ParquetError::Schema(format!("Invalid decimal precision: {}", parts[0]))
317
+ })?;
318
+ let s = parts[1].parse::<i8>().map_err(|_| {
319
+ ParquetError::Schema(format!("Invalid decimal scale: {}", parts[1]))
320
+ })?;
321
+
322
+ // Choose decimal type based on precision
323
+ if p <= 38 {
324
+ return Ok(PrimitiveType::Decimal128(p, s));
325
+ } else {
326
+ return Ok(PrimitiveType::Decimal256(p, s));
327
+ }
328
+ }
329
+ }
330
+ // Check for decimal256 with parentheses notation
331
+ if type_str.starts_with("decimal256(") && type_str.ends_with(')') {
332
+ let params = &type_str[11..type_str.len() - 1];
333
+ let parts: Vec<&str> = params.split(',').map(|s| s.trim()).collect();
334
+ if parts.len() == 2 {
335
+ let p = parts[0].parse::<u8>().map_err(|_| {
336
+ ParquetError::Schema(format!("Invalid decimal256 precision: {}", parts[0]))
337
+ })?;
338
+ let s = parts[1].parse::<i8>().map_err(|_| {
339
+ ParquetError::Schema(format!("Invalid decimal256 scale: {}", parts[1]))
340
+ })?;
341
+ return Ok(PrimitiveType::Decimal256(p, s));
342
+ }
343
+ }
344
+
345
+ if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
346
+ let params = &type_str[21..type_str.len() - 1];
347
+ let len = params.parse::<i32>().map_err(|_| {
348
+ ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
349
+ })?;
350
+ return Ok(PrimitiveType::FixedLenByteArray(len));
351
+ }
352
+
353
+ match type_str.as_str() {
354
+ "boolean" | "bool" => Ok(PrimitiveType::Boolean),
355
+ "int8" => Ok(PrimitiveType::Int8),
356
+ "int16" => Ok(PrimitiveType::Int16),
357
+ "int32" => Ok(PrimitiveType::Int32),
358
+ "int64" => Ok(PrimitiveType::Int64),
359
+ "uint8" => Ok(PrimitiveType::UInt8),
360
+ "uint16" => Ok(PrimitiveType::UInt16),
361
+ "uint32" => Ok(PrimitiveType::UInt32),
362
+ "uint64" => Ok(PrimitiveType::UInt64),
363
+ "float" | "float32" => Ok(PrimitiveType::Float32),
364
+ "double" | "float64" => Ok(PrimitiveType::Float64),
365
+ "string" => Ok(PrimitiveType::String),
366
+ "binary" => Ok(PrimitiveType::Binary),
367
+ "date32" | "date" => Ok(PrimitiveType::Date32),
368
+ "date64" => Ok(PrimitiveType::Date64),
369
+ "timestamp" | "timestamp_millis" => {
370
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
371
+ Ok(PrimitiveType::TimestampMillis(timezone.map(Into::into)))
372
+ }
373
+ "timestamp_second" => {
374
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
375
+ Ok(PrimitiveType::TimestampSecond(timezone.map(Into::into)))
376
+ }
377
+ "timestamp_micros" => {
378
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
379
+ Ok(PrimitiveType::TimestampMicros(timezone.map(Into::into)))
380
+ }
381
+ "timestamp_nanos" => {
382
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
383
+ Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
384
+ }
385
+ "time_millis" => Ok(PrimitiveType::TimeMillis),
386
+ "time_micros" => Ok(PrimitiveType::TimeMicros),
387
+ "time_nanos" => Ok(PrimitiveType::TimeNanos),
388
+ "decimal" => {
389
+ // Use provided precision/scale or defaults
390
+ let p = precision.unwrap_or(38);
391
+ let s = scale.unwrap_or(0);
392
+
393
+ // Choose decimal type based on precision
394
+ if p <= 38 {
395
+ Ok(PrimitiveType::Decimal128(p, s))
396
+ } else {
397
+ Ok(PrimitiveType::Decimal256(p, s))
398
+ }
399
+ }
400
+ "decimal128" => {
401
+ let p = precision.unwrap_or(38);
402
+ let s = scale.unwrap_or(0);
403
+ Ok(PrimitiveType::Decimal128(p, s))
404
+ }
405
+ "decimal256" => {
406
+ let p = precision.unwrap_or(76);
407
+ let s = scale.unwrap_or(0);
408
+ Ok(PrimitiveType::Decimal256(p, s))
409
+ }
410
+ _ => Err(RubyAdapterError::InvalidInput(format!(
411
+ "Unknown primitive type: {}",
412
+ type_str
413
+ ))),
414
+ }
415
+ }
416
+ }
417
+
418
+ impl Default for RubySchemaBuilder {
419
+ fn default() -> Self {
420
+ Self::new()
421
+ }
422
+ }
423
+
424
+ /// Return `node` with its nullability forced to required (non-nullable).
425
+ ///
426
+ /// Parquet maps store keys with `Repetition::Required`; a nullable map key is
427
+ /// an illegal state that the core schema validator rejects. Map keys reach this
428
+ /// helper from a user-supplied key hash whose `nullable` field defaults to
429
+ /// `true`, so forcing required here keeps the raw-hash path consistent with the
430
+ /// schema DSL and the `map<...>` string form.
431
+ fn into_required(node: SchemaNode) -> SchemaNode {
432
+ match node {
433
+ SchemaNode::Struct { name, fields, .. } => SchemaNode::Struct {
434
+ name,
435
+ nullable: false,
436
+ fields,
437
+ },
438
+ SchemaNode::List { name, item, .. } => SchemaNode::List {
439
+ name,
440
+ nullable: false,
441
+ item,
442
+ },
443
+ SchemaNode::Map {
444
+ name, key, value, ..
445
+ } => SchemaNode::Map {
446
+ name,
447
+ nullable: false,
448
+ key,
449
+ value,
450
+ },
451
+ SchemaNode::Primitive {
452
+ name,
453
+ primitive_type,
454
+ format,
455
+ ..
456
+ } => SchemaNode::Primitive {
457
+ name,
458
+ primitive_type,
459
+ nullable: false,
460
+ format,
461
+ },
462
+ }
463
+ }
464
+
465
+ /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
466
+ /// and Ruby Value is not Send/Sync
467
+ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
468
+ let ruby = Ruby::get().map_err(|e| RubyAdapterError::Ruby(e.to_string()))?;
469
+ let builder = RubySchemaBuilder::new();
470
+
471
+ // The Ruby schema should be a hash with a root struct
472
+ let hash: RHash = <RHash as TryConvert>::try_convert(schema_def)
473
+ .map_err(|e: MagnusError| ParquetError::Schema(format!("Schema must be a hash: {}", e)))?;
474
+
475
+ // Check if it's already in the expected format (with type: :struct)
476
+ let root_node = if hash.get(ruby.to_symbol("type")).is_some() {
477
+ // It's a complete schema definition
478
+ builder.parse_hash_schema_node("root".to_string(), hash)?
479
+ } else if let Ok(fields) = hash.fetch::<_, RArray>(ruby.to_symbol("fields")) {
480
+ // It's a simplified format with just fields array
481
+ let mut field_nodes = Vec::new();
482
+ for field_value in fields.into_iter() {
483
+ let field_hash: RHash = <RHash as TryConvert>::try_convert(field_value)
484
+ .map_err(|e: MagnusError| ParquetError::Schema(format!("Invalid field: {}", e)))?;
485
+ field_nodes.push(builder.parse_field_definition(field_hash)?);
486
+ }
487
+
488
+ // Check for duplicate field names
489
+ let field_names: Vec<String> = field_nodes
490
+ .iter()
491
+ .map(|node| match node {
492
+ SchemaNode::Primitive { name, .. } => name.clone(),
493
+ SchemaNode::List { name, .. } => name.clone(),
494
+ SchemaNode::Map { name, .. } => name.clone(),
495
+ SchemaNode::Struct { name, .. } => name.clone(),
496
+ })
497
+ .collect();
498
+
499
+ let mut unique_names = std::collections::HashSet::new();
500
+ for name in &field_names {
501
+ if !unique_names.insert(name) {
502
+ return Err(RubyAdapterError::InvalidInput(format!(
503
+ "Duplicate field names in root level schema: {:?}",
504
+ field_names
505
+ )));
506
+ }
507
+ }
508
+
509
+ SchemaNode::Struct {
510
+ name: "root".to_string(),
511
+ nullable: false,
512
+ fields: field_nodes,
513
+ }
514
+ } else {
515
+ return Err(RubyAdapterError::InvalidInput(
516
+ "Schema must have 'type' or 'fields' key".to_string(),
517
+ ));
518
+ };
519
+
520
+ // Build the schema
521
+ parquet_core::SchemaBuilder::new()
522
+ .with_root(root_node)
523
+ .build()
524
+ .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
525
+ }
526
+
527
+ /// Convert a Parquet schema back to Ruby representation
528
+ pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
529
+ let ruby = Ruby::get()
530
+ .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
531
+
532
+ schema_node_to_ruby(&schema.root, &ruby)
533
+ }
534
+
535
+ fn schema_node_to_ruby(node: &SchemaNode, ruby: &Ruby) -> Result<Value, RubyAdapterError> {
536
+ let hash = ruby.hash_new();
537
+
538
+ match node {
539
+ SchemaNode::Struct {
540
+ name,
541
+ nullable,
542
+ fields,
543
+ } => {
544
+ hash.aset(ruby.to_symbol("type"), ruby.to_symbol("struct"))
545
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
546
+ hash.aset(ruby.to_symbol("name"), name.as_str())
547
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
548
+ hash.aset(ruby.to_symbol("nullable"), *nullable)
549
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
550
+
551
+ let fields_array = ruby.ary_new();
552
+ for field in fields {
553
+ fields_array
554
+ .push(schema_node_to_ruby(field, ruby)?)
555
+ .map_err(|e| {
556
+ ParquetError::Conversion(format!("Failed to push field: {}", e))
557
+ })?;
558
+ }
559
+ hash.aset(ruby.to_symbol("fields"), fields_array)
560
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set fields: {}", e)))?;
561
+ }
562
+
563
+ SchemaNode::List {
564
+ name,
565
+ nullable,
566
+ item,
567
+ } => {
568
+ hash.aset(ruby.to_symbol("type"), ruby.to_symbol("list"))
569
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
570
+ hash.aset(ruby.to_symbol("name"), name.as_str())
571
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
572
+ hash.aset(ruby.to_symbol("nullable"), *nullable)
573
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
574
+ hash.aset(ruby.to_symbol("item"), schema_node_to_ruby(item, ruby)?)
575
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set item: {}", e)))?;
576
+ }
577
+
578
+ SchemaNode::Map {
579
+ name,
580
+ nullable,
581
+ key,
582
+ value,
583
+ } => {
584
+ hash.aset(ruby.to_symbol("type"), ruby.to_symbol("map"))
585
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
586
+ hash.aset(ruby.to_symbol("name"), name.as_str())
587
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
588
+ hash.aset(ruby.to_symbol("nullable"), *nullable)
589
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
590
+ hash.aset(ruby.to_symbol("key"), schema_node_to_ruby(key, ruby)?)
591
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set key: {}", e)))?;
592
+ hash.aset(ruby.to_symbol("value"), schema_node_to_ruby(value, ruby)?)
593
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set value: {}", e)))?;
594
+ }
595
+
596
+ SchemaNode::Primitive {
597
+ name,
598
+ primitive_type,
599
+ nullable,
600
+ format,
601
+ } => {
602
+ let type_sym = match primitive_type {
603
+ PrimitiveType::Boolean => ruby.to_symbol("boolean"),
604
+ PrimitiveType::Int8 => ruby.to_symbol("int8"),
605
+ PrimitiveType::Int16 => ruby.to_symbol("int16"),
606
+ PrimitiveType::Int32 => ruby.to_symbol("int32"),
607
+ PrimitiveType::Int64 => ruby.to_symbol("int64"),
608
+ PrimitiveType::UInt8 => ruby.to_symbol("uint8"),
609
+ PrimitiveType::UInt16 => ruby.to_symbol("uint16"),
610
+ PrimitiveType::UInt32 => ruby.to_symbol("uint32"),
611
+ PrimitiveType::UInt64 => ruby.to_symbol("uint64"),
612
+ PrimitiveType::Float32 => ruby.to_symbol("float32"),
613
+ PrimitiveType::Float64 => ruby.to_symbol("float64"),
614
+ PrimitiveType::String => ruby.to_symbol("string"),
615
+ PrimitiveType::Binary => ruby.to_symbol("binary"),
616
+ PrimitiveType::Date32 => ruby.to_symbol("date32"),
617
+ PrimitiveType::Date64 => ruby.to_symbol("date64"),
618
+ PrimitiveType::TimestampSecond(_) => ruby.to_symbol("timestamp_second"),
619
+ PrimitiveType::TimestampMillis(_) => ruby.to_symbol("timestamp_millis"),
620
+ PrimitiveType::TimestampMicros(_) => ruby.to_symbol("timestamp_micros"),
621
+ PrimitiveType::TimestampNanos(_) => ruby.to_symbol("timestamp_nanos"),
622
+ PrimitiveType::TimeMillis => ruby.to_symbol("time_millis"),
623
+ PrimitiveType::TimeMicros => ruby.to_symbol("time_micros"),
624
+ PrimitiveType::TimeNanos => ruby.to_symbol("time_nanos"),
625
+ PrimitiveType::Decimal128(_, _) => ruby.to_symbol("decimal128"),
626
+ PrimitiveType::Decimal256(_, _) => ruby.to_symbol("decimal256"),
627
+ PrimitiveType::FixedLenByteArray(_) => ruby.to_symbol("fixed_len_byte_array"),
628
+ };
629
+
630
+ hash.aset(ruby.to_symbol("type"), type_sym)
631
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
632
+ hash.aset(ruby.to_symbol("name"), name.as_str())
633
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
634
+ hash.aset(ruby.to_symbol("nullable"), *nullable)
635
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
636
+
637
+ if let Some(fmt) = format {
638
+ hash.aset(ruby.to_symbol("format"), fmt.as_str())
639
+ .map_err(|e| {
640
+ ParquetError::Conversion(format!("Failed to set format: {}", e))
641
+ })?;
642
+ }
643
+
644
+ // Add precision/scale for decimal types
645
+ match primitive_type {
646
+ PrimitiveType::Decimal128(p, s) | PrimitiveType::Decimal256(p, s) => {
647
+ hash.aset(ruby.to_symbol("precision"), *p).map_err(|e| {
648
+ ParquetError::Conversion(format!("Failed to set precision: {}", e))
649
+ })?;
650
+ hash.aset(ruby.to_symbol("scale"), *s).map_err(|e| {
651
+ ParquetError::Conversion(format!("Failed to set scale: {}", e))
652
+ })?;
653
+ }
654
+ PrimitiveType::FixedLenByteArray(len) => {
655
+ hash.aset(ruby.to_symbol("length"), *len).map_err(|e| {
656
+ ParquetError::Conversion(format!("Failed to set length: {}", e))
657
+ })?;
658
+ }
659
+ _ => {}
660
+ }
661
+ }
662
+ }
663
+
664
+ Ok(hash.as_value())
665
+ }
666
+
667
+ /// Convert old schema format to new format
668
+ /// Old: [{ "column_name" => "type" }, ...]
669
+ /// New: [{ name: "column_name", type: :type }, ...]
670
+ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
671
+ let new_schema = ruby.ary_new();
672
+
673
+ for item in schema.into_iter() {
674
+ let hash: RHash = TryConvert::try_convert(item).map_err(|e: MagnusError| {
675
+ ParquetError::Schema(format!("Invalid schema item: {}", e))
676
+ })?;
677
+ let new_field = ruby.hash_new();
678
+
679
+ // The old format has a single key-value pair per hash
680
+ let process_result = hash.foreach(
681
+ |key: Value,
682
+ value: Value|
683
+ -> std::result::Result<magnus::r_hash::ForEach, MagnusError> {
684
+ let key_str: String = parse_string_or_symbol(ruby, key)?.ok_or_else(|| {
685
+ MagnusError::new(ruby.exception_arg_error(), "Nil keys not allowed in schema")
686
+ })?;
687
+ let type_str: String = TryConvert::try_convert(value)?;
688
+
689
+ new_field.aset(ruby.to_symbol("name"), key_str)?;
690
+ new_field.aset(ruby.to_symbol("type"), ruby.to_symbol(&type_str))?;
691
+ if type_str.contains("timestamp") {
692
+ new_field.aset(ruby.to_symbol("has_timezone"), true)?;
693
+ }
694
+
695
+ Ok(magnus::r_hash::ForEach::Continue)
696
+ },
697
+ );
698
+
699
+ if let Err(e) = process_result {
700
+ return Err(RubyAdapterError::InvalidInput(format!(
701
+ "Failed to process field: {}",
702
+ e
703
+ )));
704
+ }
705
+
706
+ new_schema
707
+ .push(new_field)
708
+ .map_err(|e| ParquetError::Schema(format!("Failed to push field: {}", e)))?;
709
+ }
710
+
711
+ Ok(new_schema)
712
+ }
713
+
714
+ /// Check if schema is in new DSL format (hash with type: :struct)
715
+ pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
716
+ if !schema_value.is_kind_of(ruby.class_hash()) {
717
+ return Ok(false);
718
+ }
719
+
720
+ let schema_hash: RHash = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
721
+ ParquetError::Schema(format!("Failed to convert to hash: {}", e))
722
+ })?;
723
+ if let Some(type_val) = schema_hash.get(ruby.to_symbol("type")) {
724
+ if type_val.is_kind_of(ruby.class_symbol()) {
725
+ let type_sym: Symbol =
726
+ TryConvert::try_convert(type_val).map_err(|e: MagnusError| {
727
+ ParquetError::Schema(format!("Failed to convert to symbol: {}", e))
728
+ })?;
729
+ return Ok(type_sym.name().map_err(|e: MagnusError| {
730
+ ParquetError::Schema(format!("Failed to get symbol name: {}", e))
731
+ })? == "struct");
732
+ } else if type_val.is_kind_of(ruby.class_string()) {
733
+ let type_str: String =
734
+ TryConvert::try_convert(type_val).map_err(|e: MagnusError| {
735
+ ParquetError::Schema(format!("Failed to convert to string: {}", e))
736
+ })?;
737
+ return Ok(type_str == "struct");
738
+ }
739
+ }
740
+ Ok(false)
741
+ }
742
+
743
+ /// Process schema value and convert to format expected by ruby_schema_to_parquet
744
+ pub fn process_schema_value(
745
+ ruby: &Ruby,
746
+ schema_value: Value,
747
+ data_array: Option<&RArray>,
748
+ ) -> Result<Value, RubyAdapterError> {
749
+ // Check if it's the new DSL format
750
+ if is_dsl_schema(ruby, schema_value)? {
751
+ // For DSL format, pass it directly to ruby_schema_to_parquet
752
+ // which should handle the conversion
753
+ return Ok(schema_value);
754
+ }
755
+
756
+ // Handle array format or hash with fields
757
+ let mut schema_array = if schema_value.is_nil() {
758
+ ruby.ary_new()
759
+ } else if schema_value.is_kind_of(ruby.class_array()) {
760
+ let array: RArray = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
761
+ ParquetError::Schema(format!("Failed to convert to array: {}", e))
762
+ })?;
763
+
764
+ // Check if it's in old format (array of single-key hashes)
765
+ if !array.is_empty() {
766
+ let first_item: Value = array
767
+ .entry(0)
768
+ .map_err(|e| ParquetError::Schema(format!("Failed to get first item: {}", e)))?;
769
+
770
+ if first_item.is_kind_of(ruby.class_hash()) {
771
+ let first_hash: RHash =
772
+ TryConvert::try_convert(first_item).map_err(|e: MagnusError| {
773
+ ParquetError::Schema(format!("Failed to convert first item to hash: {}", e))
774
+ })?;
775
+ // Check if it has the new format keys
776
+ if first_hash.get(ruby.to_symbol("name")).is_some()
777
+ && first_hash.get(ruby.to_symbol("type")).is_some()
778
+ {
779
+ // Already in new format
780
+ array
781
+ } else {
782
+ // Old format, convert it
783
+ convert_legacy_schema(ruby, array)?
784
+ }
785
+ } else {
786
+ return Err(RubyAdapterError::InvalidInput(
787
+ "schema array must contain hashes".to_string(),
788
+ ));
789
+ }
790
+ } else {
791
+ array
792
+ }
793
+ } else if schema_value.is_kind_of(ruby.class_hash()) {
794
+ // Hash format with fields key
795
+ let hash: RHash = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
796
+ ParquetError::Schema(format!("Failed to convert to hash: {}", e))
797
+ })?;
798
+ if let Some(fields) = hash.get(ruby.to_symbol("fields")) {
799
+ TryConvert::try_convert(fields).map_err(|e: MagnusError| {
800
+ ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
801
+ })?
802
+ } else {
803
+ return Err(RubyAdapterError::InvalidInput(
804
+ "schema hash must have 'fields' key or be in DSL format with 'type' key"
805
+ .to_string(),
806
+ ));
807
+ }
808
+ } else {
809
+ return Err(RubyAdapterError::InvalidInput(
810
+ "schema must be nil, an array, or a hash".to_string(),
811
+ ));
812
+ };
813
+
814
+ // Check if we need to infer schema from data
815
+ if schema_array.is_empty() {
816
+ if let Some(data) = data_array {
817
+ if data.is_empty() {
818
+ return Err(RubyAdapterError::InvalidInput(
819
+ "Cannot infer schema from empty data".to_string(),
820
+ ));
821
+ }
822
+
823
+ // Get first row/batch to determine column count
824
+ let first_item: Value = data.entry(0).map_err(|e| {
825
+ ParquetError::Schema(format!("Failed to get first data item: {}", e))
826
+ })?;
827
+ let num_columns = if first_item.is_kind_of(ruby.class_array()) {
828
+ let first_array: RArray =
829
+ TryConvert::try_convert(first_item).map_err(|e: MagnusError| {
830
+ ParquetError::Schema(format!(
831
+ "Failed to convert first data item to array: {}",
832
+ e
833
+ ))
834
+ })?;
835
+ first_array.len()
836
+ } else {
837
+ return Err(RubyAdapterError::InvalidInput(
838
+ "First data item must be an array".to_string(),
839
+ ));
840
+ };
841
+
842
+ // Generate default schema with String types
843
+ let new_schema = ruby.ary_new();
844
+ for i in 0..num_columns {
845
+ let field = ruby.hash_new();
846
+ field
847
+ .aset(ruby.to_symbol("name"), format!("f{}", i))
848
+ .map_err(|e| {
849
+ ParquetError::Schema(format!("Failed to set field name: {}", e))
850
+ })?;
851
+ field
852
+ .aset(ruby.to_symbol("type"), ruby.to_symbol("string"))
853
+ .map_err(|e| {
854
+ ParquetError::Schema(format!("Failed to set field type: {}", e))
855
+ })?;
856
+ new_schema
857
+ .push(field)
858
+ .map_err(|e| ParquetError::Schema(format!("Failed to push field: {}", e)))?;
859
+ }
860
+
861
+ schema_array = new_schema;
862
+ } else {
863
+ return Err(RubyAdapterError::InvalidInput(
864
+ "Schema is required when data is not provided for inference".to_string(),
865
+ ));
866
+ }
867
+ }
868
+
869
+ // Convert schema to the format expected by ruby_schema_to_parquet
870
+ let schema_hash = ruby.hash_new();
871
+ schema_hash
872
+ .aset(ruby.to_symbol("fields"), schema_array)
873
+ .map_err(|e| ParquetError::Schema(format!("Failed to set fields: {}", e)))?;
874
+ Ok(schema_hash.as_value())
875
+ }
876
+
877
+ /// Extract schema nodes from schema fields
878
+ pub fn extract_field_schemas(schema: &Schema) -> Vec<SchemaNode> {
879
+ if let SchemaNode::Struct { fields, .. } = &schema.root {
880
+ fields.to_vec()
881
+ } else {
882
+ Vec::new()
883
+ }
884
+ }