parquet 0.5.13 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,810 @@
1
+ use magnus::value::ReprValue;
2
+ use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
3
+ use parquet_core::{ParquetError, PrimitiveType, Result, Schema, SchemaNode};
4
+
5
+ /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
6
+ pub struct RubySchemaBuilder;
7
+
8
+ impl RubySchemaBuilder {
9
+ pub fn new() -> Self {
10
+ Self
11
+ }
12
+
13
+ /// Parse a Ruby schema definition (hash) into a SchemaNode
14
+ fn parse_schema_node(&self, name: String, schema_def: Value) -> Result<SchemaNode> {
15
+ // If it's a Hash, parse it as a complex type
16
+ if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
17
+ return self.parse_hash_schema_node(name, hash);
18
+ }
19
+
20
+ // Otherwise, try to parse as a simple type symbol
21
+ if let Ok(type_sym) = <Symbol as TryConvert>::try_convert(schema_def) {
22
+ let type_str = type_sym.name().map_err(|e: MagnusError| {
23
+ ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
24
+ })?;
25
+
26
+ // Check if it's a complex type with angle brackets
27
+ if type_str.contains('<') {
28
+ return self.parse_complex_type_string(name, type_str.to_string(), true);
29
+ }
30
+
31
+ let primitive_type =
32
+ self.parse_primitive_type(type_str.to_string(), None, None, None)?;
33
+ return Ok(SchemaNode::Primitive {
34
+ name,
35
+ primitive_type,
36
+ nullable: true, // Default to nullable for simple types
37
+ format: None,
38
+ });
39
+ }
40
+
41
+ Err(ParquetError::Schema(format!(
42
+ "Expected Hash or Symbol for schema definition, got {}",
43
+ schema_def.class()
44
+ )))
45
+ }
46
+
47
+ /// Parse a Ruby hash schema node
48
+ fn parse_hash_schema_node(&self, name: String, hash: RHash) -> Result<SchemaNode> {
49
+ // Get the type field
50
+ let type_sym: Symbol = hash
51
+ .fetch::<_, Symbol>(Symbol::new("type"))
52
+ .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
53
+
54
+ let type_str = type_sym.name().map_err(|e: MagnusError| {
55
+ ParquetError::Conversion(format!("Failed to get type name: {}", e))
56
+ })?;
57
+
58
+ // Get nullable field (default to true)
59
+ let nullable = hash
60
+ .fetch::<_, Value>(Symbol::new("nullable"))
61
+ .ok()
62
+ .and_then(|v| <bool as TryConvert>::try_convert(v).ok())
63
+ .unwrap_or(true);
64
+
65
+ // Get format field if present
66
+ let format = hash
67
+ .fetch::<_, Value>(Symbol::new("format"))
68
+ .ok()
69
+ .and_then(|v| <String as TryConvert>::try_convert(v).ok());
70
+
71
+ match type_str.to_string().as_str() {
72
+ "struct" => {
73
+ let fields_array: RArray = hash
74
+ .fetch(Symbol::new("fields"))
75
+ .map_err(|e| ParquetError::Schema(format!("Struct missing 'fields': {}", e)))?;
76
+
77
+ let mut fields = Vec::new();
78
+ for field_value in fields_array.into_iter() {
79
+ let field_hash: RHash = <RHash as TryConvert>::try_convert(field_value)
80
+ .map_err(|e: MagnusError| {
81
+ ParquetError::Schema(format!("Invalid field definition: {}", e))
82
+ })?;
83
+
84
+ let _field_name: String =
85
+ field_hash.fetch(Symbol::new("name")).map_err(|e| {
86
+ ParquetError::Schema(format!("Field missing 'name': {}", e))
87
+ })?;
88
+
89
+ let field_node = self.parse_field_definition(field_hash)?;
90
+ fields.push(field_node);
91
+ }
92
+
93
+ Ok(SchemaNode::Struct {
94
+ name,
95
+ nullable,
96
+ fields,
97
+ })
98
+ }
99
+
100
+ "list" => {
101
+ let item_def = hash
102
+ .fetch::<_, Value>(Symbol::new("item"))
103
+ .map_err(|e| ParquetError::Schema(format!("List missing 'item': {}", e)))?;
104
+
105
+ let item_name = format!("{}_item", name);
106
+ let item_node = self.parse_schema_node(item_name, item_def)?;
107
+
108
+ Ok(SchemaNode::List {
109
+ name,
110
+ nullable,
111
+ item: Box::new(item_node),
112
+ })
113
+ }
114
+
115
+ "map" => {
116
+ // Parse key definition
117
+ let key_def = hash
118
+ .fetch::<_, Value>(Symbol::new("key"))
119
+ .map_err(|e| ParquetError::Schema(format!("Map missing 'key': {}", e)))?;
120
+ let key_node = self.parse_schema_node("key".to_string(), key_def)?;
121
+
122
+ // Parse value definition
123
+ let value_def = hash
124
+ .fetch::<_, Value>(Symbol::new("value"))
125
+ .map_err(|e| ParquetError::Schema(format!("Map missing 'value': {}", e)))?;
126
+ let value_node = self.parse_schema_node("value".to_string(), value_def)?;
127
+
128
+ Ok(SchemaNode::Map {
129
+ name,
130
+ nullable,
131
+ key: Box::new(key_node),
132
+ value: Box::new(value_node),
133
+ })
134
+ }
135
+
136
+ // Check if it's a complex type with angle brackets
137
+ type_str if type_str.contains('<') => {
138
+ self.parse_complex_type_string(name, type_str.to_string(), nullable)
139
+ }
140
+
141
+ // Primitive types
142
+ primitive_type => {
143
+ // Get precision and scale for decimal types
144
+ let precision = hash
145
+ .fetch::<_, Value>(Symbol::new("precision"))
146
+ .ok()
147
+ .and_then(|v| <u8 as TryConvert>::try_convert(v).ok());
148
+
149
+ let scale = hash
150
+ .fetch::<_, Value>(Symbol::new("scale"))
151
+ .ok()
152
+ .and_then(|v| <i8 as TryConvert>::try_convert(v).ok());
153
+
154
+ // Handle timezone for timestamp types
155
+ // Support both new has_timezone (preferred) and legacy timezone parameters
156
+ let timezone =
157
+ if let Ok(has_tz) = hash.fetch::<_, Value>(Symbol::new("has_timezone")) {
158
+ // New approach: has_timezone boolean
159
+ if let Ok(has_timezone) = <bool as TryConvert>::try_convert(has_tz) {
160
+ if has_timezone {
161
+ Some("UTC".to_string()) // Presence means UTC storage
162
+ } else {
163
+ None // Absence means local/unzoned storage
164
+ }
165
+ } else {
166
+ None
167
+ }
168
+ } else {
169
+ hash.fetch::<_, Value>(Symbol::new("timezone"))
170
+ .ok()
171
+ .map(|_| "UTC".to_string()) // Any value -> UTC
172
+ };
173
+
174
+ let primitive = self.parse_primitive_type(
175
+ primitive_type.to_string(),
176
+ precision,
177
+ scale,
178
+ timezone,
179
+ )?;
180
+
181
+ Ok(SchemaNode::Primitive {
182
+ name,
183
+ primitive_type: primitive,
184
+ nullable,
185
+ format,
186
+ })
187
+ }
188
+ }
189
+ }
190
+
191
+ /// Parse a complex type string like "list<string>" or "map<string,int32>"
192
+ fn parse_complex_type_string(
193
+ &self,
194
+ name: String,
195
+ type_str: String,
196
+ nullable: bool,
197
+ ) -> Result<SchemaNode> {
198
+ if type_str.starts_with("list<") && type_str.ends_with('>') {
199
+ let inner_type = &type_str[5..type_str.len() - 1];
200
+ let item_name = format!("{}_item", name);
201
+
202
+ // Create a simple type node for the item
203
+ let item_node = if inner_type.contains('<') {
204
+ // Nested complex type
205
+ self.parse_complex_type_string(item_name, inner_type.to_string(), true)?
206
+ } else {
207
+ // Simple primitive type
208
+ SchemaNode::Primitive {
209
+ name: item_name,
210
+ primitive_type: self.parse_primitive_type(
211
+ inner_type.to_string(),
212
+ None,
213
+ None,
214
+ None,
215
+ )?,
216
+ nullable: true,
217
+ format: None,
218
+ }
219
+ };
220
+
221
+ Ok(SchemaNode::List {
222
+ name,
223
+ nullable,
224
+ item: Box::new(item_node),
225
+ })
226
+ } else if type_str.starts_with("map<") && type_str.ends_with('>') {
227
+ let inner = &type_str[4..type_str.len() - 1];
228
+ let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
229
+ if parts.len() != 2 {
230
+ return Err(ParquetError::Schema(format!(
231
+ "Invalid map type: {}",
232
+ type_str
233
+ )));
234
+ }
235
+
236
+ let key_type = self.parse_primitive_type(parts[0].to_string(), None, None, None)?;
237
+ let value_type = self.parse_primitive_type(parts[1].to_string(), None, None, None)?;
238
+
239
+ Ok(SchemaNode::Map {
240
+ name,
241
+ nullable,
242
+ key: Box::new(SchemaNode::Primitive {
243
+ name: "key".to_string(),
244
+ primitive_type: key_type,
245
+ nullable: false,
246
+ format: None,
247
+ }),
248
+ value: Box::new(SchemaNode::Primitive {
249
+ name: "value".to_string(),
250
+ primitive_type: value_type,
251
+ nullable: true,
252
+ format: None,
253
+ }),
254
+ })
255
+ } else {
256
+ Err(ParquetError::Schema(format!(
257
+ "Unknown complex type: {}",
258
+ type_str
259
+ )))
260
+ }
261
+ }
262
+
263
+ /// Parse a field definition from a Ruby hash
264
+ fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
265
+ let name: String = field_hash
266
+ .fetch(Symbol::new("name"))
267
+ .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
268
+
269
+ // Check if there's a 'type' field - if so, parse as full definition
270
+ if let Ok(_type_value) = field_hash.fetch::<_, Value>(Symbol::new("type")) {
271
+ // This is a full field definition
272
+ self.parse_schema_node(name, field_hash.as_value())
273
+ } else {
274
+ // This might be a simplified definition - look for known field patterns
275
+ Err(ParquetError::Schema(format!(
276
+ "Field '{}' missing 'type' definition",
277
+ name
278
+ )))
279
+ }
280
+ }
281
+
282
+ /// Parse a primitive type string to PrimitiveType enum
283
+ fn parse_primitive_type(
284
+ &self,
285
+ type_str: String,
286
+ precision: Option<u8>,
287
+ scale: Option<i8>,
288
+ timezone: Option<String>,
289
+ ) -> Result<PrimitiveType> {
290
+ // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
291
+ if type_str.starts_with("decimal(") && type_str.ends_with(')') {
292
+ let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
293
+ let parts: Vec<&str> = params.split(',').map(|s| s.trim()).collect();
294
+ if parts.len() == 2 {
295
+ let p = parts[0].parse::<u8>().map_err(|_| {
296
+ ParquetError::Schema(format!("Invalid decimal precision: {}", parts[0]))
297
+ })?;
298
+ let s = parts[1].parse::<i8>().map_err(|_| {
299
+ ParquetError::Schema(format!("Invalid decimal scale: {}", parts[1]))
300
+ })?;
301
+
302
+ // Choose decimal type based on precision
303
+ if p <= 38 {
304
+ return Ok(PrimitiveType::Decimal128(p, s));
305
+ } else {
306
+ return Ok(PrimitiveType::Decimal256(p, s));
307
+ }
308
+ }
309
+ }
310
+ // Check for decimal256 with parentheses notation
311
+ if type_str.starts_with("decimal256(") && type_str.ends_with(')') {
312
+ let params = &type_str[11..type_str.len() - 1];
313
+ let parts: Vec<&str> = params.split(',').map(|s| s.trim()).collect();
314
+ if parts.len() == 2 {
315
+ let p = parts[0].parse::<u8>().map_err(|_| {
316
+ ParquetError::Schema(format!("Invalid decimal256 precision: {}", parts[0]))
317
+ })?;
318
+ let s = parts[1].parse::<i8>().map_err(|_| {
319
+ ParquetError::Schema(format!("Invalid decimal256 scale: {}", parts[1]))
320
+ })?;
321
+ return Ok(PrimitiveType::Decimal256(p, s));
322
+ }
323
+ }
324
+
325
+ match type_str.as_str() {
326
+ "boolean" | "bool" => Ok(PrimitiveType::Boolean),
327
+ "int8" => Ok(PrimitiveType::Int8),
328
+ "int16" => Ok(PrimitiveType::Int16),
329
+ "int32" => Ok(PrimitiveType::Int32),
330
+ "int64" => Ok(PrimitiveType::Int64),
331
+ "uint8" => Ok(PrimitiveType::UInt8),
332
+ "uint16" => Ok(PrimitiveType::UInt16),
333
+ "uint32" => Ok(PrimitiveType::UInt32),
334
+ "uint64" => Ok(PrimitiveType::UInt64),
335
+ "float" | "float32" => Ok(PrimitiveType::Float32),
336
+ "double" | "float64" => Ok(PrimitiveType::Float64),
337
+ "string" => Ok(PrimitiveType::String),
338
+ "binary" => Ok(PrimitiveType::Binary),
339
+ "date32" | "date" => Ok(PrimitiveType::Date32),
340
+ "date64" => Ok(PrimitiveType::Date64),
341
+ "timestamp" | "timestamp_millis" => {
342
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
343
+ Ok(PrimitiveType::TimestampMillis(timezone.map(Into::into)))
344
+ }
345
+ "timestamp_second" => {
346
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
347
+ Ok(PrimitiveType::TimestampSecond(timezone.map(Into::into)))
348
+ }
349
+ "timestamp_micros" => {
350
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
351
+ Ok(PrimitiveType::TimestampMicros(timezone.map(Into::into)))
352
+ }
353
+ "timestamp_nanos" => {
354
+ // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
355
+ Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
356
+ }
357
+ "time32" | "time_millis" => Ok(PrimitiveType::TimeMillis),
358
+ "time64" | "time_micros" => Ok(PrimitiveType::TimeMicros),
359
+ "decimal" => {
360
+ // Use provided precision/scale or defaults
361
+ let p = precision.unwrap_or(38);
362
+ let s = scale.unwrap_or(0);
363
+
364
+ // Choose decimal type based on precision
365
+ if p <= 38 {
366
+ Ok(PrimitiveType::Decimal128(p, s))
367
+ } else {
368
+ Ok(PrimitiveType::Decimal256(p, s))
369
+ }
370
+ }
371
+ "decimal128" => {
372
+ let p = precision.unwrap_or(38);
373
+ let s = scale.unwrap_or(0);
374
+ Ok(PrimitiveType::Decimal128(p, s))
375
+ }
376
+ "decimal256" => {
377
+ let p = precision.unwrap_or(76);
378
+ let s = scale.unwrap_or(0);
379
+ Ok(PrimitiveType::Decimal256(p, s))
380
+ }
381
+ _ => Err(ParquetError::Schema(format!(
382
+ "Unknown primitive type: {}",
383
+ type_str
384
+ ))),
385
+ }
386
+ }
387
+ }
388
+
389
+ impl Default for RubySchemaBuilder {
390
+ fn default() -> Self {
391
+ Self::new()
392
+ }
393
+ }
394
+
395
+ /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
396
+ /// and Ruby Value is not Send/Sync
397
+ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
398
+ let builder = RubySchemaBuilder::new();
399
+
400
+ // The Ruby schema should be a hash with a root struct
401
+ let hash: RHash = <RHash as TryConvert>::try_convert(schema_def)
402
+ .map_err(|e: MagnusError| ParquetError::Schema(format!("Schema must be a hash: {}", e)))?;
403
+
404
+ // Check if it's already in the expected format (with type: :struct)
405
+ let root_node = if hash.get(Symbol::new("type")).is_some() {
406
+ // It's a complete schema definition
407
+ builder.parse_hash_schema_node("root".to_string(), hash)?
408
+ } else if let Ok(fields) = hash.fetch::<_, RArray>(Symbol::new("fields")) {
409
+ // It's a simplified format with just fields array
410
+ let mut field_nodes = Vec::new();
411
+ for field_value in fields.into_iter() {
412
+ let field_hash: RHash = <RHash as TryConvert>::try_convert(field_value)
413
+ .map_err(|e: MagnusError| ParquetError::Schema(format!("Invalid field: {}", e)))?;
414
+ field_nodes.push(builder.parse_field_definition(field_hash)?);
415
+ }
416
+
417
+ // Check for duplicate field names
418
+ let field_names: Vec<String> = field_nodes
419
+ .iter()
420
+ .map(|node| match node {
421
+ SchemaNode::Primitive { name, .. } => name.clone(),
422
+ SchemaNode::List { name, .. } => name.clone(),
423
+ SchemaNode::Map { name, .. } => name.clone(),
424
+ SchemaNode::Struct { name, .. } => name.clone(),
425
+ })
426
+ .collect();
427
+
428
+ let mut unique_names = std::collections::HashSet::new();
429
+ for name in &field_names {
430
+ if !unique_names.insert(name) {
431
+ return Err(ParquetError::Schema(format!(
432
+ "Duplicate field names in root level schema: {:?}",
433
+ field_names
434
+ )));
435
+ }
436
+ }
437
+
438
+ SchemaNode::Struct {
439
+ name: "root".to_string(),
440
+ nullable: false,
441
+ fields: field_nodes,
442
+ }
443
+ } else {
444
+ return Err(ParquetError::Schema(
445
+ "Schema must have 'type' or 'fields' key".to_string(),
446
+ ));
447
+ };
448
+
449
+ // Build the schema
450
+ parquet_core::SchemaBuilder::new()
451
+ .with_root(root_node)
452
+ .build()
453
+ .map_err(|e| ParquetError::Schema(e.to_string()))
454
+ }
455
+
456
+ /// Convert a Parquet schema back to Ruby representation
457
+ pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
458
+ let ruby = Ruby::get()
459
+ .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
460
+
461
+ schema_node_to_ruby(&schema.root, &ruby)
462
+ }
463
+
464
+ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
465
+ let hash = RHash::new();
466
+
467
+ match node {
468
+ SchemaNode::Struct {
469
+ name,
470
+ nullable,
471
+ fields,
472
+ } => {
473
+ hash.aset(Symbol::new("type"), Symbol::new("struct"))
474
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
475
+ hash.aset(Symbol::new("name"), name.as_str())
476
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
477
+ hash.aset(Symbol::new("nullable"), *nullable)
478
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
479
+
480
+ let fields_array = RArray::new();
481
+ for field in fields {
482
+ fields_array
483
+ .push(schema_node_to_ruby(field, _ruby)?)
484
+ .map_err(|e| {
485
+ ParquetError::Conversion(format!("Failed to push field: {}", e))
486
+ })?;
487
+ }
488
+ hash.aset(Symbol::new("fields"), fields_array)
489
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set fields: {}", e)))?;
490
+ }
491
+
492
+ SchemaNode::List {
493
+ name,
494
+ nullable,
495
+ item,
496
+ } => {
497
+ hash.aset(Symbol::new("type"), Symbol::new("list"))
498
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
499
+ hash.aset(Symbol::new("name"), name.as_str())
500
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
501
+ hash.aset(Symbol::new("nullable"), *nullable)
502
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
503
+ hash.aset(Symbol::new("item"), schema_node_to_ruby(item, _ruby)?)
504
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set item: {}", e)))?;
505
+ }
506
+
507
+ SchemaNode::Map {
508
+ name,
509
+ nullable,
510
+ key,
511
+ value,
512
+ } => {
513
+ hash.aset(Symbol::new("type"), Symbol::new("map"))
514
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
515
+ hash.aset(Symbol::new("name"), name.as_str())
516
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
517
+ hash.aset(Symbol::new("nullable"), *nullable)
518
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
519
+ hash.aset(Symbol::new("key"), schema_node_to_ruby(key, _ruby)?)
520
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set key: {}", e)))?;
521
+ hash.aset(Symbol::new("value"), schema_node_to_ruby(value, _ruby)?)
522
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set value: {}", e)))?;
523
+ }
524
+
525
+ SchemaNode::Primitive {
526
+ name,
527
+ primitive_type,
528
+ nullable,
529
+ format,
530
+ } => {
531
+ let type_sym = match primitive_type {
532
+ PrimitiveType::Boolean => Symbol::new("boolean"),
533
+ PrimitiveType::Int8 => Symbol::new("int8"),
534
+ PrimitiveType::Int16 => Symbol::new("int16"),
535
+ PrimitiveType::Int32 => Symbol::new("int32"),
536
+ PrimitiveType::Int64 => Symbol::new("int64"),
537
+ PrimitiveType::UInt8 => Symbol::new("uint8"),
538
+ PrimitiveType::UInt16 => Symbol::new("uint16"),
539
+ PrimitiveType::UInt32 => Symbol::new("uint32"),
540
+ PrimitiveType::UInt64 => Symbol::new("uint64"),
541
+ PrimitiveType::Float32 => Symbol::new("float32"),
542
+ PrimitiveType::Float64 => Symbol::new("float64"),
543
+ PrimitiveType::String => Symbol::new("string"),
544
+ PrimitiveType::Binary => Symbol::new("binary"),
545
+ PrimitiveType::Date32 => Symbol::new("date32"),
546
+ PrimitiveType::Date64 => Symbol::new("date64"),
547
+ PrimitiveType::TimestampSecond(_) => Symbol::new("timestamp_second"),
548
+ PrimitiveType::TimestampMillis(_) => Symbol::new("timestamp_millis"),
549
+ PrimitiveType::TimestampMicros(_) => Symbol::new("timestamp_micros"),
550
+ PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
551
+ PrimitiveType::TimeMillis => Symbol::new("time_millis"),
552
+ PrimitiveType::TimeMicros => Symbol::new("time_micros"),
553
+ PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
554
+ PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
555
+ PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
556
+ };
557
+
558
+ hash.aset(Symbol::new("type"), type_sym)
559
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
560
+ hash.aset(Symbol::new("name"), name.as_str())
561
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
562
+ hash.aset(Symbol::new("nullable"), *nullable)
563
+ .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
564
+
565
+ if let Some(fmt) = format {
566
+ hash.aset(Symbol::new("format"), fmt.as_str())
567
+ .map_err(|e| {
568
+ ParquetError::Conversion(format!("Failed to set format: {}", e))
569
+ })?;
570
+ }
571
+
572
+ // Add precision/scale for decimal types
573
+ match primitive_type {
574
+ PrimitiveType::Decimal128(p, s) | PrimitiveType::Decimal256(p, s) => {
575
+ hash.aset(Symbol::new("precision"), *p).map_err(|e| {
576
+ ParquetError::Conversion(format!("Failed to set precision: {}", e))
577
+ })?;
578
+ hash.aset(Symbol::new("scale"), *s).map_err(|e| {
579
+ ParquetError::Conversion(format!("Failed to set scale: {}", e))
580
+ })?;
581
+ }
582
+ PrimitiveType::FixedLenByteArray(len) => {
583
+ hash.aset(Symbol::new("length"), *len).map_err(|e| {
584
+ ParquetError::Conversion(format!("Failed to set length: {}", e))
585
+ })?;
586
+ }
587
+ _ => {}
588
+ }
589
+ }
590
+ }
591
+
592
+ Ok(hash.as_value())
593
+ }
594
+
595
+ /// Convert old schema format to new format
596
+ /// Old: [{ "column_name" => "type" }, ...]
597
+ /// New: [{ name: "column_name", type: :type }, ...]
598
+ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
599
+ let new_schema = RArray::new();
600
+
601
+ for item in schema.into_iter() {
602
+ let hash: RHash = TryConvert::try_convert(item).map_err(|e: MagnusError| {
603
+ ParquetError::Schema(format!("Invalid schema item: {}", e))
604
+ })?;
605
+ let new_field = RHash::new();
606
+
607
+ // The old format has a single key-value pair per hash
608
+ let process_result = hash.foreach(
609
+ |key: Value,
610
+ value: Value|
611
+ -> std::result::Result<magnus::r_hash::ForEach, MagnusError> {
612
+ let key_str: String = TryConvert::try_convert(key)?;
613
+ let type_str: String = TryConvert::try_convert(value)?;
614
+
615
+ new_field.aset(Symbol::new("name"), key_str)?;
616
+ new_field.aset(Symbol::new("type"), Symbol::new(&type_str))?;
617
+ if type_str.contains("timestamp") {
618
+ new_field.aset(Symbol::new("has_timezone"), true)?;
619
+ }
620
+
621
+ Ok(magnus::r_hash::ForEach::Continue)
622
+ },
623
+ );
624
+
625
+ if let Err(e) = process_result {
626
+ return Err(ParquetError::Schema(format!(
627
+ "Failed to process field: {}",
628
+ e
629
+ )));
630
+ }
631
+
632
+ new_schema
633
+ .push(new_field)
634
+ .map_err(|e| ParquetError::Schema(format!("Failed to push field: {}", e)))?;
635
+ }
636
+
637
+ Ok(new_schema)
638
+ }
639
+
640
+ /// Check if schema is in new DSL format (hash with type: :struct)
641
+ pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
642
+ if !schema_value.is_kind_of(ruby.class_hash()) {
643
+ return Ok(false);
644
+ }
645
+
646
+ let schema_hash: RHash = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
647
+ ParquetError::Schema(format!("Failed to convert to hash: {}", e))
648
+ })?;
649
+ if let Some(type_val) = schema_hash.get(Symbol::new("type")) {
650
+ if type_val.is_kind_of(ruby.class_symbol()) {
651
+ let type_sym: Symbol =
652
+ TryConvert::try_convert(type_val).map_err(|e: MagnusError| {
653
+ ParquetError::Schema(format!("Failed to convert to symbol: {}", e))
654
+ })?;
655
+ return Ok(type_sym.name().map_err(|e: MagnusError| {
656
+ ParquetError::Schema(format!("Failed to get symbol name: {}", e))
657
+ })? == "struct");
658
+ } else if type_val.is_kind_of(ruby.class_string()) {
659
+ let type_str: String =
660
+ TryConvert::try_convert(type_val).map_err(|e: MagnusError| {
661
+ ParquetError::Schema(format!("Failed to convert to string: {}", e))
662
+ })?;
663
+ return Ok(type_str == "struct");
664
+ }
665
+ }
666
+ Ok(false)
667
+ }
668
+
669
+ /// Process schema value and convert to format expected by ruby_schema_to_parquet
670
+ pub fn process_schema_value(
671
+ ruby: &Ruby,
672
+ schema_value: Value,
673
+ data_array: Option<&RArray>,
674
+ ) -> Result<Value> {
675
+ // Check if it's the new DSL format
676
+ if is_dsl_schema(ruby, schema_value)? {
677
+ // For DSL format, pass it directly to ruby_schema_to_parquet
678
+ // which should handle the conversion
679
+ return Ok(schema_value);
680
+ }
681
+
682
+ // Handle array format or hash with fields
683
+ let mut schema_array = if schema_value.is_nil() {
684
+ RArray::new()
685
+ } else if schema_value.is_kind_of(ruby.class_array()) {
686
+ let array: RArray = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
687
+ ParquetError::Schema(format!("Failed to convert to array: {}", e))
688
+ })?;
689
+
690
+ // Check if it's in old format (array of single-key hashes)
691
+ if !array.is_empty() {
692
+ let first_item: Value = array
693
+ .entry(0)
694
+ .map_err(|e| ParquetError::Schema(format!("Failed to get first item: {}", e)))?;
695
+
696
+ if first_item.is_kind_of(ruby.class_hash()) {
697
+ let first_hash: RHash =
698
+ TryConvert::try_convert(first_item).map_err(|e: MagnusError| {
699
+ ParquetError::Schema(format!("Failed to convert first item to hash: {}", e))
700
+ })?;
701
+ // Check if it has the new format keys
702
+ if first_hash.get(Symbol::new("name")).is_some()
703
+ && first_hash.get(Symbol::new("type")).is_some()
704
+ {
705
+ // Already in new format
706
+ array
707
+ } else {
708
+ // Old format, convert it
709
+ convert_legacy_schema(ruby, array)?
710
+ }
711
+ } else {
712
+ return Err(ParquetError::Schema(
713
+ "schema array must contain hashes".to_string(),
714
+ ));
715
+ }
716
+ } else {
717
+ array
718
+ }
719
+ } else if schema_value.is_kind_of(ruby.class_hash()) {
720
+ // Hash format with fields key
721
+ let hash: RHash = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
722
+ ParquetError::Schema(format!("Failed to convert to hash: {}", e))
723
+ })?;
724
+ if let Some(fields) = hash.get(Symbol::new("fields")) {
725
+ TryConvert::try_convert(fields).map_err(|e: MagnusError| {
726
+ ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
727
+ })?
728
+ } else {
729
+ return Err(ParquetError::Schema(
730
+ "schema hash must have 'fields' key or be in DSL format with 'type' key"
731
+ .to_string(),
732
+ ));
733
+ }
734
+ } else {
735
+ return Err(ParquetError::Schema(
736
+ "schema must be nil, an array, or a hash".to_string(),
737
+ ));
738
+ };
739
+
740
+ // Check if we need to infer schema from data
741
+ if schema_array.is_empty() {
742
+ if let Some(data) = data_array {
743
+ if data.is_empty() {
744
+ return Err(ParquetError::Schema(
745
+ "Cannot infer schema from empty data".to_string(),
746
+ ));
747
+ }
748
+
749
+ // Get first row/batch to determine column count
750
+ let first_item: Value = data.entry(0).map_err(|e| {
751
+ ParquetError::Schema(format!("Failed to get first data item: {}", e))
752
+ })?;
753
+ let num_columns = if first_item.is_kind_of(ruby.class_array()) {
754
+ let first_array: RArray =
755
+ TryConvert::try_convert(first_item).map_err(|e: MagnusError| {
756
+ ParquetError::Schema(format!(
757
+ "Failed to convert first data item to array: {}",
758
+ e
759
+ ))
760
+ })?;
761
+ first_array.len()
762
+ } else {
763
+ return Err(ParquetError::Schema(
764
+ "First data item must be an array".to_string(),
765
+ ));
766
+ };
767
+
768
+ // Generate default schema with String types
769
+ let new_schema = RArray::new();
770
+ for i in 0..num_columns {
771
+ let field = RHash::new();
772
+ field
773
+ .aset(Symbol::new("name"), format!("f{}", i))
774
+ .map_err(|e| {
775
+ ParquetError::Schema(format!("Failed to set field name: {}", e))
776
+ })?;
777
+ field
778
+ .aset(Symbol::new("type"), Symbol::new("string"))
779
+ .map_err(|e| {
780
+ ParquetError::Schema(format!("Failed to set field type: {}", e))
781
+ })?;
782
+ new_schema
783
+ .push(field)
784
+ .map_err(|e| ParquetError::Schema(format!("Failed to push field: {}", e)))?;
785
+ }
786
+
787
+ schema_array = new_schema;
788
+ } else {
789
+ return Err(ParquetError::Schema(
790
+ "Schema is required when data is not provided for inference".to_string(),
791
+ ));
792
+ }
793
+ }
794
+
795
+ // Convert schema to the format expected by ruby_schema_to_parquet
796
+ let schema_hash = ruby.hash_new();
797
+ schema_hash
798
+ .aset(Symbol::new("fields"), schema_array)
799
+ .map_err(|e| ParquetError::Schema(format!("Failed to set fields: {}", e)))?;
800
+ Ok(schema_hash.as_value())
801
+ }
802
+
803
+ /// Extract schema nodes from schema fields
804
+ pub fn extract_field_schemas(schema: &Schema) -> Vec<SchemaNode> {
805
+ if let SchemaNode::Struct { fields, .. } = &schema.root {
806
+ fields.to_vec()
807
+ } else {
808
+ Vec::new()
809
+ }
810
+ }