parquet 0.4.2 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,12 @@
1
1
  use std::str::FromStr;
2
+ use std::sync::Arc;
3
+
4
+ use crate::reader::ReaderError;
2
5
 
3
6
  use super::*;
7
+ use arrow_array::builder::MapFieldNames;
4
8
  use arrow_array::builder::*;
9
+ use arrow_schema::{DataType, Field, Fields, TimeUnit};
5
10
  use jiff::tz::{Offset, TimeZone};
6
11
  use magnus::{RArray, RString, TryConvert};
7
12
 
@@ -64,14 +69,19 @@ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, Magn
64
69
 
65
70
  let x = timestamp
66
71
  .to_zoned(TimeZone::fixed(Offset::constant(0)))
67
- .unwrap()
72
+ .map_err(|e| {
73
+ MagnusError::new(
74
+ magnus::exception::type_error(),
75
+ format!("Failed to convert date32 to timestamp: {}", e),
76
+ )
77
+ })?
68
78
  .timestamp();
69
79
 
70
80
  // Convert to epoch days
71
81
  Ok((x.as_second() as i64 / 86400) as i32)
72
82
  } else if value.is_kind_of(ruby.class_time()) {
73
83
  // Convert Time object to epoch days
74
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ()).unwrap())?;
84
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
75
85
  Ok(((secs as f64) / 86400.0) as i32)
76
86
  } else {
77
87
  Err(MagnusError::new(
@@ -115,8 +125,8 @@ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result
115
125
  Ok(timestamp.as_millisecond())
116
126
  } else if value.is_kind_of(ruby.class_time()) {
117
127
  // Convert Time object to milliseconds
118
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ()).unwrap())?;
119
- let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ()).unwrap())?;
128
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
129
+ let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
120
130
  Ok(secs * 1000 + (usecs / 1000))
121
131
  } else {
122
132
  Err(MagnusError::new(
@@ -162,8 +172,8 @@ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result
162
172
  Ok(timestamp.as_microsecond())
163
173
  } else if value.is_kind_of(ruby.class_time()) {
164
174
  // Convert Time object to microseconds
165
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ()).unwrap())?;
166
- let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ()).unwrap())?;
175
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
176
+ let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
167
177
  Ok(secs * 1_000_000 + usecs)
168
178
  } else {
169
179
  Err(MagnusError::new(
@@ -209,225 +219,78 @@ pub fn convert_to_string(value: Value) -> Result<String, MagnusError> {
209
219
  })
210
220
  }
211
221
 
212
- pub fn convert_to_list(
213
- value: Value,
214
- list_field: &ListField,
215
- ) -> Result<Vec<ParquetValue>, MagnusError> {
216
- let ruby = unsafe { Ruby::get_unchecked() };
217
- if value.is_kind_of(ruby.class_array()) {
218
- let array = RArray::from_value(value).ok_or_else(|| {
219
- MagnusError::new(magnus::exception::type_error(), "Invalid list format")
220
- })?;
221
-
222
- let mut values = Vec::with_capacity(array.len());
223
- for item_value in array.into_iter() {
224
- let converted = match &list_field.item_type {
225
- ParquetSchemaType::Int8 => {
226
- let v = NumericConverter::<i8>::convert_with_string_fallback(item_value)?;
227
- ParquetValue::Int8(v)
228
- }
229
- ParquetSchemaType::Int16 => {
230
- let v = NumericConverter::<i16>::convert_with_string_fallback(item_value)?;
231
- ParquetValue::Int16(v)
232
- }
233
- ParquetSchemaType::Int32 => {
234
- let v = NumericConverter::<i32>::convert_with_string_fallback(item_value)?;
235
- ParquetValue::Int32(v)
236
- }
237
- ParquetSchemaType::Int64 => {
238
- let v = NumericConverter::<i64>::convert_with_string_fallback(item_value)?;
239
- ParquetValue::Int64(v)
240
- }
241
- ParquetSchemaType::UInt8 => {
242
- let v = NumericConverter::<u8>::convert_with_string_fallback(item_value)?;
243
- ParquetValue::UInt8(v)
244
- }
245
- ParquetSchemaType::UInt16 => {
246
- let v = NumericConverter::<u16>::convert_with_string_fallback(item_value)?;
247
- ParquetValue::UInt16(v)
248
- }
249
- ParquetSchemaType::UInt32 => {
250
- let v = NumericConverter::<u32>::convert_with_string_fallback(item_value)?;
251
- ParquetValue::UInt32(v)
252
- }
253
- ParquetSchemaType::UInt64 => {
254
- let v = NumericConverter::<u64>::convert_with_string_fallback(item_value)?;
255
- ParquetValue::UInt64(v)
256
- }
257
- ParquetSchemaType::Float => {
258
- let v = NumericConverter::<f32>::convert_with_string_fallback(item_value)?;
259
- ParquetValue::Float32(v)
260
- }
261
- ParquetSchemaType::Double => {
262
- let v = NumericConverter::<f64>::convert_with_string_fallback(item_value)?;
263
- ParquetValue::Float64(v)
264
- }
265
- ParquetSchemaType::String => {
266
- let v = String::try_convert(item_value)?;
267
- ParquetValue::String(v)
268
- }
269
- ParquetSchemaType::Binary => {
270
- let v = convert_to_binary(item_value)?;
271
- ParquetValue::Bytes(v)
272
- }
273
- ParquetSchemaType::Boolean => {
274
- let v = convert_to_boolean(item_value)?;
275
- ParquetValue::Boolean(v)
276
- }
277
- ParquetSchemaType::Date32 => {
278
- let v = convert_to_date32(item_value, list_field.format)?;
279
- ParquetValue::Date32(v)
280
- }
281
- ParquetSchemaType::TimestampMillis => {
282
- let v = convert_to_timestamp_millis(item_value, list_field.format)?;
283
- ParquetValue::TimestampMillis(v, None)
284
- }
285
- ParquetSchemaType::TimestampMicros => {
286
- let v = convert_to_timestamp_micros(item_value, list_field.format)?;
287
- ParquetValue::TimestampMicros(v, None)
288
- }
289
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
290
- return Err(MagnusError::new(
291
- magnus::exception::type_error(),
292
- "Nested lists and maps are not supported",
293
- ))
294
- }
295
- };
296
- values.push(converted);
297
- }
298
- Ok(values)
299
- } else {
300
- Err(MagnusError::new(
301
- magnus::exception::type_error(),
302
- "Invalid list format",
303
- ))
304
- }
305
- }
306
-
307
- pub fn convert_to_map(
308
- value: Value,
309
- map_field: &MapField,
310
- ) -> Result<HashMap<ParquetValue, ParquetValue>, MagnusError> {
311
- let ruby = unsafe { Ruby::get_unchecked() };
312
- if value.is_kind_of(ruby.class_hash()) {
313
- let mut map = HashMap::new();
314
- let entries: Vec<(Value, Value)> = value.funcall("to_a", ())?;
315
-
316
- for (key, value) in entries {
317
- let key_value = match &map_field.key_type {
318
- ParquetSchemaType::String => {
319
- let v = String::try_convert(key)?;
320
- ParquetValue::String(v)
321
- }
322
- _ => {
323
- return Err(MagnusError::new(
324
- magnus::exception::type_error(),
325
- "Map keys must be strings",
326
- ))
327
- }
328
- };
222
+ /// Converts our custom `ParquetSchemaType` into an Arrow `DataType`.
223
+ /// This ensures proper nullability settings for nested types.
224
+ /// Returns an error if a struct type (at any nesting level) has zero subfields.
225
+ pub fn parquet_schema_type_to_arrow_data_type(
226
+ schema_type: &ParquetSchemaType,
227
+ ) -> Result<DataType, MagnusError> {
228
+ Ok(match schema_type {
229
+ ParquetSchemaType::Int8 => DataType::Int8,
230
+ ParquetSchemaType::Int16 => DataType::Int16,
231
+ ParquetSchemaType::Int32 => DataType::Int32,
232
+ ParquetSchemaType::Int64 => DataType::Int64,
233
+ ParquetSchemaType::UInt8 => DataType::UInt8,
234
+ ParquetSchemaType::UInt16 => DataType::UInt16,
235
+ ParquetSchemaType::UInt32 => DataType::UInt32,
236
+ ParquetSchemaType::UInt64 => DataType::UInt64,
237
+ ParquetSchemaType::Float => DataType::Float32,
238
+ ParquetSchemaType::Double => DataType::Float64,
239
+ ParquetSchemaType::String => DataType::Utf8,
240
+ ParquetSchemaType::Binary => DataType::Binary,
241
+ ParquetSchemaType::Boolean => DataType::Boolean,
242
+ ParquetSchemaType::Date32 => DataType::Date32,
243
+ ParquetSchemaType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
244
+ ParquetSchemaType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
329
245
 
330
- let value_value = match &map_field.value_type {
331
- ParquetSchemaType::Int8 => {
332
- let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
333
- ParquetValue::Int8(v)
334
- }
335
- ParquetSchemaType::Int16 => {
336
- let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
337
- ParquetValue::Int16(v)
338
- }
339
- ParquetSchemaType::Int32 => {
340
- let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
341
- ParquetValue::Int32(v)
342
- }
343
- ParquetSchemaType::Int64 => {
344
- let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
345
- ParquetValue::Int64(v)
346
- }
347
- ParquetSchemaType::UInt8 => {
348
- let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
349
- ParquetValue::UInt8(v)
350
- }
351
- ParquetSchemaType::UInt16 => {
352
- let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
353
- ParquetValue::UInt16(v)
354
- }
355
- ParquetSchemaType::UInt32 => {
356
- let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
357
- ParquetValue::UInt32(v)
358
- }
359
- ParquetSchemaType::UInt64 => {
360
- let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
361
- ParquetValue::UInt64(v)
362
- }
363
- ParquetSchemaType::Float => {
364
- let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
365
- ParquetValue::Float32(v)
366
- }
367
- ParquetSchemaType::Double => {
368
- let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
369
- ParquetValue::Float64(v)
370
- }
371
- ParquetSchemaType::String => {
372
- let v = String::try_convert(value)?;
373
- ParquetValue::String(v)
374
- }
375
- ParquetSchemaType::Binary => {
376
- let v = convert_to_binary(value)?;
377
- ParquetValue::Bytes(v)
378
- }
379
- ParquetSchemaType::Boolean => {
380
- let v = convert_to_boolean(value)?;
381
- ParquetValue::Boolean(v)
382
- }
383
- ParquetSchemaType::Date32 => {
384
- let v = convert_to_date32(value, map_field.format)?;
385
- ParquetValue::Date32(v)
386
- }
387
- ParquetSchemaType::TimestampMillis => {
388
- let v = convert_to_timestamp_millis(value, map_field.format)?;
389
- ParquetValue::TimestampMillis(v, None)
390
- }
391
- ParquetSchemaType::TimestampMicros => {
392
- let v = convert_to_timestamp_micros(value, map_field.format)?;
393
- ParquetValue::TimestampMicros(v, None)
394
- }
395
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
396
- return Err(MagnusError::new(
397
- magnus::exception::type_error(),
398
- "Map values cannot be lists or maps",
399
- ))
400
- }
401
- };
246
+ // For a List<T>, create a standard List in Arrow; item nullability comes from list_field.nullable
247
+ ParquetSchemaType::List(list_field) => {
248
+ let child_type = parquet_schema_type_to_arrow_data_type(&list_field.item_type)?;
249
+ // For a list, the child field is named "item" (the literal passed to Field::new below)
250
+ // NOTE(review): earlier wording said the schema_dsl test expects an empty field name, but the code uses "item" - confirm which is intended
251
+ // The same "item" name is used for all list field items - this is crucial for compatibility
252
+ DataType::List(Arc::new(Field::new(
253
+ "item",
254
+ child_type,
255
+ list_field.nullable,
256
+ )))
257
+ }
402
258
 
403
- map.insert(key_value, value_value);
259
+ // For a Map<K, V>, ensure entries field is non-nullable and key field is non-nullable
260
+ ParquetSchemaType::Map(map_field) => {
261
+ let key_arrow_type = parquet_schema_type_to_arrow_data_type(&map_field.key_type)?;
262
+ let value_arrow_type = parquet_schema_type_to_arrow_data_type(&map_field.value_type)?;
263
+ DataType::Map(
264
+ Arc::new(Field::new(
265
+ "entries",
266
+ DataType::Struct(Fields::from(vec![
267
+ Field::new("key", key_arrow_type, false), // key must be non-null
268
+ Field::new("value", value_arrow_type, true), // value can be null
269
+ ])),
270
+ /*nullable=*/ false, // crucial: entries must be non-nullable
271
+ )),
272
+ /*keys_sorted=*/ false,
273
+ )
404
274
  }
405
- Ok(map)
406
- } else {
407
- Err(MagnusError::new(
408
- magnus::exception::type_error(),
409
- "Invalid map format",
410
- ))
411
- }
412
- }
275
+ ParquetSchemaType::Struct(struct_field) => {
276
+ if struct_field.fields.is_empty() {
277
+ return Err(MagnusError::new(
278
+ magnus::exception::runtime_error(),
279
+ "Cannot create a struct with zero subfields (empty struct).",
280
+ ));
281
+ }
413
282
 
414
- macro_rules! impl_timestamp_to_arrow_conversion {
415
- ($values:expr, $builder_type:ty, $variant:ident) => {{
416
- let mut builder = <$builder_type>::with_capacity($values.len());
417
- for value in $values {
418
- match value {
419
- ParquetValue::$variant(v, _tz) => builder.append_value(v),
420
- ParquetValue::Null => builder.append_null(),
421
- _ => {
422
- return Err(MagnusError::new(
423
- magnus::exception::type_error(),
424
- format!("Expected {}, got {:?}", stringify!($variant), value),
425
- ))
426
- }
283
+ // Build arrow fields
284
+ let mut arrow_fields = Vec::with_capacity(struct_field.fields.len());
285
+
286
+ for field in &struct_field.fields {
287
+ let field_type = parquet_schema_type_to_arrow_data_type(&field.type_)?;
288
+ arrow_fields.push(Field::new(&field.name, field_type, true)); // All fields are nullable by default
427
289
  }
290
+
291
+ DataType::Struct(Fields::from(arrow_fields))
428
292
  }
429
- Ok(Arc::new(builder.finish()))
430
- }};
293
+ })
431
294
  }
432
295
 
433
296
  #[macro_export]
@@ -457,367 +320,1099 @@ macro_rules! impl_timestamp_array_conversion {
457
320
  }};
458
321
  }
459
322
 
460
- #[macro_export]
461
- macro_rules! impl_array_conversion {
462
- ($values:expr, $builder_type:ty, $variant:ident) => {{
463
- let mut builder = <$builder_type>::with_capacity($values.len());
464
- for value in $values {
465
- match value {
466
- ParquetValue::$variant(v) => builder.append_value(v),
467
- ParquetValue::Null => builder.append_null(),
468
- _ => {
469
- return Err(MagnusError::new(
470
- magnus::exception::type_error(),
471
- format!("Expected {}, got {:?}", stringify!($variant), value),
472
- ))
473
- }
474
- }
323
+ // Create the appropriate Arrow builder for a given ParquetSchemaType.
324
+ // We return a Box<dyn ArrayBuilder> so we can dynamically downcast.
325
+ fn create_arrow_builder_for_type(
326
+ type_: &ParquetSchemaType,
327
+ capacity: Option<usize>,
328
+ ) -> Result<Box<dyn ArrayBuilder>, ReaderError> {
329
+ let cap = capacity.unwrap_or(1); // Default to at least capacity 1 to avoid empty builders
330
+ match type_ {
331
+ ParquetSchemaType::Int8 => Ok(Box::new(Int8Builder::with_capacity(cap))),
332
+ ParquetSchemaType::Int16 => Ok(Box::new(Int16Builder::with_capacity(cap))),
333
+ ParquetSchemaType::Int32 => Ok(Box::new(Int32Builder::with_capacity(cap))),
334
+ ParquetSchemaType::Int64 => Ok(Box::new(Int64Builder::with_capacity(cap))),
335
+ ParquetSchemaType::UInt8 => Ok(Box::new(UInt8Builder::with_capacity(cap))),
336
+ ParquetSchemaType::UInt16 => Ok(Box::new(UInt16Builder::with_capacity(cap))),
337
+ ParquetSchemaType::UInt32 => Ok(Box::new(UInt32Builder::with_capacity(cap))),
338
+ ParquetSchemaType::UInt64 => Ok(Box::new(UInt64Builder::with_capacity(cap))),
339
+ ParquetSchemaType::Float => Ok(Box::new(Float32Builder::with_capacity(cap))),
340
+ ParquetSchemaType::Double => Ok(Box::new(Float64Builder::with_capacity(cap))),
341
+ ParquetSchemaType::String => Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32))),
342
+ ParquetSchemaType::Binary => Ok(Box::new(BinaryBuilder::with_capacity(cap, cap * 32))),
343
+ ParquetSchemaType::Boolean => Ok(Box::new(BooleanBuilder::with_capacity(cap))),
344
+ ParquetSchemaType::Date32 => Ok(Box::new(Date32Builder::with_capacity(cap))),
345
+ ParquetSchemaType::TimestampMillis => {
346
+ Ok(Box::new(TimestampMillisecondBuilder::with_capacity(cap)))
475
347
  }
476
- Ok(Arc::new(builder.finish()))
477
- }};
478
- ($values:expr, $builder_type:ty, $variant:ident, $capacity:expr) => {{
479
- let mut builder = <$builder_type>::with_capacity($values.len(), $capacity);
480
- for value in $values {
481
- match value {
482
- ParquetValue::$variant(v) => builder.append_value(v),
483
- ParquetValue::Null => builder.append_null(),
484
- _ => {
485
- return Err(MagnusError::new(
486
- magnus::exception::type_error(),
487
- format!("Expected {}, got {:?}", stringify!($variant), value),
488
- ))
489
- }
490
- }
348
+ ParquetSchemaType::TimestampMicros => {
349
+ Ok(Box::new(TimestampMicrosecondBuilder::with_capacity(cap)))
491
350
  }
492
- Ok(Arc::new(builder.finish()))
493
- }};
494
- }
351
+ ParquetSchemaType::List(list_field) => {
352
+ // For a list, we create a ListBuilder whose child builder is determined by item_type.
353
+ // Pass through capacity to ensure consistent sizing
354
+ let child_builder = create_arrow_builder_for_type(&list_field.item_type, Some(cap))?;
495
355
 
496
- #[macro_export]
497
- macro_rules! append_list_value {
498
- ($list_builder:expr, $item_type:path, $value:expr, $builder_type:ty, $value_variant:path) => {
499
- match (&$item_type, &$value) {
500
- ($item_type, $value_variant(v)) => {
501
- $list_builder
502
- .values()
503
- .as_any_mut()
504
- .downcast_mut::<$builder_type>()
505
- .unwrap()
506
- .append_value(v.clone());
507
- }
508
- (_, ParquetValue::Null) => {
509
- $list_builder.append_null();
356
+ // Wrap the child builder (already sized with `cap` above) in a ListBuilder
357
+ Ok(Box::new(ListBuilder::<Box<dyn ArrayBuilder>>::new(
358
+ child_builder,
359
+ )))
360
+ }
361
+ ParquetSchemaType::Map(map_field) => {
362
+ // A Map is physically a list<struct<key:..., value:...>> in Arrow.
363
+ // Pass through capacity to ensure consistent sizing
364
+ let key_builder = create_arrow_builder_for_type(&map_field.key_type, Some(cap))?;
365
+ let value_builder = create_arrow_builder_for_type(&map_field.value_type, Some(cap))?;
366
+
367
+ // Create a MapBuilder with explicit field names to ensure compatibility
368
+ Ok(Box::new(MapBuilder::<
369
+ Box<dyn ArrayBuilder>,
370
+ Box<dyn ArrayBuilder>,
371
+ >::new(
372
+ Some(MapFieldNames {
373
+ entry: "entries".to_string(),
374
+ key: "key".to_string(),
375
+ value: "value".to_string(),
376
+ }),
377
+ key_builder,
378
+ value_builder,
379
+ )))
380
+ }
381
+ ParquetSchemaType::Struct(struct_field) => {
382
+ // Check for empty struct immediately
383
+ if struct_field.fields.is_empty() {
384
+ return Err(MagnusError::new(
385
+ magnus::exception::runtime_error(),
386
+ "Cannot build a struct with zero fields - Parquet doesn't support empty structs".to_string(),
387
+ ))?;
510
388
  }
511
- _ => {
389
+
390
+ // Create a child builder for each field in the struct
391
+ let mut child_field_builders = Vec::with_capacity(struct_field.fields.len());
392
+
393
+ // Get struct data type first to ensure field compatibility
394
+ let data_type = parquet_schema_type_to_arrow_data_type(type_)?;
395
+
396
+ // Make sure the data type is a struct
397
+ let arrow_fields = if let DataType::Struct(ref fields) = data_type {
398
+ fields.clone()
399
+ } else {
512
400
  return Err(MagnusError::new(
513
401
  magnus::exception::type_error(),
402
+ "Expected struct data type".to_string(),
403
+ ))?;
404
+ };
405
+
406
+ // Create builders for each child field with consistent capacity
407
+ for child in &struct_field.fields {
408
+ let sub_builder = create_arrow_builder_for_type(&child.type_, Some(cap))?;
409
+ child_field_builders.push(sub_builder);
410
+ }
411
+
412
+ // Make sure we have the right number of builders
413
+ if child_field_builders.len() != arrow_fields.len() {
414
+ return Err(MagnusError::new(
415
+ magnus::exception::runtime_error(),
514
416
  format!(
515
- "Type mismatch in list: expected {:?}, got {:?}",
516
- $item_type, $value
417
+ "Number of field builders ({}) doesn't match number of arrow fields ({})",
418
+ child_field_builders.len(),
419
+ arrow_fields.len()
517
420
  ),
518
- ))
421
+ ))?;
519
422
  }
423
+
424
+ // Create the StructBuilder with the fields and child builders
425
+ Ok(Box::new(StructBuilder::new(
426
+ arrow_fields,
427
+ child_field_builders,
428
+ )))
520
429
  }
521
- };
430
+ }
522
431
  }
523
432
 
524
- #[macro_export]
525
- macro_rules! append_list_value_copy {
526
- ($list_builder:expr, $item_type:path, $value:expr, $builder_type:ty, $value_variant:path) => {
527
- match (&$item_type, &$value) {
528
- ($item_type, $value_variant(v)) => {
529
- $list_builder
530
- .values()
531
- .as_any_mut()
532
- .downcast_mut::<$builder_type>()
533
- .unwrap()
534
- .append_value(*v);
433
+ // Fill primitive scalar Int8 values
434
+ fn fill_int8_builder(
435
+ builder: &mut dyn ArrayBuilder,
436
+ values: &[ParquetValue],
437
+ ) -> Result<(), MagnusError> {
438
+ let typed_builder = builder
439
+ .as_any_mut()
440
+ .downcast_mut::<Int8Builder>()
441
+ .expect("Builder mismatch: expected Int8Builder");
442
+ for val in values {
443
+ match val {
444
+ ParquetValue::Int8(i) => typed_builder.append_value(*i),
445
+ // Handle Int64 that could be an Int8
446
+ ParquetValue::Int64(i) => {
447
+ if *i < i8::MIN as i64 || *i > i8::MAX as i64 {
448
+ return Err(MagnusError::new(
449
+ magnus::exception::range_error(),
450
+ format!("Integer {} is out of range for Int8", i),
451
+ ));
452
+ }
453
+ typed_builder.append_value(*i as i8)
535
454
  }
536
- (_, ParquetValue::Null) => {
537
- $list_builder.append_null();
455
+ ParquetValue::Null => typed_builder.append_null(),
456
+ other => {
457
+ return Err(MagnusError::new(
458
+ magnus::exception::type_error(),
459
+ format!("Expected Int8, got {:?}", other),
460
+ ))
461
+ }
462
+ }
463
+ }
464
+ Ok(())
465
+ }
466
+
467
+ // Fill primitive scalar Int16 values
468
+ fn fill_int16_builder(
469
+ builder: &mut dyn ArrayBuilder,
470
+ values: &[ParquetValue],
471
+ ) -> Result<(), MagnusError> {
472
+ let typed_builder = builder
473
+ .as_any_mut()
474
+ .downcast_mut::<Int16Builder>()
475
+ .expect("Builder mismatch: expected Int16Builder");
476
+ for val in values {
477
+ match val {
478
+ ParquetValue::Int16(i) => typed_builder.append_value(*i),
479
+ // Handle Int64 that could be an Int16
480
+ ParquetValue::Int64(i) => {
481
+ if *i < i16::MIN as i64 || *i > i16::MAX as i64 {
482
+ return Err(MagnusError::new(
483
+ magnus::exception::range_error(),
484
+ format!("Integer {} is out of range for Int16", i),
485
+ ));
486
+ }
487
+ typed_builder.append_value(*i as i16)
538
488
  }
539
- _ => {
489
+ ParquetValue::Null => typed_builder.append_null(),
490
+ other => {
540
491
  return Err(MagnusError::new(
541
492
  magnus::exception::type_error(),
542
- format!(
543
- "Type mismatch in list: expected {:?}, got {:?}",
544
- $item_type, $value
545
- ),
493
+ format!("Expected Int16, got {:?}", other),
546
494
  ))
547
495
  }
548
496
  }
549
- };
497
+ }
498
+ Ok(())
550
499
  }
551
500
 
552
- #[macro_export]
553
- macro_rules! append_timestamp_list_value {
554
- ($list_builder:expr, $item_type:path, $value:expr, $builder_type:ty, $value_variant:path) => {
555
- match (&$item_type, &$value) {
556
- ($item_type, $value_variant(v, _tz)) => {
557
- $list_builder
558
- .values()
559
- .as_any_mut()
560
- .downcast_mut::<$builder_type>()
561
- .unwrap()
562
- .append_value(*v);
501
+ // Fill list values by recursively filling child items
502
+ fn fill_list_builder(
503
+ builder: &mut dyn ArrayBuilder,
504
+ item_type: &ParquetSchemaType,
505
+ values: &[ParquetValue],
506
+ ) -> Result<(), MagnusError> {
507
+ // We need to use a more specific type for ListBuilder to help Rust's type inference
508
+ let lb = builder
509
+ .as_any_mut()
510
+ .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
511
+ .expect("Builder mismatch: expected ListBuilder");
512
+
513
+ for val in values {
514
+ if let ParquetValue::Null = val {
515
+ // null list
516
+ lb.append(false);
517
+ } else if let ParquetValue::List(list_items) = val {
518
+ // First fill the child builder with the items
519
+ let values_builder = lb.values();
520
+ fill_builder(values_builder, item_type, list_items)?;
521
+ // Then finalize the list by calling append(true)
522
+ lb.append(true);
523
+ } else {
524
+ return Err(MagnusError::new(
525
+ magnus::exception::type_error(),
526
+ format!("Expected ParquetValue::List(...) or Null, got {:?}", val),
527
+ ));
528
+ }
529
+ }
530
+
531
+ Ok(())
532
+ }
533
+
534
+ // Fill map values by recursively filling key and value items
535
+ fn fill_map_builder(
536
+ builder: &mut dyn ArrayBuilder,
537
+ key_type: &ParquetSchemaType,
538
+ value_type: &ParquetSchemaType,
539
+ values: &[ParquetValue],
540
+ ) -> Result<(), MagnusError> {
541
+ let mb = builder
542
+ .as_any_mut()
543
+ .downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
544
+ .expect("Builder mismatch: expected MapBuilder");
545
+
546
+ for val in values {
547
+ match val {
548
+ ParquetValue::Null => {
549
+ // null map
550
+ mb.append(false).map_err(|e| {
551
+ MagnusError::new(
552
+ magnus::exception::runtime_error(),
553
+ format!("Failed to append null to map: {}", e),
554
+ )
555
+ })?;
563
556
  }
564
- (_, ParquetValue::Null) => {
565
- $list_builder.append_null();
557
+ ParquetValue::Map(map_entries) => {
558
+ // First append all key-value pairs to the child arrays
559
+ for (k, v) in map_entries {
560
+ // Note: Arrow expects field names "key" and "value" (singular)
561
+ fill_builder(mb.keys(), key_type, &[k.clone()])?;
562
+ fill_builder(mb.values(), value_type, &[v.clone()])?;
563
+ }
564
+ // Then finalize the map by calling append(true)
565
+ mb.append(true).map_err(|e| {
566
+ MagnusError::new(
567
+ magnus::exception::runtime_error(),
568
+ format!("Failed to append map entry: {}", e),
569
+ )
570
+ })?;
566
571
  }
567
- _ => {
572
+ other => {
568
573
  return Err(MagnusError::new(
569
574
  magnus::exception::type_error(),
570
- format!(
571
- "Type mismatch in list: expected {:?}, got {:?}",
572
- $item_type, $value
573
- ),
575
+ format!("Expected ParquetValue::Map(...) or Null, got {:?}", other),
574
576
  ))
575
577
  }
576
578
  }
577
- };
579
+ }
580
+
581
+ Ok(())
578
582
  }
579
583
 
580
- pub fn convert_parquet_values_to_arrow(
581
- values: Vec<ParquetValue>,
584
+ // Append an entire slice of ParquetValue into the given Arrow builder.
585
+ // We do a `match` on the type for each item, recursing for nested list/map.
586
+ fn fill_builder(
587
+ builder: &mut dyn ArrayBuilder,
582
588
  type_: &ParquetSchemaType,
583
- ) -> Result<Arc<dyn Array>, MagnusError> {
589
+ values: &[ParquetValue],
590
+ ) -> Result<(), MagnusError> {
584
591
  match type_ {
585
- ParquetSchemaType::Int8 => impl_array_conversion!(values, Int8Builder, Int8),
586
- ParquetSchemaType::Int16 => impl_array_conversion!(values, Int16Builder, Int16),
587
- ParquetSchemaType::Int32 => impl_array_conversion!(values, Int32Builder, Int32),
588
- ParquetSchemaType::Int64 => impl_array_conversion!(values, Int64Builder, Int64),
589
- ParquetSchemaType::UInt8 => impl_array_conversion!(values, UInt8Builder, UInt8),
590
- ParquetSchemaType::UInt16 => impl_array_conversion!(values, UInt16Builder, UInt16),
591
- ParquetSchemaType::UInt32 => impl_array_conversion!(values, UInt32Builder, UInt32),
592
- ParquetSchemaType::UInt64 => impl_array_conversion!(values, UInt64Builder, UInt64),
593
- ParquetSchemaType::Float => impl_array_conversion!(values, Float32Builder, Float32),
594
- ParquetSchemaType::Double => impl_array_conversion!(values, Float64Builder, Float64),
595
- ParquetSchemaType::String => {
596
- impl_array_conversion!(values, StringBuilder, String, values.len() * 32)
597
- }
598
- ParquetSchemaType::Binary => {
599
- impl_array_conversion!(values, BinaryBuilder, Bytes, values.len() * 32)
600
- }
601
- ParquetSchemaType::Boolean => impl_array_conversion!(values, BooleanBuilder, Boolean),
602
- ParquetSchemaType::Date32 => impl_array_conversion!(values, Date32Builder, Date32),
603
- ParquetSchemaType::TimestampMillis => {
604
- impl_timestamp_to_arrow_conversion!(
605
- values,
606
- TimestampMillisecondBuilder,
607
- TimestampMillis
608
- )
592
+ // ------------------
593
+ // PRIMITIVE SCALARS - delegated to specialized helpers
594
+ // ------------------
595
+ ParquetSchemaType::Int8 => fill_int8_builder(builder, values),
596
+ ParquetSchemaType::Int16 => fill_int16_builder(builder, values),
597
+ ParquetSchemaType::Int32 => {
598
+ let typed_builder = builder
599
+ .as_any_mut()
600
+ .downcast_mut::<Int32Builder>()
601
+ .expect("Builder mismatch: expected Int32Builder");
602
+ for val in values {
603
+ match val {
604
+ ParquetValue::Int32(i) => typed_builder.append_value(*i),
605
+ ParquetValue::Date32(d) => typed_builder.append_value(*d), // if you allow date->int
606
+ // Handle the case where we have an Int64 in an Int32 field (common with Ruby Integers)
607
+ ParquetValue::Int64(i) => {
608
+ if *i < i32::MIN as i64 || *i > i32::MAX as i64 {
609
+ return Err(MagnusError::new(
610
+ magnus::exception::range_error(),
611
+ format!("Integer {} is out of range for Int32", i),
612
+ ));
613
+ }
614
+ typed_builder.append_value(*i as i32)
615
+ }
616
+ ParquetValue::Null => typed_builder.append_null(),
617
+ other => {
618
+ return Err(MagnusError::new(
619
+ magnus::exception::type_error(),
620
+ format!("Expected Int32, got {:?}", other),
621
+ ))
622
+ }
623
+ }
624
+ }
625
+ Ok(())
609
626
  }
610
- ParquetSchemaType::TimestampMicros => {
611
- impl_timestamp_to_arrow_conversion!(
612
- values,
613
- TimestampMicrosecondBuilder,
614
- TimestampMicros
615
- )
627
+ ParquetSchemaType::Int64 => {
628
+ let typed_builder = builder
629
+ .as_any_mut()
630
+ .downcast_mut::<Int64Builder>()
631
+ .expect("Builder mismatch: expected Int64Builder");
632
+ for val in values {
633
+ match val {
634
+ ParquetValue::Int64(i) => typed_builder.append_value(*i),
635
+ ParquetValue::Null => typed_builder.append_null(),
636
+ other => {
637
+ return Err(MagnusError::new(
638
+ magnus::exception::type_error(),
639
+ format!("Expected Int64, got {:?}", other),
640
+ ))
641
+ }
642
+ }
643
+ }
644
+ Ok(())
616
645
  }
617
- ParquetSchemaType::List(list_field) => {
618
- let value_builder = match list_field.item_type {
619
- ParquetSchemaType::Int8 => Box::new(Int8Builder::new()) as Box<dyn ArrayBuilder>,
620
- ParquetSchemaType::Int16 => Box::new(Int16Builder::new()) as Box<dyn ArrayBuilder>,
621
- ParquetSchemaType::Int32 => Box::new(Int32Builder::new()) as Box<dyn ArrayBuilder>,
622
- ParquetSchemaType::Int64 => Box::new(Int64Builder::new()) as Box<dyn ArrayBuilder>,
623
- ParquetSchemaType::UInt8 => Box::new(UInt8Builder::new()) as Box<dyn ArrayBuilder>,
624
- ParquetSchemaType::UInt16 => {
625
- Box::new(UInt16Builder::new()) as Box<dyn ArrayBuilder>
646
+ ParquetSchemaType::UInt8 => {
647
+ let typed_builder = builder
648
+ .as_any_mut()
649
+ .downcast_mut::<UInt8Builder>()
650
+ .expect("Builder mismatch: expected UInt8Builder");
651
+ for val in values {
652
+ match val {
653
+ ParquetValue::UInt8(u) => typed_builder.append_value(*u),
654
+ // Handle Int64 that could be a UInt8
655
+ ParquetValue::Int64(i) => {
656
+ if *i < 0 || *i > u8::MAX as i64 {
657
+ return Err(MagnusError::new(
658
+ magnus::exception::range_error(),
659
+ format!("Integer {} is out of range for UInt8", i),
660
+ ));
661
+ }
662
+ typed_builder.append_value(*i as u8)
663
+ }
664
+ ParquetValue::Null => typed_builder.append_null(),
665
+ other => {
666
+ return Err(MagnusError::new(
667
+ magnus::exception::type_error(),
668
+ format!("Expected UInt8, got {:?}", other),
669
+ ))
670
+ }
626
671
  }
627
- ParquetSchemaType::UInt32 => {
628
- Box::new(UInt32Builder::new()) as Box<dyn ArrayBuilder>
672
+ }
673
+ Ok(())
674
+ }
675
+ ParquetSchemaType::UInt16 => {
676
+ let typed_builder = builder
677
+ .as_any_mut()
678
+ .downcast_mut::<UInt16Builder>()
679
+ .expect("Builder mismatch: expected UInt16Builder");
680
+ for val in values {
681
+ match val {
682
+ ParquetValue::UInt16(u) => typed_builder.append_value(*u),
683
+ // Handle Int64 that could be a UInt16
684
+ ParquetValue::Int64(i) => {
685
+ if *i < 0 || *i > u16::MAX as i64 {
686
+ return Err(MagnusError::new(
687
+ magnus::exception::range_error(),
688
+ format!("Integer {} is out of range for UInt16", i),
689
+ ));
690
+ }
691
+ typed_builder.append_value(*i as u16)
692
+ }
693
+ ParquetValue::Null => typed_builder.append_null(),
694
+ other => {
695
+ return Err(MagnusError::new(
696
+ magnus::exception::type_error(),
697
+ format!("Expected UInt16, got {:?}", other),
698
+ ))
699
+ }
629
700
  }
630
- ParquetSchemaType::UInt64 => {
631
- Box::new(UInt64Builder::new()) as Box<dyn ArrayBuilder>
701
+ }
702
+ Ok(())
703
+ }
704
+ ParquetSchemaType::UInt32 => {
705
+ let typed_builder = builder
706
+ .as_any_mut()
707
+ .downcast_mut::<UInt32Builder>()
708
+ .expect("Builder mismatch: expected UInt32Builder");
709
+ for val in values {
710
+ match val {
711
+ ParquetValue::UInt32(u) => typed_builder.append_value(*u),
712
+ // Handle Int64 that could be a UInt32
713
+ ParquetValue::Int64(i) => {
714
+ if *i < 0 || *i > u32::MAX as i64 {
715
+ return Err(MagnusError::new(
716
+ magnus::exception::range_error(),
717
+ format!("Integer {} is out of range for UInt32", i),
718
+ ));
719
+ }
720
+ typed_builder.append_value(*i as u32)
721
+ }
722
+ ParquetValue::Null => typed_builder.append_null(),
723
+ other => {
724
+ return Err(MagnusError::new(
725
+ magnus::exception::type_error(),
726
+ format!("Expected UInt32, got {:?}", other),
727
+ ))
728
+ }
632
729
  }
633
- ParquetSchemaType::Float => {
634
- Box::new(Float32Builder::new()) as Box<dyn ArrayBuilder>
730
+ }
731
+ Ok(())
732
+ }
733
+ ParquetSchemaType::UInt64 => {
734
+ let typed_builder = builder
735
+ .as_any_mut()
736
+ .downcast_mut::<UInt64Builder>()
737
+ .expect("Builder mismatch: expected UInt64Builder");
738
+ for val in values {
739
+ match val {
740
+ ParquetValue::UInt64(u) => typed_builder.append_value(*u),
741
+ // Handle Int64 that could be a UInt64
742
+ ParquetValue::Int64(i) => {
743
+ if *i < 0 {
744
+ return Err(MagnusError::new(
745
+ magnus::exception::range_error(),
746
+ format!("Integer {} is out of range for UInt64", i),
747
+ ));
748
+ }
749
+ typed_builder.append_value(*i as u64)
750
+ }
751
+ ParquetValue::Null => typed_builder.append_null(),
752
+ other => {
753
+ return Err(MagnusError::new(
754
+ magnus::exception::type_error(),
755
+ format!("Expected UInt64, got {:?}", other),
756
+ ))
757
+ }
635
758
  }
636
- ParquetSchemaType::Double => {
637
- Box::new(Float64Builder::new()) as Box<dyn ArrayBuilder>
759
+ }
760
+ Ok(())
761
+ }
762
+ ParquetSchemaType::Float => {
763
+ let typed_builder = builder
764
+ .as_any_mut()
765
+ .downcast_mut::<Float32Builder>()
766
+ .expect("Builder mismatch: expected Float32Builder");
767
+ for val in values {
768
+ match val {
769
+ ParquetValue::Float32(f) => typed_builder.append_value(*f),
770
+ ParquetValue::Float16(fh) => typed_builder.append_value(*fh),
771
+ ParquetValue::Null => typed_builder.append_null(),
772
+ other => {
773
+ return Err(MagnusError::new(
774
+ magnus::exception::type_error(),
775
+ format!("Expected Float32, got {:?}", other),
776
+ ))
777
+ }
638
778
  }
639
- ParquetSchemaType::String => {
640
- Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>
779
+ }
780
+ Ok(())
781
+ }
782
+ ParquetSchemaType::Double => {
783
+ let typed_builder = builder
784
+ .as_any_mut()
785
+ .downcast_mut::<Float64Builder>()
786
+ .expect("Builder mismatch: expected Float64Builder");
787
+ for val in values {
788
+ match val {
789
+ ParquetValue::Float64(f) => typed_builder.append_value(*f),
790
+ // If you want to allow f32 => f64, do so:
791
+ ParquetValue::Float32(flo) => typed_builder.append_value(*flo as f64),
792
+ ParquetValue::Null => typed_builder.append_null(),
793
+ other => {
794
+ return Err(MagnusError::new(
795
+ magnus::exception::type_error(),
796
+ format!("Expected Float64, got {:?}", other),
797
+ ))
798
+ }
641
799
  }
642
- ParquetSchemaType::Binary => {
643
- Box::new(BinaryBuilder::new()) as Box<dyn ArrayBuilder>
800
+ }
801
+ Ok(())
802
+ }
803
+ ParquetSchemaType::Boolean => {
804
+ let typed_builder = builder
805
+ .as_any_mut()
806
+ .downcast_mut::<BooleanBuilder>()
807
+ .expect("Builder mismatch: expected BooleanBuilder");
808
+ for val in values {
809
+ match val {
810
+ ParquetValue::Boolean(b) => typed_builder.append_value(*b),
811
+ ParquetValue::Null => typed_builder.append_null(),
812
+ other => {
813
+ return Err(MagnusError::new(
814
+ magnus::exception::type_error(),
815
+ format!("Expected Boolean, got {:?}", other),
816
+ ))
817
+ }
644
818
  }
645
- ParquetSchemaType::Boolean => {
646
- Box::new(BooleanBuilder::new()) as Box<dyn ArrayBuilder>
819
+ }
820
+ Ok(())
821
+ }
822
+ ParquetSchemaType::Date32 => {
823
+ let typed_builder = builder
824
+ .as_any_mut()
825
+ .downcast_mut::<Date32Builder>()
826
+ .expect("Builder mismatch: expected Date32Builder");
827
+ for val in values {
828
+ match val {
829
+ ParquetValue::Date32(d) => typed_builder.append_value(*d),
830
+ ParquetValue::Null => typed_builder.append_null(),
831
+ other => {
832
+ return Err(MagnusError::new(
833
+ magnus::exception::type_error(),
834
+ format!("Expected Date32, got {:?}", other),
835
+ ))
836
+ }
647
837
  }
648
- ParquetSchemaType::Date32 => {
649
- Box::new(Date32Builder::new()) as Box<dyn ArrayBuilder>
838
+ }
839
+ Ok(())
840
+ }
841
+ ParquetSchemaType::TimestampMillis => {
842
+ let typed_builder = builder
843
+ .as_any_mut()
844
+ .downcast_mut::<TimestampMillisecondBuilder>()
845
+ .expect("Builder mismatch: expected TimestampMillisecondBuilder");
846
+ for val in values {
847
+ match val {
848
+ ParquetValue::TimestampMillis(ts, _tz) => typed_builder.append_value(*ts),
849
+ ParquetValue::Null => typed_builder.append_null(),
850
+ other => {
851
+ return Err(MagnusError::new(
852
+ magnus::exception::type_error(),
853
+ format!("Expected TimestampMillis, got {:?}", other),
854
+ ))
855
+ }
650
856
  }
651
- ParquetSchemaType::TimestampMillis => {
652
- Box::new(TimestampMillisecondBuilder::new()) as Box<dyn ArrayBuilder>
857
+ }
858
+ Ok(())
859
+ }
860
+ ParquetSchemaType::TimestampMicros => {
861
+ let typed_builder = builder
862
+ .as_any_mut()
863
+ .downcast_mut::<TimestampMicrosecondBuilder>()
864
+ .expect("Builder mismatch: expected TimestampMicrosecondBuilder");
865
+ for val in values {
866
+ match val {
867
+ ParquetValue::TimestampMicros(ts, _tz) => typed_builder.append_value(*ts),
868
+ ParquetValue::Null => typed_builder.append_null(),
869
+ other => {
870
+ return Err(MagnusError::new(
871
+ magnus::exception::type_error(),
872
+ format!("Expected TimestampMicros, got {:?}", other),
873
+ ))
874
+ }
653
875
  }
654
- ParquetSchemaType::TimestampMicros => {
655
- Box::new(TimestampMicrosecondBuilder::new()) as Box<dyn ArrayBuilder>
876
+ }
877
+ Ok(())
878
+ }
879
+
880
+ // ------------------
881
+ // NESTED LIST - using helper function
882
+ // ------------------
883
+ ParquetSchemaType::List(list_field) => {
884
+ fill_list_builder(builder, &list_field.item_type, values)
885
+ }
886
+
887
+ // ------------------
888
+ // NESTED MAP - using helper function
889
+ // ------------------
890
+ ParquetSchemaType::Map(map_field) => {
891
+ fill_map_builder(builder, &map_field.key_type, &map_field.value_type, values)
892
+ }
893
+
894
+ // ------------------
895
+ // OTHER TYPES - keep as is for now
896
+ // ------------------
897
+ ParquetSchemaType::String => {
898
+ let typed_builder = builder
899
+ .as_any_mut()
900
+ .downcast_mut::<StringBuilder>()
901
+ .expect("Builder mismatch: expected StringBuilder");
902
+ for val in values {
903
+ match val {
904
+ ParquetValue::String(s) => typed_builder.append_value(s),
905
+ ParquetValue::Null => typed_builder.append_null(),
906
+ other => {
907
+ return Err(MagnusError::new(
908
+ magnus::exception::type_error(),
909
+ format!("Expected String, got {:?}", other),
910
+ ))
911
+ }
656
912
  }
657
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
658
- return Err(MagnusError::new(
659
- magnus::exception::type_error(),
660
- "Nested lists and maps are not supported",
661
- ))
913
+ }
914
+ Ok(())
915
+ }
916
+ ParquetSchemaType::Binary => {
917
+ let typed_builder = builder
918
+ .as_any_mut()
919
+ .downcast_mut::<BinaryBuilder>()
920
+ .expect("Builder mismatch: expected BinaryBuilder");
921
+ for val in values {
922
+ match val {
923
+ ParquetValue::Bytes(b) => typed_builder.append_value(&b),
924
+ ParquetValue::Null => typed_builder.append_null(),
925
+ other => {
926
+ return Err(MagnusError::new(
927
+ magnus::exception::type_error(),
928
+ format!("Expected Binary, got {:?}", other),
929
+ ))
930
+ }
662
931
  }
663
- };
932
+ }
933
+ Ok(())
934
+ }
935
+ ParquetSchemaType::Struct(struct_field) => {
936
+ let typed_builder = builder
937
+ .as_any_mut()
938
+ .downcast_mut::<StructBuilder>()
939
+ .expect("Builder mismatch: expected StructBuilder");
664
940
 
665
- let mut list_builder = ListBuilder::new(value_builder);
666
-
667
- for value in values {
668
- match value {
669
- ParquetValue::List(items) => {
670
- for item in items {
671
- match &list_field.item_type {
672
- ParquetSchemaType::Int8 => append_list_value_copy!(
673
- list_builder,
674
- ParquetSchemaType::Int8,
675
- item,
676
- Int8Builder,
677
- ParquetValue::Int8
678
- ),
679
- ParquetSchemaType::Int16 => append_list_value_copy!(
680
- list_builder,
681
- ParquetSchemaType::Int16,
682
- item,
683
- Int16Builder,
684
- ParquetValue::Int16
685
- ),
686
- ParquetSchemaType::Int32 => append_list_value_copy!(
687
- list_builder,
688
- ParquetSchemaType::Int32,
689
- item,
690
- Int32Builder,
691
- ParquetValue::Int32
692
- ),
693
- ParquetSchemaType::Int64 => append_list_value_copy!(
694
- list_builder,
695
- ParquetSchemaType::Int64,
696
- item,
697
- Int64Builder,
698
- ParquetValue::Int64
699
- ),
700
- ParquetSchemaType::UInt8 => append_list_value_copy!(
701
- list_builder,
702
- ParquetSchemaType::UInt8,
703
- item,
704
- UInt8Builder,
705
- ParquetValue::UInt8
706
- ),
707
- ParquetSchemaType::UInt16 => append_list_value_copy!(
708
- list_builder,
709
- ParquetSchemaType::UInt16,
710
- item,
711
- UInt16Builder,
712
- ParquetValue::UInt16
713
- ),
714
- ParquetSchemaType::UInt32 => append_list_value_copy!(
715
- list_builder,
716
- ParquetSchemaType::UInt32,
717
- item,
718
- UInt32Builder,
719
- ParquetValue::UInt32
720
- ),
721
- ParquetSchemaType::UInt64 => append_list_value_copy!(
722
- list_builder,
723
- ParquetSchemaType::UInt64,
724
- item,
725
- UInt64Builder,
726
- ParquetValue::UInt64
727
- ),
728
- ParquetSchemaType::Float => append_list_value_copy!(
729
- list_builder,
730
- ParquetSchemaType::Float,
731
- item,
732
- Float32Builder,
733
- ParquetValue::Float32
734
- ),
735
- ParquetSchemaType::Double => append_list_value_copy!(
736
- list_builder,
737
- ParquetSchemaType::Double,
738
- item,
739
- Float64Builder,
740
- ParquetValue::Float64
741
- ),
742
- ParquetSchemaType::String => append_list_value!(
743
- list_builder,
744
- ParquetSchemaType::String,
745
- item,
746
- StringBuilder,
747
- ParquetValue::String
748
- ),
749
- ParquetSchemaType::Binary => append_list_value!(
750
- list_builder,
751
- ParquetSchemaType::Binary,
752
- item,
753
- BinaryBuilder,
754
- ParquetValue::Bytes
755
- ),
756
- ParquetSchemaType::Boolean => append_list_value_copy!(
757
- list_builder,
758
- ParquetSchemaType::Boolean,
759
- item,
760
- BooleanBuilder,
761
- ParquetValue::Boolean
762
- ),
763
- ParquetSchemaType::Date32 => append_list_value_copy!(
764
- list_builder,
765
- ParquetSchemaType::Date32,
766
- item,
767
- Date32Builder,
768
- ParquetValue::Date32
769
- ),
770
- ParquetSchemaType::TimestampMillis => append_timestamp_list_value!(
771
- list_builder,
772
- ParquetSchemaType::TimestampMillis,
773
- item,
774
- TimestampMillisecondBuilder,
775
- ParquetValue::TimestampMillis
776
- ),
777
- ParquetSchemaType::TimestampMicros => append_timestamp_list_value!(
778
- list_builder,
779
- ParquetSchemaType::TimestampMicros,
780
- item,
781
- TimestampMicrosecondBuilder,
782
- ParquetValue::TimestampMicros
783
- ),
784
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
785
- return Err(MagnusError::new(
786
- magnus::exception::type_error(),
787
- "Nested lists and maps are not supported",
788
- ))
941
+ for val in values {
942
+ match val {
943
+ ParquetValue::Null => {
944
+ // null struct
945
+ typed_builder.append(false);
946
+ }
947
+ ParquetValue::Map(map_data) => {
948
+ for (i, field) in struct_field.fields.iter().enumerate() {
949
+ let field_key = ParquetValue::String(field.name.clone());
950
+ if let Some(field_val) = map_data.get(&field_key) {
951
+ match field_val {
952
+ ParquetValue::Int8(x) => typed_builder
953
+ .field_builder::<Int8Builder>(i)
954
+ .ok_or_else(|| {
955
+ MagnusError::new(
956
+ magnus::exception::type_error(),
957
+ "Failed to coerce into Int8Builder",
958
+ )
959
+ })?
960
+ .append_value(*x),
961
+ ParquetValue::Int16(x) => typed_builder
962
+ .field_builder::<Int16Builder>(i)
963
+ .ok_or_else(|| {
964
+ MagnusError::new(
965
+ magnus::exception::type_error(),
966
+ "Failed to coerce into Int16Builder",
967
+ )
968
+ })?
969
+ .append_value(*x),
970
+ ParquetValue::Int32(x) => typed_builder
971
+ .field_builder::<Int32Builder>(i)
972
+ .ok_or_else(|| {
973
+ MagnusError::new(
974
+ magnus::exception::type_error(),
975
+ "Failed to coerce into Int32Builder",
976
+ )
977
+ })?
978
+ .append_value(*x),
979
+ ParquetValue::Int64(x) => typed_builder
980
+ .field_builder::<Int64Builder>(i)
981
+ .ok_or_else(|| {
982
+ MagnusError::new(
983
+ magnus::exception::type_error(),
984
+ "Failed to coerce into Int64Builder",
985
+ )
986
+ })?
987
+ .append_value(*x),
988
+ ParquetValue::UInt8(x) => typed_builder
989
+ .field_builder::<UInt8Builder>(i)
990
+ .ok_or_else(|| {
991
+ MagnusError::new(
992
+ magnus::exception::type_error(),
993
+ "Failed to coerce into UInt8Builder",
994
+ )
995
+ })?
996
+ .append_value(*x),
997
+ ParquetValue::UInt16(x) => typed_builder
998
+ .field_builder::<UInt16Builder>(i)
999
+ .ok_or_else(|| {
1000
+ MagnusError::new(
1001
+ magnus::exception::type_error(),
1002
+ "Failed to coerce into UInt16Builder",
1003
+ )
1004
+ })?
1005
+ .append_value(*x),
1006
+ ParquetValue::UInt32(x) => typed_builder
1007
+ .field_builder::<UInt32Builder>(i)
1008
+ .ok_or_else(|| {
1009
+ MagnusError::new(
1010
+ magnus::exception::type_error(),
1011
+ "Failed to coerce into UInt32Builder",
1012
+ )
1013
+ })?
1014
+ .append_value(*x),
1015
+ ParquetValue::UInt64(x) => typed_builder
1016
+ .field_builder::<UInt64Builder>(i)
1017
+ .ok_or_else(|| {
1018
+ MagnusError::new(
1019
+ magnus::exception::type_error(),
1020
+ "Failed to coerce into UInt64Builder",
1021
+ )
1022
+ })?
1023
+ .append_value(*x),
1024
+ ParquetValue::Float16(_) => {
1025
+ return Err(MagnusError::new(
1026
+ magnus::exception::runtime_error(),
1027
+ "Float16 not supported",
1028
+ ))
1029
+ }
1030
+ ParquetValue::Float32(x) => typed_builder
1031
+ .field_builder::<Float32Builder>(i)
1032
+ .ok_or_else(|| {
1033
+ MagnusError::new(
1034
+ magnus::exception::type_error(),
1035
+ "Failed to coerce into Float32Builder",
1036
+ )
1037
+ })?
1038
+ .append_value(*x),
1039
+ ParquetValue::Float64(x) => typed_builder
1040
+ .field_builder::<Float64Builder>(i)
1041
+ .ok_or_else(|| {
1042
+ MagnusError::new(
1043
+ magnus::exception::type_error(),
1044
+ "Failed to coerce into Float64Builder",
1045
+ )
1046
+ })?
1047
+ .append_value(*x),
1048
+ ParquetValue::Boolean(x) => typed_builder
1049
+ .field_builder::<BooleanBuilder>(i)
1050
+ .ok_or_else(|| {
1051
+ MagnusError::new(
1052
+ magnus::exception::type_error(),
1053
+ "Failed to coerce into BooleanBuilder",
1054
+ )
1055
+ })?
1056
+ .append_value(*x),
1057
+ ParquetValue::String(x) => typed_builder
1058
+ .field_builder::<StringBuilder>(i)
1059
+ .ok_or_else(|| {
1060
+ MagnusError::new(
1061
+ magnus::exception::type_error(),
1062
+ "Failed to coerce into StringBuilder",
1063
+ )
1064
+ })?
1065
+ .append_value(x),
1066
+ ParquetValue::Bytes(bytes) => typed_builder
1067
+ .field_builder::<BinaryBuilder>(i)
1068
+ .ok_or_else(|| {
1069
+ MagnusError::new(
1070
+ magnus::exception::type_error(),
1071
+ "Failed to coerce into BinaryBuilder",
1072
+ )
1073
+ })?
1074
+ .append_value(bytes),
1075
+ ParquetValue::Date32(x) => typed_builder
1076
+ .field_builder::<Date32Builder>(i)
1077
+ .ok_or_else(|| {
1078
+ MagnusError::new(
1079
+ magnus::exception::type_error(),
1080
+ "Failed to coerce into Date32Builder",
1081
+ )
1082
+ })?
1083
+ .append_value(*x),
1084
+ ParquetValue::Date64(x) => typed_builder
1085
+ .field_builder::<Date64Builder>(i)
1086
+ .ok_or_else(|| {
1087
+ MagnusError::new(
1088
+ magnus::exception::type_error(),
1089
+ "Failed to coerce into Date64Builder",
1090
+ )
1091
+ })?
1092
+ .append_value(*x),
1093
+ ParquetValue::TimestampSecond(x, _tz) => typed_builder
1094
+ .field_builder::<TimestampSecondBuilder>(i)
1095
+ .ok_or_else(|| {
1096
+ MagnusError::new(
1097
+ magnus::exception::type_error(),
1098
+ "Failed to coerce into TimestampSecondBuilder",
1099
+ )
1100
+ })?
1101
+ .append_value(*x),
1102
+ ParquetValue::TimestampMillis(x, _tz) => typed_builder
1103
+ .field_builder::<TimestampMillisecondBuilder>(i)
1104
+ .ok_or_else(|| {
1105
+ MagnusError::new(
1106
+ magnus::exception::type_error(),
1107
+ "Failed to coerce into TimestampMillisecondBuilder",
1108
+ )
1109
+ })?
1110
+ .append_value(*x),
1111
+ ParquetValue::TimestampMicros(x, _tz) => typed_builder
1112
+ .field_builder::<TimestampMicrosecondBuilder>(i)
1113
+ .ok_or_else(|| {
1114
+ MagnusError::new(
1115
+ magnus::exception::type_error(),
1116
+ "Failed to coerce into TimestampMicrosecondBuilder",
1117
+ )
1118
+ })?
1119
+ .append_value(*x),
1120
+ ParquetValue::TimestampNanos(x, _tz) => typed_builder
1121
+ .field_builder::<TimestampNanosecondBuilder>(i)
1122
+ .ok_or_else(|| {
1123
+ MagnusError::new(
1124
+ magnus::exception::type_error(),
1125
+ "Failed to coerce into TimestampNanosecondBuilder",
1126
+ )
1127
+ })?
1128
+ .append_value(*x),
1129
+ ParquetValue::List(items) => {
1130
+ let list_builder = typed_builder
1131
+ .field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
1132
+ .ok_or_else(|| {
1133
+ MagnusError::new(
1134
+ magnus::exception::type_error(),
1135
+ "Failed to coerce into ListBuilder",
1136
+ )
1137
+ })?;
1138
+ fill_builder(
1139
+ list_builder.values(),
1140
+ &struct_field.fields[i].type_,
1141
+ items,
1142
+ )?;
1143
+ list_builder.append(true);
1144
+ }
1145
+ ParquetValue::Map(map_data) => {
1146
+ let maybe_map_builder = typed_builder
1147
+ .field_builder::<MapBuilder<
1148
+ Box<dyn ArrayBuilder>,
1149
+ Box<dyn ArrayBuilder>,
1150
+ >>(i);
1151
+
1152
+ if let Some(map_builder) = maybe_map_builder {
1153
+ fill_builder(
1154
+ map_builder,
1155
+ &struct_field.fields[i].type_,
1156
+ &[ParquetValue::Map(map_data.clone())],
1157
+ )?;
1158
+ map_builder.append(true).map_err(|e| {
1159
+ MagnusError::new(
1160
+ magnus::exception::runtime_error(),
1161
+ format!("Failed to append map: {}", e),
1162
+ )
1163
+ })?;
1164
+ } else {
1165
+ let child_struct_builder = typed_builder
1166
+ .field_builder::<StructBuilder>(i)
1167
+ .ok_or_else(|| {
1168
+ MagnusError::new(
1169
+ magnus::exception::type_error(),
1170
+ "Failed to coerce into StructBuilder",
1171
+ )
1172
+ })?;
1173
+ fill_builder(
1174
+ child_struct_builder,
1175
+ &struct_field.fields[i].type_,
1176
+ &[ParquetValue::Map(map_data.clone())],
1177
+ )?;
1178
+ }
1179
+ }
1180
+ ParquetValue::Null => match struct_field.fields[i].type_ {
1181
+ ParquetSchemaType::Int8 => typed_builder
1182
+ .field_builder::<Int8Builder>(i)
1183
+ .ok_or_else(|| {
1184
+ MagnusError::new(
1185
+ magnus::exception::type_error(),
1186
+ "Failed to coerce into Int8Builder",
1187
+ )
1188
+ })?
1189
+ .append_null(),
1190
+ ParquetSchemaType::Int16 => typed_builder
1191
+ .field_builder::<Int16Builder>(i)
1192
+ .ok_or_else(|| {
1193
+ MagnusError::new(
1194
+ magnus::exception::type_error(),
1195
+ "Failed to coerce into Int16Builder",
1196
+ )
1197
+ })?
1198
+ .append_null(),
1199
+ ParquetSchemaType::Int32 => typed_builder
1200
+ .field_builder::<Int32Builder>(i)
1201
+ .ok_or_else(|| {
1202
+ MagnusError::new(
1203
+ magnus::exception::type_error(),
1204
+ "Failed to coerce into Int32Builder",
1205
+ )
1206
+ })?
1207
+ .append_null(),
1208
+ ParquetSchemaType::Int64 => typed_builder
1209
+ .field_builder::<Int64Builder>(i)
1210
+ .ok_or_else(|| {
1211
+ MagnusError::new(
1212
+ magnus::exception::type_error(),
1213
+ "Failed to coerce into Int64Builder",
1214
+ )
1215
+ })?
1216
+ .append_null(),
1217
+ ParquetSchemaType::UInt8 => typed_builder
1218
+ .field_builder::<UInt8Builder>(i)
1219
+ .ok_or_else(|| {
1220
+ MagnusError::new(
1221
+ magnus::exception::type_error(),
1222
+ "Failed to coerce into UInt8Builder",
1223
+ )
1224
+ })?
1225
+ .append_null(),
1226
+ ParquetSchemaType::UInt16 => typed_builder
1227
+ .field_builder::<UInt16Builder>(i)
1228
+ .ok_or_else(|| {
1229
+ MagnusError::new(
1230
+ magnus::exception::type_error(),
1231
+ "Failed to coerce into UInt16Builder",
1232
+ )
1233
+ })?
1234
+ .append_null(),
1235
+ ParquetSchemaType::UInt32 => typed_builder
1236
+ .field_builder::<UInt32Builder>(i)
1237
+ .ok_or_else(|| {
1238
+ MagnusError::new(
1239
+ magnus::exception::type_error(),
1240
+ "Failed to coerce into UInt32Builder",
1241
+ )
1242
+ })?
1243
+ .append_null(),
1244
+ ParquetSchemaType::UInt64 => typed_builder
1245
+ .field_builder::<UInt64Builder>(i)
1246
+ .ok_or_else(|| {
1247
+ MagnusError::new(
1248
+ magnus::exception::type_error(),
1249
+ "Failed to coerce into UInt64Builder",
1250
+ )
1251
+ })?
1252
+ .append_null(),
1253
+ ParquetSchemaType::Float => typed_builder
1254
+ .field_builder::<Float32Builder>(i)
1255
+ .ok_or_else(|| {
1256
+ MagnusError::new(
1257
+ magnus::exception::type_error(),
1258
+ "Failed to coerce into Float32Builder",
1259
+ )
1260
+ })?
1261
+ .append_null(),
1262
+ ParquetSchemaType::Double => typed_builder
1263
+ .field_builder::<Float64Builder>(i)
1264
+ .ok_or_else(|| {
1265
+ MagnusError::new(
1266
+ magnus::exception::type_error(),
1267
+ "Failed to coerce into Float64Builder",
1268
+ )
1269
+ })?
1270
+ .append_null(),
1271
+ ParquetSchemaType::String => typed_builder
1272
+ .field_builder::<StringBuilder>(i)
1273
+ .ok_or_else(|| {
1274
+ MagnusError::new(
1275
+ magnus::exception::type_error(),
1276
+ "Failed to coerce into StringBuilder",
1277
+ )
1278
+ })?
1279
+ .append_null(),
1280
+ ParquetSchemaType::Binary => typed_builder
1281
+ .field_builder::<BinaryBuilder>(i)
1282
+ .ok_or_else(|| {
1283
+ MagnusError::new(
1284
+ magnus::exception::type_error(),
1285
+ "Failed to coerce into BinaryBuilder",
1286
+ )
1287
+ })?
1288
+ .append_null(),
1289
+ ParquetSchemaType::Boolean => typed_builder
1290
+ .field_builder::<BooleanBuilder>(i)
1291
+ .ok_or_else(|| {
1292
+ MagnusError::new(
1293
+ magnus::exception::type_error(),
1294
+ "Failed to coerce into BooleanBuilder",
1295
+ )
1296
+ })?
1297
+ .append_null(),
1298
+ ParquetSchemaType::Date32 => typed_builder
1299
+ .field_builder::<Date32Builder>(i)
1300
+ .ok_or_else(|| {
1301
+ MagnusError::new(
1302
+ magnus::exception::type_error(),
1303
+ "Failed to coerce into Date32Builder",
1304
+ )
1305
+ })?
1306
+ .append_null(),
1307
+ ParquetSchemaType::TimestampMillis => typed_builder
1308
+ .field_builder::<TimestampMillisecondBuilder>(i)
1309
+ .ok_or_else(|| {
1310
+ MagnusError::new(
1311
+ magnus::exception::type_error(),
1312
+ "Failed to coerce into TimestampMillisecondBuilder",
1313
+ )
1314
+ })?
1315
+ .append_null(),
1316
+ ParquetSchemaType::TimestampMicros => typed_builder
1317
+ .field_builder::<TimestampMicrosecondBuilder>(i)
1318
+ .ok_or_else(|| {
1319
+ MagnusError::new(
1320
+ magnus::exception::type_error(),
1321
+ "Failed to coerce into TimestampMicrosecondBuilder",
1322
+ )
1323
+ })?
1324
+ .append_null(),
1325
+ ParquetSchemaType::List(_) => typed_builder
1326
+ .field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
1327
+ .ok_or_else(|| {
1328
+ MagnusError::new(
1329
+ magnus::exception::type_error(),
1330
+ "Failed to coerce into ListBuilder",
1331
+ )
1332
+ })?
1333
+ .append(false),
1334
+ ParquetSchemaType::Map(_) => {
1335
+ typed_builder
1336
+ .field_builder::<MapBuilder<
1337
+ Box<dyn ArrayBuilder>,
1338
+ Box<dyn ArrayBuilder>,
1339
+ >>(i)
1340
+ .ok_or_else(|| {
1341
+ MagnusError::new(
1342
+ magnus::exception::type_error(),
1343
+ "Failed to coerce into MapBuilder",
1344
+ )
1345
+ })?
1346
+ .append(false)
1347
+ .map_err(|e| {
1348
+ MagnusError::new(
1349
+ magnus::exception::runtime_error(),
1350
+ format!("Failed to append map: {}", e),
1351
+ )
1352
+ })?;
1353
+ }
1354
+ ParquetSchemaType::Struct(_) => typed_builder
1355
+ .field_builder::<StructBuilder>(i)
1356
+ .ok_or_else(|| {
1357
+ MagnusError::new(
1358
+ magnus::exception::type_error(),
1359
+ "Failed to coerce into StructBuilder",
1360
+ )
1361
+ })?
1362
+ .append_null(),
1363
+ },
789
1364
  }
1365
+ } else {
1366
+ return Err(MagnusError::new(
1367
+ magnus::exception::type_error(),
1368
+ format!("Field {} not found in map", i),
1369
+ ));
790
1370
  }
791
1371
  }
1372
+ typed_builder.append(true);
792
1373
  }
793
- ParquetValue::Null => list_builder.append_null(),
794
- _ => {
1374
+ other => {
795
1375
  return Err(MagnusError::new(
796
1376
  magnus::exception::type_error(),
797
- format!("Expected List, got {:?}", value),
798
- ))
1377
+ format!("Expected ParquetValue::Map(...) or Null, got {:?}", other),
1378
+ ));
799
1379
  }
800
1380
  }
801
1381
  }
802
- Ok(Arc::new(list_builder.finish()))
803
- }
804
- ParquetSchemaType::Map(_map_field) => {
805
- unimplemented!("Writing maps is not yet supported")
1382
+ Ok(())
806
1383
  }
807
1384
  }
808
1385
  }
809
1386
 
1387
+ /// Creates a final Arrow array from a list of ParquetValues and a schema type.
1388
+ /// This is your "unified" way to handle any nesting level.
1389
+ pub fn convert_parquet_values_to_arrow(
1390
+ values: Vec<ParquetValue>,
1391
+ type_: &ParquetSchemaType,
1392
+ ) -> Result<Arc<dyn Array>, ReaderError> {
1393
+ // Make sure we always have at least capacity 1 to avoid empty builders
1394
+ let capacity = if values.is_empty() { 1 } else { values.len() };
1395
+ let mut builder = create_arrow_builder_for_type(type_, Some(capacity))?;
1396
+
1397
+ fill_builder(&mut builder, type_, &values)?;
1398
+
1399
+ // Finish building the array
1400
+ let array = builder.finish();
1401
+
1402
+ Ok(Arc::new(array))
1403
+ }
1404
+
810
1405
  pub fn convert_ruby_array_to_arrow(
811
1406
  values: RArray,
812
1407
  type_: &ParquetSchemaType,
813
- ) -> Result<Arc<dyn Array>, MagnusError> {
1408
+ ) -> Result<Arc<dyn Array>, ReaderError> {
814
1409
  let mut parquet_values = Vec::with_capacity(values.len());
815
1410
  for value in values {
816
1411
  if value.is_nil() {
817
1412
  parquet_values.push(ParquetValue::Null);
818
1413
  continue;
819
1414
  }
820
- let parquet_value = ParquetValue::from_value(value, type_)?;
1415
+ let parquet_value = ParquetValue::from_value(value, type_, None)?;
821
1416
  parquet_values.push(parquet_value);
822
1417
  }
823
1418
  convert_parquet_values_to_arrow(parquet_values, type_)