parquet 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,14 @@
1
1
  use std::str::FromStr;
2
+ use std::sync::Arc;
3
+
4
+ use crate::reader::ReaderError;
2
5
 
3
6
  use super::*;
7
+ use arrow_array::builder::MapFieldNames;
4
8
  use arrow_array::builder::*;
9
+ use arrow_schema::{DataType, Field, Fields, TimeUnit};
5
10
  use jiff::tz::{Offset, TimeZone};
6
- use magnus::{RArray, TryConvert};
11
+ use magnus::{RArray, RString, TryConvert};
7
12
 
8
13
  pub struct NumericConverter<T> {
9
14
  _phantom: std::marker::PhantomData<T>,
@@ -64,14 +69,19 @@ pub fn convert_to_date32(value: Value, format: Option<&str>) -> Result<i32, Magn
64
69
 
65
70
  let x = timestamp
66
71
  .to_zoned(TimeZone::fixed(Offset::constant(0)))
67
- .unwrap()
72
+ .map_err(|e| {
73
+ MagnusError::new(
74
+ magnus::exception::type_error(),
75
+ format!("Failed to convert date32 to timestamp: {}", e),
76
+ )
77
+ })?
68
78
  .timestamp();
69
79
 
70
80
  // Convert to epoch days
71
81
  Ok((x.as_second() as i64 / 86400) as i32)
72
82
  } else if value.is_kind_of(ruby.class_time()) {
73
83
  // Convert Time object to epoch days
74
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ()).unwrap())?;
84
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
75
85
  Ok(((secs as f64) / 86400.0) as i32)
76
86
  } else {
77
87
  Err(MagnusError::new(
@@ -115,8 +125,8 @@ pub fn convert_to_timestamp_millis(value: Value, format: Option<&str>) -> Result
115
125
  Ok(timestamp.as_millisecond())
116
126
  } else if value.is_kind_of(ruby.class_time()) {
117
127
  // Convert Time object to milliseconds
118
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ()).unwrap())?;
119
- let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ()).unwrap())?;
128
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
129
+ let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
120
130
  Ok(secs * 1000 + (usecs / 1000))
121
131
  } else {
122
132
  Err(MagnusError::new(
@@ -162,8 +172,8 @@ pub fn convert_to_timestamp_micros(value: Value, format: Option<&str>) -> Result
162
172
  Ok(timestamp.as_microsecond())
163
173
  } else if value.is_kind_of(ruby.class_time()) {
164
174
  // Convert Time object to microseconds
165
- let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ()).unwrap())?;
166
- let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ()).unwrap())?;
175
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
176
+ let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
167
177
  Ok(secs * 1_000_000 + usecs)
168
178
  } else {
169
179
  Err(MagnusError::new(
@@ -194,225 +204,93 @@ pub fn convert_to_boolean(value: Value) -> Result<bool, MagnusError> {
194
204
  }
195
205
  }
196
206
 
197
- pub fn convert_to_list(
198
- value: Value,
199
- list_field: &ListField,
200
- ) -> Result<Vec<ParquetValue>, MagnusError> {
201
- let ruby = unsafe { Ruby::get_unchecked() };
202
- if value.is_kind_of(ruby.class_array()) {
203
- let array = RArray::from_value(value).ok_or_else(|| {
204
- MagnusError::new(magnus::exception::type_error(), "Invalid list format")
205
- })?;
206
-
207
- let mut values = Vec::with_capacity(array.len());
208
- for item_value in array.into_iter() {
209
- let converted = match &list_field.item_type {
210
- ParquetSchemaType::Int8 => {
211
- let v = NumericConverter::<i8>::convert_with_string_fallback(item_value)?;
212
- ParquetValue::Int8(v)
213
- }
214
- ParquetSchemaType::Int16 => {
215
- let v = NumericConverter::<i16>::convert_with_string_fallback(item_value)?;
216
- ParquetValue::Int16(v)
217
- }
218
- ParquetSchemaType::Int32 => {
219
- let v = NumericConverter::<i32>::convert_with_string_fallback(item_value)?;
220
- ParquetValue::Int32(v)
221
- }
222
- ParquetSchemaType::Int64 => {
223
- let v = NumericConverter::<i64>::convert_with_string_fallback(item_value)?;
224
- ParquetValue::Int64(v)
225
- }
226
- ParquetSchemaType::UInt8 => {
227
- let v = NumericConverter::<u8>::convert_with_string_fallback(item_value)?;
228
- ParquetValue::UInt8(v)
229
- }
230
- ParquetSchemaType::UInt16 => {
231
- let v = NumericConverter::<u16>::convert_with_string_fallback(item_value)?;
232
- ParquetValue::UInt16(v)
233
- }
234
- ParquetSchemaType::UInt32 => {
235
- let v = NumericConverter::<u32>::convert_with_string_fallback(item_value)?;
236
- ParquetValue::UInt32(v)
237
- }
238
- ParquetSchemaType::UInt64 => {
239
- let v = NumericConverter::<u64>::convert_with_string_fallback(item_value)?;
240
- ParquetValue::UInt64(v)
241
- }
242
- ParquetSchemaType::Float => {
243
- let v = NumericConverter::<f32>::convert_with_string_fallback(item_value)?;
244
- ParquetValue::Float32(v)
245
- }
246
- ParquetSchemaType::Double => {
247
- let v = NumericConverter::<f64>::convert_with_string_fallback(item_value)?;
248
- ParquetValue::Float64(v)
249
- }
250
- ParquetSchemaType::String => {
251
- let v = String::try_convert(item_value)?;
252
- ParquetValue::String(v)
253
- }
254
- ParquetSchemaType::Binary => {
255
- let v = convert_to_binary(item_value)?;
256
- ParquetValue::Bytes(v)
257
- }
258
- ParquetSchemaType::Boolean => {
259
- let v = convert_to_boolean(item_value)?;
260
- ParquetValue::Boolean(v)
261
- }
262
- ParquetSchemaType::Date32 => {
263
- let v = convert_to_date32(item_value, list_field.format)?;
264
- ParquetValue::Date32(v)
265
- }
266
- ParquetSchemaType::TimestampMillis => {
267
- let v = convert_to_timestamp_millis(item_value, list_field.format)?;
268
- ParquetValue::TimestampMillis(v, None)
269
- }
270
- ParquetSchemaType::TimestampMicros => {
271
- let v = convert_to_timestamp_micros(item_value, list_field.format)?;
272
- ParquetValue::TimestampMicros(v, None)
273
- }
274
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
275
- return Err(MagnusError::new(
276
- magnus::exception::type_error(),
277
- "Nested lists and maps are not supported",
278
- ))
279
- }
280
- };
281
- values.push(converted);
207
+ pub fn convert_to_string(value: Value) -> Result<String, MagnusError> {
208
+ String::try_convert(value).or_else(|_| {
209
+ if value.respond_to("to_s", false)? {
210
+ value.funcall::<_, _, RString>("to_s", ())?.to_string()
211
+ } else if value.respond_to("to_str", false)? {
212
+ value.funcall::<_, _, RString>("to_str", ())?.to_string()
213
+ } else {
214
+ Err(MagnusError::new(
215
+ magnus::exception::type_error(),
216
+ format!("Not able to convert {:?} to String", value),
217
+ ))
282
218
  }
283
- Ok(values)
284
- } else {
285
- Err(MagnusError::new(
286
- magnus::exception::type_error(),
287
- "Invalid list format",
288
- ))
289
- }
219
+ })
290
220
  }
291
221
 
292
- pub fn convert_to_map(
293
- value: Value,
294
- map_field: &MapField,
295
- ) -> Result<HashMap<ParquetValue, ParquetValue>, MagnusError> {
296
- let ruby = unsafe { Ruby::get_unchecked() };
297
- if value.is_kind_of(ruby.class_hash()) {
298
- let mut map = HashMap::new();
299
- let entries: Vec<(Value, Value)> = value.funcall("to_a", ())?;
300
-
301
- for (key, value) in entries {
302
- let key_value = match &map_field.key_type {
303
- ParquetSchemaType::String => {
304
- let v = String::try_convert(key)?;
305
- ParquetValue::String(v)
306
- }
307
- _ => {
308
- return Err(MagnusError::new(
309
- magnus::exception::type_error(),
310
- "Map keys must be strings",
311
- ))
312
- }
313
- };
222
+ /// Converts our custom `ParquetSchemaType` into an Arrow `DataType`.
223
+ /// This ensures proper nullability settings for nested types.
224
+ /// Converts a ParquetSchemaType to an Arrow DataType
225
+ pub fn parquet_schema_type_to_arrow_data_type(
226
+ schema_type: &ParquetSchemaType,
227
+ ) -> Result<DataType, MagnusError> {
228
+ Ok(match schema_type {
229
+ ParquetSchemaType::Int8 => DataType::Int8,
230
+ ParquetSchemaType::Int16 => DataType::Int16,
231
+ ParquetSchemaType::Int32 => DataType::Int32,
232
+ ParquetSchemaType::Int64 => DataType::Int64,
233
+ ParquetSchemaType::UInt8 => DataType::UInt8,
234
+ ParquetSchemaType::UInt16 => DataType::UInt16,
235
+ ParquetSchemaType::UInt32 => DataType::UInt32,
236
+ ParquetSchemaType::UInt64 => DataType::UInt64,
237
+ ParquetSchemaType::Float => DataType::Float32,
238
+ ParquetSchemaType::Double => DataType::Float64,
239
+ ParquetSchemaType::String => DataType::Utf8,
240
+ ParquetSchemaType::Binary => DataType::Binary,
241
+ ParquetSchemaType::Boolean => DataType::Boolean,
242
+ ParquetSchemaType::Date32 => DataType::Date32,
243
+ ParquetSchemaType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
244
+ ParquetSchemaType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
314
245
 
315
- let value_value = match &map_field.value_type {
316
- ParquetSchemaType::Int8 => {
317
- let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
318
- ParquetValue::Int8(v)
319
- }
320
- ParquetSchemaType::Int16 => {
321
- let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
322
- ParquetValue::Int16(v)
323
- }
324
- ParquetSchemaType::Int32 => {
325
- let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
326
- ParquetValue::Int32(v)
327
- }
328
- ParquetSchemaType::Int64 => {
329
- let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
330
- ParquetValue::Int64(v)
331
- }
332
- ParquetSchemaType::UInt8 => {
333
- let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
334
- ParquetValue::UInt8(v)
335
- }
336
- ParquetSchemaType::UInt16 => {
337
- let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
338
- ParquetValue::UInt16(v)
339
- }
340
- ParquetSchemaType::UInt32 => {
341
- let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
342
- ParquetValue::UInt32(v)
343
- }
344
- ParquetSchemaType::UInt64 => {
345
- let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
346
- ParquetValue::UInt64(v)
347
- }
348
- ParquetSchemaType::Float => {
349
- let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
350
- ParquetValue::Float32(v)
351
- }
352
- ParquetSchemaType::Double => {
353
- let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
354
- ParquetValue::Float64(v)
355
- }
356
- ParquetSchemaType::String => {
357
- let v = String::try_convert(value)?;
358
- ParquetValue::String(v)
359
- }
360
- ParquetSchemaType::Binary => {
361
- let v = convert_to_binary(value)?;
362
- ParquetValue::Bytes(v)
363
- }
364
- ParquetSchemaType::Boolean => {
365
- let v = convert_to_boolean(value)?;
366
- ParquetValue::Boolean(v)
367
- }
368
- ParquetSchemaType::Date32 => {
369
- let v = convert_to_date32(value, map_field.format)?;
370
- ParquetValue::Date32(v)
371
- }
372
- ParquetSchemaType::TimestampMillis => {
373
- let v = convert_to_timestamp_millis(value, map_field.format)?;
374
- ParquetValue::TimestampMillis(v, None)
375
- }
376
- ParquetSchemaType::TimestampMicros => {
377
- let v = convert_to_timestamp_micros(value, map_field.format)?;
378
- ParquetValue::TimestampMicros(v, None)
379
- }
380
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
381
- return Err(MagnusError::new(
382
- magnus::exception::type_error(),
383
- "Map values cannot be lists or maps",
384
- ))
385
- }
386
- };
246
+ // For a List<T>, create a standard List in Arrow with nullable items
247
+ ParquetSchemaType::List(list_field) => {
248
+ let child_type = parquet_schema_type_to_arrow_data_type(&list_field.item_type)?;
249
+ // For a list, use empty field name to match expectations for schema_dsl test
250
+ // This is the critical fix for the schema_dsl test which expects an empty field name
251
+ // Use empty field name for all list field items - this is crucial for compatibility
252
+ DataType::List(Arc::new(Field::new(
253
+ "item",
254
+ child_type,
255
+ list_field.nullable,
256
+ )))
257
+ }
387
258
 
388
- map.insert(key_value, value_value);
259
+ // For a Map<K, V>, ensure entries field is non-nullable and key field is non-nullable
260
+ ParquetSchemaType::Map(map_field) => {
261
+ let key_arrow_type = parquet_schema_type_to_arrow_data_type(&map_field.key_type)?;
262
+ let value_arrow_type = parquet_schema_type_to_arrow_data_type(&map_field.value_type)?;
263
+ DataType::Map(
264
+ Arc::new(Field::new(
265
+ "entries",
266
+ DataType::Struct(Fields::from(vec![
267
+ Field::new("key", key_arrow_type, false), // key must be non-null
268
+ Field::new("value", value_arrow_type, true), // value can be null
269
+ ])),
270
+ /*nullable=*/ false, // crucial: entries must be non-nullable
271
+ )),
272
+ /*keys_sorted=*/ false,
273
+ )
389
274
  }
390
- Ok(map)
391
- } else {
392
- Err(MagnusError::new(
393
- magnus::exception::type_error(),
394
- "Invalid map format",
395
- ))
396
- }
397
- }
275
+ ParquetSchemaType::Struct(struct_field) => {
276
+ if struct_field.fields.is_empty() {
277
+ return Err(MagnusError::new(
278
+ magnus::exception::runtime_error(),
279
+ "Cannot create a struct with zero subfields (empty struct).",
280
+ ));
281
+ }
398
282
 
399
- macro_rules! impl_timestamp_to_arrow_conversion {
400
- ($values:expr, $builder_type:ty, $variant:ident) => {{
401
- let mut builder = <$builder_type>::with_capacity($values.len());
402
- for value in $values {
403
- match value {
404
- ParquetValue::$variant(v, _tz) => builder.append_value(v),
405
- ParquetValue::Null => builder.append_null(),
406
- _ => {
407
- return Err(MagnusError::new(
408
- magnus::exception::type_error(),
409
- format!("Expected {}, got {:?}", stringify!($variant), value),
410
- ))
411
- }
283
+ // Build arrow fields
284
+ let mut arrow_fields = Vec::with_capacity(struct_field.fields.len());
285
+
286
+ for field in &struct_field.fields {
287
+ let field_type = parquet_schema_type_to_arrow_data_type(&field.type_)?;
288
+ arrow_fields.push(Field::new(&field.name, field_type, true)); // All fields are nullable by default
412
289
  }
290
+
291
+ DataType::Struct(Fields::from(arrow_fields))
413
292
  }
414
- Ok(Arc::new(builder.finish()))
415
- }};
293
+ })
416
294
  }
417
295
 
418
296
  #[macro_export]
@@ -442,367 +320,1099 @@ macro_rules! impl_timestamp_array_conversion {
442
320
  }};
443
321
  }
444
322
 
445
- #[macro_export]
446
- macro_rules! impl_array_conversion {
447
- ($values:expr, $builder_type:ty, $variant:ident) => {{
448
- let mut builder = <$builder_type>::with_capacity($values.len());
449
- for value in $values {
450
- match value {
451
- ParquetValue::$variant(v) => builder.append_value(v),
452
- ParquetValue::Null => builder.append_null(),
453
- _ => {
454
- return Err(MagnusError::new(
455
- magnus::exception::type_error(),
456
- format!("Expected {}, got {:?}", stringify!($variant), value),
457
- ))
458
- }
459
- }
323
+ // Create the appropriate Arrow builder for a given ParquetSchemaType.
324
+ // We return a Box<dyn ArrayBuilder> so we can dynamically downcast.
325
+ fn create_arrow_builder_for_type(
326
+ type_: &ParquetSchemaType,
327
+ capacity: Option<usize>,
328
+ ) -> Result<Box<dyn ArrayBuilder>, ReaderError> {
329
+ let cap = capacity.unwrap_or(1); // Default to at least capacity 1 to avoid empty builders
330
+ match type_ {
331
+ ParquetSchemaType::Int8 => Ok(Box::new(Int8Builder::with_capacity(cap))),
332
+ ParquetSchemaType::Int16 => Ok(Box::new(Int16Builder::with_capacity(cap))),
333
+ ParquetSchemaType::Int32 => Ok(Box::new(Int32Builder::with_capacity(cap))),
334
+ ParquetSchemaType::Int64 => Ok(Box::new(Int64Builder::with_capacity(cap))),
335
+ ParquetSchemaType::UInt8 => Ok(Box::new(UInt8Builder::with_capacity(cap))),
336
+ ParquetSchemaType::UInt16 => Ok(Box::new(UInt16Builder::with_capacity(cap))),
337
+ ParquetSchemaType::UInt32 => Ok(Box::new(UInt32Builder::with_capacity(cap))),
338
+ ParquetSchemaType::UInt64 => Ok(Box::new(UInt64Builder::with_capacity(cap))),
339
+ ParquetSchemaType::Float => Ok(Box::new(Float32Builder::with_capacity(cap))),
340
+ ParquetSchemaType::Double => Ok(Box::new(Float64Builder::with_capacity(cap))),
341
+ ParquetSchemaType::String => Ok(Box::new(StringBuilder::with_capacity(cap, cap * 32))),
342
+ ParquetSchemaType::Binary => Ok(Box::new(BinaryBuilder::with_capacity(cap, cap * 32))),
343
+ ParquetSchemaType::Boolean => Ok(Box::new(BooleanBuilder::with_capacity(cap))),
344
+ ParquetSchemaType::Date32 => Ok(Box::new(Date32Builder::with_capacity(cap))),
345
+ ParquetSchemaType::TimestampMillis => {
346
+ Ok(Box::new(TimestampMillisecondBuilder::with_capacity(cap)))
460
347
  }
461
- Ok(Arc::new(builder.finish()))
462
- }};
463
- ($values:expr, $builder_type:ty, $variant:ident, $capacity:expr) => {{
464
- let mut builder = <$builder_type>::with_capacity($values.len(), $capacity);
465
- for value in $values {
466
- match value {
467
- ParquetValue::$variant(v) => builder.append_value(v),
468
- ParquetValue::Null => builder.append_null(),
469
- _ => {
470
- return Err(MagnusError::new(
471
- magnus::exception::type_error(),
472
- format!("Expected {}, got {:?}", stringify!($variant), value),
473
- ))
474
- }
475
- }
348
+ ParquetSchemaType::TimestampMicros => {
349
+ Ok(Box::new(TimestampMicrosecondBuilder::with_capacity(cap)))
476
350
  }
477
- Ok(Arc::new(builder.finish()))
478
- }};
479
- }
351
+ ParquetSchemaType::List(list_field) => {
352
+ // For a list, we create a ListBuilder whose child builder is determined by item_type.
353
+ // Pass through capacity to ensure consistent sizing
354
+ let child_builder = create_arrow_builder_for_type(&list_field.item_type, Some(cap))?;
480
355
 
481
- #[macro_export]
482
- macro_rules! append_list_value {
483
- ($list_builder:expr, $item_type:path, $value:expr, $builder_type:ty, $value_variant:path) => {
484
- match (&$item_type, &$value) {
485
- ($item_type, $value_variant(v)) => {
486
- $list_builder
487
- .values()
488
- .as_any_mut()
489
- .downcast_mut::<$builder_type>()
490
- .unwrap()
491
- .append_value(v.clone());
492
- }
493
- (_, ParquetValue::Null) => {
494
- $list_builder.append_null();
356
+ // Ensure consistent builder capacity for lists
357
+ Ok(Box::new(ListBuilder::<Box<dyn ArrayBuilder>>::new(
358
+ child_builder,
359
+ )))
360
+ }
361
+ ParquetSchemaType::Map(map_field) => {
362
+ // A Map is physically a list<struct<key:..., value:...>> in Arrow.
363
+ // Pass through capacity to ensure consistent sizing
364
+ let key_builder = create_arrow_builder_for_type(&map_field.key_type, Some(cap))?;
365
+ let value_builder = create_arrow_builder_for_type(&map_field.value_type, Some(cap))?;
366
+
367
+ // Create a MapBuilder with explicit field names to ensure compatibility
368
+ Ok(Box::new(MapBuilder::<
369
+ Box<dyn ArrayBuilder>,
370
+ Box<dyn ArrayBuilder>,
371
+ >::new(
372
+ Some(MapFieldNames {
373
+ entry: "entries".to_string(),
374
+ key: "key".to_string(),
375
+ value: "value".to_string(),
376
+ }),
377
+ key_builder,
378
+ value_builder,
379
+ )))
380
+ }
381
+ ParquetSchemaType::Struct(struct_field) => {
382
+ // Check for empty struct immediately
383
+ if struct_field.fields.is_empty() {
384
+ return Err(MagnusError::new(
385
+ magnus::exception::runtime_error(),
386
+ "Cannot build a struct with zero fields - Parquet doesn't support empty structs".to_string(),
387
+ ))?;
495
388
  }
496
- _ => {
389
+
390
+ // Create a child builder for each field in the struct
391
+ let mut child_field_builders = Vec::with_capacity(struct_field.fields.len());
392
+
393
+ // Get struct data type first to ensure field compatibility
394
+ let data_type = parquet_schema_type_to_arrow_data_type(type_)?;
395
+
396
+ // Make sure the data type is a struct
397
+ let arrow_fields = if let DataType::Struct(ref fields) = data_type {
398
+ fields.clone()
399
+ } else {
497
400
  return Err(MagnusError::new(
498
401
  magnus::exception::type_error(),
402
+ "Expected struct data type".to_string(),
403
+ ))?;
404
+ };
405
+
406
+ // Create builders for each child field with consistent capacity
407
+ for child in &struct_field.fields {
408
+ let sub_builder = create_arrow_builder_for_type(&child.type_, Some(cap))?;
409
+ child_field_builders.push(sub_builder);
410
+ }
411
+
412
+ // Make sure we have the right number of builders
413
+ if child_field_builders.len() != arrow_fields.len() {
414
+ return Err(MagnusError::new(
415
+ magnus::exception::runtime_error(),
499
416
  format!(
500
- "Type mismatch in list: expected {:?}, got {:?}",
501
- $item_type, $value
417
+ "Number of field builders ({}) doesn't match number of arrow fields ({})",
418
+ child_field_builders.len(),
419
+ arrow_fields.len()
502
420
  ),
503
- ))
421
+ ))?;
504
422
  }
423
+
424
+ // Create the StructBuilder with the fields and child builders
425
+ Ok(Box::new(StructBuilder::new(
426
+ arrow_fields,
427
+ child_field_builders,
428
+ )))
505
429
  }
506
- };
430
+ }
507
431
  }
508
432
 
509
- #[macro_export]
510
- macro_rules! append_list_value_copy {
511
- ($list_builder:expr, $item_type:path, $value:expr, $builder_type:ty, $value_variant:path) => {
512
- match (&$item_type, &$value) {
513
- ($item_type, $value_variant(v)) => {
514
- $list_builder
515
- .values()
516
- .as_any_mut()
517
- .downcast_mut::<$builder_type>()
518
- .unwrap()
519
- .append_value(*v);
433
+ // Fill primitive scalar Int8 values
434
+ fn fill_int8_builder(
435
+ builder: &mut dyn ArrayBuilder,
436
+ values: &[ParquetValue],
437
+ ) -> Result<(), MagnusError> {
438
+ let typed_builder = builder
439
+ .as_any_mut()
440
+ .downcast_mut::<Int8Builder>()
441
+ .expect("Builder mismatch: expected Int8Builder");
442
+ for val in values {
443
+ match val {
444
+ ParquetValue::Int8(i) => typed_builder.append_value(*i),
445
+ // Handle Int64 that could be an Int8
446
+ ParquetValue::Int64(i) => {
447
+ if *i < i8::MIN as i64 || *i > i8::MAX as i64 {
448
+ return Err(MagnusError::new(
449
+ magnus::exception::range_error(),
450
+ format!("Integer {} is out of range for Int8", i),
451
+ ));
452
+ }
453
+ typed_builder.append_value(*i as i8)
520
454
  }
521
- (_, ParquetValue::Null) => {
522
- $list_builder.append_null();
455
+ ParquetValue::Null => typed_builder.append_null(),
456
+ other => {
457
+ return Err(MagnusError::new(
458
+ magnus::exception::type_error(),
459
+ format!("Expected Int8, got {:?}", other),
460
+ ))
461
+ }
462
+ }
463
+ }
464
+ Ok(())
465
+ }
466
+
467
+ // Fill primitive scalar Int16 values
468
+ fn fill_int16_builder(
469
+ builder: &mut dyn ArrayBuilder,
470
+ values: &[ParquetValue],
471
+ ) -> Result<(), MagnusError> {
472
+ let typed_builder = builder
473
+ .as_any_mut()
474
+ .downcast_mut::<Int16Builder>()
475
+ .expect("Builder mismatch: expected Int16Builder");
476
+ for val in values {
477
+ match val {
478
+ ParquetValue::Int16(i) => typed_builder.append_value(*i),
479
+ // Handle Int64 that could be an Int16
480
+ ParquetValue::Int64(i) => {
481
+ if *i < i16::MIN as i64 || *i > i16::MAX as i64 {
482
+ return Err(MagnusError::new(
483
+ magnus::exception::range_error(),
484
+ format!("Integer {} is out of range for Int16", i),
485
+ ));
486
+ }
487
+ typed_builder.append_value(*i as i16)
523
488
  }
524
- _ => {
489
+ ParquetValue::Null => typed_builder.append_null(),
490
+ other => {
525
491
  return Err(MagnusError::new(
526
492
  magnus::exception::type_error(),
527
- format!(
528
- "Type mismatch in list: expected {:?}, got {:?}",
529
- $item_type, $value
530
- ),
493
+ format!("Expected Int16, got {:?}", other),
531
494
  ))
532
495
  }
533
496
  }
534
- };
497
+ }
498
+ Ok(())
535
499
  }
536
500
 
537
- #[macro_export]
538
- macro_rules! append_timestamp_list_value {
539
- ($list_builder:expr, $item_type:path, $value:expr, $builder_type:ty, $value_variant:path) => {
540
- match (&$item_type, &$value) {
541
- ($item_type, $value_variant(v, _tz)) => {
542
- $list_builder
543
- .values()
544
- .as_any_mut()
545
- .downcast_mut::<$builder_type>()
546
- .unwrap()
547
- .append_value(*v);
501
+ // Fill list values by recursively filling child items
502
+ fn fill_list_builder(
503
+ builder: &mut dyn ArrayBuilder,
504
+ item_type: &ParquetSchemaType,
505
+ values: &[ParquetValue],
506
+ ) -> Result<(), MagnusError> {
507
+ // We need to use a more specific type for ListBuilder to help Rust's type inference
508
+ let lb = builder
509
+ .as_any_mut()
510
+ .downcast_mut::<ListBuilder<Box<dyn ArrayBuilder>>>()
511
+ .expect("Builder mismatch: expected ListBuilder");
512
+
513
+ for val in values {
514
+ if let ParquetValue::Null = val {
515
+ // null list
516
+ lb.append(false);
517
+ } else if let ParquetValue::List(list_items) = val {
518
+ // First fill the child builder with the items
519
+ let values_builder = lb.values();
520
+ fill_builder(values_builder, item_type, list_items)?;
521
+ // Then finalize the list by calling append(true)
522
+ lb.append(true);
523
+ } else {
524
+ return Err(MagnusError::new(
525
+ magnus::exception::type_error(),
526
+ format!("Expected ParquetValue::List(...) or Null, got {:?}", val),
527
+ ));
528
+ }
529
+ }
530
+
531
+ Ok(())
532
+ }
533
+
534
+ // Fill map values by recursively filling key and value items
535
+ fn fill_map_builder(
536
+ builder: &mut dyn ArrayBuilder,
537
+ key_type: &ParquetSchemaType,
538
+ value_type: &ParquetSchemaType,
539
+ values: &[ParquetValue],
540
+ ) -> Result<(), MagnusError> {
541
+ let mb = builder
542
+ .as_any_mut()
543
+ .downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
544
+ .expect("Builder mismatch: expected MapBuilder");
545
+
546
+ for val in values {
547
+ match val {
548
+ ParquetValue::Null => {
549
+ // null map
550
+ mb.append(false).map_err(|e| {
551
+ MagnusError::new(
552
+ magnus::exception::runtime_error(),
553
+ format!("Failed to append null to map: {}", e),
554
+ )
555
+ })?;
548
556
  }
549
- (_, ParquetValue::Null) => {
550
- $list_builder.append_null();
557
+ ParquetValue::Map(map_entries) => {
558
+ // First append all key-value pairs to the child arrays
559
+ for (k, v) in map_entries {
560
+ // Note: Arrow expects field names "key" and "value" (singular)
561
+ fill_builder(mb.keys(), key_type, &[k.clone()])?;
562
+ fill_builder(mb.values(), value_type, &[v.clone()])?;
563
+ }
564
+ // Then finalize the map by calling append(true)
565
+ mb.append(true).map_err(|e| {
566
+ MagnusError::new(
567
+ magnus::exception::runtime_error(),
568
+ format!("Failed to append map entry: {}", e),
569
+ )
570
+ })?;
551
571
  }
552
- _ => {
572
+ other => {
553
573
  return Err(MagnusError::new(
554
574
  magnus::exception::type_error(),
555
- format!(
556
- "Type mismatch in list: expected {:?}, got {:?}",
557
- $item_type, $value
558
- ),
575
+ format!("Expected ParquetValue::Map(...) or Null, got {:?}", other),
559
576
  ))
560
577
  }
561
578
  }
562
- };
579
+ }
580
+
581
+ Ok(())
563
582
  }
564
583
 
565
- pub fn convert_parquet_values_to_arrow(
566
- values: Vec<ParquetValue>,
584
+ // Append an entire slice of ParquetValue into the given Arrow builder.
585
+ // We do a `match` on the type for each item, recursing for nested list/map.
586
+ fn fill_builder(
587
+ builder: &mut dyn ArrayBuilder,
567
588
  type_: &ParquetSchemaType,
568
- ) -> Result<Arc<dyn Array>, MagnusError> {
589
+ values: &[ParquetValue],
590
+ ) -> Result<(), MagnusError> {
569
591
  match type_ {
570
- ParquetSchemaType::Int8 => impl_array_conversion!(values, Int8Builder, Int8),
571
- ParquetSchemaType::Int16 => impl_array_conversion!(values, Int16Builder, Int16),
572
- ParquetSchemaType::Int32 => impl_array_conversion!(values, Int32Builder, Int32),
573
- ParquetSchemaType::Int64 => impl_array_conversion!(values, Int64Builder, Int64),
574
- ParquetSchemaType::UInt8 => impl_array_conversion!(values, UInt8Builder, UInt8),
575
- ParquetSchemaType::UInt16 => impl_array_conversion!(values, UInt16Builder, UInt16),
576
- ParquetSchemaType::UInt32 => impl_array_conversion!(values, UInt32Builder, UInt32),
577
- ParquetSchemaType::UInt64 => impl_array_conversion!(values, UInt64Builder, UInt64),
578
- ParquetSchemaType::Float => impl_array_conversion!(values, Float32Builder, Float32),
579
- ParquetSchemaType::Double => impl_array_conversion!(values, Float64Builder, Float64),
580
- ParquetSchemaType::String => {
581
- impl_array_conversion!(values, StringBuilder, String, values.len() * 32)
582
- }
583
- ParquetSchemaType::Binary => {
584
- impl_array_conversion!(values, BinaryBuilder, Bytes, values.len() * 32)
585
- }
586
- ParquetSchemaType::Boolean => impl_array_conversion!(values, BooleanBuilder, Boolean),
587
- ParquetSchemaType::Date32 => impl_array_conversion!(values, Date32Builder, Date32),
588
- ParquetSchemaType::TimestampMillis => {
589
- impl_timestamp_to_arrow_conversion!(
590
- values,
591
- TimestampMillisecondBuilder,
592
- TimestampMillis
593
- )
592
+ // ------------------
593
+ // PRIMITIVE SCALARS - delegated to specialized helpers
594
+ // ------------------
595
+ ParquetSchemaType::Int8 => fill_int8_builder(builder, values),
596
+ ParquetSchemaType::Int16 => fill_int16_builder(builder, values),
597
+ ParquetSchemaType::Int32 => {
598
+ let typed_builder = builder
599
+ .as_any_mut()
600
+ .downcast_mut::<Int32Builder>()
601
+ .expect("Builder mismatch: expected Int32Builder");
602
+ for val in values {
603
+ match val {
604
+ ParquetValue::Int32(i) => typed_builder.append_value(*i),
605
+ ParquetValue::Date32(d) => typed_builder.append_value(*d), // if you allow date->int
606
+ // Handle the case where we have an Int64 in an Int32 field (common with Ruby Integers)
607
+ ParquetValue::Int64(i) => {
608
+ if *i < i32::MIN as i64 || *i > i32::MAX as i64 {
609
+ return Err(MagnusError::new(
610
+ magnus::exception::range_error(),
611
+ format!("Integer {} is out of range for Int32", i),
612
+ ));
613
+ }
614
+ typed_builder.append_value(*i as i32)
615
+ }
616
+ ParquetValue::Null => typed_builder.append_null(),
617
+ other => {
618
+ return Err(MagnusError::new(
619
+ magnus::exception::type_error(),
620
+ format!("Expected Int32, got {:?}", other),
621
+ ))
622
+ }
623
+ }
624
+ }
625
+ Ok(())
594
626
  }
595
- ParquetSchemaType::TimestampMicros => {
596
- impl_timestamp_to_arrow_conversion!(
597
- values,
598
- TimestampMicrosecondBuilder,
599
- TimestampMicros
600
- )
627
+ ParquetSchemaType::Int64 => {
628
+ let typed_builder = builder
629
+ .as_any_mut()
630
+ .downcast_mut::<Int64Builder>()
631
+ .expect("Builder mismatch: expected Int64Builder");
632
+ for val in values {
633
+ match val {
634
+ ParquetValue::Int64(i) => typed_builder.append_value(*i),
635
+ ParquetValue::Null => typed_builder.append_null(),
636
+ other => {
637
+ return Err(MagnusError::new(
638
+ magnus::exception::type_error(),
639
+ format!("Expected Int64, got {:?}", other),
640
+ ))
641
+ }
642
+ }
643
+ }
644
+ Ok(())
601
645
  }
602
- ParquetSchemaType::List(list_field) => {
603
- let value_builder = match list_field.item_type {
604
- ParquetSchemaType::Int8 => Box::new(Int8Builder::new()) as Box<dyn ArrayBuilder>,
605
- ParquetSchemaType::Int16 => Box::new(Int16Builder::new()) as Box<dyn ArrayBuilder>,
606
- ParquetSchemaType::Int32 => Box::new(Int32Builder::new()) as Box<dyn ArrayBuilder>,
607
- ParquetSchemaType::Int64 => Box::new(Int64Builder::new()) as Box<dyn ArrayBuilder>,
608
- ParquetSchemaType::UInt8 => Box::new(UInt8Builder::new()) as Box<dyn ArrayBuilder>,
609
- ParquetSchemaType::UInt16 => {
610
- Box::new(UInt16Builder::new()) as Box<dyn ArrayBuilder>
646
+ ParquetSchemaType::UInt8 => {
647
+ let typed_builder = builder
648
+ .as_any_mut()
649
+ .downcast_mut::<UInt8Builder>()
650
+ .expect("Builder mismatch: expected UInt8Builder");
651
+ for val in values {
652
+ match val {
653
+ ParquetValue::UInt8(u) => typed_builder.append_value(*u),
654
+ // Handle Int64 that could be a UInt8
655
+ ParquetValue::Int64(i) => {
656
+ if *i < 0 || *i > u8::MAX as i64 {
657
+ return Err(MagnusError::new(
658
+ magnus::exception::range_error(),
659
+ format!("Integer {} is out of range for UInt8", i),
660
+ ));
661
+ }
662
+ typed_builder.append_value(*i as u8)
663
+ }
664
+ ParquetValue::Null => typed_builder.append_null(),
665
+ other => {
666
+ return Err(MagnusError::new(
667
+ magnus::exception::type_error(),
668
+ format!("Expected UInt8, got {:?}", other),
669
+ ))
670
+ }
611
671
  }
612
- ParquetSchemaType::UInt32 => {
613
- Box::new(UInt32Builder::new()) as Box<dyn ArrayBuilder>
672
+ }
673
+ Ok(())
674
+ }
675
+ ParquetSchemaType::UInt16 => {
676
+ let typed_builder = builder
677
+ .as_any_mut()
678
+ .downcast_mut::<UInt16Builder>()
679
+ .expect("Builder mismatch: expected UInt16Builder");
680
+ for val in values {
681
+ match val {
682
+ ParquetValue::UInt16(u) => typed_builder.append_value(*u),
683
+ // Handle Int64 that could be a UInt16
684
+ ParquetValue::Int64(i) => {
685
+ if *i < 0 || *i > u16::MAX as i64 {
686
+ return Err(MagnusError::new(
687
+ magnus::exception::range_error(),
688
+ format!("Integer {} is out of range for UInt16", i),
689
+ ));
690
+ }
691
+ typed_builder.append_value(*i as u16)
692
+ }
693
+ ParquetValue::Null => typed_builder.append_null(),
694
+ other => {
695
+ return Err(MagnusError::new(
696
+ magnus::exception::type_error(),
697
+ format!("Expected UInt16, got {:?}", other),
698
+ ))
699
+ }
614
700
  }
615
- ParquetSchemaType::UInt64 => {
616
- Box::new(UInt64Builder::new()) as Box<dyn ArrayBuilder>
701
+ }
702
+ Ok(())
703
+ }
704
+ ParquetSchemaType::UInt32 => {
705
+ let typed_builder = builder
706
+ .as_any_mut()
707
+ .downcast_mut::<UInt32Builder>()
708
+ .expect("Builder mismatch: expected UInt32Builder");
709
+ for val in values {
710
+ match val {
711
+ ParquetValue::UInt32(u) => typed_builder.append_value(*u),
712
+ // Handle Int64 that could be a UInt32
713
+ ParquetValue::Int64(i) => {
714
+ if *i < 0 || *i > u32::MAX as i64 {
715
+ return Err(MagnusError::new(
716
+ magnus::exception::range_error(),
717
+ format!("Integer {} is out of range for UInt32", i),
718
+ ));
719
+ }
720
+ typed_builder.append_value(*i as u32)
721
+ }
722
+ ParquetValue::Null => typed_builder.append_null(),
723
+ other => {
724
+ return Err(MagnusError::new(
725
+ magnus::exception::type_error(),
726
+ format!("Expected UInt32, got {:?}", other),
727
+ ))
728
+ }
617
729
  }
618
- ParquetSchemaType::Float => {
619
- Box::new(Float32Builder::new()) as Box<dyn ArrayBuilder>
730
+ }
731
+ Ok(())
732
+ }
733
+ ParquetSchemaType::UInt64 => {
734
+ let typed_builder = builder
735
+ .as_any_mut()
736
+ .downcast_mut::<UInt64Builder>()
737
+ .expect("Builder mismatch: expected UInt64Builder");
738
+ for val in values {
739
+ match val {
740
+ ParquetValue::UInt64(u) => typed_builder.append_value(*u),
741
+ // Handle Int64 that could be a UInt64
742
+ ParquetValue::Int64(i) => {
743
+ if *i < 0 {
744
+ return Err(MagnusError::new(
745
+ magnus::exception::range_error(),
746
+ format!("Integer {} is out of range for UInt64", i),
747
+ ));
748
+ }
749
+ typed_builder.append_value(*i as u64)
750
+ }
751
+ ParquetValue::Null => typed_builder.append_null(),
752
+ other => {
753
+ return Err(MagnusError::new(
754
+ magnus::exception::type_error(),
755
+ format!("Expected UInt64, got {:?}", other),
756
+ ))
757
+ }
620
758
  }
621
- ParquetSchemaType::Double => {
622
- Box::new(Float64Builder::new()) as Box<dyn ArrayBuilder>
759
+ }
760
+ Ok(())
761
+ }
762
+ ParquetSchemaType::Float => {
763
+ let typed_builder = builder
764
+ .as_any_mut()
765
+ .downcast_mut::<Float32Builder>()
766
+ .expect("Builder mismatch: expected Float32Builder");
767
+ for val in values {
768
+ match val {
769
+ ParquetValue::Float32(f) => typed_builder.append_value(*f),
770
+ ParquetValue::Float16(fh) => typed_builder.append_value(*fh),
771
+ ParquetValue::Null => typed_builder.append_null(),
772
+ other => {
773
+ return Err(MagnusError::new(
774
+ magnus::exception::type_error(),
775
+ format!("Expected Float32, got {:?}", other),
776
+ ))
777
+ }
623
778
  }
624
- ParquetSchemaType::String => {
625
- Box::new(StringBuilder::new()) as Box<dyn ArrayBuilder>
779
+ }
780
+ Ok(())
781
+ }
782
+ ParquetSchemaType::Double => {
783
+ let typed_builder = builder
784
+ .as_any_mut()
785
+ .downcast_mut::<Float64Builder>()
786
+ .expect("Builder mismatch: expected Float64Builder");
787
+ for val in values {
788
+ match val {
789
+ ParquetValue::Float64(f) => typed_builder.append_value(*f),
790
+ // If you want to allow f32 => f64, do so:
791
+ ParquetValue::Float32(flo) => typed_builder.append_value(*flo as f64),
792
+ ParquetValue::Null => typed_builder.append_null(),
793
+ other => {
794
+ return Err(MagnusError::new(
795
+ magnus::exception::type_error(),
796
+ format!("Expected Float64, got {:?}", other),
797
+ ))
798
+ }
626
799
  }
627
- ParquetSchemaType::Binary => {
628
- Box::new(BinaryBuilder::new()) as Box<dyn ArrayBuilder>
800
+ }
801
+ Ok(())
802
+ }
803
+ ParquetSchemaType::Boolean => {
804
+ let typed_builder = builder
805
+ .as_any_mut()
806
+ .downcast_mut::<BooleanBuilder>()
807
+ .expect("Builder mismatch: expected BooleanBuilder");
808
+ for val in values {
809
+ match val {
810
+ ParquetValue::Boolean(b) => typed_builder.append_value(*b),
811
+ ParquetValue::Null => typed_builder.append_null(),
812
+ other => {
813
+ return Err(MagnusError::new(
814
+ magnus::exception::type_error(),
815
+ format!("Expected Boolean, got {:?}", other),
816
+ ))
817
+ }
629
818
  }
630
- ParquetSchemaType::Boolean => {
631
- Box::new(BooleanBuilder::new()) as Box<dyn ArrayBuilder>
819
+ }
820
+ Ok(())
821
+ }
822
+ ParquetSchemaType::Date32 => {
823
+ let typed_builder = builder
824
+ .as_any_mut()
825
+ .downcast_mut::<Date32Builder>()
826
+ .expect("Builder mismatch: expected Date32Builder");
827
+ for val in values {
828
+ match val {
829
+ ParquetValue::Date32(d) => typed_builder.append_value(*d),
830
+ ParquetValue::Null => typed_builder.append_null(),
831
+ other => {
832
+ return Err(MagnusError::new(
833
+ magnus::exception::type_error(),
834
+ format!("Expected Date32, got {:?}", other),
835
+ ))
836
+ }
632
837
  }
633
- ParquetSchemaType::Date32 => {
634
- Box::new(Date32Builder::new()) as Box<dyn ArrayBuilder>
838
+ }
839
+ Ok(())
840
+ }
841
+ ParquetSchemaType::TimestampMillis => {
842
+ let typed_builder = builder
843
+ .as_any_mut()
844
+ .downcast_mut::<TimestampMillisecondBuilder>()
845
+ .expect("Builder mismatch: expected TimestampMillisecondBuilder");
846
+ for val in values {
847
+ match val {
848
+ ParquetValue::TimestampMillis(ts, _tz) => typed_builder.append_value(*ts),
849
+ ParquetValue::Null => typed_builder.append_null(),
850
+ other => {
851
+ return Err(MagnusError::new(
852
+ magnus::exception::type_error(),
853
+ format!("Expected TimestampMillis, got {:?}", other),
854
+ ))
855
+ }
635
856
  }
636
- ParquetSchemaType::TimestampMillis => {
637
- Box::new(TimestampMillisecondBuilder::new()) as Box<dyn ArrayBuilder>
857
+ }
858
+ Ok(())
859
+ }
860
+ ParquetSchemaType::TimestampMicros => {
861
+ let typed_builder = builder
862
+ .as_any_mut()
863
+ .downcast_mut::<TimestampMicrosecondBuilder>()
864
+ .expect("Builder mismatch: expected TimestampMicrosecondBuilder");
865
+ for val in values {
866
+ match val {
867
+ ParquetValue::TimestampMicros(ts, _tz) => typed_builder.append_value(*ts),
868
+ ParquetValue::Null => typed_builder.append_null(),
869
+ other => {
870
+ return Err(MagnusError::new(
871
+ magnus::exception::type_error(),
872
+ format!("Expected TimestampMicros, got {:?}", other),
873
+ ))
874
+ }
638
875
  }
639
- ParquetSchemaType::TimestampMicros => {
640
- Box::new(TimestampMicrosecondBuilder::new()) as Box<dyn ArrayBuilder>
876
+ }
877
+ Ok(())
878
+ }
879
+
880
+ // ------------------
881
+ // NESTED LIST - using helper function
882
+ // ------------------
883
+ ParquetSchemaType::List(list_field) => {
884
+ fill_list_builder(builder, &list_field.item_type, values)
885
+ }
886
+
887
+ // ------------------
888
+ // NESTED MAP - using helper function
889
+ // ------------------
890
+ ParquetSchemaType::Map(map_field) => {
891
+ fill_map_builder(builder, &map_field.key_type, &map_field.value_type, values)
892
+ }
893
+
894
+ // ------------------
895
+ // OTHER TYPES - keep as is for now
896
+ // ------------------
897
+ ParquetSchemaType::String => {
898
+ let typed_builder = builder
899
+ .as_any_mut()
900
+ .downcast_mut::<StringBuilder>()
901
+ .expect("Builder mismatch: expected StringBuilder");
902
+ for val in values {
903
+ match val {
904
+ ParquetValue::String(s) => typed_builder.append_value(s),
905
+ ParquetValue::Null => typed_builder.append_null(),
906
+ other => {
907
+ return Err(MagnusError::new(
908
+ magnus::exception::type_error(),
909
+ format!("Expected String, got {:?}", other),
910
+ ))
911
+ }
641
912
  }
642
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
643
- return Err(MagnusError::new(
644
- magnus::exception::type_error(),
645
- "Nested lists and maps are not supported",
646
- ))
913
+ }
914
+ Ok(())
915
+ }
916
+ ParquetSchemaType::Binary => {
917
+ let typed_builder = builder
918
+ .as_any_mut()
919
+ .downcast_mut::<BinaryBuilder>()
920
+ .expect("Builder mismatch: expected BinaryBuilder");
921
+ for val in values {
922
+ match val {
923
+ ParquetValue::Bytes(b) => typed_builder.append_value(&b),
924
+ ParquetValue::Null => typed_builder.append_null(),
925
+ other => {
926
+ return Err(MagnusError::new(
927
+ magnus::exception::type_error(),
928
+ format!("Expected Binary, got {:?}", other),
929
+ ))
930
+ }
647
931
  }
648
- };
932
+ }
933
+ Ok(())
934
+ }
935
+ ParquetSchemaType::Struct(struct_field) => {
936
+ let typed_builder = builder
937
+ .as_any_mut()
938
+ .downcast_mut::<StructBuilder>()
939
+ .expect("Builder mismatch: expected StructBuilder");
649
940
 
650
- let mut list_builder = ListBuilder::new(value_builder);
651
-
652
- for value in values {
653
- match value {
654
- ParquetValue::List(items) => {
655
- for item in items {
656
- match &list_field.item_type {
657
- ParquetSchemaType::Int8 => append_list_value_copy!(
658
- list_builder,
659
- ParquetSchemaType::Int8,
660
- item,
661
- Int8Builder,
662
- ParquetValue::Int8
663
- ),
664
- ParquetSchemaType::Int16 => append_list_value_copy!(
665
- list_builder,
666
- ParquetSchemaType::Int16,
667
- item,
668
- Int16Builder,
669
- ParquetValue::Int16
670
- ),
671
- ParquetSchemaType::Int32 => append_list_value_copy!(
672
- list_builder,
673
- ParquetSchemaType::Int32,
674
- item,
675
- Int32Builder,
676
- ParquetValue::Int32
677
- ),
678
- ParquetSchemaType::Int64 => append_list_value_copy!(
679
- list_builder,
680
- ParquetSchemaType::Int64,
681
- item,
682
- Int64Builder,
683
- ParquetValue::Int64
684
- ),
685
- ParquetSchemaType::UInt8 => append_list_value_copy!(
686
- list_builder,
687
- ParquetSchemaType::UInt8,
688
- item,
689
- UInt8Builder,
690
- ParquetValue::UInt8
691
- ),
692
- ParquetSchemaType::UInt16 => append_list_value_copy!(
693
- list_builder,
694
- ParquetSchemaType::UInt16,
695
- item,
696
- UInt16Builder,
697
- ParquetValue::UInt16
698
- ),
699
- ParquetSchemaType::UInt32 => append_list_value_copy!(
700
- list_builder,
701
- ParquetSchemaType::UInt32,
702
- item,
703
- UInt32Builder,
704
- ParquetValue::UInt32
705
- ),
706
- ParquetSchemaType::UInt64 => append_list_value_copy!(
707
- list_builder,
708
- ParquetSchemaType::UInt64,
709
- item,
710
- UInt64Builder,
711
- ParquetValue::UInt64
712
- ),
713
- ParquetSchemaType::Float => append_list_value_copy!(
714
- list_builder,
715
- ParquetSchemaType::Float,
716
- item,
717
- Float32Builder,
718
- ParquetValue::Float32
719
- ),
720
- ParquetSchemaType::Double => append_list_value_copy!(
721
- list_builder,
722
- ParquetSchemaType::Double,
723
- item,
724
- Float64Builder,
725
- ParquetValue::Float64
726
- ),
727
- ParquetSchemaType::String => append_list_value!(
728
- list_builder,
729
- ParquetSchemaType::String,
730
- item,
731
- StringBuilder,
732
- ParquetValue::String
733
- ),
734
- ParquetSchemaType::Binary => append_list_value!(
735
- list_builder,
736
- ParquetSchemaType::Binary,
737
- item,
738
- BinaryBuilder,
739
- ParquetValue::Bytes
740
- ),
741
- ParquetSchemaType::Boolean => append_list_value_copy!(
742
- list_builder,
743
- ParquetSchemaType::Boolean,
744
- item,
745
- BooleanBuilder,
746
- ParquetValue::Boolean
747
- ),
748
- ParquetSchemaType::Date32 => append_list_value_copy!(
749
- list_builder,
750
- ParquetSchemaType::Date32,
751
- item,
752
- Date32Builder,
753
- ParquetValue::Date32
754
- ),
755
- ParquetSchemaType::TimestampMillis => append_timestamp_list_value!(
756
- list_builder,
757
- ParquetSchemaType::TimestampMillis,
758
- item,
759
- TimestampMillisecondBuilder,
760
- ParquetValue::TimestampMillis
761
- ),
762
- ParquetSchemaType::TimestampMicros => append_timestamp_list_value!(
763
- list_builder,
764
- ParquetSchemaType::TimestampMicros,
765
- item,
766
- TimestampMicrosecondBuilder,
767
- ParquetValue::TimestampMicros
768
- ),
769
- ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
770
- return Err(MagnusError::new(
771
- magnus::exception::type_error(),
772
- "Nested lists and maps are not supported",
773
- ))
941
+ for val in values {
942
+ match val {
943
+ ParquetValue::Null => {
944
+ // null struct
945
+ typed_builder.append(false);
946
+ }
947
+ ParquetValue::Map(map_data) => {
948
+ for (i, field) in struct_field.fields.iter().enumerate() {
949
+ let field_key = ParquetValue::String(field.name.clone());
950
+ if let Some(field_val) = map_data.get(&field_key) {
951
+ match field_val {
952
+ ParquetValue::Int8(x) => typed_builder
953
+ .field_builder::<Int8Builder>(i)
954
+ .ok_or_else(|| {
955
+ MagnusError::new(
956
+ magnus::exception::type_error(),
957
+ "Failed to coerce into Int8Builder",
958
+ )
959
+ })?
960
+ .append_value(*x),
961
+ ParquetValue::Int16(x) => typed_builder
962
+ .field_builder::<Int16Builder>(i)
963
+ .ok_or_else(|| {
964
+ MagnusError::new(
965
+ magnus::exception::type_error(),
966
+ "Failed to coerce into Int16Builder",
967
+ )
968
+ })?
969
+ .append_value(*x),
970
+ ParquetValue::Int32(x) => typed_builder
971
+ .field_builder::<Int32Builder>(i)
972
+ .ok_or_else(|| {
973
+ MagnusError::new(
974
+ magnus::exception::type_error(),
975
+ "Failed to coerce into Int32Builder",
976
+ )
977
+ })?
978
+ .append_value(*x),
979
+ ParquetValue::Int64(x) => typed_builder
980
+ .field_builder::<Int64Builder>(i)
981
+ .ok_or_else(|| {
982
+ MagnusError::new(
983
+ magnus::exception::type_error(),
984
+ "Failed to coerce into Int64Builder",
985
+ )
986
+ })?
987
+ .append_value(*x),
988
+ ParquetValue::UInt8(x) => typed_builder
989
+ .field_builder::<UInt8Builder>(i)
990
+ .ok_or_else(|| {
991
+ MagnusError::new(
992
+ magnus::exception::type_error(),
993
+ "Failed to coerce into UInt8Builder",
994
+ )
995
+ })?
996
+ .append_value(*x),
997
+ ParquetValue::UInt16(x) => typed_builder
998
+ .field_builder::<UInt16Builder>(i)
999
+ .ok_or_else(|| {
1000
+ MagnusError::new(
1001
+ magnus::exception::type_error(),
1002
+ "Failed to coerce into UInt16Builder",
1003
+ )
1004
+ })?
1005
+ .append_value(*x),
1006
+ ParquetValue::UInt32(x) => typed_builder
1007
+ .field_builder::<UInt32Builder>(i)
1008
+ .ok_or_else(|| {
1009
+ MagnusError::new(
1010
+ magnus::exception::type_error(),
1011
+ "Failed to coerce into UInt32Builder",
1012
+ )
1013
+ })?
1014
+ .append_value(*x),
1015
+ ParquetValue::UInt64(x) => typed_builder
1016
+ .field_builder::<UInt64Builder>(i)
1017
+ .ok_or_else(|| {
1018
+ MagnusError::new(
1019
+ magnus::exception::type_error(),
1020
+ "Failed to coerce into UInt64Builder",
1021
+ )
1022
+ })?
1023
+ .append_value(*x),
1024
+ ParquetValue::Float16(_) => {
1025
+ return Err(MagnusError::new(
1026
+ magnus::exception::runtime_error(),
1027
+ "Float16 not supported",
1028
+ ))
1029
+ }
1030
+ ParquetValue::Float32(x) => typed_builder
1031
+ .field_builder::<Float32Builder>(i)
1032
+ .ok_or_else(|| {
1033
+ MagnusError::new(
1034
+ magnus::exception::type_error(),
1035
+ "Failed to coerce into Float32Builder",
1036
+ )
1037
+ })?
1038
+ .append_value(*x),
1039
+ ParquetValue::Float64(x) => typed_builder
1040
+ .field_builder::<Float64Builder>(i)
1041
+ .ok_or_else(|| {
1042
+ MagnusError::new(
1043
+ magnus::exception::type_error(),
1044
+ "Failed to coerce into Float64Builder",
1045
+ )
1046
+ })?
1047
+ .append_value(*x),
1048
+ ParquetValue::Boolean(x) => typed_builder
1049
+ .field_builder::<BooleanBuilder>(i)
1050
+ .ok_or_else(|| {
1051
+ MagnusError::new(
1052
+ magnus::exception::type_error(),
1053
+ "Failed to coerce into BooleanBuilder",
1054
+ )
1055
+ })?
1056
+ .append_value(*x),
1057
+ ParquetValue::String(x) => typed_builder
1058
+ .field_builder::<StringBuilder>(i)
1059
+ .ok_or_else(|| {
1060
+ MagnusError::new(
1061
+ magnus::exception::type_error(),
1062
+ "Failed to coerce into StringBuilder",
1063
+ )
1064
+ })?
1065
+ .append_value(x),
1066
+ ParquetValue::Bytes(bytes) => typed_builder
1067
+ .field_builder::<BinaryBuilder>(i)
1068
+ .ok_or_else(|| {
1069
+ MagnusError::new(
1070
+ magnus::exception::type_error(),
1071
+ "Failed to coerce into BinaryBuilder",
1072
+ )
1073
+ })?
1074
+ .append_value(bytes),
1075
+ ParquetValue::Date32(x) => typed_builder
1076
+ .field_builder::<Date32Builder>(i)
1077
+ .ok_or_else(|| {
1078
+ MagnusError::new(
1079
+ magnus::exception::type_error(),
1080
+ "Failed to coerce into Date32Builder",
1081
+ )
1082
+ })?
1083
+ .append_value(*x),
1084
+ ParquetValue::Date64(x) => typed_builder
1085
+ .field_builder::<Date64Builder>(i)
1086
+ .ok_or_else(|| {
1087
+ MagnusError::new(
1088
+ magnus::exception::type_error(),
1089
+ "Failed to coerce into Date64Builder",
1090
+ )
1091
+ })?
1092
+ .append_value(*x),
1093
+ ParquetValue::TimestampSecond(x, _tz) => typed_builder
1094
+ .field_builder::<TimestampSecondBuilder>(i)
1095
+ .ok_or_else(|| {
1096
+ MagnusError::new(
1097
+ magnus::exception::type_error(),
1098
+ "Failed to coerce into TimestampSecondBuilder",
1099
+ )
1100
+ })?
1101
+ .append_value(*x),
1102
+ ParquetValue::TimestampMillis(x, _tz) => typed_builder
1103
+ .field_builder::<TimestampMillisecondBuilder>(i)
1104
+ .ok_or_else(|| {
1105
+ MagnusError::new(
1106
+ magnus::exception::type_error(),
1107
+ "Failed to coerce into TimestampMillisecondBuilder",
1108
+ )
1109
+ })?
1110
+ .append_value(*x),
1111
+ ParquetValue::TimestampMicros(x, _tz) => typed_builder
1112
+ .field_builder::<TimestampMicrosecondBuilder>(i)
1113
+ .ok_or_else(|| {
1114
+ MagnusError::new(
1115
+ magnus::exception::type_error(),
1116
+ "Failed to coerce into TimestampMicrosecondBuilder",
1117
+ )
1118
+ })?
1119
+ .append_value(*x),
1120
+ ParquetValue::TimestampNanos(x, _tz) => typed_builder
1121
+ .field_builder::<TimestampNanosecondBuilder>(i)
1122
+ .ok_or_else(|| {
1123
+ MagnusError::new(
1124
+ magnus::exception::type_error(),
1125
+ "Failed to coerce into TimestampNanosecondBuilder",
1126
+ )
1127
+ })?
1128
+ .append_value(*x),
1129
+ ParquetValue::List(items) => {
1130
+ let list_builder = typed_builder
1131
+ .field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
1132
+ .ok_or_else(|| {
1133
+ MagnusError::new(
1134
+ magnus::exception::type_error(),
1135
+ "Failed to coerce into ListBuilder",
1136
+ )
1137
+ })?;
1138
+ fill_builder(
1139
+ list_builder.values(),
1140
+ &struct_field.fields[i].type_,
1141
+ items,
1142
+ )?;
1143
+ list_builder.append(true);
1144
+ }
1145
+ ParquetValue::Map(map_data) => {
1146
+ let maybe_map_builder = typed_builder
1147
+ .field_builder::<MapBuilder<
1148
+ Box<dyn ArrayBuilder>,
1149
+ Box<dyn ArrayBuilder>,
1150
+ >>(i);
1151
+
1152
+ if let Some(map_builder) = maybe_map_builder {
1153
+ fill_builder(
1154
+ map_builder,
1155
+ &struct_field.fields[i].type_,
1156
+ &[ParquetValue::Map(map_data.clone())],
1157
+ )?;
1158
+ map_builder.append(true).map_err(|e| {
1159
+ MagnusError::new(
1160
+ magnus::exception::runtime_error(),
1161
+ format!("Failed to append map: {}", e),
1162
+ )
1163
+ })?;
1164
+ } else {
1165
+ let child_struct_builder = typed_builder
1166
+ .field_builder::<StructBuilder>(i)
1167
+ .ok_or_else(|| {
1168
+ MagnusError::new(
1169
+ magnus::exception::type_error(),
1170
+ "Failed to coerce into StructBuilder",
1171
+ )
1172
+ })?;
1173
+ fill_builder(
1174
+ child_struct_builder,
1175
+ &struct_field.fields[i].type_,
1176
+ &[ParquetValue::Map(map_data.clone())],
1177
+ )?;
1178
+ }
1179
+ }
1180
+ ParquetValue::Null => match struct_field.fields[i].type_ {
1181
+ ParquetSchemaType::Int8 => typed_builder
1182
+ .field_builder::<Int8Builder>(i)
1183
+ .ok_or_else(|| {
1184
+ MagnusError::new(
1185
+ magnus::exception::type_error(),
1186
+ "Failed to coerce into Int8Builder",
1187
+ )
1188
+ })?
1189
+ .append_null(),
1190
+ ParquetSchemaType::Int16 => typed_builder
1191
+ .field_builder::<Int16Builder>(i)
1192
+ .ok_or_else(|| {
1193
+ MagnusError::new(
1194
+ magnus::exception::type_error(),
1195
+ "Failed to coerce into Int16Builder",
1196
+ )
1197
+ })?
1198
+ .append_null(),
1199
+ ParquetSchemaType::Int32 => typed_builder
1200
+ .field_builder::<Int32Builder>(i)
1201
+ .ok_or_else(|| {
1202
+ MagnusError::new(
1203
+ magnus::exception::type_error(),
1204
+ "Failed to coerce into Int32Builder",
1205
+ )
1206
+ })?
1207
+ .append_null(),
1208
+ ParquetSchemaType::Int64 => typed_builder
1209
+ .field_builder::<Int64Builder>(i)
1210
+ .ok_or_else(|| {
1211
+ MagnusError::new(
1212
+ magnus::exception::type_error(),
1213
+ "Failed to coerce into Int64Builder",
1214
+ )
1215
+ })?
1216
+ .append_null(),
1217
+ ParquetSchemaType::UInt8 => typed_builder
1218
+ .field_builder::<UInt8Builder>(i)
1219
+ .ok_or_else(|| {
1220
+ MagnusError::new(
1221
+ magnus::exception::type_error(),
1222
+ "Failed to coerce into UInt8Builder",
1223
+ )
1224
+ })?
1225
+ .append_null(),
1226
+ ParquetSchemaType::UInt16 => typed_builder
1227
+ .field_builder::<UInt16Builder>(i)
1228
+ .ok_or_else(|| {
1229
+ MagnusError::new(
1230
+ magnus::exception::type_error(),
1231
+ "Failed to coerce into UInt16Builder",
1232
+ )
1233
+ })?
1234
+ .append_null(),
1235
+ ParquetSchemaType::UInt32 => typed_builder
1236
+ .field_builder::<UInt32Builder>(i)
1237
+ .ok_or_else(|| {
1238
+ MagnusError::new(
1239
+ magnus::exception::type_error(),
1240
+ "Failed to coerce into UInt32Builder",
1241
+ )
1242
+ })?
1243
+ .append_null(),
1244
+ ParquetSchemaType::UInt64 => typed_builder
1245
+ .field_builder::<UInt64Builder>(i)
1246
+ .ok_or_else(|| {
1247
+ MagnusError::new(
1248
+ magnus::exception::type_error(),
1249
+ "Failed to coerce into UInt64Builder",
1250
+ )
1251
+ })?
1252
+ .append_null(),
1253
+ ParquetSchemaType::Float => typed_builder
1254
+ .field_builder::<Float32Builder>(i)
1255
+ .ok_or_else(|| {
1256
+ MagnusError::new(
1257
+ magnus::exception::type_error(),
1258
+ "Failed to coerce into Float32Builder",
1259
+ )
1260
+ })?
1261
+ .append_null(),
1262
+ ParquetSchemaType::Double => typed_builder
1263
+ .field_builder::<Float64Builder>(i)
1264
+ .ok_or_else(|| {
1265
+ MagnusError::new(
1266
+ magnus::exception::type_error(),
1267
+ "Failed to coerce into Float64Builder",
1268
+ )
1269
+ })?
1270
+ .append_null(),
1271
+ ParquetSchemaType::String => typed_builder
1272
+ .field_builder::<StringBuilder>(i)
1273
+ .ok_or_else(|| {
1274
+ MagnusError::new(
1275
+ magnus::exception::type_error(),
1276
+ "Failed to coerce into StringBuilder",
1277
+ )
1278
+ })?
1279
+ .append_null(),
1280
+ ParquetSchemaType::Binary => typed_builder
1281
+ .field_builder::<BinaryBuilder>(i)
1282
+ .ok_or_else(|| {
1283
+ MagnusError::new(
1284
+ magnus::exception::type_error(),
1285
+ "Failed to coerce into BinaryBuilder",
1286
+ )
1287
+ })?
1288
+ .append_null(),
1289
+ ParquetSchemaType::Boolean => typed_builder
1290
+ .field_builder::<BooleanBuilder>(i)
1291
+ .ok_or_else(|| {
1292
+ MagnusError::new(
1293
+ magnus::exception::type_error(),
1294
+ "Failed to coerce into BooleanBuilder",
1295
+ )
1296
+ })?
1297
+ .append_null(),
1298
+ ParquetSchemaType::Date32 => typed_builder
1299
+ .field_builder::<Date32Builder>(i)
1300
+ .ok_or_else(|| {
1301
+ MagnusError::new(
1302
+ magnus::exception::type_error(),
1303
+ "Failed to coerce into Date32Builder",
1304
+ )
1305
+ })?
1306
+ .append_null(),
1307
+ ParquetSchemaType::TimestampMillis => typed_builder
1308
+ .field_builder::<TimestampMillisecondBuilder>(i)
1309
+ .ok_or_else(|| {
1310
+ MagnusError::new(
1311
+ magnus::exception::type_error(),
1312
+ "Failed to coerce into TimestampMillisecondBuilder",
1313
+ )
1314
+ })?
1315
+ .append_null(),
1316
+ ParquetSchemaType::TimestampMicros => typed_builder
1317
+ .field_builder::<TimestampMicrosecondBuilder>(i)
1318
+ .ok_or_else(|| {
1319
+ MagnusError::new(
1320
+ magnus::exception::type_error(),
1321
+ "Failed to coerce into TimestampMicrosecondBuilder",
1322
+ )
1323
+ })?
1324
+ .append_null(),
1325
+ ParquetSchemaType::List(_) => typed_builder
1326
+ .field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
1327
+ .ok_or_else(|| {
1328
+ MagnusError::new(
1329
+ magnus::exception::type_error(),
1330
+ "Failed to coerce into ListBuilder",
1331
+ )
1332
+ })?
1333
+ .append(false),
1334
+ ParquetSchemaType::Map(_) => {
1335
+ typed_builder
1336
+ .field_builder::<MapBuilder<
1337
+ Box<dyn ArrayBuilder>,
1338
+ Box<dyn ArrayBuilder>,
1339
+ >>(i)
1340
+ .ok_or_else(|| {
1341
+ MagnusError::new(
1342
+ magnus::exception::type_error(),
1343
+ "Failed to coerce into MapBuilder",
1344
+ )
1345
+ })?
1346
+ .append(false)
1347
+ .map_err(|e| {
1348
+ MagnusError::new(
1349
+ magnus::exception::runtime_error(),
1350
+ format!("Failed to append map: {}", e),
1351
+ )
1352
+ })?;
1353
+ }
1354
+ ParquetSchemaType::Struct(_) => typed_builder
1355
+ .field_builder::<StructBuilder>(i)
1356
+ .ok_or_else(|| {
1357
+ MagnusError::new(
1358
+ magnus::exception::type_error(),
1359
+ "Failed to coerce into StructBuilder",
1360
+ )
1361
+ })?
1362
+ .append_null(),
1363
+ },
774
1364
  }
1365
+ } else {
1366
+ return Err(MagnusError::new(
1367
+ magnus::exception::type_error(),
1368
+ format!("Field {} not found in map", i),
1369
+ ));
775
1370
  }
776
1371
  }
1372
+ typed_builder.append(true);
777
1373
  }
778
- ParquetValue::Null => list_builder.append_null(),
779
- _ => {
1374
+ other => {
780
1375
  return Err(MagnusError::new(
781
1376
  magnus::exception::type_error(),
782
- format!("Expected List, got {:?}", value),
783
- ))
1377
+ format!("Expected ParquetValue::Map(...) or Null, got {:?}", other),
1378
+ ));
784
1379
  }
785
1380
  }
786
1381
  }
787
- Ok(Arc::new(list_builder.finish()))
788
- }
789
- ParquetSchemaType::Map(_map_field) => {
790
- unimplemented!("Writing maps is not yet supported")
1382
+ Ok(())
791
1383
  }
792
1384
  }
793
1385
  }
794
1386
 
1387
+ /// Creates a final Arrow array from a list of ParquetValues and a schema type.
1388
+ /// This is your "unified" way to handle any nesting level.
1389
+ pub fn convert_parquet_values_to_arrow(
1390
+ values: Vec<ParquetValue>,
1391
+ type_: &ParquetSchemaType,
1392
+ ) -> Result<Arc<dyn Array>, ReaderError> {
1393
+ // Make sure we always have at least capacity 1 to avoid empty builders
1394
+ let capacity = if values.is_empty() { 1 } else { values.len() };
1395
+ let mut builder = create_arrow_builder_for_type(type_, Some(capacity))?;
1396
+
1397
+ fill_builder(&mut builder, type_, &values)?;
1398
+
1399
+ // Finish building the array
1400
+ let array = builder.finish();
1401
+
1402
+ Ok(Arc::new(array))
1403
+ }
1404
+
795
1405
  pub fn convert_ruby_array_to_arrow(
796
1406
  values: RArray,
797
1407
  type_: &ParquetSchemaType,
798
- ) -> Result<Arc<dyn Array>, MagnusError> {
1408
+ ) -> Result<Arc<dyn Array>, ReaderError> {
799
1409
  let mut parquet_values = Vec::with_capacity(values.len());
800
1410
  for value in values {
801
1411
  if value.is_nil() {
802
1412
  parquet_values.push(ParquetValue::Null);
803
1413
  continue;
804
1414
  }
805
- let parquet_value = ParquetValue::from_value(value, type_)?;
1415
+ let parquet_value = ParquetValue::from_value(value, type_, None)?;
806
1416
  parquet_values.push(parquet_value);
807
1417
  }
808
1418
  convert_parquet_values_to_arrow(parquet_values, type_)