parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,1243 @@
1
+ //! Bidirectional conversion between Arrow arrays and ParquetValue
2
+ //!
3
+ //! This module provides a unified interface for converting between Arrow's
4
+ //! columnar format and Parquet's value representation. It consolidates
5
+ //! the conversion logic that was previously duplicated between the reader
6
+ //! and writer modules.
7
+
8
+ use crate::{ParquetError, ParquetValue, Result};
9
+ use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
10
+ use arrow_schema::extension::Uuid as ArrowUuid;
11
+ use arrow_schema::{DataType, Field};
12
+ use bytes::Bytes;
13
+ use indexmap::IndexMap;
14
+ use ordered_float::OrderedFloat;
15
+ use parquet::basic::LogicalType;
16
+ use parquet::schema::types::Type;
17
+ use std::sync::Arc as StdArc;
18
+ use triomphe::Arc;
19
+
20
+ /// Convert a single value from an Arrow array at the given index to a ParquetValue
21
+ pub fn arrow_to_parquet_value(
22
+ arrow_field: &Field,
23
+ parquet_field: &Type,
24
+ array: &dyn Array,
25
+ index: usize,
26
+ ) -> Result<ParquetValue> {
27
+ use arrow_array::*;
28
+
29
+ if array.is_null(index) {
30
+ return Ok(ParquetValue::Null);
31
+ }
32
+
33
+ match array.data_type() {
34
+ // Primitive types
35
+ DataType::Boolean => {
36
+ let array = downcast_array::<BooleanArray>(array)?;
37
+ Ok(ParquetValue::Boolean(array.value(index)))
38
+ }
39
+ DataType::Int8 => {
40
+ let array = downcast_array::<Int8Array>(array)?;
41
+ Ok(ParquetValue::Int8(array.value(index)))
42
+ }
43
+ DataType::Int16 => {
44
+ let array = downcast_array::<Int16Array>(array)?;
45
+ Ok(ParquetValue::Int16(array.value(index)))
46
+ }
47
+ DataType::Int32 => {
48
+ let array = downcast_array::<Int32Array>(array)?;
49
+ Ok(ParquetValue::Int32(array.value(index)))
50
+ }
51
+ DataType::Int64 => {
52
+ let array = downcast_array::<Int64Array>(array)?;
53
+ Ok(ParquetValue::Int64(array.value(index)))
54
+ }
55
+ DataType::UInt8 => {
56
+ let array = downcast_array::<UInt8Array>(array)?;
57
+ Ok(ParquetValue::UInt8(array.value(index)))
58
+ }
59
+ DataType::UInt16 => {
60
+ let array = downcast_array::<UInt16Array>(array)?;
61
+ Ok(ParquetValue::UInt16(array.value(index)))
62
+ }
63
+ DataType::UInt32 => {
64
+ let array = downcast_array::<UInt32Array>(array)?;
65
+ Ok(ParquetValue::UInt32(array.value(index)))
66
+ }
67
+ DataType::UInt64 => {
68
+ let array = downcast_array::<UInt64Array>(array)?;
69
+ Ok(ParquetValue::UInt64(array.value(index)))
70
+ }
71
+ DataType::Float16 => {
72
+ let array = downcast_array::<Float16Array>(array)?;
73
+ let value = array.value(index);
74
+ Ok(ParquetValue::Float16(OrderedFloat(value.to_f32())))
75
+ }
76
+ DataType::Float32 => {
77
+ let array = downcast_array::<Float32Array>(array)?;
78
+ Ok(ParquetValue::Float32(OrderedFloat(array.value(index))))
79
+ }
80
+ DataType::Float64 => {
81
+ let array = downcast_array::<Float64Array>(array)?;
82
+ Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
83
+ }
84
+ // String and binary types
85
+ DataType::Utf8 => {
86
+ let array = downcast_array::<StringArray>(array)?;
87
+ Ok(ParquetValue::String(Arc::from(array.value(index))))
88
+ }
89
+ DataType::Binary => {
90
+ let array = downcast_array::<BinaryArray>(array)?;
91
+ Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
92
+ array.value(index),
93
+ )))
94
+ }
95
+ DataType::FixedSizeBinary(_) => {
96
+ let array = downcast_array::<FixedSizeBinaryArray>(array)?;
97
+ let value = array.value(index);
98
+ if let Some(LogicalType::Uuid) = parquet_field.get_basic_info().logical_type_ref() {
99
+ let uuid = uuid::Uuid::from_slice(value)
100
+ .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
101
+ Ok(ParquetValue::Uuid(uuid))
102
+ } else {
103
+ match arrow_field.try_extension_type::<ArrowUuid>() {
104
+ Ok(_) => {
105
+ let uuid = uuid::Uuid::from_slice(value).map_err(|e| {
106
+ ParquetError::Conversion(format!("Invalid UUID: {}", e))
107
+ })?;
108
+ Ok(ParquetValue::Uuid(uuid))
109
+ }
110
+ Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
111
+ }
112
+ }
113
+ }
114
+
115
+ // Date and time types
116
+ DataType::Date32 => {
117
+ let array = downcast_array::<Date32Array>(array)?;
118
+ Ok(ParquetValue::Date32(array.value(index)))
119
+ }
120
+ DataType::Date64 => {
121
+ let array = downcast_array::<Date64Array>(array)?;
122
+ Ok(ParquetValue::Date64(array.value(index)))
123
+ }
124
+
125
+ // Timestamp types
126
+ DataType::Timestamp(unit, timezone) => {
127
+ let timezone = timezone.as_ref().map(|s| Arc::from(s.as_ref()));
128
+ match unit {
129
+ arrow_schema::TimeUnit::Millisecond => {
130
+ let array = downcast_array::<TimestampMillisecondArray>(array)?;
131
+ Ok(ParquetValue::TimestampMillis(array.value(index), timezone))
132
+ }
133
+ arrow_schema::TimeUnit::Microsecond => {
134
+ let array = downcast_array::<TimestampMicrosecondArray>(array)?;
135
+ Ok(ParquetValue::TimestampMicros(array.value(index), timezone))
136
+ }
137
+ arrow_schema::TimeUnit::Second => {
138
+ let array = downcast_array::<TimestampSecondArray>(array)?;
139
+ Ok(ParquetValue::TimestampSecond(array.value(index), timezone))
140
+ }
141
+ arrow_schema::TimeUnit::Nanosecond => {
142
+ let array = downcast_array::<TimestampNanosecondArray>(array)?;
143
+ Ok(ParquetValue::TimestampNanos(array.value(index), timezone))
144
+ }
145
+ }
146
+ }
147
+
148
+ // Time types
149
+ DataType::Time32(unit) => match unit {
150
+ arrow_schema::TimeUnit::Millisecond => {
151
+ let array = downcast_array::<Time32MillisecondArray>(array)?;
152
+ Ok(ParquetValue::TimeMillis(array.value(index)))
153
+ }
154
+ _ => Err(ParquetError::Conversion(format!(
155
+ "Unsupported time32 unit: {:?}",
156
+ unit
157
+ ))),
158
+ },
159
+ DataType::Time64(unit) => match unit {
160
+ arrow_schema::TimeUnit::Microsecond => {
161
+ let array = downcast_array::<Time64MicrosecondArray>(array)?;
162
+ Ok(ParquetValue::TimeMicros(array.value(index)))
163
+ }
164
+ arrow_schema::TimeUnit::Nanosecond => {
165
+ let array = downcast_array::<Time64NanosecondArray>(array)?;
166
+ Ok(ParquetValue::TimeNanos(array.value(index)))
167
+ }
168
+ _ => Err(ParquetError::Conversion(format!(
169
+ "Unsupported time64 unit: {:?}",
170
+ unit
171
+ ))),
172
+ },
173
+
174
+ // Decimal types
175
+ DataType::Decimal128(_precision, scale) => {
176
+ let array = downcast_array::<Decimal128Array>(array)?;
177
+ let value = array.value(index);
178
+ Ok(ParquetValue::Decimal128(value, *scale))
179
+ }
180
+ DataType::Decimal256(_precision, scale) => {
181
+ let array = downcast_array::<Decimal256Array>(array)?;
182
+ let bytes = array.value(index).to_le_bytes();
183
+
184
+ // Convert to BigInt
185
+ let bigint = if bytes[31] & 0x80 != 0 {
186
+ // Negative number - convert from two's complement
187
+ let mut inverted = [0u8; 32];
188
+ for (i, &b) in bytes.iter().enumerate() {
189
+ inverted[i] = !b;
190
+ }
191
+ let positive = num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &inverted);
192
+ -(positive + num::BigInt::from(1))
193
+ } else {
194
+ num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &bytes)
195
+ };
196
+
197
+ Ok(ParquetValue::Decimal256(bigint, *scale))
198
+ }
199
+
200
+ // Complex types
201
+ DataType::List(item_field) => {
202
+ let array = downcast_array::<ListArray>(array)?;
203
+ let list_values = array.value(index);
204
+
205
+ let mut values = Vec::with_capacity(list_values.len());
206
+
207
+ // Get the list's element type from parquet schema
208
+ let element_type = match parquet_field {
209
+ parquet::schema::types::Type::GroupType { fields, .. } => {
210
+ // List has a repeated group containing the element
211
+ // The structure is: LIST -> repeated group -> element
212
+ if let Some(repeated_group) = fields.first() {
213
+ match repeated_group.as_ref() {
214
+ parquet::schema::types::Type::GroupType {
215
+ fields: inner_fields,
216
+ ..
217
+ } => {
218
+ // This is the repeated group, get the actual element
219
+ inner_fields.first().ok_or_else(|| {
220
+ ParquetError::Conversion(
221
+ "List repeated group missing element field".to_string(),
222
+ )
223
+ })?
224
+ }
225
+ _ => repeated_group, // If it's not a group, use it directly
226
+ }
227
+ } else {
228
+ return Err(ParquetError::Conversion(
229
+ "List type missing fields".to_string(),
230
+ ));
231
+ }
232
+ }
233
+ _ => parquet_field, // Fallback for cases where it's not a proper list structure
234
+ };
235
+
236
+ for i in 0..list_values.len() {
237
+ values.push(arrow_to_parquet_value(
238
+ item_field,
239
+ element_type,
240
+ &list_values,
241
+ i,
242
+ )?);
243
+ }
244
+
245
+ Ok(ParquetValue::List(values))
246
+ }
247
+ DataType::Map(_, _) => {
248
+ let array = downcast_array::<MapArray>(array)?;
249
+ let map_value = array.value(index);
250
+
251
+ // The Arrow `MapArray` entries struct is always (key, value) by
252
+ // position — `MapArray::keys()`/`values()` are `column(0)`/`column(1)`
253
+ // and `try_new` enforces exactly two columns — so we index by position
254
+ // and never depend on the entry field names (which the Parquet spec
255
+ // does not fix).
256
+ debug_assert_eq!(map_value.num_columns(), 2);
257
+ let keys = map_value.column(0);
258
+ let values = map_value.column(1);
259
+
260
+ let key_field = map_value
261
+ .fields()
262
+ .get(0)
263
+ .ok_or_else(|| ParquetError::Conversion("No key field found".to_string()))?;
264
+
265
+ let value_field = map_value
266
+ .fields()
267
+ .get(1)
268
+ .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
269
+
270
+ let mut map_vec = Vec::with_capacity(keys.len());
271
+
272
+ // Get key and value types from parquet schema
273
+ // Map structure is: MAP -> key_value (repeated group) -> key, value
274
+ let (key_type, value_type) = match parquet_field {
275
+ parquet::schema::types::Type::GroupType { fields, .. } => {
276
+ // Get the key_value repeated group
277
+ match fields.first() {
278
+ Some(key_value_group) => match key_value_group.as_ref() {
279
+ parquet::schema::types::Type::GroupType {
280
+ fields: kv_fields, ..
281
+ } => {
282
+ let key_field = kv_fields.first().ok_or_else(|| {
283
+ ParquetError::Conversion("Map missing key field".to_string())
284
+ })?;
285
+ let value_field = kv_fields.get(1).ok_or_else(|| {
286
+ ParquetError::Conversion("Map missing value field".to_string())
287
+ })?;
288
+ (key_field.as_ref(), value_field.as_ref())
289
+ }
290
+ _ => {
291
+ return Err(ParquetError::Conversion(
292
+ "Map key_value should be a group".to_string(),
293
+ ))
294
+ }
295
+ },
296
+ None => {
297
+ return Err(ParquetError::Conversion(
298
+ "Map type missing key_value field".to_string(),
299
+ ))
300
+ }
301
+ }
302
+ }
303
+ _ => {
304
+ return Err(ParquetError::Conversion(
305
+ "Map type must be a group".to_string(),
306
+ ))
307
+ }
308
+ };
309
+
310
+ for i in 0..keys.len() {
311
+ let key = arrow_to_parquet_value(key_field, key_type, keys, i)?;
312
+ let value = arrow_to_parquet_value(value_field, value_type, values, i)?;
313
+ map_vec.push((key, value));
314
+ }
315
+
316
+ Ok(ParquetValue::Map(map_vec))
317
+ }
318
+ DataType::Struct(_) => {
319
+ let array = downcast_array::<StructArray>(array)?;
320
+
321
+ let mut map = IndexMap::new();
322
+
323
+ // Get struct fields from parquet schema
324
+ let parquet_fields = match parquet_field {
325
+ parquet::schema::types::Type::GroupType { fields, .. } => fields,
326
+ _ => {
327
+ return Err(ParquetError::Conversion(
328
+ "Struct type must be a group".to_string(),
329
+ ))
330
+ }
331
+ };
332
+
333
+ for (col_idx, arrow_field) in array.fields().iter().enumerate() {
334
+ let column = array.column(col_idx);
335
+
336
+ // Find matching parquet field by name
337
+ let nested_parquet_field = parquet_fields
338
+ .iter()
339
+ .find(|f| f.name() == arrow_field.name())
340
+ .ok_or_else(|| {
341
+ ParquetError::Conversion(format!(
342
+ "No matching parquet field for struct field '{}'",
343
+ arrow_field.name()
344
+ ))
345
+ })?;
346
+
347
+ let value =
348
+ arrow_to_parquet_value(arrow_field, nested_parquet_field, column, index)?;
349
+ map.insert(Arc::from(arrow_field.name().as_str()), value);
350
+ }
351
+
352
+ Ok(ParquetValue::Record(map))
353
+ }
354
+
355
+ dt => Err(ParquetError::Conversion(format!(
356
+ "Unsupported data type for conversion: {:?}",
357
+ dt
358
+ ))),
359
+ }
360
+ }
361
+
362
+ /// Convert a slice of ParquetValues to an Arrow array
363
+ pub fn parquet_values_to_arrow_array(values: &[ParquetValue], field: &Field) -> Result<ArrayRef> {
364
+ let value_refs = values.iter().collect::<Vec<_>>();
365
+ parquet_value_refs_to_arrow_array(&value_refs, field)
366
+ }
367
+
368
+ fn parquet_value_refs_to_arrow_array(values: &[&ParquetValue], field: &Field) -> Result<ArrayRef> {
369
+ match field.data_type() {
370
+ // Boolean
371
+ DataType::Boolean => {
372
+ let mut builder = BooleanBuilder::with_capacity(values.len());
373
+ for value in values {
374
+ match *value {
375
+ ParquetValue::Boolean(b) => builder.append_value(*b),
376
+ ParquetValue::Null => builder.append_null(),
377
+ _ => {
378
+ return Err(ParquetError::Conversion(format!(
379
+ "Expected Boolean, got {:?}",
380
+ value.type_name()
381
+ )))
382
+ }
383
+ }
384
+ }
385
+ Ok(StdArc::new(builder.finish()))
386
+ }
387
+
388
+ // Integer types with automatic upcasting
389
+ DataType::Int8 => build_int8_array(values),
390
+ DataType::Int16 => build_int16_array(values),
391
+ DataType::Int32 => build_int32_array(values),
392
+ DataType::Int64 => build_int64_array(values),
393
+ DataType::UInt8 => build_uint8_array(values),
394
+ DataType::UInt16 => build_uint16_array(values),
395
+ DataType::UInt32 => build_uint32_array(values),
396
+ DataType::UInt64 => build_uint64_array(values),
397
+
398
+ // Float types
399
+ DataType::Float32 => build_float32_array(values),
400
+ DataType::Float64 => build_float64_array(values),
401
+
402
+ // String and binary
403
+ DataType::Utf8 => build_string_array(values),
404
+ DataType::Binary => build_binary_array(values),
405
+ DataType::FixedSizeBinary(size) => build_fixed_binary_array(values, *size),
406
+
407
+ // Date and time
408
+ DataType::Date32 => build_date32_array(values),
409
+ DataType::Date64 => build_date64_array(values),
410
+ DataType::Time32(unit) => build_time32_array(values, unit),
411
+ DataType::Time64(unit) => build_time64_array(values, unit),
412
+
413
+ // Timestamp
414
+ DataType::Timestamp(unit, tz) => build_timestamp_array(values, unit, tz.as_deref()),
415
+
416
+ // Decimal
417
+ DataType::Decimal128(precision, scale) => {
418
+ build_decimal128_array(values, *precision, *scale)
419
+ }
420
+ DataType::Decimal256(precision, scale) => {
421
+ build_decimal256_array(values, *precision, *scale)
422
+ }
423
+
424
+ // Complex types
425
+ DataType::List(item_field) => build_list_array(values, item_field),
426
+ DataType::Map(entries_field, sorted) => build_map_array(values, entries_field, *sorted),
427
+ DataType::Struct(fields) => build_struct_array(values, fields),
428
+
429
+ dt => Err(ParquetError::Conversion(format!(
430
+ "Unsupported data type for conversion: {:?}",
431
+ dt
432
+ ))),
433
+ }
434
+ }
435
+
436
+ /// Helper function to downcast an array with better error messages
437
+ fn downcast_array<T: 'static>(array: &dyn Array) -> Result<&T> {
438
+ array.as_any().downcast_ref::<T>().ok_or_else(|| {
439
+ ParquetError::Conversion(format!("Failed to cast to {}", std::any::type_name::<T>()))
440
+ })
441
+ }
442
+
443
+ /// Build Int8 array
444
+ fn build_int8_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
445
+ let mut builder = Int8Builder::with_capacity(values.len());
446
+ for value in values {
447
+ match *value {
448
+ ParquetValue::Int8(i) => builder.append_value(*i),
449
+ ParquetValue::Null => builder.append_null(),
450
+ _ => {
451
+ return Err(ParquetError::Conversion(format!(
452
+ "Expected Int8, got {:?}",
453
+ value.type_name()
454
+ )))
455
+ }
456
+ }
457
+ }
458
+ Ok(StdArc::new(builder.finish()))
459
+ }
460
+
461
+ /// Build Int16 array
462
+ fn build_int16_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
463
+ let mut builder = Int16Builder::with_capacity(values.len());
464
+ for value in values {
465
+ match *value {
466
+ ParquetValue::Int16(i) => builder.append_value(*i),
467
+ ParquetValue::Int8(i) => builder.append_value(*i as i16),
468
+ ParquetValue::Null => builder.append_null(),
469
+ _ => {
470
+ return Err(ParquetError::Conversion(format!(
471
+ "Expected Int16, got {:?}",
472
+ value.type_name()
473
+ )))
474
+ }
475
+ }
476
+ }
477
+ Ok(StdArc::new(builder.finish()))
478
+ }
479
+
480
+ /// Build Int32 array
481
+ fn build_int32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
482
+ let mut builder = Int32Builder::with_capacity(values.len());
483
+ for value in values {
484
+ match *value {
485
+ ParquetValue::Int32(i) => builder.append_value(*i),
486
+ ParquetValue::Int16(i) => builder.append_value(*i as i32),
487
+ ParquetValue::Int8(i) => builder.append_value(*i as i32),
488
+ ParquetValue::Null => builder.append_null(),
489
+ _ => {
490
+ return Err(ParquetError::Conversion(format!(
491
+ "Expected Int32, got {:?}",
492
+ value.type_name()
493
+ )))
494
+ }
495
+ }
496
+ }
497
+ Ok(StdArc::new(builder.finish()))
498
+ }
499
+
500
+ /// Build Int64 array
501
+ fn build_int64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
502
+ let mut builder = Int64Builder::with_capacity(values.len());
503
+ for value in values {
504
+ match *value {
505
+ ParquetValue::Int64(i) => builder.append_value(*i),
506
+ ParquetValue::Int32(i) => builder.append_value(*i as i64),
507
+ ParquetValue::Int16(i) => builder.append_value(*i as i64),
508
+ ParquetValue::Int8(i) => builder.append_value(*i as i64),
509
+ ParquetValue::Null => builder.append_null(),
510
+ _ => {
511
+ return Err(ParquetError::Conversion(format!(
512
+ "Expected Int64, got {:?}",
513
+ value.type_name()
514
+ )))
515
+ }
516
+ }
517
+ }
518
+ Ok(StdArc::new(builder.finish()))
519
+ }
520
+
521
+ /// Build UInt8 array
522
+ fn build_uint8_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
523
+ let mut builder = UInt8Builder::with_capacity(values.len());
524
+ for value in values {
525
+ match *value {
526
+ ParquetValue::UInt8(i) => builder.append_value(*i),
527
+ ParquetValue::Null => builder.append_null(),
528
+ _ => {
529
+ return Err(ParquetError::Conversion(format!(
530
+ "Expected UInt8, got {:?}",
531
+ value.type_name()
532
+ )))
533
+ }
534
+ }
535
+ }
536
+ Ok(StdArc::new(builder.finish()))
537
+ }
538
+
539
+ /// Build UInt16 array
540
+ fn build_uint16_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
541
+ let mut builder = UInt16Builder::with_capacity(values.len());
542
+ for value in values {
543
+ match *value {
544
+ ParquetValue::UInt16(i) => builder.append_value(*i),
545
+ ParquetValue::UInt8(i) => builder.append_value(*i as u16),
546
+ ParquetValue::Null => builder.append_null(),
547
+ _ => {
548
+ return Err(ParquetError::Conversion(format!(
549
+ "Expected UInt16, got {:?}",
550
+ value.type_name()
551
+ )))
552
+ }
553
+ }
554
+ }
555
+ Ok(StdArc::new(builder.finish()))
556
+ }
557
+
558
+ /// Build UInt32 array
559
+ fn build_uint32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
560
+ let mut builder = UInt32Builder::with_capacity(values.len());
561
+ for value in values {
562
+ match *value {
563
+ ParquetValue::UInt32(i) => builder.append_value(*i),
564
+ ParquetValue::UInt16(i) => builder.append_value(*i as u32),
565
+ ParquetValue::UInt8(i) => builder.append_value(*i as u32),
566
+ ParquetValue::Null => builder.append_null(),
567
+ _ => {
568
+ return Err(ParquetError::Conversion(format!(
569
+ "Expected UInt32, got {:?}",
570
+ value.type_name()
571
+ )))
572
+ }
573
+ }
574
+ }
575
+ Ok(StdArc::new(builder.finish()))
576
+ }
577
+
578
+ /// Build UInt64 array
579
+ fn build_uint64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
580
+ let mut builder = UInt64Builder::with_capacity(values.len());
581
+ for value in values {
582
+ match *value {
583
+ ParquetValue::UInt64(i) => builder.append_value(*i),
584
+ ParquetValue::UInt32(i) => builder.append_value(*i as u64),
585
+ ParquetValue::UInt16(i) => builder.append_value(*i as u64),
586
+ ParquetValue::UInt8(i) => builder.append_value(*i as u64),
587
+ ParquetValue::Null => builder.append_null(),
588
+ _ => {
589
+ return Err(ParquetError::Conversion(format!(
590
+ "Expected UInt64, got {:?}",
591
+ value.type_name()
592
+ )))
593
+ }
594
+ }
595
+ }
596
+ Ok(StdArc::new(builder.finish()))
597
+ }
598
+
599
+ /// Build Float32 array with Float16 support
600
+ fn build_float32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
601
+ let mut builder = Float32Builder::with_capacity(values.len());
602
+ for value in values {
603
+ match *value {
604
+ ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(*f),
605
+ ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(*f),
606
+ ParquetValue::Null => builder.append_null(),
607
+ _ => {
608
+ return Err(ParquetError::Conversion(format!(
609
+ "Expected Float32, got {:?}",
610
+ value.type_name()
611
+ )))
612
+ }
613
+ }
614
+ }
615
+ Ok(StdArc::new(builder.finish()))
616
+ }
617
+
618
+ /// Build Float64 array with Float32 and Float16 support
619
+ fn build_float64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
620
+ let mut builder = Float64Builder::with_capacity(values.len());
621
+ for value in values {
622
+ match *value {
623
+ ParquetValue::Float64(OrderedFloat(f)) => builder.append_value(*f),
624
+ ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(*f as f64),
625
+ ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(*f as f64),
626
+ ParquetValue::Null => builder.append_null(),
627
+ _ => {
628
+ return Err(ParquetError::Conversion(format!(
629
+ "Expected Float64, got {:?}",
630
+ value.type_name()
631
+ )))
632
+ }
633
+ }
634
+ }
635
+ Ok(StdArc::new(builder.finish()))
636
+ }
637
+
638
+ /// Build string array
639
+ fn build_string_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
640
+ // Pre-size the data buffer exactly: growing it by doubling would
641
+ // transiently hold up to 3x the payload during the final realloc.
642
+ let data_capacity: usize = values
643
+ .iter()
644
+ .map(|value| match value {
645
+ ParquetValue::String(s) => s.len(),
646
+ _ => 0,
647
+ })
648
+ .sum();
649
+ let mut builder = StringBuilder::with_capacity(values.len(), data_capacity);
650
+ for value in values {
651
+ match *value {
652
+ ParquetValue::String(s) => builder.append_value(s.as_ref()),
653
+ ParquetValue::Null => builder.append_null(),
654
+ _ => {
655
+ return Err(ParquetError::Conversion(format!(
656
+ "Expected String, got {:?}",
657
+ value.type_name()
658
+ )))
659
+ }
660
+ }
661
+ }
662
+ Ok(StdArc::new(builder.finish()))
663
+ }
664
+
665
+ /// Build binary array
666
+ fn build_binary_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
667
+ // Pre-size the data buffer exactly, as in build_string_array.
668
+ let data_capacity: usize = values
669
+ .iter()
670
+ .map(|value| match value {
671
+ ParquetValue::Bytes(b) => b.len(),
672
+ _ => 0,
673
+ })
674
+ .sum();
675
+ let mut builder = BinaryBuilder::with_capacity(values.len(), data_capacity);
676
+ for value in values {
677
+ match *value {
678
+ ParquetValue::Bytes(b) => builder.append_value(b.as_ref()),
679
+ ParquetValue::Null => builder.append_null(),
680
+ _ => {
681
+ return Err(ParquetError::Conversion(format!(
682
+ "Expected Bytes, got {:?}",
683
+ value.type_name()
684
+ )))
685
+ }
686
+ }
687
+ }
688
+ Ok(StdArc::new(builder.finish()))
689
+ }
690
+
691
+ /// Build fixed size binary array
692
+ fn build_fixed_binary_array(values: &[&ParquetValue], size: i32) -> Result<ArrayRef> {
693
+ let mut builder = FixedSizeBinaryBuilder::with_capacity(values.len(), size);
694
+ for value in values {
695
+ match *value {
696
+ ParquetValue::Bytes(b) => {
697
+ if b.len() != size as usize {
698
+ return Err(ParquetError::Conversion(format!(
699
+ "Fixed size binary expected {} bytes, got {}",
700
+ size,
701
+ b.len()
702
+ )));
703
+ }
704
+ builder.append_value(b.as_ref())?;
705
+ }
706
+ ParquetValue::Null => builder.append_null(),
707
+ _ => {
708
+ return Err(ParquetError::Conversion(format!(
709
+ "Expected Bytes, got {:?}",
710
+ value.type_name()
711
+ )))
712
+ }
713
+ }
714
+ }
715
+ Ok(StdArc::new(builder.finish()))
716
+ }
717
+
718
+ /// Build Date32 array
719
+ fn build_date32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
720
+ let mut builder = Date32Builder::with_capacity(values.len());
721
+ for value in values {
722
+ match *value {
723
+ ParquetValue::Date32(d) => builder.append_value(*d),
724
+ ParquetValue::Null => builder.append_null(),
725
+ _ => {
726
+ return Err(ParquetError::Conversion(format!(
727
+ "Expected Date32, got {:?}",
728
+ value.type_name()
729
+ )))
730
+ }
731
+ }
732
+ }
733
+ Ok(StdArc::new(builder.finish()))
734
+ }
735
+
736
+ /// Build Date64 array
737
+ fn build_date64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
738
+ let mut builder = Date64Builder::with_capacity(values.len());
739
+ for value in values {
740
+ match *value {
741
+ ParquetValue::Date64(d) => builder.append_value(*d),
742
+ ParquetValue::Null => builder.append_null(),
743
+ _ => {
744
+ return Err(ParquetError::Conversion(format!(
745
+ "Expected Date64, got {:?}",
746
+ value.type_name()
747
+ )))
748
+ }
749
+ }
750
+ }
751
+ Ok(StdArc::new(builder.finish()))
752
+ }
753
+
754
+ /// Build Time32 array
755
+ fn build_time32_array(values: &[&ParquetValue], unit: &arrow_schema::TimeUnit) -> Result<ArrayRef> {
756
+ match unit {
757
+ arrow_schema::TimeUnit::Millisecond => {
758
+ let mut builder = Time32MillisecondBuilder::with_capacity(values.len());
759
+ for value in values {
760
+ match *value {
761
+ ParquetValue::TimeMillis(t) => builder.append_value(*t),
762
+ ParquetValue::Null => builder.append_null(),
763
+ _ => {
764
+ return Err(ParquetError::Conversion(format!(
765
+ "Expected TimeMillis, got {:?}",
766
+ value.type_name()
767
+ )))
768
+ }
769
+ }
770
+ }
771
+ Ok(StdArc::new(builder.finish()))
772
+ }
773
+ _ => Err(ParquetError::Conversion(format!(
774
+ "Unsupported time32 unit: {:?}",
775
+ unit
776
+ ))),
777
+ }
778
+ }
779
+
780
+ /// Build Time64 array
781
+ fn build_time64_array(values: &[&ParquetValue], unit: &arrow_schema::TimeUnit) -> Result<ArrayRef> {
782
+ match unit {
783
+ arrow_schema::TimeUnit::Microsecond => {
784
+ let mut builder = Time64MicrosecondBuilder::with_capacity(values.len());
785
+ for value in values {
786
+ match *value {
787
+ ParquetValue::TimeMicros(t) => builder.append_value(*t),
788
+ ParquetValue::Null => builder.append_null(),
789
+ _ => {
790
+ return Err(ParquetError::Conversion(format!(
791
+ "Expected TimeMicros, got {:?}",
792
+ value.type_name()
793
+ )))
794
+ }
795
+ }
796
+ }
797
+ Ok(StdArc::new(builder.finish()))
798
+ }
799
+ arrow_schema::TimeUnit::Nanosecond => {
800
+ let mut builder = Time64NanosecondBuilder::with_capacity(values.len());
801
+ for value in values {
802
+ match *value {
803
+ ParquetValue::TimeNanos(t) => builder.append_value(*t),
804
+ ParquetValue::Null => builder.append_null(),
805
+ _ => {
806
+ return Err(ParquetError::Conversion(format!(
807
+ "Expected TimeNanos, got {:?}",
808
+ value.type_name()
809
+ )))
810
+ }
811
+ }
812
+ }
813
+ Ok(StdArc::new(builder.finish()))
814
+ }
815
+ _ => Err(ParquetError::Conversion(format!(
816
+ "Unsupported time64 unit: {:?}",
817
+ unit
818
+ ))),
819
+ }
820
+ }
821
+
822
+ /// Build timestamp array
823
+ fn build_timestamp_array(
824
+ values: &[&ParquetValue],
825
+ unit: &arrow_schema::TimeUnit,
826
+ timezone: Option<&str>,
827
+ ) -> Result<ArrayRef> {
828
+ let tz = timezone.map(StdArc::from);
829
+
830
+ match unit {
831
+ arrow_schema::TimeUnit::Second => {
832
+ let mut builder =
833
+ TimestampSecondBuilder::with_capacity(values.len()).with_timezone_opt(tz.clone());
834
+ for value in values {
835
+ match *value {
836
+ ParquetValue::TimestampSecond(t, _) => builder.append_value(*t),
837
+ ParquetValue::Null => builder.append_null(),
838
+ _ => {
839
+ return Err(ParquetError::Conversion(format!(
840
+ "Expected TimestampSecond, got {:?}",
841
+ value.type_name()
842
+ )))
843
+ }
844
+ }
845
+ }
846
+ Ok(StdArc::new(builder.finish()))
847
+ }
848
+ arrow_schema::TimeUnit::Millisecond => {
849
+ let mut builder = TimestampMillisecondBuilder::with_capacity(values.len())
850
+ .with_timezone_opt(tz.clone());
851
+ for value in values {
852
+ match *value {
853
+ ParquetValue::TimestampMillis(t, _) => builder.append_value(*t),
854
+ ParquetValue::Null => builder.append_null(),
855
+ _ => {
856
+ return Err(ParquetError::Conversion(format!(
857
+ "Expected TimestampMillis, got {:?}",
858
+ value.type_name()
859
+ )))
860
+ }
861
+ }
862
+ }
863
+ Ok(StdArc::new(builder.finish()))
864
+ }
865
+ arrow_schema::TimeUnit::Microsecond => {
866
+ let mut builder = TimestampMicrosecondBuilder::with_capacity(values.len())
867
+ .with_timezone_opt(tz.clone());
868
+ for value in values {
869
+ match *value {
870
+ ParquetValue::TimestampMicros(t, _) => builder.append_value(*t),
871
+ ParquetValue::Null => builder.append_null(),
872
+ _ => {
873
+ return Err(ParquetError::Conversion(format!(
874
+ "Expected TimestampMicros, got {:?}",
875
+ value.type_name()
876
+ )))
877
+ }
878
+ }
879
+ }
880
+ Ok(StdArc::new(builder.finish()))
881
+ }
882
+ arrow_schema::TimeUnit::Nanosecond => {
883
+ let mut builder = TimestampNanosecondBuilder::with_capacity(values.len())
884
+ .with_timezone_opt(tz.clone());
885
+ for value in values {
886
+ match *value {
887
+ ParquetValue::TimestampNanos(t, _) => builder.append_value(*t),
888
+ ParquetValue::Null => builder.append_null(),
889
+ _ => {
890
+ return Err(ParquetError::Conversion(format!(
891
+ "Expected TimestampNanos, got {:?}",
892
+ value.type_name()
893
+ )))
894
+ }
895
+ }
896
+ }
897
+ Ok(StdArc::new(builder.finish()))
898
+ }
899
+ }
900
+ }
901
+
902
+ /// Build Decimal128 array
903
+ fn build_decimal128_array(values: &[&ParquetValue], precision: u8, scale: i8) -> Result<ArrayRef> {
904
+ let mut builder = Decimal128Builder::with_capacity(values.len())
905
+ .with_precision_and_scale(precision, scale)?;
906
+ for (idx, value) in values.iter().enumerate() {
907
+ match *value {
908
+ ParquetValue::Decimal128(d, value_scale) => {
909
+ validate_decimal128_array_value(*d, *value_scale, precision, scale, idx)?;
910
+ builder.append_value(*d);
911
+ }
912
+ ParquetValue::Null => builder.append_null(),
913
+ _ => {
914
+ return Err(ParquetError::Conversion(format!(
915
+ "Expected Decimal128, got {:?}",
916
+ value.type_name()
917
+ )))
918
+ }
919
+ }
920
+ }
921
+ Ok(StdArc::new(builder.finish()))
922
+ }
923
+
924
+ /// Build Decimal256 array
925
+ fn build_decimal256_array(values: &[&ParquetValue], precision: u8, scale: i8) -> Result<ArrayRef> {
926
+ let mut builder = Decimal256Builder::with_capacity(values.len())
927
+ .with_precision_and_scale(precision, scale)?;
928
+ for (idx, value) in values.iter().enumerate() {
929
+ match *value {
930
+ ParquetValue::Decimal256(bigint, value_scale) => {
931
+ validate_decimal256_array_value(bigint, *value_scale, precision, scale, idx)?;
932
+ let bytes = decimal256_from_bigint(bigint)?;
933
+ builder.append_value(bytes);
934
+ }
935
+ ParquetValue::Null => builder.append_null(),
936
+ _ => {
937
+ return Err(ParquetError::Conversion(format!(
938
+ "Expected Decimal256, got {:?}",
939
+ value.type_name()
940
+ )))
941
+ }
942
+ }
943
+ }
944
+ Ok(StdArc::new(builder.finish()))
945
+ }
946
+
947
+ fn validate_decimal128_array_value(
948
+ value: i128,
949
+ value_scale: i8,
950
+ precision: u8,
951
+ scale: i8,
952
+ index: usize,
953
+ ) -> Result<()> {
954
+ if value_scale != scale {
955
+ return Err(ParquetError::Conversion(format!(
956
+ "Decimal scale mismatch at value[{}]: array scale {}, value scale {}",
957
+ index, scale, value_scale
958
+ )));
959
+ }
960
+
961
+ validate_decimal_array_precision(decimal128_digit_count(value), precision, index)
962
+ }
963
+
964
+ fn validate_decimal256_array_value(
965
+ value: &num::BigInt,
966
+ value_scale: i8,
967
+ precision: u8,
968
+ scale: i8,
969
+ index: usize,
970
+ ) -> Result<()> {
971
+ if value_scale != scale {
972
+ return Err(ParquetError::Conversion(format!(
973
+ "Decimal scale mismatch at value[{}]: array scale {}, value scale {}",
974
+ index, scale, value_scale
975
+ )));
976
+ }
977
+
978
+ validate_decimal_array_precision(decimal256_digit_count(value), precision, index)
979
+ }
980
+
981
+ fn validate_decimal_array_precision(
982
+ value_digits: usize,
983
+ precision: u8,
984
+ index: usize,
985
+ ) -> Result<()> {
986
+ if value_digits > precision as usize {
987
+ return Err(ParquetError::Conversion(format!(
988
+ "Decimal precision overflow at value[{}]: array precision {}, value has {} digits",
989
+ index, precision, value_digits
990
+ )));
991
+ }
992
+
993
+ Ok(())
994
+ }
995
+
996
/// Number of base-10 digits in the absolute value of `value`.
///
/// Zero counts as a single digit; the sign is not counted.
fn decimal128_digit_count(value: i128) -> usize {
    // `checked_ilog10` is `None` only for zero, which still prints as one digit.
    match value.unsigned_abs().checked_ilog10() {
        Some(exponent) => exponent as usize + 1,
        None => 1,
    }
}
999
+
1000
+ fn decimal256_digit_count(value: &num::BigInt) -> usize {
1001
+ value.to_str_radix(10).trim_start_matches('-').len()
1002
+ }
1003
+
1004
+ /// Convert BigInt to i256 (32-byte array)
1005
+ fn decimal256_from_bigint(bigint: &num::BigInt) -> Result<arrow_buffer::i256> {
1006
+ // Get bytes in little-endian format
1007
+ let (sign, mut bytes) = bigint.to_bytes_le();
1008
+
1009
+ // Ensure we have exactly 32 bytes
1010
+ if bytes.len() > 32 {
1011
+ return Err(ParquetError::Conversion(
1012
+ "Decimal256 value too large".to_string(),
1013
+ ));
1014
+ }
1015
+
1016
+ // Pad with zeros or ones (for negative numbers) to reach 32 bytes
1017
+ bytes.resize(32, 0);
1018
+
1019
+ // If negative, convert to two's complement
1020
+ if sign == num::bigint::Sign::Minus {
1021
+ // Invert all bits
1022
+ for byte in &mut bytes {
1023
+ *byte = !*byte;
1024
+ }
1025
+ // Add 1
1026
+ let mut carry = true;
1027
+ for byte in &mut bytes {
1028
+ if carry {
1029
+ let (new_byte, new_carry) = byte.overflowing_add(1);
1030
+ *byte = new_byte;
1031
+ carry = new_carry;
1032
+ } else {
1033
+ break;
1034
+ }
1035
+ }
1036
+ }
1037
+
1038
+ let byte_array: [u8; 32] = bytes
1039
+ .try_into()
1040
+ .map_err(|_| ParquetError::Conversion("Failed to convert bytes to i256".to_string()))?;
1041
+ Ok(arrow_buffer::i256::from_le_bytes(byte_array))
1042
+ }
1043
+
1044
+ /// Build list array
1045
+ fn build_list_array(values: &[&ParquetValue], item_field: &StdArc<Field>) -> Result<ArrayRef> {
1046
+ let mut all_items = Vec::new();
1047
+ let mut offsets = Vec::with_capacity(values.len() + 1);
1048
+ let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
1049
+ offsets.push(0i32);
1050
+
1051
+ for value in values {
1052
+ match *value {
1053
+ ParquetValue::List(items) => {
1054
+ all_items.extend(items.iter());
1055
+ offsets.push(all_items.len() as i32);
1056
+ null_buffer_builder.append(true);
1057
+ }
1058
+ ParquetValue::Null => {
1059
+ offsets.push(all_items.len() as i32);
1060
+ null_buffer_builder.append(false);
1061
+ }
1062
+ _ => {
1063
+ return Err(ParquetError::Conversion(format!(
1064
+ "Expected List, got {:?}",
1065
+ value.type_name()
1066
+ )))
1067
+ }
1068
+ }
1069
+ }
1070
+
1071
+ let item_array = parquet_value_refs_to_arrow_array(&all_items, item_field)?;
1072
+ let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
1073
+ let null_buffer = null_buffer_builder.finish();
1074
+
1075
+ Ok(StdArc::new(ListArray::new(
1076
+ item_field.clone(),
1077
+ offset_buffer,
1078
+ item_array,
1079
+ Some(null_buffer.into()),
1080
+ )))
1081
+ }
1082
+
1083
+ /// Build map array
1084
+ fn build_map_array(
1085
+ values: &[&ParquetValue],
1086
+ entries_field: &StdArc<Field>,
1087
+ _sorted: bool,
1088
+ ) -> Result<ArrayRef> {
1089
+ // Extract the key and value fields from the entries struct
1090
+ let (key_field, value_field) = match entries_field.data_type() {
1091
+ DataType::Struct(fields) if fields.len() == 2 => (&fields[0], &fields[1]),
1092
+ _ => {
1093
+ return Err(ParquetError::Conversion(
1094
+ "Map entries field must be a struct with exactly 2 fields".to_string(),
1095
+ ))
1096
+ }
1097
+ };
1098
+
1099
+ let mut all_keys = Vec::new();
1100
+ let mut all_values = Vec::new();
1101
+ let mut offsets = Vec::with_capacity(values.len() + 1);
1102
+ let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
1103
+ offsets.push(0i32);
1104
+
1105
+ for value in values {
1106
+ match *value {
1107
+ ParquetValue::Map(entries) => {
1108
+ for (k, v) in entries {
1109
+ all_keys.push(k);
1110
+ all_values.push(v);
1111
+ }
1112
+ offsets.push(all_keys.len() as i32);
1113
+ null_buffer_builder.append(true);
1114
+ }
1115
+ ParquetValue::Null => {
1116
+ offsets.push(all_keys.len() as i32);
1117
+ null_buffer_builder.append(false);
1118
+ }
1119
+ _ => {
1120
+ return Err(ParquetError::Conversion(format!(
1121
+ "Expected Map, got {:?}",
1122
+ value.type_name()
1123
+ )))
1124
+ }
1125
+ }
1126
+ }
1127
+
1128
+ let key_array = parquet_value_refs_to_arrow_array(&all_keys, key_field)?;
1129
+ let value_array = parquet_value_refs_to_arrow_array(&all_values, value_field)?;
1130
+
1131
+ // Create struct array for entries
1132
+ let struct_fields = match entries_field.data_type() {
1133
+ DataType::Struct(fields) => fields.clone(),
1134
+ _ => unreachable!("Map entries field must be a struct"),
1135
+ };
1136
+
1137
+ let struct_array = StructArray::new(struct_fields, vec![key_array, value_array], None);
1138
+
1139
+ let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
1140
+ let null_buffer = null_buffer_builder.finish();
1141
+
1142
+ Ok(StdArc::new(MapArray::new(
1143
+ entries_field.clone(),
1144
+ offset_buffer,
1145
+ struct_array,
1146
+ Some(null_buffer.into()),
1147
+ false, // sorted
1148
+ )))
1149
+ }
1150
+
1151
+ /// Build struct array
1152
+ fn build_struct_array(values: &[&ParquetValue], fields: &arrow_schema::Fields) -> Result<ArrayRef> {
1153
+ let num_rows = values.len();
1154
+ let mut field_arrays = Vec::with_capacity(fields.len());
1155
+ let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(num_rows);
1156
+ let null_value = ParquetValue::Null;
1157
+
1158
+ // Prepare columns for each field
1159
+ let mut field_columns: Vec<Vec<&ParquetValue>> =
1160
+ vec![Vec::with_capacity(num_rows); fields.len()];
1161
+
1162
+ for value in values {
1163
+ match *value {
1164
+ ParquetValue::Record(map) => {
1165
+ null_buffer_builder.append(true);
1166
+ for (idx, field) in fields.iter().enumerate() {
1167
+ let field_value = map.get(field.name().as_str()).unwrap_or(&null_value);
1168
+ field_columns[idx].push(field_value);
1169
+ }
1170
+ }
1171
+ ParquetValue::Null => {
1172
+ null_buffer_builder.append(false);
1173
+ for field_column in field_columns.iter_mut().take(fields.len()) {
1174
+ field_column.push(&null_value);
1175
+ }
1176
+ }
1177
+ _ => {
1178
+ return Err(ParquetError::Conversion(format!(
1179
+ "Expected Record, got {:?}",
1180
+ value.type_name()
1181
+ )))
1182
+ }
1183
+ }
1184
+ }
1185
+
1186
+ // Build arrays for each field
1187
+ for (column, field) in field_columns.iter().zip(fields.iter()) {
1188
+ let array = parquet_value_refs_to_arrow_array(column, field)?;
1189
+ field_arrays.push(array);
1190
+ }
1191
+
1192
+ let null_buffer = null_buffer_builder.finish();
1193
+ Ok(StdArc::new(StructArray::new(
1194
+ fields.clone(),
1195
+ field_arrays,
1196
+ Some(null_buffer.into()),
1197
+ )))
1198
+ }
1199
+
1200
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_array::*;
    use parquet::basic::Type as PhysicalType;

    /// Boolean values, including a null, survive the trip to Arrow and back.
    #[test]
    fn test_primitive_conversion_roundtrip() {
        let field = Field::new("test", DataType::Boolean, true);
        let type_ = Type::primitive_type_builder("test", PhysicalType::BOOLEAN)
            .build()
            .unwrap();

        let inputs = vec![
            ParquetValue::Boolean(true),
            ParquetValue::Boolean(false),
            ParquetValue::Null,
        ];
        let array = parquet_values_to_arrow_array(&inputs, &field).unwrap();

        for (row, want) in inputs.iter().enumerate() {
            let got = arrow_to_parquet_value(&field, &type_, array.as_ref(), row).unwrap();
            assert_eq!(&got, want);
        }
    }

    /// Int8/Int16/Int32 inputs are accepted when the target column is Int64.
    #[test]
    fn test_integer_upcasting() {
        let field = Field::new("test", DataType::Int64, false);
        let inputs = vec![
            ParquetValue::Int8(42),
            ParquetValue::Int16(1000),
            ParquetValue::Int32(100000),
        ];

        let array = parquet_values_to_arrow_array(&inputs, &field).unwrap();
        assert_eq!(array.len(), 3);

        let int64_array = array.as_any().downcast_ref::<Int64Array>().unwrap();
        let expected: [i64; 3] = [42, 1000, 100000];
        for (row, want) in expected.iter().enumerate() {
            assert_eq!(int64_array.value(row), *want);
        }
    }
}