parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,1133 @@
1
+ //! Bidirectional conversion between Arrow arrays and ParquetValue
2
+ //!
3
+ //! This module provides a unified interface for converting between Arrow's
4
+ //! columnar format and Parquet's value representation. It consolidates
5
+ //! the conversion logic that was previously duplicated between the reader
6
+ //! and writer modules.
7
+
8
+ use crate::{ParquetError, ParquetValue, Result};
9
+ use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
10
+ use arrow_schema::{DataType, Field};
11
+ use bytes::Bytes;
12
+ use indexmap::IndexMap;
13
+ use ordered_float::OrderedFloat;
14
+ use std::sync::Arc;
15
+
16
+ /// Convert a single value from an Arrow array at the given index to a ParquetValue
17
+ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
18
+ use arrow_array::*;
19
+
20
+ if array.is_null(index) {
21
+ return Ok(ParquetValue::Null);
22
+ }
23
+
24
+ match array.data_type() {
25
+ // Primitive types
26
+ DataType::Boolean => {
27
+ let array = downcast_array::<BooleanArray>(array)?;
28
+ Ok(ParquetValue::Boolean(array.value(index)))
29
+ }
30
+ DataType::Int8 => {
31
+ let array = downcast_array::<Int8Array>(array)?;
32
+ Ok(ParquetValue::Int8(array.value(index)))
33
+ }
34
+ DataType::Int16 => {
35
+ let array = downcast_array::<Int16Array>(array)?;
36
+ Ok(ParquetValue::Int16(array.value(index)))
37
+ }
38
+ DataType::Int32 => {
39
+ let array = downcast_array::<Int32Array>(array)?;
40
+ Ok(ParquetValue::Int32(array.value(index)))
41
+ }
42
+ DataType::Int64 => {
43
+ let array = downcast_array::<Int64Array>(array)?;
44
+ Ok(ParquetValue::Int64(array.value(index)))
45
+ }
46
+ DataType::UInt8 => {
47
+ let array = downcast_array::<UInt8Array>(array)?;
48
+ Ok(ParquetValue::UInt8(array.value(index)))
49
+ }
50
+ DataType::UInt16 => {
51
+ let array = downcast_array::<UInt16Array>(array)?;
52
+ Ok(ParquetValue::UInt16(array.value(index)))
53
+ }
54
+ DataType::UInt32 => {
55
+ let array = downcast_array::<UInt32Array>(array)?;
56
+ Ok(ParquetValue::UInt32(array.value(index)))
57
+ }
58
+ DataType::UInt64 => {
59
+ let array = downcast_array::<UInt64Array>(array)?;
60
+ Ok(ParquetValue::UInt64(array.value(index)))
61
+ }
62
+ DataType::Float16 => {
63
+ let array = downcast_array::<Float16Array>(array)?;
64
+ let value = array.value(index);
65
+ Ok(ParquetValue::Float16(OrderedFloat(value.to_f32())))
66
+ }
67
+ DataType::Float32 => {
68
+ let array = downcast_array::<Float32Array>(array)?;
69
+ Ok(ParquetValue::Float32(OrderedFloat(array.value(index))))
70
+ }
71
+ DataType::Float64 => {
72
+ let array = downcast_array::<Float64Array>(array)?;
73
+ Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
74
+ }
75
+
76
+ // String and binary types
77
+ DataType::Utf8 => {
78
+ let array = downcast_array::<StringArray>(array)?;
79
+ Ok(ParquetValue::String(Arc::from(array.value(index))))
80
+ }
81
+ DataType::Binary => {
82
+ let array = downcast_array::<BinaryArray>(array)?;
83
+ Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
84
+ array.value(index),
85
+ )))
86
+ }
87
+ DataType::FixedSizeBinary(_) => {
88
+ let array = downcast_array::<FixedSizeBinaryArray>(array)?;
89
+ Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
90
+ array.value(index),
91
+ )))
92
+ }
93
+
94
+ // Date and time types
95
+ DataType::Date32 => {
96
+ let array = downcast_array::<Date32Array>(array)?;
97
+ Ok(ParquetValue::Date32(array.value(index)))
98
+ }
99
+ DataType::Date64 => {
100
+ let array = downcast_array::<Date64Array>(array)?;
101
+ Ok(ParquetValue::Date64(array.value(index)))
102
+ }
103
+
104
+ // Timestamp types
105
+ DataType::Timestamp(unit, timezone) => {
106
+ let timezone = timezone.as_ref().map(|s| Arc::from(s.as_ref()));
107
+ match unit {
108
+ arrow_schema::TimeUnit::Millisecond => {
109
+ let array = downcast_array::<TimestampMillisecondArray>(array)?;
110
+ Ok(ParquetValue::TimestampMillis(array.value(index), timezone))
111
+ }
112
+ arrow_schema::TimeUnit::Microsecond => {
113
+ let array = downcast_array::<TimestampMicrosecondArray>(array)?;
114
+ Ok(ParquetValue::TimestampMicros(array.value(index), timezone))
115
+ }
116
+ arrow_schema::TimeUnit::Second => {
117
+ let array = downcast_array::<TimestampSecondArray>(array)?;
118
+ Ok(ParquetValue::TimestampSecond(array.value(index), timezone))
119
+ }
120
+ arrow_schema::TimeUnit::Nanosecond => {
121
+ let array = downcast_array::<TimestampNanosecondArray>(array)?;
122
+ Ok(ParquetValue::TimestampNanos(array.value(index), timezone))
123
+ }
124
+ }
125
+ }
126
+
127
+ // Time types
128
+ DataType::Time32(unit) => match unit {
129
+ arrow_schema::TimeUnit::Millisecond => {
130
+ let array = downcast_array::<Time32MillisecondArray>(array)?;
131
+ Ok(ParquetValue::TimeMillis(array.value(index)))
132
+ }
133
+ _ => Err(ParquetError::Conversion(format!(
134
+ "Unsupported time32 unit: {:?}",
135
+ unit
136
+ ))),
137
+ },
138
+ DataType::Time64(unit) => match unit {
139
+ arrow_schema::TimeUnit::Microsecond => {
140
+ let array = downcast_array::<Time64MicrosecondArray>(array)?;
141
+ Ok(ParquetValue::TimeMicros(array.value(index)))
142
+ }
143
+ _ => Err(ParquetError::Conversion(format!(
144
+ "Unsupported time64 unit: {:?}",
145
+ unit
146
+ ))),
147
+ },
148
+
149
+ // Decimal types
150
+ DataType::Decimal128(_precision, scale) => {
151
+ let array = downcast_array::<Decimal128Array>(array)?;
152
+ let value = array.value(index);
153
+ Ok(ParquetValue::Decimal128(value, *scale))
154
+ }
155
+ DataType::Decimal256(_precision, scale) => {
156
+ let array = downcast_array::<Decimal256Array>(array)?;
157
+ let bytes = array.value(index).to_le_bytes();
158
+
159
+ // Convert to BigInt
160
+ let bigint = if bytes[31] & 0x80 != 0 {
161
+ // Negative number - convert from two's complement
162
+ let mut inverted = [0u8; 32];
163
+ for (i, &b) in bytes.iter().enumerate() {
164
+ inverted[i] = !b;
165
+ }
166
+ let positive = num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &inverted);
167
+ -(positive + num::BigInt::from(1))
168
+ } else {
169
+ num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &bytes)
170
+ };
171
+
172
+ Ok(ParquetValue::Decimal256(bigint, *scale))
173
+ }
174
+
175
+ // Complex types
176
+ DataType::List(_) => {
177
+ let array = downcast_array::<ListArray>(array)?;
178
+ let list_values = array.value(index);
179
+
180
+ let mut values = Vec::with_capacity(list_values.len());
181
+ for i in 0..list_values.len() {
182
+ values.push(arrow_to_parquet_value(&list_values, i)?);
183
+ }
184
+
185
+ Ok(ParquetValue::List(values))
186
+ }
187
+ DataType::Map(_, _) => {
188
+ let array = downcast_array::<MapArray>(array)?;
189
+ let map_value = array.value(index);
190
+
191
+ // Map is stored as a struct with two fields: keys and values
192
+ let keys = map_value.column(0);
193
+ let values = map_value.column(1);
194
+
195
+ let mut map_vec = Vec::with_capacity(keys.len());
196
+ for i in 0..keys.len() {
197
+ let key = arrow_to_parquet_value(keys, i)?;
198
+ let value = arrow_to_parquet_value(values, i)?;
199
+ map_vec.push((key, value));
200
+ }
201
+
202
+ Ok(ParquetValue::Map(map_vec))
203
+ }
204
+ DataType::Struct(_) => {
205
+ let array = downcast_array::<StructArray>(array)?;
206
+
207
+ let mut map = IndexMap::new();
208
+ for (col_idx, field) in array.fields().iter().enumerate() {
209
+ let column = array.column(col_idx);
210
+ let value = arrow_to_parquet_value(column, index)?;
211
+ map.insert(Arc::from(field.name().as_str()), value);
212
+ }
213
+
214
+ Ok(ParquetValue::Record(map))
215
+ }
216
+
217
+ dt => Err(ParquetError::Conversion(format!(
218
+ "Unsupported data type for conversion: {:?}",
219
+ dt
220
+ ))),
221
+ }
222
+ }
223
+
224
+ /// Convert a vector of ParquetValues to an Arrow array
225
+ pub fn parquet_values_to_arrow_array(values: Vec<ParquetValue>, field: &Field) -> Result<ArrayRef> {
226
+ match field.data_type() {
227
+ // Boolean
228
+ DataType::Boolean => {
229
+ let mut builder = BooleanBuilder::with_capacity(values.len());
230
+ for value in values {
231
+ match value {
232
+ ParquetValue::Boolean(b) => builder.append_value(b),
233
+ ParquetValue::Null => builder.append_null(),
234
+ _ => {
235
+ return Err(ParquetError::Conversion(format!(
236
+ "Expected Boolean, got {:?}",
237
+ value.type_name()
238
+ )))
239
+ }
240
+ }
241
+ }
242
+ Ok(Arc::new(builder.finish()))
243
+ }
244
+
245
+ // Integer types with automatic upcasting
246
+ DataType::Int8 => build_int8_array(values),
247
+ DataType::Int16 => build_int16_array(values),
248
+ DataType::Int32 => build_int32_array(values),
249
+ DataType::Int64 => build_int64_array(values),
250
+ DataType::UInt8 => build_uint8_array(values),
251
+ DataType::UInt16 => build_uint16_array(values),
252
+ DataType::UInt32 => build_uint32_array(values),
253
+ DataType::UInt64 => build_uint64_array(values),
254
+
255
+ // Float types
256
+ DataType::Float32 => build_float32_array(values),
257
+ DataType::Float64 => build_float64_array(values),
258
+
259
+ // String and binary
260
+ DataType::Utf8 => build_string_array(values),
261
+ DataType::Binary => build_binary_array(values),
262
+ DataType::FixedSizeBinary(size) => build_fixed_binary_array(values, *size),
263
+
264
+ // Date and time
265
+ DataType::Date32 => build_date32_array(values),
266
+ DataType::Date64 => build_date64_array(values),
267
+ DataType::Time32(unit) => build_time32_array(values, unit),
268
+ DataType::Time64(unit) => build_time64_array(values, unit),
269
+
270
+ // Timestamp
271
+ DataType::Timestamp(unit, tz) => build_timestamp_array(values, unit, tz.as_deref()),
272
+
273
+ // Decimal
274
+ DataType::Decimal128(precision, scale) => {
275
+ build_decimal128_array(values, *precision, *scale)
276
+ }
277
+ DataType::Decimal256(precision, scale) => {
278
+ build_decimal256_array(values, *precision, *scale)
279
+ }
280
+
281
+ // Complex types
282
+ DataType::List(item_field) => build_list_array(values, item_field),
283
+ DataType::Map(entries_field, sorted) => build_map_array(values, entries_field, *sorted),
284
+ DataType::Struct(fields) => build_struct_array(values, fields),
285
+
286
+ dt => Err(ParquetError::Conversion(format!(
287
+ "Unsupported data type for conversion: {:?}",
288
+ dt
289
+ ))),
290
+ }
291
+ }
292
+
293
+ /// Helper function to downcast an array with better error messages
294
+ fn downcast_array<T: 'static>(array: &dyn Array) -> Result<&T> {
295
+ array.as_any().downcast_ref::<T>().ok_or_else(|| {
296
+ ParquetError::Conversion(format!("Failed to cast to {}", std::any::type_name::<T>()))
297
+ })
298
+ }
299
+
300
+ /// Build Int8 array
301
+ fn build_int8_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
302
+ let mut builder = Int8Builder::with_capacity(values.len());
303
+ for value in values {
304
+ match value {
305
+ ParquetValue::Int8(i) => builder.append_value(i),
306
+ ParquetValue::Null => builder.append_null(),
307
+ _ => {
308
+ return Err(ParquetError::Conversion(format!(
309
+ "Expected Int8, got {:?}",
310
+ value.type_name()
311
+ )))
312
+ }
313
+ }
314
+ }
315
+ Ok(Arc::new(builder.finish()))
316
+ }
317
+
318
+ /// Build Int16 array
319
+ fn build_int16_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
320
+ let mut builder = Int16Builder::with_capacity(values.len());
321
+ for value in values {
322
+ match value {
323
+ ParquetValue::Int16(i) => builder.append_value(i),
324
+ ParquetValue::Int8(i) => builder.append_value(i as i16),
325
+ ParquetValue::Null => builder.append_null(),
326
+ _ => {
327
+ return Err(ParquetError::Conversion(format!(
328
+ "Expected Int16, got {:?}",
329
+ value.type_name()
330
+ )))
331
+ }
332
+ }
333
+ }
334
+ Ok(Arc::new(builder.finish()))
335
+ }
336
+
337
+ /// Build Int32 array
338
+ fn build_int32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
339
+ let mut builder = Int32Builder::with_capacity(values.len());
340
+ for value in values {
341
+ match value {
342
+ ParquetValue::Int32(i) => builder.append_value(i),
343
+ ParquetValue::Int16(i) => builder.append_value(i as i32),
344
+ ParquetValue::Int8(i) => builder.append_value(i as i32),
345
+ ParquetValue::Null => builder.append_null(),
346
+ _ => {
347
+ return Err(ParquetError::Conversion(format!(
348
+ "Expected Int32, got {:?}",
349
+ value.type_name()
350
+ )))
351
+ }
352
+ }
353
+ }
354
+ Ok(Arc::new(builder.finish()))
355
+ }
356
+
357
+ /// Build Int64 array
358
+ fn build_int64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
359
+ let mut builder = Int64Builder::with_capacity(values.len());
360
+ for value in values {
361
+ match value {
362
+ ParquetValue::Int64(i) => builder.append_value(i),
363
+ ParquetValue::Int32(i) => builder.append_value(i as i64),
364
+ ParquetValue::Int16(i) => builder.append_value(i as i64),
365
+ ParquetValue::Int8(i) => builder.append_value(i as i64),
366
+ ParquetValue::Null => builder.append_null(),
367
+ _ => {
368
+ return Err(ParquetError::Conversion(format!(
369
+ "Expected Int64, got {:?}",
370
+ value.type_name()
371
+ )))
372
+ }
373
+ }
374
+ }
375
+ Ok(Arc::new(builder.finish()))
376
+ }
377
+
378
+ /// Build UInt8 array
379
+ fn build_uint8_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
380
+ let mut builder = UInt8Builder::with_capacity(values.len());
381
+ for value in values {
382
+ match value {
383
+ ParquetValue::UInt8(i) => builder.append_value(i),
384
+ ParquetValue::Null => builder.append_null(),
385
+ _ => {
386
+ return Err(ParquetError::Conversion(format!(
387
+ "Expected UInt8, got {:?}",
388
+ value.type_name()
389
+ )))
390
+ }
391
+ }
392
+ }
393
+ Ok(Arc::new(builder.finish()))
394
+ }
395
+
396
+ /// Build UInt16 array
397
+ fn build_uint16_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
398
+ let mut builder = UInt16Builder::with_capacity(values.len());
399
+ for value in values {
400
+ match value {
401
+ ParquetValue::UInt16(i) => builder.append_value(i),
402
+ ParquetValue::UInt8(i) => builder.append_value(i as u16),
403
+ ParquetValue::Null => builder.append_null(),
404
+ _ => {
405
+ return Err(ParquetError::Conversion(format!(
406
+ "Expected UInt16, got {:?}",
407
+ value.type_name()
408
+ )))
409
+ }
410
+ }
411
+ }
412
+ Ok(Arc::new(builder.finish()))
413
+ }
414
+
415
+ /// Build UInt32 array
416
+ fn build_uint32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
417
+ let mut builder = UInt32Builder::with_capacity(values.len());
418
+ for value in values {
419
+ match value {
420
+ ParquetValue::UInt32(i) => builder.append_value(i),
421
+ ParquetValue::UInt16(i) => builder.append_value(i as u32),
422
+ ParquetValue::UInt8(i) => builder.append_value(i as u32),
423
+ ParquetValue::Null => builder.append_null(),
424
+ _ => {
425
+ return Err(ParquetError::Conversion(format!(
426
+ "Expected UInt32, got {:?}",
427
+ value.type_name()
428
+ )))
429
+ }
430
+ }
431
+ }
432
+ Ok(Arc::new(builder.finish()))
433
+ }
434
+
435
+ /// Build UInt64 array
436
+ fn build_uint64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
437
+ let mut builder = UInt64Builder::with_capacity(values.len());
438
+ for value in values {
439
+ match value {
440
+ ParquetValue::UInt64(i) => builder.append_value(i),
441
+ ParquetValue::UInt32(i) => builder.append_value(i as u64),
442
+ ParquetValue::UInt16(i) => builder.append_value(i as u64),
443
+ ParquetValue::UInt8(i) => builder.append_value(i as u64),
444
+ ParquetValue::Null => builder.append_null(),
445
+ _ => {
446
+ return Err(ParquetError::Conversion(format!(
447
+ "Expected UInt64, got {:?}",
448
+ value.type_name()
449
+ )))
450
+ }
451
+ }
452
+ }
453
+ Ok(Arc::new(builder.finish()))
454
+ }
455
+
456
+ /// Build Float32 array with Float16 support
457
+ fn build_float32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
458
+ let mut builder = Float32Builder::with_capacity(values.len());
459
+ for value in values {
460
+ match value {
461
+ ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(f),
462
+ ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(f),
463
+ ParquetValue::Null => builder.append_null(),
464
+ _ => {
465
+ return Err(ParquetError::Conversion(format!(
466
+ "Expected Float32, got {:?}",
467
+ value.type_name()
468
+ )))
469
+ }
470
+ }
471
+ }
472
+ Ok(Arc::new(builder.finish()))
473
+ }
474
+
475
+ /// Build Float64 array with Float32 and Float16 support
476
+ fn build_float64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
477
+ let mut builder = Float64Builder::with_capacity(values.len());
478
+ for value in values {
479
+ match value {
480
+ ParquetValue::Float64(OrderedFloat(f)) => builder.append_value(f),
481
+ ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(f as f64),
482
+ ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(f as f64),
483
+ ParquetValue::Null => builder.append_null(),
484
+ _ => {
485
+ return Err(ParquetError::Conversion(format!(
486
+ "Expected Float64, got {:?}",
487
+ value.type_name()
488
+ )))
489
+ }
490
+ }
491
+ }
492
+ Ok(Arc::new(builder.finish()))
493
+ }
494
+
495
+ /// Build string array
496
+ fn build_string_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
497
+ let mut builder = StringBuilder::with_capacity(values.len(), 0);
498
+ for value in values {
499
+ match value {
500
+ ParquetValue::String(s) => builder.append_value(&s),
501
+ ParquetValue::Null => builder.append_null(),
502
+ _ => {
503
+ return Err(ParquetError::Conversion(format!(
504
+ "Expected String, got {:?}",
505
+ value.type_name()
506
+ )))
507
+ }
508
+ }
509
+ }
510
+ Ok(Arc::new(builder.finish()))
511
+ }
512
+
513
+ /// Build binary array
514
+ fn build_binary_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
515
+ let mut builder = BinaryBuilder::with_capacity(values.len(), 0);
516
+ for value in values {
517
+ match value {
518
+ ParquetValue::Bytes(b) => builder.append_value(&b),
519
+ ParquetValue::Null => builder.append_null(),
520
+ _ => {
521
+ return Err(ParquetError::Conversion(format!(
522
+ "Expected Bytes, got {:?}",
523
+ value.type_name()
524
+ )))
525
+ }
526
+ }
527
+ }
528
+ Ok(Arc::new(builder.finish()))
529
+ }
530
+
531
+ /// Build fixed size binary array
532
+ fn build_fixed_binary_array(values: Vec<ParquetValue>, size: i32) -> Result<ArrayRef> {
533
+ let mut builder = FixedSizeBinaryBuilder::with_capacity(values.len(), size);
534
+ for value in values {
535
+ match value {
536
+ ParquetValue::Bytes(b) => {
537
+ if b.len() != size as usize {
538
+ return Err(ParquetError::Conversion(format!(
539
+ "Fixed size binary expected {} bytes, got {}",
540
+ size,
541
+ b.len()
542
+ )));
543
+ }
544
+ builder.append_value(&b)?;
545
+ }
546
+ ParquetValue::Null => builder.append_null(),
547
+ _ => {
548
+ return Err(ParquetError::Conversion(format!(
549
+ "Expected Bytes, got {:?}",
550
+ value.type_name()
551
+ )))
552
+ }
553
+ }
554
+ }
555
+ Ok(Arc::new(builder.finish()))
556
+ }
557
+
558
+ /// Build Date32 array
559
+ fn build_date32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
560
+ let mut builder = Date32Builder::with_capacity(values.len());
561
+ for value in values {
562
+ match value {
563
+ ParquetValue::Date32(d) => builder.append_value(d),
564
+ ParquetValue::Null => builder.append_null(),
565
+ _ => {
566
+ return Err(ParquetError::Conversion(format!(
567
+ "Expected Date32, got {:?}",
568
+ value.type_name()
569
+ )))
570
+ }
571
+ }
572
+ }
573
+ Ok(Arc::new(builder.finish()))
574
+ }
575
+
576
+ /// Build Date64 array
577
+ fn build_date64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
578
+ let mut builder = Date64Builder::with_capacity(values.len());
579
+ for value in values {
580
+ match value {
581
+ ParquetValue::Date64(d) => builder.append_value(d),
582
+ ParquetValue::Null => builder.append_null(),
583
+ _ => {
584
+ return Err(ParquetError::Conversion(format!(
585
+ "Expected Date64, got {:?}",
586
+ value.type_name()
587
+ )))
588
+ }
589
+ }
590
+ }
591
+ Ok(Arc::new(builder.finish()))
592
+ }
593
+
594
+ /// Build Time32 array
595
+ fn build_time32_array(
596
+ values: Vec<ParquetValue>,
597
+ unit: &arrow_schema::TimeUnit,
598
+ ) -> Result<ArrayRef> {
599
+ match unit {
600
+ arrow_schema::TimeUnit::Millisecond => {
601
+ let mut builder = Time32MillisecondBuilder::with_capacity(values.len());
602
+ for value in values {
603
+ match value {
604
+ ParquetValue::TimeMillis(t) => builder.append_value(t),
605
+ ParquetValue::Null => builder.append_null(),
606
+ _ => {
607
+ return Err(ParquetError::Conversion(format!(
608
+ "Expected TimeMillis, got {:?}",
609
+ value.type_name()
610
+ )))
611
+ }
612
+ }
613
+ }
614
+ Ok(Arc::new(builder.finish()))
615
+ }
616
+ _ => Err(ParquetError::Conversion(format!(
617
+ "Unsupported time32 unit: {:?}",
618
+ unit
619
+ ))),
620
+ }
621
+ }
622
+
623
+ /// Build Time64 array
624
+ fn build_time64_array(
625
+ values: Vec<ParquetValue>,
626
+ unit: &arrow_schema::TimeUnit,
627
+ ) -> Result<ArrayRef> {
628
+ match unit {
629
+ arrow_schema::TimeUnit::Microsecond => {
630
+ let mut builder = Time64MicrosecondBuilder::with_capacity(values.len());
631
+ for value in values {
632
+ match value {
633
+ ParquetValue::TimeMicros(t) => builder.append_value(t),
634
+ ParquetValue::Null => builder.append_null(),
635
+ _ => {
636
+ return Err(ParquetError::Conversion(format!(
637
+ "Expected TimeMicros, got {:?}",
638
+ value.type_name()
639
+ )))
640
+ }
641
+ }
642
+ }
643
+ Ok(Arc::new(builder.finish()))
644
+ }
645
+ _ => Err(ParquetError::Conversion(format!(
646
+ "Unsupported time64 unit: {:?}",
647
+ unit
648
+ ))),
649
+ }
650
+ }
651
+
652
+ /// Build timestamp array
653
+ fn build_timestamp_array(
654
+ values: Vec<ParquetValue>,
655
+ unit: &arrow_schema::TimeUnit,
656
+ timezone: Option<&str>,
657
+ ) -> Result<ArrayRef> {
658
+ // First, check if all values have the same timezone (or use the field timezone)
659
+ let mut common_tz: Option<Option<Arc<str>>> = None;
660
+ for value in &values {
661
+ match value {
662
+ ParquetValue::TimestampSecond(_, tz)
663
+ | ParquetValue::TimestampMillis(_, tz)
664
+ | ParquetValue::TimestampMicros(_, tz)
665
+ | ParquetValue::TimestampNanos(_, tz) => {
666
+ match &common_tz {
667
+ None => common_tz = Some(tz.clone()),
668
+ Some(existing) => {
669
+ // If we have mixed timezones, we'll use the field timezone
670
+ if existing != tz {
671
+ common_tz = Some(timezone.map(Arc::from));
672
+ break;
673
+ }
674
+ }
675
+ }
676
+ }
677
+ ParquetValue::Null => {}
678
+ _ => {}
679
+ }
680
+ }
681
+
682
+ // Use the common timezone from values, or fall back to field timezone
683
+ let tz = common_tz.unwrap_or_else(|| timezone.map(Arc::from));
684
+
685
+ match unit {
686
+ arrow_schema::TimeUnit::Second => {
687
+ let mut builder =
688
+ TimestampSecondBuilder::with_capacity(values.len()).with_timezone_opt(tz.clone());
689
+ for value in values {
690
+ match value {
691
+ ParquetValue::TimestampSecond(t, _) => builder.append_value(t),
692
+ ParquetValue::Null => builder.append_null(),
693
+ _ => {
694
+ return Err(ParquetError::Conversion(format!(
695
+ "Expected TimestampSecond, got {:?}",
696
+ value.type_name()
697
+ )))
698
+ }
699
+ }
700
+ }
701
+ Ok(Arc::new(builder.finish()))
702
+ }
703
+ arrow_schema::TimeUnit::Millisecond => {
704
+ let mut builder = TimestampMillisecondBuilder::with_capacity(values.len())
705
+ .with_timezone_opt(tz.clone());
706
+ for value in values {
707
+ match value {
708
+ ParquetValue::TimestampMillis(t, _) => builder.append_value(t),
709
+ ParquetValue::Null => builder.append_null(),
710
+ _ => {
711
+ return Err(ParquetError::Conversion(format!(
712
+ "Expected TimestampMillis, got {:?}",
713
+ value.type_name()
714
+ )))
715
+ }
716
+ }
717
+ }
718
+ Ok(Arc::new(builder.finish()))
719
+ }
720
+ arrow_schema::TimeUnit::Microsecond => {
721
+ let mut builder = TimestampMicrosecondBuilder::with_capacity(values.len())
722
+ .with_timezone_opt(tz.clone());
723
+ for value in values {
724
+ match value {
725
+ ParquetValue::TimestampMicros(t, _) => builder.append_value(t),
726
+ ParquetValue::Null => builder.append_null(),
727
+ _ => {
728
+ return Err(ParquetError::Conversion(format!(
729
+ "Expected TimestampMicros, got {:?}",
730
+ value.type_name()
731
+ )))
732
+ }
733
+ }
734
+ }
735
+ Ok(Arc::new(builder.finish()))
736
+ }
737
+ arrow_schema::TimeUnit::Nanosecond => {
738
+ let mut builder = TimestampNanosecondBuilder::with_capacity(values.len())
739
+ .with_timezone_opt(tz.clone());
740
+ for value in values {
741
+ match value {
742
+ ParquetValue::TimestampNanos(t, _) => builder.append_value(t),
743
+ ParquetValue::Null => builder.append_null(),
744
+ _ => {
745
+ return Err(ParquetError::Conversion(format!(
746
+ "Expected TimestampNanos, got {:?}",
747
+ value.type_name()
748
+ )))
749
+ }
750
+ }
751
+ }
752
+ Ok(Arc::new(builder.finish()))
753
+ }
754
+ }
755
+ }
756
+
757
+ /// Build Decimal128 array
758
+ fn build_decimal128_array(values: Vec<ParquetValue>, precision: u8, scale: i8) -> Result<ArrayRef> {
759
+ let mut builder = Decimal128Builder::with_capacity(values.len())
760
+ .with_precision_and_scale(precision, scale)?;
761
+ for value in values {
762
+ match value {
763
+ ParquetValue::Decimal128(d, _) => builder.append_value(d),
764
+ ParquetValue::Null => builder.append_null(),
765
+ _ => {
766
+ return Err(ParquetError::Conversion(format!(
767
+ "Expected Decimal128, got {:?}",
768
+ value.type_name()
769
+ )))
770
+ }
771
+ }
772
+ }
773
+ Ok(Arc::new(builder.finish()))
774
+ }
775
+
776
+ /// Build Decimal256 array
777
+ fn build_decimal256_array(values: Vec<ParquetValue>, precision: u8, scale: i8) -> Result<ArrayRef> {
778
+ let mut builder = Decimal256Builder::with_capacity(values.len())
779
+ .with_precision_and_scale(precision, scale)?;
780
+ for value in values {
781
+ match value {
782
+ ParquetValue::Decimal256(bigint, _) => {
783
+ let bytes = decimal256_from_bigint(&bigint)?;
784
+ builder.append_value(bytes);
785
+ }
786
+ ParquetValue::Null => builder.append_null(),
787
+ _ => {
788
+ return Err(ParquetError::Conversion(format!(
789
+ "Expected Decimal256, got {:?}",
790
+ value.type_name()
791
+ )))
792
+ }
793
+ }
794
+ }
795
+ Ok(Arc::new(builder.finish()))
796
+ }
797
+
798
+ /// Convert BigInt to i256 (32-byte array)
799
+ fn decimal256_from_bigint(bigint: &num::BigInt) -> Result<arrow_buffer::i256> {
800
+ // Get bytes in little-endian format
801
+ let (sign, mut bytes) = bigint.to_bytes_le();
802
+
803
+ // Ensure we have exactly 32 bytes
804
+ if bytes.len() > 32 {
805
+ return Err(ParquetError::Conversion(
806
+ "Decimal256 value too large".to_string(),
807
+ ));
808
+ }
809
+
810
+ // Pad with zeros or ones (for negative numbers) to reach 32 bytes
811
+ bytes.resize(32, 0);
812
+
813
+ // If negative, convert to two's complement
814
+ if sign == num::bigint::Sign::Minus {
815
+ // Invert all bits
816
+ for byte in &mut bytes {
817
+ *byte = !*byte;
818
+ }
819
+ // Add 1
820
+ let mut carry = true;
821
+ for byte in &mut bytes {
822
+ if carry {
823
+ let (new_byte, new_carry) = byte.overflowing_add(1);
824
+ *byte = new_byte;
825
+ carry = new_carry;
826
+ } else {
827
+ break;
828
+ }
829
+ }
830
+ }
831
+
832
+ let byte_array: [u8; 32] = bytes
833
+ .try_into()
834
+ .map_err(|_| ParquetError::Conversion("Failed to convert bytes to i256".to_string()))?;
835
+ Ok(arrow_buffer::i256::from_le_bytes(byte_array))
836
+ }
837
+
838
+ /// Build list array
839
+ fn build_list_array(values: Vec<ParquetValue>, item_field: &Arc<Field>) -> Result<ArrayRef> {
840
+ let mut all_items = Vec::new();
841
+ let mut offsets = Vec::with_capacity(values.len() + 1);
842
+ let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
843
+ offsets.push(0i32);
844
+
845
+ for value in values {
846
+ match value {
847
+ ParquetValue::List(items) => {
848
+ all_items.extend(items);
849
+ offsets.push(all_items.len() as i32);
850
+ null_buffer_builder.append(true);
851
+ }
852
+ ParquetValue::Null => {
853
+ offsets.push(all_items.len() as i32);
854
+ null_buffer_builder.append(false);
855
+ }
856
+ _ => {
857
+ return Err(ParquetError::Conversion(format!(
858
+ "Expected List, got {:?}",
859
+ value.type_name()
860
+ )))
861
+ }
862
+ }
863
+ }
864
+
865
+ let item_array = parquet_values_to_arrow_array(all_items, item_field)?;
866
+ let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
867
+ let null_buffer = null_buffer_builder.finish();
868
+
869
+ Ok(Arc::new(ListArray::new(
870
+ item_field.clone(),
871
+ offset_buffer,
872
+ item_array,
873
+ Some(null_buffer.into()),
874
+ )))
875
+ }
876
+
877
+ /// Build map array
878
+ fn build_map_array(
879
+ values: Vec<ParquetValue>,
880
+ entries_field: &Arc<Field>,
881
+ _sorted: bool,
882
+ ) -> Result<ArrayRef> {
883
+ // Extract the key and value fields from the entries struct
884
+ let (key_field, value_field) = match entries_field.data_type() {
885
+ DataType::Struct(fields) if fields.len() == 2 => (&fields[0], &fields[1]),
886
+ _ => {
887
+ return Err(ParquetError::Conversion(
888
+ "Map entries field must be a struct with exactly 2 fields".to_string(),
889
+ ))
890
+ }
891
+ };
892
+
893
+ let mut all_keys = Vec::new();
894
+ let mut all_values = Vec::new();
895
+ let mut offsets = Vec::with_capacity(values.len() + 1);
896
+ let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
897
+ offsets.push(0i32);
898
+
899
+ for value in values {
900
+ match value {
901
+ ParquetValue::Map(entries) => {
902
+ for (k, v) in entries {
903
+ all_keys.push(k);
904
+ all_values.push(v);
905
+ }
906
+ offsets.push(all_keys.len() as i32);
907
+ null_buffer_builder.append(true);
908
+ }
909
+ ParquetValue::Null => {
910
+ offsets.push(all_keys.len() as i32);
911
+ null_buffer_builder.append(false);
912
+ }
913
+ _ => {
914
+ return Err(ParquetError::Conversion(format!(
915
+ "Expected Map, got {:?}",
916
+ value.type_name()
917
+ )))
918
+ }
919
+ }
920
+ }
921
+
922
+ let key_array = parquet_values_to_arrow_array(all_keys, key_field)?;
923
+ let value_array = parquet_values_to_arrow_array(all_values, value_field)?;
924
+
925
+ // Create struct array for entries
926
+ let struct_fields = match entries_field.data_type() {
927
+ DataType::Struct(fields) => fields.clone(),
928
+ _ => unreachable!("Map entries field must be a struct"),
929
+ };
930
+
931
+ let struct_array = StructArray::new(struct_fields, vec![key_array, value_array], None);
932
+
933
+ let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
934
+ let null_buffer = null_buffer_builder.finish();
935
+
936
+ Ok(Arc::new(MapArray::new(
937
+ entries_field.clone(),
938
+ offset_buffer,
939
+ struct_array,
940
+ Some(null_buffer.into()),
941
+ false, // sorted
942
+ )))
943
+ }
944
+
945
+ /// Build struct array
946
+ fn build_struct_array(
947
+ values: Vec<ParquetValue>,
948
+ fields: &arrow_schema::Fields,
949
+ ) -> Result<ArrayRef> {
950
+ let num_rows = values.len();
951
+ let mut field_arrays = Vec::with_capacity(fields.len());
952
+ let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(num_rows);
953
+
954
+ // Prepare columns for each field
955
+ let mut field_columns: Vec<Vec<ParquetValue>> =
956
+ vec![Vec::with_capacity(num_rows); fields.len()];
957
+
958
+ for value in values {
959
+ match value {
960
+ ParquetValue::Record(map) => {
961
+ null_buffer_builder.append(true);
962
+ for (idx, field) in fields.iter().enumerate() {
963
+ let field_value = map
964
+ .get(field.name().as_str())
965
+ .cloned()
966
+ .unwrap_or(ParquetValue::Null);
967
+ field_columns[idx].push(field_value);
968
+ }
969
+ }
970
+ ParquetValue::Null => {
971
+ null_buffer_builder.append(false);
972
+ for field_column in field_columns.iter_mut().take(fields.len()) {
973
+ field_column.push(ParquetValue::Null);
974
+ }
975
+ }
976
+ _ => {
977
+ return Err(ParquetError::Conversion(format!(
978
+ "Expected Record, got {:?}",
979
+ value.type_name()
980
+ )))
981
+ }
982
+ }
983
+ }
984
+
985
+ // Build arrays for each field
986
+ for (column, field) in field_columns.into_iter().zip(fields.iter()) {
987
+ let array = parquet_values_to_arrow_array(column, field)?;
988
+ field_arrays.push(array);
989
+ }
990
+
991
+ let null_buffer = null_buffer_builder.finish();
992
+ Ok(Arc::new(StructArray::new(
993
+ fields.clone(),
994
+ field_arrays,
995
+ Some(null_buffer.into()),
996
+ )))
997
+ }
998
+
999
+ /// Append a single ParquetValue to an ArrayBuilder
1000
+ /// This is used for incremental building in complex scenarios
1001
+ pub fn append_parquet_value_to_builder(
1002
+ builder: &mut dyn ArrayBuilder,
1003
+ value: ParquetValue,
1004
+ data_type: &DataType,
1005
+ ) -> Result<()> {
1006
+ match data_type {
1007
+ DataType::Boolean => match value {
1008
+ ParquetValue::Boolean(b) => {
1009
+ let boolean_builder = builder
1010
+ .as_any_mut()
1011
+ .downcast_mut::<BooleanBuilder>()
1012
+ .ok_or_else(|| {
1013
+ ParquetError::Conversion("Failed to downcast to BooleanBuilder".to_string())
1014
+ })?;
1015
+ boolean_builder.append_value(b);
1016
+ }
1017
+ ParquetValue::Null => {
1018
+ let boolean_builder = builder
1019
+ .as_any_mut()
1020
+ .downcast_mut::<BooleanBuilder>()
1021
+ .ok_or_else(|| {
1022
+ ParquetError::Conversion("Failed to downcast to BooleanBuilder".to_string())
1023
+ })?;
1024
+ boolean_builder.append_null();
1025
+ }
1026
+ _ => {
1027
+ return Err(ParquetError::Conversion(format!(
1028
+ "Expected Boolean, got {:?}",
1029
+ value.type_name()
1030
+ )))
1031
+ }
1032
+ },
1033
+
1034
+ // For complex types like Map and Struct, we need special handling
1035
+ DataType::Map(entries_field, _) => match value {
1036
+ ParquetValue::Map(entries) => {
1037
+ let map_builder = builder
1038
+ .as_any_mut()
1039
+ .downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
1040
+ .ok_or_else(|| {
1041
+ ParquetError::Conversion("Failed to downcast to MapBuilder".to_string())
1042
+ })?;
1043
+
1044
+ if let DataType::Struct(fields) = entries_field.data_type() {
1045
+ if fields.len() != 2 {
1046
+ return Err(ParquetError::Conversion(
1047
+ "Map entries struct must have exactly 2 fields".to_string(),
1048
+ ));
1049
+ }
1050
+
1051
+ let key_type = fields[0].data_type();
1052
+ let value_type = fields[1].data_type();
1053
+
1054
+ for (key, val) in entries {
1055
+ append_parquet_value_to_builder(map_builder.keys(), key, key_type)?;
1056
+ append_parquet_value_to_builder(map_builder.values(), val, value_type)?;
1057
+ }
1058
+ map_builder.append(true)?;
1059
+ } else {
1060
+ return Err(ParquetError::Conversion(
1061
+ "Map entries field must be a struct".to_string(),
1062
+ ));
1063
+ }
1064
+ }
1065
+ ParquetValue::Null => {
1066
+ let map_builder = builder
1067
+ .as_any_mut()
1068
+ .downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
1069
+ .ok_or_else(|| {
1070
+ ParquetError::Conversion("Failed to downcast to MapBuilder".to_string())
1071
+ })?;
1072
+ map_builder.append(false)?;
1073
+ }
1074
+ _ => {
1075
+ return Err(ParquetError::Conversion(format!(
1076
+ "Expected Map, got {:?}",
1077
+ value.type_name()
1078
+ )))
1079
+ }
1080
+ },
1081
+
1082
+ // For other types, use the existing pattern
1083
+ _ => {
1084
+ return Err(ParquetError::Conversion(format!(
1085
+ "append_parquet_value_to_builder not implemented for type: {:?}",
1086
+ data_type
1087
+ )))
1088
+ }
1089
+ }
1090
+
1091
+ Ok(())
1092
+ }
1093
+
1094
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_array::*;

    #[test]
    fn test_primitive_conversion_roundtrip() {
        // Booleans (including a null slot) should survive a round trip
        // through an Arrow array and back.
        let values = vec![
            ParquetValue::Boolean(true),
            ParquetValue::Boolean(false),
            ParquetValue::Null,
        ];
        let field = Field::new("test", DataType::Boolean, true);
        let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();

        for (i, expected) in values.iter().enumerate() {
            let roundtripped = arrow_to_parquet_value(array.as_ref(), i).unwrap();
            assert_eq!(&roundtripped, expected);
        }
    }

    #[test]
    fn test_integer_upcasting() {
        // Smaller integer variants should be widened into an Int64 column.
        let values = vec![
            ParquetValue::Int8(42),
            ParquetValue::Int16(1000),
            ParquetValue::Int32(100000),
        ];
        let field = Field::new("test", DataType::Int64, false);
        let array = parquet_values_to_arrow_array(values, &field).unwrap();

        assert_eq!(array.len(), 3);
        let int64_array = array.as_any().downcast_ref::<Int64Array>().unwrap();
        for (i, expected) in [42i64, 1000, 100000].into_iter().enumerate() {
            assert_eq!(int64_array.value(i), expected);
        }
    }
}