parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,1133 @@
|
|
1
|
+
//! Bidirectional conversion between Arrow arrays and ParquetValue
|
2
|
+
//!
|
3
|
+
//! This module provides a unified interface for converting between Arrow's
|
4
|
+
//! columnar format and Parquet's value representation. It consolidates
|
5
|
+
//! the conversion logic that was previously duplicated between the reader
|
6
|
+
//! and writer modules.
|
7
|
+
|
8
|
+
use crate::{ParquetError, ParquetValue, Result};
|
9
|
+
use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
|
10
|
+
use arrow_schema::{DataType, Field};
|
11
|
+
use bytes::Bytes;
|
12
|
+
use indexmap::IndexMap;
|
13
|
+
use ordered_float::OrderedFloat;
|
14
|
+
use std::sync::Arc;
|
15
|
+
|
16
|
+
/// Convert a single value from an Arrow array at the given index to a ParquetValue
|
17
|
+
pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
|
18
|
+
use arrow_array::*;
|
19
|
+
|
20
|
+
if array.is_null(index) {
|
21
|
+
return Ok(ParquetValue::Null);
|
22
|
+
}
|
23
|
+
|
24
|
+
match array.data_type() {
|
25
|
+
// Primitive types
|
26
|
+
DataType::Boolean => {
|
27
|
+
let array = downcast_array::<BooleanArray>(array)?;
|
28
|
+
Ok(ParquetValue::Boolean(array.value(index)))
|
29
|
+
}
|
30
|
+
DataType::Int8 => {
|
31
|
+
let array = downcast_array::<Int8Array>(array)?;
|
32
|
+
Ok(ParquetValue::Int8(array.value(index)))
|
33
|
+
}
|
34
|
+
DataType::Int16 => {
|
35
|
+
let array = downcast_array::<Int16Array>(array)?;
|
36
|
+
Ok(ParquetValue::Int16(array.value(index)))
|
37
|
+
}
|
38
|
+
DataType::Int32 => {
|
39
|
+
let array = downcast_array::<Int32Array>(array)?;
|
40
|
+
Ok(ParquetValue::Int32(array.value(index)))
|
41
|
+
}
|
42
|
+
DataType::Int64 => {
|
43
|
+
let array = downcast_array::<Int64Array>(array)?;
|
44
|
+
Ok(ParquetValue::Int64(array.value(index)))
|
45
|
+
}
|
46
|
+
DataType::UInt8 => {
|
47
|
+
let array = downcast_array::<UInt8Array>(array)?;
|
48
|
+
Ok(ParquetValue::UInt8(array.value(index)))
|
49
|
+
}
|
50
|
+
DataType::UInt16 => {
|
51
|
+
let array = downcast_array::<UInt16Array>(array)?;
|
52
|
+
Ok(ParquetValue::UInt16(array.value(index)))
|
53
|
+
}
|
54
|
+
DataType::UInt32 => {
|
55
|
+
let array = downcast_array::<UInt32Array>(array)?;
|
56
|
+
Ok(ParquetValue::UInt32(array.value(index)))
|
57
|
+
}
|
58
|
+
DataType::UInt64 => {
|
59
|
+
let array = downcast_array::<UInt64Array>(array)?;
|
60
|
+
Ok(ParquetValue::UInt64(array.value(index)))
|
61
|
+
}
|
62
|
+
DataType::Float16 => {
|
63
|
+
let array = downcast_array::<Float16Array>(array)?;
|
64
|
+
let value = array.value(index);
|
65
|
+
Ok(ParquetValue::Float16(OrderedFloat(value.to_f32())))
|
66
|
+
}
|
67
|
+
DataType::Float32 => {
|
68
|
+
let array = downcast_array::<Float32Array>(array)?;
|
69
|
+
Ok(ParquetValue::Float32(OrderedFloat(array.value(index))))
|
70
|
+
}
|
71
|
+
DataType::Float64 => {
|
72
|
+
let array = downcast_array::<Float64Array>(array)?;
|
73
|
+
Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
|
74
|
+
}
|
75
|
+
|
76
|
+
// String and binary types
|
77
|
+
DataType::Utf8 => {
|
78
|
+
let array = downcast_array::<StringArray>(array)?;
|
79
|
+
Ok(ParquetValue::String(Arc::from(array.value(index))))
|
80
|
+
}
|
81
|
+
DataType::Binary => {
|
82
|
+
let array = downcast_array::<BinaryArray>(array)?;
|
83
|
+
Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
|
84
|
+
array.value(index),
|
85
|
+
)))
|
86
|
+
}
|
87
|
+
DataType::FixedSizeBinary(_) => {
|
88
|
+
let array = downcast_array::<FixedSizeBinaryArray>(array)?;
|
89
|
+
Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
|
90
|
+
array.value(index),
|
91
|
+
)))
|
92
|
+
}
|
93
|
+
|
94
|
+
// Date and time types
|
95
|
+
DataType::Date32 => {
|
96
|
+
let array = downcast_array::<Date32Array>(array)?;
|
97
|
+
Ok(ParquetValue::Date32(array.value(index)))
|
98
|
+
}
|
99
|
+
DataType::Date64 => {
|
100
|
+
let array = downcast_array::<Date64Array>(array)?;
|
101
|
+
Ok(ParquetValue::Date64(array.value(index)))
|
102
|
+
}
|
103
|
+
|
104
|
+
// Timestamp types
|
105
|
+
DataType::Timestamp(unit, timezone) => {
|
106
|
+
let timezone = timezone.as_ref().map(|s| Arc::from(s.as_ref()));
|
107
|
+
match unit {
|
108
|
+
arrow_schema::TimeUnit::Millisecond => {
|
109
|
+
let array = downcast_array::<TimestampMillisecondArray>(array)?;
|
110
|
+
Ok(ParquetValue::TimestampMillis(array.value(index), timezone))
|
111
|
+
}
|
112
|
+
arrow_schema::TimeUnit::Microsecond => {
|
113
|
+
let array = downcast_array::<TimestampMicrosecondArray>(array)?;
|
114
|
+
Ok(ParquetValue::TimestampMicros(array.value(index), timezone))
|
115
|
+
}
|
116
|
+
arrow_schema::TimeUnit::Second => {
|
117
|
+
let array = downcast_array::<TimestampSecondArray>(array)?;
|
118
|
+
Ok(ParquetValue::TimestampSecond(array.value(index), timezone))
|
119
|
+
}
|
120
|
+
arrow_schema::TimeUnit::Nanosecond => {
|
121
|
+
let array = downcast_array::<TimestampNanosecondArray>(array)?;
|
122
|
+
Ok(ParquetValue::TimestampNanos(array.value(index), timezone))
|
123
|
+
}
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
// Time types
|
128
|
+
DataType::Time32(unit) => match unit {
|
129
|
+
arrow_schema::TimeUnit::Millisecond => {
|
130
|
+
let array = downcast_array::<Time32MillisecondArray>(array)?;
|
131
|
+
Ok(ParquetValue::TimeMillis(array.value(index)))
|
132
|
+
}
|
133
|
+
_ => Err(ParquetError::Conversion(format!(
|
134
|
+
"Unsupported time32 unit: {:?}",
|
135
|
+
unit
|
136
|
+
))),
|
137
|
+
},
|
138
|
+
DataType::Time64(unit) => match unit {
|
139
|
+
arrow_schema::TimeUnit::Microsecond => {
|
140
|
+
let array = downcast_array::<Time64MicrosecondArray>(array)?;
|
141
|
+
Ok(ParquetValue::TimeMicros(array.value(index)))
|
142
|
+
}
|
143
|
+
_ => Err(ParquetError::Conversion(format!(
|
144
|
+
"Unsupported time64 unit: {:?}",
|
145
|
+
unit
|
146
|
+
))),
|
147
|
+
},
|
148
|
+
|
149
|
+
// Decimal types
|
150
|
+
DataType::Decimal128(_precision, scale) => {
|
151
|
+
let array = downcast_array::<Decimal128Array>(array)?;
|
152
|
+
let value = array.value(index);
|
153
|
+
Ok(ParquetValue::Decimal128(value, *scale))
|
154
|
+
}
|
155
|
+
DataType::Decimal256(_precision, scale) => {
|
156
|
+
let array = downcast_array::<Decimal256Array>(array)?;
|
157
|
+
let bytes = array.value(index).to_le_bytes();
|
158
|
+
|
159
|
+
// Convert to BigInt
|
160
|
+
let bigint = if bytes[31] & 0x80 != 0 {
|
161
|
+
// Negative number - convert from two's complement
|
162
|
+
let mut inverted = [0u8; 32];
|
163
|
+
for (i, &b) in bytes.iter().enumerate() {
|
164
|
+
inverted[i] = !b;
|
165
|
+
}
|
166
|
+
let positive = num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &inverted);
|
167
|
+
-(positive + num::BigInt::from(1))
|
168
|
+
} else {
|
169
|
+
num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &bytes)
|
170
|
+
};
|
171
|
+
|
172
|
+
Ok(ParquetValue::Decimal256(bigint, *scale))
|
173
|
+
}
|
174
|
+
|
175
|
+
// Complex types
|
176
|
+
DataType::List(_) => {
|
177
|
+
let array = downcast_array::<ListArray>(array)?;
|
178
|
+
let list_values = array.value(index);
|
179
|
+
|
180
|
+
let mut values = Vec::with_capacity(list_values.len());
|
181
|
+
for i in 0..list_values.len() {
|
182
|
+
values.push(arrow_to_parquet_value(&list_values, i)?);
|
183
|
+
}
|
184
|
+
|
185
|
+
Ok(ParquetValue::List(values))
|
186
|
+
}
|
187
|
+
DataType::Map(_, _) => {
|
188
|
+
let array = downcast_array::<MapArray>(array)?;
|
189
|
+
let map_value = array.value(index);
|
190
|
+
|
191
|
+
// Map is stored as a struct with two fields: keys and values
|
192
|
+
let keys = map_value.column(0);
|
193
|
+
let values = map_value.column(1);
|
194
|
+
|
195
|
+
let mut map_vec = Vec::with_capacity(keys.len());
|
196
|
+
for i in 0..keys.len() {
|
197
|
+
let key = arrow_to_parquet_value(keys, i)?;
|
198
|
+
let value = arrow_to_parquet_value(values, i)?;
|
199
|
+
map_vec.push((key, value));
|
200
|
+
}
|
201
|
+
|
202
|
+
Ok(ParquetValue::Map(map_vec))
|
203
|
+
}
|
204
|
+
DataType::Struct(_) => {
|
205
|
+
let array = downcast_array::<StructArray>(array)?;
|
206
|
+
|
207
|
+
let mut map = IndexMap::new();
|
208
|
+
for (col_idx, field) in array.fields().iter().enumerate() {
|
209
|
+
let column = array.column(col_idx);
|
210
|
+
let value = arrow_to_parquet_value(column, index)?;
|
211
|
+
map.insert(Arc::from(field.name().as_str()), value);
|
212
|
+
}
|
213
|
+
|
214
|
+
Ok(ParquetValue::Record(map))
|
215
|
+
}
|
216
|
+
|
217
|
+
dt => Err(ParquetError::Conversion(format!(
|
218
|
+
"Unsupported data type for conversion: {:?}",
|
219
|
+
dt
|
220
|
+
))),
|
221
|
+
}
|
222
|
+
}
|
223
|
+
|
224
|
+
/// Convert a vector of ParquetValues to an Arrow array
|
225
|
+
pub fn parquet_values_to_arrow_array(values: Vec<ParquetValue>, field: &Field) -> Result<ArrayRef> {
|
226
|
+
match field.data_type() {
|
227
|
+
// Boolean
|
228
|
+
DataType::Boolean => {
|
229
|
+
let mut builder = BooleanBuilder::with_capacity(values.len());
|
230
|
+
for value in values {
|
231
|
+
match value {
|
232
|
+
ParquetValue::Boolean(b) => builder.append_value(b),
|
233
|
+
ParquetValue::Null => builder.append_null(),
|
234
|
+
_ => {
|
235
|
+
return Err(ParquetError::Conversion(format!(
|
236
|
+
"Expected Boolean, got {:?}",
|
237
|
+
value.type_name()
|
238
|
+
)))
|
239
|
+
}
|
240
|
+
}
|
241
|
+
}
|
242
|
+
Ok(Arc::new(builder.finish()))
|
243
|
+
}
|
244
|
+
|
245
|
+
// Integer types with automatic upcasting
|
246
|
+
DataType::Int8 => build_int8_array(values),
|
247
|
+
DataType::Int16 => build_int16_array(values),
|
248
|
+
DataType::Int32 => build_int32_array(values),
|
249
|
+
DataType::Int64 => build_int64_array(values),
|
250
|
+
DataType::UInt8 => build_uint8_array(values),
|
251
|
+
DataType::UInt16 => build_uint16_array(values),
|
252
|
+
DataType::UInt32 => build_uint32_array(values),
|
253
|
+
DataType::UInt64 => build_uint64_array(values),
|
254
|
+
|
255
|
+
// Float types
|
256
|
+
DataType::Float32 => build_float32_array(values),
|
257
|
+
DataType::Float64 => build_float64_array(values),
|
258
|
+
|
259
|
+
// String and binary
|
260
|
+
DataType::Utf8 => build_string_array(values),
|
261
|
+
DataType::Binary => build_binary_array(values),
|
262
|
+
DataType::FixedSizeBinary(size) => build_fixed_binary_array(values, *size),
|
263
|
+
|
264
|
+
// Date and time
|
265
|
+
DataType::Date32 => build_date32_array(values),
|
266
|
+
DataType::Date64 => build_date64_array(values),
|
267
|
+
DataType::Time32(unit) => build_time32_array(values, unit),
|
268
|
+
DataType::Time64(unit) => build_time64_array(values, unit),
|
269
|
+
|
270
|
+
// Timestamp
|
271
|
+
DataType::Timestamp(unit, tz) => build_timestamp_array(values, unit, tz.as_deref()),
|
272
|
+
|
273
|
+
// Decimal
|
274
|
+
DataType::Decimal128(precision, scale) => {
|
275
|
+
build_decimal128_array(values, *precision, *scale)
|
276
|
+
}
|
277
|
+
DataType::Decimal256(precision, scale) => {
|
278
|
+
build_decimal256_array(values, *precision, *scale)
|
279
|
+
}
|
280
|
+
|
281
|
+
// Complex types
|
282
|
+
DataType::List(item_field) => build_list_array(values, item_field),
|
283
|
+
DataType::Map(entries_field, sorted) => build_map_array(values, entries_field, *sorted),
|
284
|
+
DataType::Struct(fields) => build_struct_array(values, fields),
|
285
|
+
|
286
|
+
dt => Err(ParquetError::Conversion(format!(
|
287
|
+
"Unsupported data type for conversion: {:?}",
|
288
|
+
dt
|
289
|
+
))),
|
290
|
+
}
|
291
|
+
}
|
292
|
+
|
293
|
+
/// Helper function to downcast an array with better error messages
|
294
|
+
fn downcast_array<T: 'static>(array: &dyn Array) -> Result<&T> {
|
295
|
+
array.as_any().downcast_ref::<T>().ok_or_else(|| {
|
296
|
+
ParquetError::Conversion(format!("Failed to cast to {}", std::any::type_name::<T>()))
|
297
|
+
})
|
298
|
+
}
|
299
|
+
|
300
|
+
/// Build Int8 array
|
301
|
+
fn build_int8_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
302
|
+
let mut builder = Int8Builder::with_capacity(values.len());
|
303
|
+
for value in values {
|
304
|
+
match value {
|
305
|
+
ParquetValue::Int8(i) => builder.append_value(i),
|
306
|
+
ParquetValue::Null => builder.append_null(),
|
307
|
+
_ => {
|
308
|
+
return Err(ParquetError::Conversion(format!(
|
309
|
+
"Expected Int8, got {:?}",
|
310
|
+
value.type_name()
|
311
|
+
)))
|
312
|
+
}
|
313
|
+
}
|
314
|
+
}
|
315
|
+
Ok(Arc::new(builder.finish()))
|
316
|
+
}
|
317
|
+
|
318
|
+
/// Build Int16 array
|
319
|
+
fn build_int16_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
320
|
+
let mut builder = Int16Builder::with_capacity(values.len());
|
321
|
+
for value in values {
|
322
|
+
match value {
|
323
|
+
ParquetValue::Int16(i) => builder.append_value(i),
|
324
|
+
ParquetValue::Int8(i) => builder.append_value(i as i16),
|
325
|
+
ParquetValue::Null => builder.append_null(),
|
326
|
+
_ => {
|
327
|
+
return Err(ParquetError::Conversion(format!(
|
328
|
+
"Expected Int16, got {:?}",
|
329
|
+
value.type_name()
|
330
|
+
)))
|
331
|
+
}
|
332
|
+
}
|
333
|
+
}
|
334
|
+
Ok(Arc::new(builder.finish()))
|
335
|
+
}
|
336
|
+
|
337
|
+
/// Build Int32 array
|
338
|
+
fn build_int32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
339
|
+
let mut builder = Int32Builder::with_capacity(values.len());
|
340
|
+
for value in values {
|
341
|
+
match value {
|
342
|
+
ParquetValue::Int32(i) => builder.append_value(i),
|
343
|
+
ParquetValue::Int16(i) => builder.append_value(i as i32),
|
344
|
+
ParquetValue::Int8(i) => builder.append_value(i as i32),
|
345
|
+
ParquetValue::Null => builder.append_null(),
|
346
|
+
_ => {
|
347
|
+
return Err(ParquetError::Conversion(format!(
|
348
|
+
"Expected Int32, got {:?}",
|
349
|
+
value.type_name()
|
350
|
+
)))
|
351
|
+
}
|
352
|
+
}
|
353
|
+
}
|
354
|
+
Ok(Arc::new(builder.finish()))
|
355
|
+
}
|
356
|
+
|
357
|
+
/// Build Int64 array
|
358
|
+
fn build_int64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
359
|
+
let mut builder = Int64Builder::with_capacity(values.len());
|
360
|
+
for value in values {
|
361
|
+
match value {
|
362
|
+
ParquetValue::Int64(i) => builder.append_value(i),
|
363
|
+
ParquetValue::Int32(i) => builder.append_value(i as i64),
|
364
|
+
ParquetValue::Int16(i) => builder.append_value(i as i64),
|
365
|
+
ParquetValue::Int8(i) => builder.append_value(i as i64),
|
366
|
+
ParquetValue::Null => builder.append_null(),
|
367
|
+
_ => {
|
368
|
+
return Err(ParquetError::Conversion(format!(
|
369
|
+
"Expected Int64, got {:?}",
|
370
|
+
value.type_name()
|
371
|
+
)))
|
372
|
+
}
|
373
|
+
}
|
374
|
+
}
|
375
|
+
Ok(Arc::new(builder.finish()))
|
376
|
+
}
|
377
|
+
|
378
|
+
/// Build UInt8 array
|
379
|
+
fn build_uint8_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
380
|
+
let mut builder = UInt8Builder::with_capacity(values.len());
|
381
|
+
for value in values {
|
382
|
+
match value {
|
383
|
+
ParquetValue::UInt8(i) => builder.append_value(i),
|
384
|
+
ParquetValue::Null => builder.append_null(),
|
385
|
+
_ => {
|
386
|
+
return Err(ParquetError::Conversion(format!(
|
387
|
+
"Expected UInt8, got {:?}",
|
388
|
+
value.type_name()
|
389
|
+
)))
|
390
|
+
}
|
391
|
+
}
|
392
|
+
}
|
393
|
+
Ok(Arc::new(builder.finish()))
|
394
|
+
}
|
395
|
+
|
396
|
+
/// Build UInt16 array
|
397
|
+
fn build_uint16_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
398
|
+
let mut builder = UInt16Builder::with_capacity(values.len());
|
399
|
+
for value in values {
|
400
|
+
match value {
|
401
|
+
ParquetValue::UInt16(i) => builder.append_value(i),
|
402
|
+
ParquetValue::UInt8(i) => builder.append_value(i as u16),
|
403
|
+
ParquetValue::Null => builder.append_null(),
|
404
|
+
_ => {
|
405
|
+
return Err(ParquetError::Conversion(format!(
|
406
|
+
"Expected UInt16, got {:?}",
|
407
|
+
value.type_name()
|
408
|
+
)))
|
409
|
+
}
|
410
|
+
}
|
411
|
+
}
|
412
|
+
Ok(Arc::new(builder.finish()))
|
413
|
+
}
|
414
|
+
|
415
|
+
/// Build UInt32 array
|
416
|
+
fn build_uint32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
417
|
+
let mut builder = UInt32Builder::with_capacity(values.len());
|
418
|
+
for value in values {
|
419
|
+
match value {
|
420
|
+
ParquetValue::UInt32(i) => builder.append_value(i),
|
421
|
+
ParquetValue::UInt16(i) => builder.append_value(i as u32),
|
422
|
+
ParquetValue::UInt8(i) => builder.append_value(i as u32),
|
423
|
+
ParquetValue::Null => builder.append_null(),
|
424
|
+
_ => {
|
425
|
+
return Err(ParquetError::Conversion(format!(
|
426
|
+
"Expected UInt32, got {:?}",
|
427
|
+
value.type_name()
|
428
|
+
)))
|
429
|
+
}
|
430
|
+
}
|
431
|
+
}
|
432
|
+
Ok(Arc::new(builder.finish()))
|
433
|
+
}
|
434
|
+
|
435
|
+
/// Build UInt64 array
|
436
|
+
fn build_uint64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
437
|
+
let mut builder = UInt64Builder::with_capacity(values.len());
|
438
|
+
for value in values {
|
439
|
+
match value {
|
440
|
+
ParquetValue::UInt64(i) => builder.append_value(i),
|
441
|
+
ParquetValue::UInt32(i) => builder.append_value(i as u64),
|
442
|
+
ParquetValue::UInt16(i) => builder.append_value(i as u64),
|
443
|
+
ParquetValue::UInt8(i) => builder.append_value(i as u64),
|
444
|
+
ParquetValue::Null => builder.append_null(),
|
445
|
+
_ => {
|
446
|
+
return Err(ParquetError::Conversion(format!(
|
447
|
+
"Expected UInt64, got {:?}",
|
448
|
+
value.type_name()
|
449
|
+
)))
|
450
|
+
}
|
451
|
+
}
|
452
|
+
}
|
453
|
+
Ok(Arc::new(builder.finish()))
|
454
|
+
}
|
455
|
+
|
456
|
+
/// Build Float32 array with Float16 support
|
457
|
+
fn build_float32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
458
|
+
let mut builder = Float32Builder::with_capacity(values.len());
|
459
|
+
for value in values {
|
460
|
+
match value {
|
461
|
+
ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(f),
|
462
|
+
ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(f),
|
463
|
+
ParquetValue::Null => builder.append_null(),
|
464
|
+
_ => {
|
465
|
+
return Err(ParquetError::Conversion(format!(
|
466
|
+
"Expected Float32, got {:?}",
|
467
|
+
value.type_name()
|
468
|
+
)))
|
469
|
+
}
|
470
|
+
}
|
471
|
+
}
|
472
|
+
Ok(Arc::new(builder.finish()))
|
473
|
+
}
|
474
|
+
|
475
|
+
/// Build Float64 array with Float32 and Float16 support
|
476
|
+
fn build_float64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
477
|
+
let mut builder = Float64Builder::with_capacity(values.len());
|
478
|
+
for value in values {
|
479
|
+
match value {
|
480
|
+
ParquetValue::Float64(OrderedFloat(f)) => builder.append_value(f),
|
481
|
+
ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(f as f64),
|
482
|
+
ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(f as f64),
|
483
|
+
ParquetValue::Null => builder.append_null(),
|
484
|
+
_ => {
|
485
|
+
return Err(ParquetError::Conversion(format!(
|
486
|
+
"Expected Float64, got {:?}",
|
487
|
+
value.type_name()
|
488
|
+
)))
|
489
|
+
}
|
490
|
+
}
|
491
|
+
}
|
492
|
+
Ok(Arc::new(builder.finish()))
|
493
|
+
}
|
494
|
+
|
495
|
+
/// Build string array
|
496
|
+
fn build_string_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
497
|
+
let mut builder = StringBuilder::with_capacity(values.len(), 0);
|
498
|
+
for value in values {
|
499
|
+
match value {
|
500
|
+
ParquetValue::String(s) => builder.append_value(&s),
|
501
|
+
ParquetValue::Null => builder.append_null(),
|
502
|
+
_ => {
|
503
|
+
return Err(ParquetError::Conversion(format!(
|
504
|
+
"Expected String, got {:?}",
|
505
|
+
value.type_name()
|
506
|
+
)))
|
507
|
+
}
|
508
|
+
}
|
509
|
+
}
|
510
|
+
Ok(Arc::new(builder.finish()))
|
511
|
+
}
|
512
|
+
|
513
|
+
/// Build binary array
|
514
|
+
fn build_binary_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
515
|
+
let mut builder = BinaryBuilder::with_capacity(values.len(), 0);
|
516
|
+
for value in values {
|
517
|
+
match value {
|
518
|
+
ParquetValue::Bytes(b) => builder.append_value(&b),
|
519
|
+
ParquetValue::Null => builder.append_null(),
|
520
|
+
_ => {
|
521
|
+
return Err(ParquetError::Conversion(format!(
|
522
|
+
"Expected Bytes, got {:?}",
|
523
|
+
value.type_name()
|
524
|
+
)))
|
525
|
+
}
|
526
|
+
}
|
527
|
+
}
|
528
|
+
Ok(Arc::new(builder.finish()))
|
529
|
+
}
|
530
|
+
|
531
|
+
/// Build fixed size binary array
|
532
|
+
fn build_fixed_binary_array(values: Vec<ParquetValue>, size: i32) -> Result<ArrayRef> {
|
533
|
+
let mut builder = FixedSizeBinaryBuilder::with_capacity(values.len(), size);
|
534
|
+
for value in values {
|
535
|
+
match value {
|
536
|
+
ParquetValue::Bytes(b) => {
|
537
|
+
if b.len() != size as usize {
|
538
|
+
return Err(ParquetError::Conversion(format!(
|
539
|
+
"Fixed size binary expected {} bytes, got {}",
|
540
|
+
size,
|
541
|
+
b.len()
|
542
|
+
)));
|
543
|
+
}
|
544
|
+
builder.append_value(&b)?;
|
545
|
+
}
|
546
|
+
ParquetValue::Null => builder.append_null(),
|
547
|
+
_ => {
|
548
|
+
return Err(ParquetError::Conversion(format!(
|
549
|
+
"Expected Bytes, got {:?}",
|
550
|
+
value.type_name()
|
551
|
+
)))
|
552
|
+
}
|
553
|
+
}
|
554
|
+
}
|
555
|
+
Ok(Arc::new(builder.finish()))
|
556
|
+
}
|
557
|
+
|
558
|
+
/// Build Date32 array
|
559
|
+
fn build_date32_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
560
|
+
let mut builder = Date32Builder::with_capacity(values.len());
|
561
|
+
for value in values {
|
562
|
+
match value {
|
563
|
+
ParquetValue::Date32(d) => builder.append_value(d),
|
564
|
+
ParquetValue::Null => builder.append_null(),
|
565
|
+
_ => {
|
566
|
+
return Err(ParquetError::Conversion(format!(
|
567
|
+
"Expected Date32, got {:?}",
|
568
|
+
value.type_name()
|
569
|
+
)))
|
570
|
+
}
|
571
|
+
}
|
572
|
+
}
|
573
|
+
Ok(Arc::new(builder.finish()))
|
574
|
+
}
|
575
|
+
|
576
|
+
/// Build Date64 array
|
577
|
+
fn build_date64_array(values: Vec<ParquetValue>) -> Result<ArrayRef> {
|
578
|
+
let mut builder = Date64Builder::with_capacity(values.len());
|
579
|
+
for value in values {
|
580
|
+
match value {
|
581
|
+
ParquetValue::Date64(d) => builder.append_value(d),
|
582
|
+
ParquetValue::Null => builder.append_null(),
|
583
|
+
_ => {
|
584
|
+
return Err(ParquetError::Conversion(format!(
|
585
|
+
"Expected Date64, got {:?}",
|
586
|
+
value.type_name()
|
587
|
+
)))
|
588
|
+
}
|
589
|
+
}
|
590
|
+
}
|
591
|
+
Ok(Arc::new(builder.finish()))
|
592
|
+
}
|
593
|
+
|
594
|
+
/// Build Time32 array
|
595
|
+
fn build_time32_array(
|
596
|
+
values: Vec<ParquetValue>,
|
597
|
+
unit: &arrow_schema::TimeUnit,
|
598
|
+
) -> Result<ArrayRef> {
|
599
|
+
match unit {
|
600
|
+
arrow_schema::TimeUnit::Millisecond => {
|
601
|
+
let mut builder = Time32MillisecondBuilder::with_capacity(values.len());
|
602
|
+
for value in values {
|
603
|
+
match value {
|
604
|
+
ParquetValue::TimeMillis(t) => builder.append_value(t),
|
605
|
+
ParquetValue::Null => builder.append_null(),
|
606
|
+
_ => {
|
607
|
+
return Err(ParquetError::Conversion(format!(
|
608
|
+
"Expected TimeMillis, got {:?}",
|
609
|
+
value.type_name()
|
610
|
+
)))
|
611
|
+
}
|
612
|
+
}
|
613
|
+
}
|
614
|
+
Ok(Arc::new(builder.finish()))
|
615
|
+
}
|
616
|
+
_ => Err(ParquetError::Conversion(format!(
|
617
|
+
"Unsupported time32 unit: {:?}",
|
618
|
+
unit
|
619
|
+
))),
|
620
|
+
}
|
621
|
+
}
|
622
|
+
|
623
|
+
/// Build Time64 array
|
624
|
+
fn build_time64_array(
|
625
|
+
values: Vec<ParquetValue>,
|
626
|
+
unit: &arrow_schema::TimeUnit,
|
627
|
+
) -> Result<ArrayRef> {
|
628
|
+
match unit {
|
629
|
+
arrow_schema::TimeUnit::Microsecond => {
|
630
|
+
let mut builder = Time64MicrosecondBuilder::with_capacity(values.len());
|
631
|
+
for value in values {
|
632
|
+
match value {
|
633
|
+
ParquetValue::TimeMicros(t) => builder.append_value(t),
|
634
|
+
ParquetValue::Null => builder.append_null(),
|
635
|
+
_ => {
|
636
|
+
return Err(ParquetError::Conversion(format!(
|
637
|
+
"Expected TimeMicros, got {:?}",
|
638
|
+
value.type_name()
|
639
|
+
)))
|
640
|
+
}
|
641
|
+
}
|
642
|
+
}
|
643
|
+
Ok(Arc::new(builder.finish()))
|
644
|
+
}
|
645
|
+
_ => Err(ParquetError::Conversion(format!(
|
646
|
+
"Unsupported time64 unit: {:?}",
|
647
|
+
unit
|
648
|
+
))),
|
649
|
+
}
|
650
|
+
}
|
651
|
+
|
652
|
+
/// Build timestamp array
|
653
|
+
fn build_timestamp_array(
|
654
|
+
values: Vec<ParquetValue>,
|
655
|
+
unit: &arrow_schema::TimeUnit,
|
656
|
+
timezone: Option<&str>,
|
657
|
+
) -> Result<ArrayRef> {
|
658
|
+
// First, check if all values have the same timezone (or use the field timezone)
|
659
|
+
let mut common_tz: Option<Option<Arc<str>>> = None;
|
660
|
+
for value in &values {
|
661
|
+
match value {
|
662
|
+
ParquetValue::TimestampSecond(_, tz)
|
663
|
+
| ParquetValue::TimestampMillis(_, tz)
|
664
|
+
| ParquetValue::TimestampMicros(_, tz)
|
665
|
+
| ParquetValue::TimestampNanos(_, tz) => {
|
666
|
+
match &common_tz {
|
667
|
+
None => common_tz = Some(tz.clone()),
|
668
|
+
Some(existing) => {
|
669
|
+
// If we have mixed timezones, we'll use the field timezone
|
670
|
+
if existing != tz {
|
671
|
+
common_tz = Some(timezone.map(Arc::from));
|
672
|
+
break;
|
673
|
+
}
|
674
|
+
}
|
675
|
+
}
|
676
|
+
}
|
677
|
+
ParquetValue::Null => {}
|
678
|
+
_ => {}
|
679
|
+
}
|
680
|
+
}
|
681
|
+
|
682
|
+
// Use the common timezone from values, or fall back to field timezone
|
683
|
+
let tz = common_tz.unwrap_or_else(|| timezone.map(Arc::from));
|
684
|
+
|
685
|
+
match unit {
|
686
|
+
arrow_schema::TimeUnit::Second => {
|
687
|
+
let mut builder =
|
688
|
+
TimestampSecondBuilder::with_capacity(values.len()).with_timezone_opt(tz.clone());
|
689
|
+
for value in values {
|
690
|
+
match value {
|
691
|
+
ParquetValue::TimestampSecond(t, _) => builder.append_value(t),
|
692
|
+
ParquetValue::Null => builder.append_null(),
|
693
|
+
_ => {
|
694
|
+
return Err(ParquetError::Conversion(format!(
|
695
|
+
"Expected TimestampSecond, got {:?}",
|
696
|
+
value.type_name()
|
697
|
+
)))
|
698
|
+
}
|
699
|
+
}
|
700
|
+
}
|
701
|
+
Ok(Arc::new(builder.finish()))
|
702
|
+
}
|
703
|
+
arrow_schema::TimeUnit::Millisecond => {
|
704
|
+
let mut builder = TimestampMillisecondBuilder::with_capacity(values.len())
|
705
|
+
.with_timezone_opt(tz.clone());
|
706
|
+
for value in values {
|
707
|
+
match value {
|
708
|
+
ParquetValue::TimestampMillis(t, _) => builder.append_value(t),
|
709
|
+
ParquetValue::Null => builder.append_null(),
|
710
|
+
_ => {
|
711
|
+
return Err(ParquetError::Conversion(format!(
|
712
|
+
"Expected TimestampMillis, got {:?}",
|
713
|
+
value.type_name()
|
714
|
+
)))
|
715
|
+
}
|
716
|
+
}
|
717
|
+
}
|
718
|
+
Ok(Arc::new(builder.finish()))
|
719
|
+
}
|
720
|
+
arrow_schema::TimeUnit::Microsecond => {
|
721
|
+
let mut builder = TimestampMicrosecondBuilder::with_capacity(values.len())
|
722
|
+
.with_timezone_opt(tz.clone());
|
723
|
+
for value in values {
|
724
|
+
match value {
|
725
|
+
ParquetValue::TimestampMicros(t, _) => builder.append_value(t),
|
726
|
+
ParquetValue::Null => builder.append_null(),
|
727
|
+
_ => {
|
728
|
+
return Err(ParquetError::Conversion(format!(
|
729
|
+
"Expected TimestampMicros, got {:?}",
|
730
|
+
value.type_name()
|
731
|
+
)))
|
732
|
+
}
|
733
|
+
}
|
734
|
+
}
|
735
|
+
Ok(Arc::new(builder.finish()))
|
736
|
+
}
|
737
|
+
arrow_schema::TimeUnit::Nanosecond => {
|
738
|
+
let mut builder = TimestampNanosecondBuilder::with_capacity(values.len())
|
739
|
+
.with_timezone_opt(tz.clone());
|
740
|
+
for value in values {
|
741
|
+
match value {
|
742
|
+
ParquetValue::TimestampNanos(t, _) => builder.append_value(t),
|
743
|
+
ParquetValue::Null => builder.append_null(),
|
744
|
+
_ => {
|
745
|
+
return Err(ParquetError::Conversion(format!(
|
746
|
+
"Expected TimestampNanos, got {:?}",
|
747
|
+
value.type_name()
|
748
|
+
)))
|
749
|
+
}
|
750
|
+
}
|
751
|
+
}
|
752
|
+
Ok(Arc::new(builder.finish()))
|
753
|
+
}
|
754
|
+
}
|
755
|
+
}
|
756
|
+
|
757
|
+
/// Build Decimal128 array
|
758
|
+
fn build_decimal128_array(values: Vec<ParquetValue>, precision: u8, scale: i8) -> Result<ArrayRef> {
|
759
|
+
let mut builder = Decimal128Builder::with_capacity(values.len())
|
760
|
+
.with_precision_and_scale(precision, scale)?;
|
761
|
+
for value in values {
|
762
|
+
match value {
|
763
|
+
ParquetValue::Decimal128(d, _) => builder.append_value(d),
|
764
|
+
ParquetValue::Null => builder.append_null(),
|
765
|
+
_ => {
|
766
|
+
return Err(ParquetError::Conversion(format!(
|
767
|
+
"Expected Decimal128, got {:?}",
|
768
|
+
value.type_name()
|
769
|
+
)))
|
770
|
+
}
|
771
|
+
}
|
772
|
+
}
|
773
|
+
Ok(Arc::new(builder.finish()))
|
774
|
+
}
|
775
|
+
|
776
|
+
/// Build Decimal256 array
|
777
|
+
fn build_decimal256_array(values: Vec<ParquetValue>, precision: u8, scale: i8) -> Result<ArrayRef> {
|
778
|
+
let mut builder = Decimal256Builder::with_capacity(values.len())
|
779
|
+
.with_precision_and_scale(precision, scale)?;
|
780
|
+
for value in values {
|
781
|
+
match value {
|
782
|
+
ParquetValue::Decimal256(bigint, _) => {
|
783
|
+
let bytes = decimal256_from_bigint(&bigint)?;
|
784
|
+
builder.append_value(bytes);
|
785
|
+
}
|
786
|
+
ParquetValue::Null => builder.append_null(),
|
787
|
+
_ => {
|
788
|
+
return Err(ParquetError::Conversion(format!(
|
789
|
+
"Expected Decimal256, got {:?}",
|
790
|
+
value.type_name()
|
791
|
+
)))
|
792
|
+
}
|
793
|
+
}
|
794
|
+
}
|
795
|
+
Ok(Arc::new(builder.finish()))
|
796
|
+
}
|
797
|
+
|
798
|
+
/// Convert BigInt to i256 (32-byte array)
|
799
|
+
fn decimal256_from_bigint(bigint: &num::BigInt) -> Result<arrow_buffer::i256> {
|
800
|
+
// Get bytes in little-endian format
|
801
|
+
let (sign, mut bytes) = bigint.to_bytes_le();
|
802
|
+
|
803
|
+
// Ensure we have exactly 32 bytes
|
804
|
+
if bytes.len() > 32 {
|
805
|
+
return Err(ParquetError::Conversion(
|
806
|
+
"Decimal256 value too large".to_string(),
|
807
|
+
));
|
808
|
+
}
|
809
|
+
|
810
|
+
// Pad with zeros or ones (for negative numbers) to reach 32 bytes
|
811
|
+
bytes.resize(32, 0);
|
812
|
+
|
813
|
+
// If negative, convert to two's complement
|
814
|
+
if sign == num::bigint::Sign::Minus {
|
815
|
+
// Invert all bits
|
816
|
+
for byte in &mut bytes {
|
817
|
+
*byte = !*byte;
|
818
|
+
}
|
819
|
+
// Add 1
|
820
|
+
let mut carry = true;
|
821
|
+
for byte in &mut bytes {
|
822
|
+
if carry {
|
823
|
+
let (new_byte, new_carry) = byte.overflowing_add(1);
|
824
|
+
*byte = new_byte;
|
825
|
+
carry = new_carry;
|
826
|
+
} else {
|
827
|
+
break;
|
828
|
+
}
|
829
|
+
}
|
830
|
+
}
|
831
|
+
|
832
|
+
let byte_array: [u8; 32] = bytes
|
833
|
+
.try_into()
|
834
|
+
.map_err(|_| ParquetError::Conversion("Failed to convert bytes to i256".to_string()))?;
|
835
|
+
Ok(arrow_buffer::i256::from_le_bytes(byte_array))
|
836
|
+
}
|
837
|
+
|
838
|
+
/// Build list array
|
839
|
+
fn build_list_array(values: Vec<ParquetValue>, item_field: &Arc<Field>) -> Result<ArrayRef> {
|
840
|
+
let mut all_items = Vec::new();
|
841
|
+
let mut offsets = Vec::with_capacity(values.len() + 1);
|
842
|
+
let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
|
843
|
+
offsets.push(0i32);
|
844
|
+
|
845
|
+
for value in values {
|
846
|
+
match value {
|
847
|
+
ParquetValue::List(items) => {
|
848
|
+
all_items.extend(items);
|
849
|
+
offsets.push(all_items.len() as i32);
|
850
|
+
null_buffer_builder.append(true);
|
851
|
+
}
|
852
|
+
ParquetValue::Null => {
|
853
|
+
offsets.push(all_items.len() as i32);
|
854
|
+
null_buffer_builder.append(false);
|
855
|
+
}
|
856
|
+
_ => {
|
857
|
+
return Err(ParquetError::Conversion(format!(
|
858
|
+
"Expected List, got {:?}",
|
859
|
+
value.type_name()
|
860
|
+
)))
|
861
|
+
}
|
862
|
+
}
|
863
|
+
}
|
864
|
+
|
865
|
+
let item_array = parquet_values_to_arrow_array(all_items, item_field)?;
|
866
|
+
let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
|
867
|
+
let null_buffer = null_buffer_builder.finish();
|
868
|
+
|
869
|
+
Ok(Arc::new(ListArray::new(
|
870
|
+
item_field.clone(),
|
871
|
+
offset_buffer,
|
872
|
+
item_array,
|
873
|
+
Some(null_buffer.into()),
|
874
|
+
)))
|
875
|
+
}
|
876
|
+
|
877
|
+
/// Build map array
|
878
|
+
fn build_map_array(
|
879
|
+
values: Vec<ParquetValue>,
|
880
|
+
entries_field: &Arc<Field>,
|
881
|
+
_sorted: bool,
|
882
|
+
) -> Result<ArrayRef> {
|
883
|
+
// Extract the key and value fields from the entries struct
|
884
|
+
let (key_field, value_field) = match entries_field.data_type() {
|
885
|
+
DataType::Struct(fields) if fields.len() == 2 => (&fields[0], &fields[1]),
|
886
|
+
_ => {
|
887
|
+
return Err(ParquetError::Conversion(
|
888
|
+
"Map entries field must be a struct with exactly 2 fields".to_string(),
|
889
|
+
))
|
890
|
+
}
|
891
|
+
};
|
892
|
+
|
893
|
+
let mut all_keys = Vec::new();
|
894
|
+
let mut all_values = Vec::new();
|
895
|
+
let mut offsets = Vec::with_capacity(values.len() + 1);
|
896
|
+
let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
|
897
|
+
offsets.push(0i32);
|
898
|
+
|
899
|
+
for value in values {
|
900
|
+
match value {
|
901
|
+
ParquetValue::Map(entries) => {
|
902
|
+
for (k, v) in entries {
|
903
|
+
all_keys.push(k);
|
904
|
+
all_values.push(v);
|
905
|
+
}
|
906
|
+
offsets.push(all_keys.len() as i32);
|
907
|
+
null_buffer_builder.append(true);
|
908
|
+
}
|
909
|
+
ParquetValue::Null => {
|
910
|
+
offsets.push(all_keys.len() as i32);
|
911
|
+
null_buffer_builder.append(false);
|
912
|
+
}
|
913
|
+
_ => {
|
914
|
+
return Err(ParquetError::Conversion(format!(
|
915
|
+
"Expected Map, got {:?}",
|
916
|
+
value.type_name()
|
917
|
+
)))
|
918
|
+
}
|
919
|
+
}
|
920
|
+
}
|
921
|
+
|
922
|
+
let key_array = parquet_values_to_arrow_array(all_keys, key_field)?;
|
923
|
+
let value_array = parquet_values_to_arrow_array(all_values, value_field)?;
|
924
|
+
|
925
|
+
// Create struct array for entries
|
926
|
+
let struct_fields = match entries_field.data_type() {
|
927
|
+
DataType::Struct(fields) => fields.clone(),
|
928
|
+
_ => unreachable!("Map entries field must be a struct"),
|
929
|
+
};
|
930
|
+
|
931
|
+
let struct_array = StructArray::new(struct_fields, vec![key_array, value_array], None);
|
932
|
+
|
933
|
+
let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
|
934
|
+
let null_buffer = null_buffer_builder.finish();
|
935
|
+
|
936
|
+
Ok(Arc::new(MapArray::new(
|
937
|
+
entries_field.clone(),
|
938
|
+
offset_buffer,
|
939
|
+
struct_array,
|
940
|
+
Some(null_buffer.into()),
|
941
|
+
false, // sorted
|
942
|
+
)))
|
943
|
+
}
|
944
|
+
|
945
|
+
/// Build struct array
|
946
|
+
fn build_struct_array(
|
947
|
+
values: Vec<ParquetValue>,
|
948
|
+
fields: &arrow_schema::Fields,
|
949
|
+
) -> Result<ArrayRef> {
|
950
|
+
let num_rows = values.len();
|
951
|
+
let mut field_arrays = Vec::with_capacity(fields.len());
|
952
|
+
let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(num_rows);
|
953
|
+
|
954
|
+
// Prepare columns for each field
|
955
|
+
let mut field_columns: Vec<Vec<ParquetValue>> =
|
956
|
+
vec![Vec::with_capacity(num_rows); fields.len()];
|
957
|
+
|
958
|
+
for value in values {
|
959
|
+
match value {
|
960
|
+
ParquetValue::Record(map) => {
|
961
|
+
null_buffer_builder.append(true);
|
962
|
+
for (idx, field) in fields.iter().enumerate() {
|
963
|
+
let field_value = map
|
964
|
+
.get(field.name().as_str())
|
965
|
+
.cloned()
|
966
|
+
.unwrap_or(ParquetValue::Null);
|
967
|
+
field_columns[idx].push(field_value);
|
968
|
+
}
|
969
|
+
}
|
970
|
+
ParquetValue::Null => {
|
971
|
+
null_buffer_builder.append(false);
|
972
|
+
for field_column in field_columns.iter_mut().take(fields.len()) {
|
973
|
+
field_column.push(ParquetValue::Null);
|
974
|
+
}
|
975
|
+
}
|
976
|
+
_ => {
|
977
|
+
return Err(ParquetError::Conversion(format!(
|
978
|
+
"Expected Record, got {:?}",
|
979
|
+
value.type_name()
|
980
|
+
)))
|
981
|
+
}
|
982
|
+
}
|
983
|
+
}
|
984
|
+
|
985
|
+
// Build arrays for each field
|
986
|
+
for (column, field) in field_columns.into_iter().zip(fields.iter()) {
|
987
|
+
let array = parquet_values_to_arrow_array(column, field)?;
|
988
|
+
field_arrays.push(array);
|
989
|
+
}
|
990
|
+
|
991
|
+
let null_buffer = null_buffer_builder.finish();
|
992
|
+
Ok(Arc::new(StructArray::new(
|
993
|
+
fields.clone(),
|
994
|
+
field_arrays,
|
995
|
+
Some(null_buffer.into()),
|
996
|
+
)))
|
997
|
+
}
|
998
|
+
|
999
|
+
/// Append a single ParquetValue to an ArrayBuilder
|
1000
|
+
/// This is used for incremental building in complex scenarios
|
1001
|
+
pub fn append_parquet_value_to_builder(
|
1002
|
+
builder: &mut dyn ArrayBuilder,
|
1003
|
+
value: ParquetValue,
|
1004
|
+
data_type: &DataType,
|
1005
|
+
) -> Result<()> {
|
1006
|
+
match data_type {
|
1007
|
+
DataType::Boolean => match value {
|
1008
|
+
ParquetValue::Boolean(b) => {
|
1009
|
+
let boolean_builder = builder
|
1010
|
+
.as_any_mut()
|
1011
|
+
.downcast_mut::<BooleanBuilder>()
|
1012
|
+
.ok_or_else(|| {
|
1013
|
+
ParquetError::Conversion("Failed to downcast to BooleanBuilder".to_string())
|
1014
|
+
})?;
|
1015
|
+
boolean_builder.append_value(b);
|
1016
|
+
}
|
1017
|
+
ParquetValue::Null => {
|
1018
|
+
let boolean_builder = builder
|
1019
|
+
.as_any_mut()
|
1020
|
+
.downcast_mut::<BooleanBuilder>()
|
1021
|
+
.ok_or_else(|| {
|
1022
|
+
ParquetError::Conversion("Failed to downcast to BooleanBuilder".to_string())
|
1023
|
+
})?;
|
1024
|
+
boolean_builder.append_null();
|
1025
|
+
}
|
1026
|
+
_ => {
|
1027
|
+
return Err(ParquetError::Conversion(format!(
|
1028
|
+
"Expected Boolean, got {:?}",
|
1029
|
+
value.type_name()
|
1030
|
+
)))
|
1031
|
+
}
|
1032
|
+
},
|
1033
|
+
|
1034
|
+
// For complex types like Map and Struct, we need special handling
|
1035
|
+
DataType::Map(entries_field, _) => match value {
|
1036
|
+
ParquetValue::Map(entries) => {
|
1037
|
+
let map_builder = builder
|
1038
|
+
.as_any_mut()
|
1039
|
+
.downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
|
1040
|
+
.ok_or_else(|| {
|
1041
|
+
ParquetError::Conversion("Failed to downcast to MapBuilder".to_string())
|
1042
|
+
})?;
|
1043
|
+
|
1044
|
+
if let DataType::Struct(fields) = entries_field.data_type() {
|
1045
|
+
if fields.len() != 2 {
|
1046
|
+
return Err(ParquetError::Conversion(
|
1047
|
+
"Map entries struct must have exactly 2 fields".to_string(),
|
1048
|
+
));
|
1049
|
+
}
|
1050
|
+
|
1051
|
+
let key_type = fields[0].data_type();
|
1052
|
+
let value_type = fields[1].data_type();
|
1053
|
+
|
1054
|
+
for (key, val) in entries {
|
1055
|
+
append_parquet_value_to_builder(map_builder.keys(), key, key_type)?;
|
1056
|
+
append_parquet_value_to_builder(map_builder.values(), val, value_type)?;
|
1057
|
+
}
|
1058
|
+
map_builder.append(true)?;
|
1059
|
+
} else {
|
1060
|
+
return Err(ParquetError::Conversion(
|
1061
|
+
"Map entries field must be a struct".to_string(),
|
1062
|
+
));
|
1063
|
+
}
|
1064
|
+
}
|
1065
|
+
ParquetValue::Null => {
|
1066
|
+
let map_builder = builder
|
1067
|
+
.as_any_mut()
|
1068
|
+
.downcast_mut::<MapBuilder<Box<dyn ArrayBuilder>, Box<dyn ArrayBuilder>>>()
|
1069
|
+
.ok_or_else(|| {
|
1070
|
+
ParquetError::Conversion("Failed to downcast to MapBuilder".to_string())
|
1071
|
+
})?;
|
1072
|
+
map_builder.append(false)?;
|
1073
|
+
}
|
1074
|
+
_ => {
|
1075
|
+
return Err(ParquetError::Conversion(format!(
|
1076
|
+
"Expected Map, got {:?}",
|
1077
|
+
value.type_name()
|
1078
|
+
)))
|
1079
|
+
}
|
1080
|
+
},
|
1081
|
+
|
1082
|
+
// For other types, use the existing pattern
|
1083
|
+
_ => {
|
1084
|
+
return Err(ParquetError::Conversion(format!(
|
1085
|
+
"append_parquet_value_to_builder not implemented for type: {:?}",
|
1086
|
+
data_type
|
1087
|
+
)))
|
1088
|
+
}
|
1089
|
+
}
|
1090
|
+
|
1091
|
+
Ok(())
|
1092
|
+
}
|
1093
|
+
|
1094
|
+
#[cfg(test)]
mod tests {
    // In-module smoke tests for the ParquetValue <-> Arrow conversion
    // helpers; broader coverage lives in the crate's integration tests.
    use super::*;
    use arrow_array::*;

    // Round-trips booleans (including a null) through
    // parquet_values_to_arrow_array and back via arrow_to_parquet_value,
    // asserting each row converts back to the exact original value.
    #[test]
    fn test_primitive_conversion_roundtrip() {
        // Test boolean
        let values = vec![
            ParquetValue::Boolean(true),
            ParquetValue::Boolean(false),
            ParquetValue::Null,
        ];
        // Nullable field so the ParquetValue::Null row is representable.
        let field = Field::new("test", DataType::Boolean, true);
        let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();

        // Each index must round-trip to the value it was built from.
        for (i, expected) in values.iter().enumerate() {
            let actual = arrow_to_parquet_value(array.as_ref(), i).unwrap();
            assert_eq!(&actual, expected);
        }
    }

    // Verifies that Int8/Int16/Int32 inputs are widened losslessly when the
    // target field type is Int64.
    #[test]
    fn test_integer_upcasting() {
        // Test that smaller integers can be upcast to larger ones
        let values = vec![
            ParquetValue::Int8(42),
            ParquetValue::Int16(1000),
            ParquetValue::Int32(100000),
        ];
        let field = Field::new("test", DataType::Int64, false);
        let array = parquet_values_to_arrow_array(values, &field).unwrap();

        assert_eq!(array.len(), 3);
        // The result must be a concrete Int64Array holding the widened values.
        let int64_array = array.as_any().downcast_ref::<Int64Array>().unwrap();
        assert_eq!(int64_array.value(0), 42);
        assert_eq!(int64_array.value(1), 1000);
        assert_eq!(int64_array.value(2), 100000);
    }
}
|