parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,1243 @@
|
|
|
1
|
+
//! Bidirectional conversion between Arrow arrays and ParquetValue
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a unified interface for converting between Arrow's
|
|
4
|
+
//! columnar format and Parquet's value representation. It consolidates
|
|
5
|
+
//! the conversion logic that was previously duplicated between the reader
|
|
6
|
+
//! and writer modules.
|
|
7
|
+
|
|
8
|
+
use crate::{ParquetError, ParquetValue, Result};
|
|
9
|
+
use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
|
|
10
|
+
use arrow_schema::extension::Uuid as ArrowUuid;
|
|
11
|
+
use arrow_schema::{DataType, Field};
|
|
12
|
+
use bytes::Bytes;
|
|
13
|
+
use indexmap::IndexMap;
|
|
14
|
+
use ordered_float::OrderedFloat;
|
|
15
|
+
use parquet::basic::LogicalType;
|
|
16
|
+
use parquet::schema::types::Type;
|
|
17
|
+
use std::sync::Arc as StdArc;
|
|
18
|
+
use triomphe::Arc;
|
|
19
|
+
|
|
20
|
+
/// Convert a single value from an Arrow array at the given index to a ParquetValue
|
|
21
|
+
pub fn arrow_to_parquet_value(
|
|
22
|
+
arrow_field: &Field,
|
|
23
|
+
parquet_field: &Type,
|
|
24
|
+
array: &dyn Array,
|
|
25
|
+
index: usize,
|
|
26
|
+
) -> Result<ParquetValue> {
|
|
27
|
+
use arrow_array::*;
|
|
28
|
+
|
|
29
|
+
if array.is_null(index) {
|
|
30
|
+
return Ok(ParquetValue::Null);
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
match array.data_type() {
|
|
34
|
+
// Primitive types
|
|
35
|
+
DataType::Boolean => {
|
|
36
|
+
let array = downcast_array::<BooleanArray>(array)?;
|
|
37
|
+
Ok(ParquetValue::Boolean(array.value(index)))
|
|
38
|
+
}
|
|
39
|
+
DataType::Int8 => {
|
|
40
|
+
let array = downcast_array::<Int8Array>(array)?;
|
|
41
|
+
Ok(ParquetValue::Int8(array.value(index)))
|
|
42
|
+
}
|
|
43
|
+
DataType::Int16 => {
|
|
44
|
+
let array = downcast_array::<Int16Array>(array)?;
|
|
45
|
+
Ok(ParquetValue::Int16(array.value(index)))
|
|
46
|
+
}
|
|
47
|
+
DataType::Int32 => {
|
|
48
|
+
let array = downcast_array::<Int32Array>(array)?;
|
|
49
|
+
Ok(ParquetValue::Int32(array.value(index)))
|
|
50
|
+
}
|
|
51
|
+
DataType::Int64 => {
|
|
52
|
+
let array = downcast_array::<Int64Array>(array)?;
|
|
53
|
+
Ok(ParquetValue::Int64(array.value(index)))
|
|
54
|
+
}
|
|
55
|
+
DataType::UInt8 => {
|
|
56
|
+
let array = downcast_array::<UInt8Array>(array)?;
|
|
57
|
+
Ok(ParquetValue::UInt8(array.value(index)))
|
|
58
|
+
}
|
|
59
|
+
DataType::UInt16 => {
|
|
60
|
+
let array = downcast_array::<UInt16Array>(array)?;
|
|
61
|
+
Ok(ParquetValue::UInt16(array.value(index)))
|
|
62
|
+
}
|
|
63
|
+
DataType::UInt32 => {
|
|
64
|
+
let array = downcast_array::<UInt32Array>(array)?;
|
|
65
|
+
Ok(ParquetValue::UInt32(array.value(index)))
|
|
66
|
+
}
|
|
67
|
+
DataType::UInt64 => {
|
|
68
|
+
let array = downcast_array::<UInt64Array>(array)?;
|
|
69
|
+
Ok(ParquetValue::UInt64(array.value(index)))
|
|
70
|
+
}
|
|
71
|
+
DataType::Float16 => {
|
|
72
|
+
let array = downcast_array::<Float16Array>(array)?;
|
|
73
|
+
let value = array.value(index);
|
|
74
|
+
Ok(ParquetValue::Float16(OrderedFloat(value.to_f32())))
|
|
75
|
+
}
|
|
76
|
+
DataType::Float32 => {
|
|
77
|
+
let array = downcast_array::<Float32Array>(array)?;
|
|
78
|
+
Ok(ParquetValue::Float32(OrderedFloat(array.value(index))))
|
|
79
|
+
}
|
|
80
|
+
DataType::Float64 => {
|
|
81
|
+
let array = downcast_array::<Float64Array>(array)?;
|
|
82
|
+
Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
|
|
83
|
+
}
|
|
84
|
+
// String and binary types
|
|
85
|
+
DataType::Utf8 => {
|
|
86
|
+
let array = downcast_array::<StringArray>(array)?;
|
|
87
|
+
Ok(ParquetValue::String(Arc::from(array.value(index))))
|
|
88
|
+
}
|
|
89
|
+
DataType::Binary => {
|
|
90
|
+
let array = downcast_array::<BinaryArray>(array)?;
|
|
91
|
+
Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
|
|
92
|
+
array.value(index),
|
|
93
|
+
)))
|
|
94
|
+
}
|
|
95
|
+
DataType::FixedSizeBinary(_) => {
|
|
96
|
+
let array = downcast_array::<FixedSizeBinaryArray>(array)?;
|
|
97
|
+
let value = array.value(index);
|
|
98
|
+
if let Some(LogicalType::Uuid) = parquet_field.get_basic_info().logical_type_ref() {
|
|
99
|
+
let uuid = uuid::Uuid::from_slice(value)
|
|
100
|
+
.map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
|
|
101
|
+
Ok(ParquetValue::Uuid(uuid))
|
|
102
|
+
} else {
|
|
103
|
+
match arrow_field.try_extension_type::<ArrowUuid>() {
|
|
104
|
+
Ok(_) => {
|
|
105
|
+
let uuid = uuid::Uuid::from_slice(value).map_err(|e| {
|
|
106
|
+
ParquetError::Conversion(format!("Invalid UUID: {}", e))
|
|
107
|
+
})?;
|
|
108
|
+
Ok(ParquetValue::Uuid(uuid))
|
|
109
|
+
}
|
|
110
|
+
Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
|
|
111
|
+
}
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// Date and time types
|
|
116
|
+
DataType::Date32 => {
|
|
117
|
+
let array = downcast_array::<Date32Array>(array)?;
|
|
118
|
+
Ok(ParquetValue::Date32(array.value(index)))
|
|
119
|
+
}
|
|
120
|
+
DataType::Date64 => {
|
|
121
|
+
let array = downcast_array::<Date64Array>(array)?;
|
|
122
|
+
Ok(ParquetValue::Date64(array.value(index)))
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// Timestamp types
|
|
126
|
+
DataType::Timestamp(unit, timezone) => {
|
|
127
|
+
let timezone = timezone.as_ref().map(|s| Arc::from(s.as_ref()));
|
|
128
|
+
match unit {
|
|
129
|
+
arrow_schema::TimeUnit::Millisecond => {
|
|
130
|
+
let array = downcast_array::<TimestampMillisecondArray>(array)?;
|
|
131
|
+
Ok(ParquetValue::TimestampMillis(array.value(index), timezone))
|
|
132
|
+
}
|
|
133
|
+
arrow_schema::TimeUnit::Microsecond => {
|
|
134
|
+
let array = downcast_array::<TimestampMicrosecondArray>(array)?;
|
|
135
|
+
Ok(ParquetValue::TimestampMicros(array.value(index), timezone))
|
|
136
|
+
}
|
|
137
|
+
arrow_schema::TimeUnit::Second => {
|
|
138
|
+
let array = downcast_array::<TimestampSecondArray>(array)?;
|
|
139
|
+
Ok(ParquetValue::TimestampSecond(array.value(index), timezone))
|
|
140
|
+
}
|
|
141
|
+
arrow_schema::TimeUnit::Nanosecond => {
|
|
142
|
+
let array = downcast_array::<TimestampNanosecondArray>(array)?;
|
|
143
|
+
Ok(ParquetValue::TimestampNanos(array.value(index), timezone))
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
// Time types
|
|
149
|
+
DataType::Time32(unit) => match unit {
|
|
150
|
+
arrow_schema::TimeUnit::Millisecond => {
|
|
151
|
+
let array = downcast_array::<Time32MillisecondArray>(array)?;
|
|
152
|
+
Ok(ParquetValue::TimeMillis(array.value(index)))
|
|
153
|
+
}
|
|
154
|
+
_ => Err(ParquetError::Conversion(format!(
|
|
155
|
+
"Unsupported time32 unit: {:?}",
|
|
156
|
+
unit
|
|
157
|
+
))),
|
|
158
|
+
},
|
|
159
|
+
DataType::Time64(unit) => match unit {
|
|
160
|
+
arrow_schema::TimeUnit::Microsecond => {
|
|
161
|
+
let array = downcast_array::<Time64MicrosecondArray>(array)?;
|
|
162
|
+
Ok(ParquetValue::TimeMicros(array.value(index)))
|
|
163
|
+
}
|
|
164
|
+
arrow_schema::TimeUnit::Nanosecond => {
|
|
165
|
+
let array = downcast_array::<Time64NanosecondArray>(array)?;
|
|
166
|
+
Ok(ParquetValue::TimeNanos(array.value(index)))
|
|
167
|
+
}
|
|
168
|
+
_ => Err(ParquetError::Conversion(format!(
|
|
169
|
+
"Unsupported time64 unit: {:?}",
|
|
170
|
+
unit
|
|
171
|
+
))),
|
|
172
|
+
},
|
|
173
|
+
|
|
174
|
+
// Decimal types
|
|
175
|
+
DataType::Decimal128(_precision, scale) => {
|
|
176
|
+
let array = downcast_array::<Decimal128Array>(array)?;
|
|
177
|
+
let value = array.value(index);
|
|
178
|
+
Ok(ParquetValue::Decimal128(value, *scale))
|
|
179
|
+
}
|
|
180
|
+
DataType::Decimal256(_precision, scale) => {
|
|
181
|
+
let array = downcast_array::<Decimal256Array>(array)?;
|
|
182
|
+
let bytes = array.value(index).to_le_bytes();
|
|
183
|
+
|
|
184
|
+
// Convert to BigInt
|
|
185
|
+
let bigint = if bytes[31] & 0x80 != 0 {
|
|
186
|
+
// Negative number - convert from two's complement
|
|
187
|
+
let mut inverted = [0u8; 32];
|
|
188
|
+
for (i, &b) in bytes.iter().enumerate() {
|
|
189
|
+
inverted[i] = !b;
|
|
190
|
+
}
|
|
191
|
+
let positive = num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &inverted);
|
|
192
|
+
-(positive + num::BigInt::from(1))
|
|
193
|
+
} else {
|
|
194
|
+
num::BigInt::from_bytes_le(num::bigint::Sign::Plus, &bytes)
|
|
195
|
+
};
|
|
196
|
+
|
|
197
|
+
Ok(ParquetValue::Decimal256(bigint, *scale))
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// Complex types
|
|
201
|
+
DataType::List(item_field) => {
|
|
202
|
+
let array = downcast_array::<ListArray>(array)?;
|
|
203
|
+
let list_values = array.value(index);
|
|
204
|
+
|
|
205
|
+
let mut values = Vec::with_capacity(list_values.len());
|
|
206
|
+
|
|
207
|
+
// Get the list's element type from parquet schema
|
|
208
|
+
let element_type = match parquet_field {
|
|
209
|
+
parquet::schema::types::Type::GroupType { fields, .. } => {
|
|
210
|
+
// List has a repeated group containing the element
|
|
211
|
+
// The structure is: LIST -> repeated group -> element
|
|
212
|
+
if let Some(repeated_group) = fields.first() {
|
|
213
|
+
match repeated_group.as_ref() {
|
|
214
|
+
parquet::schema::types::Type::GroupType {
|
|
215
|
+
fields: inner_fields,
|
|
216
|
+
..
|
|
217
|
+
} => {
|
|
218
|
+
// This is the repeated group, get the actual element
|
|
219
|
+
inner_fields.first().ok_or_else(|| {
|
|
220
|
+
ParquetError::Conversion(
|
|
221
|
+
"List repeated group missing element field".to_string(),
|
|
222
|
+
)
|
|
223
|
+
})?
|
|
224
|
+
}
|
|
225
|
+
_ => repeated_group, // If it's not a group, use it directly
|
|
226
|
+
}
|
|
227
|
+
} else {
|
|
228
|
+
return Err(ParquetError::Conversion(
|
|
229
|
+
"List type missing fields".to_string(),
|
|
230
|
+
));
|
|
231
|
+
}
|
|
232
|
+
}
|
|
233
|
+
_ => parquet_field, // Fallback for cases where it's not a proper list structure
|
|
234
|
+
};
|
|
235
|
+
|
|
236
|
+
for i in 0..list_values.len() {
|
|
237
|
+
values.push(arrow_to_parquet_value(
|
|
238
|
+
item_field,
|
|
239
|
+
element_type,
|
|
240
|
+
&list_values,
|
|
241
|
+
i,
|
|
242
|
+
)?);
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
Ok(ParquetValue::List(values))
|
|
246
|
+
}
|
|
247
|
+
DataType::Map(_, _) => {
|
|
248
|
+
let array = downcast_array::<MapArray>(array)?;
|
|
249
|
+
let map_value = array.value(index);
|
|
250
|
+
|
|
251
|
+
// The Arrow `MapArray` entries struct is always (key, value) by
|
|
252
|
+
// position — `MapArray::keys()`/`values()` are `column(0)`/`column(1)`
|
|
253
|
+
// and `try_new` enforces exactly two columns — so we index by position
|
|
254
|
+
// and never depend on the entry field names (which the Parquet spec
|
|
255
|
+
// does not fix).
|
|
256
|
+
debug_assert_eq!(map_value.num_columns(), 2);
|
|
257
|
+
let keys = map_value.column(0);
|
|
258
|
+
let values = map_value.column(1);
|
|
259
|
+
|
|
260
|
+
let key_field = map_value
|
|
261
|
+
.fields()
|
|
262
|
+
.get(0)
|
|
263
|
+
.ok_or_else(|| ParquetError::Conversion("No key field found".to_string()))?;
|
|
264
|
+
|
|
265
|
+
let value_field = map_value
|
|
266
|
+
.fields()
|
|
267
|
+
.get(1)
|
|
268
|
+
.ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
|
|
269
|
+
|
|
270
|
+
let mut map_vec = Vec::with_capacity(keys.len());
|
|
271
|
+
|
|
272
|
+
// Get key and value types from parquet schema
|
|
273
|
+
// Map structure is: MAP -> key_value (repeated group) -> key, value
|
|
274
|
+
let (key_type, value_type) = match parquet_field {
|
|
275
|
+
parquet::schema::types::Type::GroupType { fields, .. } => {
|
|
276
|
+
// Get the key_value repeated group
|
|
277
|
+
match fields.first() {
|
|
278
|
+
Some(key_value_group) => match key_value_group.as_ref() {
|
|
279
|
+
parquet::schema::types::Type::GroupType {
|
|
280
|
+
fields: kv_fields, ..
|
|
281
|
+
} => {
|
|
282
|
+
let key_field = kv_fields.first().ok_or_else(|| {
|
|
283
|
+
ParquetError::Conversion("Map missing key field".to_string())
|
|
284
|
+
})?;
|
|
285
|
+
let value_field = kv_fields.get(1).ok_or_else(|| {
|
|
286
|
+
ParquetError::Conversion("Map missing value field".to_string())
|
|
287
|
+
})?;
|
|
288
|
+
(key_field.as_ref(), value_field.as_ref())
|
|
289
|
+
}
|
|
290
|
+
_ => {
|
|
291
|
+
return Err(ParquetError::Conversion(
|
|
292
|
+
"Map key_value should be a group".to_string(),
|
|
293
|
+
))
|
|
294
|
+
}
|
|
295
|
+
},
|
|
296
|
+
None => {
|
|
297
|
+
return Err(ParquetError::Conversion(
|
|
298
|
+
"Map type missing key_value field".to_string(),
|
|
299
|
+
))
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
_ => {
|
|
304
|
+
return Err(ParquetError::Conversion(
|
|
305
|
+
"Map type must be a group".to_string(),
|
|
306
|
+
))
|
|
307
|
+
}
|
|
308
|
+
};
|
|
309
|
+
|
|
310
|
+
for i in 0..keys.len() {
|
|
311
|
+
let key = arrow_to_parquet_value(key_field, key_type, keys, i)?;
|
|
312
|
+
let value = arrow_to_parquet_value(value_field, value_type, values, i)?;
|
|
313
|
+
map_vec.push((key, value));
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
Ok(ParquetValue::Map(map_vec))
|
|
317
|
+
}
|
|
318
|
+
DataType::Struct(_) => {
|
|
319
|
+
let array = downcast_array::<StructArray>(array)?;
|
|
320
|
+
|
|
321
|
+
let mut map = IndexMap::new();
|
|
322
|
+
|
|
323
|
+
// Get struct fields from parquet schema
|
|
324
|
+
let parquet_fields = match parquet_field {
|
|
325
|
+
parquet::schema::types::Type::GroupType { fields, .. } => fields,
|
|
326
|
+
_ => {
|
|
327
|
+
return Err(ParquetError::Conversion(
|
|
328
|
+
"Struct type must be a group".to_string(),
|
|
329
|
+
))
|
|
330
|
+
}
|
|
331
|
+
};
|
|
332
|
+
|
|
333
|
+
for (col_idx, arrow_field) in array.fields().iter().enumerate() {
|
|
334
|
+
let column = array.column(col_idx);
|
|
335
|
+
|
|
336
|
+
// Find matching parquet field by name
|
|
337
|
+
let nested_parquet_field = parquet_fields
|
|
338
|
+
.iter()
|
|
339
|
+
.find(|f| f.name() == arrow_field.name())
|
|
340
|
+
.ok_or_else(|| {
|
|
341
|
+
ParquetError::Conversion(format!(
|
|
342
|
+
"No matching parquet field for struct field '{}'",
|
|
343
|
+
arrow_field.name()
|
|
344
|
+
))
|
|
345
|
+
})?;
|
|
346
|
+
|
|
347
|
+
let value =
|
|
348
|
+
arrow_to_parquet_value(arrow_field, nested_parquet_field, column, index)?;
|
|
349
|
+
map.insert(Arc::from(arrow_field.name().as_str()), value);
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
Ok(ParquetValue::Record(map))
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
dt => Err(ParquetError::Conversion(format!(
|
|
356
|
+
"Unsupported data type for conversion: {:?}",
|
|
357
|
+
dt
|
|
358
|
+
))),
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
/// Convert a slice of ParquetValues to an Arrow array
|
|
363
|
+
pub fn parquet_values_to_arrow_array(values: &[ParquetValue], field: &Field) -> Result<ArrayRef> {
|
|
364
|
+
let value_refs = values.iter().collect::<Vec<_>>();
|
|
365
|
+
parquet_value_refs_to_arrow_array(&value_refs, field)
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
fn parquet_value_refs_to_arrow_array(values: &[&ParquetValue], field: &Field) -> Result<ArrayRef> {
|
|
369
|
+
match field.data_type() {
|
|
370
|
+
// Boolean
|
|
371
|
+
DataType::Boolean => {
|
|
372
|
+
let mut builder = BooleanBuilder::with_capacity(values.len());
|
|
373
|
+
for value in values {
|
|
374
|
+
match *value {
|
|
375
|
+
ParquetValue::Boolean(b) => builder.append_value(*b),
|
|
376
|
+
ParquetValue::Null => builder.append_null(),
|
|
377
|
+
_ => {
|
|
378
|
+
return Err(ParquetError::Conversion(format!(
|
|
379
|
+
"Expected Boolean, got {:?}",
|
|
380
|
+
value.type_name()
|
|
381
|
+
)))
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
Ok(StdArc::new(builder.finish()))
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
// Integer types with automatic upcasting
|
|
389
|
+
DataType::Int8 => build_int8_array(values),
|
|
390
|
+
DataType::Int16 => build_int16_array(values),
|
|
391
|
+
DataType::Int32 => build_int32_array(values),
|
|
392
|
+
DataType::Int64 => build_int64_array(values),
|
|
393
|
+
DataType::UInt8 => build_uint8_array(values),
|
|
394
|
+
DataType::UInt16 => build_uint16_array(values),
|
|
395
|
+
DataType::UInt32 => build_uint32_array(values),
|
|
396
|
+
DataType::UInt64 => build_uint64_array(values),
|
|
397
|
+
|
|
398
|
+
// Float types
|
|
399
|
+
DataType::Float32 => build_float32_array(values),
|
|
400
|
+
DataType::Float64 => build_float64_array(values),
|
|
401
|
+
|
|
402
|
+
// String and binary
|
|
403
|
+
DataType::Utf8 => build_string_array(values),
|
|
404
|
+
DataType::Binary => build_binary_array(values),
|
|
405
|
+
DataType::FixedSizeBinary(size) => build_fixed_binary_array(values, *size),
|
|
406
|
+
|
|
407
|
+
// Date and time
|
|
408
|
+
DataType::Date32 => build_date32_array(values),
|
|
409
|
+
DataType::Date64 => build_date64_array(values),
|
|
410
|
+
DataType::Time32(unit) => build_time32_array(values, unit),
|
|
411
|
+
DataType::Time64(unit) => build_time64_array(values, unit),
|
|
412
|
+
|
|
413
|
+
// Timestamp
|
|
414
|
+
DataType::Timestamp(unit, tz) => build_timestamp_array(values, unit, tz.as_deref()),
|
|
415
|
+
|
|
416
|
+
// Decimal
|
|
417
|
+
DataType::Decimal128(precision, scale) => {
|
|
418
|
+
build_decimal128_array(values, *precision, *scale)
|
|
419
|
+
}
|
|
420
|
+
DataType::Decimal256(precision, scale) => {
|
|
421
|
+
build_decimal256_array(values, *precision, *scale)
|
|
422
|
+
}
|
|
423
|
+
|
|
424
|
+
// Complex types
|
|
425
|
+
DataType::List(item_field) => build_list_array(values, item_field),
|
|
426
|
+
DataType::Map(entries_field, sorted) => build_map_array(values, entries_field, *sorted),
|
|
427
|
+
DataType::Struct(fields) => build_struct_array(values, fields),
|
|
428
|
+
|
|
429
|
+
dt => Err(ParquetError::Conversion(format!(
|
|
430
|
+
"Unsupported data type for conversion: {:?}",
|
|
431
|
+
dt
|
|
432
|
+
))),
|
|
433
|
+
}
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
/// Helper function to downcast an array with better error messages
|
|
437
|
+
fn downcast_array<T: 'static>(array: &dyn Array) -> Result<&T> {
|
|
438
|
+
array.as_any().downcast_ref::<T>().ok_or_else(|| {
|
|
439
|
+
ParquetError::Conversion(format!("Failed to cast to {}", std::any::type_name::<T>()))
|
|
440
|
+
})
|
|
441
|
+
}
|
|
442
|
+
|
|
443
|
+
/// Build Int8 array
|
|
444
|
+
fn build_int8_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
445
|
+
let mut builder = Int8Builder::with_capacity(values.len());
|
|
446
|
+
for value in values {
|
|
447
|
+
match *value {
|
|
448
|
+
ParquetValue::Int8(i) => builder.append_value(*i),
|
|
449
|
+
ParquetValue::Null => builder.append_null(),
|
|
450
|
+
_ => {
|
|
451
|
+
return Err(ParquetError::Conversion(format!(
|
|
452
|
+
"Expected Int8, got {:?}",
|
|
453
|
+
value.type_name()
|
|
454
|
+
)))
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
}
|
|
458
|
+
Ok(StdArc::new(builder.finish()))
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
/// Build Int16 array
|
|
462
|
+
fn build_int16_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
463
|
+
let mut builder = Int16Builder::with_capacity(values.len());
|
|
464
|
+
for value in values {
|
|
465
|
+
match *value {
|
|
466
|
+
ParquetValue::Int16(i) => builder.append_value(*i),
|
|
467
|
+
ParquetValue::Int8(i) => builder.append_value(*i as i16),
|
|
468
|
+
ParquetValue::Null => builder.append_null(),
|
|
469
|
+
_ => {
|
|
470
|
+
return Err(ParquetError::Conversion(format!(
|
|
471
|
+
"Expected Int16, got {:?}",
|
|
472
|
+
value.type_name()
|
|
473
|
+
)))
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
}
|
|
477
|
+
Ok(StdArc::new(builder.finish()))
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
/// Build Int32 array
|
|
481
|
+
fn build_int32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
482
|
+
let mut builder = Int32Builder::with_capacity(values.len());
|
|
483
|
+
for value in values {
|
|
484
|
+
match *value {
|
|
485
|
+
ParquetValue::Int32(i) => builder.append_value(*i),
|
|
486
|
+
ParquetValue::Int16(i) => builder.append_value(*i as i32),
|
|
487
|
+
ParquetValue::Int8(i) => builder.append_value(*i as i32),
|
|
488
|
+
ParquetValue::Null => builder.append_null(),
|
|
489
|
+
_ => {
|
|
490
|
+
return Err(ParquetError::Conversion(format!(
|
|
491
|
+
"Expected Int32, got {:?}",
|
|
492
|
+
value.type_name()
|
|
493
|
+
)))
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
Ok(StdArc::new(builder.finish()))
|
|
498
|
+
}
|
|
499
|
+
|
|
500
|
+
/// Build Int64 array
|
|
501
|
+
fn build_int64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
502
|
+
let mut builder = Int64Builder::with_capacity(values.len());
|
|
503
|
+
for value in values {
|
|
504
|
+
match *value {
|
|
505
|
+
ParquetValue::Int64(i) => builder.append_value(*i),
|
|
506
|
+
ParquetValue::Int32(i) => builder.append_value(*i as i64),
|
|
507
|
+
ParquetValue::Int16(i) => builder.append_value(*i as i64),
|
|
508
|
+
ParquetValue::Int8(i) => builder.append_value(*i as i64),
|
|
509
|
+
ParquetValue::Null => builder.append_null(),
|
|
510
|
+
_ => {
|
|
511
|
+
return Err(ParquetError::Conversion(format!(
|
|
512
|
+
"Expected Int64, got {:?}",
|
|
513
|
+
value.type_name()
|
|
514
|
+
)))
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
}
|
|
518
|
+
Ok(StdArc::new(builder.finish()))
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
/// Build UInt8 array
|
|
522
|
+
fn build_uint8_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
523
|
+
let mut builder = UInt8Builder::with_capacity(values.len());
|
|
524
|
+
for value in values {
|
|
525
|
+
match *value {
|
|
526
|
+
ParquetValue::UInt8(i) => builder.append_value(*i),
|
|
527
|
+
ParquetValue::Null => builder.append_null(),
|
|
528
|
+
_ => {
|
|
529
|
+
return Err(ParquetError::Conversion(format!(
|
|
530
|
+
"Expected UInt8, got {:?}",
|
|
531
|
+
value.type_name()
|
|
532
|
+
)))
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
}
|
|
536
|
+
Ok(StdArc::new(builder.finish()))
|
|
537
|
+
}
|
|
538
|
+
|
|
539
|
+
/// Build UInt16 array
|
|
540
|
+
fn build_uint16_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
541
|
+
let mut builder = UInt16Builder::with_capacity(values.len());
|
|
542
|
+
for value in values {
|
|
543
|
+
match *value {
|
|
544
|
+
ParquetValue::UInt16(i) => builder.append_value(*i),
|
|
545
|
+
ParquetValue::UInt8(i) => builder.append_value(*i as u16),
|
|
546
|
+
ParquetValue::Null => builder.append_null(),
|
|
547
|
+
_ => {
|
|
548
|
+
return Err(ParquetError::Conversion(format!(
|
|
549
|
+
"Expected UInt16, got {:?}",
|
|
550
|
+
value.type_name()
|
|
551
|
+
)))
|
|
552
|
+
}
|
|
553
|
+
}
|
|
554
|
+
}
|
|
555
|
+
Ok(StdArc::new(builder.finish()))
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
/// Build UInt32 array
|
|
559
|
+
fn build_uint32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
560
|
+
let mut builder = UInt32Builder::with_capacity(values.len());
|
|
561
|
+
for value in values {
|
|
562
|
+
match *value {
|
|
563
|
+
ParquetValue::UInt32(i) => builder.append_value(*i),
|
|
564
|
+
ParquetValue::UInt16(i) => builder.append_value(*i as u32),
|
|
565
|
+
ParquetValue::UInt8(i) => builder.append_value(*i as u32),
|
|
566
|
+
ParquetValue::Null => builder.append_null(),
|
|
567
|
+
_ => {
|
|
568
|
+
return Err(ParquetError::Conversion(format!(
|
|
569
|
+
"Expected UInt32, got {:?}",
|
|
570
|
+
value.type_name()
|
|
571
|
+
)))
|
|
572
|
+
}
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
Ok(StdArc::new(builder.finish()))
|
|
576
|
+
}
|
|
577
|
+
|
|
578
|
+
/// Build UInt64 array
|
|
579
|
+
fn build_uint64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
580
|
+
let mut builder = UInt64Builder::with_capacity(values.len());
|
|
581
|
+
for value in values {
|
|
582
|
+
match *value {
|
|
583
|
+
ParquetValue::UInt64(i) => builder.append_value(*i),
|
|
584
|
+
ParquetValue::UInt32(i) => builder.append_value(*i as u64),
|
|
585
|
+
ParquetValue::UInt16(i) => builder.append_value(*i as u64),
|
|
586
|
+
ParquetValue::UInt8(i) => builder.append_value(*i as u64),
|
|
587
|
+
ParquetValue::Null => builder.append_null(),
|
|
588
|
+
_ => {
|
|
589
|
+
return Err(ParquetError::Conversion(format!(
|
|
590
|
+
"Expected UInt64, got {:?}",
|
|
591
|
+
value.type_name()
|
|
592
|
+
)))
|
|
593
|
+
}
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
Ok(StdArc::new(builder.finish()))
|
|
597
|
+
}
|
|
598
|
+
|
|
599
|
+
/// Build Float32 array with Float16 support
|
|
600
|
+
fn build_float32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
601
|
+
let mut builder = Float32Builder::with_capacity(values.len());
|
|
602
|
+
for value in values {
|
|
603
|
+
match *value {
|
|
604
|
+
ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(*f),
|
|
605
|
+
ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(*f),
|
|
606
|
+
ParquetValue::Null => builder.append_null(),
|
|
607
|
+
_ => {
|
|
608
|
+
return Err(ParquetError::Conversion(format!(
|
|
609
|
+
"Expected Float32, got {:?}",
|
|
610
|
+
value.type_name()
|
|
611
|
+
)))
|
|
612
|
+
}
|
|
613
|
+
}
|
|
614
|
+
}
|
|
615
|
+
Ok(StdArc::new(builder.finish()))
|
|
616
|
+
}
|
|
617
|
+
|
|
618
|
+
/// Build Float64 array with Float32 and Float16 support
|
|
619
|
+
fn build_float64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
620
|
+
let mut builder = Float64Builder::with_capacity(values.len());
|
|
621
|
+
for value in values {
|
|
622
|
+
match *value {
|
|
623
|
+
ParquetValue::Float64(OrderedFloat(f)) => builder.append_value(*f),
|
|
624
|
+
ParquetValue::Float32(OrderedFloat(f)) => builder.append_value(*f as f64),
|
|
625
|
+
ParquetValue::Float16(OrderedFloat(f)) => builder.append_value(*f as f64),
|
|
626
|
+
ParquetValue::Null => builder.append_null(),
|
|
627
|
+
_ => {
|
|
628
|
+
return Err(ParquetError::Conversion(format!(
|
|
629
|
+
"Expected Float64, got {:?}",
|
|
630
|
+
value.type_name()
|
|
631
|
+
)))
|
|
632
|
+
}
|
|
633
|
+
}
|
|
634
|
+
}
|
|
635
|
+
Ok(StdArc::new(builder.finish()))
|
|
636
|
+
}
|
|
637
|
+
|
|
638
|
+
/// Build string array
|
|
639
|
+
fn build_string_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
640
|
+
// Pre-size the data buffer exactly: growing it by doubling would
|
|
641
|
+
// transiently hold up to 3x the payload during the final realloc.
|
|
642
|
+
let data_capacity: usize = values
|
|
643
|
+
.iter()
|
|
644
|
+
.map(|value| match value {
|
|
645
|
+
ParquetValue::String(s) => s.len(),
|
|
646
|
+
_ => 0,
|
|
647
|
+
})
|
|
648
|
+
.sum();
|
|
649
|
+
let mut builder = StringBuilder::with_capacity(values.len(), data_capacity);
|
|
650
|
+
for value in values {
|
|
651
|
+
match *value {
|
|
652
|
+
ParquetValue::String(s) => builder.append_value(s.as_ref()),
|
|
653
|
+
ParquetValue::Null => builder.append_null(),
|
|
654
|
+
_ => {
|
|
655
|
+
return Err(ParquetError::Conversion(format!(
|
|
656
|
+
"Expected String, got {:?}",
|
|
657
|
+
value.type_name()
|
|
658
|
+
)))
|
|
659
|
+
}
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
Ok(StdArc::new(builder.finish()))
|
|
663
|
+
}
|
|
664
|
+
|
|
665
|
+
/// Build binary array
|
|
666
|
+
fn build_binary_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
667
|
+
// Pre-size the data buffer exactly, as in build_string_array.
|
|
668
|
+
let data_capacity: usize = values
|
|
669
|
+
.iter()
|
|
670
|
+
.map(|value| match value {
|
|
671
|
+
ParquetValue::Bytes(b) => b.len(),
|
|
672
|
+
_ => 0,
|
|
673
|
+
})
|
|
674
|
+
.sum();
|
|
675
|
+
let mut builder = BinaryBuilder::with_capacity(values.len(), data_capacity);
|
|
676
|
+
for value in values {
|
|
677
|
+
match *value {
|
|
678
|
+
ParquetValue::Bytes(b) => builder.append_value(b.as_ref()),
|
|
679
|
+
ParquetValue::Null => builder.append_null(),
|
|
680
|
+
_ => {
|
|
681
|
+
return Err(ParquetError::Conversion(format!(
|
|
682
|
+
"Expected Bytes, got {:?}",
|
|
683
|
+
value.type_name()
|
|
684
|
+
)))
|
|
685
|
+
}
|
|
686
|
+
}
|
|
687
|
+
}
|
|
688
|
+
Ok(StdArc::new(builder.finish()))
|
|
689
|
+
}
|
|
690
|
+
|
|
691
|
+
/// Build fixed size binary array
|
|
692
|
+
fn build_fixed_binary_array(values: &[&ParquetValue], size: i32) -> Result<ArrayRef> {
|
|
693
|
+
let mut builder = FixedSizeBinaryBuilder::with_capacity(values.len(), size);
|
|
694
|
+
for value in values {
|
|
695
|
+
match *value {
|
|
696
|
+
ParquetValue::Bytes(b) => {
|
|
697
|
+
if b.len() != size as usize {
|
|
698
|
+
return Err(ParquetError::Conversion(format!(
|
|
699
|
+
"Fixed size binary expected {} bytes, got {}",
|
|
700
|
+
size,
|
|
701
|
+
b.len()
|
|
702
|
+
)));
|
|
703
|
+
}
|
|
704
|
+
builder.append_value(b.as_ref())?;
|
|
705
|
+
}
|
|
706
|
+
ParquetValue::Null => builder.append_null(),
|
|
707
|
+
_ => {
|
|
708
|
+
return Err(ParquetError::Conversion(format!(
|
|
709
|
+
"Expected Bytes, got {:?}",
|
|
710
|
+
value.type_name()
|
|
711
|
+
)))
|
|
712
|
+
}
|
|
713
|
+
}
|
|
714
|
+
}
|
|
715
|
+
Ok(StdArc::new(builder.finish()))
|
|
716
|
+
}
|
|
717
|
+
|
|
718
|
+
/// Build Date32 array
|
|
719
|
+
fn build_date32_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
720
|
+
let mut builder = Date32Builder::with_capacity(values.len());
|
|
721
|
+
for value in values {
|
|
722
|
+
match *value {
|
|
723
|
+
ParquetValue::Date32(d) => builder.append_value(*d),
|
|
724
|
+
ParquetValue::Null => builder.append_null(),
|
|
725
|
+
_ => {
|
|
726
|
+
return Err(ParquetError::Conversion(format!(
|
|
727
|
+
"Expected Date32, got {:?}",
|
|
728
|
+
value.type_name()
|
|
729
|
+
)))
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
}
|
|
733
|
+
Ok(StdArc::new(builder.finish()))
|
|
734
|
+
}
|
|
735
|
+
|
|
736
|
+
/// Build Date64 array
|
|
737
|
+
fn build_date64_array(values: &[&ParquetValue]) -> Result<ArrayRef> {
|
|
738
|
+
let mut builder = Date64Builder::with_capacity(values.len());
|
|
739
|
+
for value in values {
|
|
740
|
+
match *value {
|
|
741
|
+
ParquetValue::Date64(d) => builder.append_value(*d),
|
|
742
|
+
ParquetValue::Null => builder.append_null(),
|
|
743
|
+
_ => {
|
|
744
|
+
return Err(ParquetError::Conversion(format!(
|
|
745
|
+
"Expected Date64, got {:?}",
|
|
746
|
+
value.type_name()
|
|
747
|
+
)))
|
|
748
|
+
}
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
Ok(StdArc::new(builder.finish()))
|
|
752
|
+
}
|
|
753
|
+
|
|
754
|
+
/// Build Time32 array
|
|
755
|
+
fn build_time32_array(values: &[&ParquetValue], unit: &arrow_schema::TimeUnit) -> Result<ArrayRef> {
|
|
756
|
+
match unit {
|
|
757
|
+
arrow_schema::TimeUnit::Millisecond => {
|
|
758
|
+
let mut builder = Time32MillisecondBuilder::with_capacity(values.len());
|
|
759
|
+
for value in values {
|
|
760
|
+
match *value {
|
|
761
|
+
ParquetValue::TimeMillis(t) => builder.append_value(*t),
|
|
762
|
+
ParquetValue::Null => builder.append_null(),
|
|
763
|
+
_ => {
|
|
764
|
+
return Err(ParquetError::Conversion(format!(
|
|
765
|
+
"Expected TimeMillis, got {:?}",
|
|
766
|
+
value.type_name()
|
|
767
|
+
)))
|
|
768
|
+
}
|
|
769
|
+
}
|
|
770
|
+
}
|
|
771
|
+
Ok(StdArc::new(builder.finish()))
|
|
772
|
+
}
|
|
773
|
+
_ => Err(ParquetError::Conversion(format!(
|
|
774
|
+
"Unsupported time32 unit: {:?}",
|
|
775
|
+
unit
|
|
776
|
+
))),
|
|
777
|
+
}
|
|
778
|
+
}
|
|
779
|
+
|
|
780
|
+
/// Build Time64 array
|
|
781
|
+
fn build_time64_array(values: &[&ParquetValue], unit: &arrow_schema::TimeUnit) -> Result<ArrayRef> {
|
|
782
|
+
match unit {
|
|
783
|
+
arrow_schema::TimeUnit::Microsecond => {
|
|
784
|
+
let mut builder = Time64MicrosecondBuilder::with_capacity(values.len());
|
|
785
|
+
for value in values {
|
|
786
|
+
match *value {
|
|
787
|
+
ParquetValue::TimeMicros(t) => builder.append_value(*t),
|
|
788
|
+
ParquetValue::Null => builder.append_null(),
|
|
789
|
+
_ => {
|
|
790
|
+
return Err(ParquetError::Conversion(format!(
|
|
791
|
+
"Expected TimeMicros, got {:?}",
|
|
792
|
+
value.type_name()
|
|
793
|
+
)))
|
|
794
|
+
}
|
|
795
|
+
}
|
|
796
|
+
}
|
|
797
|
+
Ok(StdArc::new(builder.finish()))
|
|
798
|
+
}
|
|
799
|
+
arrow_schema::TimeUnit::Nanosecond => {
|
|
800
|
+
let mut builder = Time64NanosecondBuilder::with_capacity(values.len());
|
|
801
|
+
for value in values {
|
|
802
|
+
match *value {
|
|
803
|
+
ParquetValue::TimeNanos(t) => builder.append_value(*t),
|
|
804
|
+
ParquetValue::Null => builder.append_null(),
|
|
805
|
+
_ => {
|
|
806
|
+
return Err(ParquetError::Conversion(format!(
|
|
807
|
+
"Expected TimeNanos, got {:?}",
|
|
808
|
+
value.type_name()
|
|
809
|
+
)))
|
|
810
|
+
}
|
|
811
|
+
}
|
|
812
|
+
}
|
|
813
|
+
Ok(StdArc::new(builder.finish()))
|
|
814
|
+
}
|
|
815
|
+
_ => Err(ParquetError::Conversion(format!(
|
|
816
|
+
"Unsupported time64 unit: {:?}",
|
|
817
|
+
unit
|
|
818
|
+
))),
|
|
819
|
+
}
|
|
820
|
+
}
|
|
821
|
+
|
|
822
|
+
/// Build timestamp array
|
|
823
|
+
fn build_timestamp_array(
|
|
824
|
+
values: &[&ParquetValue],
|
|
825
|
+
unit: &arrow_schema::TimeUnit,
|
|
826
|
+
timezone: Option<&str>,
|
|
827
|
+
) -> Result<ArrayRef> {
|
|
828
|
+
let tz = timezone.map(StdArc::from);
|
|
829
|
+
|
|
830
|
+
match unit {
|
|
831
|
+
arrow_schema::TimeUnit::Second => {
|
|
832
|
+
let mut builder =
|
|
833
|
+
TimestampSecondBuilder::with_capacity(values.len()).with_timezone_opt(tz.clone());
|
|
834
|
+
for value in values {
|
|
835
|
+
match *value {
|
|
836
|
+
ParquetValue::TimestampSecond(t, _) => builder.append_value(*t),
|
|
837
|
+
ParquetValue::Null => builder.append_null(),
|
|
838
|
+
_ => {
|
|
839
|
+
return Err(ParquetError::Conversion(format!(
|
|
840
|
+
"Expected TimestampSecond, got {:?}",
|
|
841
|
+
value.type_name()
|
|
842
|
+
)))
|
|
843
|
+
}
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
Ok(StdArc::new(builder.finish()))
|
|
847
|
+
}
|
|
848
|
+
arrow_schema::TimeUnit::Millisecond => {
|
|
849
|
+
let mut builder = TimestampMillisecondBuilder::with_capacity(values.len())
|
|
850
|
+
.with_timezone_opt(tz.clone());
|
|
851
|
+
for value in values {
|
|
852
|
+
match *value {
|
|
853
|
+
ParquetValue::TimestampMillis(t, _) => builder.append_value(*t),
|
|
854
|
+
ParquetValue::Null => builder.append_null(),
|
|
855
|
+
_ => {
|
|
856
|
+
return Err(ParquetError::Conversion(format!(
|
|
857
|
+
"Expected TimestampMillis, got {:?}",
|
|
858
|
+
value.type_name()
|
|
859
|
+
)))
|
|
860
|
+
}
|
|
861
|
+
}
|
|
862
|
+
}
|
|
863
|
+
Ok(StdArc::new(builder.finish()))
|
|
864
|
+
}
|
|
865
|
+
arrow_schema::TimeUnit::Microsecond => {
|
|
866
|
+
let mut builder = TimestampMicrosecondBuilder::with_capacity(values.len())
|
|
867
|
+
.with_timezone_opt(tz.clone());
|
|
868
|
+
for value in values {
|
|
869
|
+
match *value {
|
|
870
|
+
ParquetValue::TimestampMicros(t, _) => builder.append_value(*t),
|
|
871
|
+
ParquetValue::Null => builder.append_null(),
|
|
872
|
+
_ => {
|
|
873
|
+
return Err(ParquetError::Conversion(format!(
|
|
874
|
+
"Expected TimestampMicros, got {:?}",
|
|
875
|
+
value.type_name()
|
|
876
|
+
)))
|
|
877
|
+
}
|
|
878
|
+
}
|
|
879
|
+
}
|
|
880
|
+
Ok(StdArc::new(builder.finish()))
|
|
881
|
+
}
|
|
882
|
+
arrow_schema::TimeUnit::Nanosecond => {
|
|
883
|
+
let mut builder = TimestampNanosecondBuilder::with_capacity(values.len())
|
|
884
|
+
.with_timezone_opt(tz.clone());
|
|
885
|
+
for value in values {
|
|
886
|
+
match *value {
|
|
887
|
+
ParquetValue::TimestampNanos(t, _) => builder.append_value(*t),
|
|
888
|
+
ParquetValue::Null => builder.append_null(),
|
|
889
|
+
_ => {
|
|
890
|
+
return Err(ParquetError::Conversion(format!(
|
|
891
|
+
"Expected TimestampNanos, got {:?}",
|
|
892
|
+
value.type_name()
|
|
893
|
+
)))
|
|
894
|
+
}
|
|
895
|
+
}
|
|
896
|
+
}
|
|
897
|
+
Ok(StdArc::new(builder.finish()))
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
}
|
|
901
|
+
|
|
902
|
+
/// Build Decimal128 array
|
|
903
|
+
fn build_decimal128_array(values: &[&ParquetValue], precision: u8, scale: i8) -> Result<ArrayRef> {
|
|
904
|
+
let mut builder = Decimal128Builder::with_capacity(values.len())
|
|
905
|
+
.with_precision_and_scale(precision, scale)?;
|
|
906
|
+
for (idx, value) in values.iter().enumerate() {
|
|
907
|
+
match *value {
|
|
908
|
+
ParquetValue::Decimal128(d, value_scale) => {
|
|
909
|
+
validate_decimal128_array_value(*d, *value_scale, precision, scale, idx)?;
|
|
910
|
+
builder.append_value(*d);
|
|
911
|
+
}
|
|
912
|
+
ParquetValue::Null => builder.append_null(),
|
|
913
|
+
_ => {
|
|
914
|
+
return Err(ParquetError::Conversion(format!(
|
|
915
|
+
"Expected Decimal128, got {:?}",
|
|
916
|
+
value.type_name()
|
|
917
|
+
)))
|
|
918
|
+
}
|
|
919
|
+
}
|
|
920
|
+
}
|
|
921
|
+
Ok(StdArc::new(builder.finish()))
|
|
922
|
+
}
|
|
923
|
+
|
|
924
|
+
/// Build Decimal256 array
|
|
925
|
+
fn build_decimal256_array(values: &[&ParquetValue], precision: u8, scale: i8) -> Result<ArrayRef> {
|
|
926
|
+
let mut builder = Decimal256Builder::with_capacity(values.len())
|
|
927
|
+
.with_precision_and_scale(precision, scale)?;
|
|
928
|
+
for (idx, value) in values.iter().enumerate() {
|
|
929
|
+
match *value {
|
|
930
|
+
ParquetValue::Decimal256(bigint, value_scale) => {
|
|
931
|
+
validate_decimal256_array_value(bigint, *value_scale, precision, scale, idx)?;
|
|
932
|
+
let bytes = decimal256_from_bigint(bigint)?;
|
|
933
|
+
builder.append_value(bytes);
|
|
934
|
+
}
|
|
935
|
+
ParquetValue::Null => builder.append_null(),
|
|
936
|
+
_ => {
|
|
937
|
+
return Err(ParquetError::Conversion(format!(
|
|
938
|
+
"Expected Decimal256, got {:?}",
|
|
939
|
+
value.type_name()
|
|
940
|
+
)))
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
Ok(StdArc::new(builder.finish()))
|
|
945
|
+
}
|
|
946
|
+
|
|
947
|
+
fn validate_decimal128_array_value(
|
|
948
|
+
value: i128,
|
|
949
|
+
value_scale: i8,
|
|
950
|
+
precision: u8,
|
|
951
|
+
scale: i8,
|
|
952
|
+
index: usize,
|
|
953
|
+
) -> Result<()> {
|
|
954
|
+
if value_scale != scale {
|
|
955
|
+
return Err(ParquetError::Conversion(format!(
|
|
956
|
+
"Decimal scale mismatch at value[{}]: array scale {}, value scale {}",
|
|
957
|
+
index, scale, value_scale
|
|
958
|
+
)));
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
validate_decimal_array_precision(decimal128_digit_count(value), precision, index)
|
|
962
|
+
}
|
|
963
|
+
|
|
964
|
+
fn validate_decimal256_array_value(
|
|
965
|
+
value: &num::BigInt,
|
|
966
|
+
value_scale: i8,
|
|
967
|
+
precision: u8,
|
|
968
|
+
scale: i8,
|
|
969
|
+
index: usize,
|
|
970
|
+
) -> Result<()> {
|
|
971
|
+
if value_scale != scale {
|
|
972
|
+
return Err(ParquetError::Conversion(format!(
|
|
973
|
+
"Decimal scale mismatch at value[{}]: array scale {}, value scale {}",
|
|
974
|
+
index, scale, value_scale
|
|
975
|
+
)));
|
|
976
|
+
}
|
|
977
|
+
|
|
978
|
+
validate_decimal_array_precision(decimal256_digit_count(value), precision, index)
|
|
979
|
+
}
|
|
980
|
+
|
|
981
|
+
fn validate_decimal_array_precision(
|
|
982
|
+
value_digits: usize,
|
|
983
|
+
precision: u8,
|
|
984
|
+
index: usize,
|
|
985
|
+
) -> Result<()> {
|
|
986
|
+
if value_digits > precision as usize {
|
|
987
|
+
return Err(ParquetError::Conversion(format!(
|
|
988
|
+
"Decimal precision overflow at value[{}]: array precision {}, value has {} digits",
|
|
989
|
+
index, precision, value_digits
|
|
990
|
+
)));
|
|
991
|
+
}
|
|
992
|
+
|
|
993
|
+
Ok(())
|
|
994
|
+
}
|
|
995
|
+
|
|
996
|
+
/// Number of decimal digits in the absolute value of `value`.
///
/// The sign is ignored and zero counts as one digit.
fn decimal128_digit_count(value: i128) -> usize {
    // Every number has at least one digit; each further division by ten that
    // leaves a non-zero quotient adds another.
    let mut digits = 1;
    let mut remaining = value.unsigned_abs() / 10;
    while remaining != 0 {
        digits += 1;
        remaining /= 10;
    }
    digits
}
|
|
999
|
+
|
|
1000
|
+
fn decimal256_digit_count(value: &num::BigInt) -> usize {
|
|
1001
|
+
value.to_str_radix(10).trim_start_matches('-').len()
|
|
1002
|
+
}
|
|
1003
|
+
|
|
1004
|
+
/// Convert BigInt to i256 (32-byte array)
|
|
1005
|
+
fn decimal256_from_bigint(bigint: &num::BigInt) -> Result<arrow_buffer::i256> {
|
|
1006
|
+
// Get bytes in little-endian format
|
|
1007
|
+
let (sign, mut bytes) = bigint.to_bytes_le();
|
|
1008
|
+
|
|
1009
|
+
// Ensure we have exactly 32 bytes
|
|
1010
|
+
if bytes.len() > 32 {
|
|
1011
|
+
return Err(ParquetError::Conversion(
|
|
1012
|
+
"Decimal256 value too large".to_string(),
|
|
1013
|
+
));
|
|
1014
|
+
}
|
|
1015
|
+
|
|
1016
|
+
// Pad with zeros or ones (for negative numbers) to reach 32 bytes
|
|
1017
|
+
bytes.resize(32, 0);
|
|
1018
|
+
|
|
1019
|
+
// If negative, convert to two's complement
|
|
1020
|
+
if sign == num::bigint::Sign::Minus {
|
|
1021
|
+
// Invert all bits
|
|
1022
|
+
for byte in &mut bytes {
|
|
1023
|
+
*byte = !*byte;
|
|
1024
|
+
}
|
|
1025
|
+
// Add 1
|
|
1026
|
+
let mut carry = true;
|
|
1027
|
+
for byte in &mut bytes {
|
|
1028
|
+
if carry {
|
|
1029
|
+
let (new_byte, new_carry) = byte.overflowing_add(1);
|
|
1030
|
+
*byte = new_byte;
|
|
1031
|
+
carry = new_carry;
|
|
1032
|
+
} else {
|
|
1033
|
+
break;
|
|
1034
|
+
}
|
|
1035
|
+
}
|
|
1036
|
+
}
|
|
1037
|
+
|
|
1038
|
+
let byte_array: [u8; 32] = bytes
|
|
1039
|
+
.try_into()
|
|
1040
|
+
.map_err(|_| ParquetError::Conversion("Failed to convert bytes to i256".to_string()))?;
|
|
1041
|
+
Ok(arrow_buffer::i256::from_le_bytes(byte_array))
|
|
1042
|
+
}
|
|
1043
|
+
|
|
1044
|
+
/// Build list array
|
|
1045
|
+
fn build_list_array(values: &[&ParquetValue], item_field: &StdArc<Field>) -> Result<ArrayRef> {
|
|
1046
|
+
let mut all_items = Vec::new();
|
|
1047
|
+
let mut offsets = Vec::with_capacity(values.len() + 1);
|
|
1048
|
+
let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
|
|
1049
|
+
offsets.push(0i32);
|
|
1050
|
+
|
|
1051
|
+
for value in values {
|
|
1052
|
+
match *value {
|
|
1053
|
+
ParquetValue::List(items) => {
|
|
1054
|
+
all_items.extend(items.iter());
|
|
1055
|
+
offsets.push(all_items.len() as i32);
|
|
1056
|
+
null_buffer_builder.append(true);
|
|
1057
|
+
}
|
|
1058
|
+
ParquetValue::Null => {
|
|
1059
|
+
offsets.push(all_items.len() as i32);
|
|
1060
|
+
null_buffer_builder.append(false);
|
|
1061
|
+
}
|
|
1062
|
+
_ => {
|
|
1063
|
+
return Err(ParquetError::Conversion(format!(
|
|
1064
|
+
"Expected List, got {:?}",
|
|
1065
|
+
value.type_name()
|
|
1066
|
+
)))
|
|
1067
|
+
}
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
1070
|
+
|
|
1071
|
+
let item_array = parquet_value_refs_to_arrow_array(&all_items, item_field)?;
|
|
1072
|
+
let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
|
|
1073
|
+
let null_buffer = null_buffer_builder.finish();
|
|
1074
|
+
|
|
1075
|
+
Ok(StdArc::new(ListArray::new(
|
|
1076
|
+
item_field.clone(),
|
|
1077
|
+
offset_buffer,
|
|
1078
|
+
item_array,
|
|
1079
|
+
Some(null_buffer.into()),
|
|
1080
|
+
)))
|
|
1081
|
+
}
|
|
1082
|
+
|
|
1083
|
+
/// Build map array
|
|
1084
|
+
fn build_map_array(
|
|
1085
|
+
values: &[&ParquetValue],
|
|
1086
|
+
entries_field: &StdArc<Field>,
|
|
1087
|
+
_sorted: bool,
|
|
1088
|
+
) -> Result<ArrayRef> {
|
|
1089
|
+
// Extract the key and value fields from the entries struct
|
|
1090
|
+
let (key_field, value_field) = match entries_field.data_type() {
|
|
1091
|
+
DataType::Struct(fields) if fields.len() == 2 => (&fields[0], &fields[1]),
|
|
1092
|
+
_ => {
|
|
1093
|
+
return Err(ParquetError::Conversion(
|
|
1094
|
+
"Map entries field must be a struct with exactly 2 fields".to_string(),
|
|
1095
|
+
))
|
|
1096
|
+
}
|
|
1097
|
+
};
|
|
1098
|
+
|
|
1099
|
+
let mut all_keys = Vec::new();
|
|
1100
|
+
let mut all_values = Vec::new();
|
|
1101
|
+
let mut offsets = Vec::with_capacity(values.len() + 1);
|
|
1102
|
+
let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(values.len());
|
|
1103
|
+
offsets.push(0i32);
|
|
1104
|
+
|
|
1105
|
+
for value in values {
|
|
1106
|
+
match *value {
|
|
1107
|
+
ParquetValue::Map(entries) => {
|
|
1108
|
+
for (k, v) in entries {
|
|
1109
|
+
all_keys.push(k);
|
|
1110
|
+
all_values.push(v);
|
|
1111
|
+
}
|
|
1112
|
+
offsets.push(all_keys.len() as i32);
|
|
1113
|
+
null_buffer_builder.append(true);
|
|
1114
|
+
}
|
|
1115
|
+
ParquetValue::Null => {
|
|
1116
|
+
offsets.push(all_keys.len() as i32);
|
|
1117
|
+
null_buffer_builder.append(false);
|
|
1118
|
+
}
|
|
1119
|
+
_ => {
|
|
1120
|
+
return Err(ParquetError::Conversion(format!(
|
|
1121
|
+
"Expected Map, got {:?}",
|
|
1122
|
+
value.type_name()
|
|
1123
|
+
)))
|
|
1124
|
+
}
|
|
1125
|
+
}
|
|
1126
|
+
}
|
|
1127
|
+
|
|
1128
|
+
let key_array = parquet_value_refs_to_arrow_array(&all_keys, key_field)?;
|
|
1129
|
+
let value_array = parquet_value_refs_to_arrow_array(&all_values, value_field)?;
|
|
1130
|
+
|
|
1131
|
+
// Create struct array for entries
|
|
1132
|
+
let struct_fields = match entries_field.data_type() {
|
|
1133
|
+
DataType::Struct(fields) => fields.clone(),
|
|
1134
|
+
_ => unreachable!("Map entries field must be a struct"),
|
|
1135
|
+
};
|
|
1136
|
+
|
|
1137
|
+
let struct_array = StructArray::new(struct_fields, vec![key_array, value_array], None);
|
|
1138
|
+
|
|
1139
|
+
let offset_buffer = arrow_buffer::OffsetBuffer::new(offsets.into());
|
|
1140
|
+
let null_buffer = null_buffer_builder.finish();
|
|
1141
|
+
|
|
1142
|
+
Ok(StdArc::new(MapArray::new(
|
|
1143
|
+
entries_field.clone(),
|
|
1144
|
+
offset_buffer,
|
|
1145
|
+
struct_array,
|
|
1146
|
+
Some(null_buffer.into()),
|
|
1147
|
+
false, // sorted
|
|
1148
|
+
)))
|
|
1149
|
+
}
|
|
1150
|
+
|
|
1151
|
+
/// Build struct array
|
|
1152
|
+
fn build_struct_array(values: &[&ParquetValue], fields: &arrow_schema::Fields) -> Result<ArrayRef> {
|
|
1153
|
+
let num_rows = values.len();
|
|
1154
|
+
let mut field_arrays = Vec::with_capacity(fields.len());
|
|
1155
|
+
let mut null_buffer_builder = arrow_buffer::BooleanBufferBuilder::new(num_rows);
|
|
1156
|
+
let null_value = ParquetValue::Null;
|
|
1157
|
+
|
|
1158
|
+
// Prepare columns for each field
|
|
1159
|
+
let mut field_columns: Vec<Vec<&ParquetValue>> =
|
|
1160
|
+
vec![Vec::with_capacity(num_rows); fields.len()];
|
|
1161
|
+
|
|
1162
|
+
for value in values {
|
|
1163
|
+
match *value {
|
|
1164
|
+
ParquetValue::Record(map) => {
|
|
1165
|
+
null_buffer_builder.append(true);
|
|
1166
|
+
for (idx, field) in fields.iter().enumerate() {
|
|
1167
|
+
let field_value = map.get(field.name().as_str()).unwrap_or(&null_value);
|
|
1168
|
+
field_columns[idx].push(field_value);
|
|
1169
|
+
}
|
|
1170
|
+
}
|
|
1171
|
+
ParquetValue::Null => {
|
|
1172
|
+
null_buffer_builder.append(false);
|
|
1173
|
+
for field_column in field_columns.iter_mut().take(fields.len()) {
|
|
1174
|
+
field_column.push(&null_value);
|
|
1175
|
+
}
|
|
1176
|
+
}
|
|
1177
|
+
_ => {
|
|
1178
|
+
return Err(ParquetError::Conversion(format!(
|
|
1179
|
+
"Expected Record, got {:?}",
|
|
1180
|
+
value.type_name()
|
|
1181
|
+
)))
|
|
1182
|
+
}
|
|
1183
|
+
}
|
|
1184
|
+
}
|
|
1185
|
+
|
|
1186
|
+
// Build arrays for each field
|
|
1187
|
+
for (column, field) in field_columns.iter().zip(fields.iter()) {
|
|
1188
|
+
let array = parquet_value_refs_to_arrow_array(column, field)?;
|
|
1189
|
+
field_arrays.push(array);
|
|
1190
|
+
}
|
|
1191
|
+
|
|
1192
|
+
let null_buffer = null_buffer_builder.finish();
|
|
1193
|
+
Ok(StdArc::new(StructArray::new(
|
|
1194
|
+
fields.clone(),
|
|
1195
|
+
field_arrays,
|
|
1196
|
+
Some(null_buffer.into()),
|
|
1197
|
+
)))
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_array::*;
    use parquet::basic::Type as PhysicalType;

    /// Booleans, including a null, come back unchanged after being converted
    /// to an Arrow array and read back row by row.
    #[test]
    fn test_primitive_conversion_roundtrip() {
        let field = Field::new("test", DataType::Boolean, true);
        let type_ = Type::primitive_type_builder("test", PhysicalType::BOOLEAN)
            .build()
            .unwrap();

        let inputs = vec![
            ParquetValue::Boolean(true),
            ParquetValue::Boolean(false),
            ParquetValue::Null,
        ];
        let array = parquet_values_to_arrow_array(&inputs, &field).unwrap();

        for (row, expected) in inputs.iter().enumerate() {
            let actual = arrow_to_parquet_value(&field, &type_, array.as_ref(), row).unwrap();
            assert_eq!(&actual, expected);
        }
    }

    /// Int8, Int16 and Int32 values are accepted for an Int64 column and keep
    /// their numeric value.
    #[test]
    fn test_integer_upcasting() {
        let field = Field::new("test", DataType::Int64, false);
        let inputs = vec![
            ParquetValue::Int8(42),
            ParquetValue::Int16(1000),
            ParquetValue::Int32(100000),
        ];

        let array = parquet_values_to_arrow_array(&inputs, &field).unwrap();
        assert_eq!(array.len(), 3);

        let int64_array = array.as_any().downcast_ref::<Int64Array>().unwrap();
        let expected: [i64; 3] = [42, 1000, 100000];
        for (row, want) in expected.iter().enumerate() {
            assert_eq!(int64_array.value(row), *want);
        }
    }
}
|