parquet 0.7.2 → 0.7.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +264 -475
- data/ext/parquet-core/src/arrow_conversion.rs +138 -15
- data/ext/parquet-core/src/reader.rs +54 -4
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +46 -4
- data/lib/parquet/version.rb +1 -1
- metadata +1 -1
data/ext/parquet-core/src/arrow_conversion.rs
CHANGED
@@ -12,11 +12,14 @@ use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
 use ordered_float::OrderedFloat;
+use parquet::basic::LogicalType;
+use parquet::schema::types::Type;
 use std::sync::Arc;
 
 /// Convert a single value from an Arrow array at the given index to a ParquetValue
 pub fn arrow_to_parquet_value(
-    field: &Field,
+    arrow_field: &Field,
+    parquet_field: &Type,
     array: &dyn Array,
     index: usize,
 ) -> Result<ParquetValue> {
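Editor's note: the signature change above means every caller now supplies the matching Parquet schema node alongside the Arrow field. A minimal sketch of the new call shape, assuming the crate is visible as `parquet_core` (as the integration tests below do); the function and column names are illustrative, not gem code:

```rust
use arrow_array::BooleanArray;
use arrow_schema::{DataType, Field};
use parquet::basic::Type as PhysicalType;
use parquet::schema::types::Type;
use parquet_core::arrow_conversion::arrow_to_parquet_value;
use parquet_core::{ParquetValue, Result};

// Hypothetical call site: callers now pass the Arrow field *and* the matching
// Parquet schema node, so the converter can consult Parquet logical types.
fn first_value() -> Result<ParquetValue> {
    let array = BooleanArray::from(vec![Some(true), None]);
    let arrow_field = Field::new("flag", DataType::Boolean, true);
    let parquet_field = Type::primitive_type_builder("flag", PhysicalType::BOOLEAN)
        .build()
        .expect("valid primitive type");
    arrow_to_parquet_value(&arrow_field, &parquet_field, &array, 0)
}
```

This mirrors what the updated `test_primitive_conversion_roundtrip` at the bottom of this file does with `Type::primitive_type_builder`.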
@@ -91,13 +94,20 @@ pub fn arrow_to_parquet_value(
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
             let value = array.value(index);
-            match field.try_extension_type::<ArrowUuid>() {
-                Ok(_) => {
-                    let uuid = uuid::Uuid::from_slice(value)
-                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
-                    Ok(ParquetValue::Uuid(uuid))
+            if let Some(LogicalType::Uuid) = parquet_field.get_basic_info().logical_type() {
+                let uuid = uuid::Uuid::from_slice(value)
+                    .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                Ok(ParquetValue::Uuid(uuid))
+            } else {
+                match arrow_field.try_extension_type::<ArrowUuid>() {
+                    Ok(_) => {
+                        let uuid = uuid::Uuid::from_slice(value).map_err(|e| {
+                            ParquetError::Conversion(format!("Invalid UUID: {}", e))
+                        })?;
+                        Ok(ParquetValue::Uuid(uuid))
+                    }
+                    Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
                 }
-                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
             }
         }
 
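The branch above now prefers the Parquet logical type over the Arrow extension type when deciding whether fixed-size binary data is a UUID. A sketch of a schema node that takes the first path; the column name `id` is an assumption:

```rust
use parquet::basic::{LogicalType, Type as PhysicalType};
use parquet::schema::types::Type;

// A 16-byte FIXED_LEN_BYTE_ARRAY annotated as UUID. Passed as `parquet_field`,
// the conversion yields ParquetValue::Uuid even when the Arrow field carries
// no arrow.uuid extension metadata.
fn uuid_schema_node() -> Type {
    Type::primitive_type_builder("id", PhysicalType::FIXED_LEN_BYTE_ARRAY)
        .with_length(16)
        .with_logical_type(Some(LogicalType::Uuid))
        .build()
        .expect("valid UUID schema node")
}
```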
@@ -192,8 +202,43 @@ pub fn arrow_to_parquet_value(
             let list_values = array.value(index);
 
             let mut values = Vec::with_capacity(list_values.len());
+
+            // Get the list's element type from parquet schema
+            let element_type = match parquet_field {
+                parquet::schema::types::Type::GroupType { fields, .. } => {
+                    // List has a repeated group containing the element
+                    // The structure is: LIST -> repeated group -> element
+                    if let Some(repeated_group) = fields.first() {
+                        match repeated_group.as_ref() {
+                            parquet::schema::types::Type::GroupType {
+                                fields: inner_fields,
+                                ..
+                            } => {
+                                // This is the repeated group, get the actual element
+                                inner_fields.first().ok_or_else(|| {
+                                    ParquetError::Conversion(
+                                        "List repeated group missing element field".to_string(),
+                                    )
+                                })?
+                            }
+                            _ => repeated_group, // If it's not a group, use it directly
+                        }
+                    } else {
+                        return Err(ParquetError::Conversion(
+                            "List type missing fields".to_string(),
+                        ));
+                    }
+                }
+                _ => parquet_field, // Fallback for cases where it's not a proper list structure
+            };
+
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
+                values.push(arrow_to_parquet_value(
+                    item_field,
+                    element_type,
+                    &list_values,
+                    i,
+                )?);
             }
 
             Ok(ParquetValue::List(values))
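For orientation, the lookup above walks the standard three-level LIST layout: an annotated LIST group containing a repeated group, which holds the element. A hedged sketch of such a node built with the same builder API the tests use; the names `scores`, `list`, and `element` are assumptions:

```rust
use std::sync::Arc;
use parquet::basic::{LogicalType, Repetition, Type as PhysicalType};
use parquet::schema::types::Type;

// optional group scores (LIST) { repeated group list { optional int32 element } }
fn list_schema_node() -> Type {
    let element = Type::primitive_type_builder("element", PhysicalType::INT32)
        .with_repetition(Repetition::OPTIONAL)
        .build()
        .unwrap();
    let repeated = Type::group_type_builder("list")
        .with_repetition(Repetition::REPEATED)
        .with_fields(vec![Arc::new(element)])
        .build()
        .unwrap();
    Type::group_type_builder("scores")
        .with_repetition(Repetition::OPTIONAL)
        .with_logical_type(Some(LogicalType::List))
        .with_fields(vec![Arc::new(repeated)])
        .build()
        .unwrap()
}
```

The `_ => repeated_group` arm appears to cover legacy two-level lists, where the repeated node is itself the element.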
@@ -210,7 +255,7 @@ pub fn arrow_to_parquet_value(
                 .fields()
                 .iter()
                 .find(|f| f.name() == "key")
-                .ok_or_else(|| ParquetError::Conversion("No
+                .ok_or_else(|| ParquetError::Conversion("No key field found".to_string()))?;
 
             let value_field = map_value
                 .fields()
@@ -219,9 +264,59 @@ pub fn arrow_to_parquet_value(
                 .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
 
             let mut map_vec = Vec::with_capacity(keys.len());
+
+            // Get key and value types from parquet schema
+            // Map structure is: MAP -> key_value (repeated group) -> key, value
+            let (key_type, value_type) = match parquet_field {
+                parquet::schema::types::Type::GroupType { fields, .. } => {
+                    // Get the key_value repeated group
+                    match fields.first() {
+                        Some(key_value_group) => match key_value_group.as_ref() {
+                            parquet::schema::types::Type::GroupType {
+                                fields: kv_fields, ..
+                            } => {
+                                // Find key and value fields by name
+                                let key_field = kv_fields
+                                    .iter()
+                                    .find(|f| f.name() == "key")
+                                    .ok_or_else(|| {
+                                        ParquetError::Conversion(
+                                            "Map missing key field".to_string(),
+                                        )
+                                    })?;
+                                let value_field = kv_fields
+                                    .iter()
+                                    .find(|f| f.name() == "value")
+                                    .ok_or_else(|| {
+                                        ParquetError::Conversion(
+                                            "Map missing value field".to_string(),
+                                        )
+                                    })?;
+                                (key_field.as_ref(), value_field.as_ref())
+                            }
+                            _ => {
+                                return Err(ParquetError::Conversion(
+                                    "Map key_value should be a group".to_string(),
+                                ))
+                            }
+                        },
+                        None => {
+                            return Err(ParquetError::Conversion(
+                                "Map type missing key_value field".to_string(),
+                            ))
+                        }
+                    }
+                }
+                _ => {
+                    return Err(ParquetError::Conversion(
+                        "Map type must be a group".to_string(),
+                    ))
+                }
+            };
+
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(key_field, keys, i)?;
-                let value = arrow_to_parquet_value(value_field, values, i)?;
+                let key = arrow_to_parquet_value(key_field, key_type, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, value_type, values, i)?;
                 map_vec.push((key, value));
             }
 
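The corresponding MAP layout the code walks is `MAP group -> repeated key_value group -> key, value`. A sketch with assumed names and physical types (`props`, string key, int32 value):

```rust
use std::sync::Arc;
use parquet::basic::{LogicalType, Repetition, Type as PhysicalType};
use parquet::schema::types::Type;

// optional group props (MAP) {
//     repeated group key_value { required byte_array key (STRING); optional int32 value }
// }
fn map_schema_node() -> Type {
    let key = Type::primitive_type_builder("key", PhysicalType::BYTE_ARRAY)
        .with_repetition(Repetition::REQUIRED)
        .with_logical_type(Some(LogicalType::String))
        .build()
        .unwrap();
    let value = Type::primitive_type_builder("value", PhysicalType::INT32)
        .with_repetition(Repetition::OPTIONAL)
        .build()
        .unwrap();
    let key_value = Type::group_type_builder("key_value")
        .with_repetition(Repetition::REPEATED)
        .with_fields(vec![Arc::new(key), Arc::new(value)])
        .build()
        .unwrap();
    Type::group_type_builder("props")
        .with_repetition(Repetition::OPTIONAL)
        .with_logical_type(Some(LogicalType::Map))
        .with_fields(vec![Arc::new(key_value)])
        .build()
        .unwrap()
}
```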
@@ -231,10 +326,34 @@ pub fn arrow_to_parquet_value(
             let array = downcast_array::<StructArray>(array)?;
 
             let mut map = IndexMap::new();
-            for (col_idx, field) in array.fields().iter().enumerate() {
+
+            // Get struct fields from parquet schema
+            let parquet_fields = match parquet_field {
+                parquet::schema::types::Type::GroupType { fields, .. } => fields,
+                _ => {
+                    return Err(ParquetError::Conversion(
+                        "Struct type must be a group".to_string(),
+                    ))
+                }
+            };
+
+            for (col_idx, arrow_field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(field, column, index)?;
-                map.insert(Arc::from(field.name().as_str()), value);
+
+                // Find matching parquet field by name
+                let nested_parquet_field = parquet_fields
+                    .iter()
+                    .find(|f| f.name() == arrow_field.name())
+                    .ok_or_else(|| {
+                        ParquetError::Conversion(format!(
+                            "No matching parquet field for struct field '{}'",
+                            arrow_field.name()
+                        ))
+                    })?;
+
+                let value =
+                    arrow_to_parquet_value(arrow_field, nested_parquet_field, column, index)?;
+                map.insert(Arc::from(arrow_field.name().as_str()), value);
             }
 
             Ok(ParquetValue::Record(map))
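Worth noting about the struct branch above: Parquet children are resolved by name rather than by position, so the Parquet group's field order does not have to match the Arrow struct's column order, and a struct field with no same-named Parquet counterpart is a hard conversion error rather than a silent fallback.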
@@ -1121,6 +1240,7 @@ pub fn append_parquet_value_to_builder(
 mod tests {
     use super::*;
     use arrow_array::*;
+    use parquet::basic::Type as PhysicalType;
 
     #[test]
     fn test_primitive_conversion_roundtrip() {
@@ -1132,9 +1252,12 @@ mod tests {
         ];
         let field = Field::new("test", DataType::Boolean, true);
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
+        let type_ = Type::primitive_type_builder("test", PhysicalType::BOOLEAN)
+            .build()
+            .unwrap();
 
         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, &type_, array.as_ref(), i).unwrap();
             assert_eq!(&actual, expected);
         }
     }
data/ext/parquet-core/src/reader.rs
CHANGED
@@ -1,10 +1,10 @@
 //! Core Parquet reading functionality
 
-use crate::{arrow_conversion::arrow_to_parquet_value, ParquetValue, Result};
+use crate::{arrow_conversion::arrow_to_parquet_value, ParquetError, ParquetValue, Result};
 use arrow::record_batch::RecordBatch;
 use arrow_array::Array;
 use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
-use parquet::file::metadata::FileMetaData;
+use parquet::file::metadata::{FileMetaData, ParquetMetaData};
 use std::sync::Arc;
 
 /// Core Parquet reader that works with any source implementing Read + Seek
@@ -33,10 +33,12 @@ where
     /// Returns an iterator over rows where each row is a vector of ParquetValues
     pub fn read_rows(self) -> Result<RowIterator<R>> {
         let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;
 
         Ok(RowIterator {
             batch_reader: reader,
+            metadata,
             current_batch: None,
             current_row: 0,
             _phantom: std::marker::PhantomData,
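Editor's note on the pattern this and the next few hunks repeat: `build()` consumes the builder, so the `Arc<ParquetMetaData>` is cloned out first and stored on the iterator, where it later supplies the schema descriptor and root group. A condensed sketch of that wiring under assumed names (`schema_root_demo` is illustrative, not gem code):

```rust
use std::fs::File;
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;

fn schema_root_demo(file: File) -> parquet::errors::Result<()> {
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // Clone the Arc<ParquetMetaData> before build() consumes the builder.
    let metadata = builder.metadata().clone();
    let _reader = builder.build()?;
    // The root group has one child per top-level column; nested types keep
    // their full group structure here, unlike the flattened leaf view.
    let schema_descr = metadata.file_metadata().schema_descr_ptr();
    let root = schema_descr.root_schema();
    println!("top-level fields: {}", root.get_fields().len());
    Ok(())
}
```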
@@ -64,10 +66,12 @@ where
 
         let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
         builder = builder.with_projection(mask);
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;
 
         Ok(RowIterator {
             batch_reader: reader,
+            metadata,
             current_batch: None,
             current_row: 0,
             _phantom: std::marker::PhantomData,
@@ -88,10 +92,12 @@ where
         }
 
         let schema = builder.schema().clone();
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;
 
         Ok(ColumnIterator {
             batch_reader: reader,
+            metadata,
             schema,
             returned_empty_batch: false,
             is_empty_file: is_empty,
@@ -129,10 +135,12 @@ where
         }
 
         let schema = builder.schema().clone();
+        let metadata = builder.metadata().clone();
         let reader = builder.build()?;
 
         Ok(ColumnIterator {
             batch_reader: reader,
+            metadata,
             schema,
             returned_empty_batch: false,
             is_empty_file: is_empty,
@@ -144,6 +152,7 @@ where
 /// Iterator over rows in a Parquet file
 pub struct RowIterator<R> {
     batch_reader: ParquetRecordBatchReader,
+    metadata: Arc<ParquetMetaData>,
     current_batch: Option<RecordBatch>,
     current_row: usize,
     _phantom: std::marker::PhantomData<R>,
@@ -156,6 +165,7 @@ where
     type Item = Result<Vec<ParquetValue>>;
 
     fn next(&mut self) -> Option<Self::Item> {
+        let schema_descriptor = self.metadata.file_metadata().schema_descr_ptr();
         loop {
             // If we have a current batch and haven't exhausted it
             if let Some(ref batch) = self.current_batch {
@@ -164,9 +174,31 @@ where
                 let mut row_values = Vec::with_capacity(batch.num_columns());
 
                 let schema = batch.schema();
+
+                let root_schema = schema_descriptor.root_schema();
+                let parquet_fields = match root_schema {
+                    parquet::schema::types::Type::GroupType { fields, .. } => fields,
+                    _ => {
+                        return Some(Err(ParquetError::Conversion(
+                            "Root schema must be a group type".to_string(),
+                        )))
+                    }
+                };
+
                 for (i, column) in batch.columns().iter().enumerate() {
                     let field = schema.field(i);
-                    let value = match arrow_to_parquet_value(field, column, self.current_row) {
+                    let parquet_field = if i < parquet_fields.len() {
+                        parquet_fields[i].clone()
+                    } else {
+                        // Fallback to leaf column if index out of bounds
+                        schema_descriptor.column(i).self_type_ptr()
+                    };
+                    let value = match arrow_to_parquet_value(
+                        field,
+                        &parquet_field,
+                        column,
+                        self.current_row,
+                    ) {
                         Ok(v) => v,
                         Err(e) => return Some(Err(e)),
                     };
@@ -194,6 +226,7 @@ where
 /// Iterator over column batches in a Parquet file
 pub struct ColumnIterator<R> {
     batch_reader: ParquetRecordBatchReader,
+    metadata: Arc<ParquetMetaData>,
     schema: Arc<arrow_schema::Schema>,
     returned_empty_batch: bool,
     is_empty_file: bool,
@@ -228,6 +261,17 @@ where
         match self.batch_reader.next() {
             Some(Ok(batch)) => {
                 let mut columns = Vec::with_capacity(batch.num_columns());
+                let schema_descriptor = self.metadata.file_metadata().schema_descr_ptr();
+
+                let root_schema = schema_descriptor.root_schema();
+                let parquet_fields = match root_schema {
+                    parquet::schema::types::Type::GroupType { fields, .. } => fields,
+                    _ => {
+                        return Some(Err(ParquetError::Conversion(
+                            "Root schema must be a group type".to_string(),
+                        )))
+                    }
+                };
 
                 for (idx, column) in batch.columns().iter().enumerate() {
                     let field = self.schema.field(idx);
@@ -236,7 +280,13 @@ where
                     // Convert entire column to ParquetValues
                     let mut values = Vec::with_capacity(column.len());
                     for row_idx in 0..column.len() {
-                        match arrow_to_parquet_value(field, column, row_idx) {
+                        let parquet_field = if idx < parquet_fields.len() {
+                            parquet_fields[idx].clone()
+                        } else {
+                            // Fallback to leaf column if index out of bounds
+                            schema_descriptor.column(idx).self_type_ptr()
+                        };
+                        match arrow_to_parquet_value(field, &parquet_field, column, row_idx) {
                             Ok(value) => values.push(value),
                             Err(e) => return Some(Err(e)),
                         }
data/ext/parquet-core/tests/arrow_conversion_tests.rs
CHANGED
@@ -3,6 +3,7 @@ use arrow_schema::{DataType, Field, TimeUnit};
 use bytes::Bytes;
 use num::BigInt;
 use ordered_float::OrderedFloat;
+use parquet::schema::types::Type;
 use parquet_core::arrow_conversion::{arrow_to_parquet_value, parquet_values_to_arrow_array};
 use parquet_core::*;
 use std::sync::Arc;
@@ -99,7 +100,19 @@ fn test_decimal256_large_values() {
 
     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+        // Create a dummy parquet type for testing
+        let parquet_type =
+            Type::primitive_type_builder("test", parquet::basic::Type::FIXED_LEN_BYTE_ARRAY)
+                .with_length(32)
+                .with_precision(76)
+                .with_scale(0)
+                .with_logical_type(Some(parquet::basic::LogicalType::Decimal {
+                    scale: 0,
+                    precision: 76,
+                }))
+                .build()
+                .unwrap();
+        let value = arrow_to_parquet_value(&field, &parquet_type, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +186,15 @@ fn test_timestamp_with_timezone() {
 
     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+        // Create a dummy parquet type for testing
+        let parquet_type = Type::primitive_type_builder("test", parquet::basic::Type::INT64)
+            .with_logical_type(Some(parquet::basic::LogicalType::Timestamp {
+                is_adjusted_to_u_t_c: true,
+                unit: parquet::basic::TimeUnit::MILLIS(Default::default()),
+            }))
+            .build()
+            .unwrap();
+        let value = arrow_to_parquet_value(&field, &parquet_type, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +230,19 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);
 
     // Verify roundtrip
-    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
+    // Create a dummy parquet type for testing - a list of list of int32
+    let int_type = Type::primitive_type_builder("item", parquet::basic::Type::INT32)
+        .build()
+        .unwrap();
+    let inner_list = Type::group_type_builder("inner_list")
+        .with_fields(vec![Arc::new(int_type)])
+        .build()
+        .unwrap();
+    let parquet_type = Type::group_type_builder("outer_list")
+        .with_fields(vec![Arc::new(inner_list)])
+        .build()
+        .unwrap();
+    let value = arrow_to_parquet_value(&outer_field, &parquet_type, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +390,16 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();
 
-    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
+    // Create a dummy parquet type for testing
+    let parquet_type = Type::primitive_type_builder("int", parquet::basic::Type::INT32)
+        .build()
+        .unwrap();
+    let result = arrow_to_parquet_value(
+        &Field::new("int", DataType::Int32, false),
+        &parquet_type,
+        &array,
+        0,
+    );
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
data/lib/parquet/version.rb
CHANGED
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.7.2"
+  VERSION = "0.7.3"
 end