parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
use thiserror::Error;
|
|
2
|
+
|
|
3
|
+
/// Core error type for Parquet operations
|
|
4
|
+
#[derive(Error, Debug)]
|
|
5
|
+
pub enum ParquetError {
|
|
6
|
+
/// IO errors from file operations
|
|
7
|
+
#[error("IO error: {0}")]
|
|
8
|
+
Io(#[from] std::io::Error),
|
|
9
|
+
|
|
10
|
+
/// Arrow errors from Arrow operations
|
|
11
|
+
#[error("Arrow error: {0}")]
|
|
12
|
+
Arrow(#[from] arrow_schema::ArrowError),
|
|
13
|
+
|
|
14
|
+
/// Parquet format errors
|
|
15
|
+
#[error("Parquet error: {0}")]
|
|
16
|
+
Parquet(#[from] parquet::errors::ParquetError),
|
|
17
|
+
|
|
18
|
+
/// Schema-related errors
|
|
19
|
+
#[error("Schema error: {0}")]
|
|
20
|
+
Schema(String),
|
|
21
|
+
|
|
22
|
+
/// Type conversion errors
|
|
23
|
+
#[error("Conversion error: {0}")]
|
|
24
|
+
Conversion(String),
|
|
25
|
+
|
|
26
|
+
/// Invalid argument errors
|
|
27
|
+
#[error("Invalid argument: {0}")]
|
|
28
|
+
InvalidArgument(String),
|
|
29
|
+
|
|
30
|
+
/// Data validation errors
|
|
31
|
+
#[error("Data validation error: {0}")]
|
|
32
|
+
DataValidation(String),
|
|
33
|
+
|
|
34
|
+
/// Unsupported operation errors
|
|
35
|
+
#[error("Unsupported operation: {0}")]
|
|
36
|
+
Unsupported(String),
|
|
37
|
+
|
|
38
|
+
/// Internal errors that shouldn't happen
|
|
39
|
+
#[error("Internal error: {0}")]
|
|
40
|
+
Internal(String),
|
|
41
|
+
|
|
42
|
+
/// UTF-8 decoding errors
|
|
43
|
+
#[error("UTF-8 error: {0}")]
|
|
44
|
+
Utf8(#[from] std::str::Utf8Error),
|
|
45
|
+
|
|
46
|
+
/// Number parsing errors
|
|
47
|
+
#[error("Parse error: {0}")]
|
|
48
|
+
ParseInt(#[from] std::num::ParseIntError),
|
|
49
|
+
|
|
50
|
+
/// Float parsing errors
|
|
51
|
+
#[error("Parse float error: {0}")]
|
|
52
|
+
ParseFloat(#[from] std::num::ParseFloatError),
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/// Result type alias for Parquet operations
|
|
56
|
+
pub type Result<T> = std::result::Result<T, ParquetError>;
|
|
57
|
+
|
|
58
|
+
impl ParquetError {
|
|
59
|
+
/// Create a new schema error
|
|
60
|
+
pub fn schema<S: Into<String>>(msg: S) -> Self {
|
|
61
|
+
ParquetError::Schema(msg.into())
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/// Create a new conversion error
|
|
65
|
+
pub fn conversion<S: Into<String>>(msg: S) -> Self {
|
|
66
|
+
ParquetError::Conversion(msg.into())
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
/// Create a new invalid argument error
|
|
70
|
+
pub fn invalid_argument<S: Into<String>>(msg: S) -> Self {
|
|
71
|
+
ParquetError::InvalidArgument(msg.into())
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
/// Create a new data validation error
|
|
75
|
+
pub fn data_validation<S: Into<String>>(msg: S) -> Self {
|
|
76
|
+
ParquetError::DataValidation(msg.into())
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/// Create a new unsupported operation error
|
|
80
|
+
pub fn unsupported<S: Into<String>>(msg: S) -> Self {
|
|
81
|
+
ParquetError::Unsupported(msg.into())
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
/// Create a new internal error
|
|
85
|
+
pub fn internal<S: Into<String>>(msg: S) -> Self {
|
|
86
|
+
ParquetError::Internal(msg.into())
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
fn with_context_message(self, ctx: String) -> Self {
|
|
90
|
+
match self {
|
|
91
|
+
ParquetError::Io(error) => ParquetError::Io(std::io::Error::new(
|
|
92
|
+
error.kind(),
|
|
93
|
+
format!("{}: {}", ctx, error),
|
|
94
|
+
)),
|
|
95
|
+
ParquetError::Schema(message) => ParquetError::Schema(format!("{}: {}", ctx, message)),
|
|
96
|
+
ParquetError::Conversion(message) => {
|
|
97
|
+
ParquetError::Conversion(format!("{}: {}", ctx, message))
|
|
98
|
+
}
|
|
99
|
+
ParquetError::InvalidArgument(message) => {
|
|
100
|
+
ParquetError::InvalidArgument(format!("{}: {}", ctx, message))
|
|
101
|
+
}
|
|
102
|
+
ParquetError::DataValidation(message) => {
|
|
103
|
+
ParquetError::DataValidation(format!("{}: {}", ctx, message))
|
|
104
|
+
}
|
|
105
|
+
ParquetError::Unsupported(message) => {
|
|
106
|
+
ParquetError::Unsupported(format!("{}: {}", ctx, message))
|
|
107
|
+
}
|
|
108
|
+
ParquetError::Internal(message) => {
|
|
109
|
+
ParquetError::Internal(format!("{}: {}", ctx, message))
|
|
110
|
+
}
|
|
111
|
+
error => ParquetError::Internal(format!("{}: {}", ctx, error)),
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/// Extension trait to add context to errors
|
|
117
|
+
pub trait ErrorContext<T> {
|
|
118
|
+
/// Add context to an error
|
|
119
|
+
fn context<S: Into<String>>(self, ctx: S) -> Result<T>;
|
|
120
|
+
|
|
121
|
+
/// Add context with a closure that's only called on error
|
|
122
|
+
fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T>;
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
impl<T, E> ErrorContext<T> for std::result::Result<T, E>
|
|
126
|
+
where
|
|
127
|
+
E: Into<ParquetError>,
|
|
128
|
+
{
|
|
129
|
+
fn context<S: Into<String>>(self, ctx: S) -> Result<T> {
|
|
130
|
+
self.map_err(|e| {
|
|
131
|
+
let base_error = e.into();
|
|
132
|
+
base_error.with_context_message(ctx.into())
|
|
133
|
+
})
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T> {
|
|
137
|
+
self.map_err(|e| {
|
|
138
|
+
let base_error = e.into();
|
|
139
|
+
base_error.with_context_message(f().into())
|
|
140
|
+
})
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructor helpers render with their variant's message prefix.
    #[test]
    fn test_error_creation() {
        let schema_error = ParquetError::schema("Invalid schema");
        assert_eq!(schema_error.to_string(), "Schema error: Invalid schema");

        let conversion_error = ParquetError::conversion("Cannot convert value");
        assert_eq!(
            conversion_error.to_string(),
            "Conversion error: Cannot convert value"
        );
    }

    /// `std::io::Error` converts into the `Io` variant.
    #[test]
    fn test_error_from_io() {
        let source = std::io::Error::new(std::io::ErrorKind::NotFound, "File not found");
        let rendered = ParquetError::from(source).to_string();
        assert!(rendered.contains("IO error"));
    }

    /// `context` adds the given message to the rendered error.
    #[test]
    fn test_error_context() {
        let failed: Result<()> = Err(ParquetError::invalid_argument("bad input"));

        let outcome = failed.context("During file read");
        assert!(outcome.is_err());

        let rendered = outcome.unwrap_err().to_string();
        assert!(rendered.contains("During file read"));
    }

    /// `with_context` adds the lazily built message to the rendered error.
    #[test]
    fn test_error_with_context() {
        let failed: Result<()> = Err(ParquetError::data_validation("Invalid data"));
        let filename = "test.parquet";

        let outcome = failed.with_context(|| format!("Processing file: {}", filename));
        assert!(outcome.is_err());

        let rendered = outcome.unwrap_err().to_string();
        assert!(rendered.contains("Processing file: test.parquet"));
    }
}
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
//! Language-agnostic core functionality for Parquet operations
|
|
2
|
+
//!
|
|
3
|
+
//! `parquet-core` provides core Parquet functionality that can be reused
|
|
4
|
+
//! across different language integrations. It wraps the Apache parquet-rs
|
|
5
|
+
//! crate with a simplified API focused on common use cases.
|
|
6
|
+
//!
|
|
7
|
+
//! # Key Components
|
|
8
|
+
//!
|
|
9
|
+
//! - **Reader**: High-performance Parquet file reader
|
|
10
|
+
//! - Row-wise iteration through [`reader::Reader`]
|
|
11
|
+
//! - Column-wise batch reading for analytics workloads
|
|
12
|
+
//! - Uses `parquet::file::reader::ChunkReader` for flexible input sources
|
|
13
|
+
//!
|
|
14
|
+
//! - **Writer**: Efficient Parquet file writer
|
|
15
|
+
//! - Supports both row and columnar data input
|
|
16
|
+
//! - Configurable compression and encoding options
|
|
17
|
+
//! - Dynamic batch sizing based on memory usage
|
|
18
|
+
//! - Uses `std::io::Write + Send` for output flexibility
|
|
19
|
+
//!
|
|
20
|
+
//! - **Schema**: Type-safe schema representation
|
|
21
|
+
//! - Builder API for constructing schemas
|
|
22
|
+
//! - Support for nested types (structs, lists, maps)
|
|
23
|
+
//! - Schema introspection through the [`traits::SchemaInspector`] trait
|
|
24
|
+
//!
|
|
25
|
+
//! - **Values**: Core value types without external dependencies
|
|
26
|
+
//! - All Parquet primitive types
|
|
27
|
+
//! - Decimal support (128 and 256 bit)
|
|
28
|
+
//! - Temporal types (dates, times, timestamps)
|
|
29
|
+
//!
|
|
30
|
+
//! - **Arrow Conversion**: Bidirectional conversion between Arrow and Parquet
|
|
31
|
+
//! - Zero-copy where possible
|
|
32
|
+
//! - Handles all supported types including nested structures
|
|
33
|
+
//!
|
|
34
|
+
//! # Design Philosophy
|
|
35
|
+
//!
|
|
36
|
+
//! This crate focuses on providing concrete implementations rather than
|
|
37
|
+
//! abstract traits. Language-specific adapters (like `parquet-ruby-adapter`)
|
|
38
|
+
//! handle the translation between language types and Parquet values.
|
|
39
|
+
//!
|
|
40
|
+
//! # Example Usage
|
|
41
|
+
//!
|
|
42
|
+
//! This crate is designed to be used through language-specific adapters.
|
|
43
|
+
//! See `parquet-ruby-adapter` for Ruby integration.
|
|
44
|
+
|
|
45
|
+
pub mod arrow_conversion;
|
|
46
|
+
pub mod error;
|
|
47
|
+
pub mod reader;
|
|
48
|
+
pub mod schema;
|
|
49
|
+
pub mod traits;
|
|
50
|
+
pub mod value;
|
|
51
|
+
pub mod writer;
|
|
52
|
+
|
|
53
|
+
#[cfg(test)]
|
|
54
|
+
pub mod test_utils;
|
|
55
|
+
|
|
56
|
+
pub use error::{ErrorContext, ParquetError, Result};
|
|
57
|
+
pub use reader::Reader;
|
|
58
|
+
pub use schema::{PrimitiveType, Repetition, Schema, SchemaBuilder, SchemaNode};
|
|
59
|
+
pub use value::ParquetValue;
|
|
60
|
+
pub use writer::{Writer, WriterBuilder, MAX_BATCH_SIZE, MAX_SAMPLE_SIZE};
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
//! Core Parquet reading functionality
|
|
2
|
+
|
|
3
|
+
use crate::{arrow_conversion::arrow_to_parquet_value, ParquetError, ParquetValue, Result};
|
|
4
|
+
use arrow::record_batch::RecordBatch;
|
|
5
|
+
use arrow_array::Array;
|
|
6
|
+
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
|
|
7
|
+
use parquet::file::metadata::{FileMetaData, ParquetMetaData};
|
|
8
|
+
use parquet::schema::types::{Type, TypePtr};
|
|
9
|
+
use std::{
|
|
10
|
+
collections::{HashMap, HashSet},
|
|
11
|
+
sync::Arc,
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
/// Core Parquet reader wrapping an input source `R`.
///
/// The reading methods are available when `R` implements
/// `parquet::file::reader::ChunkReader + Clone + 'static`.
#[derive(Clone)]
pub struct Reader<R> {
    /// The underlying source handed to the parquet reader builder.
    inner: R,
}
|
|
19
|
+
|
|
20
|
+
impl<R> Reader<R>
|
|
21
|
+
where
|
|
22
|
+
R: parquet::file::reader::ChunkReader + Clone + 'static,
|
|
23
|
+
{
|
|
24
|
+
/// Create a new reader
|
|
25
|
+
pub fn new(reader: R) -> Self {
|
|
26
|
+
Self { inner: reader }
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/// Get the Parquet file metadata
|
|
30
|
+
pub fn metadata(&mut self) -> Result<FileMetaData> {
|
|
31
|
+
let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner.clone())?;
|
|
32
|
+
Ok(builder.metadata().file_metadata().clone())
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/// Read rows from the Parquet file
|
|
36
|
+
///
|
|
37
|
+
/// Returns an iterator over rows where each row is a vector of ParquetValues
|
|
38
|
+
pub fn read_rows(self) -> Result<RowIterator<R>> {
|
|
39
|
+
let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
|
|
40
|
+
let schema = builder.schema().clone();
|
|
41
|
+
let metadata = builder.metadata().clone();
|
|
42
|
+
let aligned_parquet_fields = build_alignment(&schema, &metadata)?;
|
|
43
|
+
let reader = builder.build()?;
|
|
44
|
+
|
|
45
|
+
Ok(RowIterator {
|
|
46
|
+
batch_reader: reader,
|
|
47
|
+
schema,
|
|
48
|
+
current_batch: None,
|
|
49
|
+
current_row: 0,
|
|
50
|
+
aligned_parquet_fields,
|
|
51
|
+
_phantom: std::marker::PhantomData,
|
|
52
|
+
})
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/// Read rows with column projection
|
|
56
|
+
///
|
|
57
|
+
/// Only the specified columns will be read, which can significantly
|
|
58
|
+
/// improve performance for wide tables. Projected row values are returned
|
|
59
|
+
/// in file schema order, not request order.
|
|
60
|
+
pub fn read_rows_with_projection(self, columns: &[String]) -> Result<RowIterator<R>> {
|
|
61
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
|
|
62
|
+
let arrow_schema = builder.schema();
|
|
63
|
+
let requested_columns = columns.iter().map(String::as_str).collect::<HashSet<_>>();
|
|
64
|
+
|
|
65
|
+
// Create projection mask based on column names
|
|
66
|
+
let mut column_indices = Vec::new();
|
|
67
|
+
for (idx, field) in arrow_schema.fields().iter().enumerate() {
|
|
68
|
+
if requested_columns.contains(field.name().as_str()) {
|
|
69
|
+
column_indices.push(idx);
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
// The projected batches are emitted in file order over the selected
|
|
73
|
+
// columns; build that schema so alignment and field access match.
|
|
74
|
+
let projected_schema = Arc::new(arrow_schema::Schema::new(
|
|
75
|
+
column_indices
|
|
76
|
+
.iter()
|
|
77
|
+
.map(|idx| arrow_schema.field(*idx).clone())
|
|
78
|
+
.collect::<Vec<_>>(),
|
|
79
|
+
));
|
|
80
|
+
|
|
81
|
+
// Allow empty column projections to match v1 behavior
|
|
82
|
+
// This will result in rows with no fields
|
|
83
|
+
|
|
84
|
+
let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
|
|
85
|
+
builder = builder.with_projection(mask);
|
|
86
|
+
let metadata = builder.metadata().clone();
|
|
87
|
+
let aligned_parquet_fields = build_alignment(&projected_schema, &metadata)?;
|
|
88
|
+
let reader = builder.build()?;
|
|
89
|
+
|
|
90
|
+
Ok(RowIterator {
|
|
91
|
+
batch_reader: reader,
|
|
92
|
+
schema: projected_schema,
|
|
93
|
+
current_batch: None,
|
|
94
|
+
current_row: 0,
|
|
95
|
+
aligned_parquet_fields,
|
|
96
|
+
_phantom: std::marker::PhantomData,
|
|
97
|
+
})
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
/// Read columns from the Parquet file
|
|
101
|
+
///
|
|
102
|
+
/// Returns an iterator over column batches where each batch contains
|
|
103
|
+
/// arrays of values for each column.
|
|
104
|
+
pub fn read_columns(self, batch_size: Option<usize>) -> Result<ColumnIterator<R>> {
|
|
105
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
|
|
106
|
+
|
|
107
|
+
let is_empty = builder.metadata().file_metadata().num_rows() == 0;
|
|
108
|
+
|
|
109
|
+
if let Some(size) = batch_size {
|
|
110
|
+
builder = builder.with_batch_size(size);
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
let schema = builder.schema().clone();
|
|
114
|
+
let metadata = builder.metadata().clone();
|
|
115
|
+
let aligned_parquet_fields = build_alignment(&schema, &metadata)?;
|
|
116
|
+
let reader = builder.build()?;
|
|
117
|
+
|
|
118
|
+
Ok(ColumnIterator {
|
|
119
|
+
batch_reader: reader,
|
|
120
|
+
schema,
|
|
121
|
+
returned_empty_batch: false,
|
|
122
|
+
is_empty_file: is_empty,
|
|
123
|
+
aligned_parquet_fields,
|
|
124
|
+
_phantom: std::marker::PhantomData,
|
|
125
|
+
})
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
/// Read columns with projection
|
|
129
|
+
pub fn read_columns_with_projection(
|
|
130
|
+
self,
|
|
131
|
+
columns: &[String],
|
|
132
|
+
batch_size: Option<usize>,
|
|
133
|
+
) -> Result<ColumnIterator<R>> {
|
|
134
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
|
|
135
|
+
let arrow_schema = builder.schema();
|
|
136
|
+
let requested_columns = columns.iter().map(String::as_str).collect::<HashSet<_>>();
|
|
137
|
+
|
|
138
|
+
let is_empty = builder.metadata().file_metadata().num_rows() == 0;
|
|
139
|
+
|
|
140
|
+
// Create projection mask
|
|
141
|
+
let mut column_indices = Vec::new();
|
|
142
|
+
for (idx, field) in arrow_schema.fields().iter().enumerate() {
|
|
143
|
+
if requested_columns.contains(field.name().as_str()) {
|
|
144
|
+
column_indices.push(idx);
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
let projected_schema = Arc::new(arrow_schema::Schema::new(
|
|
148
|
+
column_indices
|
|
149
|
+
.iter()
|
|
150
|
+
.map(|idx| arrow_schema.field(*idx).clone())
|
|
151
|
+
.collect::<Vec<_>>(),
|
|
152
|
+
));
|
|
153
|
+
|
|
154
|
+
// Allow empty column projections to match v1 behavior
|
|
155
|
+
// This will result in rows with no fields
|
|
156
|
+
|
|
157
|
+
let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
|
|
158
|
+
builder = builder.with_projection(mask);
|
|
159
|
+
|
|
160
|
+
if let Some(size) = batch_size {
|
|
161
|
+
builder = builder.with_batch_size(size);
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
let metadata = builder.metadata().clone();
|
|
165
|
+
let aligned_parquet_fields = build_alignment(&projected_schema, &metadata)?;
|
|
166
|
+
let reader = builder.build()?;
|
|
167
|
+
|
|
168
|
+
Ok(ColumnIterator {
|
|
169
|
+
batch_reader: reader,
|
|
170
|
+
schema: projected_schema,
|
|
171
|
+
returned_empty_batch: false,
|
|
172
|
+
is_empty_file: is_empty,
|
|
173
|
+
aligned_parquet_fields,
|
|
174
|
+
_phantom: std::marker::PhantomData,
|
|
175
|
+
})
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
/// Build a column-aligned list of parquet root fields for `arrow_schema`,
|
|
180
|
+
/// matching each arrow field to a parquet root field by name once.
|
|
181
|
+
///
|
|
182
|
+
/// The arrow schema may be a projection (a subset of the file's columns in file
|
|
183
|
+
/// order), so positional indexing into the full parquet root is wrong; we match
|
|
184
|
+
/// by name. Computed once per read and indexed by column position thereafter,
|
|
185
|
+
/// turning a per-row O(columns^2) scan into a one-time O(columns) build. If the
|
|
186
|
+
/// file has duplicate root column names the first occurrence wins, matching the
|
|
187
|
+
/// previous lookup behavior.
|
|
188
|
+
fn align_parquet_fields(
|
|
189
|
+
arrow_schema: &arrow_schema::Schema,
|
|
190
|
+
parquet_fields: &[TypePtr],
|
|
191
|
+
) -> Result<Vec<TypePtr>> {
|
|
192
|
+
let mut by_name: HashMap<&str, &TypePtr> = HashMap::with_capacity(parquet_fields.len());
|
|
193
|
+
for field in parquet_fields {
|
|
194
|
+
by_name.entry(field.name()).or_insert(field);
|
|
195
|
+
}
|
|
196
|
+
arrow_schema
|
|
197
|
+
.fields()
|
|
198
|
+
.iter()
|
|
199
|
+
.map(|field| {
|
|
200
|
+
by_name
|
|
201
|
+
.get(field.name().as_str())
|
|
202
|
+
.map(|matched| (*matched).clone())
|
|
203
|
+
.ok_or_else(|| {
|
|
204
|
+
ParquetError::Conversion(format!(
|
|
205
|
+
"No matching parquet field for arrow field '{}'",
|
|
206
|
+
field.name()
|
|
207
|
+
))
|
|
208
|
+
})
|
|
209
|
+
})
|
|
210
|
+
.collect()
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
/// Extract the parquet root group's fields from file metadata.
|
|
214
|
+
fn root_parquet_fields(metadata: &ParquetMetaData) -> Result<Vec<TypePtr>> {
|
|
215
|
+
match metadata.file_metadata().schema_descr().root_schema() {
|
|
216
|
+
Type::GroupType { fields, .. } => Ok(fields.clone()),
|
|
217
|
+
_ => Err(ParquetError::Conversion(
|
|
218
|
+
"Root schema must be a group type".to_string(),
|
|
219
|
+
)),
|
|
220
|
+
}
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
/// Compute the column-aligned parquet fields for an (output) arrow schema. The
|
|
224
|
+
/// schema is fixed for the whole read, so this is computed once at construction
|
|
225
|
+
/// and then indexed by column position for every batch and row.
|
|
226
|
+
fn build_alignment(
|
|
227
|
+
schema: &arrow_schema::Schema,
|
|
228
|
+
metadata: &ParquetMetaData,
|
|
229
|
+
) -> Result<Vec<TypePtr>> {
|
|
230
|
+
align_parquet_fields(schema, &root_parquet_fields(metadata)?)
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
/// Iterator over rows in a Parquet file
|
|
234
|
+
pub struct RowIterator<R> {
|
|
235
|
+
batch_reader: ParquetRecordBatchReader,
|
|
236
|
+
/// Output arrow schema (projected subset in file order, or the full schema),
|
|
237
|
+
/// fixed for every batch.
|
|
238
|
+
schema: Arc<arrow_schema::Schema>,
|
|
239
|
+
current_batch: Option<RecordBatch>,
|
|
240
|
+
current_row: usize,
|
|
241
|
+
/// Parquet root fields aligned to `schema` column order, computed once.
|
|
242
|
+
aligned_parquet_fields: Vec<TypePtr>,
|
|
243
|
+
_phantom: std::marker::PhantomData<R>,
|
|
244
|
+
}
|
|
245
|
+
|
|
246
|
+
impl<R> Iterator for RowIterator<R>
|
|
247
|
+
where
|
|
248
|
+
R: parquet::file::reader::ChunkReader + 'static,
|
|
249
|
+
{
|
|
250
|
+
type Item = Result<Vec<ParquetValue>>;
|
|
251
|
+
|
|
252
|
+
fn next(&mut self) -> Option<Self::Item> {
|
|
253
|
+
loop {
|
|
254
|
+
// If we have a current batch and haven't exhausted it
|
|
255
|
+
if let Some(ref batch) = self.current_batch {
|
|
256
|
+
if self.current_row < batch.num_rows() {
|
|
257
|
+
// Extract values from current row, using the column-aligned
|
|
258
|
+
// parquet fields computed once at construction.
|
|
259
|
+
let mut row_values = Vec::with_capacity(batch.num_columns());
|
|
260
|
+
|
|
261
|
+
for (i, column) in batch.columns().iter().enumerate() {
|
|
262
|
+
let field = self.schema.field(i);
|
|
263
|
+
let value = match arrow_to_parquet_value(
|
|
264
|
+
field,
|
|
265
|
+
&self.aligned_parquet_fields[i],
|
|
266
|
+
column,
|
|
267
|
+
self.current_row,
|
|
268
|
+
) {
|
|
269
|
+
Ok(v) => v,
|
|
270
|
+
Err(e) => return Some(Err(e)),
|
|
271
|
+
};
|
|
272
|
+
row_values.push(value);
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
self.current_row += 1;
|
|
276
|
+
return Some(Ok(row_values));
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
// Need to fetch next batch
|
|
281
|
+
match self.batch_reader.next() {
|
|
282
|
+
Some(Ok(batch)) => {
|
|
283
|
+
self.current_batch = Some(batch);
|
|
284
|
+
self.current_row = 0;
|
|
285
|
+
}
|
|
286
|
+
Some(Err(e)) => return Some(Err(e.into())),
|
|
287
|
+
None => return None,
|
|
288
|
+
}
|
|
289
|
+
}
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
/// Iterator over column batches in a Parquet file
|
|
294
|
+
pub struct ColumnIterator<R> {
|
|
295
|
+
batch_reader: ParquetRecordBatchReader,
|
|
296
|
+
schema: Arc<arrow_schema::Schema>,
|
|
297
|
+
returned_empty_batch: bool,
|
|
298
|
+
is_empty_file: bool,
|
|
299
|
+
/// Parquet root fields aligned to `schema` column order, computed once.
|
|
300
|
+
aligned_parquet_fields: Vec<TypePtr>,
|
|
301
|
+
_phantom: std::marker::PhantomData<R>,
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
/// A batch of columns with their names
|
|
305
|
+
pub struct ColumnBatch {
|
|
306
|
+
pub columns: Vec<(String, Vec<ParquetValue>)>,
|
|
307
|
+
}
|
|
308
|
+
|
|
309
|
+
impl<R> Iterator for ColumnIterator<R>
|
|
310
|
+
where
|
|
311
|
+
R: parquet::file::reader::ChunkReader + 'static,
|
|
312
|
+
{
|
|
313
|
+
type Item = Result<ColumnBatch>;
|
|
314
|
+
|
|
315
|
+
fn next(&mut self) -> Option<Self::Item> {
|
|
316
|
+
// Check if this is the first call and we have no data
|
|
317
|
+
if self.is_empty_file && !self.returned_empty_batch {
|
|
318
|
+
// Return one batch with empty columns to show schema
|
|
319
|
+
self.returned_empty_batch = true;
|
|
320
|
+
let mut columns = Vec::with_capacity(self.schema.fields().len());
|
|
321
|
+
|
|
322
|
+
for field in self.schema.fields() {
|
|
323
|
+
columns.push((field.name().to_string(), Vec::new()));
|
|
324
|
+
}
|
|
325
|
+
|
|
326
|
+
return Some(Ok(ColumnBatch { columns }));
|
|
327
|
+
}
|
|
328
|
+
|
|
329
|
+
match self.batch_reader.next() {
|
|
330
|
+
Some(Ok(batch)) => {
|
|
331
|
+
let mut columns = Vec::with_capacity(batch.num_columns());
|
|
332
|
+
|
|
333
|
+
for (idx, column) in batch.columns().iter().enumerate() {
|
|
334
|
+
let field = self.schema.field(idx);
|
|
335
|
+
let column_name = field.name().to_string();
|
|
336
|
+
let parquet_field = &self.aligned_parquet_fields[idx];
|
|
337
|
+
|
|
338
|
+
// Convert entire column to ParquetValues
|
|
339
|
+
let mut values = Vec::with_capacity(column.len());
|
|
340
|
+
for row_idx in 0..column.len() {
|
|
341
|
+
match arrow_to_parquet_value(field, parquet_field, column, row_idx) {
|
|
342
|
+
Ok(value) => values.push(value),
|
|
343
|
+
Err(e) => return Some(Err(e)),
|
|
344
|
+
}
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
columns.push((column_name, values));
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
Some(Ok(ColumnBatch { columns }))
|
|
351
|
+
}
|
|
352
|
+
Some(Err(e)) => Some(Err(e.into())),
|
|
353
|
+
None => None,
|
|
354
|
+
}
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
|
|
358
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructing a reader does not touch the source, so arbitrary bytes work.
    #[test]
    fn test_reader_creation() {
        let bytes = bytes::Bytes::from(vec![0u8; 1024]);
        let _reader = Reader::new(bytes);
    }
}
|