parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,189 @@
1
+ use thiserror::Error;
2
+
3
+ /// Core error type for Parquet operations
4
+ #[derive(Error, Debug)]
5
+ pub enum ParquetError {
6
+ /// IO errors from file operations
7
+ #[error("IO error: {0}")]
8
+ Io(#[from] std::io::Error),
9
+
10
+ /// Arrow errors from Arrow operations
11
+ #[error("Arrow error: {0}")]
12
+ Arrow(#[from] arrow_schema::ArrowError),
13
+
14
+ /// Parquet format errors
15
+ #[error("Parquet error: {0}")]
16
+ Parquet(#[from] parquet::errors::ParquetError),
17
+
18
+ /// Schema-related errors
19
+ #[error("Schema error: {0}")]
20
+ Schema(String),
21
+
22
+ /// Type conversion errors
23
+ #[error("Conversion error: {0}")]
24
+ Conversion(String),
25
+
26
+ /// Invalid argument errors
27
+ #[error("Invalid argument: {0}")]
28
+ InvalidArgument(String),
29
+
30
+ /// Data validation errors
31
+ #[error("Data validation error: {0}")]
32
+ DataValidation(String),
33
+
34
+ /// Unsupported operation errors
35
+ #[error("Unsupported operation: {0}")]
36
+ Unsupported(String),
37
+
38
+ /// Internal errors that shouldn't happen
39
+ #[error("Internal error: {0}")]
40
+ Internal(String),
41
+
42
+ /// UTF-8 decoding errors
43
+ #[error("UTF-8 error: {0}")]
44
+ Utf8(#[from] std::str::Utf8Error),
45
+
46
+ /// Number parsing errors
47
+ #[error("Parse error: {0}")]
48
+ ParseInt(#[from] std::num::ParseIntError),
49
+
50
+ /// Float parsing errors
51
+ #[error("Parse float error: {0}")]
52
+ ParseFloat(#[from] std::num::ParseFloatError),
53
+ }
54
+
55
+ /// Result type alias for Parquet operations
56
+ pub type Result<T> = std::result::Result<T, ParquetError>;
57
+
58
+ impl ParquetError {
59
+ /// Create a new schema error
60
+ pub fn schema<S: Into<String>>(msg: S) -> Self {
61
+ ParquetError::Schema(msg.into())
62
+ }
63
+
64
+ /// Create a new conversion error
65
+ pub fn conversion<S: Into<String>>(msg: S) -> Self {
66
+ ParquetError::Conversion(msg.into())
67
+ }
68
+
69
+ /// Create a new invalid argument error
70
+ pub fn invalid_argument<S: Into<String>>(msg: S) -> Self {
71
+ ParquetError::InvalidArgument(msg.into())
72
+ }
73
+
74
+ /// Create a new data validation error
75
+ pub fn data_validation<S: Into<String>>(msg: S) -> Self {
76
+ ParquetError::DataValidation(msg.into())
77
+ }
78
+
79
+ /// Create a new unsupported operation error
80
+ pub fn unsupported<S: Into<String>>(msg: S) -> Self {
81
+ ParquetError::Unsupported(msg.into())
82
+ }
83
+
84
+ /// Create a new internal error
85
+ pub fn internal<S: Into<String>>(msg: S) -> Self {
86
+ ParquetError::Internal(msg.into())
87
+ }
88
+
89
+ fn with_context_message(self, ctx: String) -> Self {
90
+ match self {
91
+ ParquetError::Io(error) => ParquetError::Io(std::io::Error::new(
92
+ error.kind(),
93
+ format!("{}: {}", ctx, error),
94
+ )),
95
+ ParquetError::Schema(message) => ParquetError::Schema(format!("{}: {}", ctx, message)),
96
+ ParquetError::Conversion(message) => {
97
+ ParquetError::Conversion(format!("{}: {}", ctx, message))
98
+ }
99
+ ParquetError::InvalidArgument(message) => {
100
+ ParquetError::InvalidArgument(format!("{}: {}", ctx, message))
101
+ }
102
+ ParquetError::DataValidation(message) => {
103
+ ParquetError::DataValidation(format!("{}: {}", ctx, message))
104
+ }
105
+ ParquetError::Unsupported(message) => {
106
+ ParquetError::Unsupported(format!("{}: {}", ctx, message))
107
+ }
108
+ ParquetError::Internal(message) => {
109
+ ParquetError::Internal(format!("{}: {}", ctx, message))
110
+ }
111
+ error => ParquetError::Internal(format!("{}: {}", ctx, error)),
112
+ }
113
+ }
114
+ }
115
+
116
+ /// Extension trait to add context to errors
117
+ pub trait ErrorContext<T> {
118
+ /// Add context to an error
119
+ fn context<S: Into<String>>(self, ctx: S) -> Result<T>;
120
+
121
+ /// Add context with a closure that's only called on error
122
+ fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T>;
123
+ }
124
+
125
+ impl<T, E> ErrorContext<T> for std::result::Result<T, E>
126
+ where
127
+ E: Into<ParquetError>,
128
+ {
129
+ fn context<S: Into<String>>(self, ctx: S) -> Result<T> {
130
+ self.map_err(|e| {
131
+ let base_error = e.into();
132
+ base_error.with_context_message(ctx.into())
133
+ })
134
+ }
135
+
136
+ fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T> {
137
+ self.map_err(|e| {
138
+ let base_error = e.into();
139
+ base_error.with_context_message(f().into())
140
+ })
141
+ }
142
+ }
143
+
144
#[cfg(test)]
mod tests {
    use super::*;

    /// The helper constructors pick the right variant and `Display` prefix.
    #[test]
    fn test_error_creation() {
        let err = ParquetError::schema("Invalid schema");
        assert_eq!(err.to_string(), "Schema error: Invalid schema");

        let err = ParquetError::conversion("Cannot convert value");
        assert_eq!(err.to_string(), "Conversion error: Cannot convert value");
    }

    /// `std::io::Error` converts through `From` into the `Io` variant.
    #[test]
    fn test_error_from_io() {
        let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "File not found");
        let err: ParquetError = io_err.into();
        assert!(matches!(err, ParquetError::Io(_)));
        assert_eq!(err.to_string(), "IO error: File not found");
    }

    /// `context` prefixes the message and keeps the original variant.
    #[test]
    fn test_error_context() {
        fn failing_operation() -> Result<()> {
            Err(ParquetError::invalid_argument("bad input"))
        }

        let err = failing_operation()
            .context("During file read")
            .unwrap_err();
        assert!(matches!(err, ParquetError::InvalidArgument(_)));
        assert_eq!(
            err.to_string(),
            "Invalid argument: During file read: bad input"
        );
    }

    /// `with_context` behaves like `context` with a lazily built message.
    #[test]
    fn test_error_with_context() {
        fn failing_operation() -> Result<()> {
            Err(ParquetError::data_validation("Invalid data"))
        }

        let filename = "test.parquet";
        let err = failing_operation()
            .with_context(|| format!("Processing file: {}", filename))
            .unwrap_err();

        assert!(matches!(err, ParquetError::DataValidation(_)));
        assert_eq!(
            err.to_string(),
            "Data validation error: Processing file: test.parquet: Invalid data"
        );
    }

    /// Adding context to an IO error keeps both the variant and the `ErrorKind`.
    #[test]
    fn test_context_preserves_io_kind() {
        let result: std::result::Result<(), std::io::Error> = Err(std::io::Error::new(
            std::io::ErrorKind::PermissionDenied,
            "denied",
        ));

        match result.context("Opening file").unwrap_err() {
            ParquetError::Io(inner) => {
                assert_eq!(inner.kind(), std::io::ErrorKind::PermissionDenied);
                assert_eq!(inner.to_string(), "Opening file: denied");
            }
            other => panic!("expected Io variant, got {:?}", other),
        }
    }

    /// Wrapped foreign errors other than IO are folded into `Internal`.
    #[test]
    fn test_context_folds_foreign_errors_into_internal() {
        let err = "abc".parse::<i32>().context("Reading count").unwrap_err();
        assert!(matches!(err, ParquetError::Internal(_)));
        assert!(err
            .to_string()
            .starts_with("Internal error: Reading count: Parse error: "));
    }

    /// The `with_context` closure must not run when the result is `Ok`.
    #[test]
    fn test_with_context_is_lazy_on_ok() {
        let ok: Result<i32> = Ok(1);
        let value = ok
            .with_context(|| -> String { panic!("context closure must not run on Ok") })
            .unwrap();
        assert_eq!(value, 1);
    }
}
@@ -0,0 +1,60 @@
1
+ //! Language-agnostic core functionality for Parquet operations
2
+ //!
3
+ //! `parquet-core` provides core Parquet functionality that can be reused
4
+ //! across different language integrations. It wraps the Apache parquet-rs
5
+ //! crate with a simplified API focused on common use cases.
6
+ //!
7
+ //! # Key Components
8
+ //!
9
+ //! - **Reader**: High-performance Parquet file reader
10
+ //!   - Row-wise iteration through [`reader::Reader`]
11
+ //!   - Column-wise batch reading for analytics workloads
12
+ //!   - Uses `parquet::file::reader::ChunkReader` for flexible input sources
13
+ //!
14
+ //! - **Writer**: Efficient Parquet file writer
15
+ //!   - Supports both row and columnar data input
16
+ //!   - Configurable compression and encoding options
17
+ //!   - Dynamic batch sizing based on memory usage
18
+ //!   - Uses `std::io::Write + Send` for output flexibility
19
+ //!
20
+ //! - **Schema**: Type-safe schema representation
21
+ //!   - Builder API for constructing schemas
22
+ //!   - Support for nested types (structs, lists, maps)
23
+ //!   - Schema introspection through the [`traits::SchemaInspector`] trait
24
+ //!
25
+ //! - **Values**: Core value types without external dependencies
26
+ //!   - All Parquet primitive types
27
+ //!   - Decimal support (128 and 256 bit)
28
+ //!   - Temporal types (dates, times, timestamps)
29
+ //!
30
+ //! - **Arrow Conversion**: Bidirectional conversion between Arrow and Parquet
31
+ //!   - Zero-copy where possible
32
+ //!   - Handles all supported types including nested structures
33
+ //!
34
+ //! # Design Philosophy
35
+ //!
36
+ //! This crate focuses on providing concrete implementations rather than
37
+ //! abstract traits. Language-specific adapters (like `parquet-ruby-adapter`)
38
+ //! handle the translation between language types and Parquet values.
39
+ //!
40
+ //! # Example Usage
41
+ //!
42
+ //! This crate is designed to be used through language-specific adapters.
43
+ //! See `parquet-ruby-adapter` for Ruby integration.
44
+
45
+ pub mod arrow_conversion;
46
+ pub mod error;
47
+ pub mod reader;
48
+ pub mod schema;
49
+ pub mod traits;
50
+ pub mod value;
51
+ pub mod writer;
52
+
53
+ #[cfg(test)]
54
+ pub mod test_utils;
55
+
56
+ pub use error::{ErrorContext, ParquetError, Result};
57
+ pub use reader::Reader;
58
+ pub use schema::{PrimitiveType, Repetition, Schema, SchemaBuilder, SchemaNode};
59
+ pub use value::ParquetValue;
60
+ pub use writer::{Writer, WriterBuilder, MAX_BATCH_SIZE, MAX_SAMPLE_SIZE};
@@ -0,0 +1,368 @@
1
+ //! Core Parquet reading functionality
2
+
3
+ use crate::{arrow_conversion::arrow_to_parquet_value, ParquetError, ParquetValue, Result};
4
+ use arrow::record_batch::RecordBatch;
5
+ use arrow_array::Array;
6
+ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
7
+ use parquet::file::metadata::{FileMetaData, ParquetMetaData};
8
+ use parquet::schema::types::{Type, TypePtr};
9
+ use std::{
10
+ collections::{HashMap, HashSet},
11
+ sync::Arc,
12
+ };
13
+
14
/// Core Parquet reader over a cloneable input source.
///
/// The struct itself places no bounds on `R`; the reading methods require
/// `R: parquet::file::reader::ChunkReader + Clone + 'static`.
#[derive(Clone)]
pub struct Reader<R> {
    /// The input source handed to the parquet record-batch reader builder.
    inner: R,
}
19
+
20
+ impl<R> Reader<R>
21
+ where
22
+ R: parquet::file::reader::ChunkReader + Clone + 'static,
23
+ {
24
+ /// Create a new reader
25
+ pub fn new(reader: R) -> Self {
26
+ Self { inner: reader }
27
+ }
28
+
29
+ /// Get the Parquet file metadata
30
+ pub fn metadata(&mut self) -> Result<FileMetaData> {
31
+ let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner.clone())?;
32
+ Ok(builder.metadata().file_metadata().clone())
33
+ }
34
+
35
+ /// Read rows from the Parquet file
36
+ ///
37
+ /// Returns an iterator over rows where each row is a vector of ParquetValues
38
+ pub fn read_rows(self) -> Result<RowIterator<R>> {
39
+ let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
40
+ let schema = builder.schema().clone();
41
+ let metadata = builder.metadata().clone();
42
+ let aligned_parquet_fields = build_alignment(&schema, &metadata)?;
43
+ let reader = builder.build()?;
44
+
45
+ Ok(RowIterator {
46
+ batch_reader: reader,
47
+ schema,
48
+ current_batch: None,
49
+ current_row: 0,
50
+ aligned_parquet_fields,
51
+ _phantom: std::marker::PhantomData,
52
+ })
53
+ }
54
+
55
+ /// Read rows with column projection
56
+ ///
57
+ /// Only the specified columns will be read, which can significantly
58
+ /// improve performance for wide tables. Projected row values are returned
59
+ /// in file schema order, not request order.
60
+ pub fn read_rows_with_projection(self, columns: &[String]) -> Result<RowIterator<R>> {
61
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
62
+ let arrow_schema = builder.schema();
63
+ let requested_columns = columns.iter().map(String::as_str).collect::<HashSet<_>>();
64
+
65
+ // Create projection mask based on column names
66
+ let mut column_indices = Vec::new();
67
+ for (idx, field) in arrow_schema.fields().iter().enumerate() {
68
+ if requested_columns.contains(field.name().as_str()) {
69
+ column_indices.push(idx);
70
+ }
71
+ }
72
+ // The projected batches are emitted in file order over the selected
73
+ // columns; build that schema so alignment and field access match.
74
+ let projected_schema = Arc::new(arrow_schema::Schema::new(
75
+ column_indices
76
+ .iter()
77
+ .map(|idx| arrow_schema.field(*idx).clone())
78
+ .collect::<Vec<_>>(),
79
+ ));
80
+
81
+ // Allow empty column projections to match v1 behavior
82
+ // This will result in rows with no fields
83
+
84
+ let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
85
+ builder = builder.with_projection(mask);
86
+ let metadata = builder.metadata().clone();
87
+ let aligned_parquet_fields = build_alignment(&projected_schema, &metadata)?;
88
+ let reader = builder.build()?;
89
+
90
+ Ok(RowIterator {
91
+ batch_reader: reader,
92
+ schema: projected_schema,
93
+ current_batch: None,
94
+ current_row: 0,
95
+ aligned_parquet_fields,
96
+ _phantom: std::marker::PhantomData,
97
+ })
98
+ }
99
+
100
+ /// Read columns from the Parquet file
101
+ ///
102
+ /// Returns an iterator over column batches where each batch contains
103
+ /// arrays of values for each column.
104
+ pub fn read_columns(self, batch_size: Option<usize>) -> Result<ColumnIterator<R>> {
105
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
106
+
107
+ let is_empty = builder.metadata().file_metadata().num_rows() == 0;
108
+
109
+ if let Some(size) = batch_size {
110
+ builder = builder.with_batch_size(size);
111
+ }
112
+
113
+ let schema = builder.schema().clone();
114
+ let metadata = builder.metadata().clone();
115
+ let aligned_parquet_fields = build_alignment(&schema, &metadata)?;
116
+ let reader = builder.build()?;
117
+
118
+ Ok(ColumnIterator {
119
+ batch_reader: reader,
120
+ schema,
121
+ returned_empty_batch: false,
122
+ is_empty_file: is_empty,
123
+ aligned_parquet_fields,
124
+ _phantom: std::marker::PhantomData,
125
+ })
126
+ }
127
+
128
+ /// Read columns with projection
129
+ pub fn read_columns_with_projection(
130
+ self,
131
+ columns: &[String],
132
+ batch_size: Option<usize>,
133
+ ) -> Result<ColumnIterator<R>> {
134
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
135
+ let arrow_schema = builder.schema();
136
+ let requested_columns = columns.iter().map(String::as_str).collect::<HashSet<_>>();
137
+
138
+ let is_empty = builder.metadata().file_metadata().num_rows() == 0;
139
+
140
+ // Create projection mask
141
+ let mut column_indices = Vec::new();
142
+ for (idx, field) in arrow_schema.fields().iter().enumerate() {
143
+ if requested_columns.contains(field.name().as_str()) {
144
+ column_indices.push(idx);
145
+ }
146
+ }
147
+ let projected_schema = Arc::new(arrow_schema::Schema::new(
148
+ column_indices
149
+ .iter()
150
+ .map(|idx| arrow_schema.field(*idx).clone())
151
+ .collect::<Vec<_>>(),
152
+ ));
153
+
154
+ // Allow empty column projections to match v1 behavior
155
+ // This will result in rows with no fields
156
+
157
+ let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
158
+ builder = builder.with_projection(mask);
159
+
160
+ if let Some(size) = batch_size {
161
+ builder = builder.with_batch_size(size);
162
+ }
163
+
164
+ let metadata = builder.metadata().clone();
165
+ let aligned_parquet_fields = build_alignment(&projected_schema, &metadata)?;
166
+ let reader = builder.build()?;
167
+
168
+ Ok(ColumnIterator {
169
+ batch_reader: reader,
170
+ schema: projected_schema,
171
+ returned_empty_batch: false,
172
+ is_empty_file: is_empty,
173
+ aligned_parquet_fields,
174
+ _phantom: std::marker::PhantomData,
175
+ })
176
+ }
177
+ }
178
+
179
+ /// Build a column-aligned list of parquet root fields for `arrow_schema`,
180
+ /// matching each arrow field to a parquet root field by name once.
181
+ ///
182
+ /// The arrow schema may be a projection (a subset of the file's columns in file
183
+ /// order), so positional indexing into the full parquet root is wrong; we match
184
+ /// by name. Computed once per read and indexed by column position thereafter,
185
+ /// turning a per-row O(columns^2) scan into a one-time O(columns) build. If the
186
+ /// file has duplicate root column names the first occurrence wins, matching the
187
+ /// previous lookup behavior.
188
+ fn align_parquet_fields(
189
+ arrow_schema: &arrow_schema::Schema,
190
+ parquet_fields: &[TypePtr],
191
+ ) -> Result<Vec<TypePtr>> {
192
+ let mut by_name: HashMap<&str, &TypePtr> = HashMap::with_capacity(parquet_fields.len());
193
+ for field in parquet_fields {
194
+ by_name.entry(field.name()).or_insert(field);
195
+ }
196
+ arrow_schema
197
+ .fields()
198
+ .iter()
199
+ .map(|field| {
200
+ by_name
201
+ .get(field.name().as_str())
202
+ .map(|matched| (*matched).clone())
203
+ .ok_or_else(|| {
204
+ ParquetError::Conversion(format!(
205
+ "No matching parquet field for arrow field '{}'",
206
+ field.name()
207
+ ))
208
+ })
209
+ })
210
+ .collect()
211
+ }
212
+
213
+ /// Extract the parquet root group's fields from file metadata.
214
+ fn root_parquet_fields(metadata: &ParquetMetaData) -> Result<Vec<TypePtr>> {
215
+ match metadata.file_metadata().schema_descr().root_schema() {
216
+ Type::GroupType { fields, .. } => Ok(fields.clone()),
217
+ _ => Err(ParquetError::Conversion(
218
+ "Root schema must be a group type".to_string(),
219
+ )),
220
+ }
221
+ }
222
+
223
+ /// Compute the column-aligned parquet fields for an (output) arrow schema. The
224
+ /// schema is fixed for the whole read, so this is computed once at construction
225
+ /// and then indexed by column position for every batch and row.
226
+ fn build_alignment(
227
+ schema: &arrow_schema::Schema,
228
+ metadata: &ParquetMetaData,
229
+ ) -> Result<Vec<TypePtr>> {
230
+ align_parquet_fields(schema, &root_parquet_fields(metadata)?)
231
+ }
232
+
233
+ /// Iterator over rows in a Parquet file
234
+ pub struct RowIterator<R> {
235
+ batch_reader: ParquetRecordBatchReader,
236
+ /// Output arrow schema (projected subset in file order, or the full schema),
237
+ /// fixed for every batch.
238
+ schema: Arc<arrow_schema::Schema>,
239
+ current_batch: Option<RecordBatch>,
240
+ current_row: usize,
241
+ /// Parquet root fields aligned to `schema` column order, computed once.
242
+ aligned_parquet_fields: Vec<TypePtr>,
243
+ _phantom: std::marker::PhantomData<R>,
244
+ }
245
+
246
+ impl<R> Iterator for RowIterator<R>
247
+ where
248
+ R: parquet::file::reader::ChunkReader + 'static,
249
+ {
250
+ type Item = Result<Vec<ParquetValue>>;
251
+
252
+ fn next(&mut self) -> Option<Self::Item> {
253
+ loop {
254
+ // If we have a current batch and haven't exhausted it
255
+ if let Some(ref batch) = self.current_batch {
256
+ if self.current_row < batch.num_rows() {
257
+ // Extract values from current row, using the column-aligned
258
+ // parquet fields computed once at construction.
259
+ let mut row_values = Vec::with_capacity(batch.num_columns());
260
+
261
+ for (i, column) in batch.columns().iter().enumerate() {
262
+ let field = self.schema.field(i);
263
+ let value = match arrow_to_parquet_value(
264
+ field,
265
+ &self.aligned_parquet_fields[i],
266
+ column,
267
+ self.current_row,
268
+ ) {
269
+ Ok(v) => v,
270
+ Err(e) => return Some(Err(e)),
271
+ };
272
+ row_values.push(value);
273
+ }
274
+
275
+ self.current_row += 1;
276
+ return Some(Ok(row_values));
277
+ }
278
+ }
279
+
280
+ // Need to fetch next batch
281
+ match self.batch_reader.next() {
282
+ Some(Ok(batch)) => {
283
+ self.current_batch = Some(batch);
284
+ self.current_row = 0;
285
+ }
286
+ Some(Err(e)) => return Some(Err(e.into())),
287
+ None => return None,
288
+ }
289
+ }
290
+ }
291
+ }
292
+
293
+ /// Iterator over column batches in a Parquet file
294
+ pub struct ColumnIterator<R> {
295
+ batch_reader: ParquetRecordBatchReader,
296
+ schema: Arc<arrow_schema::Schema>,
297
+ returned_empty_batch: bool,
298
+ is_empty_file: bool,
299
+ /// Parquet root fields aligned to `schema` column order, computed once.
300
+ aligned_parquet_fields: Vec<TypePtr>,
301
+ _phantom: std::marker::PhantomData<R>,
302
+ }
303
+
304
+ /// A batch of columns with their names
305
+ pub struct ColumnBatch {
306
+ pub columns: Vec<(String, Vec<ParquetValue>)>,
307
+ }
308
+
309
+ impl<R> Iterator for ColumnIterator<R>
310
+ where
311
+ R: parquet::file::reader::ChunkReader + 'static,
312
+ {
313
+ type Item = Result<ColumnBatch>;
314
+
315
+ fn next(&mut self) -> Option<Self::Item> {
316
+ // Check if this is the first call and we have no data
317
+ if self.is_empty_file && !self.returned_empty_batch {
318
+ // Return one batch with empty columns to show schema
319
+ self.returned_empty_batch = true;
320
+ let mut columns = Vec::with_capacity(self.schema.fields().len());
321
+
322
+ for field in self.schema.fields() {
323
+ columns.push((field.name().to_string(), Vec::new()));
324
+ }
325
+
326
+ return Some(Ok(ColumnBatch { columns }));
327
+ }
328
+
329
+ match self.batch_reader.next() {
330
+ Some(Ok(batch)) => {
331
+ let mut columns = Vec::with_capacity(batch.num_columns());
332
+
333
+ for (idx, column) in batch.columns().iter().enumerate() {
334
+ let field = self.schema.field(idx);
335
+ let column_name = field.name().to_string();
336
+ let parquet_field = &self.aligned_parquet_fields[idx];
337
+
338
+ // Convert entire column to ParquetValues
339
+ let mut values = Vec::with_capacity(column.len());
340
+ for row_idx in 0..column.len() {
341
+ match arrow_to_parquet_value(field, parquet_field, column, row_idx) {
342
+ Ok(value) => values.push(value),
343
+ Err(e) => return Some(Err(e)),
344
+ }
345
+ }
346
+
347
+ columns.push((column_name, values));
348
+ }
349
+
350
+ Some(Ok(ColumnBatch { columns }))
351
+ }
352
+ Some(Err(e)) => Some(Err(e.into())),
353
+ None => None,
354
+ }
355
+ }
356
+ }
357
+
358
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructing a reader does not inspect the input, so a zero-filled
    /// buffer that is not a valid Parquet file is accepted.
    #[test]
    fn test_reader_creation() {
        let zeroed = bytes::Bytes::from(vec![0u8; 1024]);
        let _reader = Reader::new(zeroed);
    }
}