parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/src/error.rs (new file)
@@ -0,0 +1,163 @@
+ use thiserror::Error;
+
+ /// Core error type for Parquet operations
+ #[derive(Error, Debug)]
+ pub enum ParquetError {
+     /// IO errors from file operations
+     #[error("IO error: {0}")]
+     Io(#[from] std::io::Error),
+
+     /// Arrow errors from Arrow operations
+     #[error("Arrow error: {0}")]
+     Arrow(#[from] arrow_schema::ArrowError),
+
+     /// Parquet format errors
+     #[error("Parquet error: {0}")]
+     Parquet(#[from] parquet::errors::ParquetError),
+
+     /// Schema-related errors
+     #[error("Schema error: {0}")]
+     Schema(String),
+
+     /// Type conversion errors
+     #[error("Conversion error: {0}")]
+     Conversion(String),
+
+     /// Invalid argument errors
+     #[error("Invalid argument: {0}")]
+     InvalidArgument(String),
+
+     /// Data validation errors
+     #[error("Data validation error: {0}")]
+     DataValidation(String),
+
+     /// Unsupported operation errors
+     #[error("Unsupported operation: {0}")]
+     Unsupported(String),
+
+     /// Internal errors that shouldn't happen
+     #[error("Internal error: {0}")]
+     Internal(String),
+
+     /// UTF-8 decoding errors
+     #[error("UTF-8 error: {0}")]
+     Utf8(#[from] std::str::Utf8Error),
+
+     /// Number parsing errors
+     #[error("Parse error: {0}")]
+     ParseInt(#[from] std::num::ParseIntError),
+
+     /// Float parsing errors
+     #[error("Parse float error: {0}")]
+     ParseFloat(#[from] std::num::ParseFloatError),
+ }
+
+ /// Result type alias for Parquet operations
+ pub type Result<T> = std::result::Result<T, ParquetError>;
+
+ impl ParquetError {
+     /// Create a new schema error
+     pub fn schema<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Schema(msg.into())
+     }
+
+     /// Create a new conversion error
+     pub fn conversion<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Conversion(msg.into())
+     }
+
+     /// Create a new invalid argument error
+     pub fn invalid_argument<S: Into<String>>(msg: S) -> Self {
+         ParquetError::InvalidArgument(msg.into())
+     }
+
+     /// Create a new data validation error
+     pub fn data_validation<S: Into<String>>(msg: S) -> Self {
+         ParquetError::DataValidation(msg.into())
+     }
+
+     /// Create a new unsupported operation error
+     pub fn unsupported<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Unsupported(msg.into())
+     }
+
+     /// Create a new internal error
+     pub fn internal<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Internal(msg.into())
+     }
+ }
+
+ /// Extension trait to add context to errors
+ pub trait ErrorContext<T> {
+     /// Add context to an error
+     fn context<S: Into<String>>(self, ctx: S) -> Result<T>;
+
+     /// Add context with a closure that's only called on error
+     fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T>;
+ }
+
+ impl<T, E> ErrorContext<T> for std::result::Result<T, E>
+ where
+     E: Into<ParquetError>,
+ {
+     fn context<S: Into<String>>(self, ctx: S) -> Result<T> {
+         self.map_err(|e| {
+             let base_error = e.into();
+             ParquetError::Internal(format!("{}: {}", ctx.into(), base_error))
+         })
+     }
+
+     fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T> {
+         self.map_err(|e| {
+             let base_error = e.into();
+             ParquetError::Internal(format!("{}: {}", f().into(), base_error))
+         })
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     #[test]
+     fn test_error_creation() {
+         let err = ParquetError::schema("Invalid schema");
+         assert_eq!(err.to_string(), "Schema error: Invalid schema");
+
+         let err = ParquetError::conversion("Cannot convert value");
+         assert_eq!(err.to_string(), "Conversion error: Cannot convert value");
+     }
+
+     #[test]
+     fn test_error_from_io() {
+         let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "File not found");
+         let err: ParquetError = io_err.into();
+         assert!(err.to_string().contains("IO error"));
+     }
+
+     #[test]
+     fn test_error_context() {
+         fn failing_operation() -> Result<()> {
+             Err(ParquetError::invalid_argument("bad input"))
+         }
+
+         let result = failing_operation().context("During file read");
+         assert!(result.is_err());
+         let err = result.unwrap_err();
+         assert!(err.to_string().contains("During file read"));
+     }
+
+     #[test]
+     fn test_error_with_context() {
+         fn failing_operation() -> Result<()> {
+             Err(ParquetError::data_validation("Invalid data"))
+         }
+
+         let filename = "test.parquet";
+         let result = failing_operation().with_context(|| format!("Processing file: {}", filename));
+
+         assert!(result.is_err());
+         let err = result.unwrap_err();
+         assert!(err.to_string().contains("Processing file: test.parquet"));
+     }
+ }
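
The `#[from]` conversions above let `?` hand std::io, Arrow, and parquet-rs errors to `ParquetError` directly, while `ErrorContext` re-wraps any convertible error with a message. A minimal sketch of how calling code might combine the two (the crate path `parquet_core` and the helper function are illustrative, not part of the gem):

    use parquet_core::{ErrorContext, Result};
    use std::fs::File;
    use std::io::Read;

    // Hypothetical helper: .context()/.with_context() accept any error that is
    // convertible into ParquetError (here std::io::Error via its #[from] impl)
    // and re-wrap it with a descriptive message before `?` propagates it.
    fn read_file_bytes(path: &str) -> Result<Vec<u8>> {
        let mut buf = Vec::new();
        File::open(path)
            .with_context(|| format!("opening {}", path))?
            .read_to_end(&mut buf)
            .context("reading file contents")?;
        Ok(buf)
    }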
data/ext/parquet-core/src/lib.rs (new file)
@@ -0,0 +1,60 @@
+ //! Language-agnostic core functionality for Parquet operations
+ //!
+ //! `parquet-core` provides core Parquet functionality that can be reused
+ //! across different language integrations. It wraps the Apache parquet-rs
+ //! crate with a simplified API focused on common use cases.
+ //!
+ //! # Key Components
+ //!
+ //! - **Reader**: High-performance Parquet file reader
+ //!   - Row-wise iteration through [`reader::Reader`]
+ //!   - Column-wise batch reading for analytics workloads
+ //!   - Uses `parquet::file::reader::ChunkReader` for flexible input sources
+ //!
+ //! - **Writer**: Efficient Parquet file writer
+ //!   - Supports both row and columnar data input
+ //!   - Configurable compression and encoding options
+ //!   - Dynamic batch sizing based on memory usage
+ //!   - Uses `std::io::Write + Send` for output flexibility
+ //!
+ //! - **Schema**: Type-safe schema representation
+ //!   - Builder API for constructing schemas
+ //!   - Support for nested types (structs, lists, maps)
+ //!   - Schema introspection through the [`traits::SchemaInspector`] trait
+ //!
+ //! - **Values**: Core value types without external dependencies
+ //!   - All Parquet primitive types
+ //!   - Decimal support (128 and 256 bit)
+ //!   - Temporal types (dates, times, timestamps)
+ //!
+ //! - **Arrow Conversion**: Bidirectional conversion between Arrow and Parquet
+ //!   - Zero-copy where possible
+ //!   - Handles all supported types including nested structures
+ //!
+ //! # Design Philosophy
+ //!
+ //! This crate focuses on providing concrete implementations rather than
+ //! abstract traits. Language-specific adapters (like `parquet-ruby-adapter`)
+ //! handle the translation between language types and Parquet values.
+ //!
+ //! # Example Usage
+ //!
+ //! This crate is designed to be used through language-specific adapters.
+ //! See `parquet-ruby-adapter` for Ruby integration.
+
+ pub mod arrow_conversion;
+ pub mod error;
+ pub mod reader;
+ pub mod schema;
+ pub mod traits;
+ pub mod value;
+ pub mod writer;
+
+ #[cfg(test)]
+ pub mod test_utils;
+
+ pub use error::{ErrorContext, ParquetError, Result};
+ pub use reader::Reader;
+ pub use schema::{PrimitiveType, Repetition, Schema, SchemaBuilder, SchemaNode};
+ pub use value::ParquetValue;
+ pub use writer::{Writer, WriterBuilder};
data/ext/parquet-core/src/reader.rs (new file)
@@ -0,0 +1,263 @@
+ //! Core Parquet reading functionality
+
+ use crate::{arrow_conversion::arrow_to_parquet_value, ParquetValue, Result};
+ use arrow::record_batch::RecordBatch;
+ use arrow_array::Array;
+ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
+ use parquet::file::metadata::FileMetaData;
+ use std::sync::Arc;
+
+ /// Core Parquet reader that works with any source implementing Read + Seek
+ #[derive(Clone)]
+ pub struct Reader<R> {
+     inner: R,
+ }
+
+ impl<R> Reader<R>
+ where
+     R: parquet::file::reader::ChunkReader + Clone + 'static,
+ {
+     /// Create a new reader
+     pub fn new(reader: R) -> Self {
+         Self { inner: reader }
+     }
+
+     /// Get the Parquet file metadata
+     pub fn metadata(&mut self) -> Result<FileMetaData> {
+         let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner.clone())?;
+         Ok(builder.metadata().file_metadata().clone())
+     }
+
+     /// Read rows from the Parquet file
+     ///
+     /// Returns an iterator over rows where each row is a vector of ParquetValues
+     pub fn read_rows(self) -> Result<RowIterator<R>> {
+         let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+         let reader = builder.build()?;
+
+         Ok(RowIterator {
+             batch_reader: reader,
+             current_batch: None,
+             current_row: 0,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+
+     /// Read rows with column projection
+     ///
+     /// Only the specified columns will be read, which can significantly
+     /// improve performance for wide tables.
+     pub fn read_rows_with_projection(self, columns: &[String]) -> Result<RowIterator<R>> {
+         let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+         let arrow_schema = builder.schema();
+
+         // Create projection mask based on column names
+         let mut column_indices = Vec::new();
+         for (idx, field) in arrow_schema.fields().iter().enumerate() {
+             if columns.contains(&field.name().to_string()) {
+                 column_indices.push(idx);
+             }
+         }
+
+         // Allow empty column projections to match v1 behavior
+         // This will result in rows with no fields
+
+         let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
+         builder = builder.with_projection(mask);
+         let reader = builder.build()?;
+
+         Ok(RowIterator {
+             batch_reader: reader,
+             current_batch: None,
+             current_row: 0,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+
+     /// Read columns from the Parquet file
+     ///
+     /// Returns an iterator over column batches where each batch contains
+     /// arrays of values for each column.
+     pub fn read_columns(self, batch_size: Option<usize>) -> Result<ColumnIterator<R>> {
+         let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+
+         let is_empty = builder.metadata().file_metadata().num_rows() == 0;
+
+         if let Some(size) = batch_size {
+             builder = builder.with_batch_size(size);
+         }
+
+         let schema = builder.schema().clone();
+         let reader = builder.build()?;
+
+         Ok(ColumnIterator {
+             batch_reader: reader,
+             schema,
+             returned_empty_batch: false,
+             is_empty_file: is_empty,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+
+     /// Read columns with projection
+     pub fn read_columns_with_projection(
+         self,
+         columns: &[String],
+         batch_size: Option<usize>,
+     ) -> Result<ColumnIterator<R>> {
+         let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+         let arrow_schema = builder.schema();
+
+         let is_empty = builder.metadata().file_metadata().num_rows() == 0;
+
+         // Create projection mask
+         let mut column_indices = Vec::new();
+         for (idx, field) in arrow_schema.fields().iter().enumerate() {
+             if columns.contains(&field.name().to_string()) {
+                 column_indices.push(idx);
+             }
+         }
+
+         // Allow empty column projections to match v1 behavior
+         // This will result in rows with no fields
+
+         let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
+         builder = builder.with_projection(mask);
+
+         if let Some(size) = batch_size {
+             builder = builder.with_batch_size(size);
+         }
+
+         let schema = builder.schema().clone();
+         let reader = builder.build()?;
+
+         Ok(ColumnIterator {
+             batch_reader: reader,
+             schema,
+             returned_empty_batch: false,
+             is_empty_file: is_empty,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+ }
+
+ /// Iterator over rows in a Parquet file
+ pub struct RowIterator<R> {
+     batch_reader: ParquetRecordBatchReader,
+     current_batch: Option<RecordBatch>,
+     current_row: usize,
+     _phantom: std::marker::PhantomData<R>,
+ }
+
+ impl<R> Iterator for RowIterator<R>
+ where
+     R: parquet::file::reader::ChunkReader + 'static,
+ {
+     type Item = Result<Vec<ParquetValue>>;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         loop {
+             // If we have a current batch and haven't exhausted it
+             if let Some(ref batch) = self.current_batch {
+                 if self.current_row < batch.num_rows() {
+                     // Extract values from current row
+                     let mut row_values = Vec::with_capacity(batch.num_columns());
+
+                     for column in batch.columns() {
+                         let value = match arrow_to_parquet_value(column, self.current_row) {
+                             Ok(v) => v,
+                             Err(e) => return Some(Err(e)),
+                         };
+                         row_values.push(value);
+                     }
+
+                     self.current_row += 1;
+                     return Some(Ok(row_values));
+                 }
+             }
+
+             // Need to fetch next batch
+             match self.batch_reader.next() {
+                 Some(Ok(batch)) => {
+                     self.current_batch = Some(batch);
+                     self.current_row = 0;
+                 }
+                 Some(Err(e)) => return Some(Err(e.into())),
+                 None => return None,
+             }
+         }
+     }
+ }
+
+ /// Iterator over column batches in a Parquet file
+ pub struct ColumnIterator<R> {
+     batch_reader: ParquetRecordBatchReader,
+     schema: Arc<arrow_schema::Schema>,
+     returned_empty_batch: bool,
+     is_empty_file: bool,
+     _phantom: std::marker::PhantomData<R>,
+ }
+
+ /// A batch of columns with their names
+ pub struct ColumnBatch {
+     pub columns: Vec<(String, Vec<ParquetValue>)>,
+ }
+
+ impl<R> Iterator for ColumnIterator<R>
+ where
+     R: parquet::file::reader::ChunkReader + 'static,
+ {
+     type Item = Result<ColumnBatch>;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         // Check if this is the first call and we have no data
+         if self.is_empty_file && !self.returned_empty_batch {
+             // Return one batch with empty columns to show schema
+             self.returned_empty_batch = true;
+             let mut columns = Vec::with_capacity(self.schema.fields().len());
+
+             for field in self.schema.fields() {
+                 columns.push((field.name().to_string(), Vec::new()));
+             }
+
+             return Some(Ok(ColumnBatch { columns }));
+         }
+
+         match self.batch_reader.next() {
+             Some(Ok(batch)) => {
+                 let mut columns = Vec::with_capacity(batch.num_columns());
+
+                 for (idx, column) in batch.columns().iter().enumerate() {
+                     let column_name = self.schema.field(idx).name().to_string();
+
+                     // Convert entire column to ParquetValues
+                     let mut values = Vec::with_capacity(column.len());
+                     for row_idx in 0..column.len() {
+                         match arrow_to_parquet_value(column, row_idx) {
+                             Ok(value) => values.push(value),
+                             Err(e) => return Some(Err(e)),
+                         }
+                     }
+
+                     columns.push((column_name, values));
+                 }
+
+                 Some(Ok(ColumnBatch { columns }))
+             }
+             Some(Err(e)) => Some(Err(e.into())),
+             None => None,
+         }
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     #[test]
+     fn test_reader_creation() {
+         let data = vec![0u8; 1024];
+         let bytes = bytes::Bytes::from(data);
+         let _reader = Reader::new(bytes);
+     }
+ }
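
Based on the API above, driving the new reader from Rust could look roughly like the sketch below. The crate path `parquet_core`, the in-memory `bytes::Bytes` source, and the projected column name "name" are assumptions for illustration; the file itself is expected to have been written elsewhere.

    use parquet_core::{ParquetValue, Reader, Result};

    // Sketch: iterate rows of an already-written Parquet file held in memory.
    // bytes::Bytes implements parquet's ChunkReader, so it satisfies Reader's bound.
    fn print_projected_rows(data: bytes::Bytes) -> Result<()> {
        let reader = Reader::new(data);
        // Project a single (hypothetical) column; each yielded row is a Vec<ParquetValue>.
        let rows = reader.read_rows_with_projection(&["name".to_string()])?;
        for row in rows {
            let row: Vec<ParquetValue> = row?;
            println!("row with {} projected value(s)", row.len());
        }
        Ok(())
    }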