parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/src/error.rs (new file)
@@ -0,0 +1,163 @@
+ use thiserror::Error;
+
+ /// Core error type for Parquet operations
+ #[derive(Error, Debug)]
+ pub enum ParquetError {
+     /// IO errors from file operations
+     #[error("IO error: {0}")]
+     Io(#[from] std::io::Error),
+
+     /// Arrow errors from Arrow operations
+     #[error("Arrow error: {0}")]
+     Arrow(#[from] arrow_schema::ArrowError),
+
+     /// Parquet format errors
+     #[error("Parquet error: {0}")]
+     Parquet(#[from] parquet::errors::ParquetError),
+
+     /// Schema-related errors
+     #[error("Schema error: {0}")]
+     Schema(String),
+
+     /// Type conversion errors
+     #[error("Conversion error: {0}")]
+     Conversion(String),
+
+     /// Invalid argument errors
+     #[error("Invalid argument: {0}")]
+     InvalidArgument(String),
+
+     /// Data validation errors
+     #[error("Data validation error: {0}")]
+     DataValidation(String),
+
+     /// Unsupported operation errors
+     #[error("Unsupported operation: {0}")]
+     Unsupported(String),
+
+     /// Internal errors that shouldn't happen
+     #[error("Internal error: {0}")]
+     Internal(String),
+
+     /// UTF-8 decoding errors
+     #[error("UTF-8 error: {0}")]
+     Utf8(#[from] std::str::Utf8Error),
+
+     /// Number parsing errors
+     #[error("Parse error: {0}")]
+     ParseInt(#[from] std::num::ParseIntError),
+
+     /// Float parsing errors
+     #[error("Parse float error: {0}")]
+     ParseFloat(#[from] std::num::ParseFloatError),
+ }
+
+ /// Result type alias for Parquet operations
+ pub type Result<T> = std::result::Result<T, ParquetError>;
+
+ impl ParquetError {
+     /// Create a new schema error
+     pub fn schema<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Schema(msg.into())
+     }
+
+     /// Create a new conversion error
+     pub fn conversion<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Conversion(msg.into())
+     }
+
+     /// Create a new invalid argument error
+     pub fn invalid_argument<S: Into<String>>(msg: S) -> Self {
+         ParquetError::InvalidArgument(msg.into())
+     }
+
+     /// Create a new data validation error
+     pub fn data_validation<S: Into<String>>(msg: S) -> Self {
+         ParquetError::DataValidation(msg.into())
+     }
+
+     /// Create a new unsupported operation error
+     pub fn unsupported<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Unsupported(msg.into())
+     }
+
+     /// Create a new internal error
+     pub fn internal<S: Into<String>>(msg: S) -> Self {
+         ParquetError::Internal(msg.into())
+     }
+ }
+
+ /// Extension trait to add context to errors
+ pub trait ErrorContext<T> {
+     /// Add context to an error
+     fn context<S: Into<String>>(self, ctx: S) -> Result<T>;
+
+     /// Add context with a closure that's only called on error
+     fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T>;
+ }
+
+ impl<T, E> ErrorContext<T> for std::result::Result<T, E>
+ where
+     E: Into<ParquetError>,
+ {
+     fn context<S: Into<String>>(self, ctx: S) -> Result<T> {
+         self.map_err(|e| {
+             let base_error = e.into();
+             ParquetError::Internal(format!("{}: {}", ctx.into(), base_error))
+         })
+     }
+
+     fn with_context<S: Into<String>, F: FnOnce() -> S>(self, f: F) -> Result<T> {
+         self.map_err(|e| {
+             let base_error = e.into();
+             ParquetError::Internal(format!("{}: {}", f().into(), base_error))
+         })
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     #[test]
+     fn test_error_creation() {
+         let err = ParquetError::schema("Invalid schema");
+         assert_eq!(err.to_string(), "Schema error: Invalid schema");
+
+         let err = ParquetError::conversion("Cannot convert value");
+         assert_eq!(err.to_string(), "Conversion error: Cannot convert value");
+     }
+
+     #[test]
+     fn test_error_from_io() {
+         let io_err = std::io::Error::new(std::io::ErrorKind::NotFound, "File not found");
+         let err: ParquetError = io_err.into();
+         assert!(err.to_string().contains("IO error"));
+     }
+
+     #[test]
+     fn test_error_context() {
+         fn failing_operation() -> Result<()> {
+             Err(ParquetError::invalid_argument("bad input"))
+         }
+
+         let result = failing_operation().context("During file read");
+         assert!(result.is_err());
+         let err = result.unwrap_err();
+         assert!(err.to_string().contains("During file read"));
+     }
+
+     #[test]
+     fn test_error_with_context() {
+         fn failing_operation() -> Result<()> {
+             Err(ParquetError::data_validation("Invalid data"))
+         }
+
+         let filename = "test.parquet";
+         let result = failing_operation().with_context(|| format!("Processing file: {}", filename));
+
+         assert!(result.is_err());
+         let err = result.unwrap_err();
+         assert!(err.to_string().contains("Processing file: test.parquet"));
+     }
+ }
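
The `#[from]` conversions above let `?` hand std::io, Arrow, and parquet-rs errors to `ParquetError` directly, while `ErrorContext` re-wraps any convertible error with a message. A minimal sketch of how calling code might combine the two (the crate path `parquet_core` and the helper function are illustrative, not part of the gem):

    use parquet_core::{ErrorContext, Result};
    use std::fs::File;
    use std::io::Read;

    // Hypothetical helper: .context()/.with_context() accept any error that is
    // convertible into ParquetError (here std::io::Error via its #[from] impl)
    // and re-wrap it with a descriptive message before `?` propagates it.
    fn read_file_bytes(path: &str) -> Result<Vec<u8>> {
        let mut buf = Vec::new();
        File::open(path)
            .with_context(|| format!("opening {}", path))?
            .read_to_end(&mut buf)
            .context("reading file contents")?;
        Ok(buf)
    }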
data/ext/parquet-core/src/lib.rs (new file)
@@ -0,0 +1,60 @@
+ //! Language-agnostic core functionality for Parquet operations
+ //!
+ //! `parquet-core` provides core Parquet functionality that can be reused
+ //! across different language integrations. It wraps the Apache parquet-rs
+ //! crate with a simplified API focused on common use cases.
+ //!
+ //! # Key Components
+ //!
+ //! - **Reader**: High-performance Parquet file reader
+ //!   - Row-wise iteration through [`reader::Reader`]
+ //!   - Column-wise batch reading for analytics workloads
+ //!   - Uses `parquet::file::reader::ChunkReader` for flexible input sources
+ //!
+ //! - **Writer**: Efficient Parquet file writer
+ //!   - Supports both row and columnar data input
+ //!   - Configurable compression and encoding options
+ //!   - Dynamic batch sizing based on memory usage
+ //!   - Uses `std::io::Write + Send` for output flexibility
+ //!
+ //! - **Schema**: Type-safe schema representation
+ //!   - Builder API for constructing schemas
+ //!   - Support for nested types (structs, lists, maps)
+ //!   - Schema introspection through the [`traits::SchemaInspector`] trait
+ //!
+ //! - **Values**: Core value types without external dependencies
+ //!   - All Parquet primitive types
+ //!   - Decimal support (128 and 256 bit)
+ //!   - Temporal types (dates, times, timestamps)
+ //!
+ //! - **Arrow Conversion**: Bidirectional conversion between Arrow and Parquet
+ //!   - Zero-copy where possible
+ //!   - Handles all supported types including nested structures
+ //!
+ //! # Design Philosophy
+ //!
+ //! This crate focuses on providing concrete implementations rather than
+ //! abstract traits. Language-specific adapters (like `parquet-ruby-adapter`)
+ //! handle the translation between language types and Parquet values.
+ //!
+ //! # Example Usage
+ //!
+ //! This crate is designed to be used through language-specific adapters.
+ //! See `parquet-ruby-adapter` for Ruby integration.
+
+ pub mod arrow_conversion;
+ pub mod error;
+ pub mod reader;
+ pub mod schema;
+ pub mod traits;
+ pub mod value;
+ pub mod writer;
+
+ #[cfg(test)]
+ pub mod test_utils;
+
+ pub use error::{ErrorContext, ParquetError, Result};
+ pub use reader::Reader;
+ pub use schema::{PrimitiveType, Repetition, Schema, SchemaBuilder, SchemaNode};
+ pub use value::ParquetValue;
+ pub use writer::{Writer, WriterBuilder};
data/ext/parquet-core/src/reader.rs (new file)
@@ -0,0 +1,263 @@
+ //! Core Parquet reading functionality
+
+ use crate::{arrow_conversion::arrow_to_parquet_value, ParquetValue, Result};
+ use arrow::record_batch::RecordBatch;
+ use arrow_array::Array;
+ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
+ use parquet::file::metadata::FileMetaData;
+ use std::sync::Arc;
+
+ /// Core Parquet reader that works with any source implementing Read + Seek
+ #[derive(Clone)]
+ pub struct Reader<R> {
+     inner: R,
+ }
+
+ impl<R> Reader<R>
+ where
+     R: parquet::file::reader::ChunkReader + Clone + 'static,
+ {
+     /// Create a new reader
+     pub fn new(reader: R) -> Self {
+         Self { inner: reader }
+     }
+
+     /// Get the Parquet file metadata
+     pub fn metadata(&mut self) -> Result<FileMetaData> {
+         let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner.clone())?;
+         Ok(builder.metadata().file_metadata().clone())
+     }
+
+     /// Read rows from the Parquet file
+     ///
+     /// Returns an iterator over rows where each row is a vector of ParquetValues
+     pub fn read_rows(self) -> Result<RowIterator<R>> {
+         let builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+         let reader = builder.build()?;
+
+         Ok(RowIterator {
+             batch_reader: reader,
+             current_batch: None,
+             current_row: 0,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+
+     /// Read rows with column projection
+     ///
+     /// Only the specified columns will be read, which can significantly
+     /// improve performance for wide tables.
+     pub fn read_rows_with_projection(self, columns: &[String]) -> Result<RowIterator<R>> {
+         let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+         let arrow_schema = builder.schema();
+
+         // Create projection mask based on column names
+         let mut column_indices = Vec::new();
+         for (idx, field) in arrow_schema.fields().iter().enumerate() {
+             if columns.contains(&field.name().to_string()) {
+                 column_indices.push(idx);
+             }
+         }
+
+         // Allow empty column projections to match v1 behavior
+         // This will result in rows with no fields
+
+         let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
+         builder = builder.with_projection(mask);
+         let reader = builder.build()?;
+
+         Ok(RowIterator {
+             batch_reader: reader,
+             current_batch: None,
+             current_row: 0,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+
+     /// Read columns from the Parquet file
+     ///
+     /// Returns an iterator over column batches where each batch contains
+     /// arrays of values for each column.
+     pub fn read_columns(self, batch_size: Option<usize>) -> Result<ColumnIterator<R>> {
+         let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+
+         let is_empty = builder.metadata().file_metadata().num_rows() == 0;
+
+         if let Some(size) = batch_size {
+             builder = builder.with_batch_size(size);
+         }
+
+         let schema = builder.schema().clone();
+         let reader = builder.build()?;
+
+         Ok(ColumnIterator {
+             batch_reader: reader,
+             schema,
+             returned_empty_batch: false,
+             is_empty_file: is_empty,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+
+     /// Read columns with projection
+     pub fn read_columns_with_projection(
+         self,
+         columns: &[String],
+         batch_size: Option<usize>,
+     ) -> Result<ColumnIterator<R>> {
+         let mut builder = ParquetRecordBatchReaderBuilder::try_new(self.inner)?;
+         let arrow_schema = builder.schema();
+
+         let is_empty = builder.metadata().file_metadata().num_rows() == 0;
+
+         // Create projection mask
+         let mut column_indices = Vec::new();
+         for (idx, field) in arrow_schema.fields().iter().enumerate() {
+             if columns.contains(&field.name().to_string()) {
+                 column_indices.push(idx);
+             }
+         }
+
+         // Allow empty column projections to match v1 behavior
+         // This will result in rows with no fields
+
+         let mask = parquet::arrow::ProjectionMask::roots(builder.parquet_schema(), column_indices);
+         builder = builder.with_projection(mask);
+
+         if let Some(size) = batch_size {
+             builder = builder.with_batch_size(size);
+         }
+
+         let schema = builder.schema().clone();
+         let reader = builder.build()?;
+
+         Ok(ColumnIterator {
+             batch_reader: reader,
+             schema,
+             returned_empty_batch: false,
+             is_empty_file: is_empty,
+             _phantom: std::marker::PhantomData,
+         })
+     }
+ }
+
+ /// Iterator over rows in a Parquet file
+ pub struct RowIterator<R> {
+     batch_reader: ParquetRecordBatchReader,
+     current_batch: Option<RecordBatch>,
+     current_row: usize,
+     _phantom: std::marker::PhantomData<R>,
+ }
+
+ impl<R> Iterator for RowIterator<R>
+ where
+     R: parquet::file::reader::ChunkReader + 'static,
+ {
+     type Item = Result<Vec<ParquetValue>>;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         loop {
+             // If we have a current batch and haven't exhausted it
+             if let Some(ref batch) = self.current_batch {
+                 if self.current_row < batch.num_rows() {
+                     // Extract values from current row
+                     let mut row_values = Vec::with_capacity(batch.num_columns());
+
+                     for column in batch.columns() {
+                         let value = match arrow_to_parquet_value(column, self.current_row) {
+                             Ok(v) => v,
+                             Err(e) => return Some(Err(e)),
+                         };
+                         row_values.push(value);
+                     }
+
+                     self.current_row += 1;
+                     return Some(Ok(row_values));
+                 }
+             }
+
+             // Need to fetch next batch
+             match self.batch_reader.next() {
+                 Some(Ok(batch)) => {
+                     self.current_batch = Some(batch);
+                     self.current_row = 0;
+                 }
+                 Some(Err(e)) => return Some(Err(e.into())),
+                 None => return None,
+             }
+         }
+     }
+ }
+
+ /// Iterator over column batches in a Parquet file
+ pub struct ColumnIterator<R> {
+     batch_reader: ParquetRecordBatchReader,
+     schema: Arc<arrow_schema::Schema>,
+     returned_empty_batch: bool,
+     is_empty_file: bool,
+     _phantom: std::marker::PhantomData<R>,
+ }
+
+ /// A batch of columns with their names
+ pub struct ColumnBatch {
+     pub columns: Vec<(String, Vec<ParquetValue>)>,
+ }
+
+ impl<R> Iterator for ColumnIterator<R>
+ where
+     R: parquet::file::reader::ChunkReader + 'static,
+ {
+     type Item = Result<ColumnBatch>;
+
+     fn next(&mut self) -> Option<Self::Item> {
+         // Check if this is the first call and we have no data
+         if self.is_empty_file && !self.returned_empty_batch {
+             // Return one batch with empty columns to show schema
+             self.returned_empty_batch = true;
+             let mut columns = Vec::with_capacity(self.schema.fields().len());
+
+             for field in self.schema.fields() {
+                 columns.push((field.name().to_string(), Vec::new()));
+             }
+
+             return Some(Ok(ColumnBatch { columns }));
+         }
+
+         match self.batch_reader.next() {
+             Some(Ok(batch)) => {
+                 let mut columns = Vec::with_capacity(batch.num_columns());
+
+                 for (idx, column) in batch.columns().iter().enumerate() {
+                     let column_name = self.schema.field(idx).name().to_string();
+
+                     // Convert entire column to ParquetValues
+                     let mut values = Vec::with_capacity(column.len());
+                     for row_idx in 0..column.len() {
+                         match arrow_to_parquet_value(column, row_idx) {
+                             Ok(value) => values.push(value),
+                             Err(e) => return Some(Err(e)),
+                         }
+                     }
+
+                     columns.push((column_name, values));
+                 }
+
+                 Some(Ok(ColumnBatch { columns }))
+             }
+             Some(Err(e)) => Some(Err(e.into())),
+             None => None,
+         }
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+
+     #[test]
+     fn test_reader_creation() {
+         let data = vec![0u8; 1024];
+         let bytes = bytes::Bytes::from(data);
+         let _reader = Reader::new(bytes);
+     }
+ }
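
Based on the API above, driving the new reader from Rust could look roughly like the sketch below. The crate path `parquet_core`, the in-memory `bytes::Bytes` source, and the projected column name "name" are assumptions for illustration; the file itself is expected to have been written elsewhere.

    use parquet_core::{ParquetValue, Reader, Result};

    // Sketch: iterate rows of an already-written Parquet file held in memory.
    // bytes::Bytes implements parquet's ChunkReader, so it satisfies Reader's bound.
    fn print_projected_rows(data: bytes::Bytes) -> Result<()> {
        let reader = Reader::new(data);
        // Project a single (hypothetical) column; each yielded row is a Vec<ParquetValue>.
        let rows = reader.read_rows_with_projection(&["name".to_string()])?;
        for row in rows {
            let row: Vec<ParquetValue> = row?;
            println!("row with {} projected value(s)", row.len());
        }
        Ok(())
    }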