parquet 0.5.13 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +3 -0
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -605
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet/src/reader/unified/mod.rs
@@ -1,363 +0,0 @@
-use crate::header_cache::StringCache;
-use crate::logger::RubyLogger;
-use crate::types::TryIntoValue;
-use crate::{
-    create_column_enumerator, create_row_enumerator, ColumnEnumeratorArgs, ColumnRecord,
-    ParquetField, ParquetGemError, ParquetValueVec, ParserResultType, RowEnumeratorArgs, RowRecord,
-};
-use ahash::RandomState;
-use either::Either;
-use magnus::IntoValue;
-use magnus::{Error as MagnusError, Ruby, Value};
-use std::collections::HashMap;
-use std::rc::Rc;
-use std::sync::OnceLock;
-
-use super::common::{
-    create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
-};
-use crate::types::ArrayWrapper;
-
-/// A unified parser configuration that can be used for both row and column parsing
-pub enum ParserType {
-    Row {
-        strict: bool,
-    },
-    Column {
-        batch_size: Option<usize>,
-        strict: bool,
-    },
-}
-
-/// Unified parser arguments structure
-pub struct UnifiedParserArgs {
-    pub to_read: Value,
-    pub result_type: ParserResultType,
-    pub columns: Option<Vec<String>>,
-    pub parser_type: ParserType,
-    pub logger: Option<Value>,
-}
-
-/// Unified implementation for parsing Parquet data (both rows and columns)
-pub fn parse_parquet_unified(
-    ruby: Rc<Ruby>,
-    rb_self: Value,
-    args: UnifiedParserArgs,
-) -> Result<Value, ParquetGemError> {
-    let UnifiedParserArgs {
-        to_read,
-        result_type,
-        columns,
-        parser_type,
-        logger,
-    } = args;
-
-    // Initialize the logger if provided
-    let ruby_logger = RubyLogger::new(&ruby, logger)?;
-
-    // Clone values for the closure to avoid move issues
-    let columns_clone = columns.clone();
-
-    // Determine if we're handling rows or columns for enumerator creation
-    match &parser_type {
-        ParserType::Row { strict } => {
-            // Handle block or create row enumerator
-            if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-                create_row_enumerator(RowEnumeratorArgs {
-                    rb_self,
-                    to_read,
-                    result_type,
-                    columns: columns_clone,
-                    strict: *strict,
-                    logger,
-                })
-                .map(|yield_enum| yield_enum.into_value_with(&ruby))
-            })? {
-                return Ok(enum_value);
-            }
-        }
-        ParserType::Column { batch_size, strict } => {
-            // For column-based parsing, log the batch size if present
-            if let Some(ref bs) = batch_size {
-                ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
-            }
-
-            // Handle block or create column enumerator
-            if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-                create_column_enumerator(ColumnEnumeratorArgs {
-                    rb_self,
-                    to_read,
-                    result_type,
-                    columns: columns_clone,
-                    batch_size: *batch_size,
-                    strict: *strict,
-                    logger: logger.as_ref().map(|_| to_read),
-                })
-                .map(|yield_enum| yield_enum.into_value_with(&ruby))
-            })? {
-                return Ok(enum_value);
-            }
-        }
-    }
-
-    // Open the Parquet source
-    let source = open_parquet_source(ruby.clone(), to_read)?;
-
-    // Based on the parser type, handle the data differently
-    match parser_type {
-        ParserType::Row { strict } => {
-            // Handle row-based parsing
-            process_row_data(
-                ruby.clone(),
-                source,
-                &columns,
-                result_type,
-                strict,
-                &ruby_logger,
-            )?;
-        }
-        ParserType::Column { batch_size, strict } => {
-            // Handle column-based parsing
-            process_column_data(
-                ruby.clone(),
-                source,
-                &columns,
-                result_type,
-                batch_size,
-                strict,
-                &ruby_logger,
-            )?;
-        }
-    }
-
-    Ok(ruby.qnil().into_value_with(&ruby))
-}
-
-/// Process row-based Parquet data
-fn process_row_data(
-    ruby: Rc<Ruby>,
-    source: Either<std::fs::File, crate::ruby_reader::ThreadSafeRubyReader>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    use parquet::file::reader::{FileReader, SerializedFileReader};
-    use parquet::record::reader::RowIter as ParquetRowIter;
-
-    // Create the row-based reader
-    let reader: Box<dyn FileReader> = match source {
-        Either::Left(file) => {
-            Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
-        }
-        Either::Right(readable) => {
-            Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
-        }
-    };
-
-    let schema = reader.metadata().file_metadata().schema().clone();
-    ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
-
-    let mut iter = ParquetRowIter::from_file_into(reader);
-    if let Some(cols) = columns {
-        ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
-        let projection = create_projection_schema(&schema, cols);
-        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-            MagnusError::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to create projection: {}", e),
-            )
-        })?;
-    }
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = iter.map(move |row| {
-                row.map(|row| {
-                    let headers = headers_clone.get_or_init(|| {
-                        let column_count = row.get_column_iter().count();
-
-                        let mut header_string = Vec::with_capacity(column_count);
-                        for (k, _) in row.get_column_iter() {
-                            header_string.push(k.to_owned());
-                        }
-
-                        StringCache::intern_many(&header_string).expect("Failed to intern headers")
-                    });
-
-                    let mut map =
-                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                    for (i, ((_, v), t)) in
-                        row.get_column_iter().zip(schema.get_fields()).enumerate()
-                    {
-                        let type_info = t.get_basic_info();
-                        map.insert(
-                            headers[i],
-                            ParquetField {
-                                field: v.clone(),
-                                converted_type: type_info.converted_type(),
-                                logical_type: type_info.logical_type().clone(),
-                                strict,
-                            },
-                        );
-                    }
-                    map
-                })
-                .map(RowRecord::Map::<RandomState>)
-                .map_err(ParquetGemError::from)
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            let iter = iter.map(|row| {
-                row.map(|row| {
-                    let column_count = row.get_column_iter().count();
-                    let mut vec = Vec::with_capacity(column_count);
-                    for ((_, v), t) in row.get_column_iter().zip(schema.get_fields()) {
-                        let type_info = t.get_basic_info();
-                        vec.push(ParquetField {
-                            field: v.clone(),
-                            converted_type: type_info.converted_type(),
-                            logical_type: type_info.logical_type().clone(),
-                            strict,
-                        });
-                    }
-                    vec
-                })
-                .map(RowRecord::Vec::<RandomState>)
-                .map_err(ParquetGemError::from)
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process column-based Parquet data
-fn process_column_data(
-    ruby: Rc<Ruby>,
-    source: Either<std::fs::File, crate::ruby_reader::ThreadSafeRubyReader>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    batch_size: Option<usize>,
-    strict: bool,
-    _ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    // Create the batch reader
-    let (batch_reader, schema, num_rows) = match source {
-        Either::Left(file) => create_batch_reader(file, columns, batch_size)?,
-        Either::Right(readable) => create_batch_reader(readable, columns, batch_size)?,
-    };
-
-    match result_type {
-        ParserResultType::Hash => {
-            // For hash return type, we need to return a hash with column names pointing at empty arrays
-            if handle_empty_file(&ruby, &schema, num_rows)? {
-                return Ok(());
-            }
-
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = batch_reader.map(move |batch| {
-                batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                    let local_headers = headers_clone
-                        .get_or_init(|| {
-                            let schema = batch.schema();
-                            let fields = schema.fields();
-                            let mut header_string = Vec::with_capacity(fields.len());
-                            for field in fields {
-                                header_string.push(field.name().to_owned());
-                            }
-                            StringCache::intern_many(&header_string)
-                        })
-                        .as_ref()
-                        .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        local_headers.len(),
-                        RandomState::default(),
-                    );
-
-                    batch
-                        .columns()
-                        .iter()
-                        .enumerate()
-                        .try_for_each(|(i, column)| {
-                            let header = local_headers[i];
-                            let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array: column,
-                                strict,
-                            })?;
-                            map.insert(header, values.into_inner());
-                            Ok::<_, ParquetGemError>(())
-                        })?;
-
-                    Ok(ColumnRecord::Map::<RandomState>(map))
-                })
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            let iter = batch_reader.map(|batch| {
-                batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                    let vec = batch
-                        .columns()
-                        .iter()
-                        .map(|column| {
-                            let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array: column,
-                                strict,
-                            })?;
-                            Ok::<_, ParquetGemError>(values.into_inner())
-                        })
-                        .collect::<Result<Vec<_>, _>>()?;
-                    Ok(ColumnRecord::Vec::<RandomState>(vec))
-                })
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Helper function to create a projection schema
-fn create_projection_schema(
-    schema: &parquet::schema::types::Type,
-    columns: &[String],
-) -> parquet::schema::types::Type {
-    if let parquet::schema::types::Type::GroupType { fields, .. } = schema {
-        let projected_fields: Vec<std::sync::Arc<parquet::schema::types::Type>> = fields
-            .iter()
-            .filter(|field| columns.contains(&field.name().to_string()))
-            .cloned()
-            .collect();
-
-        parquet::schema::types::Type::GroupType {
-            basic_info: schema.get_basic_info().clone(),
-            fields: projected_fields,
-        }
-    } else {
-        // Return original schema if not a group type
-        schema.clone()
-    }
-}
data/ext/parquet/src/types/core_types.rs
@@ -1,120 +0,0 @@
-#[derive(Copy, Clone, Debug, PartialEq, Eq)]
-pub enum ParserResultType {
-    Hash,
-    Array,
-}
-
-impl ParserResultType {
-    pub fn iter() -> impl Iterator<Item = Self> {
-        [Self::Hash, Self::Array].into_iter()
-    }
-}
-
-impl TryFrom<&str> for ParserResultType {
-    type Error = String;
-
-    fn try_from(value: &str) -> Result<Self, Self::Error> {
-        match value {
-            "hash" => Ok(ParserResultType::Hash),
-            "array" => Ok(ParserResultType::Array),
-            _ => Err(format!("Invalid parser result type: {}", value)),
-        }
-    }
-}
-
-impl TryFrom<String> for ParserResultType {
-    type Error = String;
-
-    fn try_from(value: String) -> Result<Self, Self::Error> {
-        Self::try_from(value.as_str())
-    }
-}
-
-impl std::fmt::Display for ParserResultType {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            ParserResultType::Hash => write!(f, "hash"),
-            ParserResultType::Array => write!(f, "array"),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct ListField<'a> {
-    pub item_type: ParquetSchemaType<'a>,
-    pub format: Option<&'a str>,
-    pub nullable: bool,
-}
-
-#[derive(Debug, Clone)]
-pub struct MapField<'a> {
-    pub key_type: ParquetSchemaType<'a>,
-    pub value_type: ParquetSchemaType<'a>,
-    pub key_format: Option<&'a str>,
-    pub value_format: Option<&'a str>,
-    pub value_nullable: bool,
-}
-
-#[derive(Debug, Clone)]
-pub struct StructField<'a> {
-    pub fields: Vec<super::writer_types::SchemaField<'a>>,
-}
-
-#[derive(Clone, Debug)]
-pub enum ParquetSchemaType<'a> {
-    Primitive(PrimitiveType),
-    List(Box<ListField<'a>>),
-    Map(Box<MapField<'a>>),
-    Struct(Box<StructField<'a>>),
-}
-
-// New schema representation for the DSL-based approach
-#[derive(Debug, Clone)]
-pub enum SchemaNode {
-    Struct {
-        name: String,
-        nullable: bool,
-        fields: Vec<SchemaNode>,
-    },
-    List {
-        name: String,
-        nullable: bool,
-        item: Box<SchemaNode>,
-    },
-    Map {
-        name: String,
-        nullable: bool,
-        key: Box<SchemaNode>,
-        value: Box<SchemaNode>,
-    },
-    Primitive {
-        name: String,
-        parquet_type: PrimitiveType,
-        nullable: bool,
-        format: Option<String>,
-    },
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub enum PrimitiveType {
-    Int8,
-    Int16,
-    Int32,
-    Int64,
-    UInt8,
-    UInt16,
-    UInt32,
-    UInt64,
-    Float32,
-    Float64,
-    Decimal128(u8, i8),
-    Decimal256(u8, i8),
-    Boolean,
-    String,
-    Binary,
-    Date32,
-    TimestampMillis,
-    TimestampMicros,
-    TimeMillis,
-    TimeMicros,
-}
data/ext/parquet/src/types/mod.rs
@@ -1,100 +0,0 @@
-// Re-export all public items from submodules
-mod core_types;
-mod parquet_value;
-mod record_types;
-pub mod schema_converter;
-pub mod schema_node;
-mod timestamp;
-pub mod type_conversion;
-mod writer_types;
-
-pub use core_types::*;
-pub use parquet_value::*;
-pub use record_types::*;
-// Explicitly export schema-related items
-pub use schema_converter::{
-    infer_schema_from_first_row, legacy_schema_to_dsl, parse_legacy_schema,
-};
-pub use schema_node::parse_schema_node;
-pub use timestamp::*;
-pub use type_conversion::*;
-pub use writer_types::*;
-
-// Common imports used across the module
-use arrow_array::cast::downcast_array;
-use arrow_array::{
-    Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
-    Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
-    ListArray, NullArray, StringArray, StructArray, Time32MillisecondArray, Time64MicrosecondArray,
-    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
-    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
-};
-use arrow_schema::{DataType, TimeUnit};
-use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
-use parquet::data_type::Decimal;
-use parquet::record::Field;
-use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
-
-use crate::header_cache::StringCacheKey;
-
-use crate::header_cache::CacheError;
-
-use std::io;
-
-use thiserror::Error;
-
-#[derive(Error, Debug)]
-pub enum ParquetGemError {
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(#[from] MagnusErrorWrapper),
-    #[error("Parquet error: {0}")]
-    Parquet(#[from] parquet::errors::ParquetError),
-    #[error("Arrow error: {0}")]
-    Arrow(#[from] arrow_schema::ArrowError),
-    #[error("UTF-8 error: {0}")]
-    Utf8Error(#[from] simdutf8::basic::Utf8Error),
-    #[error("Jiff error: {0}")]
-    Jiff(#[from] jiff::Error),
-    #[error("Failed to cast slice to array: {0}")]
-    InvalidDecimal(String),
-    #[error("Failed to parse UUID: {0}")]
-    UuidError(#[from] uuid::Error),
-    #[error("Decimals larger than 128 bits are not supported")]
-    DecimalWouldBeTruncated,
-}
-
-#[derive(Debug)]
-pub struct MagnusErrorWrapper(pub MagnusError);
-
-impl From<MagnusError> for MagnusErrorWrapper {
-    fn from(err: MagnusError) -> Self {
-        Self(err)
-    }
-}
-
-impl std::fmt::Display for MagnusErrorWrapper {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
-
-impl std::error::Error for MagnusErrorWrapper {}
-
-impl From<MagnusError> for ParquetGemError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(MagnusErrorWrapper(err))
-    }
-}
-
-impl From<ParquetGemError> for MagnusError {
-    fn from(val: ParquetGemError) -> Self {
-        match val {
-            ParquetGemError::Ruby(MagnusErrorWrapper(err)) => err,
-            _ => MagnusError::new(magnus::exception::runtime_error(), val.to_string()),
-        }
-    }
-}