parquet 0.5.4 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 936feb49be7a1bbbb36236551480ae0522d6b52443e76b4ebb7502abdb9d2903
- data.tar.gz: bcc56665ec0cd132e22c262373e7b1294e085be364c93efbd214e434ada7dcb6
+ metadata.gz: e2295ee94fe35758ae8e5137070e2206ec1e104aad6b9a0806aa508ad4799247
+ data.tar.gz: 340f86257082bdba22d6ced530ecd1d201c7b4e6d9116eebac41541ba2aaa257
  SHA512:
- metadata.gz: 7856d7f36820a8384faf564f166d39e0daca1c9d15457b6f6aae8ff56f4176a8b1302bfbc2cc5edcfedfcb0805cbe71029f5712e716a29dc4942a1e6453a3e5e
- data.tar.gz: '08d1f4cfe357b22bad4c4fab4ddd4fa93069b13c65559d668fb704e2f7d8884fc8f081270e4dc43a5db60aab7147be36bfe7d26945f93c9ad6e9badbd0ad957e'
+ metadata.gz: f333ae2914cdd00468c390e8b3d876aec4e522a546d43ab29db5d777792105a38d2a40c49db0f0afe1e800bf32e54bb4c479441f8f9876937ba59917b444d15a
+ data.tar.gz: da2832c3514729cc0e99e16f70a10bbfc4e9093dc734de55715305121649ebc371dff93a7bb462b97fde27c79ad65cec12c5fa90a47f70bc64153a7fd2ce1a5c
ext/parquet/src/reader/mod.rs CHANGED
@@ -1,6 +1,7 @@
  mod common;
  mod parquet_column_reader;
  mod parquet_row_reader;
+ mod unified;
  use std::{fs::File, rc::Rc};

  use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -207,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
      let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;

      Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
- }
+ }
ext/parquet/src/reader/parquet_column_reader.rs CHANGED
@@ -1,21 +1,9 @@
- use crate::header_cache::StringCache;
- use crate::logger::RubyLogger;
- use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
- use crate::{
-     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
-     ParserResultType,
- };
- use ahash::RandomState;
- use either::Either;
- use magnus::IntoValue;
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+ use crate::utils::*;
+ use crate::ParquetGemError;
+
  use magnus::{Error as MagnusError, Ruby, Value};
- use std::collections::HashMap;
  use std::rc::Rc;
- use std::sync::OnceLock;
-
- use super::common::{
-     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
- };

  #[inline]
  pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,116 +29,16 @@ fn parse_parquet_columns_impl(
          logger,
      } = parse_parquet_columns_args(&ruby, args)?;

-     // Initialize the logger if provided
-     let ruby_logger = RubyLogger::new(&ruby, logger)?;
-     if let Some(ref bs) = batch_size {
-         ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
-     }
-
-     // Clone values for the closure to avoid move issues
-     let columns_clone = columns.clone();
-
-     // Handle block or create enumerator
-     if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-         create_column_enumerator(ColumnEnumeratorArgs {
-             rb_self,
+     // Use the unified parsing implementation
+     parse_parquet_unified(
+         ruby,
+         rb_self,
+         UnifiedParserArgs {
              to_read,
              result_type,
-             columns: columns_clone,
-             batch_size,
-             strict,
-             logger: logger.as_ref().map(|_| to_read),
-         })
-         .map(|yield_enum| yield_enum.into_value_with(&ruby))
-     })? {
-         return Ok(enum_value);
-     }
-
-     let source = open_parquet_source(ruby.clone(), to_read)?;
-
-     // Use the common function to create the batch reader
-
-     let (batch_reader, schema, num_rows) = match source {
-         Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
-         Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
-     };
-
-     match result_type {
-         ParserResultType::Hash => {
-             // For hash return type, we need to return a hash with column names pointing at empty arrays
-             if handle_empty_file(&ruby, &schema, num_rows)? {
-                 return Ok(ruby.qnil().into_value_with(&ruby));
-             }
-
-             let headers = OnceLock::new();
-             let headers_clone = headers.clone();
-             let iter = batch_reader.map(move |batch| {
-                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                     let local_headers = headers_clone
-                         .get_or_init(|| {
-                             let schema = batch.schema();
-                             let fields = schema.fields();
-                             let mut header_string = Vec::with_capacity(fields.len());
-                             for field in fields {
-                                 header_string.push(field.name().to_owned());
-                             }
-                             StringCache::intern_many(&header_string)
-                         })
-                         .as_ref()
-                         .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                     let mut map = HashMap::with_capacity_and_hasher(
-                         local_headers.len(),
-                         RandomState::default(),
-                     );
-
-                     batch
-                         .columns()
-                         .iter()
-                         .enumerate()
-                         .try_for_each(|(i, column)| {
-                             let header = local_headers[i];
-                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: column,
-                                 strict,
-                             })?;
-                             map.insert(header, values.into_inner());
-                             Ok::<_, ParquetGemError>(())
-                         })?;
-
-                     Ok(ColumnRecord::Map::<RandomState>(map))
-                 })
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-         ParserResultType::Array => {
-             let iter = batch_reader.map(|batch| {
-                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                     let vec = batch
-                         .columns()
-                         .iter()
-                         .map(|column| {
-                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: column,
-                                 strict,
-                             })?;
-                             Ok::<_, ParquetGemError>(values.into_inner())
-                         })
-                         .collect::<Result<Vec<_>, _>>()?;
-                     Ok(ColumnRecord::Vec::<RandomState>(vec))
-                 })
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-     }
-
-     Ok(ruby.qnil().into_value_with(&ruby))
- }
+             columns,
+             parser_type: ParserType::Column { batch_size, strict },
+             logger,
+         },
+     )
+ }
ext/parquet/src/reader/parquet_row_reader.rs CHANGED
@@ -1,22 +1,9 @@
- use crate::header_cache::StringCache;
- use crate::logger::RubyLogger;
- use crate::types::TryIntoValue;
- use crate::{
-     create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
-     RowEnumeratorArgs, RowRecord,
- };
- use ahash::RandomState;
- use either::Either;
- use magnus::IntoValue;
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+ use crate::utils::*;
+ use crate::ParquetGemError;
+
  use magnus::{Error as MagnusError, Ruby, Value};
- use parquet::file::reader::{FileReader, SerializedFileReader};
- use parquet::record::reader::RowIter as ParquetRowIter;
- use parquet::schema::types::{Type as SchemaType, TypePtr};
- use std::collections::HashMap;
  use std::rc::Rc;
- use std::sync::OnceLock;
-
- use super::common::{handle_block_or_enum, open_parquet_source};

  #[inline]
  pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,123 +28,16 @@ fn parse_parquet_rows_impl(
          logger,
      } = parse_parquet_rows_args(&ruby, args)?;

-     // Initialize the logger if provided
-     let ruby_logger = RubyLogger::new(&ruby, logger)?;
-
-     // Clone values for the closure to avoid move issues
-     let columns_clone = columns.clone();
-
-     // Handle block or create enumerator
-     if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-         create_row_enumerator(RowEnumeratorArgs {
-             rb_self,
+     // Use the unified parsing implementation
+     parse_parquet_unified(
+         ruby,
+         rb_self,
+         UnifiedParserArgs {
              to_read,
              result_type,
-             columns: columns_clone,
-             strict,
+             columns,
+             parser_type: ParserType::Row { strict },
              logger,
-         })
-         .map(|yield_enum| yield_enum.into_value_with(&ruby))
-     })? {
-         return Ok(enum_value);
-     }
-
-     let source = open_parquet_source(ruby.clone(), to_read)?;
-     let reader: Box<dyn FileReader> = match source {
-         Either::Left(file) => {
-             Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
-         }
-         Either::Right(readable) => {
-             Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
-         }
-     };
-
-     let schema = reader.metadata().file_metadata().schema().clone();
-     ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
-
-     let mut iter = ParquetRowIter::from_file_into(reader);
-     if let Some(cols) = columns {
-         ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
-         let projection = create_projection_schema(&schema, &cols);
-         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-             MagnusError::new(
-                 ruby.exception_runtime_error(),
-                 format!("Failed to create projection: {}", e),
-             )
-         })?;
-     }
-
-     match result_type {
-         ParserResultType::Hash => {
-             let headers = OnceLock::new();
-             let headers_clone = headers.clone();
-             let iter = iter.map(move |row| {
-                 row.map(|row| {
-                     let headers = headers_clone.get_or_init(|| {
-                         let column_count = row.get_column_iter().count();
-
-                         let mut header_string = Vec::with_capacity(column_count);
-                         for (k, _) in row.get_column_iter() {
-                             header_string.push(k.to_owned());
-                         }
-
-                         StringCache::intern_many(&header_string).expect("Failed to intern headers")
-                     });
-
-                     let mut map =
-                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                     for (i, (_, v)) in row.get_column_iter().enumerate() {
-                         map.insert(headers[i], ParquetField(v.clone(), strict));
-                     }
-                     map
-                 })
-                 .map(RowRecord::Map::<RandomState>)
-                 .map_err(ParquetGemError::from)
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-         ParserResultType::Array => {
-             let iter = iter.map(|row| {
-                 row.map(|row| {
-                     let column_count = row.get_column_iter().count();
-                     let mut vec = Vec::with_capacity(column_count);
-                     for (_, v) in row.get_column_iter() {
-                         vec.push(ParquetField(v.clone(), strict));
-                     }
-                     vec
-                 })
-                 .map(RowRecord::Vec::<RandomState>)
-                 .map_err(ParquetGemError::from)
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-     }
-
-     Ok(ruby.qnil().into_value_with(&ruby))
- }
-
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-     if let SchemaType::GroupType { fields, .. } = schema {
-         let projected_fields: Vec<TypePtr> = fields
-             .iter()
-             .filter(|field| columns.contains(&field.name().to_string()))
-             .cloned()
-             .collect();
-
-         SchemaType::GroupType {
-             basic_info: schema.get_basic_info().clone(),
-             fields: projected_fields,
-         }
-     } else {
-         // Return original schema if not a group type
-         schema.clone()
-     }
- }
+         },
+     )
+ }
ext/parquet/src/reader/unified/mod.rs ADDED
@@ -0,0 +1,328 @@
+ use crate::header_cache::StringCache;
+ use crate::logger::RubyLogger;
+ use crate::types::TryIntoValue;
+ use crate::{
+     create_column_enumerator, create_row_enumerator, ParquetField, ParquetGemError,
+     ParserResultType, ColumnEnumeratorArgs, RowEnumeratorArgs, RowRecord, ColumnRecord, ParquetValueVec,
+ };
+ use ahash::RandomState;
+ use either::Either;
+ use magnus::IntoValue;
+ use magnus::{Error as MagnusError, Ruby, Value};
+ use std::collections::HashMap;
+ use std::rc::Rc;
+ use std::sync::OnceLock;
+
+ use crate::types::ArrayWrapper;
+ use super::common::{
+     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
+ };
+
+ /// A unified parser configuration that can be used for both row and column parsing
+ pub enum ParserType {
+     Row {
+         strict: bool,
+     },
+     Column {
+         batch_size: Option<usize>,
+         strict: bool,
+     },
+ }
+
+ /// Unified parser arguments structure
+ pub struct UnifiedParserArgs {
+     pub to_read: Value,
+     pub result_type: ParserResultType,
+     pub columns: Option<Vec<String>>,
+     pub parser_type: ParserType,
+     pub logger: Option<Value>,
+ }
+
+ /// Unified implementation for parsing Parquet data (both rows and columns)
+ pub fn parse_parquet_unified(
+     ruby: Rc<Ruby>,
+     rb_self: Value,
+     args: UnifiedParserArgs,
+ ) -> Result<Value, ParquetGemError> {
+     let UnifiedParserArgs {
+         to_read,
+         result_type,
+         columns,
+         parser_type,
+         logger,
+     } = args;
+
+     // Initialize the logger if provided
+     let ruby_logger = RubyLogger::new(&ruby, logger.clone())?;
+
+     // Clone values for the closure to avoid move issues
+     let columns_clone = columns.clone();
+
+     // Determine if we're handling rows or columns for enumerator creation
+     match &parser_type {
+         ParserType::Row { strict } => {
+             // Handle block or create row enumerator
+             if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
+                 create_row_enumerator(RowEnumeratorArgs {
+                     rb_self,
+                     to_read,
+                     result_type,
+                     columns: columns_clone,
+                     strict: *strict,
+                     logger,
+                 })
+                 .map(|yield_enum| yield_enum.into_value_with(&ruby))
+             })? {
+                 return Ok(enum_value);
+             }
+         },
+         ParserType::Column { batch_size, strict } => {
+             // For column-based parsing, log the batch size if present
+             if let Some(ref bs) = batch_size {
+                 ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
+             }
+
+             // Handle block or create column enumerator
+             if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
+                 create_column_enumerator(ColumnEnumeratorArgs {
+                     rb_self,
+                     to_read,
+                     result_type,
+                     columns: columns_clone,
+                     batch_size: *batch_size,
+                     strict: *strict,
+                     logger: logger.as_ref().map(|_| to_read),
+                 })
+                 .map(|yield_enum| yield_enum.into_value_with(&ruby))
+             })? {
+                 return Ok(enum_value);
+             }
+         }
+     }
+
+     // Open the Parquet source
+     let source = open_parquet_source(ruby.clone(), to_read)?;
+
+     // Based on the parser type, handle the data differently
+     match parser_type {
+         ParserType::Row { strict } => {
+             // Handle row-based parsing
+             process_row_data(ruby.clone(), source, &columns, result_type, strict, &ruby_logger)?;
+         },
+         ParserType::Column { batch_size, strict } => {
+             // Handle column-based parsing
+             process_column_data(ruby.clone(), source, &columns, result_type, batch_size, strict, &ruby_logger)?;
+         }
+     }
+
+     Ok(ruby.qnil().into_value_with(&ruby))
+ }
+
+ /// Process row-based Parquet data
+ fn process_row_data(
+     ruby: Rc<Ruby>,
+     source: Either<std::fs::File, crate::ruby_reader::ThreadSafeRubyReader>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     strict: bool,
+     ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     use parquet::file::reader::{FileReader, SerializedFileReader};
+     use parquet::record::reader::RowIter as ParquetRowIter;
+
+     // Create the row-based reader
+     let reader: Box<dyn FileReader> = match source {
+         Either::Left(file) => {
+             Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
+         }
+         Either::Right(readable) => {
+             Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
+         }
+     };
+
+     let schema = reader.metadata().file_metadata().schema().clone();
+     ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
+
+     let mut iter = ParquetRowIter::from_file_into(reader);
+     if let Some(cols) = columns {
+         ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
+         let projection = create_projection_schema(&schema, cols);
+         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+             MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 format!("Failed to create projection: {}", e),
+             )
+         })?;
+     }
+
+     match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+             let headers_clone = headers.clone();
+             let iter = iter.map(move |row| {
+                 row.map(|row| {
+                     let headers = headers_clone.get_or_init(|| {
+                         let column_count = row.get_column_iter().count();
+
+                         let mut header_string = Vec::with_capacity(column_count);
+                         for (k, _) in row.get_column_iter() {
+                             header_string.push(k.to_owned());
+                         }
+
+                         StringCache::intern_many(&header_string).expect("Failed to intern headers")
+                     });
+
+                     let mut map =
+                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
+                     for (i, (_, v)) in row.get_column_iter().enumerate() {
+                         map.insert(headers[i], ParquetField(v.clone(), strict));
+                     }
+                     map
+                 })
+                 .map(RowRecord::Map::<RandomState>)
+                 .map_err(ParquetGemError::from)
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+         ParserResultType::Array => {
+             let iter = iter.map(|row| {
+                 row.map(|row| {
+                     let column_count = row.get_column_iter().count();
+                     let mut vec = Vec::with_capacity(column_count);
+                     for (_, v) in row.get_column_iter() {
+                         vec.push(ParquetField(v.clone(), strict));
+                     }
+                     vec
+                 })
+                 .map(RowRecord::Vec::<RandomState>)
+                 .map_err(ParquetGemError::from)
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Process column-based Parquet data
+ fn process_column_data(
+     ruby: Rc<Ruby>,
+     source: Either<std::fs::File, crate::ruby_reader::ThreadSafeRubyReader>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     batch_size: Option<usize>,
+     strict: bool,
+     _ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     // Create the batch reader
+     let (batch_reader, schema, num_rows) = match source {
+         Either::Left(file) => create_batch_reader(file, columns, batch_size)?,
+         Either::Right(readable) => create_batch_reader(readable, columns, batch_size)?,
+     };
+
+     match result_type {
+         ParserResultType::Hash => {
+             // For hash return type, we need to return a hash with column names pointing at empty arrays
+             if handle_empty_file(&ruby, &schema, num_rows)? {
+                 return Ok(());
+             }
+
+             let headers = OnceLock::new();
+             let headers_clone = headers.clone();
+             let iter = batch_reader.map(move |batch| {
+                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
+                     let local_headers = headers_clone
+                         .get_or_init(|| {
+                             let schema = batch.schema();
+                             let fields = schema.fields();
+                             let mut header_string = Vec::with_capacity(fields.len());
+                             for field in fields {
+                                 header_string.push(field.name().to_owned());
+                             }
+                             StringCache::intern_many(&header_string)
+                         })
+                         .as_ref()
+                         .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                     let mut map = HashMap::with_capacity_and_hasher(
+                         local_headers.len(),
+                         RandomState::default(),
+                     );
+
+                     batch
+                         .columns()
+                         .iter()
+                         .enumerate()
+                         .try_for_each(|(i, column)| {
+                             let header = local_headers[i];
+                             let values = ParquetValueVec::try_from(ArrayWrapper {
+                                 array: column,
+                                 strict,
+                             })?;
+                             map.insert(header, values.into_inner());
+                             Ok::<_, ParquetGemError>(())
+                         })?;
+
+                     Ok(ColumnRecord::Map::<RandomState>(map))
+                 })
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+         ParserResultType::Array => {
+             let iter = batch_reader.map(|batch| {
+                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
+                     let vec = batch
+                         .columns()
+                         .iter()
+                         .map(|column| {
+                             let values = ParquetValueVec::try_from(ArrayWrapper {
+                                 array: column,
+                                 strict,
+                             })?;
+                             Ok::<_, ParquetGemError>(values.into_inner())
+                         })
+                         .collect::<Result<Vec<_>, _>>()?;
+                     Ok(ColumnRecord::Vec::<RandomState>(vec))
+                 })
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Helper function to create a projection schema
+ fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[String]) -> parquet::schema::types::Type {
+     if let parquet::schema::types::Type::GroupType { fields, .. } = schema {
+         let projected_fields: Vec<std::sync::Arc<parquet::schema::types::Type>> = fields
+             .iter()
+             .filter(|field| columns.contains(&field.name().to_string()))
+             .cloned()
+             .collect();
+
+         parquet::schema::types::Type::GroupType {
+             basic_info: schema.get_basic_info().clone(),
+             fields: projected_fields,
+         }
+     } else {
+         // Return original schema if not a group type
+         schema.clone()
+     }
+ }
ext/parquet/src/types/parquet_value.rs CHANGED
@@ -1,6 +1,7 @@
  use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};

  use super::*;
+ use super::record_types::format_decimal_with_i8_scale;
  use arrow_array::MapArray;
  use magnus::{RArray, RString};

@@ -22,7 +23,7 @@ pub enum ParquetValue {
      Bytes(Vec<u8>),
      Date32(i32),
      Date64(i64),
-     Decimal128(i128),
+     Decimal128(i128, i8),
      TimestampSecond(i64, Option<Arc<str>>),
      TimestampMillis(i64, Option<Arc<str>>),
      TimestampMicros(i64, Option<Arc<str>>),
@@ -52,7 +53,47 @@ impl PartialEq for ParquetValue {
          (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
          (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
          (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
-         (ParquetValue::Decimal128(a), ParquetValue::Decimal128(b)) => a == b,
+         (ParquetValue::Decimal128(a, scale_a), ParquetValue::Decimal128(b, scale_b)) => {
+             if scale_a == scale_b {
+                 // Same scale, compare directly
+                 a == b
+             } else {
+                 // Different scales, need to adjust for proper comparison
+                 let mut a_val = *a;
+                 let mut b_val = *b;
+
+                 // Adjust to the same scale for proper comparison
+                 if scale_a < scale_b {
+                     // Scale up a to match b's scale
+                     let scale_diff = (*scale_b - *scale_a) as u32;
+                     if scale_diff <= 38 {
+                         // Limit to avoid overflow
+                         a_val *= 10_i128.pow(scale_diff);
+                     } else {
+                         // For large scale differences, use BigInt for the comparison
+                         let a_big = num::BigInt::from(*a)
+                             * num::BigInt::from(10_i128.pow(scale_diff.min(38)));
+                         let b_big = num::BigInt::from(*b);
+                         return a_big == b_big;
+                     }
+                 } else {
+                     // Scale up b to match a's scale
+                     let scale_diff = (*scale_a - *scale_b) as u32;
+                     if scale_diff <= 38 {
+                         // Limit to avoid overflow
+                         b_val *= 10_i128.pow(scale_diff);
+                     } else {
+                         // For large scale differences, use BigInt for the comparison
+                         let a_big = num::BigInt::from(*a);
+                         let b_big = num::BigInt::from(*b)
+                             * num::BigInt::from(10_i128.pow(scale_diff.min(38)));
+                         return a_big == b_big;
+                     }
+                 }
+
+                 a_val == b_val
+             }
+         }
          (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
          (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
          (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
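
The equality above rescales whichever operand has the smaller scale before comparing, falling back to arbitrary precision when the factor would overflow i128. A minimal standalone sketch of the same rule (the decimals_equal helper is hypothetical, not part of the gem):

    fn decimals_equal(a: i128, scale_a: i8, b: i128, scale_b: i8) -> bool {
        if scale_a == scale_b {
            return a == b; // same scale: compare unscaled values directly
        }
        // Rescale the smaller-scale value up to the larger scale; checked
        // arithmetic stands in for the BigInt fallback used above.
        let diff = (i32::from(scale_a) - i32::from(scale_b)).unsigned_abs();
        match 10_i128.checked_pow(diff) {
            Some(factor) if scale_a < scale_b => a.checked_mul(factor) == Some(b),
            Some(factor) => b.checked_mul(factor) == Some(a),
            None => a == 0 && b == 0, // factor exceeds i128: only zeros can match
        }
    }

    fn main() {
        assert!(decimals_equal(123, 2, 1_230, 3)); // 1.23 == 1.230
        assert!(!decimals_equal(123, 2, 123, 3));  // 1.23 != 0.123
    }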
@@ -85,7 +126,10 @@ impl std::hash::Hash for ParquetValue {
          ParquetValue::Bytes(b) => b.hash(state),
          ParquetValue::Date32(d) => d.hash(state),
          ParquetValue::Date64(d) => d.hash(state),
-         ParquetValue::Decimal128(d) => d.hash(state),
+         ParquetValue::Decimal128(d, scale) => {
+             d.hash(state);
+             scale.hash(state);
+         }
          ParquetValue::TimestampSecond(ts, tz) => {
              ts.hash(state);
              tz.hash(state);
@@ -131,7 +175,16 @@ impl TryIntoValue for ParquetValue {
          ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
          ParquetValue::String(s) => Ok(s.into_value_with(handle)),
          ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
-         ParquetValue::Decimal128(d) => Ok(d.to_string().into_value_with(handle)),
+         ParquetValue::Decimal128(d, scale) => {
+             // Load the bigdecimal gem if it's not already loaded
+             LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
+
+             // Format with proper scaling based on the sign of scale
+             let value = format_decimal_with_i8_scale(d, scale);
+
+             let kernel = handle.module_kernel();
+             Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
+         }
          ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
          ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
          timestamp @ ParquetValue::TimestampSecond(_, _) => {
@@ -375,7 +428,7 @@ impl ParquetValue {
  /// Unified helper to parse a decimal string and apply scaling
  fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, MagnusError> {
      let s = input_str.trim();
-
+
      // 1. Handle scientific notation case (e.g., "0.12345e3")
      if let Some(e_pos) = s.to_lowercase().find('e') {
          let base = &s[0..e_pos];
@@ -385,7 +438,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let exp_val = exp.parse::<i32>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse exponent '{}' in decimal string '{}': {}", exp, s, e),
+                 format!(
+                     "Failed to parse exponent '{}' in decimal string '{}': {}",
+                     exp, s, e
+                 ),
              )
          })?;

@@ -407,7 +463,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let base_val = base_without_point.parse::<i128>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse base '{}' in scientific notation '{}': {}", base, s, e),
+                 format!(
+                     "Failed to parse base '{}' in scientific notation '{}': {}",
+                     base, s, e
+                 ),
              )
          })?;

@@ -417,7 +476,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let base_val = base.parse::<i128>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse base '{}' in scientific notation '{}': {}", base, s, e),
+                 format!(
+                     "Failed to parse base '{}' in scientific notation '{}': {}",
+                     base, s, e
+                 ),
              )
          })?;

@@ -466,7 +528,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let v = s_without_point.parse::<i128>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse decimal string '{}' (without decimal point: '{}'): {}", s, s_without_point, e),
+                 format!(
+                     "Failed to parse decimal string '{}' (without decimal point: '{}'): {}",
+                     s, s_without_point, e
+                 ),
              )
          })?;

@@ -497,7 +562,7 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
              }
              std::cmp::Ordering::Equal => Ok(v),
          }
-     }
+     }
      // 3. Plain integer value (e.g., "12345")
      else {
          // No decimal point, parse as i128 and scale appropriately
@@ -512,12 +577,18 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          if input_scale > 38 {
              return Err(MagnusError::new(
                  magnus::exception::range_error(),
-                 format!("Scale {} is too large for decimal value '{}'. Must be ≤ 38.", input_scale, s),
+                 format!(
+                     "Scale {} is too large for decimal value '{}'. Must be ≤ 38.",
+                     input_scale, s
+                 ),
              ));
          } else if input_scale < -38 {
              return Err(MagnusError::new(
                  magnus::exception::range_error(),
-                 format!("Scale {} is too small for decimal value '{}'. Must be ≥ -38.", input_scale, s),
+                 format!(
+                     "Scale {} is too small for decimal value '{}'. Must be ≥ -38.",
+                     input_scale, s
+                 ),
              ));
          }

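The branches above all reduce an input string to a single unscaled i128 at the requested scale. A simplified model of that contract, assuming the digits fit in i128 and that excess fractional digits are truncated (the unscaled/frac_digits names are illustrative, not from the gem):

    fn unscaled(digits: i128, frac_digits: i8, input_scale: i8) -> i128 {
        // `digits` is the number with its decimal point removed; the result is
        // the integer v such that v / 10^input_scale equals the input value.
        let shift = i32::from(input_scale) - i32::from(frac_digits);
        if shift >= 0 {
            digits * 10_i128.pow(shift as u32) // pad with zeros
        } else {
            digits / 10_i128.pow((-shift) as u32) // drop excess digits (assumed)
        }
    }

    fn main() {
        assert_eq!(unscaled(12345, 2, 2), 12345); // "123.45" at scale 2
        assert_eq!(unscaled(123, 0, 2), 12300);   // "123" at scale 2 -> 123.00
    }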
@@ -540,14 +611,17 @@ fn convert_to_decimal128(value: Value, scale: i8) -> Result<ParquetValue, Magnus
      } else {
          value.to_r_string()?.to_string()?
      };
-
+
      // Use our unified parser to convert the string to a decimal value with scaling
      match parse_decimal_string(&s, scale) {
-         Ok(decimal_value) => Ok(ParquetValue::Decimal128(decimal_value)),
+         Ok(decimal_value) => Ok(ParquetValue::Decimal128(decimal_value, scale)),
          Err(e) => Err(MagnusError::new(
              magnus::exception::type_error(),
-             format!("Failed to convert '{}' to decimal with scale {}: {}", s, scale, e),
-         ))
+             format!(
+                 "Failed to convert '{}' to decimal with scale {}: {}",
+                 s, scale, e
+             ),
+         )),
      }
  }

ext/parquet/src/types/record_types.rs CHANGED
@@ -5,7 +5,31 @@ use parquet::data_type::AsBytes;

  use super::*;

- static LOADED_BIGDECIMAL: OnceLock<bool> = OnceLock::new();
+ pub static LOADED_BIGDECIMAL: OnceLock<bool> = OnceLock::new();
+
+ /// Format decimal value with appropriate scale for BigDecimal conversion
+ /// Handles positive and negative scales correctly for i8 scale
+ pub fn format_decimal_with_i8_scale<T: std::fmt::Display>(value: T, scale: i8) -> String {
+     if scale >= 0 {
+         // Positive scale means divide (move decimal point left)
+         format!("{}e-{}", value, scale)
+     } else {
+         // Negative scale means multiply (move decimal point right)
+         format!("{}e{}", value, -scale)
+     }
+ }
+
+ /// Format decimal value with appropriate scale for BigDecimal conversion
+ /// Handles positive and negative scales correctly for i32 scale
+ pub fn format_decimal_with_i32_scale<T: std::fmt::Display>(value: T, scale: i32) -> String {
+     if scale >= 0 {
+         // Positive scale means divide (move decimal point left)
+         format!("{}e-{}", value, scale)
+     } else {
+         // Negative scale means multiply (move decimal point right)
+         format!("{}e{}", value, -scale)
+     }
+ }

  #[derive(Debug)]
  pub enum RowRecord<S: BuildHasher + Default> {
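
Both helpers lean on BigDecimal accepting exponent notation, so no decimal-point arithmetic is needed on the Rust side. A self-contained check of the formatting rule (assertions are illustrative, not from the gem's test suite):

    fn format_decimal_with_i8_scale<T: std::fmt::Display>(value: T, scale: i8) -> String {
        if scale >= 0 {
            format!("{}e-{}", value, scale) // point moves left: (12345, 2) -> 123.45
        } else {
            format!("{}e{}", value, -scale) // point moves right: (12345, -2) -> 1234500
        }
    }

    fn main() {
        assert_eq!(format_decimal_with_i8_scale(12345_i128, 2), "12345e-2");
        assert_eq!(format_decimal_with_i8_scale(12345_i128, -2), "12345e2");
        // Ruby side: BigDecimal("12345e-2") == 123.45, BigDecimal("12345e2") == 1234500
    }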
@@ -207,17 +231,17 @@ impl TryIntoValue for ParquetField {
              let value = match d {
                  Decimal::Int32 { value, scale, .. } => {
                      let unscaled = i32::from_be_bytes(value);
-                     format!("{}e-{}", unscaled, scale)
+                     format_decimal_with_i32_scale(unscaled, scale)
                  }
                  Decimal::Int64 { value, scale, .. } => {
                      let unscaled = i64::from_be_bytes(value);
-                     format!("{}e-{}", unscaled, scale)
+                     format_decimal_with_i32_scale(unscaled, scale)
                  }
                  Decimal::Bytes { value, scale, .. } => {
                      // value is a byte array containing the bytes for an i128 value in big endian order
                      let casted = value.as_bytes()[..16].try_into()?;
                      let unscaled = i128::from_be_bytes(casted);
-                     format!("{}e-{}", unscaled, scale)
+                     format_decimal_with_i32_scale(unscaled, scale)
                  }
              };

ext/parquet/src/types/type_conversion.rs CHANGED
@@ -2,8 +2,8 @@ use std::str::FromStr;
  use std::sync::Arc;

  use super::*;
- use arrow_array::builder::*;
  use arrow_array::builder::MapFieldNames;
+ use arrow_array::builder::*;
  use arrow_schema::{DataType, Field, Fields, TimeUnit};
  use jiff::tz::{Offset, TimeZone};
  use magnus::{RArray, RString, TryConvert};
@@ -368,15 +368,17 @@ fn create_arrow_builder_for_type(
      ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)) => {
          // Create a Decimal128Builder with specific precision and scale
          let builder = Decimal128Builder::with_capacity(cap);
-
+
          // Set precision and scale for the decimal and return the new builder
-         let builder_with_precision = builder.with_precision_and_scale(*precision, *scale).map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::runtime_error(),
-                 format!("Failed to set precision and scale: {}", e),
-             )
-         })?;
-
+         let builder_with_precision = builder
+             .with_precision_and_scale(*precision, *scale)
+             .map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::runtime_error(),
+                     format!("Failed to set precision and scale: {}", e),
+                 )
+             })?;
+
          Ok(Box::new(builder_with_precision))
      }
      ParquetSchemaType::Primitive(PrimitiveType::String) => {
@@ -857,7 +859,7 @@ fn fill_builder(

      for val in values {
          match val {
-             ParquetValue::Decimal128(d) => typed_builder.append_value(*d),
+             ParquetValue::Decimal128(d, _scale) => typed_builder.append_value(*d),
              ParquetValue::Float64(f) => {
                  // Scale the float to the desired precision and scale
                  let scaled_value = (*f * 10_f64.powi(*scale as i32)) as i128;
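
A worked instance of that float-to-decimal scaling (values chosen for illustration; note that the plain `as i128` cast truncates toward zero rather than rounding):

    fn main() {
        let f = 1.5_f64;
        let scale = 2_i8;
        // 1.5 * 10^2 = 150.0 -> stored as Decimal128(150, 2), i.e. 1.50
        let scaled_value = (f * 10_f64.powi(scale as i32)) as i128;
        assert_eq!(scaled_value, 150);
    }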
@@ -1161,7 +1163,7 @@ fn fill_builder(
                      )
                  })?
                  .append_value(bytes),
-             ParquetValue::Decimal128(x) => typed_builder
+             ParquetValue::Decimal128(x, _scale) => typed_builder
                  .field_builder::<Decimal128Builder>(i)
                  .ok_or_else(|| {
                      MagnusError::new(
lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.5.4"
+   VERSION = "0.5.5"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
-   version: 0.5.4
+   version: 0.5.5
  platform: ruby
  authors:
  - Nathan Jaremko
@@ -66,6 +66,7 @@ files:
  - ext/parquet/src/reader/mod.rs
  - ext/parquet/src/reader/parquet_column_reader.rs
  - ext/parquet/src/reader/parquet_row_reader.rs
+ - ext/parquet/src/reader/unified/mod.rs
  - ext/parquet/src/ruby_reader.rs
  - ext/parquet/src/types/core_types.rs
  - ext/parquet/src/types/mod.rs