RubyGems - parquet - Versions diffs - 0.5.6 → 0.5.7 - Mend

parquet 0.5.6 → 0.5.7

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

checksums.yaml +4 -4
data/Cargo.lock +15 -8
data/ext/parquet/Cargo.toml +4 -3
data/ext/parquet/src/reader/unified/mod.rs +55 -20
data/ext/parquet/src/types/mod.rs +2 -0
data/ext/parquet/src/types/record_types.rs +160 -9
data/lib/parquet/version.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dc1d1eda7d71aa6336fbf6cc94789517439df3fab1852ec7d2e9d265e0c016c4
-  data.tar.gz: 6fff5321a31d3fe19a59a4f47add56222dbeb274bef7a068163b48757d65252d
+  metadata.gz: e8a79e74af0419282904a0041c09509520f64ce1e504e133237f4b87697dce14
+  data.tar.gz: 63391ffff73907caccc142f37550e85c12826f302f00ac726f826af391f8d8cd
 SHA512:
-  metadata.gz: ddd50f82df2b42cf844e379a7f07c0214e9aef925e7c43ec566b6b9f27be311676b6f887c163aa5d41d4523cd1d506266b15623205453bc8e08467c88e7c2b63
-  data.tar.gz: afb235ad09338d8c4cd59588dded3d312890c5d5d879b77040fcbf960be69653981fe5176cc591969a80ba54214d4c6a63cff96c36ceda7b9e00c75ba8e9e913
+  metadata.gz: cddb7c6711e7e49ea785f6c0ab5ae3c40181756ad0e3fc23f298c291b725b178fdfbe5a8430fd9be10591b09e1b963255cb50637743054fe2173c9798e1e8bcc
+  data.tar.gz: 927a112ff1994800b3ed989f5000ed2a43438cebff886a545d0dd22018731b042b9052ead5b14983faf50fffc593cdd7512dd764bfed4de8ffa7781e6f2fda1a

data/Cargo.lock CHANGED Viewed

@@ -64,7 +64,7 @@ dependencies = [
 [[package]]
 name = "arrow-array"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "ahash",
  "arrow-buffer",
@@ -79,7 +79,7 @@ dependencies = [
 [[package]]
 name = "arrow-buffer"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "bytes",
  "half",
@@ -89,7 +89,7 @@ dependencies = [
 [[package]]
 name = "arrow-cast"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -108,7 +108,7 @@ dependencies = [
 [[package]]
 name = "arrow-data"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "arrow-buffer",
  "arrow-schema",
@@ -119,7 +119,7 @@ dependencies = [
 [[package]]
 name = "arrow-ipc"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "arrow-array",
  "arrow-buffer",
@@ -131,12 +131,12 @@ dependencies = [
 [[package]]
 name = "arrow-schema"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 [[package]]
 name = "arrow-select"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -844,12 +844,13 @@ dependencies = [
  "simdutf8",
  "tempfile",
  "thiserror",
+ "uuid",
 ]
 [[package]]
 name = "parquet"
 version = "55.1.0"
-source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
+source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
 dependencies = [
  "ahash",
  "arrow-array",
@@ -1230,6 +1231,12 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe"
+[[package]]
+name = "uuid"
+version = "1.16.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
 [[package]]
 name = "version_check"
 version = "0.9.5"

data/ext/parquet/Cargo.toml CHANGED Viewed

@@ -11,20 +11,21 @@ rb-sys-env = "^0.2"
 [dependencies]
 ahash = "0.8"
-arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records" }
+arrow-array = { git = "https://github.com/apache/arrow-rs", branch = "main" }
+arrow-schema = { git = "https://github.com/apache/arrow-rs", branch = "main" }
 bytes = "^1.9"
 either = "1.9"
 itertools = "^0.14"
 jiff = "0.2"
 magnus = { version = "0.7", features = ["rb-sys"] }
-parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records", features = ["json"] }
+parquet = { git = "https://github.com/apache/arrow-rs", branch = "main", features = ["json"] }
 rand = "0.9"
 rb-sys = "^0.9"
 simdutf8 = "0.1.5"
 tempfile = "^3.15"
 thiserror = "2.0"
 num = "0.4.3"
+uuid = "1.16.0"
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }

data/ext/parquet/src/reader/unified/mod.rs CHANGED Viewed

@@ -2,8 +2,8 @@ use crate::header_cache::StringCache;
 use crate::logger::RubyLogger;
 use crate::types::TryIntoValue;
 use crate::{
-    create_column_enumerator, create_row_enumerator, ParquetField, ParquetGemError,
-    ParserResultType, ColumnEnumeratorArgs, RowEnumeratorArgs, RowRecord, ColumnRecord, ParquetValueVec,
+    create_column_enumerator, create_row_enumerator, ColumnEnumeratorArgs, ColumnRecord,
+    ParquetField, ParquetGemError, ParquetValueVec, ParserResultType, RowEnumeratorArgs, RowRecord,
 };
 use ahash::RandomState;
 use either::Either;
@@ -13,10 +13,10 @@ use std::collections::HashMap;
 use std::rc::Rc;
 use std::sync::OnceLock;
-use crate::types::ArrayWrapper;
 use super::common::{
     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
+use crate::types::ArrayWrapper;
 /// A unified parser configuration that can be used for both row and column parsing
 pub enum ParserType {
@@ -53,11 +53,11 @@ pub fn parse_parquet_unified(
     } = args;
     // Initialize the logger if provided
-    let ruby_logger = RubyLogger::new(&ruby, logger.clone())?;
+    let ruby_logger = RubyLogger::new(&ruby, logger)?;
     // Clone values for the closure to avoid move issues
     let columns_clone = columns.clone();
     // Determine if we're handling rows or columns for enumerator creation
     match &parser_type {
         ParserType::Row { strict } => {
@@ -75,13 +75,13 @@ pub fn parse_parquet_unified(
             })? {
                 return Ok(enum_value);
             }
-        },
+        }
         ParserType::Column { batch_size, strict } => {
             // For column-based parsing, log the batch size if present
             if let Some(ref bs) = batch_size {
                 ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
             }
             // Handle block or create column enumerator
             if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
                 create_column_enumerator(ColumnEnumeratorArgs {
@@ -102,19 +102,34 @@ pub fn parse_parquet_unified(
     // Open the Parquet source
     let source = open_parquet_source(ruby.clone(), to_read)?;
     // Based on the parser type, handle the data differently
     match parser_type {
         ParserType::Row { strict } => {
             // Handle row-based parsing
-            process_row_data(ruby.clone(), source, &columns, result_type, strict, &ruby_logger)?;
-        },
+            process_row_data(
+                ruby.clone(),
+                source,
+                &columns,
+                result_type,
+                strict,
+                &ruby_logger,
+            )?;
+        }
         ParserType::Column { batch_size, strict } => {
             // Handle column-based parsing
-            process_column_data(ruby.clone(), source, &columns, result_type, batch_size, strict, &ruby_logger)?;
+            process_column_data(
+                ruby.clone(),
+                source,
+                &columns,
+                result_type,
+                batch_size,
+                strict,
+                &ruby_logger,
+            )?;
         }
     }
     Ok(ruby.qnil().into_value_with(&ruby))
 }
@@ -129,7 +144,7 @@ fn process_row_data(
 ) -> Result<(), ParquetGemError> {
     use parquet::file::reader::{FileReader, SerializedFileReader};
     use parquet::record::reader::RowIter as ParquetRowIter;
     // Create the row-based reader
     let reader: Box<dyn FileReader> = match source {
         Either::Left(file) => {
@@ -174,8 +189,19 @@ fn process_row_data(
                     let mut map =
                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                    for (i, (_, v)) in row.get_column_iter().enumerate() {
-                        map.insert(headers[i], ParquetField(v.clone(), strict));
+                    for (i, ((_, v), t)) in
+                        row.get_column_iter().zip(schema.get_fields()).enumerate()
+                    {
+                        let type_info = t.get_basic_info();
+                        map.insert(
+                            headers[i],
+                            ParquetField {
+                                field: v.clone(),
+                                converted_type: type_info.converted_type(),
+                                logical_type: type_info.logical_type().clone(),
+                                strict,
+                            },
+                        );
                     }
                     map
                 })
@@ -193,8 +219,14 @@ fn process_row_data(
                 row.map(|row| {
                     let column_count = row.get_column_iter().count();
                     let mut vec = Vec::with_capacity(column_count);
-                    for (_, v) in row.get_column_iter() {
-                        vec.push(ParquetField(v.clone(), strict));
+                    for ((_, v), t) in row.get_column_iter().zip(schema.get_fields()) {
+                        let type_info = t.get_basic_info();
+                        vec.push(ParquetField {
+                            field: v.clone(),
+                            converted_type: type_info.converted_type(),
+                            logical_type: type_info.logical_type().clone(),
+                            strict,
+                        });
                     }
                     vec
                 })
@@ -309,7 +341,10 @@ fn process_column_data(
 }
 /// Helper function to create a projection schema
-fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[String]) -> parquet::schema::types::Type {
+fn create_projection_schema(
+    schema: &parquet::schema::types::Type,
+    columns: &[String],
+) -> parquet::schema::types::Type {
     if let parquet::schema::types::Type::GroupType { fields, .. } = schema {
         let projected_fields: Vec<std::sync::Arc<parquet::schema::types::Type>> = fields
             .iter()
@@ -325,4 +360,4 @@ fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[St
         // Return original schema if not a group type
         schema.clone()
     }
-}
+}

data/ext/parquet/src/types/mod.rs CHANGED Viewed

@@ -61,6 +61,8 @@ pub enum ParquetGemError {
     Jiff(#[from] jiff::Error),
     #[error("Failed to cast slice to array: {0}")]
     InvalidDecimal(#[from] TryFromSliceError),
+    #[error("Failed to parse UUID: {0}")]
+    UuidError(#[from] uuid::Error),
 }
 #[derive(Debug)]

data/ext/parquet/src/types/record_types.rs CHANGED Viewed

@@ -1,7 +1,10 @@
 use std::sync::OnceLock;
 use itertools::Itertools;
-use parquet::data_type::AsBytes;
+use parquet::{
+    basic::{ConvertedType, LogicalType},
+    data_type::AsBytes,
+};
 use super::*;
@@ -44,7 +47,13 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
 }
 #[derive(Debug)]
-pub struct ParquetField(pub Field, pub bool);
+pub struct ParquetField {
+    pub field: Field,
+    #[allow(dead_code)]
+    pub converted_type: ConvertedType,
+    pub logical_type: Option<LogicalType>,
+    pub strict: bool,
+}
 impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
     fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
@@ -158,7 +167,7 @@ pub trait TryIntoValue {
 impl TryIntoValue for ParquetField {
     fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
-        match self.0 {
+        match self.field {
             Field::Null => Ok(handle.qnil().as_value()),
             Field::Bool(b) => Ok(b.into_value_with(handle)),
             Field::Short(s) => Ok(s.into_value_with(handle)),
@@ -172,7 +181,7 @@ impl TryIntoValue for ParquetField {
             Field::Float(f) => Ok(f.into_value_with(handle)),
             Field::Double(d) => Ok(d.into_value_with(handle)),
             Field::Str(s) => {
-                if self.1 {
+                if self.strict {
                     Ok(simdutf8::basic::from_utf8(s.as_bytes())
                         .map_err(ParquetGemError::Utf8Error)
                         .map(|s| s.into_value_with(handle))?)
@@ -182,7 +191,15 @@ impl TryIntoValue for ParquetField {
                 }
             }
             Field::Byte(b) => Ok(b.into_value_with(handle)),
-            Field::Bytes(b) => Ok(handle.str_from_slice(b.data()).as_value()),
+            Field::Bytes(b) => {
+                if matches!(self.logical_type, Some(parquet::basic::LogicalType::Uuid)) {
+                    let bytes = b.as_bytes();
+                    let uuid = uuid::Uuid::from_slice(bytes)?;
+                    Ok(uuid.to_string().into_value_with(handle))
+                } else {
+                    Ok(handle.str_from_slice(b.data()).as_value())
+                }
+            }
             Field::Date(d) => {
                 let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
                 let formatted = ts.strftime("%Y-%m-%d").to_string();
@@ -206,7 +223,15 @@ impl TryIntoValue for ParquetField {
                 let elements = list.elements();
                 let ary = handle.ary_new_capa(elements.len());
                 elements.iter().try_for_each(|e| {
-                    ary.push(ParquetField(e.clone(), self.1).try_into_value_with(handle)?)?;
+                    ary.push(
+                        ParquetField {
+                            field: e.clone(),
+                            logical_type: e.to_logical_type(),
+                            converted_type: e.to_converted_type(),
+                            strict: self.strict,
+                        }
+                        .try_into_value_with(handle)?,
+                    )?;
                     Ok::<_, ParquetGemError>(())
                 })?;
                 Ok(ary.into_value_with(handle))
@@ -220,8 +245,20 @@ impl TryIntoValue for ParquetField {
                 map.entries().iter().try_for_each(|(k, v)| {
                     hash.aset(
-                        ParquetField(k.clone(), self.1).try_into_value_with(handle)?,
-                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                        ParquetField {
+                            field: k.clone(),
+                            converted_type: k.to_converted_type(),
+                            logical_type: k.to_logical_type(),
+                            strict: self.strict,
+                        }
+                        .try_into_value_with(handle)?,
+                        ParquetField {
+                            field: v.clone(),
+                            converted_type: v.to_converted_type(),
+                            logical_type: v.to_logical_type(),
+                            strict: self.strict,
+                        }
+                        .try_into_value_with(handle)?,
                     )?;
                     Ok::<_, ParquetGemError>(())
                 })?;
@@ -278,7 +315,13 @@ impl TryIntoValue for ParquetField {
                 row.get_column_iter().try_for_each(|(k, v)| {
                     hash.aset(
                         k.clone().into_value_with(handle),
-                        ParquetField(v.clone(), self.1).try_into_value_with(handle)?,
+                        ParquetField {
+                            field: v.clone(),
+                            converted_type: v.to_converted_type(),
+                            logical_type: v.to_logical_type(),
+                            strict: self.strict,
+                        }
+                        .try_into_value_with(handle)?,
                     )?;
                     Ok::<_, ParquetGemError>(())
                 })?;
@@ -287,3 +330,111 @@ impl TryIntoValue for ParquetField {
         }
     }
 }
+trait ToTypeInfo {
+    fn to_converted_type(&self) -> ConvertedType;
+    fn to_logical_type(&self) -> Option<LogicalType>;
+}
+impl ToTypeInfo for &parquet::record::Field {
+    fn to_converted_type(&self) -> ConvertedType {
+        match self {
+            Field::Null => ConvertedType::NONE,
+            Field::Bool(_) => ConvertedType::INT_8,
+            Field::Byte(_) => ConvertedType::INT_8,
+            Field::Short(_) => ConvertedType::INT_16,
+            Field::Int(_) => ConvertedType::INT_32,
+            Field::Long(_) => ConvertedType::INT_64,
+            Field::UByte(_) => ConvertedType::UINT_8,
+            Field::UShort(_) => ConvertedType::UINT_16,
+            Field::UInt(_) => ConvertedType::UINT_32,
+            Field::ULong(_) => ConvertedType::UINT_64,
+            Field::Float16(_) => ConvertedType::NONE,
+            Field::Float(_) => ConvertedType::NONE,
+            Field::Double(_) => ConvertedType::NONE,
+            Field::Decimal(_) => ConvertedType::DECIMAL,
+            Field::Str(_) => ConvertedType::UTF8,
+            Field::Bytes(_) => ConvertedType::LIST,
+            Field::Date(_) => ConvertedType::DATE,
+            Field::TimestampMillis(_) => ConvertedType::TIMESTAMP_MILLIS,
+            Field::TimestampMicros(_) => ConvertedType::TIMESTAMP_MICROS,
+            Field::Group(_) => ConvertedType::NONE,
+            Field::ListInternal(_) => ConvertedType::LIST,
+            Field::MapInternal(_) => ConvertedType::MAP,
+        }
+    }
+    fn to_logical_type(&self) -> Option<LogicalType> {
+        Some(match self {
+            Field::Null => LogicalType::Unknown,
+            Field::Bool(_) => LogicalType::Integer {
+                bit_width: 1,
+                is_signed: false,
+            },
+            Field::Byte(_) => LogicalType::Integer {
+                bit_width: 8,
+                is_signed: false,
+            },
+            Field::Short(_) => LogicalType::Integer {
+                bit_width: 16,
+                is_signed: true,
+            },
+            Field::Int(_) => LogicalType::Integer {
+                bit_width: 32,
+                is_signed: true,
+            },
+            Field::Long(_) => LogicalType::Integer {
+                bit_width: 64,
+                is_signed: true,
+            },
+            Field::UByte(_) => LogicalType::Integer {
+                bit_width: 8,
+                is_signed: false,
+            },
+            Field::UShort(_) => LogicalType::Integer {
+                bit_width: 16,
+                is_signed: false,
+            },
+            Field::UInt(_) => LogicalType::Integer {
+                bit_width: 32,
+                is_signed: false,
+            },
+            Field::ULong(_) => LogicalType::Integer {
+                bit_width: 64,
+                is_signed: false,
+            },
+            Field::Float16(_) => LogicalType::Float16,
+            Field::Float(_) => LogicalType::Decimal {
+                scale: 7,
+                precision: 7,
+            },
+            Field::Double(_) => LogicalType::Decimal {
+                scale: 15,
+                precision: 15,
+            },
+            Field::Decimal(decimal) => LogicalType::Decimal {
+                scale: decimal.scale(),
+                precision: decimal.precision(),
+            },
+            Field::Str(_) => LogicalType::String,
+            Field::Bytes(b) => {
+                if b.data().len() == 16 && uuid::Uuid::from_slice(b.as_bytes()).is_ok() {
+                    LogicalType::Uuid
+                } else {
+                    LogicalType::Unknown
+                }
+            }
+            Field::Date(_) => LogicalType::Date,
+            Field::TimestampMillis(_) => LogicalType::Timestamp {
+                is_adjusted_to_u_t_c: true,
+                unit: parquet::basic::TimeUnit::MILLIS(parquet::format::MilliSeconds {}),
+            },
+            Field::TimestampMicros(_) => LogicalType::Timestamp {
+                is_adjusted_to_u_t_c: true,
+                unit: parquet::basic::TimeUnit::MICROS(parquet::format::MicroSeconds {}),
+            },
+            Field::Group(_) => LogicalType::Unknown,
+            Field::ListInternal(_) => LogicalType::List,
+            Field::MapInternal(_) => LogicalType::Map,
+        })
+    }
+}

data/lib/parquet/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.5.6"
+  VERSION = "0.5.7"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.5.6
+  version: 0.5.7
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-05-15 00:00:00.000000000 Z
+date: 2025-05-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys