parquet 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dd9295880f123b0ed979fb970d789bf63461007e8c087641b47391baa02bef29
-  data.tar.gz: e5ccd26a48e7b800412049e9d44e7f8c6fb5c12dbe25c74980467c822fb114c6
+  metadata.gz: dfd19103b2414e7feeaa6d1ec3c9a9c25ce42cf5c8362baa37e3b9d8d5245f82
+  data.tar.gz: c5c1170dbdc3635577738a568688c36adc9670710f4b0d570fae29294e337754
 SHA512:
-  metadata.gz: f275d82733cd6b1658ed7b6212618830e2a12e6831bcfcc26beeae79691ddc972b22b0b0660f10db9b0bb7a03e2af29da15de28d037d5896891fbd678b3ecca9
-  data.tar.gz: 8fbfd33a92350c84409c0e879745fbc5a07d60f0a2862a2993a4780eb5c110be7b3918bfdfe6721e60e44635792108a1a3eeed460a189076bd03056a27764e50
+  metadata.gz: c9bf72b4e708c750ab7ae30afd97aef7f456a4249904fe3eb74f916557e28ca1a53bc262a6492db38c38162ab3e3f684e30f0c70dabbaf8f8f4145ef4d9af259
+  data.tar.gz: 164c5b0569d3d13242bcff7c09d66edf67b279d8289f97def043000a508b4333dd1387a4f47517be09023cd270cf3b6dfd57fdf658ac1b52e25f3f5b2b5ca30c
data/Cargo.lock CHANGED
@@ -225,6 +225,10 @@ dependencies = [
 name = "arrow-schema"
 version = "55.2.0"
 source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#54858bf019ff3faeb8f5b562da8c01012162aef0"
+dependencies = [
+ "serde",
+ "serde_json",
+]
 
 [[package]]
 name = "arrow-select"
@@ -1092,6 +1096,7 @@ dependencies = [
  "rb-sys-env 0.2.2",
  "tempfile",
  "thiserror",
+ "uuid",
 ]
 
 [[package]]
@@ -7,7 +7,7 @@ edition = "2021"
 arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
+arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["canonical_extension_types"]}
 bytes = "1.5"
 indexmap = "2.2"
 jiff = "0.2"
@@ -17,7 +17,7 @@ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24
 rand = "0.9.1"
 serde = { version = "1.0", features = ["derive"] }
 thiserror = "2.0"
+uuid = { version = "1.0", features = ["v4"] }
 
 [dev-dependencies]
-uuid = { version = "1.0", features = ["v4"] }
 tempfile = "3.8"
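
Note: the new `canonical_extension_types` feature on arrow-schema is what makes `arrow_schema::extension::Uuid` available to the code changes below. A minimal sketch of tagging a field with it (not part of the diff; the field name is illustrative):

    use arrow_schema::{extension::Uuid, DataType, Field};

    // A UUID column is physically a 16-byte fixed-size binary; the
    // extension type records "arrow.uuid" in the field metadata.
    fn uuid_field(name: &str, nullable: bool) -> Field {
        Field::new(name, DataType::FixedSizeBinary(16), nullable).with_extension_type(Uuid)
    }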
@@ -7,6 +7,7 @@
 
 use crate::{ParquetError, ParquetValue, Result};
 use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
+use arrow_schema::extension::Uuid as ArrowUuid;
 use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
@@ -14,7 +15,11 @@ use ordered_float::OrderedFloat;
 use std::sync::Arc;
 
 /// Convert a single value from an Arrow array at the given index to a ParquetValue
-pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
+pub fn arrow_to_parquet_value(
+    field: &Field,
+    array: &dyn Array,
+    index: usize,
+) -> Result<ParquetValue> {
     use arrow_array::*;
 
     if array.is_null(index) {
@@ -72,7 +77,6 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Float64Array>(array)?;
             Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
         }
-
         // String and binary types
         DataType::Utf8 => {
             let array = downcast_array::<StringArray>(array)?;
@@ -86,9 +90,15 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
-            Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
-                array.value(index),
-            )))
+            let value = array.value(index);
+            match field.try_extension_type::<ArrowUuid>() {
+                Ok(_) => {
+                    let uuid = uuid::Uuid::from_slice(value)
+                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                    Ok(ParquetValue::Uuid(uuid))
+                }
+                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
+            }
         }
 
         // Date and time types
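
Note: the UUID branch above fires only for fields whose metadata carries the canonical extension type; an untagged FixedSizeBinary(16) still round-trips as plain bytes. A sketch of that dispatch (not part of the diff):

    use arrow_schema::{extension::Uuid as ArrowUuid, Field};

    // try_extension_type::<ArrowUuid>() succeeds only when the field
    // metadata names the "arrow.uuid" extension type.
    fn is_uuid_field(field: &Field) -> bool {
        field.try_extension_type::<ArrowUuid>().is_ok()
    }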
@@ -140,6 +150,10 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
                 let array = downcast_array::<Time64MicrosecondArray>(array)?;
                 Ok(ParquetValue::TimeMicros(array.value(index)))
             }
+            arrow_schema::TimeUnit::Nanosecond => {
+                let array = downcast_array::<Time64NanosecondArray>(array)?;
+                Ok(ParquetValue::TimeNanos(array.value(index)))
+            }
             _ => Err(ParquetError::Conversion(format!(
                 "Unsupported time64 unit: {:?}",
                 unit
@@ -173,13 +187,13 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
 
         // Complex types
-        DataType::List(_) => {
+        DataType::List(item_field) => {
             let array = downcast_array::<ListArray>(array)?;
             let list_values = array.value(index);
 
             let mut values = Vec::with_capacity(list_values.len());
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(&list_values, i)?);
+                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
             }
 
             Ok(ParquetValue::List(values))
@@ -192,10 +206,20 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let keys = map_value.column(0);
             let values = map_value.column(1);
 
+            let key_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "key")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
+            let value_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "value")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
             let mut map_vec = Vec::with_capacity(keys.len());
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(keys, i)?;
-                let value = arrow_to_parquet_value(values, i)?;
+                let key = arrow_to_parquet_value(key_field, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, values, i)?;
                 map_vec.push((key, value));
             }
 
@@ -207,7 +231,7 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let mut map = IndexMap::new();
             for (col_idx, field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(column, index)?;
+                let value = arrow_to_parquet_value(field, column, index)?;
                 map.insert(Arc::from(field.name().as_str()), value);
             }
 
@@ -1108,7 +1132,7 @@ mod tests {
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
 
         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
            assert_eq!(&actual, expected);
         }
     }
@@ -163,8 +163,10 @@ where
         // Extract values from current row
         let mut row_values = Vec::with_capacity(batch.num_columns());
 
-        for column in batch.columns() {
-            let value = match arrow_to_parquet_value(column, self.current_row) {
+        let schema = batch.schema();
+        for (i, column) in batch.columns().iter().enumerate() {
+            let field = schema.field(i);
+            let value = match arrow_to_parquet_value(field, column, self.current_row) {
                 Ok(v) => v,
                 Err(e) => return Some(Err(e)),
             };
@@ -228,12 +230,13 @@ where
         let mut columns = Vec::with_capacity(batch.num_columns());
 
         for (idx, column) in batch.columns().iter().enumerate() {
-            let column_name = self.schema.field(idx).name().to_string();
+            let field = self.schema.field(idx);
+            let column_name = field.name().to_string();
 
             // Convert entire column to ParquetValues
             let mut values = Vec::with_capacity(column.len());
             for row_idx in 0..column.len() {
-                match arrow_to_parquet_value(column, row_idx) {
+                match arrow_to_parquet_value(field, column, row_idx) {
                     Ok(value) => values.push(value),
                     Err(e) => return Some(Err(e)),
                 }
@@ -72,6 +72,7 @@ pub enum PrimitiveType {
     TimestampNanos(Option<Arc<str>>),
     TimeMillis,
     TimeMicros,
+    TimeNanos,
 
     // Fixed-length byte array
     FixedLenByteArray(i32),
@@ -146,6 +147,7 @@ impl PrimitiveType {
             PrimitiveType::TimestampNanos(_) => "TimestampNanos",
             PrimitiveType::TimeMillis => "TimeMillis",
             PrimitiveType::TimeMicros => "TimeMicros",
+            PrimitiveType::TimeNanos => "TimeNanos",
             PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
         }
     }
@@ -2,6 +2,7 @@ use bytes::Bytes;
 use indexmap::IndexMap;
 use num::BigInt;
 use std::sync::Arc;
+use uuid::Uuid;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParquetValue {
@@ -22,6 +23,7 @@ pub enum ParquetValue {
     Boolean(bool),
     String(Arc<str>),
     Bytes(Bytes),
+    Uuid(Uuid),
 
     // Date/Time types
     Date32(i32), // Days since epoch
@@ -40,6 +42,7 @@ pub enum ParquetValue {
     // Time types
     TimeMillis(i32), // Time of day in milliseconds since midnight
     TimeMicros(i64), // Time of day in microseconds since midnight
+    TimeNanos(i64), // Time of day in nanoseconds since midnight
 
     // Complex types
     List(Vec<ParquetValue>),
@@ -68,6 +71,7 @@ impl std::hash::Hash for ParquetValue {
             ParquetValue::Boolean(b) => b.hash(state),
             ParquetValue::String(s) => s.hash(state),
             ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Uuid(u) => u.hash(state),
             ParquetValue::Date32(d) => d.hash(state),
             ParquetValue::Date64(d) => d.hash(state),
             ParquetValue::Decimal128(d, scale) => {
@@ -96,6 +100,7 @@ impl std::hash::Hash for ParquetValue {
             }
             ParquetValue::TimeMillis(t) => t.hash(state),
             ParquetValue::TimeMicros(t) => t.hash(state),
+            ParquetValue::TimeNanos(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => m.hash(state),
             ParquetValue::Record(r) => {
@@ -133,6 +138,7 @@ impl ParquetValue {
             ParquetValue::Boolean(_) => "Boolean",
             ParquetValue::String(_) => "String",
             ParquetValue::Bytes(_) => "Bytes",
+            ParquetValue::Uuid(_) => "Uuid",
             ParquetValue::Date32(_) => "Date32",
             ParquetValue::Date64(_) => "Date64",
             ParquetValue::Decimal128(_, _) => "Decimal128",
@@ -143,6 +149,7 @@ impl ParquetValue {
             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
             ParquetValue::TimeMillis(_) => "TimeMillis",
             ParquetValue::TimeMicros(_) => "TimeMicros",
+            ParquetValue::TimeNanos(_) => "TimeNanos",
             ParquetValue::List(_) => "List",
             ParquetValue::Map(_) => "Map",
             ParquetValue::Record(_) => "Record",
@@ -235,6 +235,7 @@ where
         (Date64(_), DataType::Date64) => 8,
         (TimeMillis(_), DataType::Time32(_)) => 4,
         (TimeMicros(_), DataType::Time64(_)) => 8,
+        (TimeNanos(_), DataType::Time64(_)) => 8,
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
@@ -364,7 +365,9 @@ where
             writer.write(&batch)?;
 
             // Check if we need to flush based on memory usage
-            if writer.in_progress_size() >= self.memory_threshold {
+            if writer.in_progress_size() >= self.memory_threshold
+                || writer.memory_size() >= self.memory_threshold
+            {
                 writer.flush()?;
             }
         } else {
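
Note: in arrow-rs, the ArrowWriter's `in_progress_size` estimates the encoded size of the current row group, while `memory_size` estimates the total memory held by the writer, which can be much larger when data encodes compactly; checking both bounds peak memory. A sketch of the combined policy (not part of the diff):

    // Flush when either the encoded row group or the writer's retained
    // memory crosses the configured threshold.
    fn should_flush(in_progress_size: usize, memory_size: usize, threshold: usize) -> bool {
        in_progress_size >= threshold || memory_size >= threshold
    }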
@@ -496,6 +499,7 @@ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str)
         (Date64(_), DataType::Date64) => Ok(()),
         (TimeMillis(_), DataType::Time32(_)) => Ok(()),
         (TimeMicros(_), DataType::Time64(_)) => Ok(()),
+        (TimeNanos(_), DataType::Time64(_)) => Ok(()),
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
@@ -591,10 +595,16 @@ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
             name,
             primitive_type,
             nullable,
-            ..
+            format,
         } => {
             let data_type = primitive_type_to_arrow(primitive_type)?;
-            Ok(Field::new(name, data_type, *nullable))
+            let field = Field::new(name, data_type, *nullable);
+            let extended_field = if format.as_deref() == Some("uuid") {
+                field.with_extension_type(arrow_schema::extension::Uuid)
+            } else {
+                field
+            };
+            Ok(extended_field)
         }
         SchemaNode::List {
             name,
@@ -671,6 +681,7 @@ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
         Date32 => DataType::Date32,
         TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
         TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
+        TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
         TimestampMillis(tz) => DataType::Timestamp(
             arrow_schema::TimeUnit::Millisecond,
             // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
@@ -99,7 +99,7 @@ fn test_decimal256_large_values() {
 
     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +173,7 @@ fn test_timestamp_with_timezone() {
 
     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +209,7 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);
 
     // Verify roundtrip
-    let value = arrow_to_parquet_value(array.as_ref(), 0).unwrap();
+    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +357,7 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();
 
-    let result = arrow_to_parquet_value(&array, 0);
+    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
@@ -20,3 +20,4 @@ rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
 tempfile = "^3.15"
 thiserror = "2.0"
 indexmap = "2.2"
+uuid = "*"
@@ -41,27 +41,6 @@ impl RubyValueConverter {
             .map(|cache| cache.stats())
     }
 
-    /// Convert a Ruby value to ParquetValue with type hint
-    /// This is the primary conversion method that handles all Ruby types
-    pub fn to_parquet_with_type_hint(
-        &mut self,
-        value: Value,
-        type_hint: Option<&parquet_core::PrimitiveType>,
-    ) -> Result<ParquetValue> {
-        // Handle nil values
-        if value.is_nil() {
-            return Ok(ParquetValue::Null);
-        }
-
-        // If we have a type hint, use it to guide conversion
-        if let Some(hint) = type_hint {
-            return self.convert_with_type_hint(value, hint);
-        }
-
-        // Otherwise, infer type from Ruby value
-        self.infer_and_convert(value)
-    }
-
     /// Convert a Ruby value to ParquetValue with schema hint
     /// This handles both primitive and complex types
     pub fn to_parquet_with_schema_hint(
@@ -115,7 +94,7 @@ impl RubyValueConverter {
         use parquet_core::PrimitiveType::*;
 
         // Special handling for UUID format
-        if let (Binary, Some("uuid")) = (type_hint, format) {
+        if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
             return self.convert_to_uuid_binary(value);
         }
 
@@ -156,6 +135,7 @@ impl RubyValueConverter {
             Date64 => self.convert_to_date64(value, None),
             TimeMillis => self.convert_to_time_millis(value),
             TimeMicros => self.convert_to_time_micros(value),
+            TimeNanos => self.convert_to_time_nanos(value),
             TimestampSecond(schema_tz) => {
                 self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
             }
@@ -484,32 +464,19 @@ impl RubyValueConverter {
 
         // Convert value to string
         let uuid_str: String = value
-            .funcall("to_s", ())
-            .and_then(TryConvert::try_convert)
+            .to_r_string()
+            .map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
+            })?
+            .to_string()
             .map_err(|e: MagnusError| {
                 ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
             })?;
 
-        // Remove hyphens and validate length
-        let clean_uuid = uuid_str.replace('-', "");
-        if clean_uuid.len() != 32 {
-            return Err(ParquetError::Conversion(format!(
-                "Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
-                clean_uuid.len()
-            )));
-        }
-
-        // Parse hex string to bytes
-        let mut bytes = Vec::with_capacity(16);
-        for i in 0..16 {
-            let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
-            let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
-                ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
-            })?;
-            bytes.push(byte);
-        }
-
-        Ok(ParquetValue::Bytes(bytes.into()))
+        let parsed = uuid::Uuid::parse_str(&uuid_str)
+            .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
+        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
+        Ok(ParquetValue::Bytes(bytes))
     }
 
     fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
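
Note: `uuid::Uuid::parse_str` accepts both the hyphenated canonical form and the plain 32-hex-digit form, which is what allows the hand-rolled hyphen stripping and hex loop above to be dropped. A sketch (not part of the diff; the helper name is illustrative):

    // "550e8400-e29b-41d4-a716-446655440000" and
    // "550e8400e29b41d4a716446655440000" parse to the same 16 bytes.
    fn to_uuid_bytes(s: &str) -> Result<[u8; 16], uuid::Error> {
        uuid::Uuid::parse_str(s).map(|u| *u.as_bytes())
    }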
@@ -692,6 +659,38 @@ impl RubyValueConverter {
         )))
     }
 
+    fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
+        // Convert to microseconds since midnight
+        let ruby = Ruby::get()
+            .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
+        if value.is_kind_of(ruby.class_time()) {
+            let hour: i64 = value
+                .funcall("hour", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let min: i64 = value
+                .funcall("min", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let sec: i64 = value
+                .funcall("sec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let nsec: i64 = value
+                .funcall("nsec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+
+            let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
+            return Ok(ParquetValue::TimeNanos(nanos));
+        }
+
+        Err(ParquetError::Conversion(format!(
+            "Cannot convert {} to time_micros",
+            value.class()
+        )))
+    }
+
     // Timestamp conversion methods that respect schema timezone
     fn convert_to_timestamp_second_with_tz(
         &self,
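
Note: the new variant stores time-of-day as nanoseconds since midnight, mirroring the existing micros path. A worked example of the arithmetic used above (not part of the diff):

    // For 13:45:30.123456789:
    // (13 * 3600 + 45 * 60 + 30) * 1_000_000_000 + 123_456_789
    //   = 49_530 * 1_000_000_000 + 123_456_789
    //   = 49_530_123_456_789 nanoseconds since midnight
    fn nanos_since_midnight(hour: i64, min: i64, sec: i64, nsec: i64) -> i64 {
        (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec
    }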
@@ -1399,21 +1398,8 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
         ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
         ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
         ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
-        ParquetValue::Bytes(b) => {
-            // Check if this is a UUID (16 bytes)
-            if b.len() == 16 {
-                // Format as UUID string
-                let uuid_str = format!(
-                    "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
-                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
-                    b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
-                );
-                Ok(uuid_str.into_value_with(&ruby))
-            } else {
-                // Regular bytes - convert to string
-                Ok(ruby.str_from_slice(&b).as_value())
-            }
-        }
+        ParquetValue::Uuid(u) => Ok(u.to_string().into_value_with(&ruby)),
+        ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
         ParquetValue::Date32(days) => {
             // Convert days since epoch to Date object
             let _ = ruby.require("date");
@@ -1528,6 +1514,14 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
+        ParquetValue::TimeNanos(nanos) => {
+            let time_class = ruby.class_time();
+            let secs = nanos / 1_000_000_000;
+            let nsec = nanos % 1_000_000_000;
+            time_class
+                .funcall("at", (secs, nsec, Symbol::new("nanosecond")))
+                .map_err(|e| ParquetError::Conversion(e.to_string()))
+        }
         ParquetValue::TimestampNanos(nanos, tz) => {
             let time_class = ruby.class_time();
             let secs = nanos / 1_000_000_000;
@@ -2,6 +2,7 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
 use parquet_core::reader::Reader;
 
+use crate::StringCache;
 use crate::{
     converter::parquet_to_ruby,
     io::{RubyIOReader, ThreadSafeRubyIOReader},
@@ -101,6 +102,12 @@ pub fn each_row(
     })?;
     let mut row_count = 0u64;
 
+    let mut cache = StringCache::new(true);
+    let interned_column_names = column_names
+        .iter()
+        .map(|name| cache.intern(name.clone()))
+        .collect::<Vec<_>>();
+
     for row_result in row_iter {
         let row = row_result
             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
@@ -120,11 +127,11 @@ pub fn each_row(
             ParserResultType::Hash => {
                 let hash: RHash = ruby.hash_new();
                 for (idx, value) in row.into_iter().enumerate() {
-                    if idx < column_names.len() {
+                    if idx < interned_column_names.len() {
                         let ruby_value = parquet_to_ruby(value).map_err(|e| {
                             MagnusError::new(ruby.exception_runtime_error(), e.to_string())
                         })?;
-                        hash.aset(column_names[idx].as_str(), ruby_value)?;
+                        hash.aset(interned_column_names[idx].as_ref(), ruby_value)?;
                     }
                 }
                 hash.as_value()
@@ -1,8 +1,9 @@
 use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
-use parquet_core::{ParquetError, PrimitiveType, Result, Schema, SchemaNode};
+use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
 
 use crate::utils::parse_string_or_symbol;
+use crate::RubyAdapterError;
 
 /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
 pub struct RubySchemaBuilder;
@@ -13,18 +14,18 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a Ruby schema definition (hash) into a SchemaNode
-    fn parse_schema_node(&self, name: String, schema_def: Value) -> Result<SchemaNode> {
+    fn parse_schema_node(
+        &self,
+        name: String,
+        schema_def: Value,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // If it's a Hash, parse it as a complex type
         if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
             return self.parse_hash_schema_node(name, hash);
         }
 
         // Otherwise, try to parse as a simple type symbol
-        if let Ok(type_sym) = <Symbol as TryConvert>::try_convert(schema_def) {
-            let type_str = type_sym.name().map_err(|e: MagnusError| {
-                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
-            })?;
-
+        if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
             // Check if it's a complex type with angle brackets
             if type_str.contains('<') {
                 return self.parse_complex_type_string(name, type_str.to_string(), true);
@@ -40,22 +41,24 @@ impl RubySchemaBuilder {
             });
         }
 
-        Err(ParquetError::Schema(format!(
+        Err(RubyAdapterError::InvalidInput(format!(
             "Expected Hash or Symbol for schema definition, got {}",
             schema_def.class()
         )))
     }
 
     /// Parse a Ruby hash schema node
-    fn parse_hash_schema_node(&self, name: String, hash: RHash) -> Result<SchemaNode> {
+    fn parse_hash_schema_node(
+        &self,
+        name: String,
+        hash: RHash,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // Get the type field
-        let type_sym: Symbol = hash
-            .fetch::<_, Symbol>(Symbol::new("type"))
+        let type_sym: Value = hash
+            .fetch::<_, Value>(Symbol::new("type"))
             .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
 
-        let type_str = type_sym.name().map_err(|e: MagnusError| {
-            ParquetError::Conversion(format!("Failed to get type name: {}", e))
-        })?;
+        let type_str = type_sym.to_r_string()?.to_string()?;
 
         // Get nullable field (default to true)
         let nullable = hash
@@ -142,6 +145,15 @@ impl RubySchemaBuilder {
 
             // Primitive types
             primitive_type => {
+                if format.as_deref() == Some("uuid") {
+                    return Ok(SchemaNode::Primitive {
+                        name,
+                        primitive_type: PrimitiveType::FixedLenByteArray(16),
+                        nullable,
+                        format,
+                    });
+                }
+
                 // Get precision and scale for decimal types
                 let precision = hash
                     .fetch::<_, Value>(Symbol::new("precision"))
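
Note: the effect of the new branch above is that any primitive declared with format "uuid" is normalized to a 16-byte fixed-length binary before the usual type parsing runs. A sketch of the rule (not part of the diff; the helper is illustrative):

    use parquet_core::PrimitiveType;

    fn effective_type(declared: PrimitiveType, format: Option<&str>) -> PrimitiveType {
        match format {
            Some("uuid") => PrimitiveType::FixedLenByteArray(16),
            _ => declared,
        }
    }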
@@ -196,7 +208,7 @@ impl RubySchemaBuilder {
         name: String,
         type_str: String,
         nullable: bool,
-    ) -> Result<SchemaNode> {
+    ) -> Result<SchemaNode, RubyAdapterError> {
         if type_str.starts_with("list<") && type_str.ends_with('>') {
             let inner_type = &type_str[5..type_str.len() - 1];
             let item_name = format!("{}_item", name);
@@ -229,7 +241,7 @@ impl RubySchemaBuilder {
             let inner = &type_str[4..type_str.len() - 1];
             let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
             if parts.len() != 2 {
-                return Err(ParquetError::Schema(format!(
+                return Err(RubyAdapterError::InvalidInput(format!(
                     "Invalid map type: {}",
                     type_str
                 )));
@@ -255,7 +267,7 @@ impl RubySchemaBuilder {
                 }),
             })
         } else {
-            Err(ParquetError::Schema(format!(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown complex type: {}",
                 type_str
             )))
@@ -263,7 +275,7 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a field definition from a Ruby hash
-    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
+    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
         let name: String = field_hash
             .fetch(Symbol::new("name"))
             .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
@@ -274,7 +286,7 @@ impl RubySchemaBuilder {
             self.parse_schema_node(name, field_hash.as_value())
         } else {
             // This might be a simplified definition - look for known field patterns
-            Err(ParquetError::Schema(format!(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Field '{}' missing 'type' definition",
                 name
             )))
@@ -288,7 +300,7 @@ impl RubySchemaBuilder {
         precision: Option<u8>,
         scale: Option<i8>,
         timezone: Option<String>,
-    ) -> Result<PrimitiveType> {
+    ) -> Result<PrimitiveType, RubyAdapterError> {
         // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
         if type_str.starts_with("decimal(") && type_str.ends_with(')') {
             let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
@@ -324,6 +336,14 @@ impl RubySchemaBuilder {
             }
         }
 
+        if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
+            let params = &type_str[20..type_str.len() - 1];
+            let len = params.parse::<i32>().map_err(|_| {
+                ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
+            })?;
+            return Ok(PrimitiveType::FixedLenByteArray(len));
+        }
+
         match type_str.as_str() {
             "boolean" | "bool" => Ok(PrimitiveType::Boolean),
             "int8" => Ok(PrimitiveType::Int8),
@@ -356,8 +376,9 @@ impl RubySchemaBuilder {
                 // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
                 Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
             }
-            "time32" | "time_millis" => Ok(PrimitiveType::TimeMillis),
-            "time64" | "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_millis" => Ok(PrimitiveType::TimeMillis),
+            "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_nanos" => Ok(PrimitiveType::TimeNanos),
             "decimal" => {
                 // Use provided precision/scale or defaults
                 let p = precision.unwrap_or(38);
@@ -380,7 +401,7 @@ impl RubySchemaBuilder {
                 let s = scale.unwrap_or(0);
                 Ok(PrimitiveType::Decimal256(p, s))
             }
-            _ => Err(ParquetError::Schema(format!(
+            _ => Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown primitive type: {}",
                 type_str
             ))),
@@ -396,7 +417,7 @@ impl Default for RubySchemaBuilder {
 
 /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
 /// and Ruby Value is not Send/Sync
-pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
     let builder = RubySchemaBuilder::new();
 
     // The Ruby schema should be a hash with a root struct
@@ -430,7 +451,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     let mut unique_names = std::collections::HashSet::new();
     for name in &field_names {
         if !unique_names.insert(name) {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Duplicate field names in root level schema: {:?}",
                 field_names
             )));
@@ -443,7 +464,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
             fields: field_nodes,
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema must have 'type' or 'fields' key".to_string(),
         ));
     };
@@ -452,18 +473,18 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     parquet_core::SchemaBuilder::new()
         .with_root(root_node)
         .build()
-        .map_err(|e| ParquetError::Schema(e.to_string()))
+        .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
 }
 
 /// Convert a Parquet schema back to Ruby representation
-pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
     let ruby = Ruby::get()
         .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
 
     schema_node_to_ruby(&schema.root, &ruby)
 }
 
-fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value, RubyAdapterError> {
     let hash = RHash::new();
 
     match node {
@@ -552,6 +573,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
             PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
             PrimitiveType::TimeMillis => Symbol::new("time_millis"),
             PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+            PrimitiveType::TimeNanos => Symbol::new("time_nanos"),
             PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
             PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
             PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
@@ -597,7 +619,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
 /// Convert old schema format to new format
 /// Old: [{ "column_name" => "type" }, ...]
 /// New: [{ name: "column_name", type: :type }, ...]
-pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
+pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
     let new_schema = RArray::new();
 
     for item in schema.into_iter() {
@@ -630,7 +652,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
         );
 
         if let Err(e) = process_result {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Failed to process field: {}",
                 e
             )));
@@ -645,7 +667,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
 }
 
 /// Check if schema is in new DSL format (hash with type: :struct)
-pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
     if !schema_value.is_kind_of(ruby.class_hash()) {
         return Ok(false);
     }
@@ -678,7 +700,7 @@ pub fn process_schema_value(
     ruby: &Ruby,
     schema_value: Value,
     data_array: Option<&RArray>,
-) -> Result<Value> {
+) -> Result<Value, RubyAdapterError> {
     // Check if it's the new DSL format
     if is_dsl_schema(ruby, schema_value)? {
         // For DSL format, pass it directly to ruby_schema_to_parquet
@@ -716,7 +738,7 @@ pub fn process_schema_value(
             convert_legacy_schema(ruby, array)?
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema array must contain hashes".to_string(),
         ));
    }
@@ -733,13 +755,13 @@ pub fn process_schema_value(
                 ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
             })?
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "schema hash must have 'fields' key or be in DSL format with 'type' key"
                     .to_string(),
             ));
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema must be nil, an array, or a hash".to_string(),
         ));
     };
@@ -748,7 +770,7 @@ pub fn process_schema_value(
     if schema_array.is_empty() {
         if let Some(data) = data_array {
             if data.is_empty() {
-                return Err(ParquetError::Schema(
+                return Err(RubyAdapterError::InvalidInput(
                     "Cannot infer schema from empty data".to_string(),
                 ));
             }
@@ -767,7 +789,7 @@ pub fn process_schema_value(
             })?;
             first_array.len()
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "First data item must be an array".to_string(),
             ));
         };
@@ -793,7 +815,7 @@ pub fn process_schema_value(
 
         schema_array = new_schema;
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema is required when data is not provided for inference".to_string(),
         ));
     }
@@ -1,15 +1,15 @@
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, LazyLock, Mutex};
 
 use magnus::RString;
 
+static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
 /// A cache for interning strings in the Ruby VM to reduce memory usage
 /// when there are many repeated strings
 #[derive(Debug)]
 pub struct StringCache {
-    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
-    /// while maintaining a single global cache
-    cache: Arc<Mutex<HashMap<String, &'static str>>>,
     enabled: bool,
     hits: Arc<Mutex<usize>>,
     misses: Arc<Mutex<usize>>,
@@ -19,7 +19,6 @@ impl StringCache {
     /// Create a new string cache
     pub fn new(enabled: bool) -> Self {
         Self {
-            cache: Arc::new(Mutex::new(HashMap::new())),
             enabled,
             hits: Arc::new(Mutex::new(0)),
             misses: Arc::new(Mutex::new(0)),
@@ -36,9 +35,9 @@ impl StringCache {
 
         // Try to get or create the interned string
         let result = (|| -> Result<(), String> {
-            let mut cache = self.cache.lock().map_err(|e| e.to_string())?;
+            let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
 
-            if cache.contains_key(&s) {
+            if cache.contains_key(s.as_str()) {
                 let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                 *hits += 1;
             } else {
@@ -65,7 +64,7 @@ impl StringCache {
 
     /// Get cache statistics
     pub fn stats(&self) -> CacheStats {
-        let cache_size = self.cache.lock().map(|c| c.len()).unwrap_or(0);
+        let cache_size = STRING_CACHE.lock().map(|c| c.len()).unwrap_or(0);
         let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
         let misses = self.misses.lock().map(|m| *m).unwrap_or(0);
 
@@ -84,7 +83,7 @@ impl StringCache {
 
     /// Clear the cache
     pub fn clear(&mut self) {
-        if let Ok(mut cache) = self.cache.lock() {
+        if let Ok(mut cache) = STRING_CACHE.lock() {
             cache.clear();
         }
         if let Ok(mut hits) = self.hits.lock() {
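
Note: the refactor replaces the per-instance `cache` field with one process-wide static, so every `StringCache` now shares a single intern table. A minimal sketch of the `LazyLock` pattern (not part of the diff; it leaks to obtain `&'static str`, whereas the gem's actual `intern` goes through the Ruby VM):

    use std::collections::HashMap;
    use std::sync::{LazyLock, Mutex};

    static CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
        LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

    fn intern(s: String) -> &'static str {
        let mut cache = CACHE.lock().expect("cache poisoned");
        if let Some(&interned) = cache.get(s.as_str()) {
            return interned;
        }
        // Leaking is bounded here: only distinct strings (e.g. column
        // names) are ever inserted.
        let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
        cache.insert(s, leaked);
        leaked
    }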
@@ -26,6 +26,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::Float64(_) => 8,
         ParquetValue::String(s) => s.len() + 24, // String overhead
         ParquetValue::Bytes(b) => b.len() + 24,  // Vec overhead
+        ParquetValue::Uuid(_) => 16,
         ParquetValue::Date32(_) => 4,
         ParquetValue::Date64(_) => 8,
         ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
@@ -36,6 +37,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
         ParquetValue::TimeMillis(_) => 4,
         ParquetValue::TimeMicros(_) => 8,
+        ParquetValue::TimeNanos(_) => 8,
         ParquetValue::List(items) => {
             24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
         }
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.6.1"
+  VERSION = "0.6.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.6.2
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-03 00:00:00.000000000 Z
+date: 2025-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys