RubyGems - parquet - Versions diffs - 0.5.10 → 0.5.12 - Mend

parquet 0.5.10 → 0.5.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19)

checksums.yaml +4 -4
data/README.md +1 -0
data/ext/parquet/src/reader/common.rs +11 -65
data/ext/parquet/src/reader/mod.rs +2 -7
data/ext/parquet/src/reader/unified/mod.rs +14 -82
data/ext/parquet/src/types/core_types.rs +2 -0
data/ext/parquet/src/types/mod.rs +3 -7
data/ext/parquet/src/types/parquet_value.rs +108 -4
data/ext/parquet/src/types/schema_node.rs +8 -0
data/ext/parquet/src/types/timestamp.rs +222 -25
data/ext/parquet/src/types/type_conversion.rs +204 -0
data/ext/parquet/src/types/writer_types.rs +6 -1
data/ext/parquet/src/writer/mod.rs +10 -0
data/ext/parquet/src/writer/write_columns.rs +6 -0
data/ext/parquet/src/writer/write_rows.rs +5 -3
data/lib/parquet/version.rb +1 -1
metadata +2 -4
data/ext/parquet/src/reader/arrow_reader.rs +0 -579
data/ext/parquet/src/reader/format_detector.rs +0 -69

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 114891cfa5fa190e1f00d44803327f1c90cc11f64ba23f7f2a9cc9f9379da787
-  data.tar.gz: 9168b2be960faa93ce9c84d170c6e8f73819535bcedbf3d3b26869ff9829ecc6
+  metadata.gz: 4f2474bf56190257826281d5135739d010ad3a8e51a30eea807d03fc147f7300
+  data.tar.gz: 880084ad0ceb3836195588ce834583359ecf65304e826fd3b025590b960fed37
 SHA512:
-  metadata.gz: f07f99a188ac5fa0663616fba00b1990a2cbd6bb14462383915f0e1617c26c5ca481840c16179958f2b3760b334f176e2e4542d95e3cc922379948ac2b0bfa61
-  data.tar.gz: 42c7b0779d6e3fa46addc5fa92420f326418a54962d391e9b063db8378f8a5f8c2916b43f356649fc127e8fc582aa1e98d7afd71f0bc5f9700a0664ed46313f6
+  metadata.gz: 71348a0d7a46fdb32467a15466201898f8752ec232fe279d30e30631a0d876a639474cfc492c92316c131d7ed057ded4eb9d8a2cbb4b13fb81ebd46de806aa51
+  data.tar.gz: 476b7f307813c3163088557b0b5af62117a9ccb3bb964d5d5e072d2634a9ad11eab3712b4d6585f7007fbbd5f872ebf4d2d5b296de885048a04f6313209dd179

data/README.md CHANGED Viewed

@@ -265,6 +265,7 @@ The following data types are supported in the schema:
 - `boolean`
 - `date32`
 - `timestamp_millis`, `timestamp_micros`
+- `time_millis`, `time_micros`
 ### Schema DSL for Complex Data Types

data/ext/parquet/src/reader/common.rs CHANGED Viewed

@@ -12,81 +12,27 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 use crate::header_cache::StringCache;
-use crate::logger::RubyLogger;
 use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
 use crate::types::{ParquetGemError, TryIntoValue};
 use crate::ColumnRecord;
-use super::format_detector::{detect_file_format, detect_format_from_extension, FileFormat};
-/// Represents the different data sources we can open
-pub enum DataSource {
-    Parquet(Either<File, ThreadSafeRubyReader>),
-    Arrow(Either<File, ThreadSafeRubyReader>),
-}
-/// Opens a data file (Parquet or Arrow) for reading, automatically detecting the format
-pub fn open_data_source(
+/// Opens a parquet file or IO-like object for reading
+///
+/// This function handles both file paths (as strings) and IO-like objects,
+/// returning either a File or a ThreadSafeRubyReader that can be used with
+/// parquet readers.
+pub fn open_parquet_source(
     ruby: Rc<Ruby>,
     to_read: Value,
-    ruby_logger: &RubyLogger,
-) -> Result<DataSource, ParquetGemError> {
+) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
     if to_read.is_kind_of(ruby.class_string()) {
         let path_string = to_read.to_r_string()?;
         let file_path = unsafe { path_string.as_str()? };
-        // Try to detect format from extension first
-        let format_hint = detect_format_from_extension(file_path);
-        let mut file = File::open(file_path).map_err(ParquetGemError::from)?;
-        // Detect actual format from file content
-        let format = detect_file_format(&mut file)?;
-        // Warn if extension doesn't match content
-        if let Some(hint) = format_hint {
-            if hint != format {
-                ruby_logger.warn(|| {
-                    format!(
-                        "Extension implied format {:?} but actual format is {:?}",
-                        hint, format
-                    )
-                })?;
-            }
-        }
-        match format {
-            FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
-            FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
-        }
+        let file = File::open(file_path).map_err(ParquetGemError::from)?;
+        Ok(Either::Left(file))
     } else {
-        // For IO-like objects, we need to use a temporary file
-        use std::io::{Read, Write};
-        use tempfile::NamedTempFile;
-        let mut readable = RubyReader::new(ruby.clone(), to_read)?;
-        let mut temp_file = NamedTempFile::new().map_err(ParquetGemError::from)?;
-        // Copy the entire content to the temporary file
-        let mut buffer = vec![0u8; 8192];
-        loop {
-            let bytes_read = readable.read(&mut buffer)?;
-            if bytes_read == 0 {
-                break;
-            }
-            temp_file.write_all(&buffer[..bytes_read])?;
-        }
-        temp_file.flush()?;
-        // Detect format from the temporary file
-        let mut file = temp_file.reopen()?;
-        let format = detect_file_format(&mut file)?;
-        // Use the temporary file as the source
-        match format {
-            FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
-            FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
-        }
+        let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
+        Ok(Either::Right(readable))
     }
 }

data/ext/parquet/src/reader/mod.rs CHANGED Viewed

@@ -1,6 +1,4 @@
-mod arrow_reader;
 mod common;
-mod format_detector;
 mod parquet_column_reader;
 mod parquet_row_reader;
 mod unified;
@@ -190,10 +188,7 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     if args.len() != 1 {
         return Err(MagnusError::new(
             magnus::exception::arg_error(),
-            format!(
-                "metadata expects exactly 1 argument (file path or IO-like object), got {}",
-                args.len()
-            ),
+            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
         ));
     }
@@ -213,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
     Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
-}
+}

data/ext/parquet/src/reader/unified/mod.rs CHANGED Viewed

@@ -13,11 +13,8 @@ use std::collections::HashMap;
 use std::rc::Rc;
 use std::sync::OnceLock;
-use super::arrow_reader::{
-    process_arrow_column_data, process_arrow_file_column_data, process_arrow_row_data,
-};
 use super::common::{
-    create_batch_reader, handle_block_or_enum, handle_empty_file, open_data_source, DataSource,
+    create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
 use crate::types::ArrayWrapper;
@@ -103,99 +100,34 @@ pub fn parse_parquet_unified(
         }
     }
-    // Open the data source and detect format
-    let source = open_data_source(ruby.clone(), to_read, &ruby_logger)?;
+    // Open the Parquet source
+    let source = open_parquet_source(ruby.clone(), to_read)?;
-    // Based on the source format and parser type, handle the data differently
-    match (source, &parser_type) {
-        (DataSource::Parquet(reader), ParserType::Row { strict }) => {
-            // Handle Parquet row-based parsing
+    // Based on the parser type, handle the data differently
+    match parser_type {
+        ParserType::Row { strict } => {
+            // Handle row-based parsing
             process_row_data(
                 ruby.clone(),
-                reader,
+                source,
                 &columns,
                 result_type,
-                *strict,
+                strict,
                 &ruby_logger,
             )?;
         }
-        (DataSource::Parquet(reader), ParserType::Column { batch_size, strict }) => {
-            // Handle Parquet column-based parsing
+        ParserType::Column { batch_size, strict } => {
+            // Handle column-based parsing
             process_column_data(
                 ruby.clone(),
-                reader,
+                source,
                 &columns,
                 result_type,
-                *batch_size,
-                *strict,
+                batch_size,
+                strict,
                 &ruby_logger,
             )?;
         }
-        (DataSource::Arrow(reader), ParserType::Row { strict }) => {
-            // Handle Arrow row-based parsing
-            match reader {
-                Either::Left(file) => {
-                    // For seekable files, use FileReader which handles IPC file format
-                    use arrow_ipc::reader::FileReader;
-                    let file_reader = FileReader::try_new(file, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    use super::arrow_reader::process_arrow_file_row_data;
-                    process_arrow_file_row_data(
-                        ruby.clone(),
-                        file_reader,
-                        &columns,
-                        result_type,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-                Either::Right(readable) => {
-                    use arrow_ipc::reader::StreamReader;
-                    let stream_reader = StreamReader::try_new(readable, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    process_arrow_row_data(
-                        ruby.clone(),
-                        stream_reader,
-                        &columns,
-                        result_type,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-            }
-        }
-        (DataSource::Arrow(reader), ParserType::Column { batch_size, strict }) => {
-            // Handle Arrow column-based parsing
-            match reader {
-                Either::Left(file) => {
-                    // For seekable files, we can use the optimized FileReader
-                    process_arrow_file_column_data(
-                        ruby.clone(),
-                        file,
-                        &columns,
-                        result_type,
-                        *batch_size,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-                Either::Right(readable) => {
-                    use arrow_ipc::reader::StreamReader;
-                    let stream_reader = StreamReader::try_new(readable, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    process_arrow_column_data(
-                        ruby.clone(),
-                        stream_reader,
-                        &columns,
-                        result_type,
-                        *batch_size,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-            }
-        }
     }
     Ok(ruby.qnil().into_value_with(&ruby))

data/ext/parquet/src/types/core_types.rs CHANGED Viewed

@@ -115,4 +115,6 @@ pub enum PrimitiveType {
     Date32,
     TimestampMillis,
     TimestampMicros,
+    TimeMillis,
+    TimeMicros,
 }

data/ext/parquet/src/types/mod.rs CHANGED Viewed

@@ -25,9 +25,9 @@ use arrow_array::cast::downcast_array;
 use arrow_array::{
     Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
     Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
-    ListArray, NullArray, StringArray, StructArray, TimestampMicrosecondArray,
-    TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
-    UInt32Array, UInt64Array, UInt8Array,
+    ListArray, NullArray, StringArray, StructArray, Time32MillisecondArray, Time64MicrosecondArray,
+    TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
 };
 use arrow_schema::{DataType, TimeUnit};
 use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
@@ -55,10 +55,6 @@ pub enum ParquetGemError {
     Parquet(#[from] parquet::errors::ParquetError),
     #[error("Arrow error: {0}")]
     Arrow(#[from] arrow_schema::ArrowError),
-    #[error("Arrow IPC error: {0}")]
-    ArrowIpc(String),
-    #[error("Unknown file format")]
-    UnknownFormat,
     #[error("UTF-8 error: {0}")]
     Utf8Error(#[from] simdutf8::basic::Utf8Error),
     #[error("Jiff error: {0}")]

data/ext/parquet/src/types/parquet_value.rs CHANGED Viewed

@@ -29,6 +29,8 @@ pub enum ParquetValue {
     TimestampMillis(i64, Option<Arc<str>>),
     TimestampMicros(i64, Option<Arc<str>>),
     TimestampNanos(i64, Option<Arc<str>>),
+    TimeMillis(i32),         // Time of day in milliseconds since midnight
+    TimeMicros(i64),         // Time of day in microseconds since midnight
     List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
     // We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
     Map(HashMap<ParquetValue, ParquetValue>),
@@ -108,6 +110,8 @@ impl PartialEq for ParquetValue {
             (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
             (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
             (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
+            (ParquetValue::TimeMillis(a), ParquetValue::TimeMillis(b)) => a == b,
+            (ParquetValue::TimeMicros(a), ParquetValue::TimeMicros(b)) => a == b,
             (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
             (ParquetValue::Null, ParquetValue::Null) => true,
             _ => false,
@@ -160,6 +164,8 @@ impl std::hash::Hash for ParquetValue {
                 ts.hash(state);
                 tz.hash(state);
             }
+            ParquetValue::TimeMillis(t) => t.hash(state),
+            ParquetValue::TimeMicros(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => {
                 for (k, v) in m {
@@ -224,6 +230,38 @@ impl TryIntoValue for ParquetValue {
             timestamp @ ParquetValue::TimestampNanos(_, _) => {
                 impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
             }
+            ParquetValue::TimeMillis(millis) => {
+                // Convert time of day in milliseconds to a Ruby Time object
+                // Use epoch date (1970-01-01) with the given time
+                let total_seconds = millis / 1000;
+                let ms = millis % 1000;
+                let hours = total_seconds / 3600;
+                let minutes = (total_seconds % 3600) / 60;
+                let seconds = total_seconds % 60;
+                // Create a Time object for 1970-01-01 with the given time
+                let time_class = handle.class_time();
+                let time = time_class.funcall::<_, _, Value>(
+                    "new",
+                    (1970, 1, 1, hours, minutes, seconds, ms * 1000), // Ruby expects microseconds
+                )?;
+                Ok(time.into_value_with(handle))
+            }
+            ParquetValue::TimeMicros(micros) => {
+                // Convert time of day in microseconds to a Ruby Time object
+                // Use epoch date (1970-01-01) with the given time
+                let total_seconds = micros / 1_000_000;
+                let us = micros % 1_000_000;
+                let hours = total_seconds / 3600;
+                let minutes = (total_seconds % 3600) / 60;
+                let seconds = total_seconds % 60;
+                // Create a Time object for 1970-01-01 with the given time
+                let time_class = handle.class_time();
+                let time = time_class
+                    .funcall::<_, _, Value>("new", (1970, 1, 1, hours, minutes, seconds, us))?;
+                Ok(time.into_value_with(handle))
+            }
             ParquetValue::List(l) => {
                 // For lists, convert to Ruby array and check for specific cases
                 // when we might need to return nil instead of an empty array
@@ -356,12 +394,32 @@ impl ParquetValue {
                     Ok(ParquetValue::Date32(v))
                 }
                 PrimitiveType::TimestampMillis => {
-                    let v = convert_to_timestamp_millis(ruby, value, format)?;
-                    Ok(ParquetValue::TimestampMillis(v, None))
+                    if value.is_kind_of(ruby.class_time()) {
+                        use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
+                        let (v, tz) = ruby_time_to_timestamp_with_tz(value, "millis")?;
+                        Ok(ParquetValue::TimestampMillis(v, tz))
+                    } else {
+                        let v = convert_to_timestamp_millis(ruby, value, format)?;
+                        Ok(ParquetValue::TimestampMillis(v, None))
+                    }
                 }
                 PrimitiveType::TimestampMicros => {
-                    let v = convert_to_timestamp_micros(ruby, value, format)?;
-                    Ok(ParquetValue::TimestampMicros(v, None))
+                    if value.is_kind_of(ruby.class_time()) {
+                        use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
+                        let (v, tz) = ruby_time_to_timestamp_with_tz(value, "micros")?;
+                        Ok(ParquetValue::TimestampMicros(v, tz))
+                    } else {
+                        let v = convert_to_timestamp_micros(ruby, value, format)?;
+                        Ok(ParquetValue::TimestampMicros(v, None))
+                    }
+                }
+                PrimitiveType::TimeMillis => {
+                    let v = convert_to_time_millis(ruby, value, format)?;
+                    Ok(ParquetValue::TimeMillis(v))
+                }
+                PrimitiveType::TimeMicros => {
+                    let v = convert_to_time_micros(ruby, value, format)?;
+                    Ok(ParquetValue::TimeMicros(v))
                 }
             },
             ParquetSchemaType::List(list_field) => {
@@ -980,6 +1038,52 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
                     tz
                 )
             }
+            DataType::Time32(TimeUnit::Millisecond) => {
+                let array = downcast_array::<Time32MillisecondArray>(column.array);
+                Ok(ParquetValueVec(if array.is_nullable() {
+                    array
+                        .values()
+                        .iter()
+                        .enumerate()
+                        .map(|(i, x)| {
+                            if array.is_null(i) {
+                                ParquetValue::Null
+                            } else {
+                                ParquetValue::TimeMillis(*x)
+                            }
+                        })
+                        .collect()
+                } else {
+                    array
+                        .values()
+                        .iter()
+                        .map(|x| ParquetValue::TimeMillis(*x))
+                        .collect()
+                }))
+            }
+            DataType::Time64(TimeUnit::Microsecond) => {
+                let array = downcast_array::<Time64MicrosecondArray>(column.array);
+                Ok(ParquetValueVec(if array.is_nullable() {
+                    array
+                        .values()
+                        .iter()
+                        .enumerate()
+                        .map(|(i, x)| {
+                            if array.is_null(i) {
+                                ParquetValue::Null
+                            } else {
+                                ParquetValue::TimeMicros(*x)
+                            }
+                        })
+                        .collect()
+                } else {
+                    array
+                        .values()
+                        .iter()
+                        .map(|x| ParquetValue::TimeMicros(*x))
+                        .collect()
+                }))
+            }
             DataType::Float16 => {
                 let array = downcast_array::<Float16Array>(column.array);
                 if array.is_nullable() {

data/ext/parquet/src/types/schema_node.rs CHANGED Viewed

@@ -295,6 +295,8 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
         "date" | "date32" => Some(PrimitiveType::Date32),
         "timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
         "timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
+        "time_millis" | "time_ms" => Some(PrimitiveType::TimeMillis),
+        "time_micros" | "time_us" => Some(PrimitiveType::TimeMicros),
         "decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
         "decimal256" => Some(PrimitiveType::Decimal256(38, 0)), // Maximum precision, scale 0
         _ => None,
@@ -337,6 +339,12 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
                 PrimitiveType::TimestampMicros => {
                     ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
                 }
+                PrimitiveType::TimeMillis => {
+                    ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond)
+                }
+                PrimitiveType::TimeMicros => {
+                    ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond)
+                }
             };
             ArrowField::new(name, dt, *nullable)
         }