parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,106 @@
|
|
1
|
+
use std::collections::HashMap;
|
2
|
+
use std::sync::{Arc, Mutex};
|
3
|
+
|
4
|
+
use magnus::RString;
|
5
|
+
|
6
|
+
/// A cache for interning strings in the Ruby VM to reduce memory usage
/// when there are many repeated strings
#[derive(Debug)]
pub struct StringCache {
    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
    /// while maintaining a single global cache
    // Maps the original Rust string to the Ruby-VM-interned &'static str.
    cache: Arc<Mutex<HashMap<String, &'static str>>>,
    // When false, `intern` is a pass-through: no locking, no stats.
    enabled: bool,
    // Count of `intern` calls that found an existing cache entry.
    hits: Arc<Mutex<usize>>,
    // Count of `intern` calls that inserted a new cache entry.
    misses: Arc<Mutex<usize>>,
}
|
17
|
+
|
18
|
+
impl StringCache {
    /// Create a new string cache
    ///
    /// `enabled = false` produces a cache whose `intern` does nothing but
    /// wrap the input in an `Arc<str>`.
    pub fn new(enabled: bool) -> Self {
        Self {
            cache: Arc::new(Mutex::new(HashMap::new())),
            enabled,
            hits: Arc::new(Mutex::new(0)),
            misses: Arc::new(Mutex::new(0)),
        }
    }

    /// Intern a string in Ruby's VM, returning the same string for tracking
    /// Note: We return the input string to maintain API compatibility,
    /// but internally we ensure it's interned in Ruby's VM
    ///
    /// NOTE(review): the returned `Arc<str>` is a fresh allocation on every
    /// call, and the cached `&'static str` values are inserted but never read
    /// back, so Rust callers do not actually share storage. The memory win
    /// presumably comes from Ruby's own fstring table via `to_interned_str`
    /// — confirm before relying on this for dedup on the Rust side.
    pub fn intern(&mut self, s: String) -> Arc<str> {
        if !self.enabled {
            return Arc::from(s.as_str());
        }

        // Try to get or create the interned string
        // (immediately-invoked closure so lock-poisoning / Ruby errors can
        // short-circuit with `?` without aborting the whole call).
        let result = (|| -> Result<(), String> {
            let mut cache = self.cache.lock().map_err(|e| e.to_string())?;

            if cache.contains_key(&s) {
                let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                *hits += 1;
            } else {
                // Create Ruby string and intern it
                let rstring = RString::new(&s);
                let interned = rstring.to_interned_str();
                let static_str = interned.as_str().map_err(|e| e.to_string())?;

                cache.insert(s.clone(), static_str);

                let mut misses = self.misses.lock().map_err(|e| e.to_string())?;
                *misses += 1;
            }
            Ok(())
        })();

        // Log any errors but don't fail - just return the string
        if let Err(e) = result {
            eprintln!("String cache error: {}", e);
        }

        Arc::from(s.as_str())
    }

    /// Get cache statistics
    ///
    /// Poisoned locks degrade gracefully to zero counts instead of panicking.
    pub fn stats(&self) -> CacheStats {
        let cache_size = self.cache.lock().map(|c| c.len()).unwrap_or(0);
        let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
        let misses = self.misses.lock().map(|m| *m).unwrap_or(0);

        CacheStats {
            enabled: self.enabled,
            size: cache_size,
            hits,
            misses,
            // Fraction of lookups served from cache; 0.0 when never used.
            hit_rate: if hits + misses > 0 {
                hits as f64 / (hits + misses) as f64
            } else {
                0.0
            },
        }
    }

    /// Clear the cache
    ///
    /// Drops all entries and resets both counters; poisoned locks are
    /// silently skipped (best-effort reset).
    pub fn clear(&mut self) {
        if let Ok(mut cache) = self.cache.lock() {
            cache.clear();
        }
        if let Ok(mut hits) = self.hits.lock() {
            *hits = 0;
        }
        if let Ok(mut misses) = self.misses.lock() {
            *misses = 0;
        }
    }
}
|
98
|
+
|
99
|
+
/// Snapshot of `StringCache` statistics, as returned by `StringCache::stats`.
#[derive(Debug)]
pub struct CacheStats {
    // Whether the cache was enabled when the snapshot was taken.
    pub enabled: bool,
    // Number of distinct strings currently cached.
    pub size: usize,
    // Lookups that found an existing entry.
    pub hits: usize,
    // Lookups that had to insert a new entry.
    pub misses: usize,
    // hits / (hits + misses); 0.0 when the cache has never been queried.
    pub hit_rate: f64,
}
|
@@ -0,0 +1,91 @@
|
|
1
|
+
use crate::{error::Result, RubyAdapterError};
|
2
|
+
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
3
|
+
|
4
|
+
/// Trait for converting Rust values to Ruby values with error handling
|
5
|
+
///
|
6
|
+
/// This is similar to Magnus's `IntoValue` trait but allows for returning errors
|
7
|
+
/// instead of panicking or returning invalid values.
|
8
|
+
pub trait TryIntoValue: Sized {
|
9
|
+
/// Convert `self` to a Ruby value with error handling
|
10
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value>;
|
11
|
+
|
12
|
+
/// Convert `self` to a Ruby value with error handling, using the Ruby runtime from the current thread
|
13
|
+
fn try_into_value_with_current_thread(self) -> Result<Value> {
|
14
|
+
let ruby =
|
15
|
+
Ruby::get().map_err(|_| RubyAdapterError::runtime("Failed to get Ruby runtime"))?;
|
16
|
+
self.try_into_value(&ruby)
|
17
|
+
}
|
18
|
+
}
|
19
|
+
|
20
|
+
// Note: We don't provide a blanket implementation for all IntoValue types
|
21
|
+
// because some types may want to provide custom error handling.
|
22
|
+
// Types that need TryIntoValue should implement it explicitly.
|
23
|
+
|
24
|
+
// Convenience implementations for common types
|
25
|
+
impl TryIntoValue for String {
|
26
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
27
|
+
Ok(self.into_value_with(handle))
|
28
|
+
}
|
29
|
+
}
|
30
|
+
|
31
|
+
impl TryIntoValue for &str {
|
32
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
33
|
+
Ok(self.into_value_with(handle))
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
impl TryIntoValue for i32 {
|
38
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
39
|
+
Ok(self.into_value_with(handle))
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
43
|
+
impl TryIntoValue for i64 {
|
44
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
45
|
+
Ok(self.into_value_with(handle))
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
impl TryIntoValue for f32 {
|
50
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
51
|
+
Ok(self.into_value_with(handle))
|
52
|
+
}
|
53
|
+
}
|
54
|
+
|
55
|
+
impl TryIntoValue for f64 {
|
56
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
57
|
+
Ok(self.into_value_with(handle))
|
58
|
+
}
|
59
|
+
}
|
60
|
+
|
61
|
+
impl TryIntoValue for bool {
|
62
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
63
|
+
Ok(self.into_value_with(handle))
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
impl<T> TryIntoValue for Vec<T>
|
68
|
+
where
|
69
|
+
T: TryIntoValue,
|
70
|
+
{
|
71
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
72
|
+
let array = handle.ary_new();
|
73
|
+
for item in self {
|
74
|
+
let ruby_value = item.try_into_value(handle)?;
|
75
|
+
array.push(ruby_value)?;
|
76
|
+
}
|
77
|
+
Ok(handle.into_value(array))
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
impl<T> TryIntoValue for Option<T>
|
82
|
+
where
|
83
|
+
T: TryIntoValue,
|
84
|
+
{
|
85
|
+
fn try_into_value(self, handle: &Ruby) -> Result<Value> {
|
86
|
+
match self {
|
87
|
+
Some(value) => value.try_into_value(handle),
|
88
|
+
None => Ok(handle.qnil().as_value()),
|
89
|
+
}
|
90
|
+
}
|
91
|
+
}
|
@@ -0,0 +1,94 @@
|
|
1
|
+
use magnus::Value;
|
2
|
+
use std::fs::File;
|
3
|
+
use std::str::FromStr;
|
4
|
+
use tempfile::NamedTempFile;
|
5
|
+
|
6
|
+
/// Arguments for writing Parquet files
///
/// Field meanings follow the kwargs parsed in `parse_parquet_write_args`:
/// `write(read_from, schema:, write_to:, batch_size: nil, ...)`.
#[derive(Debug)]
pub struct ParquetWriteArgs {
    // Ruby object the rows are read from (positional argument).
    pub read_from: Value,
    // Destination Ruby object — presumably a path string or IO; confirm
    // against the writer implementation.
    pub write_to: Value,
    // Ruby schema definition (the `schema:` kwarg), parsed downstream.
    pub schema_value: Value,
    // Rows per write batch; `None` means "kwarg absent or nil".
    pub batch_size: Option<usize>,
    // Byte threshold that triggers a flush, when provided.
    pub flush_threshold: Option<usize>,
    // Compression codec name; see `parse_compression` for accepted values.
    pub compression: Option<String>,
    // Sample size kwarg; semantics defined by the writer.
    pub sample_size: Option<usize>,
    // Optional Ruby logger object.
    pub logger: Option<Value>,
    // Whether to enable the Ruby string-interning cache.
    pub string_cache: Option<bool>,
}
|
19
|
+
|
20
|
+
/// Arguments for creating row enumerators
///
/// Consumed by `create_row_enumerator`, which turns these into
/// `rb_self.to_enum(:each_row, to_read, **kwargs)`.
pub struct RowEnumeratorArgs {
    // Receiver the enumerator is created on.
    pub rb_self: Value,
    // Source object forwarded as the first `each_row` argument.
    pub to_read: Value,
    // Forwarded as the `result_type:` kwarg (`:hash` or `:array`).
    pub result_type: ParserResultType,
    // Column projection; forwarded as `columns:` only when present.
    pub columns: Option<Vec<String>>,
    // Forwarded as `strict: true` only when set (callee default otherwise).
    pub strict: bool,
    // Optional Ruby logger, forwarded as `logger:` when present.
    pub logger: Option<Value>,
}
|
29
|
+
|
30
|
+
/// Arguments for creating column enumerators
///
/// Consumed by `create_column_enumerator`, which turns these into
/// `rb_self.to_enum(:each_column, to_read, **kwargs)`.
pub struct ColumnEnumeratorArgs {
    // Receiver the enumerator is created on.
    pub rb_self: Value,
    // Source object forwarded as the first `each_column` argument.
    pub to_read: Value,
    // Forwarded as the `result_type:` kwarg (`:hash` or `:array`).
    pub result_type: ParserResultType,
    // Column projection; forwarded as `columns:` only when present.
    pub columns: Option<Vec<String>>,
    // Forwarded as `batch_size:` only when present.
    pub batch_size: Option<usize>,
    // Forwarded as `strict: true` only when set (callee default otherwise).
    pub strict: bool,
    // Optional Ruby logger, forwarded as `logger:` when present.
    pub logger: Option<Value>,
}
|
40
|
+
|
41
|
+
/// Enum to handle different writer outputs
pub enum WriterOutput {
    /// Writing directly to a file on disk.
    File(parquet_core::Writer<File>),
    /// Writing via a named temp file on behalf of a Ruby IO object.
    TempFile(parquet_core::Writer<File>, NamedTempFile, Value), // Writer, temp file, IO object
}
|
46
|
+
|
47
|
+
/// Result type for parser output
///
/// Controls whether each parsed row is yielded to Ruby as a Hash
/// (column name => value) or as a positional Array. The wire strings
/// are `"hash"` / `"array"` (see the `TryFrom`/`Display` impls).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum ParserResultType {
    Hash,
    Array,
}
|
53
|
+
|
54
|
+
impl ParserResultType {
|
55
|
+
pub fn iter() -> impl Iterator<Item = Self> {
|
56
|
+
[Self::Hash, Self::Array].into_iter()
|
57
|
+
}
|
58
|
+
}
|
59
|
+
|
60
|
+
impl FromStr for ParserResultType {
|
61
|
+
type Err = String;
|
62
|
+
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
63
|
+
Self::try_from(s)
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
impl TryFrom<&str> for ParserResultType {
|
68
|
+
type Error = String;
|
69
|
+
|
70
|
+
fn try_from(value: &str) -> Result<Self, Self::Error> {
|
71
|
+
match value {
|
72
|
+
"hash" => Ok(ParserResultType::Hash),
|
73
|
+
"array" => Ok(ParserResultType::Array),
|
74
|
+
_ => Err(format!("Invalid parser result type: {}", value)),
|
75
|
+
}
|
76
|
+
}
|
77
|
+
}
|
78
|
+
|
79
|
+
impl TryFrom<String> for ParserResultType {
|
80
|
+
type Error = String;
|
81
|
+
|
82
|
+
fn try_from(value: String) -> Result<Self, Self::Error> {
|
83
|
+
Self::try_from(value.as_str())
|
84
|
+
}
|
85
|
+
}
|
86
|
+
|
87
|
+
impl std::fmt::Display for ParserResultType {
|
88
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
89
|
+
match self {
|
90
|
+
ParserResultType::Hash => write!(f, "hash"),
|
91
|
+
ParserResultType::Array => write!(f, "array"),
|
92
|
+
}
|
93
|
+
}
|
94
|
+
}
|
@@ -0,0 +1,186 @@
|
|
1
|
+
use magnus::value::ReprValue;
|
2
|
+
use magnus::{
|
3
|
+
scan_args::{get_kwargs, scan_args},
|
4
|
+
Error as MagnusError, KwArgs, RArray, RHash, Ruby, Symbol, Value,
|
5
|
+
};
|
6
|
+
use parquet::basic::Compression;
|
7
|
+
use parquet_core::ParquetValue;
|
8
|
+
|
9
|
+
use crate::types::{ColumnEnumeratorArgs, ParquetWriteArgs, RowEnumeratorArgs};
|
10
|
+
|
11
|
+
/// Estimate the memory size of a ParquetValue
|
12
|
+
pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
|
13
|
+
match value {
|
14
|
+
ParquetValue::Null => 1,
|
15
|
+
ParquetValue::Boolean(_) => 1,
|
16
|
+
ParquetValue::Int8(_) => 1,
|
17
|
+
ParquetValue::Int16(_) => 2,
|
18
|
+
ParquetValue::Int32(_) => 4,
|
19
|
+
ParquetValue::Int64(_) => 8,
|
20
|
+
ParquetValue::UInt8(_) => 1,
|
21
|
+
ParquetValue::UInt16(_) => 2,
|
22
|
+
ParquetValue::UInt32(_) => 4,
|
23
|
+
ParquetValue::UInt64(_) => 8,
|
24
|
+
ParquetValue::Float16(_) => 4,
|
25
|
+
ParquetValue::Float32(_) => 4,
|
26
|
+
ParquetValue::Float64(_) => 8,
|
27
|
+
ParquetValue::String(s) => s.len() + 24, // String overhead
|
28
|
+
ParquetValue::Bytes(b) => b.len() + 24, // Vec overhead
|
29
|
+
ParquetValue::Date32(_) => 4,
|
30
|
+
ParquetValue::Date64(_) => 8,
|
31
|
+
ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
|
32
|
+
ParquetValue::Decimal256(_, _) => 32 + 1, // approx size for BigInt + scale
|
33
|
+
ParquetValue::TimestampSecond(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
|
34
|
+
ParquetValue::TimestampMillis(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
|
35
|
+
ParquetValue::TimestampMicros(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
|
36
|
+
ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
|
37
|
+
ParquetValue::TimeMillis(_) => 4,
|
38
|
+
ParquetValue::TimeMicros(_) => 8,
|
39
|
+
ParquetValue::List(items) => {
|
40
|
+
24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
|
41
|
+
}
|
42
|
+
ParquetValue::Map(entries) => {
|
43
|
+
48 + entries
|
44
|
+
.iter()
|
45
|
+
.map(|(k, v)| estimate_parquet_value_size(k) + estimate_parquet_value_size(v))
|
46
|
+
.sum::<usize>()
|
47
|
+
}
|
48
|
+
ParquetValue::Record(fields) => {
|
49
|
+
48 + fields
|
50
|
+
.iter()
|
51
|
+
.map(|(k, v)| k.len() + 24 + estimate_parquet_value_size(v))
|
52
|
+
.sum::<usize>()
|
53
|
+
}
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
/// Estimate the memory size of a row
|
58
|
+
pub fn estimate_row_size(row: &[ParquetValue]) -> usize {
|
59
|
+
row.iter().map(estimate_parquet_value_size).sum()
|
60
|
+
}
|
61
|
+
|
62
|
+
/// Parse compression type from string
|
63
|
+
pub fn parse_compression(compression: Option<String>) -> Result<Compression, MagnusError> {
|
64
|
+
match compression.map(|s| s.to_lowercase()).as_deref() {
|
65
|
+
Some("none") | Some("uncompressed") => Ok(Compression::UNCOMPRESSED),
|
66
|
+
Some("snappy") => Ok(Compression::SNAPPY),
|
67
|
+
Some("gzip") => Ok(Compression::GZIP(parquet::basic::GzipLevel::default())),
|
68
|
+
Some("lz4") => Ok(Compression::LZ4),
|
69
|
+
Some("zstd") => Ok(Compression::ZSTD(parquet::basic::ZstdLevel::default())),
|
70
|
+
Some("brotli") => Ok(Compression::BROTLI(parquet::basic::BrotliLevel::default())),
|
71
|
+
None => Ok(Compression::SNAPPY), // Default to SNAPPY
|
72
|
+
Some(other) => Err(MagnusError::new(
|
73
|
+
magnus::exception::arg_error(),
|
74
|
+
format!("Invalid compression option: '{}'. Valid options are: none, snappy, gzip, lz4, zstd, brotli", other),
|
75
|
+
)),
|
76
|
+
}
|
77
|
+
}
|
78
|
+
|
79
|
+
/// Parse arguments for Parquet writing
///
/// Ruby-side signature: `write(read_from, schema:, write_to:, batch_size: nil,
/// flush_threshold: nil, compression: nil, sample_size: nil, logger: nil,
/// string_cache: nil)`.
///
/// The `Option<Option<T>>` kwarg types distinguish "kwarg absent" (outer
/// `None`) from "kwarg explicitly nil" (inner `None`); `.flatten()`
/// collapses both to `None` for the caller.
pub fn parse_parquet_write_args(
    _ruby: &Ruby,
    args: &[Value],
) -> Result<ParquetWriteArgs, MagnusError> {
    // Exactly one required positional argument: the row source.
    let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
    let (read_from,) = parsed_args.required;

    // Tuple positions below must match the name arrays passed to get_kwargs:
    // required = (schema, write_to); optional = (batch_size, flush_threshold,
    // compression, sample_size, logger, string_cache).
    let kwargs = get_kwargs::<
        _,
        (Value, Value),
        (
            Option<Option<usize>>,
            Option<Option<usize>>,
            Option<Option<String>>,
            Option<Option<usize>>,
            Option<Option<Value>>,
            Option<Option<bool>>,
        ),
        (),
    >(
        parsed_args.keywords,
        &["schema", "write_to"],
        &[
            "batch_size",
            "flush_threshold",
            "compression",
            "sample_size",
            "logger",
            "string_cache",
        ],
    )?;

    Ok(ParquetWriteArgs {
        read_from,
        // Note the index swap: required.0 is schema, required.1 is write_to.
        write_to: kwargs.required.1,
        schema_value: kwargs.required.0,
        batch_size: kwargs.optional.0.flatten(),
        flush_threshold: kwargs.optional.1.flatten(),
        compression: kwargs.optional.2.flatten(),
        sample_size: kwargs.optional.3.flatten(),
        logger: kwargs.optional.4.flatten(),
        string_cache: kwargs.optional.5.flatten(),
    })
}
|
124
|
+
|
125
|
+
/// Handle block or enumerator creation
|
126
|
+
pub fn handle_block_or_enum<F, T>(
|
127
|
+
block_given: bool,
|
128
|
+
create_enum: F,
|
129
|
+
) -> Result<Option<T>, MagnusError>
|
130
|
+
where
|
131
|
+
F: FnOnce() -> Result<T, MagnusError>,
|
132
|
+
{
|
133
|
+
if !block_given {
|
134
|
+
let enum_value = create_enum()?;
|
135
|
+
return Ok(Some(enum_value));
|
136
|
+
}
|
137
|
+
Ok(None)
|
138
|
+
}
|
139
|
+
|
140
|
+
/// Create a row enumerator
///
/// Builds the Ruby equivalent of `rb_self.to_enum(:each_row, to_read,
/// **kwargs)` so that calling `each_row` without a block returns a lazy
/// Enumerator that replays the same arguments.
pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
    let kwargs = RHash::new();
    // result_type is always forwarded, as a Symbol (:hash / :array).
    kwargs.aset(
        Symbol::new("result_type"),
        Symbol::new(args.result_type.to_string()),
    )?;
    if let Some(columns) = args.columns {
        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
    }
    // Only set strict when true, so the callee's default applies otherwise.
    if args.strict {
        kwargs.aset(Symbol::new("strict"), true)?;
    }
    if let Some(logger) = args.logger {
        kwargs.aset(Symbol::new("logger"), logger)?;
    }
    Ok(args
        .rb_self
        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
}
|
160
|
+
|
161
|
+
/// Create a column enumerator
///
/// Builds the Ruby equivalent of `rb_self.to_enum(:each_column, to_read,
/// **kwargs)`; mirrors `create_row_enumerator` with the addition of an
/// optional `batch_size:` kwarg.
#[inline]
pub fn create_column_enumerator(
    args: ColumnEnumeratorArgs,
) -> Result<magnus::Enumerator, MagnusError> {
    let kwargs = RHash::new();
    // result_type is always forwarded, as a Symbol (:hash / :array).
    kwargs.aset(
        Symbol::new("result_type"),
        Symbol::new(args.result_type.to_string()),
    )?;
    if let Some(columns) = args.columns {
        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
    }
    if let Some(batch_size) = args.batch_size {
        kwargs.aset(Symbol::new("batch_size"), batch_size)?;
    }
    // Only set strict when true, so the callee's default applies otherwise.
    if args.strict {
        kwargs.aset(Symbol::new("strict"), true)?;
    }
    if let Some(logger) = args.logger {
        kwargs.aset(Symbol::new("logger"), logger)?;
    }
    Ok(args
        .rb_self
        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
}
|