parquet 0.2.9 → 0.2.12
This diff reflects the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +82 -651
- data/README.md +7 -0
- data/ext/parquet/Cargo.toml +3 -1
- data/ext/parquet/src/enumerator.rs +9 -17
- data/ext/parquet/src/header_cache.rs +20 -80
- data/ext/parquet/src/reader/mod.rs +2 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +82 -106
- data/ext/parquet/src/reader/parquet_row_reader.rs +51 -46
- data/ext/parquet/src/types/writer_types.rs +1 -0
- data/ext/parquet/src/writer/mod.rs +92 -10
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +4 -2
- metadata +2 -2
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -1,6 +1,7 @@
 use std::{
     fs::File,
     io::{self, BufReader, BufWriter},
+    mem,
     sync::Arc,
 };
 
@@ -16,6 +17,7 @@ use parquet::{
     basic::{Compression, GzipLevel, ZstdLevel},
     file::properties::WriterProperties,
 };
+use rand::Rng;
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -24,7 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };
 
-const
+const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
+const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
+const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
 
 // Maximum memory usage per batch (64MB by default)
 const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
@@ -42,12 +46,18 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError>
             Option<Option<usize>>,
             Option<Option<usize>>,
             Option<Option<String>>,
+            Option<Option<usize>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &["schema", "write_to"],
-        &[
+        &[
+            "batch_size",
+            "flush_threshold",
+            "compression",
+            "sample_size",
+        ],
     )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
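A note on the `Option<Option<usize>>` added above: with this keyword-parsing pattern, the outer `Option` records whether the keyword was passed at all, and the inner one whether it was passed as `nil`; `.flatten()` then collapses both cases to `None`. A minimal std-only sketch of that collapse, with a hypothetical `Kwargs` struct standing in for the parsed result:

```rust
// Sketch: how `Option<Option<T>>` distinguishes "absent" from "nil".
// `Kwargs` is a hypothetical stand-in for the parsed keyword arguments.
struct Kwargs {
    sample_size: Option<Option<usize>>,
}

fn main() {
    let absent = Kwargs { sample_size: None };         // keyword not passed
    let nil = Kwargs { sample_size: Some(None) };      // passed as nil
    let set = Kwargs { sample_size: Some(Some(500)) }; // passed as 500

    // `.flatten()` collapses both "absent" and "nil" to None,
    // which is the shape ParquetWriteArgs ultimately stores.
    assert_eq!(absent.sample_size.flatten(), None);
    assert_eq!(nil.sample_size.flatten(), None);
    assert_eq!(set.sample_size.flatten(), Some(500));
}
```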
@@ -123,9 +133,57 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError>
         batch_size: kwargs.optional.0.flatten(),
         flush_threshold: kwargs.optional.1.flatten(),
         compression: kwargs.optional.2.flatten(),
+        sample_size: kwargs.optional.3.flatten(),
     })
 }
 
+/// Estimate the size of a row
+fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
+    let mut row_size = 0;
+    for (field, value) in schema.iter().zip(row.into_iter()) {
+        // Estimate size based on type and value
+        row_size += match &field.type_ {
+            // Use reference to avoid moving
+            ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
+            ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
+            ParquetSchemaType::Int32
+            | ParquetSchemaType::UInt32
+            | ParquetSchemaType::Float
+            | ParquetSchemaType::Date32 => 4,
+            ParquetSchemaType::Int64
+            | ParquetSchemaType::UInt64
+            | ParquetSchemaType::Double
+            | ParquetSchemaType::TimestampMillis
+            | ParquetSchemaType::TimestampMicros => 8,
+            ParquetSchemaType::String => {
+                if let Ok(s) = String::try_convert(value) {
+                    s.len() + mem::size_of::<usize>() // account for length prefix
+                } else {
+                    16 // default estimate for string
+                }
+            }
+            ParquetSchemaType::Binary => {
+                if let Ok(bytes) = Vec::<u8>::try_convert(value) {
+                    bytes.len() + mem::size_of::<usize>() // account for length prefix
+                } else {
+                    16 // default estimate for binary
+                }
+            }
+            ParquetSchemaType::Boolean => 1,
+            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
+                32 // rough estimate for complex types
+            }
+        };
+    }
+    Ok(row_size)
+}
+
+/// Calculate optimal batch size based on memory threshold and estimated row size
+fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
+    let batch_size = memory_threshold / row_size;
+    batch_size.max(MIN_BATCH_SIZE)
+}
+
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
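The batch-size arithmetic above is deliberately simple: divide the memory threshold by the estimated row size and clamp to the minimum. A standalone sketch of the same calculation, with constants copied from the diff and an added zero-guard (`row_size.max(1)`) that is my addition, not in the original:

```rust
const MIN_BATCH_SIZE: usize = 100;
const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024; // 64MB

/// Same shape as the diff's calculate_batch_size: pick a row count so
/// that batch_size * row_size stays under the memory threshold.
fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
    (memory_threshold / row_size.max(1)).max(MIN_BATCH_SIZE)
}

fn main() {
    // 256-byte rows: 64MB / 256B = 262,144 rows per batch.
    assert_eq!(calculate_batch_size(256, DEFAULT_MEMORY_THRESHOLD), 262_144);
    // Very wide rows (1MB each): the division gives 64, clamped up to 100.
    assert_eq!(calculate_batch_size(1 << 20, DEFAULT_MEMORY_THRESHOLD), 100);
}
```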
@@ -134,13 +192,12 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size,
+        batch_size: user_batch_size,
         compression,
         flush_threshold,
+        sample_size: user_sample_size,
     } = parse_parquet_write_args(args)?;
 
-    let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
-
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
 
     // Convert schema to Arrow schema
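The rename at the destructuring site (`batch_size: user_batch_size`) keeps the user's raw request distinct from the adaptive value computed later, so the code can still ask "did the caller pin a batch size?". A small sketch of the pattern, with a hypothetical `WriteArgs` stand-in:

```rust
// Hypothetical stand-in for ParquetWriteArgs.
struct WriteArgs {
    batch_size: Option<usize>,
    sample_size: Option<usize>,
}

fn main() {
    let args = WriteArgs { batch_size: None, sample_size: Some(50) };

    // Rename on destructuring: the Options stay around as `user_*`
    // so later code can check `user_batch_size.is_none()` before adapting.
    let WriteArgs { batch_size: user_batch_size, sample_size: user_sample_size } = args;

    let sample_size = user_sample_size.unwrap_or(100);
    let initial_batch = user_batch_size.unwrap_or(100);
    println!("sample {} rows, start with batches of {}", sample_size, initial_batch);
}
```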
@@ -185,11 +242,20 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
         let mut column_collectors: Vec<ColumnCollector> = schema
-            .
-            .map(|field|
+            .iter()
+            .map(|field| {
+                // Clone the type to avoid moving from a reference
+                let type_clone = field.type_.clone();
+                ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
+            })
             .collect();
 
         let mut rows_in_batch = 0;
+        let mut total_rows = 0;
+        let mut rng = rand::rng();
+        let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
+        let mut size_samples = Vec::with_capacity(sample_size);
+        let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
 
         loop {
             match read_from.funcall::<_, _, Value>("next", ()) {
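The clone inside the closure matters because `schema.iter()` yields shared references; moving `field.type_` out of a `&SchemaField` would not compile (error E0507). A minimal sketch of the same borrow issue, with hypothetical `Field`/`Collector` types:

```rust
// Hypothetical stand-ins for SchemaField and ColumnCollector.
#[derive(Clone)]
enum ColType { Int64, Utf8 }

struct Field { name: String, type_: ColType }
struct Collector { name: String, type_: ColType }

fn main() {
    let schema = vec![
        Field { name: "id".into(), type_: ColType::Int64 },
        Field { name: "label".into(), type_: ColType::Utf8 },
    ];

    // `iter()` yields `&Field`; cloning gives each collector owned data.
    // Writing `type_: field.type_` without `.clone()` would try to move
    // out of a shared reference and be rejected by the borrow checker.
    let collectors: Vec<Collector> = schema
        .iter()
        .map(|field| Collector { name: field.name.clone(), type_: field.type_.clone() })
        .collect();

    println!("built {} collectors", collectors.len());
}
```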
@@ -211,15 +277,30 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                     ));
                 }
 
-                //
+                // Sample row sizes using reservoir sampling
+                if size_samples.len() < sample_size {
+                    size_samples.push(estimate_single_row_size(&row_array, &schema)?);
+                } else if rng.random_range(0..=total_rows) < sample_size {
+                    let idx = rng.random_range(0..sample_size);
+                    size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
+                }
+
+                // Process each value in the row
                 for (collector, value) in column_collectors.iter_mut().zip(row_array) {
                     collector.push_value(value)?;
                 }
 
                 rows_in_batch += 1;
+                total_rows += 1;
+
+                // Recalculate batch size if we have enough samples and no user-specified size
+                if size_samples.len() >= sample_size && user_batch_size.is_none() {
+                    let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
+                    current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
+                }
 
                 // When we reach batch size, write the batch
-                if rows_in_batch >=
+                if rows_in_batch >= current_batch_size {
                     write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     rows_in_batch = 0;
                 }
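The sampling block above is Algorithm R: the first `sample_size` rows fill the reservoir, after which the row arriving when `total_rows == n` replaces a random slot with probability `sample_size / (n + 1)`, so every row seen so far is equally likely to be in the sample. A standalone sketch, assuming the rand 0.9 API (`rand::rng`, `Rng::random_range`) that the diff's `use rand::Rng;` suggests:

```rust
// Standalone reservoir sampling (Algorithm R), mirroring the diff's logic.
// Assumes rand 0.9: `rand::rng()` and `Rng::random_range`.
use rand::Rng;

fn main() {
    let sample_size = 10;
    let mut rng = rand::rng();
    let mut reservoir: Vec<u64> = Vec::with_capacity(sample_size);
    let mut seen = 0usize; // plays the role of `total_rows`

    for item in 0u64..10_000 {
        if reservoir.len() < sample_size {
            reservoir.push(item); // warm-up: keep the first sample_size items
        } else if rng.random_range(0..=seen) < sample_size {
            // item replaces a random slot with probability sample_size/(seen+1)
            let idx = rng.random_range(0..sample_size);
            reservoir[idx] = item;
        }
        seen += 1;
    }

    // Each of the 10,000 items had an equal chance of ending up here.
    println!("sample: {:?}", reservoir);
}
```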
@@ -263,6 +344,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         batch_size: _,
         compression,
         flush_threshold,
+        sample_size: _,
     } = parse_parquet_write_args(args)?;
 
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
@@ -483,7 +565,7 @@ fn write_batch(
 
     match writer {
         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-            if w.in_progress_size() >= flush_threshold {
+            if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
                 w.flush().map_err(|e| ParquetErrorWrapper(e))?;
            }
        }
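The widened condition flushes when either the encoded-but-unflushed row group (`in_progress_size`) or the total buffered memory (`memory_size`) crosses the threshold; per the diff these are methods on the parquet crate's ArrowWriter. A std-only sketch of the either-side check, using a hypothetical mock writer in place of the real one:

```rust
// Hypothetical writer tracking two sizes, standing in for ArrowWriter's
// in_progress_size() (unflushed row group) and memory_size() (total buffered).
struct MockWriter {
    in_progress: usize,
    memory: usize,
    flushes: usize,
}

impl MockWriter {
    fn maybe_flush(&mut self, flush_threshold: usize) {
        // Flush when EITHER measure crosses the threshold, as in the diff.
        if self.in_progress >= flush_threshold || self.memory >= flush_threshold {
            self.in_progress = 0;
            self.memory = 0;
            self.flushes += 1;
        }
    }
}

fn main() {
    let threshold = 64 * 1024 * 1024; // matches DEFAULT_MEMORY_THRESHOLD
    let mut w = MockWriter { in_progress: 0, memory: 70 * 1024 * 1024, flushes: 0 };

    // Before this change only in_progress was checked, so 70MB of buffered
    // writer memory would not have triggered a flush here.
    w.maybe_flush(threshold);
    assert_eq!(w.flushes, 1);
}
```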
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -68,6 +68,7 @@ module Parquet
   # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
@@ -75,10 +76,11 @@ module Parquet
       write_to: T.any(String, IO),
       batch_size: T.nilable(Integer),
       flush_threshold: T.nilable(Integer),
-      compression: T.nilable(String)
+      compression: T.nilable(String),
+      sample_size: T.nilable(Integer)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil, sample_size: nil)
   end
 
   # Options:
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.9
+  version: 0.2.12
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys