parquet 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
-  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
+  metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
+  data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
 SHA512:
-  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
-  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
+  metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
+  data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
data/README.md CHANGED
@@ -117,6 +117,13 @@ Parquet.write_rows(rows,
   write_to: "data.parquet",
   batch_size: 500
 )
+
+# Optionally specify memory threshold for flushing (default is 64MB)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
 ```
 
 ### Writing Column-wise Data
@@ -155,11 +162,12 @@ columns = batches.each
 # Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
-# Write to a parquet file with specific compression
+# Write to a parquet file with specific compression and memory threshold
 Parquet.write_columns(columns,
   schema: schema,
   write_to: "data.parquet",
-  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  flush_threshold: 32 * 1024 * 1024 # 32MB
 )
 
 # Write to an IO object
@@ -24,6 +24,7 @@ pub struct ParquetWriteArgs<'a> {
     pub write_to: Value,
     pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub flush_threshold: Option<usize>,
     pub compression: Option<String>,
 }
 
@@ -26,18 +26,29 @@ use crate::{
 
 const DEFAULT_BATCH_SIZE: usize = 1000;
 
+// Maximum memory usage per batch (64MB by default)
+const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs =
-        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
-            parsed_args.keywords,
-            &["schema", "write_to"],
-            &["batch_size", "compression"],
-        )?;
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+        ),
+        (),
+    >(
+        parsed_args.keywords,
+        &["schema", "write_to"],
+        &["batch_size", "flush_threshold", "compression"],
+    )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -110,7 +121,8 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
-        compression: kwargs.optional.1.flatten(),
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
     })
 }
 
@@ -124,10 +136,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         schema,
         batch_size,
         compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
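
A note on the hunk above: `flush_threshold` arrives from `get_kwargs` as a doubly nested `Option<Option<usize>>` and is collapsed with `flatten()` before `unwrap_or(DEFAULT_MEMORY_THRESHOLD)` applies the 64MB default. A minimal standalone sketch of that collapse, assuming (this reading is inferred from the `flatten()` call, not stated in the diff) that the outer `Option` models an omitted keyword and the inner one a keyword passed as `nil`:

```rust
// Sketch only: mirrors the flatten()/unwrap_or pattern used by the writer above.
const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024; // 64MB, as in the gem

fn resolve_flush_threshold(kwarg: Option<Option<usize>>) -> usize {
    // flatten() folds "keyword omitted" (None) and "keyword given as nil"
    // (Some(None)) into a single None, then the default kicks in.
    kwarg.flatten().unwrap_or(DEFAULT_MEMORY_THRESHOLD)
}

fn main() {
    assert_eq!(resolve_flush_threshold(None), 64 * 1024 * 1024);
    assert_eq!(resolve_flush_threshold(Some(None)), 64 * 1024 * 1024);
    assert_eq!(resolve_flush_threshold(Some(Some(32 * 1024 * 1024))), 32 * 1024 * 1024);
}
```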
@@ -205,7 +220,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 
                 // When we reach batch size, write the batch
                 if rows_in_batch >= batch_size {
-                    write_batch(&mut writer, &mut column_collectors)?;
+                    write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     rows_in_batch = 0;
                 }
             }
@@ -213,7 +228,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                 if e.is_kind_of(ruby.exception_stop_iteration()) {
                     // Write any remaining rows
                     if rows_in_batch > 0 {
-                        write_batch(&mut writer, &mut column_collectors)?;
+                        write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     }
                     break;
                 }
@@ -247,8 +262,11 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         schema,
         batch_size: _,
         compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -339,6 +357,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     writer
                         .write(&record_batch)
                         .map_err(|e| ParquetErrorWrapper(e))?;
+
+                    match &mut writer {
+                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                            if w.in_progress_size() >= flush_threshold {
+                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                            }
+                        }
+                    }
                 }
                 Err(e) => {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -435,6 +461,7 @@ fn copy_temp_file_to_io_like(
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
+    flush_threshold: usize,
 ) -> Result<(), MagnusError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -454,5 +481,13 @@ fn write_batch(
         .write(&record_batch)
         .map_err(|e| ParquetErrorWrapper(e))?;
 
+    match writer {
+        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+            if w.in_progress_size() >= flush_threshold {
+                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+            }
+        }
+    }
+
     Ok(())
 }
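
The `in_progress_size()` / `flush()` pair added in `write_columns` and `write_batch` matches the arrow-rs `parquet::arrow::ArrowWriter` API, where `in_progress_size` estimates the memory held by the in-progress row group and `flush` writes that row group to the output. A standalone sketch of the same threshold pattern against `ArrowWriter` directly, under that assumption (this is an illustration, not the gem's code; the output path, column name, and 32MB cap are made up, and it needs the `arrow` and `parquet` crates):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    let file = std::fs::File::create("example.parquet")?; // hypothetical output path
    let props = WriterProperties::builder().build();
    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;

    let flush_threshold: usize = 32 * 1024 * 1024; // illustrative 32MB cap

    for chunk in 0..10i64 {
        let ids = Int64Array::from_iter_values(chunk * 1_000..(chunk + 1) * 1_000);
        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids) as ArrayRef])?;
        writer.write(&batch)?;

        // Same idea as the gem's new code: once the in-progress row group is
        // estimated to exceed the threshold, flush it and start a new row
        // group, bounding peak memory instead of buffering everything.
        if writer.in_progress_size() >= flush_threshold {
            writer.flush()?;
        }
    }

    writer.close()?; // finalizes the Parquet footer
    Ok(())
}
```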
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.8"
+  VERSION = "0.2.9"
 end
data/lib/parquet.rbi CHANGED
@@ -65,6 +65,7 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
@@ -73,10 +74,11 @@ module Parquet
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
       batch_size: T.nilable(Integer),
+      flush_threshold: T.nilable(Integer),
       compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
   end
 
   # Options:
@@ -92,6 +94,7 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
@@ -99,9 +102,10 @@ module Parquet
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
+      flush_threshold: T.nilable(Integer),
       compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:, compression: nil)
+  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
   end
 end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-13 00:00:00.000000000 Z
11
+ date: 2025-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys