parquet 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -1
- data/ext/parquet/src/types/writer_types.rs +2 -0
- data/ext/parquet/src/writer/mod.rs +66 -10
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +14 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
+  data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
+  data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
data/README.md
CHANGED
@@ -117,6 +117,13 @@ Parquet.write_rows(rows,
   write_to: "data.parquet",
   batch_size: 500
 )
+
+# Optionally specify memory threshold for flushing (default is 64MB)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
 ```
 
 ### Writing Column-wise Data
@@ -152,9 +159,17 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each
 
-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+# Write to a parquet file with specific compression and memory threshold
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -22,16 +26,28 @@ use crate::{
 
 const DEFAULT_BATCH_SIZE: usize = 1000;
 
+// Maximum memory usage per batch (64MB by default)
+const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &["schema", "write_to"],
-        &["batch_size"],
+        &["batch_size", "flush_threshold", "compression"],
     )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
@@ -105,6 +121,8 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
     })
 }
 
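The nested `Option<Option<...>>` keyword slots paired with the `.flatten()` calls above let the writer treat an omitted keyword and an explicit Ruby `nil` the same way. A minimal standalone illustration of the flatten pattern (hypothetical values, not code from the gem):

fn main() {
    // "flush_threshold:" keyword omitted entirely
    let omitted: Option<Option<usize>> = None;
    // "flush_threshold: nil" passed explicitly
    let passed_nil: Option<Option<usize>> = Some(None);
    // "flush_threshold: 500" passed with a value
    let passed_value: Option<Option<usize>> = Some(Some(500));

    // flatten() collapses "missing" and "nil" into the same None
    assert_eq!(omitted.flatten(), None);
    assert_eq!(passed_nil.flatten(), None);
    assert_eq!(passed_value.flatten(), Some(500));
}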
@@ -117,10 +135,14 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -158,7 +180,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
@@ -198,7 +220,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 
             // When we reach batch size, write the batch
             if rows_in_batch >= batch_size {
-                write_batch(&mut writer, &mut column_collectors)?;
+                write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                 rows_in_batch = 0;
             }
         }
@@ -206,7 +228,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             if e.is_kind_of(ruby.exception_stop_iteration()) {
                 // Write any remaining rows
                 if rows_in_batch > 0 {
-                    write_batch(&mut writer, &mut column_collectors)?;
+                    write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                 }
                 break;
             }
@@ -238,9 +260,13 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -278,7 +304,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -331,6 +357,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     writer
                         .write(&record_batch)
                         .map_err(|e| ParquetErrorWrapper(e))?;
+
+                    match &mut writer {
+                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                            if w.in_progress_size() >= flush_threshold {
+                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                            }
+                        }
+                    }
                 }
                 Err(e) => {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -360,12 +394,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -382,7 +429,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
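For context, the `WriterProperties` / `ArrowWriter` wiring added above follows the standard parquet-rs pattern: build properties with an explicit codec and pass them as the third argument to `ArrowWriter::try_new`. A minimal standalone sketch (not code from this gem; the crate names, schema, and output path are assumptions):

// Assumes the `parquet`, `arrow-array`, and `arrow-schema` crates as dependencies.
use std::{fs::File, sync::Arc};

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::{
    arrow::ArrowWriter,
    basic::{Compression, ZstdLevel},
    file::properties::WriterProperties,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One-column schema and a tiny batch to write.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let ids: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(schema.clone(), vec![ids])?;

    // Equivalent of `compression: "zstd"` in the Ruby API.
    let props = WriterProperties::builder()
        .set_compression(Compression::ZSTD(ZstdLevel::default()))
        .build();

    let file = File::create("example.parquet")?;
    let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(())
}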
@@ -414,6 +461,7 @@ fn copy_temp_file_to_io_like(
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
+    flush_threshold: usize,
 ) -> Result<(), MagnusError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -433,5 +481,13 @@ fn write_batch(
         .write(&record_batch)
         .map_err(|e| ParquetErrorWrapper(e))?;
 
+    match writer {
+        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+            if w.in_progress_size() >= flush_threshold {
+                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+            }
+        }
+    }
+
     Ok(())
 }
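The same threshold check appears in both write paths: after each batch, compare `ArrowWriter::in_progress_size()` against the configured limit and flush the buffered row group once it is exceeded. In isolation the pattern looks like this hypothetical helper (a sketch under the same crate assumptions as the previous example, not the gem's actual function):

fn write_with_threshold<W: std::io::Write + Send>(
    writer: &mut parquet::arrow::ArrowWriter<W>,
    batch: &arrow_array::RecordBatch,
    flush_threshold: usize,
) -> Result<(), parquet::errors::ParquetError> {
    writer.write(batch)?;
    // in_progress_size() estimates memory held by the not-yet-flushed row group;
    // flush() closes it out and frees that buffer.
    if writer.in_progress_size() >= flush_threshold {
        writer.flush()?;
    }
    Ok(())
}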
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -65,15 +65,20 @@ module Parquet
  #   - `timestamp_millis`, `timestamp_micros`
  # - `write_to`: String path or IO object to write the parquet file to
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      flush_threshold: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
   end
 
   # Options:
@@ -89,13 +94,18 @@ module Parquet
  #   - `timestamp_millis`, `timestamp_micros`
  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  # - `write_to`: String path or IO object to write the parquet file to
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      flush_threshold: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
   end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.7
+  version: 0.2.9
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys