parquet 0.3.0 → 0.3.2

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between those versions as published.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5416216b30f385f5ff8a9ee44a02eb1f685ac200c1bec340ad09e936a37ef9b6
-  data.tar.gz: ed9dd4781af2c5cab68d94e582cfd29135d37d399d7904b82dcffe4bfefbe52f
+  metadata.gz: 259f41f1ea1b111a0b0fdec15d17b54ac5a4efee750157159a33a4272b0b1310
+  data.tar.gz: 71579d25b3ec411208103e4afe116285bd75a30003af3f21b7d7972ef8942ff6
 SHA512:
-  metadata.gz: 4f3650bac3f11755e691b7cf50f03cce66036a7e8989297c08aa6f0ab8cea60b3bcf0f6881e9a5e452ca51d78f64f183e9c20c46c38dad312bb568042ab7f0d4
-  data.tar.gz: 8a070d861e155d2d9b3cf31ec79d77307c379b03a7c797dd103a0e7837ff623a6507723621ef45715d9c8e764c836e786ab9a8a6ca3c5454551b3f1c3489018d
+  metadata.gz: 792a6653554393a94de0572ad2164e5da89c7b49b9476599b9d45efd38448f6e7065fa5b7b0082e036ea19da19441b462083ca9e61226f82dd38a812905a189e
+  data.tar.gz: 1520073d0668751a5c449dde08deea5dd06e50845e8e8fead398d7015d5cc979885897a258f1ab86eb405203fcaf6ebe0164ccdb31e4033e4911ad4b68268d11
@@ -1,23 +1,18 @@
 use bytes::Bytes;
 use magnus::{
     value::{Opaque, ReprValue},
-    RClass, RString, Ruby, Value,
+    RString, Ruby, Value,
 };
 use parquet::{
     errors::ParquetError,
     file::reader::{ChunkReader, Length},
 };
-use std::{
-    fs::File,
-    sync::{Mutex, OnceLock},
-};
+use std::{fs::File, sync::Mutex};
 use std::{
     io::{self, BufReader, Read, Seek, SeekFrom, Write},
     sync::Arc,
 };
 
-static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
-
 /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
 /// and provide a standard Read implementation for them.
 pub enum RubyReader {
@@ -34,15 +29,6 @@ pub enum RubyReader {
 }
 
 impl RubyReader {
-    fn is_string_io(ruby: &Ruby, value: &Value) -> bool {
-        let string_io_class = STRING_IO_CLASS.get_or_init(|| {
-            let class = RClass::from_value(ruby.eval("StringIO").expect("Failed to find StringIO"))
-                .expect("Failed to get StringIO class");
-            Opaque::from(class)
-        });
-        value.is_kind_of(ruby.get_inner(*string_io_class))
-    }
-
     fn is_io_like(value: &Value) -> bool {
         value.respond_to("read", false).unwrap_or(false)
     }
@@ -59,13 +45,7 @@ impl TryFrom<Value> for RubyReader {
 
     fn try_from(value: Value) -> Result<Self, Self::Error> {
         let ruby = unsafe { Ruby::get_unchecked() };
-        if RubyReader::is_string_io(&ruby, &value) {
-            let string_content = value.funcall::<_, _, RString>("string", ())?;
-            Ok(RubyReader::String {
-                inner: Opaque::from(string_content),
-                offset: 0,
-            })
-        } else if RubyReader::is_seekable_io_like(&value) {
+        if RubyReader::is_seekable_io_like(&value) {
             Ok(RubyReader::RubyIoLike {
                 inner: Opaque::from(value),
             })
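Note on the reader change above: with the StringIO fast path and its cached STRING_IO_CLASS lookup removed, StringIO values now flow through the generic IO-like branch, which works because StringIO responds to read and seek like any other IO object. A minimal sketch of the duck-typing check this relies on; is_seekable_io_like itself is not shown in this diff, so the body below is an assumption, not the gem's actual code:

    use magnus::{value::ReprValue, Value};

    // Assumed shape of the check: a value is seekable-IO-like when it
    // responds to both `read` and `seek`. StringIO passes this test,
    // which is why its dedicated branch could be deleted.
    fn is_seekable_io_like(value: &Value) -> bool {
        value.respond_to("read", false).unwrap_or(false)
            && value.respond_to("seek", false).unwrap_or(false)
    }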
@@ -26,8 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };
 
+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
 const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
-const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
+const MIN_BATCH_SIZE: usize = 10; // Minimum batch size to maintain efficiency
 const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
 
 // Maximum memory usage per batch (64MB by default)
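For scale: with the default 64 MB flush threshold, rows averaging 1 KB suggest batches of 64 * 1024 * 1024 / 1024 = 65,536 rows, while very large rows can now shrink a batch down to the new floor of 10 rows rather than 100, so memory-heavy data flushes much sooner.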
@@ -178,12 +179,6 @@ fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usiz
     Ok(row_size)
 }
 
-/// Calculate optimal batch size based on memory threshold and estimated row size
-fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
-    let batch_size = memory_threshold / row_size;
-    batch_size.max(MIN_BATCH_SIZE)
-}
-
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
@@ -293,10 +288,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             rows_in_batch += 1;
             total_rows += 1;
 
-            // Recalculate batch size if we have enough samples and no user-specified size
-            if size_samples.len() >= sample_size && user_batch_size.is_none() {
-                let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
-                current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
+            // Calculate batch size progressively once we have minimum samples
+            if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE && user_batch_size.is_none() {
+                let total_size = size_samples.iter().sum::<usize>();
+                // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+                let avg_row_size = total_size as f64 / size_samples.len() as f64;
+                let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+                let suggested_batch_size =
+                    (flush_threshold as f64 / avg_row_size).floor() as usize;
+                current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
             }
 
             // When we reach batch size, write the batch
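The writer hunks above replace the deleted calculate_batch_size helper with inline floating-point math that re-estimates the batch size after only MIN_SAMPLES_FOR_ESTIMATE (10) sampled rows, instead of waiting for the full sampling window. A self-contained sketch of the same adaptive-batching logic; the function name and Option-based shape are illustrative, not the gem's API:

    const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
    const MIN_BATCH_SIZE: usize = 10;

    /// Re-derive a batch size from sampled row sizes and a memory threshold.
    /// Returns None until enough samples have been collected.
    fn suggested_batch_size(size_samples: &[usize], flush_threshold: usize) -> Option<usize> {
        if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
            return None; // keep the initial batch size while sampling
        }
        let total: usize = size_samples.iter().sum();
        // Clamp the average to at least 1 byte so the division below stays safe
        let avg_row_size = (total as f64 / size_samples.len() as f64).max(1.0);
        let suggested = (flush_threshold as f64 / avg_row_size).floor() as usize;
        Some(suggested.max(MIN_BATCH_SIZE))
    }

    fn main() {
        // Ten 1 KB samples against a 64 MB threshold suggest ~65,536 rows per batch
        let samples = [1024usize; 10];
        assert_eq!(suggested_batch_size(&samples, 64 * 1024 * 1024), Some(65_536));
    }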
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.3.0"
+  VERSION = "0.3.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.2
 platform: ruby
 authors:
 - Nathan Jaremko