parquet 0.3.0 → 0.3.2

This diff shows the content of publicly available package versions as released to their public registries. It is provided for informational purposes only and reflects the changes between those versions as published.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5416216b30f385f5ff8a9ee44a02eb1f685ac200c1bec340ad09e936a37ef9b6
-  data.tar.gz: ed9dd4781af2c5cab68d94e582cfd29135d37d399d7904b82dcffe4bfefbe52f
+  metadata.gz: 259f41f1ea1b111a0b0fdec15d17b54ac5a4efee750157159a33a4272b0b1310
+  data.tar.gz: 71579d25b3ec411208103e4afe116285bd75a30003af3f21b7d7972ef8942ff6
 SHA512:
-  metadata.gz: 4f3650bac3f11755e691b7cf50f03cce66036a7e8989297c08aa6f0ab8cea60b3bcf0f6881e9a5e452ca51d78f64f183e9c20c46c38dad312bb568042ab7f0d4
-  data.tar.gz: 8a070d861e155d2d9b3cf31ec79d77307c379b03a7c797dd103a0e7837ff623a6507723621ef45715d9c8e764c836e786ab9a8a6ca3c5454551b3f1c3489018d
+  metadata.gz: 792a6653554393a94de0572ad2164e5da89c7b49b9476599b9d45efd38448f6e7065fa5b7b0082e036ea19da19441b462083ca9e61226f82dd38a812905a189e
+  data.tar.gz: 1520073d0668751a5c449dde08deea5dd06e50845e8e8fead398d7015d5cc979885897a258f1ab86eb405203fcaf6ebe0164ccdb31e4033e4911ad4b68268d11
@@ -1,23 +1,18 @@
 use bytes::Bytes;
 use magnus::{
     value::{Opaque, ReprValue},
-    RClass, RString, Ruby, Value,
+    RString, Ruby, Value,
 };
 use parquet::{
     errors::ParquetError,
     file::reader::{ChunkReader, Length},
 };
-use std::{
-    fs::File,
-    sync::{Mutex, OnceLock},
-};
+use std::{fs::File, sync::Mutex};
 use std::{
     io::{self, BufReader, Read, Seek, SeekFrom, Write},
     sync::Arc,
 };
 
-static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
-
 /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
 /// and provide a standard Read implementation for them.
 pub enum RubyReader {
@@ -34,15 +29,6 @@ pub enum RubyReader {
 }
 
 impl RubyReader {
-    fn is_string_io(ruby: &Ruby, value: &Value) -> bool {
-        let string_io_class = STRING_IO_CLASS.get_or_init(|| {
-            let class = RClass::from_value(ruby.eval("StringIO").expect("Failed to find StringIO"))
-                .expect("Failed to get StringIO class");
-            Opaque::from(class)
-        });
-        value.is_kind_of(ruby.get_inner(*string_io_class))
-    }
-
     fn is_io_like(value: &Value) -> bool {
         value.respond_to("read", false).unwrap_or(false)
     }
@@ -59,13 +45,7 @@ impl TryFrom<Value> for RubyReader {
 
     fn try_from(value: Value) -> Result<Self, Self::Error> {
         let ruby = unsafe { Ruby::get_unchecked() };
-        if RubyReader::is_string_io(&ruby, &value) {
-            let string_content = value.funcall::<_, _, RString>("string", ())?;
-            Ok(RubyReader::String {
-                inner: Opaque::from(string_content),
-                offset: 0,
-            })
-        } else if RubyReader::is_seekable_io_like(&value) {
+        if RubyReader::is_seekable_io_like(&value) {
             Ok(RubyReader::RubyIoLike {
                 inner: Opaque::from(value),
             })
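Note on the reader change above: with the StringIO fast path and its cached STRING_IO_CLASS lookup removed, StringIO values now flow through the generic IO-like branch, which works because StringIO responds to read and seek like any other IO object. A minimal sketch of the duck-typing check this relies on; is_seekable_io_like itself is not shown in this diff, so the body below is an assumption, not the gem's actual code:

    use magnus::{value::ReprValue, Value};

    // Assumed shape of the check: a value is seekable-IO-like when it
    // responds to both `read` and `seek`. StringIO passes this test,
    // which is why its dedicated branch could be deleted.
    fn is_seekable_io_like(value: &Value) -> bool {
        value.respond_to("read", false).unwrap_or(false)
            && value.respond_to("seek", false).unwrap_or(false)
    }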
@@ -26,8 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };
 
+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
 const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
-const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
+const MIN_BATCH_SIZE: usize = 10; // Minimum batch size to maintain efficiency
 const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
 
 // Maximum memory usage per batch (64MB by default)
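For scale: with the default 64 MB flush threshold, rows averaging 1 KB suggest batches of 64 * 1024 * 1024 / 1024 = 65,536 rows, while very large rows can now shrink a batch down to the new floor of 10 rows rather than 100, so memory-heavy data flushes much sooner.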
@@ -178,12 +179,6 @@ fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usiz
     Ok(row_size)
 }
 
-/// Calculate optimal batch size based on memory threshold and estimated row size
-fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
-    let batch_size = memory_threshold / row_size;
-    batch_size.max(MIN_BATCH_SIZE)
-}
-
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
@@ -293,10 +288,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             rows_in_batch += 1;
             total_rows += 1;
 
-            // Recalculate batch size if we have enough samples and no user-specified size
-            if size_samples.len() >= sample_size && user_batch_size.is_none() {
-                let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
-                current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
+            // Calculate batch size progressively once we have minimum samples
+            if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE && user_batch_size.is_none() {
+                let total_size = size_samples.iter().sum::<usize>();
+                // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+                let avg_row_size = total_size as f64 / size_samples.len() as f64;
+                let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+                let suggested_batch_size =
+                    (flush_threshold as f64 / avg_row_size).floor() as usize;
+                current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
             }
 
             // When we reach batch size, write the batch
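The writer hunks above replace the deleted calculate_batch_size helper with inline floating-point math that re-estimates the batch size after only MIN_SAMPLES_FOR_ESTIMATE (10) sampled rows, instead of waiting for the full sampling window. A self-contained sketch of the same adaptive-batching logic; the function name and Option-based shape are illustrative, not the gem's API:

    const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
    const MIN_BATCH_SIZE: usize = 10;

    /// Re-derive a batch size from sampled row sizes and a memory threshold.
    /// Returns None until enough samples have been collected.
    fn suggested_batch_size(size_samples: &[usize], flush_threshold: usize) -> Option<usize> {
        if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
            return None; // keep the initial batch size while sampling
        }
        let total: usize = size_samples.iter().sum();
        // Clamp the average to at least 1 byte so the division below stays safe
        let avg_row_size = (total as f64 / size_samples.len() as f64).max(1.0);
        let suggested = (flush_threshold as f64 / avg_row_size).floor() as usize;
        Some(suggested.max(MIN_BATCH_SIZE))
    }

    fn main() {
        // Ten 1 KB samples against a 64 MB threshold suggest ~65,536 rows per batch
        let samples = [1024usize; 10];
        assert_eq!(suggested_batch_size(&samples, 64 * 1024 * 1024), Some(65_536));
    }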
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.3.0"
+  VERSION = "0.3.2"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.3.2
 platform: ruby
 authors:
 - Nathan Jaremko