parquet 0.3.0 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/ruby_reader.rs +3 -23
- data/ext/parquet/src/writer/mod.rs +11 -11
- data/lib/parquet/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 259f41f1ea1b111a0b0fdec15d17b54ac5a4efee750157159a33a4272b0b1310
+  data.tar.gz: 71579d25b3ec411208103e4afe116285bd75a30003af3f21b7d7972ef8942ff6
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 792a6653554393a94de0572ad2164e5da89c7b49b9476599b9d45efd38448f6e7065fa5b7b0082e036ea19da19441b462083ca9e61226f82dd38a812905a189e
+  data.tar.gz: 1520073d0668751a5c449dde08deea5dd06e50845e8e8fead398d7015d5cc979885897a258f1ab86eb405203fcaf6ebe0164ccdb31e4033e4911ad4b68268d11
data/ext/parquet/src/ruby_reader.rs
CHANGED
@@ -1,23 +1,18 @@
 use bytes::Bytes;
 use magnus::{
     value::{Opaque, ReprValue},
-    RClass, RString, Ruby, Value,
+    RString, Ruby, Value,
 };
 use parquet::{
     errors::ParquetError,
     file::reader::{ChunkReader, Length},
 };
-use std::{
-    fs::File,
-    sync::{Mutex, OnceLock},
-};
+use std::{fs::File, sync::Mutex};
 use std::{
     io::{self, BufReader, Read, Seek, SeekFrom, Write},
     sync::Arc,
 };

-static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
-
 /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
 /// and provide a standard Read implementation for them.
 pub enum RubyReader {
@@ -34,15 +29,6 @@ pub enum RubyReader {
 }

 impl RubyReader {
-    fn is_string_io(ruby: &Ruby, value: &Value) -> bool {
-        let string_io_class = STRING_IO_CLASS.get_or_init(|| {
-            let class = RClass::from_value(ruby.eval("StringIO").expect("Failed to find StringIO"))
-                .expect("Failed to get StringIO class");
-            Opaque::from(class)
-        });
-        value.is_kind_of(ruby.get_inner(*string_io_class))
-    }
-
     fn is_io_like(value: &Value) -> bool {
         value.respond_to("read", false).unwrap_or(false)
     }
@@ -59,13 +45,7 @@ impl TryFrom<Value> for RubyReader {

     fn try_from(value: Value) -> Result<Self, Self::Error> {
         let ruby = unsafe { Ruby::get_unchecked() };
-        if RubyReader::is_string_io(&ruby, &value) {
-            let string_content = value.funcall::<_, _, RString>("string", ())?;
-            Ok(RubyReader::String {
-                inner: Opaque::from(string_content),
-                offset: 0,
-            })
-        } else if RubyReader::is_seekable_io_like(&value) {
+        if RubyReader::is_seekable_io_like(&value) {
             Ok(RubyReader::RubyIoLike {
                 inner: Opaque::from(value),
             })
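Net effect of this change: the dedicated StringIO branch and the cached STRING_IO_CLASS lookup are removed, and StringIO inputs presumably fall through to the generic seekable-IO path, since a StringIO responds to read and seek like any other IO object. As a rough illustration only, assuming is_seekable_io_like (whose body is not part of this diff) mirrors the is_io_like helper shown above, the check could look like this with magnus:

use magnus::{value::ReprValue, Value};

// Hypothetical sketch, not the gem's actual code: treat anything that responds
// to both `read` and `seek` as a seekable IO-like object, which covers StringIO.
fn is_seekable_io_like(value: &Value) -> bool {
    value.respond_to("read", false).unwrap_or(false)
        && value.respond_to("seek", false).unwrap_or(false)
}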
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -26,8 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };

+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
 const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
-const MIN_BATCH_SIZE: usize =
+const MIN_BATCH_SIZE: usize = 10; // Minimum batch size to maintain efficiency
 const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling

 // Maximum memory usage per batch (64MB by default)
@@ -178,12 +179,6 @@ fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usiz
     Ok(row_size)
 }

-/// Calculate optimal batch size based on memory threshold and estimated row size
-fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
-    let batch_size = memory_threshold / row_size;
-    batch_size.max(MIN_BATCH_SIZE)
-}
-
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
@@ -293,10 +288,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             rows_in_batch += 1;
             total_rows += 1;

-            //
-            if size_samples.len() >=
-                let
-
+            // Calculate batch size progressively once we have minimum samples
+            if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE && user_batch_size.is_none() {
+                let total_size = size_samples.iter().sum::<usize>();
+                // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+                let avg_row_size = total_size as f64 / size_samples.len() as f64;
+                let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+                let suggested_batch_size =
+                    (flush_threshold as f64 / avg_row_size).floor() as usize;
+                current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
             }

             // When we reach batch size, write the batch
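The new sizing logic above amounts to: average the sampled row sizes, divide the flush threshold by that average, and never go below MIN_BATCH_SIZE. A self-contained sketch of that arithmetic (estimate_batch_size is an illustrative name, not from the gem; the constants match the diff), which for the 64 MB default threshold and ~2 KB average rows yields 32768 rows per batch:

const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
const MIN_BATCH_SIZE: usize = 10;

// Mirror of the added estimation: returns None until enough samples are
// collected, then floor(flush_threshold / avg_row_size), clamped to at
// least MIN_BATCH_SIZE.
fn estimate_batch_size(size_samples: &[usize], flush_threshold: usize) -> Option<usize> {
    if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
        return None;
    }
    let total_size: usize = size_samples.iter().sum();
    let avg_row_size = (total_size as f64 / size_samples.len() as f64).max(1.0);
    let suggested = (flush_threshold as f64 / avg_row_size).floor() as usize;
    Some(suggested.max(MIN_BATCH_SIZE))
}

fn main() {
    // Ten sampled rows of ~2 KB each against the 64 MB default threshold.
    let samples = vec![2048usize; 10];
    assert_eq!(estimate_batch_size(&samples, 64 * 1024 * 1024), Some(32768));
}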
data/lib/parquet/version.rb
CHANGED