parquet 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/writer/mod.rs +11 -11
- data/lib/parquet/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: f8fa4ebaf63f622e06e21085899249ea50edbceed3afa0c378df69bd0bdc8c58
+  data.tar.gz: 9f02946885241758cb26eff3564a6f386c4fafaa77811627e3af2f5768ea002a
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 67edb7775baaec1f214edd359d1aff1768bceb1b24e3d616b23c2a28f60ae3a7c7d9c68b0a767d99558aa3841b725030452cb9726e0567c7e413f15c3da6562d
+  data.tar.gz: a704b96748fb53a272964fddcbe8db5ee4615fc4e96cec51e3897ab2e651b84c5429ebc2a50a48568da71e642b728c8f24b0aa46eab62f896f98a8919ec9141b
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -26,8 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };
 
+const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
 const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
-const MIN_BATCH_SIZE: usize =
+const MIN_BATCH_SIZE: usize = 10; // Minimum batch size to maintain efficiency
 const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
 
 // Maximum memory usage per batch (64MB by default)
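Read together, these constants describe a two-phase sizing strategy: batches start at INITIAL_BATCH_SIZE while row sizes are sampled (up to SAMPLE_SIZE rows), and once MIN_SAMPLES_FOR_ESTIMATE samples exist the writer can switch to an estimated batch size, never dropping below MIN_BATCH_SIZE. A minimal sketch of that phase selection follows; the helper name and shape are illustrative assumptions, not code from the gem.

const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
const MIN_BATCH_SIZE: usize = 10;           // Minimum batch size to maintain efficiency
const INITIAL_BATCH_SIZE: usize = 100;      // Initial batch size while sampling

// Hypothetical helper: which batch size applies right now, given how many
// row-size samples have been collected and the latest estimate derived from
// them, if any.
fn effective_batch_size(samples_collected: usize, estimated: Option<usize>) -> usize {
    if samples_collected < MIN_SAMPLES_FOR_ESTIMATE {
        INITIAL_BATCH_SIZE // still sampling: keep the conservative default
    } else {
        estimated.unwrap_or(INITIAL_BATCH_SIZE).max(MIN_BATCH_SIZE)
    }
}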
@@ -178,12 +179,6 @@ fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usiz
     Ok(row_size)
 }
 
-/// Calculate optimal batch size based on memory threshold and estimated row size
-fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
-    let batch_size = memory_threshold / row_size;
-    batch_size.max(MIN_BATCH_SIZE)
-}
-
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
@@ -293,10 +288,15 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             rows_in_batch += 1;
             total_rows += 1;
 
-            //
-            if size_samples.len() >=
-            let
-
+            // Calculate batch size progressively once we have minimum samples
+            if size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE && user_batch_size.is_none() {
+                let total_size = size_samples.iter().sum::<usize>();
+                // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
+                let avg_row_size = total_size as f64 / size_samples.len() as f64;
+                let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
+                let suggested_batch_size =
+                    (flush_threshold as f64 / avg_row_size).floor() as usize;
+                current_batch_size = suggested_batch_size.max(MIN_BATCH_SIZE);
             }
 
             // When we reach batch size, write the batch
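The added block takes over from the deleted calculate_batch_size helper: instead of dividing a memory threshold by a single estimated row size, it averages the sampled row sizes and divides the flush threshold by that average, and it only runs once at least MIN_SAMPLES_FOR_ESTIMATE samples exist and the caller has not pinned a batch size. A standalone sketch of the same arithmetic is below; the function name, Option return, and main are assumptions made for illustration, not code from the gem.

const MIN_SAMPLES_FOR_ESTIMATE: usize = 10; // Minimum samples needed for estimation
const MIN_BATCH_SIZE: usize = 10;           // Minimum batch size to maintain efficiency

// Hypothetical standalone version of the inline logic above: given the sampled
// row sizes and the flush threshold, suggest a batch size. Returns None while
// there are not yet enough samples, so the caller keeps its current batch size.
fn suggest_batch_size(size_samples: &[usize], flush_threshold: usize) -> Option<usize> {
    if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
        return None;
    }
    let total_size: usize = size_samples.iter().sum();
    let avg_row_size = (total_size as f64 / size_samples.len() as f64).max(1.0);
    let suggested = (flush_threshold as f64 / avg_row_size).floor() as usize;
    Some(suggested.max(MIN_BATCH_SIZE))
}

fn main() {
    // Ten sampled rows of roughly 512 bytes against a 64 MB flush threshold
    // suggest a batch of 131072 rows; very large rows would be clamped to MIN_BATCH_SIZE.
    let samples = vec![512usize; 10];
    assert_eq!(suggest_batch_size(&samples, 64 * 1024 * 1024), Some(131_072));
    println!("{:?}", suggest_batch_size(&samples, 64 * 1024 * 1024));
}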
data/lib/parquet/version.rb
CHANGED