parquet 0.2.9 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,6 +26,7 @@ pub struct ParquetWriteArgs<'a> {
     pub batch_size: Option<usize>,
     pub flush_threshold: Option<usize>,
     pub compression: Option<String>,
+    pub sample_size: Option<usize>,
 }
 
 pub trait SendableWrite: Send + Write {}
@@ -1,6 +1,7 @@
 use std::{
     fs::File,
     io::{self, BufReader, BufWriter},
+    mem,
     sync::Arc,
 };
 
@@ -16,6 +17,7 @@ use parquet::{
     basic::{Compression, GzipLevel, ZstdLevel},
     file::properties::WriterProperties,
 };
+use rand::Rng;
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -24,7 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };
 
-const DEFAULT_BATCH_SIZE: usize = 1000;
+const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
+const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
+const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
 
 // Maximum memory usage per batch (64MB by default)
 const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
@@ -42,12 +46,18 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
             Option<Option<usize>>,
             Option<Option<usize>>,
             Option<Option<String>>,
+            Option<Option<usize>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &["schema", "write_to"],
-        &["batch_size", "flush_threshold", "compression"],
+        &[
+            "batch_size",
+            "flush_threshold",
+            "compression",
+            "sample_size",
+        ],
     )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
@@ -123,9 +133,57 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         batch_size: kwargs.optional.0.flatten(),
         flush_threshold: kwargs.optional.1.flatten(),
         compression: kwargs.optional.2.flatten(),
+        sample_size: kwargs.optional.3.flatten(),
     })
 }
 
+/// Estimate the size of a row
+fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
+    let mut row_size = 0;
+    for (field, value) in schema.iter().zip(row.into_iter()) {
+        // Estimate size based on type and value
+        row_size += match &field.type_ {
+            // Use reference to avoid moving
+            ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
+            ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
+            ParquetSchemaType::Int32
+            | ParquetSchemaType::UInt32
+            | ParquetSchemaType::Float
+            | ParquetSchemaType::Date32 => 4,
+            ParquetSchemaType::Int64
+            | ParquetSchemaType::UInt64
+            | ParquetSchemaType::Double
+            | ParquetSchemaType::TimestampMillis
+            | ParquetSchemaType::TimestampMicros => 8,
+            ParquetSchemaType::String => {
+                if let Ok(s) = String::try_convert(value) {
+                    s.len() + mem::size_of::<usize>() // account for length prefix
+                } else {
+                    16 // default estimate for string
+                }
+            }
+            ParquetSchemaType::Binary => {
+                if let Ok(bytes) = Vec::<u8>::try_convert(value) {
+                    bytes.len() + mem::size_of::<usize>() // account for length prefix
+                } else {
+                    16 // default estimate for binary
+                }
+            }
+            ParquetSchemaType::Boolean => 1,
+            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
+                32 // rough estimate for complex types
+            }
+        };
+    }
+    Ok(row_size)
+}
+
+/// Calculate optimal batch size based on memory threshold and estimated row size
+fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
+    let batch_size = memory_threshold / row_size;
+    batch_size.max(MIN_BATCH_SIZE)
+}
+
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
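The helpers added in this hunk size batches dynamically: the average sampled row size divides into the flush threshold, and the result is clamped at MIN_BATCH_SIZE so very wide rows or a tiny threshold never produce degenerate batches. A minimal standalone sketch of that arithmetic follows; the constants mirror the ones introduced above, while the main function and its example numbers are illustrative only, not part of the gem.

const MIN_BATCH_SIZE: usize = 100; // same floor as in the diff
const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024; // 64MB, same default as in the diff

fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
    // memory budget per batch divided by estimated bytes per row, never below the floor
    (memory_threshold / row_size).max(MIN_BATCH_SIZE)
}

fn main() {
    // ~200-byte rows under a 64MB budget -> 335_544 rows per batch
    assert_eq!(calculate_batch_size(200, DEFAULT_MEMORY_THRESHOLD), 335_544);
    // 1MB rows would only allow 64 per batch, so the floor of 100 wins
    assert_eq!(calculate_batch_size(1024 * 1024, DEFAULT_MEMORY_THRESHOLD), MIN_BATCH_SIZE);
}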
@@ -134,13 +192,12 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size,
+        batch_size: user_batch_size,
         compression,
         flush_threshold,
+        sample_size: user_sample_size,
     } = parse_parquet_write_args(args)?;
 
-    let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
-
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
 
     // Convert schema to Arrow schema
@@ -185,11 +242,20 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
         let mut column_collectors: Vec<ColumnCollector> = schema
-            .into_iter()
-            .map(|field| ColumnCollector::new(field.name, field.type_, field.format))
+            .iter()
+            .map(|field| {
+                // Clone the type to avoid moving from a reference
+                let type_clone = field.type_.clone();
+                ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
+            })
             .collect();
 
         let mut rows_in_batch = 0;
+        let mut total_rows = 0;
+        let mut rng = rand::rng();
+        let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
+        let mut size_samples = Vec::with_capacity(sample_size);
+        let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
 
         loop {
             match read_from.funcall::<_, _, Value>("next", ()) {
@@ -211,15 +277,30 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                         ));
                     }
 
-                    // Process each value in the row immediately
+                    // Sample row sizes using reservoir sampling
+                    if size_samples.len() < sample_size {
+                        size_samples.push(estimate_single_row_size(&row_array, &schema)?);
+                    } else if rng.random_range(0..=total_rows) < sample_size {
+                        let idx = rng.random_range(0..sample_size);
+                        size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
+                    }
+
+                    // Process each value in the row
                     for (collector, value) in column_collectors.iter_mut().zip(row_array) {
                         collector.push_value(value)?;
                     }
 
                     rows_in_batch += 1;
+                    total_rows += 1;
+
+                    // Recalculate batch size if we have enough samples and no user-specified size
+                    if size_samples.len() >= sample_size && user_batch_size.is_none() {
+                        let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
+                        current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
+                    }
 
                     // When we reach batch size, write the batch
-                    if rows_in_batch >= batch_size {
+                    if rows_in_batch >= current_batch_size {
                         write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                         rows_in_batch = 0;
                     }
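The sampling branch in this hunk is a reservoir sample over row-size estimates: the first sample_size rows fill the reservoir, and each later row replaces a random slot with probability sample_size / rows_seen, so the running average stays representative of the whole stream without buffering it. A self-contained sketch of the same pattern, assuming the rand 0.9 API (rand::rng, random_range) that the diff imports; the function below is illustrative, not code from the gem.

use rand::Rng;

// Keep a fixed-size, uniformly distributed sample of an arbitrarily long stream.
fn reservoir_sample(values: impl Iterator<Item = usize>, sample_size: usize) -> Vec<usize> {
    let mut rng = rand::rng();
    let mut samples = Vec::with_capacity(sample_size);
    for (seen, value) in values.enumerate() {
        if samples.len() < sample_size {
            samples.push(value); // fill the reservoir first
        } else if rng.random_range(0..=seen) < sample_size {
            // keep this value with probability sample_size / (seen + 1)
            let idx = rng.random_range(0..sample_size);
            samples[idx] = value;
        }
    }
    samples
}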
@@ -263,6 +344,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         batch_size: _,
         compression,
         flush_threshold,
+        sample_size: _,
     } = parse_parquet_write_args(args)?;
 
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
@@ -483,7 +565,7 @@ fn write_batch(
 
     match writer {
         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-            if w.in_progress_size() >= flush_threshold {
+            if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
                 w.flush().map_err(|e| ParquetErrorWrapper(e))?;
             }
         }
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.9"
+  VERSION = "0.2.12"
 end
data/lib/parquet.rbi CHANGED
@@ -68,6 +68,7 @@ module Parquet
  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
  # - `compression`: Optional compression type to use (defaults to "zstd")
  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+ # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
  sig do
    params(
      read_from: T::Enumerator[T::Array[T.untyped]],
@@ -75,10 +76,11 @@
      write_to: T.any(String, IO),
      batch_size: T.nilable(Integer),
      flush_threshold: T.nilable(Integer),
-     compression: T.nilable(String)
+     compression: T.nilable(String),
+     sample_size: T.nilable(Integer)
    ).void
  end
- def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
+ def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil, sample_size: nil)
  end
 
  # Options:
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.9
+  version: 0.2.12
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-29 00:00:00.000000000 Z
+date: 2025-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys