parquet 0.2.9 → 0.2.12
This diff reflects the published contents of the two package versions as they appear in their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +82 -651
- data/README.md +7 -0
- data/ext/parquet/Cargo.toml +3 -1
- data/ext/parquet/src/enumerator.rs +9 -17
- data/ext/parquet/src/header_cache.rs +20 -80
- data/ext/parquet/src/reader/mod.rs +2 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +82 -106
- data/ext/parquet/src/reader/parquet_row_reader.rs +51 -46
- data/ext/parquet/src/types/writer_types.rs +1 -0
- data/ext/parquet/src/writer/mod.rs +92 -10
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +4 -2
- metadata +2 -2
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -1,6 +1,7 @@
 use std::{
     fs::File,
     io::{self, BufReader, BufWriter},
+    mem,
     sync::Arc,
 };
 
@@ -16,6 +17,7 @@ use parquet::{
     basic::{Compression, GzipLevel, ZstdLevel},
     file::properties::WriterProperties,
 };
+use rand::Rng;
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -24,7 +26,9 @@ use crate::{
     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
 };
 
-const
+const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
+const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
+const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling
 
 // Maximum memory usage per batch (64MB by default)
 const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
@@ -42,12 +46,18 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError>
             Option<Option<usize>>,
             Option<Option<usize>>,
             Option<Option<String>>,
+            Option<Option<usize>>,
         ),
         (),
     >(
         parsed_args.keywords,
         &["schema", "write_to"],
-        &[
+        &[
+            "batch_size",
+            "flush_threshold",
+            "compression",
+            "sample_size",
+        ],
     )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
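A note on the `Option<Option<usize>>` added above: with this keyword-parsing pattern, the outer `Option` records whether the keyword was passed at all, and the inner one whether it was passed as `nil`; `.flatten()` then collapses both cases to `None`. A minimal std-only sketch of that collapse, with a hypothetical `Kwargs` struct standing in for the parsed result:

```rust
// Sketch: how `Option<Option<T>>` distinguishes "absent" from "nil".
// `Kwargs` is a hypothetical stand-in for the parsed keyword arguments.
struct Kwargs {
    sample_size: Option<Option<usize>>,
}

fn main() {
    let absent = Kwargs { sample_size: None };         // keyword not passed
    let nil = Kwargs { sample_size: Some(None) };      // passed as nil
    let set = Kwargs { sample_size: Some(Some(500)) }; // passed as 500

    // `.flatten()` collapses both "absent" and "nil" to None,
    // which is the shape ParquetWriteArgs ultimately stores.
    assert_eq!(absent.sample_size.flatten(), None);
    assert_eq!(nil.sample_size.flatten(), None);
    assert_eq!(set.sample_size.flatten(), Some(500));
}
```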
@@ -123,9 +133,57 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError>
         batch_size: kwargs.optional.0.flatten(),
         flush_threshold: kwargs.optional.1.flatten(),
         compression: kwargs.optional.2.flatten(),
+        sample_size: kwargs.optional.3.flatten(),
     })
 }
 
+/// Estimate the size of a row
+fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
+    let mut row_size = 0;
+    for (field, value) in schema.iter().zip(row.into_iter()) {
+        // Estimate size based on type and value
+        row_size += match &field.type_ {
+            // Use reference to avoid moving
+            ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
+            ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
+            ParquetSchemaType::Int32
+            | ParquetSchemaType::UInt32
+            | ParquetSchemaType::Float
+            | ParquetSchemaType::Date32 => 4,
+            ParquetSchemaType::Int64
+            | ParquetSchemaType::UInt64
+            | ParquetSchemaType::Double
+            | ParquetSchemaType::TimestampMillis
+            | ParquetSchemaType::TimestampMicros => 8,
+            ParquetSchemaType::String => {
+                if let Ok(s) = String::try_convert(value) {
+                    s.len() + mem::size_of::<usize>() // account for length prefix
+                } else {
+                    16 // default estimate for string
+                }
+            }
+            ParquetSchemaType::Binary => {
+                if let Ok(bytes) = Vec::<u8>::try_convert(value) {
+                    bytes.len() + mem::size_of::<usize>() // account for length prefix
+                } else {
+                    16 // default estimate for binary
+                }
+            }
+            ParquetSchemaType::Boolean => 1,
+            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
+                32 // rough estimate for complex types
+            }
+        };
+    }
+    Ok(row_size)
+}
+
+/// Calculate optimal batch size based on memory threshold and estimated row size
+fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
+    let batch_size = memory_threshold / row_size;
+    batch_size.max(MIN_BATCH_SIZE)
+}
+
 #[inline]
 pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
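The batch-size arithmetic above is deliberately simple: divide the memory threshold by the estimated row size and clamp to the minimum. A standalone sketch of the same calculation, with constants copied from the diff and an added zero-guard (`row_size.max(1)`) that is my addition, not in the original:

```rust
const MIN_BATCH_SIZE: usize = 100;
const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024; // 64MB

/// Same shape as the diff's calculate_batch_size: pick a row count so
/// that batch_size * row_size stays under the memory threshold.
fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
    (memory_threshold / row_size.max(1)).max(MIN_BATCH_SIZE)
}

fn main() {
    // 256-byte rows: 64MB / 256B = 262,144 rows per batch.
    assert_eq!(calculate_batch_size(256, DEFAULT_MEMORY_THRESHOLD), 262_144);
    // Very wide rows (1MB each): the division gives 64, clamped up to 100.
    assert_eq!(calculate_batch_size(1 << 20, DEFAULT_MEMORY_THRESHOLD), 100);
}
```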
@@ -134,13 +192,12 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size,
+        batch_size: user_batch_size,
         compression,
         flush_threshold,
+        sample_size: user_sample_size,
     } = parse_parquet_write_args(args)?;
 
-    let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
-
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
 
     // Convert schema to Arrow schema
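The rename at the destructuring site (`batch_size: user_batch_size`) keeps the user's raw request distinct from the adaptive value computed later, so the code can still ask "did the caller pin a batch size?". A small sketch of the pattern, with a hypothetical `WriteArgs` stand-in:

```rust
// Hypothetical stand-in for ParquetWriteArgs.
struct WriteArgs {
    batch_size: Option<usize>,
    sample_size: Option<usize>,
}

fn main() {
    let args = WriteArgs { batch_size: None, sample_size: Some(50) };

    // Rename on destructuring: the Options stay around as `user_*`
    // so later code can check `user_batch_size.is_none()` before adapting.
    let WriteArgs { batch_size: user_batch_size, sample_size: user_sample_size } = args;

    let sample_size = user_sample_size.unwrap_or(100);
    let initial_batch = user_batch_size.unwrap_or(100);
    println!("sample {} rows, start with batches of {}", sample_size, initial_batch);
}
```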
@@ -185,11 +242,20 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
         let mut column_collectors: Vec<ColumnCollector> = schema
-            .
-            .map(|field|
+            .iter()
+            .map(|field| {
+                // Clone the type to avoid moving from a reference
+                let type_clone = field.type_.clone();
+                ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
+            })
             .collect();
 
         let mut rows_in_batch = 0;
+        let mut total_rows = 0;
+        let mut rng = rand::rng();
+        let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
+        let mut size_samples = Vec::with_capacity(sample_size);
+        let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
 
         loop {
             match read_from.funcall::<_, _, Value>("next", ()) {
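The clone inside the closure matters because `schema.iter()` yields shared references; moving `field.type_` out of a `&SchemaField` would not compile (error E0507). A minimal sketch of the same borrow issue, with hypothetical `Field`/`Collector` types:

```rust
// Hypothetical stand-ins for SchemaField and ColumnCollector.
#[derive(Clone)]
enum ColType { Int64, Utf8 }

struct Field { name: String, type_: ColType }
struct Collector { name: String, type_: ColType }

fn main() {
    let schema = vec![
        Field { name: "id".into(), type_: ColType::Int64 },
        Field { name: "label".into(), type_: ColType::Utf8 },
    ];

    // `iter()` yields `&Field`; cloning gives each collector owned data.
    // Writing `type_: field.type_` without `.clone()` would try to move
    // out of a shared reference and be rejected by the borrow checker.
    let collectors: Vec<Collector> = schema
        .iter()
        .map(|field| Collector { name: field.name.clone(), type_: field.type_.clone() })
        .collect();

    println!("built {} collectors", collectors.len());
}
```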
@@ -211,15 +277,30 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                     ));
                 }
 
-                //
+                // Sample row sizes using reservoir sampling
+                if size_samples.len() < sample_size {
+                    size_samples.push(estimate_single_row_size(&row_array, &schema)?);
+                } else if rng.random_range(0..=total_rows) < sample_size {
+                    let idx = rng.random_range(0..sample_size);
+                    size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
+                }
+
+                // Process each value in the row
                 for (collector, value) in column_collectors.iter_mut().zip(row_array) {
                     collector.push_value(value)?;
                 }
 
                 rows_in_batch += 1;
+                total_rows += 1;
+
+                // Recalculate batch size if we have enough samples and no user-specified size
+                if size_samples.len() >= sample_size && user_batch_size.is_none() {
+                    let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
+                    current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
+                }
 
                 // When we reach batch size, write the batch
-                if rows_in_batch >=
+                if rows_in_batch >= current_batch_size {
                     write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     rows_in_batch = 0;
                 }
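The sampling block above is Algorithm R: the first `sample_size` rows fill the reservoir, after which the row arriving when `total_rows == n` replaces a random slot with probability `sample_size / (n + 1)`, so every row seen so far is equally likely to be in the sample. A standalone sketch, assuming the rand 0.9 API (`rand::rng`, `Rng::random_range`) that the diff's `use rand::Rng;` suggests:

```rust
// Standalone reservoir sampling (Algorithm R), mirroring the diff's logic.
// Assumes rand 0.9: `rand::rng()` and `Rng::random_range`.
use rand::Rng;

fn main() {
    let sample_size = 10;
    let mut rng = rand::rng();
    let mut reservoir: Vec<u64> = Vec::with_capacity(sample_size);
    let mut seen = 0usize; // plays the role of `total_rows`

    for item in 0u64..10_000 {
        if reservoir.len() < sample_size {
            reservoir.push(item); // warm-up: keep the first sample_size items
        } else if rng.random_range(0..=seen) < sample_size {
            // item replaces a random slot with probability sample_size/(seen+1)
            let idx = rng.random_range(0..sample_size);
            reservoir[idx] = item;
        }
        seen += 1;
    }

    // Each of the 10,000 items had an equal chance of ending up here.
    println!("sample: {:?}", reservoir);
}
```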
@@ -263,6 +344,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         batch_size: _,
         compression,
         flush_threshold,
+        sample_size: _,
     } = parse_parquet_write_args(args)?;
 
     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
@@ -483,7 +565,7 @@ fn write_batch(
 
     match writer {
         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-            if w.in_progress_size() >= flush_threshold {
+            if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
                 w.flush().map_err(|e| ParquetErrorWrapper(e))?;
            }
        }
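The widened condition flushes when either the encoded-but-unflushed row group (`in_progress_size`) or the total buffered memory (`memory_size`) crosses the threshold; per the diff these are methods on the parquet crate's ArrowWriter. A std-only sketch of the either-side check, using a hypothetical mock writer in place of the real one:

```rust
// Hypothetical writer tracking two sizes, standing in for ArrowWriter's
// in_progress_size() (unflushed row group) and memory_size() (total buffered).
struct MockWriter {
    in_progress: usize,
    memory: usize,
    flushes: usize,
}

impl MockWriter {
    fn maybe_flush(&mut self, flush_threshold: usize) {
        // Flush when EITHER measure crosses the threshold, as in the diff.
        if self.in_progress >= flush_threshold || self.memory >= flush_threshold {
            self.in_progress = 0;
            self.memory = 0;
            self.flushes += 1;
        }
    }
}

fn main() {
    let threshold = 64 * 1024 * 1024; // matches DEFAULT_MEMORY_THRESHOLD
    let mut w = MockWriter { in_progress: 0, memory: 70 * 1024 * 1024, flushes: 0 };

    // Before this change only in_progress was checked, so 70MB of buffered
    // writer memory would not have triggered a flush here.
    w.maybe_flush(threshold);
    assert_eq!(w.flushes, 1);
}
```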
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -68,6 +68,7 @@ module Parquet
   # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
@@ -75,10 +76,11 @@ module Parquet
       write_to: T.any(String, IO),
       batch_size: T.nilable(Integer),
       flush_threshold: T.nilable(Integer),
-      compression: T.nilable(String)
+      compression: T.nilable(String),
+      sample_size: T.nilable(Integer)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil, sample_size: nil)
   end
 
   # Options:
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.9
+  version: 0.2.12
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys