parquet 0.2.7 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +16 -1
- data/ext/parquet/src/types/writer_types.rs +2 -0
- data/ext/parquet/src/writer/mod.rs +66 -10
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +14 -4
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
+  data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
+  data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
data/README.md
CHANGED
@@ -117,6 +117,13 @@ Parquet.write_rows(rows,
   write_to: "data.parquet",
   batch_size: 500
 )
+
+# Optionally specify memory threshold for flushing (default is 64MB)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
 ```
 
 ### Writing Column-wise Data
@@ -152,9 +159,17 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each
 
-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+# Write to a parquet file with specific compression and memory threshold
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -22,16 +26,28 @@ use crate::{
 
 const DEFAULT_BATCH_SIZE: usize = 1000;
 
+// Maximum memory usage per batch (64MB by default)
+const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &["schema", "write_to"],
-        &["batch_size"],
+        &["batch_size", "flush_threshold", "compression"],
     )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
@@ -105,6 +121,8 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
     })
 }
 
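The nested `Option<Option<...>>` keyword slots paired with the `.flatten()` calls above let the writer treat an omitted keyword and an explicit Ruby `nil` the same way. A minimal standalone illustration of the flatten pattern (hypothetical values, not code from the gem):

fn main() {
    // "flush_threshold:" keyword omitted entirely
    let omitted: Option<Option<usize>> = None;
    // "flush_threshold: nil" passed explicitly
    let passed_nil: Option<Option<usize>> = Some(None);
    // "flush_threshold: 500" passed with a value
    let passed_value: Option<Option<usize>> = Some(Some(500));

    // flatten() collapses "missing" and "nil" into the same None
    assert_eq!(omitted.flatten(), None);
    assert_eq!(passed_nil.flatten(), None);
    assert_eq!(passed_value.flatten(), Some(500));
}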
@@ -117,10 +135,14 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -158,7 +180,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
@@ -198,7 +220,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 
             // When we reach batch size, write the batch
             if rows_in_batch >= batch_size {
-                write_batch(&mut writer, &mut column_collectors)?;
+                write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                 rows_in_batch = 0;
             }
         }
@@ -206,7 +228,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             if e.is_kind_of(ruby.exception_stop_iteration()) {
                 // Write any remaining rows
                 if rows_in_batch > 0 {
-                    write_batch(&mut writer, &mut column_collectors)?;
+                    write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                 }
                 break;
             }
@@ -238,9 +260,13 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -278,7 +304,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -331,6 +357,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     writer
                         .write(&record_batch)
                         .map_err(|e| ParquetErrorWrapper(e))?;
+
+                    match &mut writer {
+                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                            if w.in_progress_size() >= flush_threshold {
+                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                            }
+                        }
+                    }
                 }
                 Err(e) => {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -360,12 +394,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -382,7 +429,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
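For context, the `WriterProperties` / `ArrowWriter` wiring added above follows the standard parquet-rs pattern: build properties with an explicit codec and pass them as the third argument to `ArrowWriter::try_new`. A minimal standalone sketch (not code from this gem; the crate names, schema, and output path are assumptions):

// Assumes the `parquet`, `arrow-array`, and `arrow-schema` crates as dependencies.
use std::{fs::File, sync::Arc};

use arrow_array::{ArrayRef, Int32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};
use parquet::{
    arrow::ArrowWriter,
    basic::{Compression, ZstdLevel},
    file::properties::WriterProperties,
};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // One-column schema and a tiny batch to write.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
    let ids: ArrayRef = Arc::new(Int32Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(schema.clone(), vec![ids])?;

    // Equivalent of `compression: "zstd"` in the Ruby API.
    let props = WriterProperties::builder()
        .set_compression(Compression::ZSTD(ZstdLevel::default()))
        .build();

    let file = File::create("example.parquet")?;
    let mut writer = ArrowWriter::try_new(file, schema, Some(props))?;
    writer.write(&batch)?;
    writer.close()?;
    Ok(())
}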
@@ -414,6 +461,7 @@ fn copy_temp_file_to_io_like(
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
+    flush_threshold: usize,
 ) -> Result<(), MagnusError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -433,5 +481,13 @@ fn write_batch(
         .write(&record_batch)
         .map_err(|e| ParquetErrorWrapper(e))?;
 
+    match writer {
+        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+            if w.in_progress_size() >= flush_threshold {
+                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+            }
+        }
+    }
+
     Ok(())
 }
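The same threshold check appears in both write paths: after each batch, compare `ArrowWriter::in_progress_size()` against the configured limit and flush the buffered row group once it is exceeded. In isolation the pattern looks like this hypothetical helper (a sketch under the same crate assumptions as the previous example, not the gem's actual function):

fn write_with_threshold<W: std::io::Write + Send>(
    writer: &mut parquet::arrow::ArrowWriter<W>,
    batch: &arrow_array::RecordBatch,
    flush_threshold: usize,
) -> Result<(), parquet::errors::ParquetError> {
    writer.write(batch)?;
    // in_progress_size() estimates memory held by the not-yet-flushed row group;
    // flush() closes it out and frees that buffer.
    if writer.in_progress_size() >= flush_threshold {
        writer.flush()?;
    }
    Ok(())
}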
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -65,15 +65,20 @@ module Parquet
  #   - `timestamp_millis`, `timestamp_micros`
  # - `write_to`: String path or IO object to write the parquet file to
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      flush_threshold: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
   end
 
   # Options:
@@ -89,13 +94,18 @@ module Parquet
  #   - `timestamp_millis`, `timestamp_micros`
  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  # - `write_to`: String path or IO object to write the parquet file to
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      flush_threshold: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
   end
 end
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.7
+  version: 0.2.9
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys