parquet 0.2.8 → 0.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -2
- data/ext/parquet/src/types/writer_types.rs +1 -0
- data/ext/parquet/src/writer/mod.rs +44 -9
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +6 -2
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
+  data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
+  data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
data/README.md
CHANGED
@@ -117,6 +117,13 @@ Parquet.write_rows(rows,
   write_to: "data.parquet",
   batch_size: 500
 )
+
+# Optionally specify memory threshold for flushing (default is 64MB)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
 ```
 
 ### Writing Column-wise Data
@@ -155,11 +162,12 @@ columns = batches.each
 # Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
-# Write to a parquet file with specific compression
+# Write to a parquet file with specific compression and memory threshold
 Parquet.write_columns(columns,
   schema: schema,
   write_to: "data.parquet",
-  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  flush_threshold: 32 * 1024 * 1024 # 32MB
 )
 
 # Write to an IO object
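Taken together, the two knobs bound memory from different directions: `batch_size` caps how many rows are buffered before a write, while `flush_threshold` caps how many bytes the writer's in-progress row group may hold before it is flushed. A minimal sketch combining both is below; the enumerator, column types, and the specific 500-row / 8MB values are illustrative assumptions, not taken from this diff.

```ruby
require "parquet"

# Column-name => type pairs, following the schema format documented in parquet.rbi.
schema = [
  { "id" => "int64" },
  { "name" => "string" }
]

# Produce rows lazily so the full dataset never sits in Ruby memory at once;
# each yielded row is an array of values in schema order.
rows = Enumerator.new do |y|
  10_000.times { |i| y << [i, "user_#{i}"] }
end

Parquet.write_rows(rows,
  schema: schema,
  write_to: "users.parquet",
  batch_size: 500,                  # rows buffered per write
  flush_threshold: 8 * 1024 * 1024  # flush the in-progress row group at ~8MB
)
```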
data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -26,18 +26,29 @@ use crate::{
 
 const DEFAULT_BATCH_SIZE: usize = 1000;
 
+// Maximum memory usage per batch (64MB by default)
+const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs =
-
-
-
-
-
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+        ),
+        (),
+    >(
+        parsed_args.keywords,
+        &["schema", "write_to"],
+        &["batch_size", "flush_threshold", "compression"],
+    )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -110,7 +121,8 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
-
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
     })
 }
 
@@ -124,10 +136,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         schema,
         batch_size,
         compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -205,7 +220,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 
                 // When we reach batch size, write the batch
                 if rows_in_batch >= batch_size {
-                    write_batch(&mut writer, &mut column_collectors)?;
+                    write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     rows_in_batch = 0;
                 }
             }
@@ -213,7 +228,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
             if e.is_kind_of(ruby.exception_stop_iteration()) {
                 // Write any remaining rows
                 if rows_in_batch > 0 {
-                    write_batch(&mut writer, &mut column_collectors)?;
+                    write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                 }
                 break;
             }
@@ -247,8 +262,11 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         schema,
         batch_size: _,
         compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -339,6 +357,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                 writer
                     .write(&record_batch)
                     .map_err(|e| ParquetErrorWrapper(e))?;
+
+                match &mut writer {
+                    WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                        if w.in_progress_size() >= flush_threshold {
+                            w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                        }
+                    }
+                }
             }
             Err(e) => {
                 if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -435,6 +461,7 @@ fn copy_temp_file_to_io_like(
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
+    flush_threshold: usize,
 ) -> Result<(), MagnusError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -454,5 +481,13 @@ fn write_batch(
         .write(&record_batch)
         .map_err(|e| ParquetErrorWrapper(e))?;
 
+    match writer {
+        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+            if w.in_progress_size() >= flush_threshold {
+                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+            }
+        }
+    }
+
     Ok(())
 }
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -65,6 +65,7 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
@@ -73,10 +74,11 @@ module Parquet
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
       batch_size: T.nilable(Integer),
+      flush_threshold: T.nilable(Integer),
       compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
   end
 
   # Options:
@@ -92,6 +94,7 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
@@ -99,9 +102,10 @@ module Parquet
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
+      flush_threshold: T.nilable(Integer),
       compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:, compression: nil)
+  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
   end
 end
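To make the updated `write_columns` signature concrete, here is a hedged usage sketch that writes one column-wise batch to an IO object with an explicit `flush_threshold`. The sample data, column types, and the 16MB value are illustrative assumptions, not part of this diff.

```ruby
require "parquet"

schema = [
  { "id" => "int64" },
  { "name" => "string" }
]

# read_from is an enumerator of batches; each batch is an array of columns,
# and each column is an array of values (see the T::Enumerator sig above).
batches = [
  [[1, 2, 3], ["a", "b", "c"]]
].each

File.open("data.parquet", "wb") do |io|
  Parquet.write_columns(batches,
    schema: schema,
    write_to: io,
    compression: "zstd",
    flush_threshold: 16 * 1024 * 1024 # flush in-progress row groups at ~16MB
  )
end
```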
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.
+  version: 0.2.9
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys