parquet 0.2.7 → 0.2.8
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/ext/parquet/src/types/writer_types.rs +1 -0
- data/ext/parquet/src/writer/mod.rs +32 -11
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +10 -4
- metadata +1 -1
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md CHANGED
@@ -152,9 +152,16 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each
 
-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+# Write to a parquet file with specific compression
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
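The row-based API gains the same keyword in this release (see the parquet.rbi changes below). A minimal sketch of that usage, assuming illustrative column names and types not taken from the gem's docs:

```ruby
require "parquet"

# Hypothetical rows and schema; "id"/"name" and their types are placeholders.
rows = [
  [1, "alice"],
  [2, "bob"]
].each

schema = [
  { "id" => "int64" },
  { "name" => "string" }
]

# write_rows accepts the same compression: keyword per the updated rbi signature.
Parquet.write_rows(rows, schema: schema, write_to: "rows.parquet", compression: "gzip")
```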
data/ext/parquet/src/writer/mod.rs CHANGED
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs =
-
-
-
-
+    let kwargs =
+        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+            parsed_args.keywords,
+            &["schema", "write_to"],
+            &["batch_size", "compression"],
+        )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -105,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        compression: kwargs.optional.1.flatten(),
     })
 }
 
@@ -117,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -158,7 +165,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
@@ -238,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     // Convert schema to Arrow schema
@@ -278,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -360,12 +368,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -382,7 +403,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
data/lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.7"
+  VERSION = "0.2.8"
 end
data/lib/parquet.rbi CHANGED
@@ -65,15 +65,18 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
   end
 
   # Options:
@@ -89,13 +92,16 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, compression: nil)
   end
 end
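Since `write_to` is typed `T.any(String, IO)` in both signatures, the new keyword combines with IO targets as well. A hedged sketch, with placeholder schema and rows:

```ruby
require "parquet"

# Placeholder schema and row values, not from the gem's documentation.
schema = [{ "id" => "int64" }, { "name" => "string" }]
rows   = [[1, "a"], [2, "b"]].each

# compression: applies regardless of whether write_to is a path or an IO.
File.open("out.parquet", "wb") do |io|
  Parquet.write_rows(rows, schema: schema, write_to: io, compression: "zstd")
end
```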