parquet 0.2.7 → 0.2.8
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/README.md +8 -1
- data/ext/parquet/src/types/writer_types.rs +1 -0
- data/ext/parquet/src/writer/mod.rs +32 -11
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +10 -4
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
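The SHA256 values above can be checked against a local copy of the release. A minimal Ruby sketch using only the standard library; the archive path is an assumption about where `parquet-0.2.8.gem` was unpacked:

```ruby
require "digest"

# Hypothetical path to the data archive unpacked from parquet-0.2.8.gem.
archive = "parquet-0.2.8/data.tar.gz"

# Published SHA256 for data.tar.gz, from checksums.yaml above.
expected = "74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2"

actual = Digest::SHA256.file(archive).hexdigest
puts(actual == expected ? "checksum OK" : "checksum MISMATCH")
```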
data/README.md
CHANGED
@@ -152,9 +152,16 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each
 
-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+# Write to a parquet file with specific compression
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
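The new `compression` kwarg is also accepted by `write_rows` (see the `parquet.rbi` changes below). A minimal sketch reusing the `schema` from the README's earlier examples; the row data here is a made-up placeholder:

```ruby
# Hypothetical rows; each inner array is one row matching the schema.
rows = [[1, "alice"], [2, "bob"]].each

Parquet.write_rows(rows,
  schema: schema,           # schema defined earlier in the README
  write_to: "rows.parquet",
  compression: "gzip"       # same accepted values as write_columns
)
```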
data/ext/parquet/src/types/writer_types.rs
CHANGED
+    pub compression: Option<String>,

data/ext/parquet/src/writer/mod.rs
CHANGED
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs =
-
-
-
-
+    let kwargs =
+        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+            parsed_args.keywords,
+            &["schema", "write_to"],
+            &["batch_size", "compression"],
+        )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -105,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        compression: kwargs.optional.1.flatten(),
     })
 }
 
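Both optional kwargs are parsed as `Option<Option<T>>` and flattened, so omitting a kwarg and passing `nil` behave the same. A sketch of the calls this parsing accepts (variable names are placeholders):

```ruby
# Omitted kwargs arrive as None; batch_size then falls back to the
# documented default of 1000 and compression to the writer's default.
Parquet.write_rows(rows, schema: schema, write_to: "a.parquet")

# Explicit values populate kwargs.optional as (Some(500), Some("zstd")).
Parquet.write_rows(rows,
  schema: schema,
  write_to: "b.parquet",
  batch_size: 500,
  compression: "zstd"
)
```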
@@ -117,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -158,7 +165,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
@@ -238,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _,
+        batch_size: _,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     // Convert schema to Arrow schema
@@ -278,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -360,12 +368,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
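Unrecognized strings, like a missing kwarg, fall through to `Compression::UNCOMPRESSED`. A hedged sketch that writes the same columns once per supported codec (output file names are assumptions), so the resulting sizes can be compared to confirm the codec took effect:

```ruby
%w[none uncompressed snappy gzip lz4 zstd].each do |codec|
  Parquet.write_columns(batches.each,  # fresh enumerator per write, per the README example
    schema: schema,
    write_to: "data-#{codec}.parquet",
    compression: codec
  )
end
```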
@@ -382,7 +403,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema,
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
data/lib/parquet/version.rb
CHANGED
-  VERSION = "0.2.7"
+  VERSION = "0.2.8"
data/lib/parquet.rbi
CHANGED
@@ -65,15 +65,18 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
   end
 
   # Options:
@@ -89,13 +92,16 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, compression: nil)
   end
 end
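Taken together, the updated signatures type-check calls like the following sketch (schema and data are illustrative placeholders):

```ruby
schema = [{ "id" => "int64" }, { "name" => "string" }]

# write_rows: an enumerator of rows, with both optional kwargs supplied.
Parquet.write_rows([[1, "alice"], [2, "bob"]].each,
  schema: schema,
  write_to: "rows.parquet",
  batch_size: 500,
  compression: "snappy"
)

# write_columns: an enumerator of batches, each batch an array of column arrays.
Parquet.write_columns([[[1, 2], ["alice", "bob"]]].each,
  schema: schema,
  write_to: "cols.parquet",
  compression: "lz4"
)
```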