parquet 0.2.7 → 0.2.8

This diff compares the contents of two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
- data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
+ metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+ data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
  SHA512:
- metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
- data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
+ metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+ data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md CHANGED
@@ -152,9 +152,16 @@ batches = [
  # Create an enumerator from the batches
  columns = batches.each
 
- # Write to a parquet file
+ # Write to a parquet file with default ZSTD compression
  Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+ # Write to a parquet file with specific compression
+ Parquet.write_columns(columns,
+   schema: schema,
+   write_to: "data.parquet",
+   compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+ )
+
  # Write to an IO object
  File.open("data.parquet", "wb") do |file|
    Parquet.write_columns(columns, schema: schema, write_to: file)
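
The README example above only covers `Parquet.write_columns`; per the updated `parquet.rbi` signatures further down, `Parquet.write_rows` accepts the same optional `compression:` keyword. A minimal sketch, with an illustrative schema and data that are not taken from the gem:

```ruby
require "parquet"

# Illustrative schema and rows (column names and values are made up;
# "int8" is one of the type strings documented in parquet.rbi).
schema = [{ "id" => "int8" }, { "score" => "int8" }]
data   = [[1, 10], [2, 20]]

# Each call gets a fresh enumerator; omit compression: to use the default.
Parquet.write_rows(data.each, schema: schema, write_to: "rows.parquet")
Parquet.write_rows(data.each, schema: schema, write_to: "rows_gzip.parquet", compression: "gzip")
```

The Rust extension changes below thread this new keyword through to the Arrow writer.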
@@ -24,6 +24,7 @@ pub struct ParquetWriteArgs<'a> {
  pub write_to: Value,
  pub schema: Vec<SchemaField<'a>>,
  pub batch_size: Option<usize>,
+ pub compression: Option<String>,
  }
 
  pub trait SendableWrite: Send + Write {}
@@ -11,7 +11,11 @@ use magnus::{
  value::ReprValue,
  Error as MagnusError, RArray, Ruby, TryConvert, Value,
  };
- use parquet::arrow::ArrowWriter;
+ use parquet::{
+   arrow::ArrowWriter,
+   basic::{Compression, GzipLevel, ZstdLevel},
+   file::properties::WriterProperties,
+ };
  use tempfile::NamedTempFile;
 
  use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
  let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
  let (read_from,) = parsed_args.required;
 
- let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
-     parsed_args.keywords,
-     &["schema", "write_to"],
-     &["batch_size"],
- )?;
+ let kwargs =
+     get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+         parsed_args.keywords,
+         &["schema", "write_to"],
+         &["batch_size", "compression"],
+     )?;
 
  let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
  MagnusError::new(
@@ -105,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
  write_to: kwargs.required.1,
  schema,
  batch_size: kwargs.optional.0.flatten(),
+ compression: kwargs.optional.1.flatten(),
  })
  }
 
@@ -117,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
  write_to,
  schema,
  batch_size,
+ compression,
  } = parse_parquet_write_args(args)?;
 
  let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -158,7 +165,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
  let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
  // Create the writer
- let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+ let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
  if read_from.is_kind_of(ruby.class_enumerator()) {
  // Create collectors for each column
@@ -238,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
  read_from,
  write_to,
  schema,
- batch_size: _, // Batch size is determined by the input
+ batch_size: _,
+ compression,
  } = parse_parquet_write_args(args)?;
 
  // Convert schema to Arrow schema
@@ -278,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
  let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
  // Create the writer
- let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+ let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
  if read_from.is_kind_of(ruby.class_enumerator()) {
  loop {
@@ -360,12 +368,25 @@ fn create_writer(
  ruby: &Ruby,
  write_to: &Value,
  schema: Arc<Schema>,
+ compression: Option<String>,
  ) -> Result<WriterOutput, MagnusError> {
+ // Create writer properties with compression based on the option
+ let props = WriterProperties::builder()
+     .set_compression(match compression.as_deref() {
+         Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+         Some("snappy") => Compression::SNAPPY,
+         Some("gzip") => Compression::GZIP(GzipLevel::default()),
+         Some("lz4") => Compression::LZ4,
+         Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+         _ => Compression::UNCOMPRESSED,
+     })
+     .build();
+
  if write_to.is_kind_of(ruby.class_string()) {
  let path = write_to.to_r_string()?.to_string()?;
  let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
  let writer =
-     ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+     ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
  Ok(WriterOutput::File(writer))
  } else {
  // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -382,7 +403,7 @@ fn create_writer(
  )
  })?);
  let writer =
-     ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+     ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
  Ok(WriterOutput::TempFile(writer, temp_file))
  }
  }
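
In `create_writer`, the Ruby-supplied compression string is mapped onto the `parquet` crate's `Compression` enum, and anything unrecognised falls back to `Compression::UNCOMPRESSED`. One way to exercise each accepted value end to end is to write the same columns once per codec and compare the resulting file sizes; a rough sketch, reusing the illustrative schema from the earlier example:

```ruby
require "parquet"

# Illustrative schema and data, not taken from the gem.
schema  = [{ "id" => "int8" }, { "score" => "int8" }]
batches = [[[1, 2, 3], [10, 20, 30]]]  # one batch: two columns of three values each

%w[none uncompressed snappy gzip lz4 zstd].each do |codec|
  path = "data_#{codec}.parquet"
  Parquet.write_columns(batches.each, schema: schema, write_to: path, compression: codec)
  puts format("%-12s %8d bytes", codec, File.size(path))
end
```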
@@ -1,3 +1,3 @@
  module Parquet
- VERSION = "0.2.7"
+ VERSION = "0.2.8"
  end
data/lib/parquet.rbi CHANGED
@@ -65,15 +65,18 @@ module Parquet
  # - `timestamp_millis`, `timestamp_micros`
  # - `write_to`: String path or IO object to write the parquet file to
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
+ # - `compression`: Optional compression type to use (defaults to "zstd")
+ #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  sig do
  params(
  read_from: T::Enumerator[T::Array[T.untyped]],
  schema: T::Array[T::Hash[String, String]],
  write_to: T.any(String, IO),
- batch_size: T.nilable(Integer)
+ batch_size: T.nilable(Integer),
+ compression: T.nilable(String)
  ).void
  end
- def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+ def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
  end
 
  # Options:
@@ -89,13 +92,16 @@ module Parquet
  # - `timestamp_millis`, `timestamp_micros`
  # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  # - `write_to`: String path or IO object to write the parquet file to
+ # - `compression`: Optional compression type to use (defaults to "zstd")
+ #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  sig do
  params(
  read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
  schema: T::Array[T::Hash[String, String]],
- write_to: T.any(String, IO)
+ write_to: T.any(String, IO),
+ compression: T.nilable(String)
  ).void
  end
- def self.write_columns(read_from, schema:, write_to:)
+ def self.write_columns(read_from, schema:, write_to:, compression: nil)
  end
  end
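
Both updated signatures keep `write_to: T.any(String, IO)`, so the new keyword also combines with IO targets; a short, illustrative sketch:

```ruby
require "parquet"

# Illustrative schema and data, not taken from the gem.
schema = [{ "id" => "int8" }]

# Write a single column batch to an already-open IO with explicit ZSTD compression.
File.open("cols.parquet", "wb") do |io|
  Parquet.write_columns([[[1, 2, 3]]].each, schema: schema, write_to: io, compression: "zstd")
end
```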
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
- version: 0.2.7
+ version: 0.2.8
  platform: ruby
  authors:
  - Nathan Jaremko