parquet 0.2.7 → 0.2.8

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
-  data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
+  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
+  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
 SHA512:
-  metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
-  data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
+  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
+  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
data/README.md CHANGED
@@ -152,9 +152,16 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each
 
-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
+# Write to a parquet file with specific compression
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
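The same `compression` keyword also applies to row-based writes. A minimal usage sketch, assuming the `write_rows` signature shown in the `data/lib/parquet.rbi` diff below (the schema, types, and row values here are illustrative, not taken from the README):

```ruby
# Hypothetical schema in the documented [{ "column_name" => "type" }, ...] shape.
schema = [{ "id" => "int64" }, { "name" => "string" }]

# An enumerator yielding one array per row.
rows = [[1, "alice"], [2, "bob"]].each

# Write with gzip compression instead of the default.
Parquet.write_rows(rows,
  schema: schema,
  write_to: "rows.parquet",
  compression: "gzip"
)
```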
@@ -24,6 +24,7 @@ pub struct ParquetWriteArgs<'a> {
     pub write_to: Value,
     pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub compression: Option<String>,
 }
 
 pub trait SendableWrite: Send + Write {}
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::arrow::ArrowWriter;
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;
 
 use crate::{
@@ -28,11 +32,12 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
-        parsed_args.keywords,
-        &["schema", "write_to"],
-        &["batch_size"],
-    )?;
+    let kwargs =
+        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
+            parsed_args.keywords,
+            &["schema", "write_to"],
+            &["batch_size", "compression"],
+        )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -105,6 +110,7 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        compression: kwargs.optional.1.flatten(),
     })
 }
 
@@ -117,6 +123,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
@@ -158,7 +165,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
@@ -238,7 +245,8 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _, // Batch size is determined by the input
+        batch_size: _,
+        compression,
     } = parse_parquet_write_args(args)?;
 
     // Convert schema to Arrow schema
@@ -278,7 +286,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));
 
     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
 
     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -360,12 +368,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -382,7 +403,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.7"
+  VERSION = "0.2.8"
 end
data/lib/parquet.rbi CHANGED
@@ -65,15 +65,18 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
   end
 
   # Options:
@@ -89,13 +92,16 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, compression: nil)
   end
 end
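Putting the updated signatures together, a short sketch of column-based writing to an IO object with an explicit codec, per the `write_columns` signature above (the schema, types, and column data here are illustrative):

```ruby
schema = [{ "id" => "int64" }, { "name" => "string" }]

# One batch: an array containing one array per column.
batches = [
  [[1, 2], ["alice", "bob"]]
]
columns = batches.each

# Any of "none", "uncompressed", "snappy", "gzip", "lz4", "zstd" is accepted.
File.open("data.parquet", "wb") do |file|
  Parquet.write_columns(columns, schema: schema, write_to: file, compression: "lz4")
end
```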
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.7
+  version: 0.2.8
 platform: ruby
 authors:
 - Nathan Jaremko