parquet 0.2.7 → 0.2.9

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: c1ed4f490a4f03443598dbe1b0e110746052f613a4c5575f9b8e47c6e160bb40
-  data.tar.gz: 4db314d1707e633799e996c6fb777135ff0ea364a76c0a7d8fc5c429e2394d9f
+  metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
+  data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
 SHA512:
-  metadata.gz: b3f0a15cf467d030d3002c21bc6b64b6cd16e91e972b8de1e928abfd9bd373cfb5c4f77cdd1a6db7c620055e9657ec623866e0d8a0cb3a8e21a0c252bde3df87
-  data.tar.gz: 77f41921f5818051b597d2941688f6eca2a24d86333c58dec45d6e47e7161bfdd70e78f50a0f7ddd6cc99356c2b477451ab43adf9caa201501815c6b1a731d5c
+  metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
+  data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
data/README.md CHANGED
@@ -117,6 +117,13 @@ Parquet.write_rows(rows,
   write_to: "data.parquet",
   batch_size: 500
 )
+
+# Optionally specify memory threshold for flushing (default is 64MB)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
 ```

 ### Writing Column-wise Data
@@ -152,9 +159,17 @@ batches = [
 # Create an enumerator from the batches
 columns = batches.each

-# Write to a parquet file
+# Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")

+# Write to a parquet file with specific compression and memory threshold
+Parquet.write_columns(columns,
+  schema: schema,
+  write_to: "data.parquet",
+  compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
+
 # Write to an IO object
 File.open("data.parquet", "wb") do |file|
   Parquet.write_columns(columns, schema: schema, write_to: file)
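The README additions above show `flush_threshold` for `Parquet.write_rows` and both new keywords for `Parquet.write_columns`; per the argument parsing further down, `write_rows` accepts `compression` as well. A minimal sketch combining the two keywords on a row-wise write — the schema and row values are illustrative placeholders, not part of the release:

```ruby
require "parquet"

# Placeholder schema and data, shaped per the .rbi signatures in this release.
schema = [{ "id" => "int64" }, { "name" => "string" }]
rows   = [[1, "alice"], [2, "bob"]].each

Parquet.write_rows(rows,
  schema: schema,
  write_to: "data.parquet",
  batch_size: 500,
  compression: "gzip",               # any of: none, uncompressed, snappy, gzip, lz4, zstd
  flush_threshold: 32 * 1024 * 1024  # flush in-progress data at ~32MB
)
```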
@@ -24,6 +24,8 @@ pub struct ParquetWriteArgs<'a> {
     pub write_to: Value,
     pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub flush_threshold: Option<usize>,
+    pub compression: Option<String>,
 }

 pub trait SendableWrite: Send + Write {}
@@ -11,7 +11,11 @@ use magnus::{
     value::ReprValue,
     Error as MagnusError, RArray, Ruby, TryConvert, Value,
 };
-use parquet::arrow::ArrowWriter;
+use parquet::{
+    arrow::ArrowWriter,
+    basic::{Compression, GzipLevel, ZstdLevel},
+    file::properties::WriterProperties,
+};
 use tempfile::NamedTempFile;

 use crate::{
@@ -22,16 +26,28 @@ use crate::{

 const DEFAULT_BATCH_SIZE: usize = 1000;

+// Maximum memory usage per batch (64MB by default)
+const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;

-    let kwargs = get_kwargs::<_, (Value, Value), (Option<Option<usize>>,), ()>(
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+        ),
+        (),
+    >(
         parsed_args.keywords,
         &["schema", "write_to"],
-        &["batch_size"],
+        &["batch_size", "flush_threshold", "compression"],
     )?;

     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
@@ -105,6 +121,8 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
     })
 }

@@ -117,10 +135,14 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         write_to,
         schema,
         batch_size,
+        compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;

     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);

+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -158,7 +180,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));

     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
         // Create collectors for each column
@@ -198,7 +220,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {

                     // When we reach batch size, write the batch
                     if rows_in_batch >= batch_size {
-                        write_batch(&mut writer, &mut column_collectors)?;
+                        write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                         rows_in_batch = 0;
                     }
                 }
@@ -206,7 +228,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
                         // Write any remaining rows
                         if rows_in_batch > 0 {
-                            write_batch(&mut writer, &mut column_collectors)?;
+                            write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                         }
                         break;
                     }
@@ -238,9 +260,13 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         read_from,
         write_to,
         schema,
-        batch_size: _, // Batch size is determined by the input
+        batch_size: _,
+        compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;

+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -278,7 +304,7 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
     let arrow_schema = Arc::new(Schema::new(arrow_fields));

     // Create the writer
-    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

     if read_from.is_kind_of(ruby.class_enumerator()) {
         loop {
@@ -331,6 +357,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     writer
                         .write(&record_batch)
                         .map_err(|e| ParquetErrorWrapper(e))?;
+
+                    match &mut writer {
+                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                            if w.in_progress_size() >= flush_threshold {
+                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                            }
+                        }
+                    }
                 }
                 Err(e) => {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -360,12 +394,25 @@ fn create_writer(
     ruby: &Ruby,
     write_to: &Value,
     schema: Arc<Schema>,
+    compression: Option<String>,
 ) -> Result<WriterOutput, MagnusError> {
+    // Create writer properties with compression based on the option
+    let props = WriterProperties::builder()
+        .set_compression(match compression.as_deref() {
+            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
+            Some("snappy") => Compression::SNAPPY,
+            Some("gzip") => Compression::GZIP(GzipLevel::default()),
+            Some("lz4") => Compression::LZ4,
+            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
+            _ => Compression::UNCOMPRESSED,
+        })
+        .build();
+
     if write_to.is_kind_of(ruby.class_string()) {
         let path = write_to.to_r_string()?.to_string()?;
         let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
         let writer =
-            ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::File(writer))
     } else {
         // Create a temporary file to write to instead of directly to the IoLikeValue
@@ -382,7 +429,7 @@ fn create_writer(
             )
         })?);
         let writer =
-            ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
+            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
         Ok(WriterOutput::TempFile(writer, temp_file))
     }
 }
@@ -414,6 +461,7 @@ fn copy_temp_file_to_io_like(
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
+    flush_threshold: usize,
 ) -> Result<(), MagnusError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -433,5 +481,13 @@ fn write_batch(
         .write(&record_batch)
         .map_err(|e| ParquetErrorWrapper(e))?;

+    match writer {
+        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+            if w.in_progress_size() >= flush_threshold {
+                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+            }
+        }
+    }
+
     Ok(())
 }
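In `create_writer` above, the compression keyword is mapped onto parquet's `Compression` enum; an omitted value and any unrecognized string both fall through to `Compression::UNCOMPRESSED` in this match (whether a `"zstd"` default is applied elsewhere, as the README and `.rbi` comments state, is not visible in this diff). A column-wise sketch that passes each supported codec explicitly — the schema, batch data, and file names are placeholders shaped per the `.rbi` signature:

```ruby
require "parquet"

# One batch containing two columns (ids, names); placeholder data only.
schema  = [{ "id" => "int64" }, { "name" => "string" }]
batches = [
  [[1, 2], ["alice", "bob"]]
]

%w[none uncompressed snappy gzip lz4 zstd].each do |codec|
  Parquet.write_columns(batches.each,
    schema: schema,
    write_to: "data.#{codec}.parquet",
    compression: codec
  )
end
```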
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.7"
+  VERSION = "0.2.9"
 end
data/lib/parquet.rbi CHANGED
@@ -65,15 +65,20 @@ module Parquet
   #   - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T.untyped]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
-      batch_size: T.nilable(Integer)
+      batch_size: T.nilable(Integer),
+      flush_threshold: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
   end

   # Options:
@@ -89,13 +94,18 @@ module Parquet
   #   - `timestamp_millis`, `timestamp_micros`
   # - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
+  # - `compression`: Optional compression type to use (defaults to "zstd")
+  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
     params(
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
-      write_to: T.any(String, IO)
+      write_to: T.any(String, IO),
+      flush_threshold: T.nilable(Integer),
+      compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:)
+  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
   end
 end
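The `.rbi` changes above document the same two keywords for both writers. They apply equally when `write_to` is an IO object, in which case (per the `create_writer` comment earlier in the diff) the extension writes through a temporary file and then copies it into the IO. A sketch using the same placeholder schema and rows as above:

```ruby
require "parquet"

schema = [{ "id" => "int64" }, { "name" => "string" }]
rows   = [[1, "alice"], [2, "bob"]].each

File.open("data.parquet", "wb") do |file|
  Parquet.write_rows(rows,
    schema: schema,
    write_to: file,
    compression: "snappy",
    flush_threshold: 8 * 1024 * 1024  # smaller threshold to bound buffered memory
  )
end
```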
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.7
+  version: 0.2.9
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-13 00:00:00.000000000 Z
+date: 2025-01-29 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys