parquet 0.2.8 → 0.2.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2dea9b9b171070949497da37aff1888de71c0782e76968ba218f38e5dc2f1606
-  data.tar.gz: 74f4599b00a818cfca62d7fc162d02a87658da014ace361a76c998b718def9f2
+  metadata.gz: c20809ee9bbbe96f268b2ef4c5d40f7e7fae1613e3aabe2c0f42778425237c32
+  data.tar.gz: 13cd137212d16de6eb4e0803ae35e9bdace8caf95ac1beabba4848a23290ad4e
 SHA512:
-  metadata.gz: 209ca0339ccb11224501efc1d1adfed241097763475aa44e3997fce811123e9744f1bbfb1447e91decd1b020181b722ded94a6655630288db1f22e88aa8c09ae
-  data.tar.gz: a889e46dc8fca484043b3f1513ee6487b0f8caa8096c826cdbe4fa9ff2d6aa457c2d84e1bd95f7b05819e0ce2e33017a77a720aa331be7115cfa2ac470557a59
+  metadata.gz: 70f51bcdc98891e781ab51257c969c4a8ca5c9fe03a43ccab8b72d9a79db2b640e2779a15ed1a36969914dd1fcd0c8df639ecb50168ec5a15b53e05cebd6655b
+  data.tar.gz: abdacbca6cf8857a14ec039a34ba51403a6e4ba130b751faeb62904329b2688f581afa7207c0a95fa958822d6dda0b2477c58d02c930dac5e2b54c50ff4ccc7f
data/README.md CHANGED
@@ -117,6 +117,13 @@ Parquet.write_rows(rows,
   write_to: "data.parquet",
   batch_size: 500
 )
+
+# Optionally specify memory threshold for flushing (default is 64MB)
+Parquet.write_rows(rows,
+  schema: schema,
+  write_to: "data.parquet",
+  flush_threshold: 32 * 1024 * 1024 # 32MB
+)
 ```
 
 ### Writing Column-wise Data
@@ -155,11 +162,12 @@ columns = batches.each
 # Write to a parquet file with default ZSTD compression
 Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
 
-# Write to a parquet file with specific compression
+# Write to a parquet file with specific compression and memory threshold
 Parquet.write_columns(columns,
   schema: schema,
   write_to: "data.parquet",
-  compression: "snappy" # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  compression: "snappy", # Supported: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
+  flush_threshold: 32 * 1024 * 1024 # 32MB
 )
 
 # Write to an IO object
@@ -24,6 +24,7 @@ pub struct ParquetWriteArgs<'a> {
     pub write_to: Value,
     pub schema: Vec<SchemaField<'a>>,
     pub batch_size: Option<usize>,
+    pub flush_threshold: Option<usize>,
     pub compression: Option<String>,
 }
 
@@ -26,18 +26,29 @@ use crate::{
 
 const DEFAULT_BATCH_SIZE: usize = 1000;
 
+// Maximum memory usage per batch (64MB by default)
+const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
 /// Parse arguments for Parquet writing
 pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
     let (read_from,) = parsed_args.required;
 
-    let kwargs =
-        get_kwargs::<_, (Value, Value), (Option<Option<usize>>, Option<Option<String>>), ()>(
-            parsed_args.keywords,
-            &["schema", "write_to"],
-            &["batch_size", "compression"],
-        )?;
+    let kwargs = get_kwargs::<
+        _,
+        (Value, Value),
+        (
+            Option<Option<usize>>,
+            Option<Option<usize>>,
+            Option<Option<String>>,
+        ),
+        (),
+    >(
+        parsed_args.keywords,
+        &["schema", "write_to"],
+        &["batch_size", "flush_threshold", "compression"],
+    )?;
 
     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
         MagnusError::new(
@@ -110,7 +121,8 @@ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, Magn
         write_to: kwargs.required.1,
         schema,
         batch_size: kwargs.optional.0.flatten(),
-        compression: kwargs.optional.1.flatten(),
+        flush_threshold: kwargs.optional.1.flatten(),
+        compression: kwargs.optional.2.flatten(),
     })
 }
 
@@ -124,10 +136,13 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
         schema,
         batch_size,
         compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
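
A note on the hunk above: `flush_threshold` arrives from `get_kwargs` as a doubly nested `Option<Option<usize>>` and is collapsed with `flatten()` before `unwrap_or(DEFAULT_MEMORY_THRESHOLD)` applies the 64MB default. A minimal standalone sketch of that collapse, assuming (this reading is inferred from the `flatten()` call, not stated in the diff) that the outer `Option` models an omitted keyword and the inner one a keyword passed as `nil`:

```rust
// Sketch only: mirrors the flatten()/unwrap_or pattern used by the writer above.
const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024; // 64MB, as in the gem

fn resolve_flush_threshold(kwarg: Option<Option<usize>>) -> usize {
    // flatten() folds "keyword omitted" (None) and "keyword given as nil"
    // (Some(None)) into a single None, then the default kicks in.
    kwarg.flatten().unwrap_or(DEFAULT_MEMORY_THRESHOLD)
}

fn main() {
    assert_eq!(resolve_flush_threshold(None), 64 * 1024 * 1024);
    assert_eq!(resolve_flush_threshold(Some(None)), 64 * 1024 * 1024);
    assert_eq!(resolve_flush_threshold(Some(Some(32 * 1024 * 1024))), 32 * 1024 * 1024);
}
```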
@@ -205,7 +220,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
 
                 // When we reach batch size, write the batch
                 if rows_in_batch >= batch_size {
-                    write_batch(&mut writer, &mut column_collectors)?;
+                    write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     rows_in_batch = 0;
                 }
             }
@@ -213,7 +228,7 @@ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
                 if e.is_kind_of(ruby.exception_stop_iteration()) {
                     // Write any remaining rows
                     if rows_in_batch > 0 {
-                        write_batch(&mut writer, &mut column_collectors)?;
+                        write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                     }
                     break;
                 }
@@ -247,8 +262,11 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
         schema,
         batch_size: _,
         compression,
+        flush_threshold,
     } = parse_parquet_write_args(args)?;
 
+    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
+
     // Convert schema to Arrow schema
     let arrow_fields: Vec<Field> = schema
         .iter()
@@ -339,6 +357,14 @@ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
                     writer
                         .write(&record_batch)
                         .map_err(|e| ParquetErrorWrapper(e))?;
+
+                    match &mut writer {
+                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+                            if w.in_progress_size() >= flush_threshold {
+                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+                            }
+                        }
+                    }
                 }
                 Err(e) => {
                     if e.is_kind_of(ruby.exception_stop_iteration()) {
@@ -435,6 +461,7 @@ fn copy_temp_file_to_io_like(
 fn write_batch(
     writer: &mut WriterOutput,
     collectors: &mut [ColumnCollector],
+    flush_threshold: usize,
 ) -> Result<(), MagnusError> {
     // Convert columns to Arrow arrays
     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
@@ -454,5 +481,13 @@ fn write_batch(
         .write(&record_batch)
         .map_err(|e| ParquetErrorWrapper(e))?;
 
+    match writer {
+        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
+            if w.in_progress_size() >= flush_threshold {
+                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
+            }
+        }
+    }
+
     Ok(())
 }
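
The `in_progress_size()` / `flush()` pair added in `write_columns` and `write_batch` matches the arrow-rs `parquet::arrow::ArrowWriter` API, where `in_progress_size` estimates the memory held by the in-progress row group and `flush` writes that row group to the output. A standalone sketch of the same threshold pattern against `ArrowWriter` directly, under that assumption (this is an illustration, not the gem's code; the output path, column name, and 32MB cap are made up, and it needs the `arrow` and `parquet` crates):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int64Array};
use arrow::datatypes::{DataType, Field, Schema};
use arrow::record_batch::RecordBatch;
use parquet::arrow::ArrowWriter;
use parquet::file::properties::WriterProperties;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    let file = std::fs::File::create("example.parquet")?; // hypothetical output path
    let props = WriterProperties::builder().build();
    let mut writer = ArrowWriter::try_new(file, schema.clone(), Some(props))?;

    let flush_threshold: usize = 32 * 1024 * 1024; // illustrative 32MB cap

    for chunk in 0..10i64 {
        let ids = Int64Array::from_iter_values(chunk * 1_000..(chunk + 1) * 1_000);
        let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(ids) as ArrayRef])?;
        writer.write(&batch)?;

        // Same idea as the gem's new code: once the in-progress row group is
        // estimated to exceed the threshold, flush it and start a new row
        // group, bounding peak memory instead of buffering everything.
        if writer.in_progress_size() >= flush_threshold {
            writer.flush()?;
        }
    }

    writer.close()?; // finalizes the Parquet footer
    Ok(())
}
```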
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.2.8"
+  VERSION = "0.2.9"
 end
data/lib/parquet.rbi CHANGED
@@ -65,6 +65,7 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   # - `write_to`: String path or IO object to write the parquet file to
   # - `batch_size`: Optional batch size for writing (defaults to 1000)
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
@@ -73,10 +74,11 @@ module Parquet
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
       batch_size: T.nilable(Integer),
+      flush_threshold: T.nilable(Integer),
       compression: T.nilable(String)
     ).void
   end
-  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, compression: nil)
+  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil)
   end
 
   # Options:
@@ -92,6 +94,7 @@ module Parquet
   # - `timestamp_millis`, `timestamp_micros`
   #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
   # - `write_to`: String path or IO object to write the parquet file to
+  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
   # - `compression`: Optional compression type to use (defaults to "zstd")
   #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
   sig do
@@ -99,9 +102,10 @@ module Parquet
       read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
       schema: T::Array[T::Hash[String, String]],
       write_to: T.any(String, IO),
+      flush_threshold: T.nilable(Integer),
       compression: T.nilable(String)
     ).void
   end
-  def self.write_columns(read_from, schema:, write_to:, compression: nil)
+  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
   end
 end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.8
4
+ version: 0.2.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-13 00:00:00.000000000 Z
11
+ date: 2025-01-29 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys