parquet 0.2.12-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1449 -0
- data/Cargo.toml +3 -0
- data/Gemfile +17 -0
- data/LICENSE +21 -0
- data/README.md +197 -0
- data/Rakefile +27 -0
- data/ext/parquet/Cargo.toml +28 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +52 -0
- data/ext/parquet/src/header_cache.rs +100 -0
- data/ext/parquet/src/lib.rs +29 -0
- data/ext/parquet/src/reader/mod.rs +44 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +214 -0
- data/ext/parquet/src/reader/parquet_row_reader.rs +157 -0
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +171 -0
- data/ext/parquet/src/types/core_types.rs +75 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/types/parquet_value.rs +462 -0
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +809 -0
- data/ext/parquet/src/types/writer_types.rs +283 -0
- data/ext/parquet/src/utils.rs +148 -0
- data/ext/parquet/src/writer/mod.rs +575 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +5 -0
- data/lib/parquet.rbi +113 -0
- metadata +109 -0
data/ext/parquet/src/writer/mod.rs
ADDED
@@ -0,0 +1,575 @@
use std::{
    fs::File,
    io::{self, BufReader, BufWriter},
    mem,
    sync::Arc,
};

use arrow_array::{Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema, TimeUnit};
use magnus::{
    scan_args::{get_kwargs, scan_args},
    value::ReprValue,
    Error as MagnusError, RArray, Ruby, TryConvert, Value,
};
use parquet::{
    arrow::ArrowWriter,
    basic::{Compression, GzipLevel, ZstdLevel},
    file::properties::WriterProperties,
};
use rand::Rng;
use tempfile::NamedTempFile;

use crate::{
    convert_ruby_array_to_arrow,
    types::{ColumnCollector, ParquetErrorWrapper, WriterOutput},
    IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
};

const SAMPLE_SIZE: usize = 100; // Number of rows to sample for size estimation
const MIN_BATCH_SIZE: usize = 100; // Minimum batch size to maintain efficiency
const INITIAL_BATCH_SIZE: usize = 100; // Initial batch size while sampling

// Maximum memory usage per batch (64MB by default)
const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;

/// Parse arguments for Parquet writing
pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
    let ruby = unsafe { Ruby::get_unchecked() };
    let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
    let (read_from,) = parsed_args.required;

    let kwargs = get_kwargs::<
        _,
        (Value, Value),
        (
            Option<Option<usize>>,
            Option<Option<usize>>,
            Option<Option<String>>,
            Option<Option<usize>>,
        ),
        (),
    >(
        parsed_args.keywords,
        &["schema", "write_to"],
        &[
            "batch_size",
            "flush_threshold",
            "compression",
            "sample_size",
        ],
    )?;

    let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
        MagnusError::new(
            magnus::exception::type_error(),
            "schema must be an array of hashes",
        )
    })?;

    let mut schema = Vec::with_capacity(schema_array.len());

    for (idx, field_hash) in schema_array.into_iter().enumerate() {
        if !field_hash.is_kind_of(ruby.class_hash()) {
            return Err(MagnusError::new(
                magnus::exception::type_error(),
                format!("schema[{}] must be a hash", idx),
            ));
        }

        let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
        if entries.len() != 1 {
            return Err(MagnusError::new(
                magnus::exception::type_error(),
                format!("schema[{}] must contain exactly one key-value pair", idx),
            ));
        }

        let (name, type_value) = &entries[0];
        let name = String::try_convert(name.clone())?;

        let (type_, format) = if type_value.is_kind_of(ruby.class_hash()) {
            let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
            let mut type_str = None;
            let mut format_str = None;

            for (key, value) in type_hash {
                let key = String::try_convert(key)?;
                match key.as_str() {
                    "type" => type_str = Some(value),
                    "format" => format_str = Some(String::try_convert(value)?),
                    _ => {
                        return Err(MagnusError::new(
                            magnus::exception::type_error(),
                            format!("Unknown key '{}' in type definition", key),
                        ))
                    }
                }
            }

            let type_str = type_str.ok_or_else(|| {
                MagnusError::new(
                    magnus::exception::type_error(),
                    "Missing 'type' in type definition",
                )
            })?;

            (ParquetSchemaType::try_convert(type_str)?, format_str)
        } else {
            (ParquetSchemaType::try_convert(type_value.clone())?, None)
        };

        schema.push(SchemaField {
            name,
            type_,
            format,
        });
    }

    Ok(ParquetWriteArgs {
        read_from,
        write_to: kwargs.required.1,
        schema,
        batch_size: kwargs.optional.0.flatten(),
        flush_threshold: kwargs.optional.1.flatten(),
        compression: kwargs.optional.2.flatten(),
        sample_size: kwargs.optional.3.flatten(),
    })
}

/// Estimate the size of a row
fn estimate_single_row_size(row: &RArray, schema: &[SchemaField]) -> Result<usize, MagnusError> {
    let mut row_size = 0;
    for (field, value) in schema.iter().zip(row.into_iter()) {
        // Estimate size based on type and value
        row_size += match &field.type_ {
            // Use reference to avoid moving
            ParquetSchemaType::Int8 | ParquetSchemaType::UInt8 => 1,
            ParquetSchemaType::Int16 | ParquetSchemaType::UInt16 => 2,
            ParquetSchemaType::Int32
            | ParquetSchemaType::UInt32
            | ParquetSchemaType::Float
            | ParquetSchemaType::Date32 => 4,
            ParquetSchemaType::Int64
            | ParquetSchemaType::UInt64
            | ParquetSchemaType::Double
            | ParquetSchemaType::TimestampMillis
            | ParquetSchemaType::TimestampMicros => 8,
            ParquetSchemaType::String => {
                if let Ok(s) = String::try_convert(value) {
                    s.len() + mem::size_of::<usize>() // account for length prefix
                } else {
                    16 // default estimate for string
                }
            }
            ParquetSchemaType::Binary => {
                if let Ok(bytes) = Vec::<u8>::try_convert(value) {
                    bytes.len() + mem::size_of::<usize>() // account for length prefix
                } else {
                    16 // default estimate for binary
                }
            }
            ParquetSchemaType::Boolean => 1,
            ParquetSchemaType::List(_) | ParquetSchemaType::Map(_) => {
                32 // rough estimate for complex types
            }
        };
    }
    Ok(row_size)
}

/// Calculate optimal batch size based on memory threshold and estimated row size
fn calculate_batch_size(row_size: usize, memory_threshold: usize) -> usize {
    let batch_size = memory_threshold / row_size;
    batch_size.max(MIN_BATCH_SIZE)
}

#[inline]
pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
    let ruby = unsafe { Ruby::get_unchecked() };

    let ParquetWriteArgs {
        read_from,
        write_to,
        schema,
        batch_size: user_batch_size,
        compression,
        flush_threshold,
        sample_size: user_sample_size,
    } = parse_parquet_write_args(args)?;

    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);

    // Convert schema to Arrow schema
    let arrow_fields: Vec<Field> = schema
        .iter()
        .map(|field| {
            Field::new(
                &field.name,
                match field.type_ {
                    ParquetSchemaType::Int8 => DataType::Int8,
                    ParquetSchemaType::Int16 => DataType::Int16,
                    ParquetSchemaType::Int32 => DataType::Int32,
                    ParquetSchemaType::Int64 => DataType::Int64,
                    ParquetSchemaType::UInt8 => DataType::UInt8,
                    ParquetSchemaType::UInt16 => DataType::UInt16,
                    ParquetSchemaType::UInt32 => DataType::UInt32,
                    ParquetSchemaType::UInt64 => DataType::UInt64,
                    ParquetSchemaType::Float => DataType::Float32,
                    ParquetSchemaType::Double => DataType::Float64,
                    ParquetSchemaType::String => DataType::Utf8,
                    ParquetSchemaType::Binary => DataType::Binary,
                    ParquetSchemaType::Boolean => DataType::Boolean,
                    ParquetSchemaType::Date32 => DataType::Date32,
                    ParquetSchemaType::TimestampMillis => {
                        DataType::Timestamp(TimeUnit::Millisecond, None)
                    }
                    ParquetSchemaType::TimestampMicros => {
                        DataType::Timestamp(TimeUnit::Microsecond, None)
                    }
                    ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
                    ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
                },
                true,
            )
        })
        .collect();
    let arrow_schema = Arc::new(Schema::new(arrow_fields));

    // Create the writer
    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

    if read_from.is_kind_of(ruby.class_enumerator()) {
        // Create collectors for each column
        let mut column_collectors: Vec<ColumnCollector> = schema
            .iter()
            .map(|field| {
                // Clone the type to avoid moving from a reference
                let type_clone = field.type_.clone();
                ColumnCollector::new(field.name.clone(), type_clone, field.format.clone())
            })
            .collect();

        let mut rows_in_batch = 0;
        let mut total_rows = 0;
        let mut rng = rand::rng();
        let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
        let mut size_samples = Vec::with_capacity(sample_size);
        let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);

        loop {
            match read_from.funcall::<_, _, Value>("next", ()) {
                Ok(row) => {
                    let row_array = RArray::from_value(row).ok_or_else(|| {
                        MagnusError::new(ruby.exception_type_error(), "Row must be an array")
                    })?;

                    // Validate row length matches schema
                    if row_array.len() != column_collectors.len() {
                        return Err(MagnusError::new(
                            magnus::exception::type_error(),
                            format!(
                                "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
                                row_array.len(),
                                column_collectors.len(),
                                column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
                            ),
                        ));
                    }

                    // Sample row sizes using reservoir sampling
                    if size_samples.len() < sample_size {
                        size_samples.push(estimate_single_row_size(&row_array, &schema)?);
                    } else if rng.random_range(0..=total_rows) < sample_size {
                        let idx = rng.random_range(0..sample_size);
                        size_samples[idx] = estimate_single_row_size(&row_array, &schema)?;
                    }

                    // Process each value in the row
                    for (collector, value) in column_collectors.iter_mut().zip(row_array) {
                        collector.push_value(value)?;
                    }

                    rows_in_batch += 1;
                    total_rows += 1;

                    // Recalculate batch size if we have enough samples and no user-specified size
                    if size_samples.len() >= sample_size && user_batch_size.is_none() {
                        let avg_row_size = size_samples.iter().sum::<usize>() / size_samples.len();
                        current_batch_size = calculate_batch_size(avg_row_size, flush_threshold);
                    }

                    // When we reach batch size, write the batch
                    if rows_in_batch >= current_batch_size {
                        write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                        rows_in_batch = 0;
                    }
                }
                Err(e) => {
                    if e.is_kind_of(ruby.exception_stop_iteration()) {
                        // Write any remaining rows
                        if rows_in_batch > 0 {
                            write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
                        }
                        break;
                    }
                    return Err(e);
                }
            }
        }
    } else {
        return Err(MagnusError::new(
            magnus::exception::type_error(),
            "read_from must be an Enumerator",
        ));
    }

    // Ensure everything is written and get the temp file if it exists
    if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
        // If we got a temp file back, we need to copy its contents to the IO-like object
        copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
    }

    Ok(())
}

#[inline]
pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
    let ruby = unsafe { Ruby::get_unchecked() };

    let ParquetWriteArgs {
        read_from,
        write_to,
        schema,
        batch_size: _,
        compression,
        flush_threshold,
        sample_size: _,
    } = parse_parquet_write_args(args)?;

    let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);

    // Convert schema to Arrow schema
    let arrow_fields: Vec<Field> = schema
        .iter()
        .map(|field| {
            Field::new(
                &field.name,
                match field.type_ {
                    ParquetSchemaType::Int8 => DataType::Int8,
                    ParquetSchemaType::Int16 => DataType::Int16,
                    ParquetSchemaType::Int32 => DataType::Int32,
                    ParquetSchemaType::Int64 => DataType::Int64,
                    ParquetSchemaType::UInt8 => DataType::UInt8,
                    ParquetSchemaType::UInt16 => DataType::UInt16,
                    ParquetSchemaType::UInt32 => DataType::UInt32,
                    ParquetSchemaType::UInt64 => DataType::UInt64,
                    ParquetSchemaType::Float => DataType::Float32,
                    ParquetSchemaType::Double => DataType::Float64,
                    ParquetSchemaType::String => DataType::Utf8,
                    ParquetSchemaType::Binary => DataType::Binary,
                    ParquetSchemaType::Boolean => DataType::Boolean,
                    ParquetSchemaType::Date32 => DataType::Date32,
                    ParquetSchemaType::TimestampMillis => {
                        DataType::Timestamp(TimeUnit::Millisecond, None)
                    }
                    ParquetSchemaType::TimestampMicros => {
                        DataType::Timestamp(TimeUnit::Microsecond, None)
                    }
                    ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
                    ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
                },
                true,
            )
        })
        .collect();
    let arrow_schema = Arc::new(Schema::new(arrow_fields));

    // Create the writer
    let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;

    if read_from.is_kind_of(ruby.class_enumerator()) {
        loop {
            match read_from.funcall::<_, _, Value>("next", ()) {
                Ok(batch) => {
                    let batch_array = RArray::from_value(batch).ok_or_else(|| {
                        MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
                    })?;

                    // Validate batch length matches schema
                    if batch_array.len() != schema.len() {
                        return Err(MagnusError::new(
                            magnus::exception::type_error(),
                            format!(
                                "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
                                batch_array.len(),
                                schema.len(),
                                schema.iter().map(|f| f.name.as_str()).collect::<Vec<_>>()
                            ),
                        ));
                    }

                    // Convert each column in the batch to Arrow arrays
                    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = schema
                        .iter()
                        .zip(batch_array)
                        .map(|(field, column)| {
                            let column_array = RArray::from_value(column).ok_or_else(|| {
                                MagnusError::new(
                                    magnus::exception::type_error(),
                                    format!("Column '{}' must be an array", field.name),
                                )
                            })?;

                            Ok((
                                field.name.clone(),
                                convert_ruby_array_to_arrow(column_array, &field.type_)?,
                            ))
                        })
                        .collect::<Result<_, MagnusError>>()?;

                    // Create and write record batch
                    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
                        MagnusError::new(
                            magnus::exception::runtime_error(),
                            format!("Failed to create record batch: {}", e),
                        )
                    })?;

                    writer
                        .write(&record_batch)
                        .map_err(|e| ParquetErrorWrapper(e))?;

                    match &mut writer {
                        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
                            if w.in_progress_size() >= flush_threshold {
                                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
                            }
                        }
                    }
                }
                Err(e) => {
                    if e.is_kind_of(ruby.exception_stop_iteration()) {
                        break;
                    }
                    return Err(e);
                }
            }
        }
    } else {
        return Err(MagnusError::new(
            magnus::exception::type_error(),
            "read_from must be an Enumerator",
        ));
    }

    // Ensure everything is written and get the temp file if it exists
    if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
        // If we got a temp file back, we need to copy its contents to the IO-like object
        copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
    }

    Ok(())
}

fn create_writer(
    ruby: &Ruby,
    write_to: &Value,
    schema: Arc<Schema>,
    compression: Option<String>,
) -> Result<WriterOutput, MagnusError> {
    // Create writer properties with compression based on the option
    let props = WriterProperties::builder()
        .set_compression(match compression.as_deref() {
            Some("none") | Some("uncompressed") => Compression::UNCOMPRESSED,
            Some("snappy") => Compression::SNAPPY,
            Some("gzip") => Compression::GZIP(GzipLevel::default()),
            Some("lz4") => Compression::LZ4,
            Some("zstd") => Compression::ZSTD(ZstdLevel::default()),
            _ => Compression::UNCOMPRESSED,
        })
        .build();

    if write_to.is_kind_of(ruby.class_string()) {
        let path = write_to.to_r_string()?.to_string()?;
        let file: Box<dyn SendableWrite> = Box::new(File::create(path).unwrap());
        let writer =
            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
        Ok(WriterOutput::File(writer))
    } else {
        // Create a temporary file to write to instead of directly to the IoLikeValue
        let temp_file = NamedTempFile::new().map_err(|e| {
            MagnusError::new(
                magnus::exception::runtime_error(),
                format!("Failed to create temporary file: {}", e),
            )
        })?;
        let file: Box<dyn SendableWrite> = Box::new(temp_file.reopen().map_err(|e| {
            MagnusError::new(
                magnus::exception::runtime_error(),
                format!("Failed to reopen temporary file: {}", e),
            )
        })?);
        let writer =
            ArrowWriter::try_new(file, schema, Some(props)).map_err(|e| ParquetErrorWrapper(e))?;
        Ok(WriterOutput::TempFile(writer, temp_file))
    }
}

// Helper function to copy temp file contents to IoLikeValue
fn copy_temp_file_to_io_like(
    temp_file: NamedTempFile,
    io_like: IoLikeValue,
) -> Result<(), MagnusError> {
    let file = temp_file.reopen().map_err(|e| {
        MagnusError::new(
            magnus::exception::runtime_error(),
            format!("Failed to reopen temporary file: {}", e),
        )
    })?;
    let mut buf_reader = BufReader::new(file);
    let mut buf_writer = BufWriter::new(io_like);

    io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
        MagnusError::new(
            magnus::exception::runtime_error(),
            format!("Failed to copy temp file to io_like: {}", e),
        )
    })?;

    Ok(())
}

fn write_batch(
    writer: &mut WriterOutput,
    collectors: &mut [ColumnCollector],
    flush_threshold: usize,
) -> Result<(), MagnusError> {
    // Convert columns to Arrow arrays
    let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
        .iter_mut()
        .map(|collector| Ok((collector.name.clone(), collector.take_array()?)))
        .collect::<Result<_, MagnusError>>()?;

    // Create and write record batch
    let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
        MagnusError::new(
            magnus::exception::runtime_error(),
            format!("Failed to create record batch: {}", e),
        )
    })?;

    writer
        .write(&record_batch)
        .map_err(|e| ParquetErrorWrapper(e))?;

    match writer {
        WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
            if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
                w.flush().map_err(|e| ParquetErrorWrapper(e))?;
            }
        }
    }

    Ok(())
}
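For orientation, the writer entry points above are driven from Ruby: `read_from` must be an Enumerator yielding one array per row, and each schema entry is a single-key hash whose value is either a bare type string or a hash with "type" and "format" keys, exactly as parse_parquet_write_args checks. A minimal, hypothetical Ruby call consistent with those checks (the file name and data are illustrative, not taken from the package) might look like:

require "parquet"

# One array per row, values in schema order; Array#each without a block returns an Enumerator.
rows = [
  [1, "alice", "2024-01-01"],
  [2, "bob",   "2024-01-02"],
].each

# Each entry is a single-key hash; the value is a type string or a {"type", "format"} hash.
schema = [
  { "id"         => "int64" },
  { "name"       => "string" },
  { "created_at" => { "type" => "date32", "format" => "%Y-%m-%d" } },
]

# batch_size, flush_threshold, compression and sample_size are the optional keywords parsed above.
Parquet.write_rows(rows, schema: schema, write_to: "users.parquet", compression: "zstd")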
data/lib/parquet.rb
ADDED
data/lib/parquet.rbi
ADDED
@@ -0,0 +1,113 @@
# typed: true

module Parquet
  # Options:
  # - `input`: String, File, or IO object containing parquet data
  # - `result_type`: String specifying the output format
  #   ("hash" or "array" or :hash or :array)
  # - `columns`: When present, only the specified columns will be included in the output.
  #   This is useful for reducing how much data is read and improving performance.
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String])
    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
  end
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(NilClass)
  end
  def self.each_row(input, result_type: nil, columns: nil, &blk)
  end

  # Options:
  # - `input`: String, File, or IO object containing parquet data
  # - `result_type`: String specifying the output format
  #   ("hash" or "array" or :hash or :array)
  # - `columns`: When present, only the specified columns will be included in the output.
  # - `batch_size`: When present, specifies the number of rows per batch
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      batch_size: T.nilable(Integer)
    ).returns(T::Enumerator[T.any(T::Hash[String, T.untyped], T::Array[T.untyped])])
  end
  sig do
    params(
      input: T.any(String, File, StringIO, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      batch_size: T.nilable(Integer),
      blk:
        T.nilable(T.proc.params(batch: T.any(T::Hash[String, T::Array[T.untyped]], T::Array[T::Array[T.untyped]])).void)
    ).returns(NilClass)
  end
  def self.each_column(input, result_type: nil, columns: nil, batch_size: nil, &blk)
  end

  # Options:
  # - `read_from`: An Enumerator yielding arrays of values representing each row
  # - `schema`: Array of hashes specifying column names and types. Supported types:
  #   - `int8`, `int16`, `int32`, `int64`
  #   - `uint8`, `uint16`, `uint32`, `uint64`
  #   - `float`, `double`
  #   - `string`
  #   - `binary`
  #   - `boolean`
  #   - `date32`
  #   - `timestamp_millis`, `timestamp_micros`
  # - `write_to`: String path or IO object to write the parquet file to
  # - `batch_size`: Optional batch size for writing (defaults to 1000)
  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
  # - `compression`: Optional compression type to use (defaults to "zstd")
  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  # - `sample_size`: Optional number of rows to sample for size estimation (defaults to 100)
  sig do
    params(
      read_from: T::Enumerator[T::Array[T.untyped]],
      schema: T::Array[T::Hash[String, String]],
      write_to: T.any(String, IO),
      batch_size: T.nilable(Integer),
      flush_threshold: T.nilable(Integer),
      compression: T.nilable(String),
      sample_size: T.nilable(Integer)
    ).void
  end
  def self.write_rows(read_from, schema:, write_to:, batch_size: nil, flush_threshold: nil, compression: nil, sample_size: nil)
  end

  # Options:
  # - `read_from`: An Enumerator yielding arrays of column batches
  # - `schema`: Array of hashes specifying column names and types. Supported types:
  #   - `int8`, `int16`, `int32`, `int64`
  #   - `uint8`, `uint16`, `uint32`, `uint64`
  #   - `float`, `double`
  #   - `string`
  #   - `binary`
  #   - `boolean`
  #   - `date32`
  #   - `timestamp_millis`, `timestamp_micros`
  #   - Looks like [{"column_name" => {"type" => "date32", "format" => "%Y-%m-%d"}}, {"column_name" => "int8"}]
  # - `write_to`: String path or IO object to write the parquet file to
  # - `flush_threshold`: Optional memory threshold in bytes before flushing (defaults to 64MB)
  # - `compression`: Optional compression type to use (defaults to "zstd")
  #   Supported values: "none", "uncompressed", "snappy", "gzip", "lz4", "zstd"
  sig do
    params(
      read_from: T::Enumerator[T::Array[T::Array[T.untyped]]],
      schema: T::Array[T::Hash[String, String]],
      write_to: T.any(String, IO),
      flush_threshold: T.nilable(Integer),
      compression: T.nilable(String)
    ).void
  end
  def self.write_columns(read_from, schema:, write_to:, flush_threshold: nil, compression: nil)
  end
end
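Read-side usage follows the same shape. A hedged sketch based only on the signatures and option comments above (the file name is illustrative) could be:

require "parquet"

# Row-wise: with result_type :hash each row is a Hash keyed by column name;
# `columns` restricts which columns are read.
Parquet.each_row("users.parquet", result_type: :hash, columns: ["id", "name"]) do |row|
  puts row["name"]
end

# Column-wise: each batch maps column name => array of values, up to batch_size rows per batch.
Parquet.each_column("users.parquet", result_type: :hash, batch_size: 1000) do |batch|
  puts batch["id"].length
end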