parquet 0.5.13 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,505 +0,0 @@
- mod write_columns;
- mod write_rows;
-
- use arrow_schema::{DataType, Schema, TimeUnit};
- use itertools::Itertools;
- use magnus::{
-     scan_args::{get_kwargs, scan_args},
-     value::ReprValue,
-     Error as MagnusError, RArray, RHash, Ruby, Symbol, Value,
- };
- use parquet::{
-     arrow::ArrowWriter,
-     basic::{Compression, GzipLevel, ZstdLevel},
-     file::properties::WriterProperties,
- };
- use std::{
-     fs::File,
-     io::{self, BufReader, BufWriter},
-     sync::Arc,
- };
- use tempfile::NamedTempFile;
- pub use write_columns::write_columns;
- pub use write_rows::write_rows;
-
- use crate::{types::PrimitiveType, SchemaNode};
- use crate::{
-     types::{ColumnCollector, ParquetGemError, ParquetSchemaType, WriterOutput},
-     utils::parse_string_or_symbol,
-     IoLikeValue, ParquetSchemaType as PST, ParquetWriteArgs, SchemaField, SendableWrite,
- };
-
- const SAMPLE_SIZE: usize = 100;
- const MIN_BATCH_SIZE: usize = 10;
- const INITIAL_BATCH_SIZE: usize = 100;
- const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
-
- /// Parse arguments for Parquet writing
- pub fn parse_parquet_write_args(
-     ruby: &Ruby,
-     args: &[Value],
- ) -> Result<ParquetWriteArgs, MagnusError> {
-     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
-     let (read_from,) = parsed_args.required;
-
-     let kwargs = get_kwargs::<
-         _,
-         (Value, Value),
-         (
-             Option<Option<usize>>,
-             Option<Option<usize>>,
-             Option<Option<String>>,
-             Option<Option<usize>>,
-             Option<Option<Value>>,
-         ),
-         (),
-     >(
-         parsed_args.keywords,
-         &["schema", "write_to"],
-         &[
-             "batch_size",
-             "flush_threshold",
-             "compression",
-             "sample_size",
-             "logger",
-         ],
-     )?;
-
-     // The schema value could be one of:
-     // 1. An array of hashes (legacy format)
-     // 2. A hash with type: :struct (new DSL format)
-     // 3. nil (infer from data)
-     let schema_value = kwargs.required.0;
-
-     // Check if it's the new DSL format (a hash with type: :struct)
-     // We need to handle both direct hash objects and objects created via Parquet::Schema.define
-
-     // First, try to convert it to a Hash if it's not already a Hash
-     // This handles the case where schema_value is a Schema object from Parquet::Schema.define
-     let schema_hash = if schema_value.is_kind_of(ruby.class_hash()) {
-         RHash::from_value(schema_value).ok_or_else(|| {
-             MagnusError::new(magnus::exception::type_error(), "Schema must be a hash")
-         })?
-     } else {
-         // Try to convert the object to a hash with to_h
-         match schema_value.respond_to("to_h", false) {
-             Ok(true) => {
-                 match schema_value.funcall::<_, _, Value>("to_h", ()) {
-                     Ok(hash_val) => match RHash::from_value(hash_val) {
-                         Some(hash) => hash,
-                         None => {
-                             // Not a hash, continue to normal handling
-                             RHash::new()
-                         }
-                     },
-                     Err(_) => {
-                         // couldn't call to_h, continue to normal handling
-                         RHash::new()
-                     }
-                 }
-             }
-             _ => {
-                 // Doesn't respond to to_h, continue to normal handling
-                 RHash::new()
-             }
-         }
-     };
-
-     // Now check if it's a schema hash with a type: :struct field
-     let type_val = schema_hash.get(Symbol::new("type"));
-
-     if let Some(type_val) = type_val {
-         // If it has a type: :struct, it's the new DSL format
-         // Use parse_string_or_symbol to handle both String and Symbol values
-         let ttype = parse_string_or_symbol(ruby, type_val)?;
-         if let Some(ref type_str) = ttype {
-             if type_str == "struct" {
-                 // Parse using the new schema approach
-                 let schema_node = crate::parse_schema_node(ruby, schema_value)?;
-
-                 validate_schema_node(ruby, &schema_node)?;
-
-                 return Ok(ParquetWriteArgs {
-                     read_from,
-                     write_to: kwargs.required.1,
-                     schema: schema_node,
-                     batch_size: kwargs.optional.0.flatten(),
-                     flush_threshold: kwargs.optional.1.flatten(),
-                     compression: kwargs.optional.2.flatten(),
-                     sample_size: kwargs.optional.3.flatten(),
-                     logger: kwargs.optional.4.flatten(),
-                 });
-             }
-         }
-     }
-
-     // If it's not a hash with type: :struct, handle as legacy format
-     let schema_fields = if schema_value.is_nil()
-         || (schema_value.is_kind_of(ruby.class_array())
-             && RArray::from_value(schema_value)
-                 .ok_or_else(|| {
-                     MagnusError::new(
-                         magnus::exception::type_error(),
-                         "Schema fields must be an array",
-                     )
-                 })?
-                 .is_empty())
-     {
-         // If schema is nil or an empty array, we need to peek at the first value to determine column count
-         let first_value = read_from.funcall::<_, _, Value>("peek", ())?;
-         // Default to nullable:true for auto-inferred fields
-         crate::infer_schema_from_first_row(ruby, first_value, true)?
-     } else {
-         // Legacy array format - use our centralized parser
-         crate::parse_legacy_schema(ruby, schema_value)?
-     };
-
-     // Convert the legacy schema fields to SchemaNode (DSL format)
-     let schema_node = crate::legacy_schema_to_dsl(ruby, schema_fields)?;
-
-     validate_schema_node(ruby, &schema_node)?;
-
-     Ok(ParquetWriteArgs {
-         read_from,
-         write_to: kwargs.required.1,
-         schema: schema_node,
-         batch_size: kwargs.optional.0.flatten(),
-         flush_threshold: kwargs.optional.1.flatten(),
-         compression: kwargs.optional.2.flatten(),
-         sample_size: kwargs.optional.3.flatten(),
-         logger: kwargs.optional.4.flatten(),
-     })
- }
-
- // -----------------------------------------------------------------------------
- // HELPER to invert arrow DataType back to our ParquetSchemaType
- // Converts Arrow DataType to our internal ParquetSchemaType representation.
- // This is essential for mapping Arrow types back to our schema representation
- // when working with column collections and schema validation.
- // -----------------------------------------------------------------------------
- fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchemaType, MagnusError> {
-     match dt {
-         DataType::Boolean => Ok(PST::Primitive(PrimitiveType::Boolean)),
-         DataType::Int8 => Ok(PST::Primitive(PrimitiveType::Int8)),
-         DataType::Int16 => Ok(PST::Primitive(PrimitiveType::Int16)),
-         DataType::Int32 => Ok(PST::Primitive(PrimitiveType::Int32)),
-         DataType::Int64 => Ok(PST::Primitive(PrimitiveType::Int64)),
-         DataType::UInt8 => Ok(PST::Primitive(PrimitiveType::UInt8)),
-         DataType::UInt16 => Ok(PST::Primitive(PrimitiveType::UInt16)),
-         DataType::UInt32 => Ok(PST::Primitive(PrimitiveType::UInt32)),
-         DataType::UInt64 => Ok(PST::Primitive(PrimitiveType::UInt64)),
-         DataType::Float16 => {
-             // We do not have a direct ParquetSchemaType::Float16, we treat it as Float
-             Ok(PST::Primitive(PrimitiveType::Float32))
-         }
-         DataType::Float32 => Ok(PST::Primitive(PrimitiveType::Float32)),
-         DataType::Float64 => Ok(PST::Primitive(PrimitiveType::Float64)),
-         DataType::Decimal128(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal128(
-             *precision, *scale,
-         ))),
-         DataType::Decimal256(precision, scale) => Ok(PST::Primitive(PrimitiveType::Decimal256(
-             *precision, *scale,
-         ))),
-         DataType::Date32 => Ok(PST::Primitive(PrimitiveType::Date32)),
-         DataType::Date64 => {
-             // Our code typically uses Date32 or Timestamp for 64. But Arrow has Date64
-             // We can store it as PST::Date64 if we want. If we don't have that, consider PST::Date32 or an error.
-             // If your existing code only handles Date32, you can error. But let's do PST::Date32 as fallback:
-             // Or define a new variant if you have one in your code. We'll show a fallback approach:
-             Err(MagnusError::new(
-                 magnus::exception::runtime_error(),
-                 "Arrow Date64 not directly supported in current ParquetSchemaType (use date32?).",
-             ))
-         }
-         DataType::Timestamp(TimeUnit::Second, _tz) => {
-             // We'll treat this as PST::TimestampMillis, or define PST::TimestampSecond
-             // For simplicity, let's map "second" to PST::TimestampMillis with a note:
-             Ok(PST::Primitive(PrimitiveType::TimestampMillis))
-         }
-         DataType::Timestamp(TimeUnit::Millisecond, _tz) => {
-             Ok(PST::Primitive(PrimitiveType::TimestampMillis))
-         }
-         DataType::Timestamp(TimeUnit::Microsecond, _tz) => {
-             Ok(PST::Primitive(PrimitiveType::TimestampMicros))
-         }
-         DataType::Timestamp(TimeUnit::Nanosecond, _tz) => {
-             // If you have a PST::TimestampNanos variant, use it. Otherwise, degrade to micros
-             // for demonstration:
-             Err(MagnusError::new(
-                 magnus::exception::runtime_error(),
-                 "TimestampNanos not supported, please adjust your schema or code.",
-             ))
-         }
-         DataType::Time32(TimeUnit::Millisecond) => Ok(PST::Primitive(PrimitiveType::TimeMillis)),
-         DataType::Time64(TimeUnit::Microsecond) => Ok(PST::Primitive(PrimitiveType::TimeMicros)),
-         DataType::Time32(_) => Err(MagnusError::new(
-             magnus::exception::runtime_error(),
-             "Time32 only supports millisecond unit",
-         )),
-         DataType::Time64(_) => Err(MagnusError::new(
-             magnus::exception::runtime_error(),
-             "Time64 only supports microsecond unit",
-         )),
-         DataType::Utf8 => Ok(PST::Primitive(PrimitiveType::String)),
-         DataType::Binary => Ok(PST::Primitive(PrimitiveType::Binary)),
-         DataType::LargeUtf8 => {
-             // If not supported, degrade or error. We'll degrade to PST::String
-             Ok(PST::Primitive(PrimitiveType::String))
-         }
-         DataType::LargeBinary => Ok(PST::Primitive(PrimitiveType::Binary)),
-         DataType::List(child_field) => {
-             // Recursively handle the item type
-             let child_type = arrow_data_type_to_parquet_schema_type(child_field.data_type())?;
-             Ok(PST::List(Box::new(crate::types::ListField {
-                 item_type: child_type,
-                 format: None,
-                 nullable: true,
-             })))
-         }
-         DataType::Map(entry_field, _keys_sorted) => {
-             // Arrow's Map -> a struct<key, value> inside
-             let entry_type = entry_field.data_type();
-             if let DataType::Struct(fields) = entry_type {
-                 if fields.len() == 2 {
-                     let key_type = arrow_data_type_to_parquet_schema_type(fields[0].data_type())?;
-                     let value_type = arrow_data_type_to_parquet_schema_type(fields[1].data_type())?;
-                     Ok(PST::Map(Box::new(crate::types::MapField {
-                         key_type,
-                         value_type,
-                         key_format: None,
-                         value_format: None,
-                         value_nullable: true,
-                     })))
-                 } else {
-                     Err(MagnusError::new(
-                         magnus::exception::type_error(),
-                         "Map field must have exactly 2 child fields (key, value)",
-                     ))
-                 }
-             } else {
-                 Err(MagnusError::new(
-                     magnus::exception::type_error(),
-                     "Map field is not a struct? Unexpected Arrow schema layout",
-                 ))
-             }
-         }
-         DataType::Struct(arrow_fields) => {
-             // We treat this as PST::Struct. We'll recursively handle subfields
-             // but for top-level collecting we only store them as one column
-             // so the user data must pass a Ruby Hash or something for that field.
-             let mut schema_fields = vec![];
-             for f in arrow_fields {
-                 let sub_type = arrow_data_type_to_parquet_schema_type(f.data_type())?;
-                 schema_fields.push(SchemaField {
-                     name: f.name().clone(),
-                     type_: sub_type,
-                     format: None, // We can't see the 'format' from Arrow
-                     nullable: f.is_nullable(),
-                 });
-             }
-             Ok(PST::Struct(Box::new(crate::types::StructField {
-                 fields: schema_fields,
-             })))
-         }
-         _ => Err(MagnusError::new(
-             magnus::exception::runtime_error(),
-             format!("Unsupported or unhandled Arrow DataType: {:?}", dt),
-         )),
-     }
- }
-
- // -----------------------------------------------------------------------------
- // HELPER to build ColumnCollectors for the DSL variant
- // This function converts a SchemaNode (from our DSL) into a collection of ColumnCollectors
- // that can accumulate values for each column in the schema.
- // - arrow_schema: The Arrow schema corresponding to our DSL schema
- // - root_node: The root SchemaNode (expected to be a Struct node) from which to build collectors
- // -----------------------------------------------------------------------------
- fn build_column_collectors_from_dsl<'a>(
-     ruby: &'a Ruby,
-     arrow_schema: &'a Arc<Schema>,
-     root_node: &'a SchemaNode,
- ) -> Result<Vec<ColumnCollector<'a>>, MagnusError> {
-     // We expect the top-level schema node to be a Struct so that arrow_schema
-     // lines up with root_node.fields. If the user gave a top-level primitive, it would be 1 field, but
-     // our code calls build_arrow_schema under the assumption "top-level must be Struct."
-     let fields = match root_node {
-         SchemaNode::Struct { fields, .. } => fields,
-         _ => {
-             return Err(MagnusError::new(
-                 ruby.exception_runtime_error(),
-                 "Top-level schema for DSL must be a struct",
-             ))
-         }
-     };
-
-     if fields.len() != arrow_schema.fields().len() {
-         return Err(MagnusError::new(
-             ruby.exception_runtime_error(),
-             format!(
-                 "Mismatch between DSL field count ({}) and Arrow fields ({})",
-                 fields.len(),
-                 arrow_schema.fields().len()
-             ),
-         ));
-     }
-
-     let mut collectors = Vec::with_capacity(fields.len());
-     for (arrow_field, schema_field_node) in arrow_schema.fields().iter().zip(fields) {
-         let name = arrow_field.name().clone();
-         let parquet_type = arrow_data_type_to_parquet_schema_type(arrow_field.data_type())?;
-
-         // Extract the optional format from the schema node
-         let format = extract_format_from_schema_node(schema_field_node);
-
-         // Build the ColumnCollector
-         collectors.push(ColumnCollector::new(
-             ruby,
-             name,
-             parquet_type,
-             format,
-             arrow_field.is_nullable(),
-         ));
-     }
-     Ok(collectors)
- }
-
- // Helper to extract the format from a SchemaNode if available
- fn extract_format_from_schema_node(node: &SchemaNode) -> Option<String> {
-     match node {
-         SchemaNode::Primitive {
-             format: f,
-             parquet_type: _,
-             ..
-         } => f.clone(),
-         // For struct, list, map, etc. there's no single "format." We ignore it.
-         _ => None,
-     }
- }
-
- // Validates a SchemaNode to ensure it meets Parquet schema requirements
- // Currently checks for duplicate field names at the root level, which would
- // cause problems when writing Parquet files. Additional validation rules
- // could be added here in the future.
- //
- // This validation is important because schema errors are difficult to debug
- // once they reach the Parquet/Arrow layer, so we check proactively before
- // any data processing begins.
- fn validate_schema_node(ruby: &Ruby, schema_node: &SchemaNode) -> Result<(), MagnusError> {
-     if let SchemaNode::Struct { fields, .. } = &schema_node {
-         // if any root level schema fields have the same name, we raise an error
-         let field_names = fields
-             .iter()
-             .map(|f| match f {
-                 SchemaNode::Struct { name, .. } => name.as_str(),
-                 SchemaNode::List { name, .. } => name.as_str(),
-                 SchemaNode::Map { name, .. } => name.as_str(),
-                 SchemaNode::Primitive { name, .. } => name.as_str(),
-             })
-             .collect::<Vec<_>>();
-         let unique_field_names = field_names.iter().unique().collect::<Vec<_>>();
-         if field_names.len() != unique_field_names.len() {
-             return Err(MagnusError::new(
-                 ruby.exception_arg_error(),
-                 format!(
-                     "Duplicate field names in root level schema: {:?}",
-                     field_names
-                 ),
-             ));
-         }
-     }
-     Ok(())
- }
-
- // Creates an appropriate Parquet writer based on the output target and compression settings
- // This function handles two main output scenarios:
- // 1. Writing directly to a file path (string)
- // 2. Writing to a Ruby IO-like object (using a temporary file as an intermediate buffer)
- //
- // For IO-like objects, the function creates a temporary file that is later copied to the
- // IO object when writing is complete. This approach is necessary because Parquet requires
- // random file access to write its footer after the data.
- //
- // The function also configures compression based on the user's preferences, with
- // several options available (none, snappy, gzip, lz4, zstd).
- fn create_writer(
-     ruby: &Ruby,
-     write_to: &Value,
-     schema: Arc<Schema>,
-     compression: Option<String>,
- ) -> Result<WriterOutput, ParquetGemError> {
-     // Create writer properties with compression based on the option
-     let compression_setting = match compression.map(|s| s.to_lowercase()).as_deref() {
-         Some("none") | Some("uncompressed") => Ok(Compression::UNCOMPRESSED),
-         Some("snappy") => Ok(Compression::SNAPPY),
-         Some("gzip") => Ok(Compression::GZIP(GzipLevel::default())),
-         Some("lz4") => Ok(Compression::LZ4),
-         Some("zstd") => Ok(Compression::ZSTD(ZstdLevel::default())),
-         None => Ok(Compression::UNCOMPRESSED),
-         other => Err(MagnusError::new(
-             magnus::exception::arg_error(),
-             format!("Invalid compression option: {:?}", other),
-         )),
-     }?;
-
-     let props = WriterProperties::builder()
-         .set_compression(compression_setting)
-         .build();
-
-     if write_to.is_kind_of(ruby.class_string()) {
-         let path = write_to.to_r_string()?.to_string()?;
-         let file: Box<dyn SendableWrite> = Box::new(File::create(path)?);
-         let writer = ArrowWriter::try_new(file, schema, Some(props))?;
-         Ok(WriterOutput::File(writer))
-     } else {
-         // Create a temporary file to write to instead of directly to the IoLikeValue
-         let temp_file = NamedTempFile::new().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::runtime_error(),
-                 format!("Failed to create temporary file: {}", e),
-             )
-         })?;
-         let file: Box<dyn SendableWrite> = Box::new(temp_file.reopen().map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::runtime_error(),
-                 format!("Failed to reopen temporary file: {}", e),
-             )
-         })?);
-         let writer = ArrowWriter::try_new(file, schema, Some(props))?;
-         Ok(WriterOutput::TempFile(writer, temp_file))
-     }
- }
-
- // Copies the contents of a temporary file to a Ruby IO-like object
- // This function is necessary because Parquet writing requires random file access
- // (especially for writing the footer after all data), but Ruby IO objects may not
- // support seeking. The solution is to:
- //
- // 1. Write the entire Parquet file to a temporary file first
- // 2. Once writing is complete, copy the entire contents to the Ruby IO object
- //
- // This approach enables support for a wide range of Ruby IO objects like StringIO,
- // network streams, etc., but does require enough disk space for the temporary file
- // and involves a second full-file read/write operation at the end.
- fn copy_temp_file_to_io_like(
-     temp_file: NamedTempFile,
-     io_like: IoLikeValue,
- ) -> Result<(), MagnusError> {
-     let file = temp_file.reopen().map_err(|e| {
-         MagnusError::new(
-             magnus::exception::runtime_error(),
-             format!("Failed to reopen temporary file: {}", e),
-         )
-     })?;
-     let mut buf_reader = BufReader::new(file);
-     let mut buf_writer = BufWriter::new(io_like);
-
-     io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
-         MagnusError::new(
-             magnus::exception::runtime_error(),
-             format!("Failed to copy temp file to io_like: {}", e),
-         )
-     })?;
-
-     Ok(())
- }
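
The comments on the removed create_writer and copy_temp_file_to_io_like describe the strategy this writer used for IO-like destinations: buffer the finished Parquet output in a temporary file (Parquet needs random access to write its footer after the data), then stream that file into the destination. Below is a minimal standalone sketch of that pattern using only std and the tempfile crate; the names write_then_stream, produce, and sink are illustrative, not part of the gem's API.

use std::io::{self, BufReader, BufWriter, Write};
use tempfile::NamedTempFile;

// Hypothetical helper: produce output into a seekable temp file, then copy the
// finished bytes to a possibly non-seekable sink (the role the Ruby IO object
// played in the removed code).
fn write_then_stream<W: Write>(
    sink: W,
    produce: impl FnOnce(&mut std::fs::File) -> io::Result<()>,
) -> io::Result<()> {
    // 1. Write everything into a temp file, which supports the random access
    //    a Parquet writer needs for its footer.
    let mut temp = NamedTempFile::new()?;
    produce(temp.as_file_mut())?;

    // 2. reopen() yields a fresh handle at the start of the same file; stream
    //    its contents to the sink through buffered readers/writers.
    let mut reader = BufReader::new(temp.reopen()?);
    let mut writer = BufWriter::new(sink);
    io::copy(&mut reader, &mut writer)?;
    writer.flush()?;
    Ok(()) // the temp file is deleted when `temp` is dropped
}

fn main() -> io::Result<()> {
    // A Vec<u8> stands in for any Write-only destination.
    let mut out: Vec<u8> = Vec::new();
    write_then_stream(&mut out, |f| f.write_all(b"parquet bytes would go here"))?;
    assert_eq!(out, b"parquet bytes would go here");
    Ok(())
}

In the gem, produce corresponded to driving parquet's ArrowWriter over the temp file handle, and sink was the user-supplied Ruby IO object; the trade-off noted in the original comments still applies (extra disk space and a second full read/write at the end).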