parquet 0.0.5 → 0.2.5

This diff shows the published contents of the package as they changed between these two versions in their public registry. It is provided for informational purposes only.
@@ -0,0 +1,270 @@
+ use std::{
+     io::{self, Write},
+     str::FromStr,
+     sync::Arc,
+ };
+
+ use arrow_array::{Array, RecordBatch};
+ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
+ use parquet::{arrow::ArrowWriter, errors::ParquetError};
+ use tempfile::NamedTempFile;
+
+ use crate::types::{ListField, MapField, ParquetSchemaType};
+
+ #[derive(Debug)]
+ pub struct SchemaField {
+     pub name: String,
+     pub type_: ParquetSchemaType,
+ }
+
+ #[derive(Debug)]
+ pub struct ParquetWriteArgs {
+     pub read_from: Value,
+     pub write_to: Value,
+     pub schema: Vec<SchemaField>,
+     pub batch_size: Option<usize>,
+ }
+
+ pub trait SendableWrite: Send + Write {}
+ impl<T: Send + Write> SendableWrite for T {}
+
+ pub struct IoLikeValue(pub(crate) Value);
+
+ impl Write for IoLikeValue {
+     fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
+         let ruby_bytes = RString::from_slice(buf);
+
+         let bytes_written = self
+             .0
+             .funcall::<_, _, usize>("write", (ruby_bytes,))
+             .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+         Ok(bytes_written)
+     }
+
+     fn flush(&mut self) -> Result<(), io::Error> {
+         self.0
+             .funcall::<_, _, Value>("flush", ())
+             .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+         Ok(())
+     }
+ }
+
+ impl FromStr for ParquetSchemaType {
+     type Err = MagnusError;
+
+     fn from_str(s: &str) -> Result<Self, Self::Err> {
+         match s {
+             "int8" => Ok(ParquetSchemaType::Int8),
+             "int16" => Ok(ParquetSchemaType::Int16),
+             "int32" => Ok(ParquetSchemaType::Int32),
+             "int64" => Ok(ParquetSchemaType::Int64),
+             "uint8" => Ok(ParquetSchemaType::UInt8),
+             "uint16" => Ok(ParquetSchemaType::UInt16),
+             "uint32" => Ok(ParquetSchemaType::UInt32),
+             "uint64" => Ok(ParquetSchemaType::UInt64),
+             "float" | "float32" => Ok(ParquetSchemaType::Float),
+             "double" | "float64" => Ok(ParquetSchemaType::Double),
+             "string" | "utf8" => Ok(ParquetSchemaType::String),
+             "binary" => Ok(ParquetSchemaType::Binary),
+             "boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
+             "date32" => Ok(ParquetSchemaType::Date32),
+             "timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
+             "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
+             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
+                 item_type: ParquetSchemaType::Int8,
+             }))),
+             "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
+                 key_type: ParquetSchemaType::String,
+                 value_type: ParquetSchemaType::Int8,
+             }))),
+             _ => Err(MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Invalid schema type: {}", s),
+             )),
+         }
+     }
+ }
+
+ impl TryConvert for ParquetSchemaType {
+     fn try_convert(value: Value) -> Result<Self, MagnusError> {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let schema_type = parse_string_or_symbol(&ruby, value)?;
+
+         schema_type.ok_or_else(|| MagnusError::new(magnus::exception::type_error(), "schema type cannot be nil"))?.parse()
+     }
+ }
+
+ // We know this type is safe to move between threads because it's just an enum
+ // with simple primitive types and strings
+ unsafe impl Send for ParquetSchemaType {}
+
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
+     if value.is_nil() {
+         Ok(None)
+     } else if value.is_kind_of(ruby.class_string()) {
+         RString::from_value(value)
+             .ok_or_else(|| {
+                 MagnusError::new(magnus::exception::type_error(), "Invalid string value")
+             })?
+             .to_string()
+             .map(Some)
+     } else if value.is_kind_of(ruby.class_symbol()) {
+         Symbol::from_value(value)
+             .ok_or_else(|| {
+                 MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
+             })?
+             .funcall("to_s", ())
+             .map(Some)
+     } else {
+         Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "Value must be a String or Symbol",
+         ))
+     }
+ }
+
+ pub enum WriterOutput {
+     File(ArrowWriter<Box<dyn SendableWrite>>),
+     TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
+ }
+
+ impl WriterOutput {
+     pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ParquetError> {
+         match self {
+             WriterOutput::File(writer) | WriterOutput::TempFile(writer, _) => writer.write(batch),
+         }
+     }
+
+     pub fn close(self) -> Result<Option<NamedTempFile>, ParquetError> {
+         match self {
+             WriterOutput::File(writer) => {
+                 writer.close()?;
+                 Ok(None)
+             }
+             WriterOutput::TempFile(writer, temp_file) => {
+                 writer.close()?;
+                 Ok(Some(temp_file))
+             }
+         }
+     }
+ }
+
+ pub struct ParquetErrorWrapper(pub ParquetError);
+
+ impl From<ParquetErrorWrapper> for MagnusError {
+     fn from(err: ParquetErrorWrapper) -> Self {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Parquet error: {}", err.0),
+         )
+     }
+ }
+
+ pub struct ColumnCollector {
+     pub name: String,
+     pub type_: ParquetSchemaType,
+     pub values: Vec<crate::types::ParquetValue>,
+ }
+
+ impl ColumnCollector {
+     pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+         Self {
+             name,
+             type_,
+             values: Vec::new(),
+         }
+     }
+
+     pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
+         use crate::types::ParquetValue;
+         use crate::{
+             convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
+             convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
+             NumericConverter,
+         };
+
+         let parquet_value = match &self.type_ {
+             ParquetSchemaType::Int8 => {
+                 let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int8(v)
+             }
+             ParquetSchemaType::Int16 => {
+                 let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int16(v)
+             }
+             ParquetSchemaType::Int32 => {
+                 let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int32(v)
+             }
+             ParquetSchemaType::Int64 => {
+                 let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int64(v)
+             }
+             ParquetSchemaType::UInt8 => {
+                 let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt8(v)
+             }
+             ParquetSchemaType::UInt16 => {
+                 let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt16(v)
+             }
+             ParquetSchemaType::UInt32 => {
+                 let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt32(v)
+             }
+             ParquetSchemaType::UInt64 => {
+                 let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt64(v)
+             }
+             ParquetSchemaType::Float => {
+                 let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
+                 ParquetValue::Float32(v)
+             }
+             ParquetSchemaType::Double => {
+                 let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
+                 ParquetValue::Float64(v)
+             }
+             ParquetSchemaType::String => {
+                 let v = String::try_convert(value)?;
+                 ParquetValue::String(v)
+             }
+             ParquetSchemaType::Binary => {
+                 let v = convert_to_binary(value)?;
+                 ParquetValue::Bytes(v)
+             }
+             ParquetSchemaType::Boolean => {
+                 let v = convert_to_boolean(value)?;
+                 ParquetValue::Boolean(v)
+             }
+             ParquetSchemaType::Date32 => {
+                 let v = convert_to_date32(value)?;
+                 ParquetValue::Date32(v)
+             }
+             ParquetSchemaType::TimestampMillis => {
+                 let v = convert_to_timestamp_millis(value)?;
+                 ParquetValue::TimestampMillis(v, None)
+             }
+             ParquetSchemaType::TimestampMicros => {
+                 let v = convert_to_timestamp_micros(value)?;
+                 ParquetValue::TimestampMicros(v, None)
+             }
+             ParquetSchemaType::List(list_field) => {
+                 let values = convert_to_list(value, list_field)?;
+                 ParquetValue::List(values)
+             }
+             ParquetSchemaType::Map(map_field) => {
+                 let map = convert_to_map(value, map_field)?;
+                 ParquetValue::Map(map)
+             }
+         };
+         self.values.push(parquet_value);
+         Ok(())
+     }
+
+     pub fn take_array(&mut self) -> Result<Arc<dyn Array>, MagnusError> {
+         let values = std::mem::take(&mut self.values);
+         crate::convert_parquet_values_to_arrow(values, &self.type_)
+     }
+ }
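
The FromStr impl above is the full set of type names the schema accepts: int8/16/32/64, uint8/16/32/64, float ("float32"), double ("float64"), string ("utf8"), binary, boolean ("bool"), date32, timestamp_millis, timestamp_micros, list, and map; the TryConvert impl means each name may be given as a Ruby String or Symbol. A minimal sketch of a schema a caller might build (column names here are illustrative; the one-key-hash-per-column shape comes from parse_parquet_write_args in the next file):

    # Each column is a single-key hash; String and Symbol type names are equivalent.
    schema = [
      { "id" => "int64" },
      { "name" => :string },                  # "utf8" is an accepted alias
      { "score" => :double },                 # "float64" is an accepted alias
      { "created_at" => "timestamp_millis" },
    ]
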
@@ -0,0 +1,403 @@
+ use std::{
+     fs::File,
+     io::{self, BufReader, BufWriter},
+     sync::Arc,
+ };
+
+ use arrow_array::{Array, RecordBatch};
+ use arrow_schema::{DataType, Field, Schema, TimeUnit};
+ use magnus::{
+     scan_args::{get_kwargs, scan_args},
+     value::ReprValue,
+     Error as MagnusError, RArray, Ruby, TryConvert, Value,
+ };
+ use parquet::arrow::ArrowWriter;
+ use tempfile::NamedTempFile;
+
+ use crate::{
+     convert_ruby_array_to_arrow,
+     types::{ColumnCollector, ParquetErrorWrapper, WriterOutput},
+     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
+ };
+
+ const DEFAULT_BATCH_SIZE: usize = 1000;
+
+ /// Parse arguments for Parquet writing
+ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
+     let (read_from,) = parsed_args.required;
+
+     let kwargs = get_kwargs::<_, (Value, Value), (Option<usize>,), ()>(
+         parsed_args.keywords,
+         &["schema", "write_to"],
+         &["batch_size"],
+     )?;
+
+     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
+         MagnusError::new(
+             magnus::exception::type_error(),
+             "schema must be an array of hashes",
+         )
+     })?;
+
+     let mut schema = Vec::with_capacity(schema_array.len());
+
+     for (idx, field_hash) in schema_array.into_iter().enumerate() {
+         if !field_hash.is_kind_of(ruby.class_hash()) {
+             return Err(MagnusError::new(
+                 magnus::exception::type_error(),
+                 format!("schema[{}] must be a hash", idx),
+             ));
+         }
+
+         let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
+         if entries.len() != 1 {
+             return Err(MagnusError::new(
+                 magnus::exception::type_error(),
+                 format!("schema[{}] must contain exactly one key-value pair", idx),
+             ));
+         }
+
+         let (name, type_str) = &entries[0];
+         let name = String::try_convert(name.clone())?;
+         let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
+
+         schema.push(SchemaField { name, type_ });
+     }
+
+     Ok(ParquetWriteArgs {
+         read_from,
+         write_to: kwargs.required.1,
+         schema,
+         batch_size: kwargs.optional.0,
+     })
+ }
+
+ #[inline]
+ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size,
+     } = parse_parquet_write_args(args)?;
+
+     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
+
+     // Convert schema to Arrow schema
+     let arrow_fields: Vec<Field> = schema
+         .iter()
+         .map(|field| {
+             Field::new(
+                 &field.name,
+                 match field.type_ {
+                     ParquetSchemaType::Int8 => DataType::Int8,
+                     ParquetSchemaType::Int16 => DataType::Int16,
+                     ParquetSchemaType::Int32 => DataType::Int32,
+                     ParquetSchemaType::Int64 => DataType::Int64,
+                     ParquetSchemaType::UInt8 => DataType::UInt8,
+                     ParquetSchemaType::UInt16 => DataType::UInt16,
+                     ParquetSchemaType::UInt32 => DataType::UInt32,
+                     ParquetSchemaType::UInt64 => DataType::UInt64,
+                     ParquetSchemaType::Float => DataType::Float32,
+                     ParquetSchemaType::Double => DataType::Float64,
+                     ParquetSchemaType::String => DataType::Utf8,
+                     ParquetSchemaType::Binary => DataType::Binary,
+                     ParquetSchemaType::Boolean => DataType::Boolean,
+                     ParquetSchemaType::Date32 => DataType::Date32,
+                     ParquetSchemaType::TimestampMillis => {
+                         DataType::Timestamp(TimeUnit::Millisecond, None)
+                     }
+                     ParquetSchemaType::TimestampMicros => {
+                         DataType::Timestamp(TimeUnit::Microsecond, None)
+                     }
+                     ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
+                     ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
+                 },
+                 true,
+             )
+         })
+         .collect();
+     let arrow_schema = Arc::new(Schema::new(arrow_fields));
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         // Create collectors for each column
+         let mut column_collectors: Vec<ColumnCollector> = schema
+             .into_iter()
+             .map(|field| ColumnCollector::new(field.name, field.type_))
+             .collect();
+
+         let mut rows_in_batch = 0;
+
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(row) => {
+                     let row_array = RArray::from_value(row).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                     })?;
+
+                     // Validate row length matches schema
+                     if row_array.len() != column_collectors.len() {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 row_array.len(),
+                                 column_collectors.len(),
+                                 column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
+                             ),
+                         ));
+                     }
+
+                     // Process each value in the row immediately
+                     for (collector, value) in column_collectors.iter_mut().zip(row_array) {
+                         collector.push_value(value)?;
+                     }
+
+                     rows_in_batch += 1;
+
+                     // When we reach batch size, write the batch
+                     if rows_in_batch >= batch_size {
+                         write_batch(&mut writer, &mut column_collectors)?;
+                         rows_in_batch = 0;
+                     }
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         // Write any remaining rows
+                         if rows_in_batch > 0 {
+                             write_batch(&mut writer, &mut column_collectors)?;
+                         }
+                         break;
+                     }
+                     return Err(e);
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator",
+         ));
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close().map_err(ParquetErrorWrapper)? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ #[inline]
+ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size: _, // Batch size is determined by the input
+     } = parse_parquet_write_args(args)?;
+
+     // Convert schema to Arrow schema
+     let arrow_fields: Vec<Field> = schema
+         .iter()
+         .map(|field| {
+             Field::new(
+                 &field.name,
+                 match field.type_ {
+                     ParquetSchemaType::Int8 => DataType::Int8,
+                     ParquetSchemaType::Int16 => DataType::Int16,
+                     ParquetSchemaType::Int32 => DataType::Int32,
+                     ParquetSchemaType::Int64 => DataType::Int64,
+                     ParquetSchemaType::UInt8 => DataType::UInt8,
+                     ParquetSchemaType::UInt16 => DataType::UInt16,
+                     ParquetSchemaType::UInt32 => DataType::UInt32,
+                     ParquetSchemaType::UInt64 => DataType::UInt64,
+                     ParquetSchemaType::Float => DataType::Float32,
+                     ParquetSchemaType::Double => DataType::Float64,
+                     ParquetSchemaType::String => DataType::Utf8,
+                     ParquetSchemaType::Binary => DataType::Binary,
+                     ParquetSchemaType::Boolean => DataType::Boolean,
+                     ParquetSchemaType::Date32 => DataType::Date32,
+                     ParquetSchemaType::TimestampMillis => {
+                         DataType::Timestamp(TimeUnit::Millisecond, None)
+                     }
+                     ParquetSchemaType::TimestampMicros => {
+                         DataType::Timestamp(TimeUnit::Microsecond, None)
+                     }
+                     ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
+                     ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
+                 },
+                 true,
+             )
+         })
+         .collect();
+     let arrow_schema = Arc::new(Schema::new(arrow_fields));
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(batch) => {
+                     let batch_array = RArray::from_value(batch).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
+                     })?;
+
+                     // Validate batch length matches schema
+                     if batch_array.len() != schema.len() {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 batch_array.len(),
+                                 schema.len(),
+                                 schema.iter().map(|f| f.name.as_str()).collect::<Vec<_>>()
+                             ),
+                         ));
+                     }
+
+                     // Convert each column in the batch to Arrow arrays
+                     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = schema
+                         .iter()
+                         .zip(batch_array)
+                         .map(|(field, column)| {
+                             let column_array = RArray::from_value(column).ok_or_else(|| {
+                                 MagnusError::new(
+                                     magnus::exception::type_error(),
+                                     format!("Column '{}' must be an array", field.name),
+                                 )
+                             })?;
+
+                             Ok((
+                                 field.name.clone(),
+                                 convert_ruby_array_to_arrow(column_array, &field.type_)?,
+                             ))
+                         })
+                         .collect::<Result<_, MagnusError>>()?;
+
+                     // Create and write record batch
+                     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+                         MagnusError::new(
+                             magnus::exception::runtime_error(),
+                             format!("Failed to create record batch: {}", e),
+                         )
+                     })?;
+
+                     writer
+                         .write(&record_batch)
+                         .map_err(ParquetErrorWrapper)?;
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         break;
+                     }
+                     return Err(e);
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator",
+         ));
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close().map_err(ParquetErrorWrapper)? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ fn create_writer(
+     ruby: &Ruby,
+     write_to: &Value,
+     schema: Arc<Schema>,
+ ) -> Result<WriterOutput, MagnusError> {
+     if write_to.is_kind_of(ruby.class_string()) {
+         let path = write_to.to_r_string()?.to_string()?;
+         let file: Box<dyn SendableWrite> = Box::new(File::create(path).map_err(|e| MagnusError::new(magnus::exception::runtime_error(), format!("Failed to create file: {}", e)))?);
+         let writer =
+             ArrowWriter::try_new(file, schema, None).map_err(ParquetErrorWrapper)?;
+         Ok(WriterOutput::File(writer))
+     } else {
+         // Create a temporary file to write to instead of directly to the IoLikeValue
+         let temp_file = NamedTempFile::new().map_err(|e| {
+             MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Failed to create temporary file: {}", e),
+             )
+         })?;
+         let file: Box<dyn SendableWrite> = Box::new(temp_file.reopen().map_err(|e| {
+             MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Failed to reopen temporary file: {}", e),
+             )
+         })?);
+         let writer =
+             ArrowWriter::try_new(file, schema, None).map_err(ParquetErrorWrapper)?;
+         Ok(WriterOutput::TempFile(writer, temp_file))
+     }
+ }
+
+ // Helper function to copy temp file contents to IoLikeValue
+ fn copy_temp_file_to_io_like(
+     temp_file: NamedTempFile,
+     io_like: IoLikeValue,
+ ) -> Result<(), MagnusError> {
+     let file = temp_file.reopen().map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to reopen temporary file: {}", e),
+         )
+     })?;
+     let mut buf_reader = BufReader::new(file);
+     let mut buf_writer = BufWriter::new(io_like);
+
+     io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to copy temp file to io_like: {}", e),
+         )
+     })?;
+
+     Ok(())
+ }
+
+ fn write_batch(
+     writer: &mut WriterOutput,
+     collectors: &mut [ColumnCollector],
+ ) -> Result<(), MagnusError> {
+     // Convert columns to Arrow arrays
+     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
+         .iter_mut()
+         .map(|collector| Ok((collector.name.clone(), collector.take_array()?)))
+         .collect::<Result<_, MagnusError>>()?;
+
+     // Create and write record batch
+     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Failed to create record batch: {}", e),
+         )
+     })?;
+
+     writer
+         .write(&record_batch)
+         .map_err(ParquetErrorWrapper)?;
+
+     Ok(())
+ }
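
Taken together, parse_parquet_write_args defines the calling convention for both entry points: one positional Enumerator, required schema: and write_to: keywords, and an optional batch_size: (default 1000) that only the row path uses. A hedged usage sketch, assuming these Rust functions are exposed to Ruby as Parquet.write_rows and Parquet.write_columns (the binding code is not part of this diff):

    # Row-oriented: each yielded row is an array in schema order.
    rows = [[1, "alice"], [2, "bob"]].each
    Parquet.write_rows(
      rows,
      schema: [{ "id" => "int64" }, { "name" => "string" }],
      write_to: "users.parquet",  # a String is treated as a file path; any object
      batch_size: 1000            # responding to write/flush goes via a temp file
    )

    # Column-oriented: each yielded batch is an array of column arrays,
    # so the batch size is implied by the input rather than batch_size:.
    batches = [[[1, 2], ["alice", "bob"]]].each
    Parquet.write_columns(
      batches,
      schema: [{ "id" => "int64" }, { "name" => "string" }],
      write_to: io_object
    )
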
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.0.5"
+   VERSION = "0.2.5"
  end