parquet 0.0.5 → 0.2.5

@@ -0,0 +1,274 @@
+ use std::{
+     io::{self, Write},
+     str::FromStr,
+     sync::Arc,
+ };
+
+ use arrow_array::{Array, RecordBatch};
+ use magnus::{value::ReprValue, Error as MagnusError, RString, Ruby, Symbol, TryConvert, Value};
+ use parquet::{arrow::ArrowWriter, errors::ParquetError};
+ use tempfile::NamedTempFile;
+
+ use crate::types::{ListField, MapField, ParquetSchemaType};
+
+ #[derive(Debug)]
+ pub struct SchemaField {
+     pub name: String,
+     pub type_: ParquetSchemaType,
+ }
+
+ #[derive(Debug)]
+ pub struct ParquetWriteArgs {
+     pub read_from: Value,
+     pub write_to: Value,
+     pub schema: Vec<SchemaField>,
+     pub batch_size: Option<usize>,
+ }
+
+ pub trait SendableWrite: Send + Write {}
+ impl<T: Send + Write> SendableWrite for T {}
+
+ pub struct IoLikeValue(pub(crate) Value);
+
+ impl Write for IoLikeValue {
+     fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
+         let ruby_bytes = RString::from_slice(buf);
+
+         let bytes_written = self
+             .0
+             .funcall::<_, _, usize>("write", (ruby_bytes,))
+             .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+         Ok(bytes_written)
+     }
+
+     fn flush(&mut self) -> Result<(), io::Error> {
+         self.0
+             .funcall::<_, _, Value>("flush", ())
+             .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+
+         Ok(())
+     }
+ }
+
+ impl FromStr for ParquetSchemaType {
+     type Err = MagnusError;
+
+     fn from_str(s: &str) -> Result<Self, Self::Err> {
+         match s {
+             "int8" => Ok(ParquetSchemaType::Int8),
+             "int16" => Ok(ParquetSchemaType::Int16),
+             "int32" => Ok(ParquetSchemaType::Int32),
+             "int64" => Ok(ParquetSchemaType::Int64),
+             "uint8" => Ok(ParquetSchemaType::UInt8),
+             "uint16" => Ok(ParquetSchemaType::UInt16),
+             "uint32" => Ok(ParquetSchemaType::UInt32),
+             "uint64" => Ok(ParquetSchemaType::UInt64),
+             "float" | "float32" => Ok(ParquetSchemaType::Float),
+             "double" | "float64" => Ok(ParquetSchemaType::Double),
+             "string" | "utf8" => Ok(ParquetSchemaType::String),
+             "binary" => Ok(ParquetSchemaType::Binary),
+             "boolean" | "bool" => Ok(ParquetSchemaType::Boolean),
+             "date32" => Ok(ParquetSchemaType::Date32),
+             "timestamp_millis" => Ok(ParquetSchemaType::TimestampMillis),
+             "timestamp_micros" => Ok(ParquetSchemaType::TimestampMicros),
+             "list" => Ok(ParquetSchemaType::List(Box::new(ListField {
+                 item_type: ParquetSchemaType::Int8,
+             }))),
+             "map" => Ok(ParquetSchemaType::Map(Box::new(MapField {
+                 key_type: ParquetSchemaType::String,
+                 value_type: ParquetSchemaType::Int8,
+             }))),
+             _ => Err(MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Invalid schema type: {}", s),
+             )),
+         }
+     }
+ }
+
+ impl TryConvert for ParquetSchemaType {
+     fn try_convert(value: Value) -> Result<Self, MagnusError> {
+         let ruby = unsafe { Ruby::get_unchecked() };
+         let schema_type = parse_string_or_symbol(&ruby, value)?;
+
+         schema_type
+             .ok_or_else(|| {
+                 MagnusError::new(magnus::exception::type_error(), "Schema type cannot be nil")
+             })?
+             .parse()
+     }
+ }
+
+ // We know this type is safe to move between threads because it's just an enum
+ // with simple primitive types and strings
+ unsafe impl Send for ParquetSchemaType {}
+
+ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
+     if value.is_nil() {
+         Ok(None)
+     } else if value.is_kind_of(ruby.class_string()) {
+         RString::from_value(value)
+             .ok_or_else(|| {
+                 MagnusError::new(magnus::exception::type_error(), "Invalid string value")
+             })?
+             .to_string()
+             .map(|s| Some(s))
+     } else if value.is_kind_of(ruby.class_symbol()) {
+         Symbol::from_value(value)
+             .ok_or_else(|| {
+                 MagnusError::new(magnus::exception::type_error(), "Invalid symbol value")
+             })?
+             .funcall("to_s", ())
+             .map(|s| Some(s))
+     } else {
+         Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "Value must be a String or Symbol",
+         ))
+     }
+ }
+
+ pub enum WriterOutput {
+     File(ArrowWriter<Box<dyn SendableWrite>>),
+     TempFile(ArrowWriter<Box<dyn SendableWrite>>, NamedTempFile),
+ }
+
+ impl WriterOutput {
+     pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ParquetError> {
+         match self {
+             WriterOutput::File(writer) | WriterOutput::TempFile(writer, _) => writer.write(batch),
+         }
+     }
+
+     pub fn close(self) -> Result<Option<NamedTempFile>, ParquetError> {
+         match self {
+             WriterOutput::File(writer) => {
+                 writer.close()?;
+                 Ok(None)
+             }
+             WriterOutput::TempFile(writer, temp_file) => {
+                 writer.close()?;
+                 Ok(Some(temp_file))
+             }
+         }
+     }
+ }
+
+ pub struct ParquetErrorWrapper(pub ParquetError);
+
+ impl From<ParquetErrorWrapper> for MagnusError {
+     fn from(err: ParquetErrorWrapper) -> Self {
+         MagnusError::new(
+             magnus::exception::runtime_error(),
+             format!("Parquet error: {}", err.0),
+         )
+     }
+ }
+
+ pub struct ColumnCollector {
+     pub name: String,
+     pub type_: ParquetSchemaType,
+     pub values: Vec<crate::types::ParquetValue>,
+ }
+
+ impl ColumnCollector {
+     pub fn new(name: String, type_: ParquetSchemaType) -> Self {
+         Self {
+             name,
+             type_,
+             values: Vec::new(),
+         }
+     }
+
+     pub fn push_value(&mut self, value: Value) -> Result<(), MagnusError> {
+         use crate::types::ParquetValue;
+         use crate::{
+             convert_to_binary, convert_to_boolean, convert_to_date32, convert_to_list,
+             convert_to_map, convert_to_timestamp_micros, convert_to_timestamp_millis,
+             NumericConverter,
+         };
+
+         let parquet_value = match &self.type_ {
+             ParquetSchemaType::Int8 => {
+                 let v = NumericConverter::<i8>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int8(v)
+             }
+             ParquetSchemaType::Int16 => {
+                 let v = NumericConverter::<i16>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int16(v)
+             }
+             ParquetSchemaType::Int32 => {
+                 let v = NumericConverter::<i32>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int32(v)
+             }
+             ParquetSchemaType::Int64 => {
+                 let v = NumericConverter::<i64>::convert_with_string_fallback(value)?;
+                 ParquetValue::Int64(v)
+             }
+             ParquetSchemaType::UInt8 => {
+                 let v = NumericConverter::<u8>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt8(v)
+             }
+             ParquetSchemaType::UInt16 => {
+                 let v = NumericConverter::<u16>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt16(v)
+             }
+             ParquetSchemaType::UInt32 => {
+                 let v = NumericConverter::<u32>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt32(v)
+             }
+             ParquetSchemaType::UInt64 => {
+                 let v = NumericConverter::<u64>::convert_with_string_fallback(value)?;
+                 ParquetValue::UInt64(v)
+             }
+             ParquetSchemaType::Float => {
+                 let v = NumericConverter::<f32>::convert_with_string_fallback(value)?;
+                 ParquetValue::Float32(v)
+             }
+             ParquetSchemaType::Double => {
+                 let v = NumericConverter::<f64>::convert_with_string_fallback(value)?;
+                 ParquetValue::Float64(v)
+             }
+             ParquetSchemaType::String => {
+                 let v = String::try_convert(value)?;
+                 ParquetValue::String(v)
+             }
+             ParquetSchemaType::Binary => {
+                 let v = convert_to_binary(value)?;
+                 ParquetValue::Bytes(v)
+             }
+             ParquetSchemaType::Boolean => {
+                 let v = convert_to_boolean(value)?;
+                 ParquetValue::Boolean(v)
+             }
+             ParquetSchemaType::Date32 => {
+                 let v = convert_to_date32(value)?;
+                 ParquetValue::Date32(v)
+             }
+             ParquetSchemaType::TimestampMillis => {
+                 let v = convert_to_timestamp_millis(value)?;
+                 ParquetValue::TimestampMillis(v, None)
+             }
+             ParquetSchemaType::TimestampMicros => {
+                 let v = convert_to_timestamp_micros(value)?;
+                 ParquetValue::TimestampMicros(v, None)
+             }
+             ParquetSchemaType::List(list_field) => {
+                 let values = convert_to_list(value, list_field)?;
+                 ParquetValue::List(values)
+             }
+             ParquetSchemaType::Map(map_field) => {
+                 let map = convert_to_map(value, map_field)?;
+                 ParquetValue::Map(map)
+             }
+         };
+         self.values.push(parquet_value);
+         Ok(())
+     }
+
+     pub fn take_array(&mut self) -> Result<Arc<dyn Array>, MagnusError> {
+         let values = std::mem::take(&mut self.values);
+         crate::convert_parquet_values_to_arrow(values, &self.type_)
+     }
+ }
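
The FromStr implementation above defines the type names the writer accepts, and parse_parquet_write_args in the next file requires the schema to be an array of single-pair hashes. As a sketch of the Ruby-side declaration this implies (the gem's Ruby wrapper is not part of this diff), each type may be given as a String or Symbol:

schema = [
  { "id" => "int64" },
  { "name" => :string },                  # "utf8" is an accepted alias
  { "score" => "double" },                # "float64" is an accepted alias
  { "created_at" => "timestamp_millis" },
]
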
@@ -0,0 +1,408 @@
+ use std::{
+     fs::File,
+     io::{self, BufReader, BufWriter},
+     sync::Arc,
+ };
+
+ use arrow_array::{Array, RecordBatch};
+ use arrow_schema::{DataType, Field, Schema, TimeUnit};
+ use magnus::{
+     scan_args::{get_kwargs, scan_args},
+     value::ReprValue,
+     Error as MagnusError, RArray, Ruby, TryConvert, Value,
+ };
+ use parquet::arrow::ArrowWriter;
+ use tempfile::NamedTempFile;
+
+ use crate::{
+     convert_ruby_array_to_arrow,
+     types::{ColumnCollector, ParquetErrorWrapper, WriterOutput},
+     IoLikeValue, ParquetSchemaType, ParquetWriteArgs, SchemaField, SendableWrite,
+ };
+
+ const DEFAULT_BATCH_SIZE: usize = 1000;
+
+ /// Parse arguments for Parquet writing
+ pub fn parse_parquet_write_args(args: &[Value]) -> Result<ParquetWriteArgs, MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+     let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
+     let (read_from,) = parsed_args.required;
+
+     let kwargs = get_kwargs::<_, (Value, Value), (Option<usize>,), ()>(
+         parsed_args.keywords,
+         &["schema", "write_to"],
+         &["batch_size"],
+     )?;
+
+     let schema_array = RArray::from_value(kwargs.required.0).ok_or_else(|| {
+         MagnusError::new(
+             magnus::exception::type_error(),
+             "schema must be an array of hashes",
+         )
+     })?;
+
+     let mut schema = Vec::with_capacity(schema_array.len());
+
+     for (idx, field_hash) in schema_array.into_iter().enumerate() {
+         if !field_hash.is_kind_of(ruby.class_hash()) {
+             return Err(MagnusError::new(
+                 magnus::exception::type_error(),
+                 format!("schema[{}] must be a hash", idx),
+             ));
+         }
+
+         let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
+         if entries.len() != 1 {
+             return Err(MagnusError::new(
+                 magnus::exception::type_error(),
+                 format!("schema[{}] must contain exactly one key-value pair", idx),
+             ));
+         }
+
+         let (name, type_str) = &entries[0];
+         let name = String::try_convert(name.clone())?;
+         let type_ = ParquetSchemaType::try_convert(type_str.clone())?;
+
+         schema.push(SchemaField { name, type_ });
+     }
+
+     Ok(ParquetWriteArgs {
+         read_from,
+         write_to: kwargs.required.1,
+         schema,
+         batch_size: kwargs.optional.0,
+     })
+ }
+
+ #[inline]
+ pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size,
+     } = parse_parquet_write_args(args)?;
+
+     let batch_size = batch_size.unwrap_or(DEFAULT_BATCH_SIZE);
+
+     // Convert schema to Arrow schema
+     let arrow_fields: Vec<Field> = schema
+         .iter()
+         .map(|field| {
+             Field::new(
+                 &field.name,
+                 match field.type_ {
+                     ParquetSchemaType::Int8 => DataType::Int8,
+                     ParquetSchemaType::Int16 => DataType::Int16,
+                     ParquetSchemaType::Int32 => DataType::Int32,
+                     ParquetSchemaType::Int64 => DataType::Int64,
+                     ParquetSchemaType::UInt8 => DataType::UInt8,
+                     ParquetSchemaType::UInt16 => DataType::UInt16,
+                     ParquetSchemaType::UInt32 => DataType::UInt32,
+                     ParquetSchemaType::UInt64 => DataType::UInt64,
+                     ParquetSchemaType::Float => DataType::Float32,
+                     ParquetSchemaType::Double => DataType::Float64,
+                     ParquetSchemaType::String => DataType::Utf8,
+                     ParquetSchemaType::Binary => DataType::Binary,
+                     ParquetSchemaType::Boolean => DataType::Boolean,
+                     ParquetSchemaType::Date32 => DataType::Date32,
+                     ParquetSchemaType::TimestampMillis => {
+                         DataType::Timestamp(TimeUnit::Millisecond, None)
+                     }
+                     ParquetSchemaType::TimestampMicros => {
+                         DataType::Timestamp(TimeUnit::Microsecond, None)
+                     }
+                     ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
+                     ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
+                 },
+                 true,
+             )
+         })
+         .collect();
+     let arrow_schema = Arc::new(Schema::new(arrow_fields));
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         // Create collectors for each column
+         let mut column_collectors: Vec<ColumnCollector> = schema
+             .into_iter()
+             .map(|field| ColumnCollector::new(field.name, field.type_))
+             .collect();
+
+         let mut rows_in_batch = 0;
+
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(row) => {
+                     let row_array = RArray::from_value(row).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Row must be an array")
+                     })?;
+
+                     // Validate row length matches schema
+                     if row_array.len() != column_collectors.len() {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 row_array.len(),
+                                 column_collectors.len(),
+                                 column_collectors.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
+                             ),
+                         ));
+                     }
+
+                     // Process each value in the row immediately
+                     for (collector, value) in column_collectors.iter_mut().zip(row_array) {
+                         collector.push_value(value)?;
+                     }
+
+                     rows_in_batch += 1;
+
+                     // When we reach batch size, write the batch
+                     if rows_in_batch >= batch_size {
+                         write_batch(&mut writer, &mut column_collectors)?;
+                         rows_in_batch = 0;
+                     }
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         // Write any remaining rows
+                         if rows_in_batch > 0 {
+                             write_batch(&mut writer, &mut column_collectors)?;
+                         }
+                         break;
+                     }
+                     return Err(e);
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator",
+         ));
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ #[inline]
+ pub fn write_columns(args: &[Value]) -> Result<(), MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     let ParquetWriteArgs {
+         read_from,
+         write_to,
+         schema,
+         batch_size: _, // Batch size is determined by the input
+     } = parse_parquet_write_args(args)?;
+
+     // Convert schema to Arrow schema
+     let arrow_fields: Vec<Field> = schema
+         .iter()
+         .map(|field| {
+             Field::new(
+                 &field.name,
+                 match field.type_ {
+                     ParquetSchemaType::Int8 => DataType::Int8,
+                     ParquetSchemaType::Int16 => DataType::Int16,
+                     ParquetSchemaType::Int32 => DataType::Int32,
+                     ParquetSchemaType::Int64 => DataType::Int64,
+                     ParquetSchemaType::UInt8 => DataType::UInt8,
+                     ParquetSchemaType::UInt16 => DataType::UInt16,
+                     ParquetSchemaType::UInt32 => DataType::UInt32,
+                     ParquetSchemaType::UInt64 => DataType::UInt64,
+                     ParquetSchemaType::Float => DataType::Float32,
+                     ParquetSchemaType::Double => DataType::Float64,
+                     ParquetSchemaType::String => DataType::Utf8,
+                     ParquetSchemaType::Binary => DataType::Binary,
+                     ParquetSchemaType::Boolean => DataType::Boolean,
+                     ParquetSchemaType::Date32 => DataType::Date32,
+                     ParquetSchemaType::TimestampMillis => {
+                         DataType::Timestamp(TimeUnit::Millisecond, None)
+                     }
+                     ParquetSchemaType::TimestampMicros => {
+                         DataType::Timestamp(TimeUnit::Microsecond, None)
+                     }
+                     ParquetSchemaType::List(_) => unimplemented!("List type not yet supported"),
+                     ParquetSchemaType::Map(_) => unimplemented!("Map type not yet supported"),
+                 },
+                 true,
+             )
+         })
+         .collect();
+     let arrow_schema = Arc::new(Schema::new(arrow_fields));
+
+     // Create the writer
+     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone())?;
+
+     if read_from.is_kind_of(ruby.class_enumerator()) {
+         loop {
+             match read_from.funcall::<_, _, Value>("next", ()) {
+                 Ok(batch) => {
+                     let batch_array = RArray::from_value(batch).ok_or_else(|| {
+                         MagnusError::new(ruby.exception_type_error(), "Batch must be an array")
+                     })?;
+
+                     // Validate batch length matches schema
+                     if batch_array.len() != schema.len() {
+                         return Err(MagnusError::new(
+                             magnus::exception::type_error(),
+                             format!(
+                                 "Batch column count ({}) does not match schema length ({}). Schema expects columns: {:?}",
+                                 batch_array.len(),
+                                 schema.len(),
+                                 schema.iter().map(|f| f.name.as_str()).collect::<Vec<_>>()
+                             ),
+                         ));
+                     }
+
+                     // Convert each column in the batch to Arrow arrays
+                     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = schema
+                         .iter()
+                         .zip(batch_array)
+                         .map(|(field, column)| {
+                             let column_array = RArray::from_value(column).ok_or_else(|| {
+                                 MagnusError::new(
+                                     magnus::exception::type_error(),
+                                     format!("Column '{}' must be an array", field.name),
+                                 )
+                             })?;
+
+                             Ok((
+                                 field.name.clone(),
+                                 convert_ruby_array_to_arrow(column_array, &field.type_)?,
+                             ))
+                         })
+                         .collect::<Result<_, MagnusError>>()?;
+
+                     // Create and write record batch
+                     let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
+                         MagnusError::new(
+                             magnus::exception::runtime_error(),
+                             format!("Failed to create record batch: {}", e),
+                         )
+                     })?;
+
+                     writer
+                         .write(&record_batch)
+                         .map_err(|e| ParquetErrorWrapper(e))?;
+                 }
+                 Err(e) => {
+                     if e.is_kind_of(ruby.exception_stop_iteration()) {
+                         break;
+                     }
+                     return Err(e);
+                 }
+             }
+         }
+     } else {
+         return Err(MagnusError::new(
+             magnus::exception::type_error(),
+             "read_from must be an Enumerator",
+         ));
+     }
+
+     // Ensure everything is written and get the temp file if it exists
+     if let Some(temp_file) = writer.close().map_err(|e| ParquetErrorWrapper(e))? {
+         // If we got a temp file back, we need to copy its contents to the IO-like object
+         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
+     }
+
+     Ok(())
+ }
+
+ fn create_writer(
+     ruby: &Ruby,
+     write_to: &Value,
+     schema: Arc<Schema>,
+ ) -> Result<WriterOutput, MagnusError> {
+     if write_to.is_kind_of(ruby.class_string()) {
+         let path = write_to.to_r_string()?.to_string()?;
+         let file: Box<dyn SendableWrite> = Box::new(File::create(path).map_err(|e| {
+             MagnusError::new(
+                 magnus::exception::runtime_error(),
+                 format!("Failed to create file: {}", e),
+             )
+         })?);
333
+ let writer =
334
+ ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
335
+ Ok(WriterOutput::File(writer))
336
+ } else {
337
+ // Create a temporary file to write to instead of directly to the IoLikeValue
338
+ let temp_file = NamedTempFile::new().map_err(|e| {
339
+ MagnusError::new(
340
+ magnus::exception::runtime_error(),
341
+ format!("Failed to create temporary file: {}", e),
342
+ )
343
+ })?;
344
+ let file: Box<dyn SendableWrite> = Box::new(temp_file.reopen().map_err(|e| {
345
+ MagnusError::new(
346
+ magnus::exception::runtime_error(),
347
+ format!("Failed to reopen temporary file: {}", e),
348
+ )
349
+ })?);
350
+ let writer =
351
+ ArrowWriter::try_new(file, schema, None).map_err(|e| ParquetErrorWrapper(e))?;
352
+ Ok(WriterOutput::TempFile(writer, temp_file))
353
+ }
354
+ }
355
+
356
+ // Helper function to copy temp file contents to IoLikeValue
357
+ fn copy_temp_file_to_io_like(
358
+ temp_file: NamedTempFile,
359
+ io_like: IoLikeValue,
360
+ ) -> Result<(), MagnusError> {
361
+ let file = temp_file.reopen().map_err(|e| {
362
+ MagnusError::new(
363
+ magnus::exception::runtime_error(),
364
+ format!("Failed to reopen temporary file: {}", e),
365
+ )
366
+ })?;
367
+ let mut buf_reader = BufReader::new(file);
368
+ let mut buf_writer = BufWriter::new(io_like);
369
+
370
+ io::copy(&mut buf_reader, &mut buf_writer).map_err(|e| {
371
+ MagnusError::new(
372
+ magnus::exception::runtime_error(),
373
+ format!("Failed to copy temp file to io_like: {}", e),
374
+ )
375
+ })?;
376
+
377
+ Ok(())
378
+ }
379
+
380
+ fn write_batch(
381
+ writer: &mut WriterOutput,
382
+ collectors: &mut [ColumnCollector],
383
+ ) -> Result<(), MagnusError> {
384
+ // Convert columns to Arrow arrays
385
+ let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
386
+ .iter_mut()
387
+ .map(|collector| Ok((collector.name.clone(), collector.take_array()?)))
388
+ .collect::<Result<_, MagnusError>>()?;
389
+
390
+ // Create and write record batch
391
+ let record_batch = RecordBatch::try_from_iter(arrow_arrays).map_err(|e| {
392
+ MagnusError::new(
393
+ magnus::exception::runtime_error(),
394
+ format!("Failed to create record batch: {}", e),
395
+ )
396
+ })?;
397
+
398
+ writer
399
+ .write(&record_batch)
400
+ .map_err(|e| ParquetErrorWrapper(e))?;
401
+
402
+ Ok(())
403
+ }
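
Taken together, write_rows and write_columns consume a Ruby Enumerator and stream Arrow record batches to either a file path or any IO-like object that responds to write and flush (handled through IoLikeValue and a temporary file that is copied over on close). A hypothetical usage sketch, assuming the extension registers these functions as Parquet.write_rows and Parquet.write_columns (the Ruby bindings are not shown in this diff):

require "stringio"

schema = [{ "id" => "int64" }, { "name" => "string" }]

# write_rows: one Array per row, in schema order. Rows are buffered into
# column collectors and flushed every batch_size rows (default 1000).
rows = [[1, "alice"], [2, "bob"]].each
Parquet.write_rows(rows, schema: schema, write_to: "users.parquet", batch_size: 500)

# write_columns: each item is one batch, an Array of column Arrays; every
# batch becomes one Arrow record batch, so batch size is set by the input.
batches = [[[1, 2], ["alice", "bob"]]].each
io = StringIO.new
Parquet.write_columns(batches, schema: schema, write_to: io)
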
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.0.5"
+   VERSION = "0.2.5"
  end