parquet 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,349 @@
1
+ use magnus::value::ReprValue; // Add ReprValue trait to scope
2
+ use magnus::{Error as MagnusError, RArray, Ruby, TryConvert, Value};
3
+
4
+ use crate::types::{ParquetSchemaType as PST, PrimitiveType, SchemaField, SchemaNode};
5
+ use crate::utils::parse_string_or_symbol;
6
+
7
+ /// Recursively converts a SchemaField to a SchemaNode for any level of nesting
8
+ fn convert_schema_field_to_node(field: &SchemaField) -> SchemaNode {
9
+ match &field.type_ {
10
+ PST::Int8 => SchemaNode::Primitive {
11
+ name: field.name.clone(),
12
+ nullable: field.nullable,
13
+ parquet_type: PrimitiveType::Int8,
14
+ format: field.format.clone(),
15
+ },
16
+ PST::Int16 => SchemaNode::Primitive {
17
+ name: field.name.clone(),
18
+ nullable: field.nullable,
19
+ parquet_type: PrimitiveType::Int16,
20
+ format: field.format.clone(),
21
+ },
22
+ PST::Int32 => SchemaNode::Primitive {
23
+ name: field.name.clone(),
24
+ nullable: field.nullable,
25
+ parquet_type: PrimitiveType::Int32,
26
+ format: field.format.clone(),
27
+ },
28
+ PST::Int64 => SchemaNode::Primitive {
29
+ name: field.name.clone(),
30
+ nullable: field.nullable,
31
+ parquet_type: PrimitiveType::Int64,
32
+ format: field.format.clone(),
33
+ },
34
+ PST::UInt8 => SchemaNode::Primitive {
35
+ name: field.name.clone(),
36
+ nullable: field.nullable,
37
+ parquet_type: PrimitiveType::UInt8,
38
+ format: field.format.clone(),
39
+ },
40
+ PST::UInt16 => SchemaNode::Primitive {
41
+ name: field.name.clone(),
42
+ nullable: field.nullable,
43
+ parquet_type: PrimitiveType::UInt16,
44
+ format: field.format.clone(),
45
+ },
46
+ PST::UInt32 => SchemaNode::Primitive {
47
+ name: field.name.clone(),
48
+ nullable: field.nullable,
49
+ parquet_type: PrimitiveType::UInt32,
50
+ format: field.format.clone(),
51
+ },
52
+ PST::UInt64 => SchemaNode::Primitive {
53
+ name: field.name.clone(),
54
+ nullable: field.nullable,
55
+ parquet_type: PrimitiveType::UInt64,
56
+ format: field.format.clone(),
57
+ },
58
+ PST::Float => SchemaNode::Primitive {
59
+ name: field.name.clone(),
60
+ nullable: field.nullable,
61
+ parquet_type: PrimitiveType::Float32,
62
+ format: field.format.clone(),
63
+ },
64
+ PST::Double => SchemaNode::Primitive {
65
+ name: field.name.clone(),
66
+ nullable: field.nullable,
67
+ parquet_type: PrimitiveType::Float64,
68
+ format: field.format.clone(),
69
+ },
70
+ PST::String => SchemaNode::Primitive {
71
+ name: field.name.clone(),
72
+ nullable: field.nullable,
73
+ parquet_type: PrimitiveType::String,
74
+ format: field.format.clone(),
75
+ },
76
+ PST::Binary => SchemaNode::Primitive {
77
+ name: field.name.clone(),
78
+ nullable: field.nullable,
79
+ parquet_type: PrimitiveType::Binary,
80
+ format: field.format.clone(),
81
+ },
82
+ PST::Boolean => SchemaNode::Primitive {
83
+ name: field.name.clone(),
84
+ nullable: field.nullable,
85
+ parquet_type: PrimitiveType::Boolean,
86
+ format: field.format.clone(),
87
+ },
88
+ PST::Date32 => SchemaNode::Primitive {
89
+ name: field.name.clone(),
90
+ nullable: field.nullable,
91
+ parquet_type: PrimitiveType::Date32,
92
+ format: field.format.clone(),
93
+ },
94
+ PST::TimestampMillis => SchemaNode::Primitive {
95
+ name: field.name.clone(),
96
+ nullable: field.nullable,
97
+ parquet_type: PrimitiveType::TimestampMillis,
98
+ format: field.format.clone(),
99
+ },
100
+ PST::TimestampMicros => SchemaNode::Primitive {
101
+ name: field.name.clone(),
102
+ nullable: field.nullable,
103
+ parquet_type: PrimitiveType::TimestampMicros,
104
+ format: field.format.clone(),
105
+ },
106
+ PST::List(list_field) => {
107
+ // Create item node by recursively converting the list item type to a node
108
+ let item_node = match &list_field.item_type {
109
+ // For primitive types, create a primitive node with name "item"
110
+ PST::Int8
111
+ | PST::Int16
112
+ | PST::Int32
113
+ | PST::Int64
114
+ | PST::UInt8
115
+ | PST::UInt16
116
+ | PST::UInt32
117
+ | PST::UInt64
118
+ | PST::Float
119
+ | PST::Double
120
+ | PST::String
121
+ | PST::Binary
122
+ | PST::Boolean
123
+ | PST::Date32
124
+ | PST::TimestampMillis
125
+ | PST::TimestampMicros => {
126
+ // Use a temporary SchemaField to convert item type
127
+ let item_field = SchemaField {
128
+ name: "item".to_string(),
129
+ type_: list_field.item_type.clone(),
130
+ format: list_field.format.clone().map(String::from),
131
+ nullable: list_field.nullable,
132
+ };
133
+ convert_schema_field_to_node(&item_field)
134
+ }
135
+ // For nested types (List, Map, Struct), recursively convert them
136
+ PST::List(_) | PST::Map(_) | PST::Struct(_) => {
137
+ // Use a temporary SchemaField to convert item type
138
+ let item_field = SchemaField {
139
+ name: "item".to_string(),
140
+ type_: list_field.item_type.clone(),
141
+ format: list_field.format.clone().map(String::from),
142
+ nullable: list_field.nullable,
143
+ };
144
+ convert_schema_field_to_node(&item_field)
145
+ }
146
+ };
147
+
148
+ SchemaNode::List {
149
+ name: field.name.clone(),
150
+ nullable: field.nullable,
151
+ item: Box::new(item_node),
152
+ }
153
+ }
154
+ PST::Map(map_field) => {
155
+ let key_field = SchemaField {
156
+ name: "key".to_string(),
157
+ type_: map_field.key_type.clone(),
158
+ format: map_field.key_format.clone().map(String::from),
159
+ nullable: false, // Map keys can never be null in Parquet
160
+ };
161
+ let value_field = SchemaField {
162
+ name: "value".to_string(),
163
+ type_: map_field.value_type.clone(),
164
+ format: map_field.value_format.clone().map(String::from),
165
+ nullable: map_field.value_nullable,
166
+ };
167
+
168
+ let key_node = convert_schema_field_to_node(&key_field);
169
+ let value_node = convert_schema_field_to_node(&value_field);
170
+
171
+ SchemaNode::Map {
172
+ name: field.name.clone(),
173
+ nullable: field.nullable,
174
+ key: Box::new(key_node),
175
+ value: Box::new(value_node),
176
+ }
177
+ }
178
+ PST::Struct(struct_field) => {
179
+ // Convert each subfield recursively
180
+ let mut field_nodes = Vec::with_capacity(struct_field.fields.len());
181
+
182
+ for subfield in struct_field.fields.iter() {
183
+ // Recursively convert each subfield, supporting any level of nesting
184
+ field_nodes.push(convert_schema_field_to_node(subfield));
185
+ }
186
+
187
+ SchemaNode::Struct {
188
+ name: field.name.clone(),
189
+ nullable: field.nullable,
190
+ fields: field_nodes,
191
+ }
192
+ }
193
+ }
194
+ }
195
+
196
+ /// Converts the legacy schema format (array of field hashes) to the new DSL format (SchemaNode)
197
+ pub fn legacy_schema_to_dsl(
198
+ _ruby: &Ruby,
199
+ schema_fields: Vec<SchemaField>,
200
+ ) -> Result<SchemaNode, MagnusError> {
201
+ // Create a top-level struct node with fields for each schema field
202
+ let mut field_nodes = Vec::with_capacity(schema_fields.len());
203
+
204
+ for field in schema_fields {
205
+ // Use our recursive converter to handle any level of nesting
206
+ field_nodes.push(convert_schema_field_to_node(&field));
207
+ }
208
+
209
+ Ok(SchemaNode::Struct {
210
+ name: "".to_string(), // Top level has no name
211
+ nullable: false, // Top level is not nullable
212
+ fields: field_nodes,
213
+ })
214
+ }
215
+
216
+ /// Parses the legacy format schema (array of field hashes)
217
+ pub fn parse_legacy_schema(
218
+ ruby: &Ruby,
219
+ schema_value: Value,
220
+ ) -> Result<Vec<SchemaField>, MagnusError> {
221
+ if schema_value.is_nil()
222
+ || (schema_value.is_kind_of(ruby.class_array())
223
+ && RArray::from_value(schema_value)
224
+ .ok_or_else(|| {
225
+ MagnusError::new(
226
+ ruby.exception_type_error(),
227
+ "Schema must be an array of field definitions or nil",
228
+ )
229
+ })?
230
+ .len()
231
+ == 0)
232
+ {
233
+ // If schema is nil or an empty array, we'll handle this in the caller
234
+ return Ok(Vec::new());
235
+ }
236
+
237
+ if schema_value.is_kind_of(ruby.class_array()) {
238
+ let schema_array = RArray::from_value(schema_value).ok_or_else(|| {
239
+ MagnusError::new(
240
+ ruby.exception_type_error(),
241
+ "Schema must be an array of field definitions or nil",
242
+ )
243
+ })?;
244
+ let mut schema = Vec::with_capacity(schema_array.len());
245
+
246
+ for (idx, field_hash) in schema_array.into_iter().enumerate() {
247
+ if !field_hash.is_kind_of(ruby.class_hash()) {
248
+ return Err(MagnusError::new(
249
+ ruby.exception_type_error(),
250
+ format!("schema[{}] must be a hash", idx),
251
+ ));
252
+ }
253
+
254
+ let entries: Vec<(Value, Value)> = field_hash.funcall("to_a", ())?;
255
+ if entries.len() != 1 {
256
+ return Err(MagnusError::new(
257
+ ruby.exception_type_error(),
258
+ format!("schema[{}] must contain exactly one key-value pair", idx),
259
+ ));
260
+ }
261
+
262
+ let (name, type_value) = &entries[0];
263
+ let name_option = parse_string_or_symbol(ruby, name.clone())?;
264
+ let name = name_option.ok_or_else(|| {
265
+ MagnusError::new(ruby.exception_runtime_error(), "Field name cannot be nil")
266
+ })?;
267
+
268
+ let (type_, format, nullable) = if type_value.is_kind_of(ruby.class_hash()) {
269
+ let type_hash: Vec<(Value, Value)> = type_value.funcall("to_a", ())?;
270
+ let mut type_str = None;
271
+ let mut format_str = None;
272
+ let mut nullable = true; // Default to true if not specified
273
+
274
+ for (key, value) in type_hash {
275
+ let key_option = parse_string_or_symbol(ruby, key)?;
276
+ let key = key_option.ok_or_else(|| {
277
+ MagnusError::new(ruby.exception_runtime_error(), "Type key cannot be nil")
278
+ })?;
279
+ match key.as_str() {
280
+ "type" => type_str = Some(value),
281
+ "format" => {
282
+ let format_option = parse_string_or_symbol(ruby, value)?;
283
+ format_str = format_option;
284
+ }
285
+ "nullable" => {
286
+ // Extract nullable if present - convert to boolean
287
+ nullable = bool::try_convert(value).unwrap_or(true);
288
+ }
289
+ _ => {
290
+ return Err(MagnusError::new(
291
+ ruby.exception_type_error(),
292
+ format!("Unknown key '{}' in type definition", key),
293
+ ))
294
+ }
295
+ }
296
+ }
297
+
298
+ let type_str = type_str.ok_or_else(|| {
299
+ MagnusError::new(
300
+ ruby.exception_type_error(),
301
+ "Missing 'type' in type definition",
302
+ )
303
+ })?;
304
+
305
+ (PST::try_convert(type_str)?, format_str, nullable)
306
+ } else {
307
+ (PST::try_convert(type_value.clone())?, None, true)
308
+ };
309
+
310
+ schema.push(SchemaField {
311
+ name,
312
+ type_,
313
+ format,
314
+ nullable,
315
+ });
316
+ }
317
+
318
+ Ok(schema)
319
+ } else {
320
+ Err(MagnusError::new(
321
+ ruby.exception_type_error(),
322
+ "Schema must be an array of field definitions or nil",
323
+ ))
324
+ }
325
+ }
326
+
327
+ /// Generates schema fields by inferring from the first row
328
+ pub fn infer_schema_from_first_row(
329
+ ruby: &Ruby,
330
+ first_value: Value,
331
+ nullable: bool,
332
+ ) -> Result<Vec<SchemaField>, MagnusError> {
333
+ let array = RArray::from_value(first_value).ok_or_else(|| {
334
+ MagnusError::new(
335
+ ruby.exception_type_error(),
336
+ "First value must be an array when schema is not provided",
337
+ )
338
+ })?;
339
+
340
+ // Generate field names f0, f1, f2, etc.
341
+ Ok((0..array.len())
342
+ .map(|i| SchemaField {
343
+ name: format!("f{}", i),
344
+ type_: PST::String, // Default to String type when inferring
345
+ format: None,
346
+ nullable,
347
+ })
348
+ .collect())
349
+ }
@@ -0,0 +1,329 @@
1
+ use std::sync::Arc;
2
+
3
+ use arrow_schema::{
4
+ DataType as ArrowDataType, Field as ArrowField, Fields as ArrowFields, Schema as ArrowSchema,
5
+ };
6
+ use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
7
+
8
+ use crate::logger::RubyLogger;
9
+ use crate::types::{PrimitiveType, SchemaNode};
10
+ use crate::utils::parse_string_or_symbol;
11
+
12
/// No-op placeholder; it takes no arguments and does nothing.
///
/// The real schema construction is done by `build_arrow_schema` further down
/// in this file. The leading underscore keeps the unused-function lint quiet.
// NOTE(review): nothing in the visible part of this file calls this function;
// confirm it is unused elsewhere before removing it.
fn _build_arrow_schema_placeholder() {
    // Intentionally empty.
}
15
+
16
+ /// Helper to extract common fields from a schema node hash
17
+ fn extract_common_fields(
18
+ ruby: &Ruby,
19
+ node_hash: &RHash,
20
+ ) -> Result<(String, bool, Option<String>), MagnusError> {
21
+ // extract `name:` if present, else default
22
+ let name_val = node_hash.get(Symbol::new("name"));
23
+ let name: String = if let Some(v) = name_val {
24
+ let name_option = parse_string_or_symbol(ruby, v)?;
25
+ name_option.unwrap_or_else(|| "".to_string())
26
+ } else {
27
+ "".to_string() // top-level might omit name
28
+ };
29
+
30
+ // extract `nullable:`
31
+ let nullable_val = node_hash.get(Symbol::new("nullable"));
32
+ let nullable: bool = if let Some(v) = nullable_val {
33
+ bool::try_convert(v).unwrap_or(true)
34
+ } else {
35
+ true // default to nullable
36
+ };
37
+
38
+ // optional `format:`
39
+ let format_val = node_hash.get(Symbol::new("format"));
40
+ let format: Option<String> = if let Some(v) = format_val {
41
+ parse_string_or_symbol(ruby, v)?
42
+ } else {
43
+ None
44
+ };
45
+
46
+ Ok((name, nullable, format))
47
+ }
48
+
49
+ /// Parse a struct schema node
50
+ fn parse_struct_node(
51
+ ruby: &Ruby,
52
+ node_hash: &RHash,
53
+ name: String,
54
+ nullable: bool,
55
+ ) -> Result<SchemaNode, MagnusError> {
56
+ // parse subfields array from `fields`
57
+ let fields_val = node_hash.get(Symbol::new("fields")).ok_or_else(|| {
58
+ MagnusError::new(
59
+ ruby.exception_arg_error(),
60
+ "Struct must have :fields array defined",
61
+ )
62
+ })?;
63
+ let fields_arr: RArray = RArray::try_convert(fields_val).map_err(|_| {
64
+ MagnusError::new(
65
+ ruby.exception_type_error(),
66
+ "The :fields value must be an array",
67
+ )
68
+ })?;
69
+
70
+ // Check for empty struct immediately
71
+ if fields_arr.len() == 0 {
72
+ return Err(MagnusError::new(
73
+ ruby.exception_arg_error(),
74
+ format!("Cannot create a struct with zero fields. Struct name: '{}'. Parquet doesn't support empty structs", name)
75
+ ));
76
+ }
77
+
78
+ let mut fields = Vec::with_capacity(fields_arr.len());
79
+ for item in fields_arr.into_iter() {
80
+ fields.push(parse_schema_node(ruby, item)?);
81
+ }
82
+
83
+ Ok(SchemaNode::Struct {
84
+ name,
85
+ nullable,
86
+ fields,
87
+ })
88
+ }
89
+
90
+ /// Parse a list schema node
91
+ fn parse_list_node(
92
+ ruby: &Ruby,
93
+ node_hash: &RHash,
94
+ name: String,
95
+ nullable: bool,
96
+ ) -> Result<SchemaNode, MagnusError> {
97
+ // parse `item`
98
+ let item_val = node_hash.get(Symbol::new("item")).ok_or_else(|| {
99
+ MagnusError::new(
100
+ ruby.exception_arg_error(),
101
+ "List type must have :item field defined",
102
+ )
103
+ })?;
104
+ let item_node = parse_schema_node(ruby, item_val)?;
105
+
106
+ Ok(SchemaNode::List {
107
+ name,
108
+ nullable,
109
+ item: Box::new(item_node),
110
+ })
111
+ }
112
+
113
+ /// Parse a map schema node
114
+ fn parse_map_node(
115
+ ruby: &Ruby,
116
+ node_hash: &RHash,
117
+ name: String,
118
+ nullable: bool,
119
+ ) -> Result<SchemaNode, MagnusError> {
120
+ // parse `key` and `value`
121
+ let key_val = node_hash.get(Symbol::new("key")).ok_or_else(|| {
122
+ MagnusError::new(
123
+ ruby.exception_arg_error(),
124
+ "Map type must have :key field defined",
125
+ )
126
+ })?;
127
+ let value_val = node_hash.get(Symbol::new("value")).ok_or_else(|| {
128
+ MagnusError::new(
129
+ ruby.exception_arg_error(),
130
+ "Map type must have :value field defined",
131
+ )
132
+ })?;
133
+
134
+ let key_node = parse_schema_node(ruby, key_val)?;
135
+ let value_node = parse_schema_node(ruby, value_val)?;
136
+
137
+ Ok(SchemaNode::Map {
138
+ name,
139
+ nullable,
140
+ key: Box::new(key_node),
141
+ value: Box::new(value_node),
142
+ })
143
+ }
144
+
145
+ /// Parse a Ruby schema hash into a SchemaNode tree
146
+ pub fn parse_schema_node(ruby: &Ruby, node_value: Value) -> Result<SchemaNode, MagnusError> {
147
+ // The node_value should be a Ruby Hash with keys: :name, :type, :nullable, etc.
148
+ let node_hash = RHash::from_value(node_value).ok_or_else(|| {
149
+ MagnusError::new(
150
+ ruby.exception_type_error(),
151
+ "Schema node must be a Hash with :type and other fields",
152
+ )
153
+ })?;
154
+
155
+ // extract `type:` which is a symbol/string
156
+ let type_val = node_hash.get(Symbol::new("type")).ok_or_else(|| {
157
+ MagnusError::new(
158
+ ruby.exception_arg_error(),
159
+ "Missing required :type field in schema node",
160
+ )
161
+ })?;
162
+ let type_str_option = parse_string_or_symbol(ruby, type_val)?;
163
+ let type_str = type_str_option.ok_or_else(|| {
164
+ MagnusError::new(
165
+ ruby.exception_arg_error(),
166
+ "Type cannot be nil - please specify a valid type string or symbol",
167
+ )
168
+ })?;
169
+
170
+ // Extract common fields (name, nullable, format)
171
+ let (name, nullable, format) = extract_common_fields(ruby, &node_hash)?;
172
+
173
+ // Delegate to type-specific parsers with clear error handling
174
+ match type_str.as_str() {
175
+ "struct" => parse_struct_node(ruby, &node_hash, name, nullable),
176
+ "list" => parse_list_node(ruby, &node_hash, name, nullable),
177
+ "map" => parse_map_node(ruby, &node_hash, name, nullable),
178
+ // For primitives, provide better error messages when type isn't recognized
179
+ other => {
180
+ if let Some(parquet_type) = parse_primitive_type(other) {
181
+ Ok(SchemaNode::Primitive {
182
+ name,
183
+ parquet_type,
184
+ nullable,
185
+ format,
186
+ })
187
+ } else {
188
+ Err(MagnusError::new(
189
+ magnus::exception::arg_error(),
190
+ format!(
191
+ "Unknown type: '{}'. Supported types are: struct, list, map, int8, int16, int32, int64, uint8, uint16, uint32, uint64, float32, float64, boolean, string, binary, date32, timestamp_millis, timestamp_micros",
192
+ other
193
+ )
194
+ ))
195
+ }
196
+ }
197
+ }
198
+ }
199
+
200
+ /// Convert a type string like "int32" to a PrimitiveType
201
+ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
202
+ match s.to_lowercase().as_str() {
203
+ "int8" | "i8" => Some(PrimitiveType::Int8),
204
+ "int16" | "i16" => Some(PrimitiveType::Int16),
205
+ "int32" | "i32" | "int" => Some(PrimitiveType::Int32),
206
+ "int64" | "i64" | "long" | "bigint" => Some(PrimitiveType::Int64),
207
+ "uint8" | "u8" | "byte" => Some(PrimitiveType::UInt8),
208
+ "uint16" | "u16" => Some(PrimitiveType::UInt16),
209
+ "uint32" | "u32" | "uint" => Some(PrimitiveType::UInt32),
210
+ "uint64" | "u64" | "ulong" => Some(PrimitiveType::UInt64),
211
+ "float" | "float32" | "f32" => Some(PrimitiveType::Float32),
212
+ "double" | "float64" | "f64" => Some(PrimitiveType::Float64),
213
+ "bool" | "boolean" => Some(PrimitiveType::Boolean),
214
+ "string" | "utf8" | "str" | "text" => Some(PrimitiveType::String),
215
+ "binary" | "bytes" | "blob" => Some(PrimitiveType::Binary),
216
+ "date" | "date32" => Some(PrimitiveType::Date32),
217
+ "timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
218
+ "timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
219
+ _ => None,
220
+ }
221
+ }
222
+
223
+ /// Convert a SchemaNode to an Arrow field
224
+ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
225
+ match node {
226
+ SchemaNode::Primitive {
227
+ name,
228
+ parquet_type,
229
+ nullable,
230
+ format: _,
231
+ } => {
232
+ let dt = match parquet_type {
233
+ PrimitiveType::Int8 => ArrowDataType::Int8,
234
+ PrimitiveType::Int16 => ArrowDataType::Int16,
235
+ PrimitiveType::Int32 => ArrowDataType::Int32,
236
+ PrimitiveType::Int64 => ArrowDataType::Int64,
237
+ PrimitiveType::UInt8 => ArrowDataType::UInt8,
238
+ PrimitiveType::UInt16 => ArrowDataType::UInt16,
239
+ PrimitiveType::UInt32 => ArrowDataType::UInt32,
240
+ PrimitiveType::UInt64 => ArrowDataType::UInt64,
241
+ PrimitiveType::Float32 => ArrowDataType::Float32,
242
+ PrimitiveType::Float64 => ArrowDataType::Float64,
243
+ PrimitiveType::Boolean => ArrowDataType::Boolean,
244
+ PrimitiveType::String => ArrowDataType::Utf8,
245
+ PrimitiveType::Binary => ArrowDataType::Binary,
246
+ PrimitiveType::Date32 => ArrowDataType::Date32,
247
+ PrimitiveType::TimestampMillis => {
248
+ ArrowDataType::Timestamp(arrow_schema::TimeUnit::Millisecond, None)
249
+ }
250
+ PrimitiveType::TimestampMicros => {
251
+ ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
252
+ }
253
+ };
254
+ ArrowField::new(name, dt, *nullable)
255
+ }
256
+
257
+ SchemaNode::List {
258
+ name,
259
+ nullable,
260
+ item,
261
+ } => {
262
+ let child_field = schema_node_to_arrow_field(item);
263
+ let list_type = ArrowDataType::List(Arc::new(child_field));
264
+ ArrowField::new(name, list_type, *nullable)
265
+ }
266
+
267
+ SchemaNode::Map {
268
+ name,
269
+ nullable,
270
+ key,
271
+ value,
272
+ } => {
273
+ // A Map is basically: Map( Struct([key, value]), keysSorted=false )
274
+ let key_field = schema_node_to_arrow_field(key);
275
+ let value_field = schema_node_to_arrow_field(value);
276
+
277
+ let entries_struct = ArrowDataType::Struct(ArrowFields::from(vec![
278
+ ArrowField::new("key", key_field.data_type().clone(), false),
279
+ ArrowField::new(
280
+ "value",
281
+ value_field.data_type().clone(),
282
+ value_field.is_nullable(),
283
+ ),
284
+ ]));
285
+
286
+ let map_data_type = ArrowDataType::Map(
287
+ Arc::new(ArrowField::new("entries", entries_struct, false)),
288
+ false, // not sorted
289
+ );
290
+ ArrowField::new(name, map_data_type, *nullable)
291
+ }
292
+
293
+ SchemaNode::Struct {
294
+ name,
295
+ nullable,
296
+ fields,
297
+ } => {
298
+ // Field validation happens earlier - no empty structs allowed
299
+ let mut arrow_subfields = Vec::with_capacity(fields.len());
300
+ for f in fields {
301
+ arrow_subfields.push(schema_node_to_arrow_field(f));
302
+ }
303
+ let struct_type = ArrowDataType::Struct(ArrowFields::from(arrow_subfields));
304
+ ArrowField::new(name, struct_type, *nullable)
305
+ }
306
+ }
307
+ }
308
+
309
+ /// Build an Arrow schema from the top-level Node, which must be a Struct
310
+ pub fn build_arrow_schema(
311
+ root: &SchemaNode,
312
+ logger: &RubyLogger,
313
+ ) -> Result<Arc<ArrowSchema>, MagnusError> {
314
+ match root {
315
+ SchemaNode::Struct { fields, .. } => {
316
+ // Fields debug output removed - we've fixed the empty struct issue
317
+
318
+ let arrow_fields: Vec<ArrowField> =
319
+ fields.iter().map(schema_node_to_arrow_field).collect();
320
+ let arrow_schema = ArrowSchema::new(arrow_fields);
321
+ logger.debug(|| format!("Constructed Arrow schema: {:?}", arrow_schema))?;
322
+ Ok(Arc::new(arrow_schema))
323
+ }
324
+ _ => Err(MagnusError::new(
325
+ magnus::exception::arg_error(),
326
+ "Top-level schema must be a Struct".to_owned(),
327
+ )),
328
+ }
329
+ }