parquet 0.5.13 → 0.6.0
This diff shows the changes between publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +3 -0
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -605
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
- data/ext/parquet-ruby-adapter/src/schema.rs
@@ -0,0 +1,810 @@
+use magnus::value::ReprValue;
+use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
+use parquet_core::{ParquetError, PrimitiveType, Result, Schema, SchemaNode};
+
+/// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
+pub struct RubySchemaBuilder;
+
+impl RubySchemaBuilder {
+    pub fn new() -> Self {
+        Self
+    }
+
+    /// Parse a Ruby schema definition (hash) into a SchemaNode
+    fn parse_schema_node(&self, name: String, schema_def: Value) -> Result<SchemaNode> {
+        // If it's a Hash, parse it as a complex type
+        if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
+            return self.parse_hash_schema_node(name, hash);
+        }
+
+        // Otherwise, try to parse as a simple type symbol
+        if let Ok(type_sym) = <Symbol as TryConvert>::try_convert(schema_def) {
+            let type_str = type_sym.name().map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
+            })?;
+
+            // Check if it's a complex type with angle brackets
+            if type_str.contains('<') {
+                return self.parse_complex_type_string(name, type_str.to_string(), true);
+            }
+
+            let primitive_type =
+                self.parse_primitive_type(type_str.to_string(), None, None, None)?;
+            return Ok(SchemaNode::Primitive {
+                name,
+                primitive_type,
+                nullable: true, // Default to nullable for simple types
+                format: None,
+            });
+        }
+
+        Err(ParquetError::Schema(format!(
+            "Expected Hash or Symbol for schema definition, got {}",
+            schema_def.class()
+        )))
+    }
+
+    /// Parse a Ruby hash schema node
+    fn parse_hash_schema_node(&self, name: String, hash: RHash) -> Result<SchemaNode> {
+        // Get the type field
+        let type_sym: Symbol = hash
+            .fetch::<_, Symbol>(Symbol::new("type"))
+            .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
+
+        let type_str = type_sym.name().map_err(|e: MagnusError| {
+            ParquetError::Conversion(format!("Failed to get type name: {}", e))
+        })?;
+
+        // Get nullable field (default to true)
+        let nullable = hash
+            .fetch::<_, Value>(Symbol::new("nullable"))
+            .ok()
+            .and_then(|v| <bool as TryConvert>::try_convert(v).ok())
+            .unwrap_or(true);
+
+        // Get format field if present
+        let format = hash
+            .fetch::<_, Value>(Symbol::new("format"))
+            .ok()
+            .and_then(|v| <String as TryConvert>::try_convert(v).ok());
+
+        match type_str.to_string().as_str() {
+            "struct" => {
+                let fields_array: RArray = hash
+                    .fetch(Symbol::new("fields"))
+                    .map_err(|e| ParquetError::Schema(format!("Struct missing 'fields': {}", e)))?;
+
+                let mut fields = Vec::new();
+                for field_value in fields_array.into_iter() {
+                    let field_hash: RHash = <RHash as TryConvert>::try_convert(field_value)
+                        .map_err(|e: MagnusError| {
+                            ParquetError::Schema(format!("Invalid field definition: {}", e))
+                        })?;
+
+                    let _field_name: String =
+                        field_hash.fetch(Symbol::new("name")).map_err(|e| {
+                            ParquetError::Schema(format!("Field missing 'name': {}", e))
+                        })?;
+
+                    let field_node = self.parse_field_definition(field_hash)?;
+                    fields.push(field_node);
+                }
+
+                Ok(SchemaNode::Struct {
+                    name,
+                    nullable,
+                    fields,
+                })
+            }
+
+            "list" => {
+                let item_def = hash
+                    .fetch::<_, Value>(Symbol::new("item"))
+                    .map_err(|e| ParquetError::Schema(format!("List missing 'item': {}", e)))?;
+
+                let item_name = format!("{}_item", name);
+                let item_node = self.parse_schema_node(item_name, item_def)?;
+
+                Ok(SchemaNode::List {
+                    name,
+                    nullable,
+                    item: Box::new(item_node),
+                })
+            }
+
+            "map" => {
+                // Parse key definition
+                let key_def = hash
+                    .fetch::<_, Value>(Symbol::new("key"))
+                    .map_err(|e| ParquetError::Schema(format!("Map missing 'key': {}", e)))?;
+                let key_node = self.parse_schema_node("key".to_string(), key_def)?;
+
+                // Parse value definition
+                let value_def = hash
+                    .fetch::<_, Value>(Symbol::new("value"))
+                    .map_err(|e| ParquetError::Schema(format!("Map missing 'value': {}", e)))?;
+                let value_node = self.parse_schema_node("value".to_string(), value_def)?;
+
+                Ok(SchemaNode::Map {
+                    name,
+                    nullable,
+                    key: Box::new(key_node),
+                    value: Box::new(value_node),
+                })
+            }
+
+            // Check if it's a complex type with angle brackets
+            type_str if type_str.contains('<') => {
+                self.parse_complex_type_string(name, type_str.to_string(), nullable)
+            }
+
+            // Primitive types
+            primitive_type => {
+                // Get precision and scale for decimal types
+                let precision = hash
+                    .fetch::<_, Value>(Symbol::new("precision"))
+                    .ok()
+                    .and_then(|v| <u8 as TryConvert>::try_convert(v).ok());
+
+                let scale = hash
+                    .fetch::<_, Value>(Symbol::new("scale"))
+                    .ok()
+                    .and_then(|v| <i8 as TryConvert>::try_convert(v).ok());
+
+                // Handle timezone for timestamp types
+                // Support both new has_timezone (preferred) and legacy timezone parameters
+                let timezone =
+                    if let Ok(has_tz) = hash.fetch::<_, Value>(Symbol::new("has_timezone")) {
+                        // New approach: has_timezone boolean
+                        if let Ok(has_timezone) = <bool as TryConvert>::try_convert(has_tz) {
+                            if has_timezone {
+                                Some("UTC".to_string()) // Presence means UTC storage
+                            } else {
+                                None // Absence means local/unzoned storage
+                            }
+                        } else {
+                            None
+                        }
+                    } else {
+                        hash.fetch::<_, Value>(Symbol::new("timezone"))
+                            .ok()
+                            .map(|_| "UTC".to_string()) // Any value -> UTC
+                    };
+
+                let primitive = self.parse_primitive_type(
+                    primitive_type.to_string(),
+                    precision,
+                    scale,
+                    timezone,
+                )?;
+
+                Ok(SchemaNode::Primitive {
+                    name,
+                    primitive_type: primitive,
+                    nullable,
+                    format,
+                })
+            }
+        }
+    }
+
+    /// Parse a complex type string like "list<string>" or "map<string,int32>"
+    fn parse_complex_type_string(
+        &self,
+        name: String,
+        type_str: String,
+        nullable: bool,
+    ) -> Result<SchemaNode> {
+        if type_str.starts_with("list<") && type_str.ends_with('>') {
+            let inner_type = &type_str[5..type_str.len() - 1];
+            let item_name = format!("{}_item", name);
+
+            // Create a simple type node for the item
+            let item_node = if inner_type.contains('<') {
+                // Nested complex type
+                self.parse_complex_type_string(item_name, inner_type.to_string(), true)?
+            } else {
+                // Simple primitive type
+                SchemaNode::Primitive {
+                    name: item_name,
+                    primitive_type: self.parse_primitive_type(
+                        inner_type.to_string(),
+                        None,
+                        None,
+                        None,
+                    )?,
+                    nullable: true,
+                    format: None,
+                }
+            };
+
+            Ok(SchemaNode::List {
+                name,
+                nullable,
+                item: Box::new(item_node),
+            })
+        } else if type_str.starts_with("map<") && type_str.ends_with('>') {
+            let inner = &type_str[4..type_str.len() - 1];
+            let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
+            if parts.len() != 2 {
+                return Err(ParquetError::Schema(format!(
+                    "Invalid map type: {}",
+                    type_str
+                )));
+            }
+
+            let key_type = self.parse_primitive_type(parts[0].to_string(), None, None, None)?;
+            let value_type = self.parse_primitive_type(parts[1].to_string(), None, None, None)?;
+
+            Ok(SchemaNode::Map {
+                name,
+                nullable,
+                key: Box::new(SchemaNode::Primitive {
+                    name: "key".to_string(),
+                    primitive_type: key_type,
+                    nullable: false,
+                    format: None,
+                }),
+                value: Box::new(SchemaNode::Primitive {
+                    name: "value".to_string(),
+                    primitive_type: value_type,
+                    nullable: true,
+                    format: None,
+                }),
+            })
+        } else {
+            Err(ParquetError::Schema(format!(
+                "Unknown complex type: {}",
+                type_str
+            )))
+        }
+    }
+
+    /// Parse a field definition from a Ruby hash
+    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
+        let name: String = field_hash
+            .fetch(Symbol::new("name"))
+            .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
+
+        // Check if there's a 'type' field - if so, parse as full definition
+        if let Ok(_type_value) = field_hash.fetch::<_, Value>(Symbol::new("type")) {
+            // This is a full field definition
+            self.parse_schema_node(name, field_hash.as_value())
+        } else {
+            // This might be a simplified definition - look for known field patterns
+            Err(ParquetError::Schema(format!(
+                "Field '{}' missing 'type' definition",
+                name
+            )))
+        }
+    }
+
+    /// Parse a primitive type string to PrimitiveType enum
+    fn parse_primitive_type(
+        &self,
+        type_str: String,
+        precision: Option<u8>,
+        scale: Option<i8>,
+        timezone: Option<String>,
+    ) -> Result<PrimitiveType> {
+        // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
+        if type_str.starts_with("decimal(") && type_str.ends_with(')') {
+            let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
+            let parts: Vec<&str> = params.split(',').map(|s| s.trim()).collect();
+            if parts.len() == 2 {
+                let p = parts[0].parse::<u8>().map_err(|_| {
+                    ParquetError::Schema(format!("Invalid decimal precision: {}", parts[0]))
+                })?;
+                let s = parts[1].parse::<i8>().map_err(|_| {
+                    ParquetError::Schema(format!("Invalid decimal scale: {}", parts[1]))
+                })?;
+
+                // Choose decimal type based on precision
+                if p <= 38 {
+                    return Ok(PrimitiveType::Decimal128(p, s));
+                } else {
+                    return Ok(PrimitiveType::Decimal256(p, s));
+                }
+            }
+        }
+        // Check for decimal256 with parentheses notation
+        if type_str.starts_with("decimal256(") && type_str.ends_with(')') {
+            let params = &type_str[11..type_str.len() - 1];
+            let parts: Vec<&str> = params.split(',').map(|s| s.trim()).collect();
+            if parts.len() == 2 {
+                let p = parts[0].parse::<u8>().map_err(|_| {
+                    ParquetError::Schema(format!("Invalid decimal256 precision: {}", parts[0]))
+                })?;
+                let s = parts[1].parse::<i8>().map_err(|_| {
+                    ParquetError::Schema(format!("Invalid decimal256 scale: {}", parts[1]))
+                })?;
+                return Ok(PrimitiveType::Decimal256(p, s));
+            }
+        }
+
+        match type_str.as_str() {
+            "boolean" | "bool" => Ok(PrimitiveType::Boolean),
+            "int8" => Ok(PrimitiveType::Int8),
+            "int16" => Ok(PrimitiveType::Int16),
+            "int32" => Ok(PrimitiveType::Int32),
+            "int64" => Ok(PrimitiveType::Int64),
+            "uint8" => Ok(PrimitiveType::UInt8),
+            "uint16" => Ok(PrimitiveType::UInt16),
+            "uint32" => Ok(PrimitiveType::UInt32),
+            "uint64" => Ok(PrimitiveType::UInt64),
+            "float" | "float32" => Ok(PrimitiveType::Float32),
+            "double" | "float64" => Ok(PrimitiveType::Float64),
+            "string" => Ok(PrimitiveType::String),
+            "binary" => Ok(PrimitiveType::Binary),
+            "date32" | "date" => Ok(PrimitiveType::Date32),
+            "date64" => Ok(PrimitiveType::Date64),
+            "timestamp" | "timestamp_millis" => {
+                // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
+                Ok(PrimitiveType::TimestampMillis(timezone.map(Into::into)))
+            }
+            "timestamp_second" => {
+                // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
+                Ok(PrimitiveType::TimestampSecond(timezone.map(Into::into)))
+            }
+            "timestamp_micros" => {
+                // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
+                Ok(PrimitiveType::TimestampMicros(timezone.map(Into::into)))
+            }
+            "timestamp_nanos" => {
+                // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
+                Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
+            }
+            "time32" | "time_millis" => Ok(PrimitiveType::TimeMillis),
+            "time64" | "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "decimal" => {
+                // Use provided precision/scale or defaults
+                let p = precision.unwrap_or(38);
+                let s = scale.unwrap_or(0);
+
+                // Choose decimal type based on precision
+                if p <= 38 {
+                    Ok(PrimitiveType::Decimal128(p, s))
+                } else {
+                    Ok(PrimitiveType::Decimal256(p, s))
+                }
+            }
+            "decimal128" => {
+                let p = precision.unwrap_or(38);
+                let s = scale.unwrap_or(0);
+                Ok(PrimitiveType::Decimal128(p, s))
+            }
+            "decimal256" => {
+                let p = precision.unwrap_or(76);
+                let s = scale.unwrap_or(0);
+                Ok(PrimitiveType::Decimal256(p, s))
+            }
+            _ => Err(ParquetError::Schema(format!(
+                "Unknown primitive type: {}",
+                type_str
+            ))),
+        }
+    }
+}
+
+impl Default for RubySchemaBuilder {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
+/// and Ruby Value is not Send/Sync
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+    let builder = RubySchemaBuilder::new();
+
+    // The Ruby schema should be a hash with a root struct
+    let hash: RHash = <RHash as TryConvert>::try_convert(schema_def)
+        .map_err(|e: MagnusError| ParquetError::Schema(format!("Schema must be a hash: {}", e)))?;
+
+    // Check if it's already in the expected format (with type: :struct)
+    let root_node = if hash.get(Symbol::new("type")).is_some() {
+        // It's a complete schema definition
+        builder.parse_hash_schema_node("root".to_string(), hash)?
+    } else if let Ok(fields) = hash.fetch::<_, RArray>(Symbol::new("fields")) {
+        // It's a simplified format with just fields array
+        let mut field_nodes = Vec::new();
+        for field_value in fields.into_iter() {
+            let field_hash: RHash = <RHash as TryConvert>::try_convert(field_value)
+                .map_err(|e: MagnusError| ParquetError::Schema(format!("Invalid field: {}", e)))?;
+            field_nodes.push(builder.parse_field_definition(field_hash)?);
+        }
+
+        // Check for duplicate field names
+        let field_names: Vec<String> = field_nodes
+            .iter()
+            .map(|node| match node {
+                SchemaNode::Primitive { name, .. } => name.clone(),
+                SchemaNode::List { name, .. } => name.clone(),
+                SchemaNode::Map { name, .. } => name.clone(),
+                SchemaNode::Struct { name, .. } => name.clone(),
+            })
+            .collect();
+
+        let mut unique_names = std::collections::HashSet::new();
+        for name in &field_names {
+            if !unique_names.insert(name) {
+                return Err(ParquetError::Schema(format!(
+                    "Duplicate field names in root level schema: {:?}",
+                    field_names
+                )));
+            }
+        }
+
+        SchemaNode::Struct {
+            name: "root".to_string(),
+            nullable: false,
+            fields: field_nodes,
+        }
+    } else {
+        return Err(ParquetError::Schema(
+            "Schema must have 'type' or 'fields' key".to_string(),
+        ));
+    };
+
+    // Build the schema
+    parquet_core::SchemaBuilder::new()
+        .with_root(root_node)
+        .build()
+        .map_err(|e| ParquetError::Schema(e.to_string()))
+}
+
+/// Convert a Parquet schema back to Ruby representation
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+    let ruby = Ruby::get()
+        .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
+
+    schema_node_to_ruby(&schema.root, &ruby)
+}
+
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+    let hash = RHash::new();
+
+    match node {
+        SchemaNode::Struct {
+            name,
+            nullable,
+            fields,
+        } => {
+            hash.aset(Symbol::new("type"), Symbol::new("struct"))
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
+            hash.aset(Symbol::new("name"), name.as_str())
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
+            hash.aset(Symbol::new("nullable"), *nullable)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
+
+            let fields_array = RArray::new();
+            for field in fields {
+                fields_array
+                    .push(schema_node_to_ruby(field, _ruby)?)
+                    .map_err(|e| {
+                        ParquetError::Conversion(format!("Failed to push field: {}", e))
+                    })?;
+            }
+            hash.aset(Symbol::new("fields"), fields_array)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set fields: {}", e)))?;
+        }
+
+        SchemaNode::List {
+            name,
+            nullable,
+            item,
+        } => {
+            hash.aset(Symbol::new("type"), Symbol::new("list"))
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
+            hash.aset(Symbol::new("name"), name.as_str())
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
+            hash.aset(Symbol::new("nullable"), *nullable)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
+            hash.aset(Symbol::new("item"), schema_node_to_ruby(item, _ruby)?)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set item: {}", e)))?;
+        }
+
+        SchemaNode::Map {
+            name,
+            nullable,
+            key,
+            value,
+        } => {
+            hash.aset(Symbol::new("type"), Symbol::new("map"))
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
+            hash.aset(Symbol::new("name"), name.as_str())
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
+            hash.aset(Symbol::new("nullable"), *nullable)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
+            hash.aset(Symbol::new("key"), schema_node_to_ruby(key, _ruby)?)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set key: {}", e)))?;
+            hash.aset(Symbol::new("value"), schema_node_to_ruby(value, _ruby)?)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set value: {}", e)))?;
+        }
+
+        SchemaNode::Primitive {
+            name,
+            primitive_type,
+            nullable,
+            format,
+        } => {
+            let type_sym = match primitive_type {
+                PrimitiveType::Boolean => Symbol::new("boolean"),
+                PrimitiveType::Int8 => Symbol::new("int8"),
+                PrimitiveType::Int16 => Symbol::new("int16"),
+                PrimitiveType::Int32 => Symbol::new("int32"),
+                PrimitiveType::Int64 => Symbol::new("int64"),
+                PrimitiveType::UInt8 => Symbol::new("uint8"),
+                PrimitiveType::UInt16 => Symbol::new("uint16"),
+                PrimitiveType::UInt32 => Symbol::new("uint32"),
+                PrimitiveType::UInt64 => Symbol::new("uint64"),
+                PrimitiveType::Float32 => Symbol::new("float32"),
+                PrimitiveType::Float64 => Symbol::new("float64"),
+                PrimitiveType::String => Symbol::new("string"),
+                PrimitiveType::Binary => Symbol::new("binary"),
+                PrimitiveType::Date32 => Symbol::new("date32"),
+                PrimitiveType::Date64 => Symbol::new("date64"),
+                PrimitiveType::TimestampSecond(_) => Symbol::new("timestamp_second"),
+                PrimitiveType::TimestampMillis(_) => Symbol::new("timestamp_millis"),
+                PrimitiveType::TimestampMicros(_) => Symbol::new("timestamp_micros"),
+                PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
+                PrimitiveType::TimeMillis => Symbol::new("time_millis"),
+                PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+                PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
+                PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
+                PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
+            };
+
+            hash.aset(Symbol::new("type"), type_sym)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set type: {}", e)))?;
+            hash.aset(Symbol::new("name"), name.as_str())
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set name: {}", e)))?;
+            hash.aset(Symbol::new("nullable"), *nullable)
+                .map_err(|e| ParquetError::Conversion(format!("Failed to set nullable: {}", e)))?;
+
+            if let Some(fmt) = format {
+                hash.aset(Symbol::new("format"), fmt.as_str())
+                    .map_err(|e| {
+                        ParquetError::Conversion(format!("Failed to set format: {}", e))
+                    })?;
+            }
+
+            // Add precision/scale for decimal types
+            match primitive_type {
+                PrimitiveType::Decimal128(p, s) | PrimitiveType::Decimal256(p, s) => {
+                    hash.aset(Symbol::new("precision"), *p).map_err(|e| {
+                        ParquetError::Conversion(format!("Failed to set precision: {}", e))
+                    })?;
+                    hash.aset(Symbol::new("scale"), *s).map_err(|e| {
+                        ParquetError::Conversion(format!("Failed to set scale: {}", e))
+                    })?;
+                }
+                PrimitiveType::FixedLenByteArray(len) => {
+                    hash.aset(Symbol::new("length"), *len).map_err(|e| {
+                        ParquetError::Conversion(format!("Failed to set length: {}", e))
+                    })?;
+                }
+                _ => {}
+            }
+        }
+    }
+
+    Ok(hash.as_value())
+}
+
+/// Convert old schema format to new format
+/// Old: [{ "column_name" => "type" }, ...]
+/// New: [{ name: "column_name", type: :type }, ...]
+pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
+    let new_schema = RArray::new();
+
+    for item in schema.into_iter() {
+        let hash: RHash = TryConvert::try_convert(item).map_err(|e: MagnusError| {
+            ParquetError::Schema(format!("Invalid schema item: {}", e))
+        })?;
+        let new_field = RHash::new();
+
+        // The old format has a single key-value pair per hash
+        let process_result = hash.foreach(
+            |key: Value,
+             value: Value|
+             -> std::result::Result<magnus::r_hash::ForEach, MagnusError> {
+                let key_str: String = TryConvert::try_convert(key)?;
+                let type_str: String = TryConvert::try_convert(value)?;
+
+                new_field.aset(Symbol::new("name"), key_str)?;
+                new_field.aset(Symbol::new("type"), Symbol::new(&type_str))?;
+                if type_str.contains("timestamp") {
+                    new_field.aset(Symbol::new("has_timezone"), true)?;
+                }
+
+                Ok(magnus::r_hash::ForEach::Continue)
+            },
+        );
+
+        if let Err(e) = process_result {
+            return Err(ParquetError::Schema(format!(
+                "Failed to process field: {}",
+                e
+            )));
+        }
+
+        new_schema
+            .push(new_field)
+            .map_err(|e| ParquetError::Schema(format!("Failed to push field: {}", e)))?;
+    }
+
+    Ok(new_schema)
+}
+
+/// Check if schema is in new DSL format (hash with type: :struct)
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+    if !schema_value.is_kind_of(ruby.class_hash()) {
+        return Ok(false);
+    }
+
+    let schema_hash: RHash = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
+        ParquetError::Schema(format!("Failed to convert to hash: {}", e))
+    })?;
+    if let Some(type_val) = schema_hash.get(Symbol::new("type")) {
+        if type_val.is_kind_of(ruby.class_symbol()) {
+            let type_sym: Symbol =
+                TryConvert::try_convert(type_val).map_err(|e: MagnusError| {
+                    ParquetError::Schema(format!("Failed to convert to symbol: {}", e))
+                })?;
+            return Ok(type_sym.name().map_err(|e: MagnusError| {
+                ParquetError::Schema(format!("Failed to get symbol name: {}", e))
+            })? == "struct");
+        } else if type_val.is_kind_of(ruby.class_string()) {
+            let type_str: String =
+                TryConvert::try_convert(type_val).map_err(|e: MagnusError| {
+                    ParquetError::Schema(format!("Failed to convert to string: {}", e))
+                })?;
+            return Ok(type_str == "struct");
+        }
+    }
+    Ok(false)
+}
+
+/// Process schema value and convert to format expected by ruby_schema_to_parquet
+pub fn process_schema_value(
+    ruby: &Ruby,
+    schema_value: Value,
+    data_array: Option<&RArray>,
+) -> Result<Value> {
+    // Check if it's the new DSL format
+    if is_dsl_schema(ruby, schema_value)? {
+        // For DSL format, pass it directly to ruby_schema_to_parquet
+        // which should handle the conversion
+        return Ok(schema_value);
+    }
+
+    // Handle array format or hash with fields
+    let mut schema_array = if schema_value.is_nil() {
+        RArray::new()
+    } else if schema_value.is_kind_of(ruby.class_array()) {
+        let array: RArray = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
+            ParquetError::Schema(format!("Failed to convert to array: {}", e))
+        })?;
+
+        // Check if it's in old format (array of single-key hashes)
+        if !array.is_empty() {
+            let first_item: Value = array
+                .entry(0)
+                .map_err(|e| ParquetError::Schema(format!("Failed to get first item: {}", e)))?;
+
+            if first_item.is_kind_of(ruby.class_hash()) {
+                let first_hash: RHash =
+                    TryConvert::try_convert(first_item).map_err(|e: MagnusError| {
+                        ParquetError::Schema(format!("Failed to convert first item to hash: {}", e))
+                    })?;
+                // Check if it has the new format keys
+                if first_hash.get(Symbol::new("name")).is_some()
+                    && first_hash.get(Symbol::new("type")).is_some()
+                {
+                    // Already in new format
+                    array
+                } else {
+                    // Old format, convert it
+                    convert_legacy_schema(ruby, array)?
+                }
+            } else {
+                return Err(ParquetError::Schema(
+                    "schema array must contain hashes".to_string(),
+                ));
+            }
+        } else {
+            array
+        }
+    } else if schema_value.is_kind_of(ruby.class_hash()) {
+        // Hash format with fields key
+        let hash: RHash = TryConvert::try_convert(schema_value).map_err(|e: MagnusError| {
+            ParquetError::Schema(format!("Failed to convert to hash: {}", e))
+        })?;
+        if let Some(fields) = hash.get(Symbol::new("fields")) {
+            TryConvert::try_convert(fields).map_err(|e: MagnusError| {
+                ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
+            })?
+        } else {
+            return Err(ParquetError::Schema(
+                "schema hash must have 'fields' key or be in DSL format with 'type' key"
+                    .to_string(),
+            ));
+        }
+    } else {
+        return Err(ParquetError::Schema(
+            "schema must be nil, an array, or a hash".to_string(),
+        ));
+    };
+
+    // Check if we need to infer schema from data
+    if schema_array.is_empty() {
+        if let Some(data) = data_array {
+            if data.is_empty() {
+                return Err(ParquetError::Schema(
+                    "Cannot infer schema from empty data".to_string(),
+                ));
+            }
+
+            // Get first row/batch to determine column count
+            let first_item: Value = data.entry(0).map_err(|e| {
+                ParquetError::Schema(format!("Failed to get first data item: {}", e))
+            })?;
+            let num_columns = if first_item.is_kind_of(ruby.class_array()) {
+                let first_array: RArray =
+                    TryConvert::try_convert(first_item).map_err(|e: MagnusError| {
+                        ParquetError::Schema(format!(
+                            "Failed to convert first data item to array: {}",
+                            e
+                        ))
+                    })?;
+                first_array.len()
+            } else {
+                return Err(ParquetError::Schema(
+                    "First data item must be an array".to_string(),
+                ));
+            };
+
+            // Generate default schema with String types
+            let new_schema = RArray::new();
+            for i in 0..num_columns {
+                let field = RHash::new();
+                field
+                    .aset(Symbol::new("name"), format!("f{}", i))
+                    .map_err(|e| {
+                        ParquetError::Schema(format!("Failed to set field name: {}", e))
+                    })?;
+                field
+                    .aset(Symbol::new("type"), Symbol::new("string"))
+                    .map_err(|e| {
+                        ParquetError::Schema(format!("Failed to set field type: {}", e))
+                    })?;
+                new_schema
+                    .push(field)
+                    .map_err(|e| ParquetError::Schema(format!("Failed to push field: {}", e)))?;
+            }
+
+            schema_array = new_schema;
+        } else {
+            return Err(ParquetError::Schema(
+                "Schema is required when data is not provided for inference".to_string(),
+            ));
+        }
+    }
+
+    // Convert schema to the format expected by ruby_schema_to_parquet
+    let schema_hash = ruby.hash_new();
+    schema_hash
+        .aset(Symbol::new("fields"), schema_array)
+        .map_err(|e| ParquetError::Schema(format!("Failed to set fields: {}", e)))?;
+    Ok(schema_hash.as_value())
+}
+
+/// Extract schema nodes from schema fields
+pub fn extract_field_schemas(schema: &Schema) -> Vec<SchemaNode> {
+    if let SchemaNode::Struct { fields, .. } = &schema.root {
+        fields.to_vec()
+    } else {
+        Vec::new()
+    }
+}
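For orientation, the Ruby-side schema shapes this new module accepts can be reconstructed from its doc comments and parsing code above. A minimal sketch; the field names and values are illustrative, not taken from the release:

    # Legacy format: one single-key hash per column. convert_legacy_schema
    # rewrites it into the name/type form and adds has_timezone for timestamps.
    legacy = [{ "id" => "int64" }, { "created_at" => "timestamp_millis" }]

    # Current format: an array of { name:, type: } hashes. Types are symbols;
    # parenthesized decimals and angle-bracket complex types are parsed too.
    fields = [
      { name: "id",    type: :int64, nullable: false },
      { name: "price", type: :"decimal(5,2)" },
      { name: "tags",  type: :"list<string>" },
      { name: "attrs", type: :"map<string,int32>" },
      { name: "ts",    type: :timestamp_millis, has_timezone: true }, # stored as UTC
    ]

    # DSL format: a hash whose root has type: :struct, detected by
    # is_dsl_schema and passed through to ruby_schema_to_parquet unchanged.
    dsl = {
      type: :struct,
      fields: [
        { name: "id",     type: :int64 },
        { name: "scores", type: :list, item: :float64 },
      ],
    }

When the schema is nil or an empty array, process_schema_value instead infers a default all-string schema (columns named f0, f1, ...) from the first row of the data, and raises if no data is available to infer from.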