parquet 0.5.13 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +3 -0
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -605
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,1685 @@
|
|
1
|
+
use crate::string_cache::StringCache;
|
2
|
+
use bytes::Bytes;
|
3
|
+
use indexmap::IndexMap;
|
4
|
+
use magnus::r_hash::ForEach;
|
5
|
+
use magnus::value::ReprValue;
|
6
|
+
use magnus::{
|
7
|
+
Error as MagnusError, IntoValue, Module, RArray, RHash, RString, Ruby, Symbol, TryConvert,
|
8
|
+
Value,
|
9
|
+
};
|
10
|
+
use ordered_float::OrderedFloat;
|
11
|
+
use parquet_core::{ParquetError, ParquetValue, Result};
|
12
|
+
use std::cell::RefCell;
|
13
|
+
use std::sync::Arc;
|
14
|
+
|
15
|
+
/// Ruby value converter
|
16
|
+
///
|
17
|
+
/// Note: This converter is not thread-safe due to Ruby's GIL requirements.
|
18
|
+
/// It should only be used within Ruby's thread context.
|
19
|
+
#[derive(Default)]
|
20
|
+
pub struct RubyValueConverter {
|
21
|
+
string_cache: RefCell<Option<StringCache>>,
|
22
|
+
}
|
23
|
+
|
24
|
+
impl RubyValueConverter {
|
25
|
+
pub fn new() -> Self {
|
26
|
+
Self {
|
27
|
+
string_cache: RefCell::new(None),
|
28
|
+
}
|
29
|
+
}
|
30
|
+
|
31
|
+
pub fn with_string_cache(cache: StringCache) -> Self {
|
32
|
+
Self {
|
33
|
+
string_cache: RefCell::new(Some(cache)),
|
34
|
+
}
|
35
|
+
}
|
36
|
+
|
37
|
+
/// Return the attached cache's statistics, or `None` when no cache is attached.
pub fn string_cache_stats(&self) -> Option<crate::string_cache::CacheStats> {
    let slot = self.string_cache.borrow();
    match slot.as_ref() {
        Some(cache) => Some(cache.stats()),
        None => None,
    }
}
|
43
|
+
|
44
|
+
/// Convert a Ruby value to ParquetValue with type hint
|
45
|
+
/// This is the primary conversion method that handles all Ruby types
|
46
|
+
pub fn to_parquet_with_type_hint(
|
47
|
+
&mut self,
|
48
|
+
value: Value,
|
49
|
+
type_hint: Option<&parquet_core::PrimitiveType>,
|
50
|
+
) -> Result<ParquetValue> {
|
51
|
+
// Handle nil values
|
52
|
+
if value.is_nil() {
|
53
|
+
return Ok(ParquetValue::Null);
|
54
|
+
}
|
55
|
+
|
56
|
+
// If we have a type hint, use it to guide conversion
|
57
|
+
if let Some(hint) = type_hint {
|
58
|
+
return self.convert_with_type_hint(value, hint);
|
59
|
+
}
|
60
|
+
|
61
|
+
// Otherwise, infer type from Ruby value
|
62
|
+
self.infer_and_convert(value)
|
63
|
+
}
|
64
|
+
|
65
|
+
/// Convert a Ruby value to ParquetValue with schema hint
|
66
|
+
/// This handles both primitive and complex types
|
67
|
+
pub fn to_parquet_with_schema_hint(
|
68
|
+
&mut self,
|
69
|
+
value: Value,
|
70
|
+
schema_hint: Option<&parquet_core::SchemaNode>,
|
71
|
+
) -> Result<ParquetValue> {
|
72
|
+
// Handle nil values
|
73
|
+
if value.is_nil() {
|
74
|
+
return Ok(ParquetValue::Null);
|
75
|
+
}
|
76
|
+
|
77
|
+
// If we have a schema hint, use it to guide conversion
|
78
|
+
if let Some(schema) = schema_hint {
|
79
|
+
return self.convert_with_schema_hint(value, schema);
|
80
|
+
}
|
81
|
+
|
82
|
+
// Otherwise, infer type from Ruby value
|
83
|
+
self.infer_and_convert(value)
|
84
|
+
}
|
85
|
+
|
86
|
+
/// Convert with explicit schema hint
|
87
|
+
fn convert_with_schema_hint(
|
88
|
+
&mut self,
|
89
|
+
value: Value,
|
90
|
+
schema: &parquet_core::SchemaNode,
|
91
|
+
) -> Result<ParquetValue> {
|
92
|
+
use parquet_core::SchemaNode;
|
93
|
+
|
94
|
+
match schema {
|
95
|
+
SchemaNode::Primitive {
|
96
|
+
primitive_type,
|
97
|
+
format,
|
98
|
+
..
|
99
|
+
} => self.convert_with_type_hint_and_format(value, primitive_type, format.as_deref()),
|
100
|
+
SchemaNode::List { item, .. } => self.convert_to_list(value, item.as_ref()),
|
101
|
+
SchemaNode::Map {
|
102
|
+
key, value: val, ..
|
103
|
+
} => self.convert_to_map(value, key.as_ref(), val.as_ref()),
|
104
|
+
SchemaNode::Struct { fields, .. } => self.convert_to_struct(value, fields),
|
105
|
+
}
|
106
|
+
}
|
107
|
+
|
108
|
+
/// Convert with explicit type hint and optional format
|
109
|
+
fn convert_with_type_hint_and_format(
|
110
|
+
&mut self,
|
111
|
+
value: Value,
|
112
|
+
type_hint: &parquet_core::PrimitiveType,
|
113
|
+
format: Option<&str>,
|
114
|
+
) -> Result<ParquetValue> {
|
115
|
+
use parquet_core::PrimitiveType::*;
|
116
|
+
|
117
|
+
// Special handling for UUID format
|
118
|
+
if let (Binary, Some("uuid")) = (type_hint, format) {
|
119
|
+
return self.convert_to_uuid_binary(value);
|
120
|
+
}
|
121
|
+
|
122
|
+
// Handle date types with format
|
123
|
+
match type_hint {
|
124
|
+
Date32 => return self.convert_to_date32(value, format),
|
125
|
+
Date64 => return self.convert_to_date64(value, format),
|
126
|
+
_ => {}
|
127
|
+
}
|
128
|
+
|
129
|
+
// Default type hint conversion
|
130
|
+
self.convert_with_type_hint(value, type_hint)
|
131
|
+
}
|
132
|
+
|
133
|
+
/// Convert with explicit type hint
|
134
|
+
fn convert_with_type_hint(
|
135
|
+
&mut self,
|
136
|
+
value: Value,
|
137
|
+
type_hint: &parquet_core::PrimitiveType,
|
138
|
+
) -> Result<ParquetValue> {
|
139
|
+
use parquet_core::PrimitiveType::*;
|
140
|
+
|
141
|
+
match type_hint {
|
142
|
+
Boolean => self.convert_to_boolean(value),
|
143
|
+
Int8 => self.convert_to_int8(value),
|
144
|
+
Int16 => self.convert_to_int16(value),
|
145
|
+
Int32 => self.convert_to_int32(value),
|
146
|
+
Int64 => self.convert_to_int64(value),
|
147
|
+
UInt8 => self.convert_to_uint8(value),
|
148
|
+
UInt16 => self.convert_to_uint16(value),
|
149
|
+
UInt32 => self.convert_to_uint32(value),
|
150
|
+
UInt64 => self.convert_to_uint64(value),
|
151
|
+
Float32 => self.convert_to_float32(value),
|
152
|
+
Float64 => self.convert_to_float64(value),
|
153
|
+
String => self.convert_to_string(value),
|
154
|
+
Binary => self.convert_to_binary(value),
|
155
|
+
Date32 => self.convert_to_date32(value, None),
|
156
|
+
Date64 => self.convert_to_date64(value, None),
|
157
|
+
TimeMillis => self.convert_to_time_millis(value),
|
158
|
+
TimeMicros => self.convert_to_time_micros(value),
|
159
|
+
TimestampSecond(schema_tz) => {
|
160
|
+
self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
|
161
|
+
}
|
162
|
+
TimestampMillis(schema_tz) => {
|
163
|
+
self.convert_to_timestamp_millis_with_tz(value, schema_tz.as_deref())
|
164
|
+
}
|
165
|
+
TimestampMicros(schema_tz) => {
|
166
|
+
self.convert_to_timestamp_micros_with_tz(value, schema_tz.as_deref())
|
167
|
+
}
|
168
|
+
TimestampNanos(schema_tz) => {
|
169
|
+
self.convert_to_timestamp_nanos_with_tz(value, schema_tz.as_deref())
|
170
|
+
}
|
171
|
+
Decimal128(precision, scale) => self.convert_to_decimal128(value, *precision, *scale),
|
172
|
+
Decimal256(precision, scale) => self.convert_to_decimal256(value, *precision, *scale),
|
173
|
+
FixedLenByteArray(len) => self.convert_to_fixed_len_byte_array(value, *len),
|
174
|
+
}
|
175
|
+
}
|
176
|
+
|
177
|
+
/// Infer type from Ruby value and convert
|
178
|
+
fn infer_and_convert(&mut self, value: Value) -> Result<ParquetValue> {
|
179
|
+
let class_name = value.class().to_string();
|
180
|
+
|
181
|
+
match class_name.as_str() {
|
182
|
+
"Integer" => {
|
183
|
+
let i: i64 = TryConvert::try_convert(value)
|
184
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
185
|
+
Ok(ParquetValue::Int64(i))
|
186
|
+
}
|
187
|
+
"Float" => {
|
188
|
+
let f: f64 = TryConvert::try_convert(value)
|
189
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
190
|
+
Ok(ParquetValue::Float64(OrderedFloat(f)))
|
191
|
+
}
|
192
|
+
"String" => {
|
193
|
+
let s: String = TryConvert::try_convert(value)
|
194
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
195
|
+
Ok(ParquetValue::String(s.into()))
|
196
|
+
}
|
197
|
+
"TrueClass" | "FalseClass" => {
|
198
|
+
let b: bool = TryConvert::try_convert(value)
|
199
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
200
|
+
Ok(ParquetValue::Boolean(b))
|
201
|
+
}
|
202
|
+
"Array" => {
|
203
|
+
let array: RArray = TryConvert::try_convert(value)
|
204
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
205
|
+
let mut list = Vec::with_capacity(array.len());
|
206
|
+
|
207
|
+
for item in array.into_iter() {
|
208
|
+
list.push(self.infer_and_convert(item)?);
|
209
|
+
}
|
210
|
+
|
211
|
+
Ok(ParquetValue::List(list))
|
212
|
+
}
|
213
|
+
"Hash" => {
|
214
|
+
let hash: RHash = TryConvert::try_convert(value)
|
215
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
216
|
+
let mut map = Vec::new();
|
217
|
+
let mut conversion_error = None;
|
218
|
+
|
219
|
+
hash.foreach(|key: Value, val: Value| {
|
220
|
+
match (self.infer_and_convert(key), self.infer_and_convert(val)) {
|
221
|
+
(Ok(k), Ok(v)) => {
|
222
|
+
map.push((k, v));
|
223
|
+
Ok(ForEach::Continue)
|
224
|
+
}
|
225
|
+
(Err(e), _) | (_, Err(e)) => {
|
226
|
+
conversion_error = Some(e);
|
227
|
+
Ok(ForEach::Stop)
|
228
|
+
}
|
229
|
+
}
|
230
|
+
})
|
231
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
232
|
+
|
233
|
+
if let Some(err) = conversion_error {
|
234
|
+
return Err(err);
|
235
|
+
}
|
236
|
+
|
237
|
+
Ok(ParquetValue::Map(map))
|
238
|
+
}
|
239
|
+
"Time" => {
|
240
|
+
// Convert Ruby Time to timestamp millis
|
241
|
+
let millis = value
|
242
|
+
.funcall::<_, _, i64>("to_i", ())
|
243
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
244
|
+
* 1000
|
245
|
+
+ value
|
246
|
+
.funcall::<_, _, i32>("nsec", ())
|
247
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
248
|
+
as i64
|
249
|
+
/ 1_000_000;
|
250
|
+
let tz = self.extract_timezone(value)?;
|
251
|
+
|
252
|
+
Ok(ParquetValue::TimestampMillis(millis, tz))
|
253
|
+
}
|
254
|
+
"BigDecimal" => {
|
255
|
+
// Convert BigDecimal to Decimal128
|
256
|
+
let str_val: String = value
|
257
|
+
.funcall("to_s", ("F",))
|
258
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
259
|
+
self.parse_decimal128(&str_val, 38, 10) // Default precision and scale
|
260
|
+
}
|
261
|
+
_ => {
|
262
|
+
// Try to convert to string as fallback
|
263
|
+
let s: String = value.to_string();
|
264
|
+
Ok(ParquetValue::String(s.into()))
|
265
|
+
}
|
266
|
+
}
|
267
|
+
}
|
268
|
+
|
269
|
+
// Helper methods
|
270
|
+
|
271
|
+
/// Normalize timestamp for Parquet storage according to Parquet specification:
|
272
|
+
/// - WITH timezone in schema: Store as UTC (isAdjustedToUTC = true)
|
273
|
+
/// - WITHOUT timezone in schema: Store as local/unzoned time (isAdjustedToUTC = false)
|
274
|
+
///
|
275
|
+
/// IMPORTANT: Parquet can ONLY store:
|
276
|
+
/// 1. UTC timestamps (when schema has ANY timezone)
|
277
|
+
/// 2. Local/unzoned timestamps (when schema has NO timezone)
|
278
|
+
///
|
279
|
+
/// Non-UTC timezones like "+09:00" or "America/New_York" are NOT preserved.
|
280
|
+
fn normalize_timestamp_for_parquet(
|
281
|
+
&self,
|
282
|
+
time_value: Value,
|
283
|
+
schema_has_timezone: bool,
|
284
|
+
) -> Result<Value> {
|
285
|
+
if schema_has_timezone {
|
286
|
+
// Schema has timezone -> MUST convert to UTC (Parquet limitation)
|
287
|
+
// The original timezone offset is lost - only UTC is stored
|
288
|
+
time_value
|
289
|
+
.funcall("utc", ())
|
290
|
+
.map_err(|e| ParquetError::Conversion(format!("Failed to convert to UTC: {}", e)))
|
291
|
+
} else {
|
292
|
+
// Schema has no timezone -> keep as local/unzoned time
|
293
|
+
// This represents a "wall clock" time without timezone information
|
294
|
+
Ok(time_value)
|
295
|
+
}
|
296
|
+
}
|
297
|
+
|
298
|
+
/// Extract timezone information from a Ruby Time object
|
299
|
+
fn extract_timezone(&self, time_value: Value) -> Result<Option<std::sync::Arc<str>>> {
|
300
|
+
let _ruby = Ruby::get()
|
301
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
302
|
+
|
303
|
+
// Check if the time is in UTC
|
304
|
+
let is_utc: bool = time_value
|
305
|
+
.funcall("utc?", ())
|
306
|
+
.map_err(|e| ParquetError::Conversion(format!("Failed to check UTC: {}", e)))?;
|
307
|
+
|
308
|
+
if is_utc {
|
309
|
+
return Ok(Some("UTC".into()));
|
310
|
+
}
|
311
|
+
|
312
|
+
// Get the UTC offset in seconds
|
313
|
+
let utc_offset: i32 = time_value
|
314
|
+
.funcall("utc_offset", ())
|
315
|
+
.map_err(|e| ParquetError::Conversion(format!("Failed to get UTC offset: {}", e)))?;
|
316
|
+
|
317
|
+
// If offset is 0 and not explicitly UTC, it might be local time
|
318
|
+
if utc_offset == 0 {
|
319
|
+
// Check if this is actually UTC or just happens to have 0 offset
|
320
|
+
// We already checked utc? above, so this is local time with 0 offset
|
321
|
+
return Ok(None);
|
322
|
+
}
|
323
|
+
|
324
|
+
// Convert offset to hours and minutes
|
325
|
+
let hours = utc_offset / 3600;
|
326
|
+
let minutes = (utc_offset.abs() % 3600) / 60;
|
327
|
+
|
328
|
+
// Format as +HH:MM or -HH:MM
|
329
|
+
let tz_string = if minutes == 0 {
|
330
|
+
format!("{:+03}:00", hours)
|
331
|
+
} else {
|
332
|
+
format!("{:+03}:{:02}", hours, minutes)
|
333
|
+
};
|
334
|
+
|
335
|
+
Ok(Some(tz_string.into()))
|
336
|
+
}
|
337
|
+
|
338
|
+
// Conversion methods for specific types
|
339
|
+
|
340
|
+
fn convert_to_boolean(&self, value: Value) -> Result<ParquetValue> {
|
341
|
+
if value.is_nil() {
|
342
|
+
return Ok(ParquetValue::Null);
|
343
|
+
}
|
344
|
+
|
345
|
+
let b: bool = TryConvert::try_convert(value)
|
346
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
347
|
+
Ok(ParquetValue::Boolean(b))
|
348
|
+
}
|
349
|
+
|
350
|
+
fn convert_to_int8(&self, value: Value) -> Result<ParquetValue> {
|
351
|
+
if value.is_nil() {
|
352
|
+
return Ok(ParquetValue::Null);
|
353
|
+
}
|
354
|
+
|
355
|
+
let i = self.convert_numeric::<i8>(value)?;
|
356
|
+
Ok(ParquetValue::Int8(i))
|
357
|
+
}
|
358
|
+
|
359
|
+
fn convert_to_int16(&self, value: Value) -> Result<ParquetValue> {
|
360
|
+
if value.is_nil() {
|
361
|
+
return Ok(ParquetValue::Null);
|
362
|
+
}
|
363
|
+
|
364
|
+
let i = self.convert_numeric::<i16>(value)?;
|
365
|
+
Ok(ParquetValue::Int16(i))
|
366
|
+
}
|
367
|
+
|
368
|
+
fn convert_to_int32(&self, value: Value) -> Result<ParquetValue> {
|
369
|
+
if value.is_nil() {
|
370
|
+
return Ok(ParquetValue::Null);
|
371
|
+
}
|
372
|
+
|
373
|
+
let i = self.convert_numeric::<i32>(value)?;
|
374
|
+
Ok(ParquetValue::Int32(i))
|
375
|
+
}
|
376
|
+
|
377
|
+
fn convert_to_int64(&self, value: Value) -> Result<ParquetValue> {
|
378
|
+
if value.is_nil() {
|
379
|
+
return Ok(ParquetValue::Null);
|
380
|
+
}
|
381
|
+
|
382
|
+
let i = self.convert_numeric::<i64>(value)?;
|
383
|
+
Ok(ParquetValue::Int64(i))
|
384
|
+
}
|
385
|
+
|
386
|
+
fn convert_to_uint8(&self, value: Value) -> Result<ParquetValue> {
|
387
|
+
if value.is_nil() {
|
388
|
+
return Ok(ParquetValue::Null);
|
389
|
+
}
|
390
|
+
|
391
|
+
let i = self.convert_numeric::<u8>(value)?;
|
392
|
+
Ok(ParquetValue::UInt8(i))
|
393
|
+
}
|
394
|
+
|
395
|
+
fn convert_to_uint16(&self, value: Value) -> Result<ParquetValue> {
|
396
|
+
if value.is_nil() {
|
397
|
+
return Ok(ParquetValue::Null);
|
398
|
+
}
|
399
|
+
|
400
|
+
let i = self.convert_numeric::<u16>(value)?;
|
401
|
+
Ok(ParquetValue::UInt16(i))
|
402
|
+
}
|
403
|
+
|
404
|
+
fn convert_to_uint32(&self, value: Value) -> Result<ParquetValue> {
|
405
|
+
if value.is_nil() {
|
406
|
+
return Ok(ParquetValue::Null);
|
407
|
+
}
|
408
|
+
|
409
|
+
let i = self.convert_numeric::<u32>(value)?;
|
410
|
+
Ok(ParquetValue::UInt32(i))
|
411
|
+
}
|
412
|
+
|
413
|
+
fn convert_to_uint64(&self, value: Value) -> Result<ParquetValue> {
|
414
|
+
if value.is_nil() {
|
415
|
+
return Ok(ParquetValue::Null);
|
416
|
+
}
|
417
|
+
|
418
|
+
let i = self.convert_numeric::<u64>(value)?;
|
419
|
+
Ok(ParquetValue::UInt64(i))
|
420
|
+
}
|
421
|
+
|
422
|
+
fn convert_to_float32(&self, value: Value) -> Result<ParquetValue> {
|
423
|
+
if value.is_nil() {
|
424
|
+
return Ok(ParquetValue::Null);
|
425
|
+
}
|
426
|
+
|
427
|
+
let f = self.convert_numeric::<f32>(value)?;
|
428
|
+
Ok(ParquetValue::Float32(OrderedFloat(f)))
|
429
|
+
}
|
430
|
+
|
431
|
+
fn convert_to_float64(&self, value: Value) -> Result<ParquetValue> {
|
432
|
+
if value.is_nil() {
|
433
|
+
return Ok(ParquetValue::Null);
|
434
|
+
}
|
435
|
+
|
436
|
+
let f = self.convert_numeric::<f64>(value)?;
|
437
|
+
Ok(ParquetValue::Float64(OrderedFloat(f)))
|
438
|
+
}
|
439
|
+
|
440
|
+
fn convert_to_string(&mut self, value: Value) -> Result<ParquetValue> {
|
441
|
+
if value.is_nil() {
|
442
|
+
return Ok(ParquetValue::Null);
|
443
|
+
}
|
444
|
+
|
445
|
+
// Convert any value to string using to_s
|
446
|
+
let s: String = value
|
447
|
+
.funcall("to_s", ())
|
448
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
449
|
+
|
450
|
+
// Use string cache if available for statistics tracking
|
451
|
+
// Note: Currently doesn't provide memory savings due to ParquetValue storing String
|
452
|
+
if let Some(ref mut cache) = self.string_cache.borrow_mut().as_mut() {
|
453
|
+
let interned = cache.intern(s);
|
454
|
+
Ok(ParquetValue::String(interned))
|
455
|
+
} else {
|
456
|
+
Ok(ParquetValue::String(s.into()))
|
457
|
+
}
|
458
|
+
}
|
459
|
+
|
460
|
+
fn convert_to_binary(&self, value: Value) -> Result<ParquetValue> {
|
461
|
+
if value.is_nil() {
|
462
|
+
return Ok(ParquetValue::Null);
|
463
|
+
}
|
464
|
+
|
465
|
+
let ruby = Ruby::get()
|
466
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
467
|
+
if value.is_kind_of(ruby.class_string()) {
|
468
|
+
let s: RString = TryConvert::try_convert(value)
|
469
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
470
|
+
let bytes = unsafe { Bytes::copy_from_slice(s.as_slice()) };
|
471
|
+
Ok(ParquetValue::Bytes(bytes))
|
472
|
+
} else {
|
473
|
+
// Try to convert to string first
|
474
|
+
let s: String = TryConvert::try_convert(value)
|
475
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
476
|
+
Ok(ParquetValue::Bytes(s.into()))
|
477
|
+
}
|
478
|
+
}
|
479
|
+
|
480
|
+
fn convert_to_uuid_binary(&self, value: Value) -> Result<ParquetValue> {
|
481
|
+
if value.is_nil() {
|
482
|
+
return Ok(ParquetValue::Null);
|
483
|
+
}
|
484
|
+
|
485
|
+
// Convert value to string
|
486
|
+
let uuid_str: String = value
|
487
|
+
.funcall("to_s", ())
|
488
|
+
.and_then(TryConvert::try_convert)
|
489
|
+
.map_err(|e: MagnusError| {
|
490
|
+
ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
|
491
|
+
})?;
|
492
|
+
|
493
|
+
// Remove hyphens and validate length
|
494
|
+
let clean_uuid = uuid_str.replace('-', "");
|
495
|
+
if clean_uuid.len() != 32 {
|
496
|
+
return Err(ParquetError::Conversion(format!(
|
497
|
+
"Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
|
498
|
+
clean_uuid.len()
|
499
|
+
)));
|
500
|
+
}
|
501
|
+
|
502
|
+
// Parse hex string to bytes
|
503
|
+
let mut bytes = Vec::with_capacity(16);
|
504
|
+
for i in 0..16 {
|
505
|
+
let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
|
506
|
+
let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
|
507
|
+
ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
|
508
|
+
})?;
|
509
|
+
bytes.push(byte);
|
510
|
+
}
|
511
|
+
|
512
|
+
Ok(ParquetValue::Bytes(bytes.into()))
|
513
|
+
}
|
514
|
+
|
515
|
+
fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
|
516
|
+
if value.is_nil() {
|
517
|
+
return Ok(ParquetValue::Null);
|
518
|
+
}
|
519
|
+
|
520
|
+
// Handle Time objects
|
521
|
+
let ruby = Ruby::get()
|
522
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
523
|
+
if value.is_kind_of(ruby.class_time()) {
|
524
|
+
let secs: i64 = value
|
525
|
+
.funcall("to_i", ())
|
526
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
527
|
+
let days = (secs / 86400) as i32;
|
528
|
+
return Ok(ParquetValue::Date32(days));
|
529
|
+
}
|
530
|
+
|
531
|
+
// Handle strings
|
532
|
+
if value.is_kind_of(ruby.class_string()) {
|
533
|
+
// Use Ruby's Date module
|
534
|
+
let _ = ruby.require("date");
|
535
|
+
let kernel = ruby.module_kernel();
|
536
|
+
let date_module = kernel
|
537
|
+
.const_get::<_, Value>("Date")
|
538
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
539
|
+
|
540
|
+
// Use strptime if format is provided, otherwise use parse
|
541
|
+
let date = if let Some(format) = date_format {
|
542
|
+
date_module
|
543
|
+
.funcall::<_, _, Value>("strptime", (value, format))
|
544
|
+
.map_err(|e| {
|
545
|
+
ParquetError::Conversion(format!(
|
546
|
+
"Failed to parse date with format '{}': {}",
|
547
|
+
format, e
|
548
|
+
))
|
549
|
+
})?
|
550
|
+
} else {
|
551
|
+
date_module
|
552
|
+
.funcall::<_, _, Value>("parse", (value,))
|
553
|
+
.map_err(|e| ParquetError::Conversion(format!("Failed to parse date: {}", e)))?
|
554
|
+
};
|
555
|
+
|
556
|
+
// Convert to Time object then to days since epoch
|
557
|
+
let time = date
|
558
|
+
.funcall::<_, _, Value>("to_time", ())
|
559
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
560
|
+
let secs: i64 = time
|
561
|
+
.funcall("to_i", ())
|
562
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
563
|
+
let days = (secs / 86400) as i32;
|
564
|
+
return Ok(ParquetValue::Date32(days));
|
565
|
+
}
|
566
|
+
|
567
|
+
Err(ParquetError::Conversion(format!(
|
568
|
+
"Cannot convert {} to date32",
|
569
|
+
value.class()
|
570
|
+
)))
|
571
|
+
}
|
572
|
+
|
573
|
+
fn convert_to_date64(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
|
574
|
+
if value.is_nil() {
|
575
|
+
return Ok(ParquetValue::Null);
|
576
|
+
}
|
577
|
+
|
578
|
+
// Similar to date32 but returns milliseconds since epoch
|
579
|
+
let ruby = Ruby::get()
|
580
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
581
|
+
if value.is_kind_of(ruby.class_time()) {
|
582
|
+
let millis: i64 = value
|
583
|
+
.funcall::<_, _, i64>("to_i", ())
|
584
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
585
|
+
* 1000;
|
586
|
+
return Ok(ParquetValue::Date64(millis));
|
587
|
+
}
|
588
|
+
|
589
|
+
// Handle strings
|
590
|
+
if value.is_kind_of(ruby.class_string()) {
|
591
|
+
// Use Ruby's Date module
|
592
|
+
let _ = ruby.require("date");
|
593
|
+
let kernel = ruby.module_kernel();
|
594
|
+
let date_module = kernel
|
595
|
+
.const_get::<_, Value>("Date")
|
596
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
597
|
+
|
598
|
+
// Use strptime if format is provided, otherwise use parse
|
599
|
+
let date = if let Some(format) = date_format {
|
600
|
+
date_module
|
601
|
+
.funcall::<_, _, Value>("strptime", (value, format))
|
602
|
+
.map_err(|e| {
|
603
|
+
ParquetError::Conversion(format!(
|
604
|
+
"Failed to parse date with format '{}': {}",
|
605
|
+
format, e
|
606
|
+
))
|
607
|
+
})?
|
608
|
+
} else {
|
609
|
+
date_module
|
610
|
+
.funcall::<_, _, Value>("parse", (value,))
|
611
|
+
.map_err(|e| ParquetError::Conversion(format!("Failed to parse date: {}", e)))?
|
612
|
+
};
|
613
|
+
|
614
|
+
// Convert to Time object then to milliseconds since epoch
|
615
|
+
let time = date
|
616
|
+
.funcall::<_, _, Value>("to_time", ())
|
617
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
618
|
+
let secs: i64 = time
|
619
|
+
.funcall("to_i", ())
|
620
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
621
|
+
let millis = secs * 1000;
|
622
|
+
return Ok(ParquetValue::Date64(millis));
|
623
|
+
}
|
624
|
+
|
625
|
+
Err(ParquetError::Conversion(format!(
|
626
|
+
"Cannot convert {} to date64",
|
627
|
+
value.class()
|
628
|
+
)))
|
629
|
+
}
|
630
|
+
|
631
|
+
fn convert_to_time_millis(&self, value: Value) -> Result<ParquetValue> {
|
632
|
+
if value.is_nil() {
|
633
|
+
return Ok(ParquetValue::Null);
|
634
|
+
}
|
635
|
+
|
636
|
+
// Convert to milliseconds since midnight
|
637
|
+
let ruby = Ruby::get()
|
638
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
639
|
+
if value.is_kind_of(ruby.class_time()) {
|
640
|
+
let hour: i32 = value
|
641
|
+
.funcall("hour", ())
|
642
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
643
|
+
let min: i32 = value
|
644
|
+
.funcall("min", ())
|
645
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
646
|
+
let sec: i32 = value
|
647
|
+
.funcall("sec", ())
|
648
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
649
|
+
let nsec: i32 = value
|
650
|
+
.funcall("nsec", ())
|
651
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
652
|
+
|
653
|
+
let millis = (hour * 3600 + min * 60 + sec) * 1000 + nsec / 1_000_000;
|
654
|
+
return Ok(ParquetValue::TimeMillis(millis));
|
655
|
+
}
|
656
|
+
|
657
|
+
Err(ParquetError::Conversion(format!(
|
658
|
+
"Cannot convert {} to time_millis",
|
659
|
+
value.class()
|
660
|
+
)))
|
661
|
+
}
|
662
|
+
|
663
|
+
fn convert_to_time_micros(&self, value: Value) -> Result<ParquetValue> {
|
664
|
+
if value.is_nil() {
|
665
|
+
return Ok(ParquetValue::Null);
|
666
|
+
}
|
667
|
+
|
668
|
+
// Convert to microseconds since midnight
|
669
|
+
let ruby = Ruby::get()
|
670
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
671
|
+
if value.is_kind_of(ruby.class_time()) {
|
672
|
+
let hour: i64 = value
|
673
|
+
.funcall("hour", ())
|
674
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
675
|
+
let min: i64 = value
|
676
|
+
.funcall("min", ())
|
677
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
678
|
+
let sec: i64 = value
|
679
|
+
.funcall("sec", ())
|
680
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
681
|
+
let nsec: i64 = value
|
682
|
+
.funcall("nsec", ())
|
683
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
684
|
+
|
685
|
+
let micros = (hour * 3600 + min * 60 + sec) * 1_000_000 + nsec / 1000;
|
686
|
+
return Ok(ParquetValue::TimeMicros(micros));
|
687
|
+
}
|
688
|
+
|
689
|
+
Err(ParquetError::Conversion(format!(
|
690
|
+
"Cannot convert {} to time_micros",
|
691
|
+
value.class()
|
692
|
+
)))
|
693
|
+
}
|
694
|
+
|
695
|
+
// Timestamp conversion methods that respect schema timezone
|
696
|
+
fn convert_to_timestamp_second_with_tz(
|
697
|
+
&self,
|
698
|
+
value: Value,
|
699
|
+
schema_tz: Option<&str>,
|
700
|
+
) -> Result<ParquetValue> {
|
701
|
+
if value.is_nil() {
|
702
|
+
return Ok(ParquetValue::Null);
|
703
|
+
}
|
704
|
+
|
705
|
+
let ruby = Ruby::get()
|
706
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
707
|
+
if value.is_kind_of(ruby.class_time()) {
|
708
|
+
// Normalize timestamp according to Parquet spec
|
709
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
|
710
|
+
|
711
|
+
let secs: i64 = adjusted_time
|
712
|
+
.funcall("to_i", ())
|
713
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
714
|
+
|
715
|
+
// PARQUET TIMESTAMP STORAGE:
|
716
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
717
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
718
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
719
|
+
let tz = if schema_tz.is_some() {
|
720
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
721
|
+
} else {
|
722
|
+
None // Unzoned/local timestamp
|
723
|
+
};
|
724
|
+
|
725
|
+
return Ok(ParquetValue::TimestampSecond(secs, tz));
|
726
|
+
}
|
727
|
+
|
728
|
+
// Handle strings
|
729
|
+
if value.is_kind_of(ruby.class_string()) {
|
730
|
+
// Use Ruby's Time.parse to handle timestamp strings
|
731
|
+
let time_class = ruby.class_time();
|
732
|
+
let time = time_class
|
733
|
+
.funcall::<_, _, Value>("parse", (value,))
|
734
|
+
.map_err(|e| {
|
735
|
+
ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
|
736
|
+
})?;
|
737
|
+
|
738
|
+
// Normalize timestamp according to Parquet spec
|
739
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
|
740
|
+
|
741
|
+
let secs: i64 = adjusted_time
|
742
|
+
.funcall("to_i", ())
|
743
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
744
|
+
|
745
|
+
// PARQUET TIMESTAMP STORAGE:
|
746
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
747
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
748
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
749
|
+
let tz = if schema_tz.is_some() {
|
750
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
751
|
+
} else {
|
752
|
+
None // Unzoned/local timestamp
|
753
|
+
};
|
754
|
+
|
755
|
+
return Ok(ParquetValue::TimestampSecond(secs, tz));
|
756
|
+
}
|
757
|
+
|
758
|
+
Err(ParquetError::Conversion(format!(
|
759
|
+
"Cannot convert {} to timestamp_second",
|
760
|
+
value.class()
|
761
|
+
)))
|
762
|
+
}
|
763
|
+
|
764
|
+
fn convert_to_timestamp_millis_with_tz(
|
765
|
+
&self,
|
766
|
+
value: Value,
|
767
|
+
schema_tz: Option<&str>,
|
768
|
+
) -> Result<ParquetValue> {
|
769
|
+
if value.is_nil() {
|
770
|
+
return Ok(ParquetValue::Null);
|
771
|
+
}
|
772
|
+
|
773
|
+
let ruby = Ruby::get()
|
774
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
775
|
+
if value.is_kind_of(ruby.class_time()) {
|
776
|
+
// Normalize timestamp according to Parquet spec
|
777
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
|
778
|
+
|
779
|
+
let millis = adjusted_time
|
780
|
+
.funcall::<_, _, i64>("to_i", ())
|
781
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
782
|
+
* 1000
|
783
|
+
+ adjusted_time
|
784
|
+
.funcall::<_, _, i32>("nsec", ())
|
785
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
|
786
|
+
/ 1_000_000;
|
787
|
+
|
788
|
+
// PARQUET TIMESTAMP STORAGE:
|
789
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
790
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
791
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
792
|
+
let tz = if schema_tz.is_some() {
|
793
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
794
|
+
} else {
|
795
|
+
None // Unzoned/local timestamp
|
796
|
+
};
|
797
|
+
|
798
|
+
return Ok(ParquetValue::TimestampMillis(millis, tz));
|
799
|
+
}
|
800
|
+
|
801
|
+
// Handle strings
|
802
|
+
if value.is_kind_of(ruby.class_string()) {
|
803
|
+
// Use Ruby's Time.parse to handle timestamp strings
|
804
|
+
let time_class = ruby.class_time();
|
805
|
+
let time = time_class
|
806
|
+
.funcall::<_, _, Value>("parse", (value,))
|
807
|
+
.map_err(|e| {
|
808
|
+
ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
|
809
|
+
})?;
|
810
|
+
|
811
|
+
// Normalize timestamp according to Parquet spec
|
812
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
|
813
|
+
|
814
|
+
let millis = adjusted_time
|
815
|
+
.funcall::<_, _, i64>("to_i", ())
|
816
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
817
|
+
* 1000
|
818
|
+
+ adjusted_time
|
819
|
+
.funcall::<_, _, i32>("nsec", ())
|
820
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
|
821
|
+
/ 1_000_000;
|
822
|
+
|
823
|
+
// PARQUET TIMESTAMP STORAGE:
|
824
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
825
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
826
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
827
|
+
let tz = if schema_tz.is_some() {
|
828
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
829
|
+
} else {
|
830
|
+
None // Unzoned/local timestamp
|
831
|
+
};
|
832
|
+
|
833
|
+
return Ok(ParquetValue::TimestampMillis(millis, tz));
|
834
|
+
}
|
835
|
+
|
836
|
+
Err(ParquetError::Conversion(format!(
|
837
|
+
"Cannot convert {} to timestamp_millis",
|
838
|
+
value.class()
|
839
|
+
)))
|
840
|
+
}
|
841
|
+
|
842
|
+
fn convert_to_timestamp_micros_with_tz(
|
843
|
+
&self,
|
844
|
+
value: Value,
|
845
|
+
schema_tz: Option<&str>,
|
846
|
+
) -> Result<ParquetValue> {
|
847
|
+
if value.is_nil() {
|
848
|
+
return Ok(ParquetValue::Null);
|
849
|
+
}
|
850
|
+
|
851
|
+
let ruby = Ruby::get()
|
852
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
853
|
+
if value.is_kind_of(ruby.class_time()) {
|
854
|
+
// Normalize timestamp according to Parquet spec
|
855
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
|
856
|
+
|
857
|
+
let micros = adjusted_time
|
858
|
+
.funcall::<_, _, i64>("to_i", ())
|
859
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
860
|
+
* 1_000_000
|
861
|
+
+ adjusted_time
|
862
|
+
.funcall::<_, _, i32>("nsec", ())
|
863
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
|
864
|
+
/ 1000;
|
865
|
+
|
866
|
+
// PARQUET TIMESTAMP STORAGE:
|
867
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
868
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
869
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
870
|
+
let tz = if schema_tz.is_some() {
|
871
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
872
|
+
} else {
|
873
|
+
None // Unzoned/local timestamp
|
874
|
+
};
|
875
|
+
|
876
|
+
return Ok(ParquetValue::TimestampMicros(micros, tz));
|
877
|
+
}
|
878
|
+
|
879
|
+
// Handle strings
|
880
|
+
if value.is_kind_of(ruby.class_string()) {
|
881
|
+
// Use Ruby's Time.parse to handle timestamp strings
|
882
|
+
let time_class = ruby.class_time();
|
883
|
+
let time = time_class
|
884
|
+
.funcall::<_, _, Value>("parse", (value,))
|
885
|
+
.map_err(|e| {
|
886
|
+
ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
|
887
|
+
})?;
|
888
|
+
|
889
|
+
// Normalize timestamp according to Parquet spec
|
890
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
|
891
|
+
|
892
|
+
let micros = adjusted_time
|
893
|
+
.funcall::<_, _, i64>("to_i", ())
|
894
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
895
|
+
* 1_000_000
|
896
|
+
+ adjusted_time
|
897
|
+
.funcall::<_, _, i32>("nsec", ())
|
898
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
|
899
|
+
/ 1000;
|
900
|
+
|
901
|
+
// PARQUET TIMESTAMP STORAGE:
|
902
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
903
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
904
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
905
|
+
let tz = if schema_tz.is_some() {
|
906
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
907
|
+
} else {
|
908
|
+
None // Unzoned/local timestamp
|
909
|
+
};
|
910
|
+
|
911
|
+
return Ok(ParquetValue::TimestampMicros(micros, tz));
|
912
|
+
}
|
913
|
+
|
914
|
+
Err(ParquetError::Conversion(format!(
|
915
|
+
"Cannot convert {} to timestamp_micros",
|
916
|
+
value.class()
|
917
|
+
)))
|
918
|
+
}
|
919
|
+
|
920
|
+
fn convert_to_timestamp_nanos_with_tz(
|
921
|
+
&self,
|
922
|
+
value: Value,
|
923
|
+
schema_tz: Option<&str>,
|
924
|
+
) -> Result<ParquetValue> {
|
925
|
+
if value.is_nil() {
|
926
|
+
return Ok(ParquetValue::Null);
|
927
|
+
}
|
928
|
+
|
929
|
+
let ruby = Ruby::get()
|
930
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
931
|
+
if value.is_kind_of(ruby.class_time()) {
|
932
|
+
// Normalize timestamp according to Parquet spec
|
933
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
|
934
|
+
|
935
|
+
let nanos = adjusted_time
|
936
|
+
.funcall::<_, _, i64>("to_i", ())
|
937
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
938
|
+
* 1_000_000_000
|
939
|
+
+ adjusted_time
|
940
|
+
.funcall::<_, _, i32>("nsec", ())
|
941
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))? as i64;
|
942
|
+
|
943
|
+
// PARQUET TIMESTAMP STORAGE:
|
944
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
945
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
946
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
947
|
+
let tz = if schema_tz.is_some() {
|
948
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
949
|
+
} else {
|
950
|
+
None // Unzoned/local timestamp
|
951
|
+
};
|
952
|
+
|
953
|
+
return Ok(ParquetValue::TimestampNanos(nanos, tz));
|
954
|
+
}
|
955
|
+
|
956
|
+
// Handle strings
|
957
|
+
if value.is_kind_of(ruby.class_string()) {
|
958
|
+
// Use Ruby's Time.parse to handle timestamp strings
|
959
|
+
let time_class = ruby.class_time();
|
960
|
+
let time = time_class
|
961
|
+
.funcall::<_, _, Value>("parse", (value,))
|
962
|
+
.map_err(|e| {
|
963
|
+
ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
|
964
|
+
})?;
|
965
|
+
|
966
|
+
// Normalize timestamp according to Parquet spec
|
967
|
+
let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
|
968
|
+
|
969
|
+
let nanos = adjusted_time
|
970
|
+
.funcall::<_, _, i64>("to_i", ())
|
971
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
972
|
+
* 1_000_000_000
|
973
|
+
+ adjusted_time
|
974
|
+
.funcall::<_, _, i32>("nsec", ())
|
975
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))? as i64;
|
976
|
+
|
977
|
+
// PARQUET TIMESTAMP STORAGE:
|
978
|
+
// - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
|
979
|
+
// - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
|
980
|
+
// NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
|
981
|
+
let tz = if schema_tz.is_some() {
|
982
|
+
Some(Arc::from("UTC")) // Always UTC, never the original timezone
|
983
|
+
} else {
|
984
|
+
None // Unzoned/local timestamp
|
985
|
+
};
|
986
|
+
|
987
|
+
return Ok(ParquetValue::TimestampNanos(nanos, tz));
|
988
|
+
}
|
989
|
+
|
990
|
+
Err(ParquetError::Conversion(format!(
|
991
|
+
"Cannot convert {} to timestamp_nanos",
|
992
|
+
value.class()
|
993
|
+
)))
|
994
|
+
}
|
995
|
+
|
996
|
+
fn convert_to_decimal128(
|
997
|
+
&self,
|
998
|
+
value: Value,
|
999
|
+
precision: u8,
|
1000
|
+
scale: i8,
|
1001
|
+
) -> Result<ParquetValue> {
|
1002
|
+
if value.is_nil() {
|
1003
|
+
return Ok(ParquetValue::Null);
|
1004
|
+
}
|
1005
|
+
|
1006
|
+
// For BigDecimal, use to_s("F") to get non-scientific notation
|
1007
|
+
let str_val: String = if value.class().to_string() == "BigDecimal" {
|
1008
|
+
value
|
1009
|
+
.funcall("to_s", ("F",))
|
1010
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
1011
|
+
} else {
|
1012
|
+
value
|
1013
|
+
.funcall("to_s", ())
|
1014
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
1015
|
+
};
|
1016
|
+
|
1017
|
+
self.parse_decimal128(&str_val, precision, scale)
|
1018
|
+
}
|
1019
|
+
|
1020
|
+
fn convert_to_decimal256(
|
1021
|
+
&self,
|
1022
|
+
value: Value,
|
1023
|
+
precision: u8,
|
1024
|
+
scale: i8,
|
1025
|
+
) -> Result<ParquetValue> {
|
1026
|
+
if value.is_nil() {
|
1027
|
+
return Ok(ParquetValue::Null);
|
1028
|
+
}
|
1029
|
+
|
1030
|
+
// For BigDecimal, use to_s("F") to get non-scientific notation
|
1031
|
+
let str_val: String = if value.class().to_string() == "BigDecimal" {
|
1032
|
+
value
|
1033
|
+
.funcall("to_s", ("F",))
|
1034
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
1035
|
+
} else {
|
1036
|
+
value
|
1037
|
+
.funcall("to_s", ())
|
1038
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
1039
|
+
};
|
1040
|
+
|
1041
|
+
self.parse_decimal256(&str_val, precision, scale)
|
1042
|
+
}
|
1043
|
+
|
1044
|
+
fn convert_to_fixed_len_byte_array(&self, value: Value, len: i32) -> Result<ParquetValue> {
|
1045
|
+
if value.is_nil() {
|
1046
|
+
return Ok(ParquetValue::Null);
|
1047
|
+
}
|
1048
|
+
|
1049
|
+
let ruby = Ruby::get()
|
1050
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
1051
|
+
let bytes = if value.is_kind_of(ruby.class_string()) {
|
1052
|
+
let s: RString = TryConvert::try_convert(value)
|
1053
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
1054
|
+
unsafe { s.as_slice() }.to_vec()
|
1055
|
+
} else {
|
1056
|
+
let s: String = TryConvert::try_convert(value)
|
1057
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
1058
|
+
s.into_bytes()
|
1059
|
+
};
|
1060
|
+
|
1061
|
+
if bytes.len() != len as usize {
|
1062
|
+
return Err(ParquetError::Conversion(format!(
|
1063
|
+
"Expected {} bytes, got {}",
|
1064
|
+
len,
|
1065
|
+
bytes.len()
|
1066
|
+
)));
|
1067
|
+
}
|
1068
|
+
|
1069
|
+
Ok(ParquetValue::Bytes(bytes.into()))
|
1070
|
+
}
|
1071
|
+
|
1072
|
+
// Helper methods
|
1073
|
+
|
1074
|
+
fn convert_numeric<T>(&self, value: Value) -> Result<T>
|
1075
|
+
where
|
1076
|
+
T: TryConvert + std::str::FromStr,
|
1077
|
+
<T as std::str::FromStr>::Err: std::fmt::Display,
|
1078
|
+
{
|
1079
|
+
// Try direct conversion first
|
1080
|
+
if let Ok(val) = TryConvert::try_convert(value) {
|
1081
|
+
return Ok(val);
|
1082
|
+
}
|
1083
|
+
|
1084
|
+
// If that fails, try converting to i64/f64 first, then to target type
|
1085
|
+
let ruby = Ruby::get()
|
1086
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
1087
|
+
if value.is_kind_of(ruby.class_integer()) {
|
1088
|
+
// Convert Integer to i64 first, then to target type
|
1089
|
+
let i: i64 = TryConvert::try_convert(value)
|
1090
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
1091
|
+
i.to_string().parse::<T>().map_err(|e| {
|
1092
|
+
ParquetError::Conversion(format!("Failed to convert {} to target type: {}", i, e))
|
1093
|
+
})
|
1094
|
+
} else if value.is_kind_of(ruby.class_float()) {
|
1095
|
+
// Convert Float to f64 first, then to target type
|
1096
|
+
let f: f64 = TryConvert::try_convert(value)
|
1097
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
1098
|
+
f.to_string().parse::<T>().map_err(|e| {
|
1099
|
+
ParquetError::Conversion(format!("Failed to convert {} to target type: {}", f, e))
|
1100
|
+
})
|
1101
|
+
} else if value.is_kind_of(ruby.class_string()) {
|
1102
|
+
let s: String = TryConvert::try_convert(value)
|
1103
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
1104
|
+
s.trim().parse::<T>().map_err(|e| {
|
1105
|
+
ParquetError::Conversion(format!("Failed to parse '{}' as numeric: {}", s, e))
|
1106
|
+
})
|
1107
|
+
} else {
|
1108
|
+
Err(ParquetError::Conversion(format!(
|
1109
|
+
"Cannot convert {} to numeric",
|
1110
|
+
value.class()
|
1111
|
+
)))
|
1112
|
+
}
|
1113
|
+
}
|
1114
|
+
|
1115
|
+
fn parse_decimal128(&self, s: &str, _precision: u8, scale: i8) -> Result<ParquetValue> {
|
1116
|
+
// Parse decimal string to i128
|
1117
|
+
let clean = s.trim();
|
1118
|
+
|
1119
|
+
// Handle scientific notation by converting to regular decimal format
|
1120
|
+
let normalized = if clean.to_lowercase().contains('e') {
|
1121
|
+
// Parse as f64 first to handle scientific notation
|
1122
|
+
let f: f64 = clean.parse().map_err(|e| {
|
1123
|
+
ParquetError::Conversion(format!("Failed to parse scientific notation: {}", e))
|
1124
|
+
})?;
|
1125
|
+
// Convert to string with enough precision
|
1126
|
+
format!("{:.15}", f)
|
1127
|
+
.trim_end_matches('0')
|
1128
|
+
.trim_end_matches('.')
|
1129
|
+
.to_string()
|
1130
|
+
} else {
|
1131
|
+
clean.to_string()
|
1132
|
+
};
|
1133
|
+
|
1134
|
+
let is_negative = normalized.starts_with('-');
|
1135
|
+
let clean_abs = normalized.trim_start_matches('-').trim_start_matches('+');
|
1136
|
+
|
1137
|
+
let parts: Vec<&str> = clean_abs.split('.').collect();
|
1138
|
+
|
1139
|
+
if parts.len() > 2 {
|
1140
|
+
return Err(ParquetError::Conversion(
|
1141
|
+
"Invalid decimal format".to_string(),
|
1142
|
+
));
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
let integer_part = if parts.is_empty() || parts[0].is_empty() {
|
1146
|
+
"0"
|
1147
|
+
} else {
|
1148
|
+
parts[0]
|
1149
|
+
};
|
1150
|
+
let fractional_part = if parts.len() == 2 { parts[1] } else { "" };
|
1151
|
+
|
1152
|
+
// Calculate the actual value considering the scale
|
1153
|
+
let current_scale = fractional_part.len() as i8;
|
1154
|
+
|
1155
|
+
if scale < 0 {
|
1156
|
+
return Err(ParquetError::Conversion(
|
1157
|
+
"Negative scale not supported".to_string(),
|
1158
|
+
));
|
1159
|
+
}
|
1160
|
+
|
1161
|
+
// Parse integer and fractional parts
|
1162
|
+
let integer_value: i128 = integer_part.parse().map_err(|e| {
|
1163
|
+
ParquetError::Conversion(format!("Failed to parse integer part: {}", e))
|
1164
|
+
})?;
|
1165
|
+
|
1166
|
+
let fractional_value: i128 = if fractional_part.is_empty() {
|
1167
|
+
0
|
1168
|
+
} else {
|
1169
|
+
fractional_part.parse().map_err(|e| {
|
1170
|
+
ParquetError::Conversion(format!("Failed to parse fractional part: {}", e))
|
1171
|
+
})?
|
1172
|
+
};
|
1173
|
+
|
1174
|
+
// Calculate the final value based on scale
|
1175
|
+
let scale_factor = 10_i128.pow(scale as u32);
|
1176
|
+
let current_scale_factor = 10_i128.pow(current_scale as u32);
|
1177
|
+
|
1178
|
+
let mut value = if current_scale <= scale {
|
1179
|
+
// Current scale is less than or equal to target scale - pad with zeros
|
1180
|
+
integer_value * scale_factor + fractional_value * (scale_factor / current_scale_factor)
|
1181
|
+
} else {
|
1182
|
+
// Current scale is greater than target scale - need to truncate/round
|
1183
|
+
let adjustment_factor = 10_i128.pow((current_scale - scale) as u32);
|
1184
|
+
let adjusted_fractional = fractional_value / adjustment_factor;
|
1185
|
+
integer_value * scale_factor + adjusted_fractional
|
1186
|
+
};
|
1187
|
+
|
1188
|
+
if is_negative {
|
1189
|
+
value = -value;
|
1190
|
+
}
|
1191
|
+
|
1192
|
+
Ok(ParquetValue::Decimal128(value, scale))
|
1193
|
+
}
|
1194
|
+
|
1195
|
+
fn parse_decimal256(&self, s: &str, _precision: u8, scale: i8) -> Result<ParquetValue> {
|
1196
|
+
// Parse decimal string to BigInt
|
1197
|
+
use num::{BigInt, Zero};
|
1198
|
+
|
1199
|
+
let clean = s.trim();
|
1200
|
+
|
1201
|
+
// Handle scientific notation by converting to regular decimal format
|
1202
|
+
let normalized = if clean.to_lowercase().contains('e') {
|
1203
|
+
// Parse as f64 first to handle scientific notation
|
1204
|
+
let f: f64 = clean.parse().map_err(|e| {
|
1205
|
+
ParquetError::Conversion(format!("Failed to parse scientific notation: {}", e))
|
1206
|
+
})?;
|
1207
|
+
// Convert to string with enough precision
|
1208
|
+
format!("{:.15}", f)
|
1209
|
+
.trim_end_matches('0')
|
1210
|
+
.trim_end_matches('.')
|
1211
|
+
.to_string()
|
1212
|
+
} else {
|
1213
|
+
clean.to_string()
|
1214
|
+
};
|
1215
|
+
|
1216
|
+
let is_negative = normalized.starts_with('-');
|
1217
|
+
let clean_abs = normalized.trim_start_matches('-').trim_start_matches('+');
|
1218
|
+
|
1219
|
+
let parts: Vec<&str> = clean_abs.split('.').collect();
|
1220
|
+
|
1221
|
+
if parts.len() > 2 {
|
1222
|
+
return Err(ParquetError::Conversion(
|
1223
|
+
"Invalid decimal format".to_string(),
|
1224
|
+
));
|
1225
|
+
}
|
1226
|
+
|
1227
|
+
let integer_part = if parts.is_empty() || parts[0].is_empty() {
|
1228
|
+
"0"
|
1229
|
+
} else {
|
1230
|
+
parts[0]
|
1231
|
+
};
|
1232
|
+
let fractional_part = if parts.len() == 2 { parts[1] } else { "" };
|
1233
|
+
|
1234
|
+
// Calculate the actual value considering the scale
|
1235
|
+
let current_scale = fractional_part.len() as i8;
|
1236
|
+
|
1237
|
+
if scale < 0 {
|
1238
|
+
return Err(ParquetError::Conversion(
|
1239
|
+
"Negative scale not supported".to_string(),
|
1240
|
+
));
|
1241
|
+
}
|
1242
|
+
|
1243
|
+
// Parse integer and fractional parts
|
1244
|
+
let integer_value: BigInt = integer_part.parse().map_err(|e| {
|
1245
|
+
ParquetError::Conversion(format!("Failed to parse integer part: {}", e))
|
1246
|
+
})?;
|
1247
|
+
|
1248
|
+
let fractional_value: BigInt = if fractional_part.is_empty() {
|
1249
|
+
BigInt::zero()
|
1250
|
+
} else {
|
1251
|
+
fractional_part.parse().map_err(|e| {
|
1252
|
+
ParquetError::Conversion(format!("Failed to parse fractional part: {}", e))
|
1253
|
+
})?
|
1254
|
+
};
|
1255
|
+
|
1256
|
+
// Calculate the final value based on scale
|
1257
|
+
let scale_factor = BigInt::from(10).pow(scale as u32);
|
1258
|
+
let current_scale_factor = BigInt::from(10).pow(current_scale as u32);
|
1259
|
+
|
1260
|
+
let mut value = if current_scale <= scale {
|
1261
|
+
// Current scale is less than or equal to target scale - pad with zeros
|
1262
|
+
integer_value * &scale_factor + fractional_value * (scale_factor / current_scale_factor)
|
1263
|
+
} else {
|
1264
|
+
// Current scale is greater than target scale - need to truncate/round
|
1265
|
+
let adjustment_factor = BigInt::from(10).pow((current_scale - scale) as u32);
|
1266
|
+
let adjusted_fractional = fractional_value / adjustment_factor;
|
1267
|
+
integer_value * &scale_factor + adjusted_fractional
|
1268
|
+
};
|
1269
|
+
|
1270
|
+
if is_negative {
|
1271
|
+
value = -value;
|
1272
|
+
}
|
1273
|
+
|
1274
|
+
Ok(ParquetValue::Decimal256(value, scale))
|
1275
|
+
}
|
1276
|
+
|
1277
|
+
/// Convert a Ruby array to a ParquetValue::List
|
1278
|
+
fn convert_to_list(
|
1279
|
+
&mut self,
|
1280
|
+
value: Value,
|
1281
|
+
item_schema: &parquet_core::SchemaNode,
|
1282
|
+
) -> Result<ParquetValue> {
|
1283
|
+
if value.is_nil() {
|
1284
|
+
return Ok(ParquetValue::Null);
|
1285
|
+
}
|
1286
|
+
|
1287
|
+
let array: RArray = TryConvert::try_convert(value).map_err(|e: MagnusError| {
|
1288
|
+
ParquetError::Conversion(format!("Expected Array for List type: {}", e))
|
1289
|
+
})?;
|
1290
|
+
|
1291
|
+
let mut list = Vec::with_capacity(array.len());
|
1292
|
+
for item in array.into_iter() {
|
1293
|
+
list.push(self.convert_with_schema_hint(item, item_schema)?);
|
1294
|
+
}
|
1295
|
+
|
1296
|
+
Ok(ParquetValue::List(list))
|
1297
|
+
}
|
1298
|
+
|
1299
|
+
/// Convert a Ruby hash to a ParquetValue::Map
|
1300
|
+
fn convert_to_map(
|
1301
|
+
&mut self,
|
1302
|
+
value: Value,
|
1303
|
+
key_schema: &parquet_core::SchemaNode,
|
1304
|
+
value_schema: &parquet_core::SchemaNode,
|
1305
|
+
) -> Result<ParquetValue> {
|
1306
|
+
if value.is_nil() {
|
1307
|
+
return Ok(ParquetValue::Null);
|
1308
|
+
}
|
1309
|
+
|
1310
|
+
let hash: RHash = TryConvert::try_convert(value).map_err(|e: MagnusError| {
|
1311
|
+
ParquetError::Conversion(format!("Expected Hash for Map type: {}", e))
|
1312
|
+
})?;
|
1313
|
+
|
1314
|
+
// Collect key-value pairs first
|
1315
|
+
let mut kv_pairs = Vec::new();
|
1316
|
+
hash.foreach(|k: Value, v: Value| {
|
1317
|
+
kv_pairs.push((k, v));
|
1318
|
+
Ok(ForEach::Continue)
|
1319
|
+
})
|
1320
|
+
.map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
|
1321
|
+
|
1322
|
+
// Now convert them with mutable self
|
1323
|
+
let mut map = Vec::new();
|
1324
|
+
for (k, v) in kv_pairs {
|
1325
|
+
let key = self.convert_with_schema_hint(k, key_schema)?;
|
1326
|
+
let val = self.convert_with_schema_hint(v, value_schema)?;
|
1327
|
+
map.push((key, val));
|
1328
|
+
}
|
1329
|
+
|
1330
|
+
Ok(ParquetValue::Map(map))
|
1331
|
+
}
|
1332
|
+
|
1333
|
+
/// Convert a Ruby hash to a ParquetValue::Record (struct)
|
1334
|
+
fn convert_to_struct(
|
1335
|
+
&mut self,
|
1336
|
+
value: Value,
|
1337
|
+
fields: &[parquet_core::SchemaNode],
|
1338
|
+
) -> Result<ParquetValue> {
|
1339
|
+
if value.is_nil() {
|
1340
|
+
return Ok(ParquetValue::Null);
|
1341
|
+
}
|
1342
|
+
|
1343
|
+
let hash: RHash = TryConvert::try_convert(value).map_err(|e: MagnusError| {
|
1344
|
+
ParquetError::Conversion(format!("Expected Hash for Struct type: {}", e))
|
1345
|
+
})?;
|
1346
|
+
|
1347
|
+
let mut record = IndexMap::new();
|
1348
|
+
|
1349
|
+
for field in fields {
|
1350
|
+
let field_name = field.name();
|
1351
|
+
let ruby_key = Symbol::new(field_name);
|
1352
|
+
|
1353
|
+
// Try symbol key first, then string key
|
1354
|
+
let field_value = if let Some(val) = hash.get(ruby_key) {
|
1355
|
+
val
|
1356
|
+
} else if let Some(val) = hash.get(field_name) {
|
1357
|
+
val
|
1358
|
+
} else {
|
1359
|
+
// Field not found, use null
|
1360
|
+
Ruby::get()
|
1361
|
+
.map_err(|_| {
|
1362
|
+
ParquetError::Conversion("Failed to get Ruby runtime".to_string())
|
1363
|
+
})?
|
1364
|
+
.qnil()
|
1365
|
+
.as_value()
|
1366
|
+
};
|
1367
|
+
|
1368
|
+
let converted = self.convert_with_schema_hint(field_value, field)?;
|
1369
|
+
record.insert(field_name.into(), converted);
|
1370
|
+
}
|
1371
|
+
|
1372
|
+
Ok(ParquetValue::Record(record))
|
1373
|
+
}
|
1374
|
+
}
|
1375
|
+
|
1376
|
+
// Helper functions for one-off conversions where we don't need string caching
|
1377
|
+
|
1378
|
+
pub fn ruby_to_parquet(value: Value) -> Result<ParquetValue> {
|
1379
|
+
let mut converter = RubyValueConverter::new();
|
1380
|
+
converter.infer_and_convert(value)
|
1381
|
+
}
|
1382
|
+
|
1383
|
+
pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
|
1384
|
+
let ruby = Ruby::get()
|
1385
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
1386
|
+
|
1387
|
+
match value {
|
1388
|
+
ParquetValue::Null => Ok(ruby.qnil().as_value()),
|
1389
|
+
ParquetValue::Boolean(b) => Ok(b.into_value_with(&ruby)),
|
1390
|
+
ParquetValue::Int8(i) => Ok((i as i64).into_value_with(&ruby)),
|
1391
|
+
ParquetValue::Int16(i) => Ok((i as i64).into_value_with(&ruby)),
|
1392
|
+
ParquetValue::Int32(i) => Ok((i as i64).into_value_with(&ruby)),
|
1393
|
+
ParquetValue::Int64(i) => Ok(i.into_value_with(&ruby)),
|
1394
|
+
ParquetValue::UInt8(i) => Ok((i as u64).into_value_with(&ruby)),
|
1395
|
+
ParquetValue::UInt16(i) => Ok((i as u64).into_value_with(&ruby)),
|
1396
|
+
ParquetValue::UInt32(i) => Ok((i as u64).into_value_with(&ruby)),
|
1397
|
+
ParquetValue::UInt64(i) => Ok(i.into_value_with(&ruby)),
|
1398
|
+
ParquetValue::Float16(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
|
1399
|
+
ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
|
1400
|
+
ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
|
1401
|
+
ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
|
1402
|
+
ParquetValue::Bytes(b) => {
|
1403
|
+
// Check if this is a UUID (16 bytes)
|
1404
|
+
if b.len() == 16 {
|
1405
|
+
// Format as UUID string
|
1406
|
+
let uuid_str = format!(
|
1407
|
+
"{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
|
1408
|
+
b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
|
1409
|
+
b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
|
1410
|
+
);
|
1411
|
+
Ok(uuid_str.into_value_with(&ruby))
|
1412
|
+
} else {
|
1413
|
+
// Regular bytes - convert to string
|
1414
|
+
Ok(ruby.str_from_slice(&b).as_value())
|
1415
|
+
}
|
1416
|
+
}
|
1417
|
+
ParquetValue::Date32(days) => {
|
1418
|
+
// Convert days since epoch to Date object
|
1419
|
+
let _ = ruby.require("date");
|
1420
|
+
let kernel = ruby.module_kernel();
|
1421
|
+
let date_class = kernel
|
1422
|
+
.const_get::<_, Value>("Date")
|
1423
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1424
|
+
let secs = days as i64 * 86400;
|
1425
|
+
let time_class = ruby.class_time();
|
1426
|
+
let time = time_class
|
1427
|
+
.funcall::<_, _, Value>("at", (secs,))
|
1428
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?
|
1429
|
+
.funcall::<_, _, Value>("utc", ())
|
1430
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1431
|
+
let year: i32 = time
|
1432
|
+
.funcall("year", ())
|
1433
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1434
|
+
let month: i32 = time
|
1435
|
+
.funcall("month", ())
|
1436
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1437
|
+
let day: i32 = time
|
1438
|
+
.funcall("day", ())
|
1439
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1440
|
+
date_class
|
1441
|
+
.funcall("new", (year, month, day))
|
1442
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1443
|
+
}
|
1444
|
+
ParquetValue::Date64(millis) => {
|
1445
|
+
// Convert millis to Time object
|
1446
|
+
let time_class = ruby.class_time();
|
1447
|
+
let secs = millis / 1000;
|
1448
|
+
let nsec = (millis % 1000) * 1_000_000;
|
1449
|
+
time_class
|
1450
|
+
.funcall("at", (secs, nsec))
|
1451
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1452
|
+
}
|
1453
|
+
ParquetValue::TimeMillis(millis) => {
|
1454
|
+
// Convert to Time object for today with given time
|
1455
|
+
let time_class = ruby.class_time();
|
1456
|
+
let hours = millis / (3600 * 1000);
|
1457
|
+
let minutes = (millis % (3600 * 1000)) / (60 * 1000);
|
1458
|
+
let seconds = (millis % (60 * 1000)) / 1000;
|
1459
|
+
let ms = millis % 1000;
|
1460
|
+
|
1461
|
+
let now: Value = time_class
|
1462
|
+
.funcall("now", ())
|
1463
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1464
|
+
let year: i32 = now
|
1465
|
+
.funcall("year", ())
|
1466
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1467
|
+
let month: i32 = now
|
1468
|
+
.funcall("month", ())
|
1469
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1470
|
+
let day: i32 = now
|
1471
|
+
.funcall("day", ())
|
1472
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1473
|
+
|
1474
|
+
time_class
|
1475
|
+
.funcall(
|
1476
|
+
"utc",
|
1477
|
+
(year, month, day, hours, minutes, seconds, ms * 1000),
|
1478
|
+
)
|
1479
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1480
|
+
}
|
1481
|
+
ParquetValue::TimeMicros(micros) => {
|
1482
|
+
// Similar to TimeMillis but with microsecond precision
|
1483
|
+
let time_class = ruby.class_time();
|
1484
|
+
let hours = micros / (3600 * 1_000_000);
|
1485
|
+
let minutes = (micros % (3600 * 1_000_000)) / (60 * 1_000_000);
|
1486
|
+
let seconds = (micros % (60 * 1_000_000)) / 1_000_000;
|
1487
|
+
let us = micros % 1_000_000;
|
1488
|
+
|
1489
|
+
let now: Value = time_class
|
1490
|
+
.funcall("now", ())
|
1491
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1492
|
+
let year: i32 = now
|
1493
|
+
.funcall("year", ())
|
1494
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1495
|
+
let month: i32 = now
|
1496
|
+
.funcall("month", ())
|
1497
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1498
|
+
let day: i32 = now
|
1499
|
+
.funcall("day", ())
|
1500
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1501
|
+
|
1502
|
+
time_class
|
1503
|
+
.funcall("utc", (year, month, day, hours, minutes, seconds, us))
|
1504
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1505
|
+
}
|
1506
|
+
ParquetValue::TimestampSecond(secs, tz) => {
|
1507
|
+
let time_class = ruby.class_time();
|
1508
|
+
let time = time_class
|
1509
|
+
.funcall::<_, _, Value>("at", (secs,))
|
1510
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1511
|
+
apply_timezone(time, &tz)
|
1512
|
+
}
|
1513
|
+
ParquetValue::TimestampMillis(millis, tz) => {
|
1514
|
+
let time_class = ruby.class_time();
|
1515
|
+
let secs = millis / 1000;
|
1516
|
+
let usec = (millis % 1000) * 1000; // Convert millisecond remainder to microseconds
|
1517
|
+
let time = time_class
|
1518
|
+
.funcall::<_, _, Value>("at", (secs, usec))
|
1519
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1520
|
+
apply_timezone(time, &tz)
|
1521
|
+
}
|
1522
|
+
ParquetValue::TimestampMicros(micros, tz) => {
|
1523
|
+
let time_class = ruby.class_time();
|
1524
|
+
let secs = micros / 1_000_000;
|
1525
|
+
let usec = micros % 1_000_000; // Already in microseconds
|
1526
|
+
let time = time_class
|
1527
|
+
.funcall::<_, _, Value>("at", (secs, usec))
|
1528
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1529
|
+
apply_timezone(time, &tz)
|
1530
|
+
}
|
1531
|
+
ParquetValue::TimestampNanos(nanos, tz) => {
|
1532
|
+
let time_class = ruby.class_time();
|
1533
|
+
let secs = nanos / 1_000_000_000;
|
1534
|
+
let nsec = nanos % 1_000_000_000;
|
1535
|
+
// Use the nanosecond form of Time.at
|
1536
|
+
let time = time_class
|
1537
|
+
.funcall::<_, _, Value>("at", (secs, nsec, Symbol::new("nanosecond")))
|
1538
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1539
|
+
apply_timezone(time, &tz)
|
1540
|
+
}
|
1541
|
+
ParquetValue::Decimal128(val, scale) => {
|
1542
|
+
// Load BigDecimal if needed
|
1543
|
+
let _ = ruby.require("bigdecimal");
|
1544
|
+
|
1545
|
+
// Format decimal with scale
|
1546
|
+
let str_val = format_decimal128(val, scale);
|
1547
|
+
let kernel = ruby.module_kernel();
|
1548
|
+
kernel
|
1549
|
+
.funcall("BigDecimal", (str_val,))
|
1550
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1551
|
+
}
|
1552
|
+
ParquetValue::Decimal256(val, scale) => {
|
1553
|
+
// Load BigDecimal if needed
|
1554
|
+
let _ = ruby.require("bigdecimal");
|
1555
|
+
|
1556
|
+
// Format decimal with scale
|
1557
|
+
let str_val = format_decimal256(&val, scale);
|
1558
|
+
let kernel = ruby.module_kernel();
|
1559
|
+
kernel
|
1560
|
+
.funcall("BigDecimal", (str_val,))
|
1561
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1562
|
+
}
|
1563
|
+
ParquetValue::List(list) => {
|
1564
|
+
let array = ruby.ary_new_capa(list.len());
|
1565
|
+
for item in list {
|
1566
|
+
let ruby_val = parquet_to_ruby(item)?;
|
1567
|
+
array
|
1568
|
+
.push(ruby_val)
|
1569
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1570
|
+
}
|
1571
|
+
Ok(array.as_value())
|
1572
|
+
}
|
1573
|
+
ParquetValue::Map(map) => {
|
1574
|
+
let hash = ruby.hash_new();
|
1575
|
+
for (k, v) in map {
|
1576
|
+
let ruby_key = parquet_to_ruby(k)?;
|
1577
|
+
let ruby_val = parquet_to_ruby(v)?;
|
1578
|
+
hash.aset(ruby_key, ruby_val)
|
1579
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1580
|
+
}
|
1581
|
+
Ok(hash.as_value())
|
1582
|
+
}
|
1583
|
+
ParquetValue::Record(record) => {
|
1584
|
+
// Convert Record to Ruby Hash
|
1585
|
+
let hash = ruby.hash_new();
|
1586
|
+
for (field_name, field_value) in record {
|
1587
|
+
let ruby_key = ruby.str_new(&field_name);
|
1588
|
+
let ruby_val = parquet_to_ruby(field_value)?;
|
1589
|
+
hash.aset(ruby_key, ruby_val)
|
1590
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))?;
|
1591
|
+
}
|
1592
|
+
Ok(hash.as_value())
|
1593
|
+
}
|
1594
|
+
}
|
1595
|
+
}
|
1596
|
+
|
1597
|
+
// Helper functions for decimal formatting
|
1598
|
+
|
1599
|
+
/// Renders an unscaled 128-bit decimal as a plain decimal string.
///
/// The represented number is `value * 10^(-scale)`:
/// - `scale == 0`: the integer itself, e.g. `(42, 0)` -> `"42"`.
/// - `scale > 0`: exactly `scale` fractional digits, zero-padded,
///   e.g. `(-5, 2)` -> `"-0.05"`.
/// - `scale < 0`: the integer followed by `-scale` zeros,
///   e.g. `(5, -3)` -> `"5000"`.
///
/// The result is built from the digit string instead of integer arithmetic,
/// so no `(value, scale)` combination can overflow: `i128::MIN`, scales above
/// 38 and `i8::MIN` are all handled.
fn format_decimal128(value: i128, scale: i8) -> String {
    if scale == 0 {
        return value.to_string();
    }

    let sign = if value < 0 { "-" } else { "" };
    // `unsigned_abs` is total; `abs` overflows on `i128::MIN`.
    let digits = value.unsigned_abs().to_string();
    // Same reasoning for the scale: `-scale` overflows on `i8::MIN`.
    let shift = usize::from(scale.unsigned_abs());

    if scale > 0 {
        // Left-pad so at least one digit stays in front of the decimal point.
        let padded = format!("{:0>width$}", digits, width = shift + 1);
        let (integer_part, fractional_part) = padded.split_at(padded.len() - shift);
        format!("{}{}.{}", sign, integer_part, fractional_part)
    } else if value == 0 {
        // Zero times any power of ten is still a bare "0".
        "0".to_string()
    } else {
        // Negative scale means multiply by 10^(-scale): append that many zeros.
        format!("{}{}{}", sign, digits, "0".repeat(shift))
    }
}
|
1624
|
+
|
1625
|
+
fn format_decimal256(value: &num::BigInt, scale: i8) -> String {
|
1626
|
+
use num::{BigInt, Signed};
|
1627
|
+
|
1628
|
+
if scale == 0 {
|
1629
|
+
return value.to_string();
|
1630
|
+
}
|
1631
|
+
|
1632
|
+
let abs_value = value.abs();
|
1633
|
+
let sign = if value.is_negative() { "-" } else { "" };
|
1634
|
+
|
1635
|
+
if scale > 0 {
|
1636
|
+
let ten = BigInt::from(10);
|
1637
|
+
let divisor = ten.pow(scale as u32);
|
1638
|
+
let integer_part = &abs_value / &divisor;
|
1639
|
+
let fractional_part = &abs_value % &divisor;
|
1640
|
+
|
1641
|
+
// Format fractional part with leading zeros
|
1642
|
+
let frac_str = fractional_part.to_string();
|
1643
|
+
let padding = scale as usize - frac_str.len();
|
1644
|
+
let zeros = "0".repeat(padding);
|
1645
|
+
|
1646
|
+
format!("{}{}.{}{}", sign, integer_part, zeros, frac_str)
|
1647
|
+
} else {
|
1648
|
+
// Negative scale means multiply by 10^(-scale)
|
1649
|
+
let ten = BigInt::from(10);
|
1650
|
+
let multiplier = ten.pow((-scale) as u32);
|
1651
|
+
format!("{}{}", sign, abs_value * multiplier)
|
1652
|
+
}
|
1653
|
+
}
|
1654
|
+
|
1655
|
+
/// Apply timezone when reading timestamp from Parquet file
|
1656
|
+
///
|
1657
|
+
/// PARQUET SPEC COMPLIANCE:
|
1658
|
+
/// - If schema has ANY timezone -> values are UTC (isAdjustedToUTC = true)
|
1659
|
+
/// - If schema has NO timezone -> values are local/unzoned (isAdjustedToUTC = false)
|
1660
|
+
///
|
1661
|
+
/// NOTE: The actual timezone string in the schema is irrelevant for reading.
|
1662
|
+
/// Whether it's "UTC", "+09:00", or "America/New_York", the stored values
|
1663
|
+
/// are ALWAYS UTC-normalized. We return them as UTC Time objects.
|
1664
|
+
fn apply_timezone(time: Value, tz: &Option<std::sync::Arc<str>>) -> Result<Value> {
|
1665
|
+
let _ruby = Ruby::get()
|
1666
|
+
.map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
|
1667
|
+
|
1668
|
+
match tz {
|
1669
|
+
Some(_) => {
|
1670
|
+
// ANY timezone = UTC storage (Parquet spec requirement)
|
1671
|
+
// Original timezone like "+09:00" is NOT preserved
|
1672
|
+
time.funcall("utc", ())
|
1673
|
+
.map_err(|e| ParquetError::Conversion(e.to_string()))
|
1674
|
+
}
|
1675
|
+
None => {
|
1676
|
+
// No timezone = local/unzoned timestamp
|
1677
|
+
// This is a "wall clock" time without timezone context
|
1678
|
+
Ok(time)
|
1679
|
+
}
|
1680
|
+
}
|
1681
|
+
}
|
1682
|
+
|
1683
|
+
// Note: These wrapper functions are needed because ValueConverter is not thread-safe
|
1684
|
+
// due to Ruby's GIL requirements. They are called from Ruby FFI functions where we know
|
1685
|
+
// we're in the correct thread context.
|