parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,1734 @@
1
+ use crate::string_cache::StringCache;
2
+ use crate::string_storage::StringStorage;
3
+ use bytes::Bytes;
4
+ use indexmap::IndexMap;
5
+ use magnus::r_hash::ForEach;
6
+ use magnus::value::ReprValue;
7
+ use magnus::{
8
+ kwargs, Error as MagnusError, IntoValue, Module, RArray, RHash, RString, Ruby, TryConvert,
9
+ Value,
10
+ };
11
+ use ordered_float::OrderedFloat;
12
+ use parquet_core::{ParquetError, ParquetValue, Result};
13
+ use std::cell::RefCell;
14
+ use triomphe::Arc;
15
+ use uuid::Uuid;
16
+
17
+ /// Ruby value converter
18
+ ///
19
+ /// Note: This converter is not thread-safe due to Ruby's GIL requirements.
20
+ /// It should only be used within Ruby's thread context.
21
+ #[derive(Default)]
22
+ pub struct RubyValueConverter {
23
+ string_cache: RefCell<Option<StringCache>>,
24
+ }
25
+
26
+ impl RubyValueConverter {
27
+ pub fn new() -> Self {
28
+ Self {
29
+ string_cache: RefCell::new(None),
30
+ }
31
+ }
32
+
33
+ pub fn with_string_cache(cache: StringCache) -> Self {
34
+ Self {
35
+ string_cache: RefCell::new(Some(cache)),
36
+ }
37
+ }
38
+
39
+ pub fn string_cache_stats(&self) -> Option<crate::string_cache::CacheStats> {
40
+ self.string_cache
41
+ .borrow()
42
+ .as_ref()
43
+ .map(|cache| cache.stats())
44
+ }
45
+
46
+ /// Convert a Ruby value to ParquetValue with schema hint
47
+ /// This handles both primitive and complex types
48
+ pub fn to_parquet_with_schema_hint(
49
+ &mut self,
50
+ value: Value,
51
+ schema_hint: Option<&parquet_core::SchemaNode>,
52
+ ) -> Result<ParquetValue> {
53
+ // Handle nil values
54
+ if value.is_nil() {
55
+ return Ok(ParquetValue::Null);
56
+ }
57
+
58
+ // If we have a schema hint, use it to guide conversion
59
+ if let Some(schema) = schema_hint {
60
+ return self.convert_with_schema_hint(value, schema);
61
+ }
62
+
63
+ // Otherwise, infer type from Ruby value
64
+ self.infer_and_convert(value)
65
+ }
66
+
67
+ /// Convert with explicit schema hint
68
+ fn convert_with_schema_hint(
69
+ &mut self,
70
+ value: Value,
71
+ schema: &parquet_core::SchemaNode,
72
+ ) -> Result<ParquetValue> {
73
+ use parquet_core::SchemaNode;
74
+
75
+ match schema {
76
+ SchemaNode::Primitive {
77
+ primitive_type,
78
+ format,
79
+ ..
80
+ } => self.convert_with_type_hint_and_format(value, primitive_type, format.as_deref()),
81
+ SchemaNode::List { item, .. } => self.convert_to_list(value, item.as_ref()),
82
+ SchemaNode::Map {
83
+ key, value: val, ..
84
+ } => self.convert_to_map(value, key.as_ref(), val.as_ref()),
85
+ SchemaNode::Struct { fields, .. } => self.convert_to_struct(value, fields),
86
+ }
87
+ }
88
+
89
+ /// Convert with explicit type hint and optional format
90
+ fn convert_with_type_hint_and_format(
91
+ &mut self,
92
+ value: Value,
93
+ type_hint: &parquet_core::PrimitiveType,
94
+ format: Option<&str>,
95
+ ) -> Result<ParquetValue> {
96
+ use parquet_core::PrimitiveType::*;
97
+
98
+ // Special handling for UUID format
99
+ if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
100
+ return self.convert_to_uuid_binary(value);
101
+ }
102
+
103
+ // Handle date types with format
104
+ match type_hint {
105
+ Date32 => return self.convert_to_date32(value, format),
106
+ Date64 => return self.convert_to_date64(value, format),
107
+ _ => {}
108
+ }
109
+
110
+ // Default type hint conversion
111
+ self.convert_with_type_hint(value, type_hint)
112
+ }
113
+
114
+ /// Convert with explicit type hint
115
+ fn convert_with_type_hint(
116
+ &mut self,
117
+ value: Value,
118
+ type_hint: &parquet_core::PrimitiveType,
119
+ ) -> Result<ParquetValue> {
120
+ use parquet_core::PrimitiveType::*;
121
+
122
+ match type_hint {
123
+ Boolean => self.convert_to_boolean(value),
124
+ Int8 => self.convert_to_int8(value),
125
+ Int16 => self.convert_to_int16(value),
126
+ Int32 => self.convert_to_int32(value),
127
+ Int64 => self.convert_to_int64(value),
128
+ UInt8 => self.convert_to_uint8(value),
129
+ UInt16 => self.convert_to_uint16(value),
130
+ UInt32 => self.convert_to_uint32(value),
131
+ UInt64 => self.convert_to_uint64(value),
132
+ Float32 => self.convert_to_float32(value),
133
+ Float64 => self.convert_to_float64(value),
134
+ String => self.convert_to_string(value),
135
+ Binary => self.convert_to_binary(value),
136
+ Date32 => self.convert_to_date32(value, None),
137
+ Date64 => self.convert_to_date64(value, None),
138
+ TimeMillis => self.convert_to_time_millis(value),
139
+ TimeMicros => self.convert_to_time_micros(value),
140
+ TimeNanos => self.convert_to_time_nanos(value),
141
+ TimestampSecond(schema_tz) => {
142
+ self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
143
+ }
144
+ TimestampMillis(schema_tz) => {
145
+ self.convert_to_timestamp_millis_with_tz(value, schema_tz.as_deref())
146
+ }
147
+ TimestampMicros(schema_tz) => {
148
+ self.convert_to_timestamp_micros_with_tz(value, schema_tz.as_deref())
149
+ }
150
+ TimestampNanos(schema_tz) => {
151
+ self.convert_to_timestamp_nanos_with_tz(value, schema_tz.as_deref())
152
+ }
153
+ Decimal128(precision, scale) => self.convert_to_decimal128(value, *precision, *scale),
154
+ Decimal256(precision, scale) => self.convert_to_decimal256(value, *precision, *scale),
155
+ FixedLenByteArray(len) => self.convert_to_fixed_len_byte_array(value, *len),
156
+ }
157
+ }
158
+
159
+ /// Infer type from Ruby value and convert
160
+ fn infer_and_convert(&mut self, value: Value) -> Result<ParquetValue> {
161
+ let class_name = value.class().to_string();
162
+
163
+ match class_name.as_str() {
164
+ "Integer" => {
165
+ let i: i64 = TryConvert::try_convert(value)
166
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
167
+ Ok(ParquetValue::Int64(i))
168
+ }
169
+ "Float" => {
170
+ let f: f64 = TryConvert::try_convert(value)
171
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
172
+ Ok(ParquetValue::Float64(OrderedFloat(f)))
173
+ }
174
+ "String" => {
175
+ let s: String = TryConvert::try_convert(value)
176
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
177
+ Ok(ParquetValue::String(s.into()))
178
+ }
179
+ "TrueClass" | "FalseClass" => {
180
+ let b: bool = TryConvert::try_convert(value)
181
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
182
+ Ok(ParquetValue::Boolean(b))
183
+ }
184
+ "Array" => {
185
+ let array: RArray = TryConvert::try_convert(value)
186
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
187
+ let mut list = Vec::with_capacity(array.len());
188
+
189
+ for item in array.into_iter() {
190
+ list.push(self.infer_and_convert(item)?);
191
+ }
192
+
193
+ Ok(ParquetValue::List(list))
194
+ }
195
+ "Hash" => {
196
+ let hash: RHash = TryConvert::try_convert(value)
197
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
198
+ let mut map = Vec::new();
199
+ let mut conversion_error = None;
200
+
201
+ hash.foreach(|key: Value, val: Value| {
202
+ match (self.infer_and_convert(key), self.infer_and_convert(val)) {
203
+ (Ok(k), Ok(v)) => {
204
+ map.push((k, v));
205
+ Ok(ForEach::Continue)
206
+ }
207
+ (Err(e), _) | (_, Err(e)) => {
208
+ conversion_error = Some(e);
209
+ Ok(ForEach::Stop)
210
+ }
211
+ }
212
+ })
213
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
214
+
215
+ if let Some(err) = conversion_error {
216
+ return Err(err);
217
+ }
218
+
219
+ Ok(ParquetValue::Map(map))
220
+ }
221
+ "Time" => {
222
+ // Convert Ruby Time to timestamp millis
223
+ let millis = value
224
+ .funcall::<_, _, i64>("to_i", ())
225
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
226
+ * 1000
227
+ + value
228
+ .funcall::<_, _, i32>("nsec", ())
229
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
230
+ as i64
231
+ / 1_000_000;
232
+ let tz = self.extract_timezone(value)?;
233
+
234
+ Ok(ParquetValue::TimestampMillis(millis, tz))
235
+ }
236
+ "BigDecimal" => {
237
+ // Convert BigDecimal to Decimal128
238
+ let str_val: String = value
239
+ .funcall("to_s", ("F",))
240
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
241
+ self.parse_decimal128(&str_val, 38, 10) // Default precision and scale
242
+ }
243
+ _ => {
244
+ // Try to convert to string as fallback
245
+ let s: String = value.to_string();
246
+ Ok(ParquetValue::String(s.into()))
247
+ }
248
+ }
249
+ }
250
+
251
+ // Helper methods
252
+
253
+ /// Normalize timestamp for Parquet storage according to Parquet specification:
254
+ /// - WITH timezone in schema: Store as UTC (isAdjustedToUTC = true)
255
+ /// - WITHOUT timezone in schema: Store as local/unzoned time (isAdjustedToUTC = false)
256
+ ///
257
+ /// IMPORTANT: Parquet can ONLY store:
258
+ /// 1. UTC timestamps (when schema has ANY timezone)
259
+ /// 2. Local/unzoned timestamps (when schema has NO timezone)
260
+ ///
261
+ /// Non-UTC timezones like "+09:00" or "America/New_York" are NOT preserved.
262
+ fn normalize_timestamp_for_parquet(
263
+ &self,
264
+ time_value: Value,
265
+ schema_has_timezone: bool,
266
+ ) -> Result<Value> {
267
+ if schema_has_timezone {
268
+ // Schema has timezone -> MUST convert to UTC (Parquet limitation)
269
+ // The original timezone offset is lost - only UTC is stored
270
+ time_value
271
+ .funcall("utc", ())
272
+ .map_err(|e| ParquetError::Conversion(format!("Failed to convert to UTC: {}", e)))
273
+ } else {
274
+ // Schema has no timezone -> keep as local/unzoned time
275
+ // This represents a "wall clock" time without timezone information
276
+ Ok(time_value)
277
+ }
278
+ }
279
+
280
+ /// Extract timezone information from a Ruby Time object
281
+ fn extract_timezone(&self, time_value: Value) -> Result<Option<Arc<str>>> {
282
+ let _ruby = Ruby::get()
283
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
284
+
285
+ // Check if the time is in UTC
286
+ let is_utc: bool = time_value
287
+ .funcall("utc?", ())
288
+ .map_err(|e| ParquetError::Conversion(format!("Failed to check UTC: {}", e)))?;
289
+
290
+ if is_utc {
291
+ return Ok(Some("UTC".into()));
292
+ }
293
+
294
+ // Get the UTC offset in seconds
295
+ let utc_offset: i32 = time_value
296
+ .funcall("utc_offset", ())
297
+ .map_err(|e| ParquetError::Conversion(format!("Failed to get UTC offset: {}", e)))?;
298
+
299
+ // If offset is 0 and not explicitly UTC, it might be local time
300
+ if utc_offset == 0 {
301
+ // Check if this is actually UTC or just happens to have 0 offset
302
+ // We already checked utc? above, so this is local time with 0 offset
303
+ return Ok(None);
304
+ }
305
+
306
+ // Convert offset to hours and minutes
307
+ let hours = utc_offset / 3600;
308
+ let minutes = (utc_offset.abs() % 3600) / 60;
309
+
310
+ // Format as +HH:MM or -HH:MM
311
+ let tz_string = if minutes == 0 {
312
+ format!("{:+03}:00", hours)
313
+ } else {
314
+ format!("{:+03}:{:02}", hours, minutes)
315
+ };
316
+
317
+ Ok(Some(tz_string.into()))
318
+ }
319
+
320
+ // Conversion methods for specific types
321
+
322
+ fn convert_to_boolean(&self, value: Value) -> Result<ParquetValue> {
323
+ if value.is_nil() {
324
+ return Ok(ParquetValue::Null);
325
+ }
326
+
327
+ let b: bool = TryConvert::try_convert(value)
328
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
329
+ Ok(ParquetValue::Boolean(b))
330
+ }
331
+
332
+ fn convert_to_int8(&self, value: Value) -> Result<ParquetValue> {
333
+ if value.is_nil() {
334
+ return Ok(ParquetValue::Null);
335
+ }
336
+
337
+ let i = self.convert_numeric::<i8>(value)?;
338
+ Ok(ParquetValue::Int8(i))
339
+ }
340
+
341
+ fn convert_to_int16(&self, value: Value) -> Result<ParquetValue> {
342
+ if value.is_nil() {
343
+ return Ok(ParquetValue::Null);
344
+ }
345
+
346
+ let i = self.convert_numeric::<i16>(value)?;
347
+ Ok(ParquetValue::Int16(i))
348
+ }
349
+
350
+ fn convert_to_int32(&self, value: Value) -> Result<ParquetValue> {
351
+ if value.is_nil() {
352
+ return Ok(ParquetValue::Null);
353
+ }
354
+
355
+ let i = self.convert_numeric::<i32>(value)?;
356
+ Ok(ParquetValue::Int32(i))
357
+ }
358
+
359
+ fn convert_to_int64(&self, value: Value) -> Result<ParquetValue> {
360
+ if value.is_nil() {
361
+ return Ok(ParquetValue::Null);
362
+ }
363
+
364
+ let i = self.convert_numeric::<i64>(value)?;
365
+ Ok(ParquetValue::Int64(i))
366
+ }
367
+
368
+ fn convert_to_uint8(&self, value: Value) -> Result<ParquetValue> {
369
+ if value.is_nil() {
370
+ return Ok(ParquetValue::Null);
371
+ }
372
+
373
+ let i = self.convert_numeric::<u8>(value)?;
374
+ Ok(ParquetValue::UInt8(i))
375
+ }
376
+
377
+ fn convert_to_uint16(&self, value: Value) -> Result<ParquetValue> {
378
+ if value.is_nil() {
379
+ return Ok(ParquetValue::Null);
380
+ }
381
+
382
+ let i = self.convert_numeric::<u16>(value)?;
383
+ Ok(ParquetValue::UInt16(i))
384
+ }
385
+
386
+ fn convert_to_uint32(&self, value: Value) -> Result<ParquetValue> {
387
+ if value.is_nil() {
388
+ return Ok(ParquetValue::Null);
389
+ }
390
+
391
+ let i = self.convert_numeric::<u32>(value)?;
392
+ Ok(ParquetValue::UInt32(i))
393
+ }
394
+
395
+ fn convert_to_uint64(&self, value: Value) -> Result<ParquetValue> {
396
+ if value.is_nil() {
397
+ return Ok(ParquetValue::Null);
398
+ }
399
+
400
+ let i = self.convert_numeric::<u64>(value)?;
401
+ Ok(ParquetValue::UInt64(i))
402
+ }
403
+
404
+ fn convert_to_float32(&self, value: Value) -> Result<ParquetValue> {
405
+ if value.is_nil() {
406
+ return Ok(ParquetValue::Null);
407
+ }
408
+
409
+ let f = self.convert_numeric::<f32>(value)?;
410
+ Ok(ParquetValue::Float32(OrderedFloat(f)))
411
+ }
412
+
413
+ fn convert_to_float64(&self, value: Value) -> Result<ParquetValue> {
414
+ if value.is_nil() {
415
+ return Ok(ParquetValue::Null);
416
+ }
417
+
418
+ let f = self.convert_numeric::<f64>(value)?;
419
+ Ok(ParquetValue::Float64(OrderedFloat(f)))
420
+ }
421
+
422
+ fn convert_to_string(&mut self, value: Value) -> Result<ParquetValue> {
423
+ if value.is_nil() {
424
+ return Ok(ParquetValue::Null);
425
+ }
426
+
427
+ // Convert any value to string using to_s
428
+ let s: String = value
429
+ .funcall("to_s", ())
430
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
431
+
432
+ // Use shared storage for repeated string values when the writer enabled caching.
433
+ if let Some(ref mut cache) = self.string_cache.borrow_mut().as_mut() {
434
+ let interned = cache.intern(s);
435
+ Ok(ParquetValue::String(interned))
436
+ } else {
437
+ Ok(ParquetValue::String(s.into()))
438
+ }
439
+ }
440
+
441
+ fn convert_to_binary(&self, value: Value) -> Result<ParquetValue> {
442
+ if value.is_nil() {
443
+ return Ok(ParquetValue::Null);
444
+ }
445
+
446
+ let ruby = Ruby::get()
447
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
448
+ if value.is_kind_of(ruby.class_string()) {
449
+ let s: RString = TryConvert::try_convert(value)
450
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
451
+ let bytes = unsafe { Bytes::copy_from_slice(s.as_slice()) };
452
+ Ok(ParquetValue::Bytes(bytes))
453
+ } else {
454
+ // Try to convert to string first
455
+ let s: String = TryConvert::try_convert(value)
456
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
457
+ Ok(ParquetValue::Bytes(s.into()))
458
+ }
459
+ }
460
+
461
+ fn convert_to_uuid_binary(&self, value: Value) -> Result<ParquetValue> {
462
+ if value.is_nil() {
463
+ return Ok(ParquetValue::Null);
464
+ }
465
+
466
+ // Convert value to string
467
+ let uuid_str: String = value
468
+ .to_r_string()
469
+ .map_err(|e: MagnusError| {
470
+ ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
471
+ })?
472
+ .to_string()
473
+ .map_err(|e: MagnusError| {
474
+ ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
475
+ })?;
476
+
477
+ let parsed = uuid::Uuid::parse_str(&uuid_str)
478
+ .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
479
+ let bytes = Bytes::copy_from_slice(parsed.as_bytes());
480
+ Ok(ParquetValue::Bytes(bytes))
481
+ }
482
+
483
+ fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
484
+ if value.is_nil() {
485
+ return Ok(ParquetValue::Null);
486
+ }
487
+
488
+ // Handle Time objects
489
+ let ruby = Ruby::get()
490
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
491
+ if value.is_kind_of(ruby.class_time()) {
492
+ let secs: i64 = value
493
+ .funcall("to_i", ())
494
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
495
+ let days = (secs / 86400) as i32;
496
+ return Ok(ParquetValue::Date32(days));
497
+ }
498
+
499
+ // Handle strings
500
+ if value.is_kind_of(ruby.class_string()) {
501
+ // Use Ruby's Date module
502
+ let _ = ruby.require("date");
503
+ let kernel = ruby.module_kernel();
504
+ let date_module = kernel
505
+ .const_get::<_, Value>("Date")
506
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
507
+
508
+ // Use strptime if format is provided, otherwise use parse
509
+ let date = if let Some(format) = date_format {
510
+ date_module
511
+ .funcall::<_, _, Value>("strptime", (value, format))
512
+ .map_err(|e| {
513
+ ParquetError::Conversion(format!(
514
+ "Failed to parse date with format '{}': {}",
515
+ format, e
516
+ ))
517
+ })?
518
+ } else {
519
+ date_module
520
+ .funcall::<_, _, Value>("parse", (value,))
521
+ .map_err(|e| ParquetError::Conversion(format!("Failed to parse date: {}", e)))?
522
+ };
523
+
524
+ // Convert to Time object then to days since epoch
525
+ let time = date
526
+ .funcall::<_, _, Value>("to_time", ())
527
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
528
+ let secs: i64 = time
529
+ .funcall("to_i", ())
530
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
531
+ let days = (secs / 86400) as i32;
532
+ return Ok(ParquetValue::Date32(days));
533
+ }
534
+
535
+ Err(ParquetError::Conversion(format!(
536
+ "Cannot convert {} to date32",
537
+ value.class()
538
+ )))
539
+ }
540
+
541
+ fn convert_to_date64(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
542
+ if value.is_nil() {
543
+ return Ok(ParquetValue::Null);
544
+ }
545
+
546
+ // Similar to date32 but returns milliseconds since epoch
547
+ let ruby = Ruby::get()
548
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
549
+ if value.is_kind_of(ruby.class_time()) {
550
+ let millis: i64 = value
551
+ .funcall::<_, _, i64>("to_i", ())
552
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
553
+ * 1000;
554
+ return Ok(ParquetValue::Date64(millis));
555
+ }
556
+
557
+ // Handle strings
558
+ if value.is_kind_of(ruby.class_string()) {
559
+ // Use Ruby's Date module
560
+ let _ = ruby.require("date");
561
+ let kernel = ruby.module_kernel();
562
+ let date_module = kernel
563
+ .const_get::<_, Value>("Date")
564
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
565
+
566
+ // Use strptime if format is provided, otherwise use parse
567
+ let date = if let Some(format) = date_format {
568
+ date_module
569
+ .funcall::<_, _, Value>("strptime", (value, format))
570
+ .map_err(|e| {
571
+ ParquetError::Conversion(format!(
572
+ "Failed to parse date with format '{}': {}",
573
+ format, e
574
+ ))
575
+ })?
576
+ } else {
577
+ date_module
578
+ .funcall::<_, _, Value>("parse", (value,))
579
+ .map_err(|e| ParquetError::Conversion(format!("Failed to parse date: {}", e)))?
580
+ };
581
+
582
+ // Convert to Time object then to milliseconds since epoch
583
+ let time = date
584
+ .funcall::<_, _, Value>("to_time", ())
585
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
586
+ let secs: i64 = time
587
+ .funcall("to_i", ())
588
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
589
+ let millis = secs * 1000;
590
+ return Ok(ParquetValue::Date64(millis));
591
+ }
592
+
593
+ Err(ParquetError::Conversion(format!(
594
+ "Cannot convert {} to date64",
595
+ value.class()
596
+ )))
597
+ }
598
+
599
+ fn convert_to_time_millis(&self, value: Value) -> Result<ParquetValue> {
600
+ if value.is_nil() {
601
+ return Ok(ParquetValue::Null);
602
+ }
603
+
604
+ // Convert to milliseconds since midnight
605
+ let ruby = Ruby::get()
606
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
607
+ if value.is_kind_of(ruby.class_time()) {
608
+ let hour: i32 = value
609
+ .funcall("hour", ())
610
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
611
+ let min: i32 = value
612
+ .funcall("min", ())
613
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
614
+ let sec: i32 = value
615
+ .funcall("sec", ())
616
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
617
+ let nsec: i32 = value
618
+ .funcall("nsec", ())
619
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
620
+
621
+ let millis = (hour * 3600 + min * 60 + sec) * 1000 + nsec / 1_000_000;
622
+ return Ok(ParquetValue::TimeMillis(millis));
623
+ }
624
+
625
+ Err(ParquetError::Conversion(format!(
626
+ "Cannot convert {} to time_millis",
627
+ value.class()
628
+ )))
629
+ }
630
+
631
+ fn convert_to_time_micros(&self, value: Value) -> Result<ParquetValue> {
632
+ if value.is_nil() {
633
+ return Ok(ParquetValue::Null);
634
+ }
635
+
636
+ // Convert to microseconds since midnight
637
+ let ruby = Ruby::get()
638
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
639
+ if value.is_kind_of(ruby.class_time()) {
640
+ let hour: i64 = value
641
+ .funcall("hour", ())
642
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
643
+ let min: i64 = value
644
+ .funcall("min", ())
645
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
646
+ let sec: i64 = value
647
+ .funcall("sec", ())
648
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
649
+ let nsec: i64 = value
650
+ .funcall("nsec", ())
651
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
652
+
653
+ let micros = (hour * 3600 + min * 60 + sec) * 1_000_000 + nsec / 1000;
654
+ return Ok(ParquetValue::TimeMicros(micros));
655
+ }
656
+
657
+ Err(ParquetError::Conversion(format!(
658
+ "Cannot convert {} to time_micros",
659
+ value.class()
660
+ )))
661
+ }
662
+
663
+ fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
664
+ if value.is_nil() {
665
+ return Ok(ParquetValue::Null);
666
+ }
667
+
668
+ // Convert to microseconds since midnight
669
+ let ruby = Ruby::get()
670
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
671
+ if value.is_kind_of(ruby.class_time()) {
672
+ let hour: i64 = value
673
+ .funcall("hour", ())
674
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
675
+ let min: i64 = value
676
+ .funcall("min", ())
677
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
678
+ let sec: i64 = value
679
+ .funcall("sec", ())
680
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
681
+ let nsec: i64 = value
682
+ .funcall("nsec", ())
683
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
684
+
685
+ let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
686
+ return Ok(ParquetValue::TimeNanos(nanos));
687
+ }
688
+
689
+ Err(ParquetError::Conversion(format!(
690
+ "Cannot convert {} to time_micros",
691
+ value.class()
692
+ )))
693
+ }
694
+
695
+ // Timestamp conversion methods that respect schema timezone
696
+ fn convert_to_timestamp_second_with_tz(
697
+ &self,
698
+ value: Value,
699
+ schema_tz: Option<&str>,
700
+ ) -> Result<ParquetValue> {
701
+ if value.is_nil() {
702
+ return Ok(ParquetValue::Null);
703
+ }
704
+
705
+ let ruby = Ruby::get()
706
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
707
+ if value.is_kind_of(ruby.class_time()) {
708
+ // Normalize timestamp according to Parquet spec
709
+ let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
710
+
711
+ let secs: i64 = adjusted_time
712
+ .funcall("to_i", ())
713
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
714
+
715
+ // PARQUET TIMESTAMP STORAGE:
716
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
717
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
718
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
719
+ let tz = if schema_tz.is_some() {
720
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
721
+ } else {
722
+ None // Unzoned/local timestamp
723
+ };
724
+
725
+ return Ok(ParquetValue::TimestampSecond(secs, tz));
726
+ }
727
+
728
+ // Handle strings
729
+ if value.is_kind_of(ruby.class_string()) {
730
+ // Use Ruby's Time.parse to handle timestamp strings
731
+ let time_class = ruby.class_time();
732
+ let time = time_class
733
+ .funcall::<_, _, Value>("parse", (value,))
734
+ .map_err(|e| {
735
+ ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
736
+ })?;
737
+
738
+ // Normalize timestamp according to Parquet spec
739
+ let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
740
+
741
+ let secs: i64 = adjusted_time
742
+ .funcall("to_i", ())
743
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
744
+
745
+ // PARQUET TIMESTAMP STORAGE:
746
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
747
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
748
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
749
+ let tz = if schema_tz.is_some() {
750
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
751
+ } else {
752
+ None // Unzoned/local timestamp
753
+ };
754
+
755
+ return Ok(ParquetValue::TimestampSecond(secs, tz));
756
+ }
757
+
758
+ Err(ParquetError::Conversion(format!(
759
+ "Cannot convert {} to timestamp_second",
760
+ value.class()
761
+ )))
762
+ }
763
+
764
+ fn convert_to_timestamp_millis_with_tz(
765
+ &self,
766
+ value: Value,
767
+ schema_tz: Option<&str>,
768
+ ) -> Result<ParquetValue> {
769
+ if value.is_nil() {
770
+ return Ok(ParquetValue::Null);
771
+ }
772
+
773
+ let ruby = Ruby::get()
774
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
775
+ if value.is_kind_of(ruby.class_time()) {
776
+ // Normalize timestamp according to Parquet spec
777
+ let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
778
+
779
+ let millis = adjusted_time
780
+ .funcall::<_, _, i64>("to_i", ())
781
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
782
+ * 1000
783
+ + adjusted_time
784
+ .funcall::<_, _, i32>("nsec", ())
785
+ .map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
786
+ / 1_000_000;
787
+
788
+ // PARQUET TIMESTAMP STORAGE:
789
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
790
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
791
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
792
+ let tz = if schema_tz.is_some() {
793
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
794
+ } else {
795
+ None // Unzoned/local timestamp
796
+ };
797
+
798
+ return Ok(ParquetValue::TimestampMillis(millis, tz));
799
+ }
800
+
801
+ // Handle strings
802
+ if value.is_kind_of(ruby.class_string()) {
803
+ // Use Ruby's Time.parse to handle timestamp strings
804
+ let time_class = ruby.class_time();
805
+ let time = time_class
806
+ .funcall::<_, _, Value>("parse", (value,))
807
+ .map_err(|e| {
808
+ ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
809
+ })?;
810
+
811
+ // Normalize timestamp according to Parquet spec
812
+ let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
813
+
814
+ let millis = adjusted_time
815
+ .funcall::<_, _, i64>("to_i", ())
816
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
817
+ * 1000
818
+ + adjusted_time
819
+ .funcall::<_, _, i32>("nsec", ())
820
+ .map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
821
+ / 1_000_000;
822
+
823
+ // PARQUET TIMESTAMP STORAGE:
824
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
825
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
826
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
827
+ let tz = if schema_tz.is_some() {
828
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
829
+ } else {
830
+ None // Unzoned/local timestamp
831
+ };
832
+
833
+ return Ok(ParquetValue::TimestampMillis(millis, tz));
834
+ }
835
+
836
+ Err(ParquetError::Conversion(format!(
837
+ "Cannot convert {} to timestamp_millis",
838
+ value.class()
839
+ )))
840
+ }
841
+
842
+ fn convert_to_timestamp_micros_with_tz(
843
+ &self,
844
+ value: Value,
845
+ schema_tz: Option<&str>,
846
+ ) -> Result<ParquetValue> {
847
+ if value.is_nil() {
848
+ return Ok(ParquetValue::Null);
849
+ }
850
+
851
+ let ruby = Ruby::get()
852
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
853
+ if value.is_kind_of(ruby.class_time()) {
854
+ // Normalize timestamp according to Parquet spec
855
+ let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
856
+
857
+ let micros = adjusted_time
858
+ .funcall::<_, _, i64>("to_i", ())
859
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
860
+ * 1_000_000
861
+ + adjusted_time
862
+ .funcall::<_, _, i32>("nsec", ())
863
+ .map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
864
+ / 1000;
865
+
866
+ // PARQUET TIMESTAMP STORAGE:
867
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
868
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
869
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
870
+ let tz = if schema_tz.is_some() {
871
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
872
+ } else {
873
+ None // Unzoned/local timestamp
874
+ };
875
+
876
+ return Ok(ParquetValue::TimestampMicros(micros, tz));
877
+ }
878
+
879
+ // Handle strings
880
+ if value.is_kind_of(ruby.class_string()) {
881
+ // Use Ruby's Time.parse to handle timestamp strings
882
+ let time_class = ruby.class_time();
883
+ let time = time_class
884
+ .funcall::<_, _, Value>("parse", (value,))
885
+ .map_err(|e| {
886
+ ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
887
+ })?;
888
+
889
+ // Normalize timestamp according to Parquet spec
890
+ let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
891
+
892
+ let micros = adjusted_time
893
+ .funcall::<_, _, i64>("to_i", ())
894
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
895
+ * 1_000_000
896
+ + adjusted_time
897
+ .funcall::<_, _, i32>("nsec", ())
898
+ .map_err(|e| ParquetError::Conversion(e.to_string()))? as i64
899
+ / 1000;
900
+
901
+ // PARQUET TIMESTAMP STORAGE:
902
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
903
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
904
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
905
+ let tz = if schema_tz.is_some() {
906
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
907
+ } else {
908
+ None // Unzoned/local timestamp
909
+ };
910
+
911
+ return Ok(ParquetValue::TimestampMicros(micros, tz));
912
+ }
913
+
914
+ Err(ParquetError::Conversion(format!(
915
+ "Cannot convert {} to timestamp_micros",
916
+ value.class()
917
+ )))
918
+ }
919
+
920
+ fn convert_to_timestamp_nanos_with_tz(
921
+ &self,
922
+ value: Value,
923
+ schema_tz: Option<&str>,
924
+ ) -> Result<ParquetValue> {
925
+ if value.is_nil() {
926
+ return Ok(ParquetValue::Null);
927
+ }
928
+
929
+ let ruby = Ruby::get()
930
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
931
+ if value.is_kind_of(ruby.class_time()) {
932
+ // Normalize timestamp according to Parquet spec
933
+ let adjusted_time = self.normalize_timestamp_for_parquet(value, schema_tz.is_some())?;
934
+
935
+ let nanos = adjusted_time
936
+ .funcall::<_, _, i64>("to_i", ())
937
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
938
+ * 1_000_000_000
939
+ + adjusted_time
940
+ .funcall::<_, _, i32>("nsec", ())
941
+ .map_err(|e| ParquetError::Conversion(e.to_string()))? as i64;
942
+
943
+ // PARQUET TIMESTAMP STORAGE:
944
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
945
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
946
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
947
+ let tz = if schema_tz.is_some() {
948
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
949
+ } else {
950
+ None // Unzoned/local timestamp
951
+ };
952
+
953
+ return Ok(ParquetValue::TimestampNanos(nanos, tz));
954
+ }
955
+
956
+ // Handle strings
957
+ if value.is_kind_of(ruby.class_string()) {
958
+ // Use Ruby's Time.parse to handle timestamp strings
959
+ let time_class = ruby.class_time();
960
+ let time = time_class
961
+ .funcall::<_, _, Value>("parse", (value,))
962
+ .map_err(|e| {
963
+ ParquetError::Conversion(format!("Failed to parse timestamp: {}", e))
964
+ })?;
965
+
966
+ // Normalize timestamp according to Parquet spec
967
+ let adjusted_time = self.normalize_timestamp_for_parquet(time, schema_tz.is_some())?;
968
+
969
+ let nanos = adjusted_time
970
+ .funcall::<_, _, i64>("to_i", ())
971
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
972
+ * 1_000_000_000
973
+ + adjusted_time
974
+ .funcall::<_, _, i32>("nsec", ())
975
+ .map_err(|e| ParquetError::Conversion(e.to_string()))? as i64;
976
+
977
+ // PARQUET TIMESTAMP STORAGE:
978
+ // - Schema WITH timezone -> Store as UTC (isAdjustedToUTC = true)
979
+ // - Schema WITHOUT timezone -> Store as unzoned (isAdjustedToUTC = false)
980
+ // NOTE: Original timezone like "+09:00" is converted to "UTC" for storage
981
+ let tz = if schema_tz.is_some() {
982
+ Some(Arc::from("UTC")) // Always UTC, never the original timezone
983
+ } else {
984
+ None // Unzoned/local timestamp
985
+ };
986
+
987
+ return Ok(ParquetValue::TimestampNanos(nanos, tz));
988
+ }
989
+
990
+ Err(ParquetError::Conversion(format!(
991
+ "Cannot convert {} to timestamp_nanos",
992
+ value.class()
993
+ )))
994
+ }
995
+
996
+ fn convert_to_decimal128(
997
+ &self,
998
+ value: Value,
999
+ precision: u8,
1000
+ scale: i8,
1001
+ ) -> Result<ParquetValue> {
1002
+ if value.is_nil() {
1003
+ return Ok(ParquetValue::Null);
1004
+ }
1005
+
1006
+ // For BigDecimal, use to_s("F") to get non-scientific notation
1007
+ let str_val: String = if value.class().to_string() == "BigDecimal" {
1008
+ value
1009
+ .funcall("to_s", ("F",))
1010
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
1011
+ } else {
1012
+ value
1013
+ .funcall("to_s", ())
1014
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
1015
+ };
1016
+
1017
+ self.parse_decimal128(&str_val, precision, scale)
1018
+ }
1019
+
1020
+ fn convert_to_decimal256(
1021
+ &self,
1022
+ value: Value,
1023
+ precision: u8,
1024
+ scale: i8,
1025
+ ) -> Result<ParquetValue> {
1026
+ if value.is_nil() {
1027
+ return Ok(ParquetValue::Null);
1028
+ }
1029
+
1030
+ // For BigDecimal, use to_s("F") to get non-scientific notation
1031
+ let str_val: String = if value.class().to_string() == "BigDecimal" {
1032
+ value
1033
+ .funcall("to_s", ("F",))
1034
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
1035
+ } else {
1036
+ value
1037
+ .funcall("to_s", ())
1038
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
1039
+ };
1040
+
1041
+ self.parse_decimal256(&str_val, precision, scale)
1042
+ }
1043
+
1044
+ fn convert_to_fixed_len_byte_array(&self, value: Value, len: i32) -> Result<ParquetValue> {
1045
+ if value.is_nil() {
1046
+ return Ok(ParquetValue::Null);
1047
+ }
1048
+
1049
+ let ruby = Ruby::get()
1050
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
1051
+ let bytes = if value.is_kind_of(ruby.class_string()) {
1052
+ let s: RString = TryConvert::try_convert(value)
1053
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
1054
+ unsafe { s.as_slice() }.to_vec()
1055
+ } else {
1056
+ let s: String = TryConvert::try_convert(value)
1057
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
1058
+ s.into_bytes()
1059
+ };
1060
+
1061
+ if bytes.len() != len as usize {
1062
+ return Err(ParquetError::Conversion(format!(
1063
+ "Expected {} bytes, got {}",
1064
+ len,
1065
+ bytes.len()
1066
+ )));
1067
+ }
1068
+
1069
+ Ok(ParquetValue::Bytes(bytes.into()))
1070
+ }
1071
+
1072
+ // Helper methods
1073
+
1074
+ fn convert_numeric<T>(&self, value: Value) -> Result<T>
1075
+ where
1076
+ T: TryConvert + std::str::FromStr,
1077
+ <T as std::str::FromStr>::Err: std::fmt::Display,
1078
+ {
1079
+ // Try direct conversion first
1080
+ if let Ok(val) = TryConvert::try_convert(value) {
1081
+ return Ok(val);
1082
+ }
1083
+
1084
+ // If that fails, try converting to i64/f64 first, then to target type
1085
+ let ruby = Ruby::get()
1086
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
1087
+ if value.is_kind_of(ruby.class_integer()) {
1088
+ // Convert Integer to i64 first, then to target type
1089
+ let i: i64 = TryConvert::try_convert(value)
1090
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
1091
+ i.to_string().parse::<T>().map_err(|e| {
1092
+ ParquetError::Conversion(format!("Failed to convert {} to target type: {}", i, e))
1093
+ })
1094
+ } else if value.is_kind_of(ruby.class_float()) {
1095
+ // Convert Float to f64 first, then to target type
1096
+ let f: f64 = TryConvert::try_convert(value)
1097
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
1098
+ f.to_string().parse::<T>().map_err(|e| {
1099
+ ParquetError::Conversion(format!("Failed to convert {} to target type: {}", f, e))
1100
+ })
1101
+ } else if value.is_kind_of(ruby.class_string()) {
1102
+ let s: String = TryConvert::try_convert(value)
1103
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
1104
+ s.trim().parse::<T>().map_err(|e| {
1105
+ ParquetError::Conversion(format!("Failed to parse '{}' as numeric: {}", s, e))
1106
+ })
1107
+ } else {
1108
+ Err(ParquetError::Conversion(format!(
1109
+ "Cannot convert {} to numeric",
1110
+ value.class()
1111
+ )))
1112
+ }
1113
+ }
1114
+
1115
+ fn parse_decimal128(&self, s: &str, _precision: u8, scale: i8) -> Result<ParquetValue> {
1116
+ // Parse decimal string to i128
1117
+ let clean = s.trim();
1118
+
1119
+ // Handle scientific notation by converting to regular decimal format
1120
+ let normalized = if clean.to_lowercase().contains('e') {
1121
+ // Parse as f64 first to handle scientific notation
1122
+ let f: f64 = clean.parse().map_err(|e| {
1123
+ ParquetError::Conversion(format!("Failed to parse scientific notation: {}", e))
1124
+ })?;
1125
+ // Convert to string with enough precision
1126
+ format!("{:.15}", f)
1127
+ .trim_end_matches('0')
1128
+ .trim_end_matches('.')
1129
+ .to_string()
1130
+ } else {
1131
+ clean.to_string()
1132
+ };
1133
+
1134
+ let is_negative = normalized.starts_with('-');
1135
+ let clean_abs = normalized.trim_start_matches('-').trim_start_matches('+');
1136
+
1137
+ let parts: Vec<&str> = clean_abs.split('.').collect();
1138
+
1139
+ if parts.len() > 2 {
1140
+ return Err(ParquetError::Conversion(
1141
+ "Invalid decimal format".to_string(),
1142
+ ));
1143
+ }
1144
+
1145
+ let integer_part = if parts.is_empty() || parts[0].is_empty() {
1146
+ "0"
1147
+ } else {
1148
+ parts[0]
1149
+ };
1150
+ let fractional_part = if parts.len() == 2 { parts[1] } else { "" };
1151
+
1152
+ // Calculate the actual value considering the scale
1153
+ let current_scale = fractional_part.len() as i8;
1154
+
1155
+ if scale < 0 {
1156
+ return Err(ParquetError::Conversion(
1157
+ "Negative scale not supported".to_string(),
1158
+ ));
1159
+ }
1160
+
1161
+ // Parse integer and fractional parts
1162
+ let integer_value: i128 = integer_part.parse().map_err(|e| {
1163
+ ParquetError::Conversion(format!("Failed to parse integer part: {}", e))
1164
+ })?;
1165
+
1166
+ let fractional_value: i128 = if fractional_part.is_empty() {
1167
+ 0
1168
+ } else {
1169
+ fractional_part.parse().map_err(|e| {
1170
+ ParquetError::Conversion(format!("Failed to parse fractional part: {}", e))
1171
+ })?
1172
+ };
1173
+
1174
+ // Calculate the final value based on scale
1175
+ let scale_factor = 10_i128.pow(scale as u32);
1176
+ let current_scale_factor = 10_i128.pow(current_scale as u32);
1177
+
1178
+ let mut value = if current_scale <= scale {
1179
+ // Current scale is less than or equal to target scale - pad with zeros
1180
+ integer_value * scale_factor + fractional_value * (scale_factor / current_scale_factor)
1181
+ } else {
1182
+ // Current scale is greater than target scale - need to truncate/round
1183
+ let adjustment_factor = 10_i128.pow((current_scale - scale) as u32);
1184
+ let adjusted_fractional = fractional_value / adjustment_factor;
1185
+ integer_value * scale_factor + adjusted_fractional
1186
+ };
1187
+
1188
+ if is_negative {
1189
+ value = -value;
1190
+ }
1191
+
1192
+ Ok(ParquetValue::Decimal128(value, scale))
1193
+ }
1194
+
1195
+ fn parse_decimal256(&self, s: &str, _precision: u8, scale: i8) -> Result<ParquetValue> {
1196
+ // Parse decimal string to BigInt
1197
+ use num::{BigInt, Zero};
1198
+
1199
+ let clean = s.trim();
1200
+
1201
+ // Handle scientific notation by converting to regular decimal format
1202
+ let normalized = if clean.to_lowercase().contains('e') {
1203
+ // Parse as f64 first to handle scientific notation
1204
+ let f: f64 = clean.parse().map_err(|e| {
1205
+ ParquetError::Conversion(format!("Failed to parse scientific notation: {}", e))
1206
+ })?;
1207
+ // Convert to string with enough precision
1208
+ format!("{:.15}", f)
1209
+ .trim_end_matches('0')
1210
+ .trim_end_matches('.')
1211
+ .to_string()
1212
+ } else {
1213
+ clean.to_string()
1214
+ };
1215
+
1216
+ let is_negative = normalized.starts_with('-');
1217
+ let clean_abs = normalized.trim_start_matches('-').trim_start_matches('+');
1218
+
1219
+ let parts: Vec<&str> = clean_abs.split('.').collect();
1220
+
1221
+ if parts.len() > 2 {
1222
+ return Err(ParquetError::Conversion(
1223
+ "Invalid decimal format".to_string(),
1224
+ ));
1225
+ }
1226
+
1227
+ let integer_part = if parts.is_empty() || parts[0].is_empty() {
1228
+ "0"
1229
+ } else {
1230
+ parts[0]
1231
+ };
1232
+ let fractional_part = if parts.len() == 2 { parts[1] } else { "" };
1233
+
1234
+ // Calculate the actual value considering the scale
1235
+ let current_scale = fractional_part.len() as i8;
1236
+
1237
+ if scale < 0 {
1238
+ return Err(ParquetError::Conversion(
1239
+ "Negative scale not supported".to_string(),
1240
+ ));
1241
+ }
1242
+
1243
+ // Parse integer and fractional parts
1244
+ let integer_value: BigInt = integer_part.parse().map_err(|e| {
1245
+ ParquetError::Conversion(format!("Failed to parse integer part: {}", e))
1246
+ })?;
1247
+
1248
+ let fractional_value: BigInt = if fractional_part.is_empty() {
1249
+ BigInt::zero()
1250
+ } else {
1251
+ fractional_part.parse().map_err(|e| {
1252
+ ParquetError::Conversion(format!("Failed to parse fractional part: {}", e))
1253
+ })?
1254
+ };
1255
+
1256
+ // Calculate the final value based on scale
1257
+ let scale_factor = BigInt::from(10).pow(scale as u32);
1258
+ let current_scale_factor = BigInt::from(10).pow(current_scale as u32);
1259
+
1260
+ let mut value = if current_scale <= scale {
1261
+ // Current scale is less than or equal to target scale - pad with zeros
1262
+ integer_value * &scale_factor + fractional_value * (scale_factor / current_scale_factor)
1263
+ } else {
1264
+ // Current scale is greater than target scale - need to truncate/round
1265
+ let adjustment_factor = BigInt::from(10).pow((current_scale - scale) as u32);
1266
+ let adjusted_fractional = fractional_value / adjustment_factor;
1267
+ integer_value * &scale_factor + adjusted_fractional
1268
+ };
1269
+
1270
+ if is_negative {
1271
+ value = -value;
1272
+ }
1273
+
1274
+ Ok(ParquetValue::Decimal256(value, scale))
1275
+ }
1276
+
1277
+ /// Convert a Ruby array to a ParquetValue::List
1278
+ fn convert_to_list(
1279
+ &mut self,
1280
+ value: Value,
1281
+ item_schema: &parquet_core::SchemaNode,
1282
+ ) -> Result<ParquetValue> {
1283
+ if value.is_nil() {
1284
+ return Ok(ParquetValue::Null);
1285
+ }
1286
+
1287
+ let array: RArray = TryConvert::try_convert(value).map_err(|e: MagnusError| {
1288
+ ParquetError::Conversion(format!("Expected Array for List type: {}", e))
1289
+ })?;
1290
+
1291
+ let mut list = Vec::with_capacity(array.len());
1292
+ for item in array.into_iter() {
1293
+ list.push(self.convert_with_schema_hint(item, item_schema)?);
1294
+ }
1295
+
1296
+ Ok(ParquetValue::List(list))
1297
+ }
1298
+
1299
+ /// Convert a Ruby hash to a ParquetValue::Map
1300
+ fn convert_to_map(
1301
+ &mut self,
1302
+ value: Value,
1303
+ key_schema: &parquet_core::SchemaNode,
1304
+ value_schema: &parquet_core::SchemaNode,
1305
+ ) -> Result<ParquetValue> {
1306
+ if value.is_nil() {
1307
+ return Ok(ParquetValue::Null);
1308
+ }
1309
+
1310
+ let hash: RHash = TryConvert::try_convert(value).map_err(|e: MagnusError| {
1311
+ ParquetError::Conversion(format!("Expected Hash for Map type: {}", e))
1312
+ })?;
1313
+
1314
+ // Collect key-value pairs first
1315
+ let mut kv_pairs = Vec::new();
1316
+ hash.foreach(|k: Value, v: Value| {
1317
+ kv_pairs.push((k, v));
1318
+ Ok(ForEach::Continue)
1319
+ })
1320
+ .map_err(|e: MagnusError| ParquetError::Conversion(e.to_string()))?;
1321
+
1322
+ // Now convert them with mutable self
1323
+ let mut map = Vec::new();
1324
+ for (k, v) in kv_pairs {
1325
+ let key = self.convert_with_schema_hint(k, key_schema)?;
1326
+ let val = self.convert_with_schema_hint(v, value_schema)?;
1327
+ map.push((key, val));
1328
+ }
1329
+
1330
+ Ok(ParquetValue::Map(map))
1331
+ }
1332
+
1333
+ /// Convert a Ruby hash to a ParquetValue::Record (struct)
1334
+ fn convert_to_struct(
1335
+ &mut self,
1336
+ value: Value,
1337
+ fields: &[parquet_core::SchemaNode],
1338
+ ) -> Result<ParquetValue> {
1339
+ if value.is_nil() {
1340
+ return Ok(ParquetValue::Null);
1341
+ }
1342
+
1343
+ let hash: RHash = TryConvert::try_convert(value).map_err(|e: MagnusError| {
1344
+ ParquetError::Conversion(format!("Expected Hash for Struct type: {}", e))
1345
+ })?;
1346
+
1347
+ let mut record = IndexMap::new();
1348
+
1349
+ let ruby = Ruby::get()
1350
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
1351
+ for field in fields {
1352
+ let field_name = field.name();
1353
+ let ruby_key = ruby.to_symbol(field_name);
1354
+
1355
+ // Try symbol key first, then string key
1356
+ let field_value = if let Some(val) = hash.get(ruby_key) {
1357
+ val
1358
+ } else if let Some(val) = hash.get(field_name) {
1359
+ val
1360
+ } else {
1361
+ // Field not found, use null
1362
+ ruby.qnil().as_value()
1363
+ };
1364
+
1365
+ let converted = self.convert_with_schema_hint(field_value, field)?;
1366
+ record.insert(field_name.into(), converted);
1367
+ }
1368
+
1369
+ Ok(ParquetValue::Record(record))
1370
+ }
1371
+ }
1372
+
1373
+ // Helper functions for one-off conversions where we don't need string caching
1374
+
1375
+ pub fn ruby_to_parquet(value: Value) -> Result<ParquetValue> {
1376
+ let mut converter = RubyValueConverter::new();
1377
+ converter.infer_and_convert(value)
1378
+ }
1379
+
1380
+ pub fn parquet_to_ruby(value: ParquetValue, string_storage: &mut StringStorage) -> Result<Value> {
1381
+ let ruby = Ruby::get()
1382
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
1383
+
1384
+ match value {
1385
+ ParquetValue::Null => Ok(ruby.qnil().as_value()),
1386
+ ParquetValue::Boolean(b) => Ok(b.into_value_with(&ruby)),
1387
+ ParquetValue::Int8(i) => Ok((i as i64).into_value_with(&ruby)),
1388
+ ParquetValue::Int16(i) => Ok((i as i64).into_value_with(&ruby)),
1389
+ ParquetValue::Int32(i) => Ok((i as i64).into_value_with(&ruby)),
1390
+ ParquetValue::Int64(i) => Ok(i.into_value_with(&ruby)),
1391
+ ParquetValue::UInt8(i) => Ok((i as u64).into_value_with(&ruby)),
1392
+ ParquetValue::UInt16(i) => Ok((i as u64).into_value_with(&ruby)),
1393
+ ParquetValue::UInt32(i) => Ok((i as u64).into_value_with(&ruby)),
1394
+ ParquetValue::UInt64(i) => Ok(i.into_value_with(&ruby)),
1395
+ ParquetValue::Float16(OrderedFloat(f)) => {
1396
+ let cleaned = {
1397
+ // Fast-path the specials.
1398
+ if f.is_nan() || f.is_infinite() {
1399
+ f as f64
1400
+ } else if f == 0.0 {
1401
+ // Keep the IEEE-754 sign bit for −0.0.
1402
+ if f.is_sign_negative() {
1403
+ -0.0
1404
+ } else {
1405
+ 0.0
1406
+ }
1407
+ } else {
1408
+ // `to_string` gives the shortest exact, round-trippable decimal.
1409
+ // Parsing it back to `f64` cannot fail
1410
+ f.to_string().parse::<f64>()?
1411
+ }
1412
+ };
1413
+ Ok(cleaned.into_value_with(&ruby))
1414
+ }
1415
+ ParquetValue::Float32(OrderedFloat(f)) => {
1416
+ let cleaned = {
1417
+ // Fast-path the specials.
1418
+ if f.is_nan() || f.is_infinite() {
1419
+ f as f64
1420
+ } else if f == 0.0 {
1421
+ // Keep the IEEE-754 sign bit for −0.0.
1422
+ if f.is_sign_negative() {
1423
+ -0.0
1424
+ } else {
1425
+ 0.0
1426
+ }
1427
+ } else {
1428
+ // `to_string` gives the shortest exact, round-trippable decimal.
1429
+ // Parsing it back to `f64` cannot fail
1430
+ f.to_string().parse::<f64>()?
1431
+ }
1432
+ };
1433
+ Ok(cleaned.into_value_with(&ruby))
1434
+ }
1435
+ ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
1436
+ ParquetValue::String(s) => Ok(string_storage.ruby_string(&ruby, &s)),
1437
+ ParquetValue::Uuid(u) => Ok(u
1438
+ .hyphenated()
1439
+ .encode_lower(&mut Uuid::encode_buffer())
1440
+ .into_value_with(&ruby)),
1441
+ ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
1442
+ ParquetValue::Date32(days) => {
1443
+ // Convert days since epoch to Date object
1444
+ let _ = ruby.require("date");
1445
+ let kernel = ruby.module_kernel();
1446
+ let date_class = kernel
1447
+ .const_get::<_, Value>("Date")
1448
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1449
+ let secs = days as i64 * 86400;
1450
+ let time_class = ruby.class_time();
1451
+ let time = time_class
1452
+ .funcall::<_, _, Value>("at", (secs,))
1453
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?
1454
+ .funcall::<_, _, Value>("utc", ())
1455
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1456
+ let year: i32 = time
1457
+ .funcall("year", ())
1458
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1459
+ let month: i32 = time
1460
+ .funcall("month", ())
1461
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1462
+ let day: i32 = time
1463
+ .funcall("day", ())
1464
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1465
+ date_class
1466
+ .funcall("new", (year, month, day))
1467
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1468
+ }
1469
+ ParquetValue::Date64(millis) => {
1470
+ // Convert millis to Time object
1471
+ let time_class = ruby.class_time();
1472
+ let secs = millis / 1000;
1473
+ let nsec = (millis % 1000) * 1_000_000;
1474
+ time_class
1475
+ .funcall("at", (secs, nsec))
1476
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1477
+ }
1478
+ ParquetValue::TimeMillis(millis) => {
1479
+ // Convert to Time object for today with given time
1480
+ let time_class = ruby.class_time();
1481
+ let hours = millis / (3600 * 1000);
1482
+ let minutes = (millis % (3600 * 1000)) / (60 * 1000);
1483
+ let seconds = (millis % (60 * 1000)) / 1000;
1484
+ let ms = millis % 1000;
1485
+
1486
+ let now: Value = time_class
1487
+ .funcall("now", ())
1488
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1489
+ let year: i32 = now
1490
+ .funcall("year", ())
1491
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1492
+ let month: i32 = now
1493
+ .funcall("month", ())
1494
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1495
+ let day: i32 = now
1496
+ .funcall("day", ())
1497
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1498
+
1499
+ time_class
1500
+ .funcall(
1501
+ "utc",
1502
+ (year, month, day, hours, minutes, seconds, ms * 1000),
1503
+ )
1504
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1505
+ }
1506
+ ParquetValue::TimeMicros(micros) => {
1507
+ // Similar to TimeMillis but with microsecond precision
1508
+ let time_class = ruby.class_time();
1509
+ let hours = micros / (3600 * 1_000_000);
1510
+ let minutes = (micros % (3600 * 1_000_000)) / (60 * 1_000_000);
1511
+ let seconds = (micros % (60 * 1_000_000)) / 1_000_000;
1512
+ let us = micros % 1_000_000;
1513
+
1514
+ let now: Value = time_class
1515
+ .funcall("now", ())
1516
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1517
+ let year: i32 = now
1518
+ .funcall("year", ())
1519
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1520
+ let month: i32 = now
1521
+ .funcall("month", ())
1522
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1523
+ let day: i32 = now
1524
+ .funcall("day", ())
1525
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1526
+
1527
+ time_class
1528
+ .funcall("utc", (year, month, day, hours, minutes, seconds, us))
1529
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1530
+ }
1531
+ ParquetValue::TimeNanos(nanos) => {
1532
+ let time_class = ruby.class_time();
1533
+ let secs = nanos / 1_000_000_000;
1534
+ let nsec = nanos % 1_000_000_000;
1535
+ time_class
1536
+ .funcall(
1537
+ "at",
1538
+ (
1539
+ secs,
1540
+ nsec,
1541
+ ruby.to_symbol("nanosecond"),
1542
+ kwargs!("in" => "UTC"),
1543
+ ),
1544
+ )
1545
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1546
+ }
1547
+ ParquetValue::TimestampSecond(secs, tz) => {
1548
+ let time_class = ruby.class_time();
1549
+ let time = time_class
1550
+ .funcall::<_, _, Value>("at", (secs, kwargs!("in" => "UTC")))
1551
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1552
+ apply_timezone(time, &tz)
1553
+ }
1554
+ ParquetValue::TimestampMillis(millis, tz) => {
1555
+ let time_class = ruby.class_time();
1556
+ let secs = millis / 1000;
1557
+ let usec = (millis % 1000) * 1000; // Convert millisecond remainder to microseconds
1558
+ let time = time_class
1559
+ .funcall::<_, _, Value>("at", (secs, usec, kwargs!("in" => "UTC")))
1560
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1561
+ apply_timezone(time, &tz)
1562
+ }
1563
+ ParquetValue::TimestampMicros(micros, tz) => {
1564
+ let time_class = ruby.class_time();
1565
+ let secs = micros / 1_000_000;
1566
+ let usec = micros % 1_000_000; // Already in microseconds
1567
+ let time = time_class
1568
+ .funcall::<_, _, Value>("at", (secs, usec, kwargs!("in" => "UTC")))
1569
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1570
+ apply_timezone(time, &tz)
1571
+ }
1572
+ ParquetValue::TimestampNanos(nanos, tz) => {
1573
+ let time_class = ruby.class_time();
1574
+ let secs = nanos / 1_000_000_000;
1575
+ let nsec = nanos % 1_000_000_000;
1576
+ // Use the nanosecond form of Time.at
1577
+ let time = time_class
1578
+ .funcall::<_, _, Value>(
1579
+ "at",
1580
+ (
1581
+ secs,
1582
+ nsec,
1583
+ ruby.to_symbol("nanosecond"),
1584
+ kwargs!("in" => "UTC"),
1585
+ ),
1586
+ )
1587
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1588
+ apply_timezone(time, &tz)
1589
+ }
1590
+ ParquetValue::Decimal128(val, scale) => {
1591
+ // Load BigDecimal if needed
1592
+ let _ = ruby.require("bigdecimal");
1593
+
1594
+ // Format decimal with scale
1595
+ let str_val = format_decimal128(val, scale);
1596
+ let kernel = ruby.module_kernel();
1597
+ kernel
1598
+ .funcall("BigDecimal", (str_val,))
1599
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1600
+ }
1601
+ ParquetValue::Decimal256(val, scale) => {
1602
+ // Load BigDecimal if needed
1603
+ let _ = ruby.require("bigdecimal");
1604
+
1605
+ // Format decimal with scale
1606
+ let str_val = format_decimal256(&val, scale);
1607
+ let kernel = ruby.module_kernel();
1608
+ kernel
1609
+ .funcall("BigDecimal", (str_val,))
1610
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1611
+ }
1612
+ ParquetValue::List(list) => {
1613
+ let array = ruby.ary_new_capa(list.len());
1614
+ for item in list {
1615
+ let ruby_val = parquet_to_ruby(item, string_storage)?;
1616
+ array
1617
+ .push(ruby_val)
1618
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1619
+ }
1620
+ Ok(array.as_value())
1621
+ }
1622
+ ParquetValue::Map(map) => {
1623
+ let hash = ruby.hash_new();
1624
+ for (k, v) in map {
1625
+ let ruby_key = parquet_to_ruby(k, string_storage)?;
1626
+ let ruby_val = parquet_to_ruby(v, string_storage)?;
1627
+ hash.aset(ruby_key, ruby_val)
1628
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1629
+ }
1630
+ Ok(hash.as_value())
1631
+ }
1632
+ ParquetValue::Record(record) => {
1633
+ // Convert Record to Ruby Hash
1634
+ let hash = ruby.hash_new();
1635
+ for (field_name, field_value) in record {
1636
+ let ruby_key = string_storage.ruby_key(&ruby, &field_name);
1637
+ let ruby_val = parquet_to_ruby(field_value, string_storage)?;
1638
+ hash.aset(ruby_key, ruby_val)
1639
+ .map_err(|e| ParquetError::Conversion(e.to_string()))?;
1640
+ }
1641
+ Ok(hash.as_value())
1642
+ }
1643
+ }
1644
+ }
1645
+
1646
+ // Helper functions for decimal formatting
1647
+
1648
/// Renders an unscaled 128-bit decimal as plain (non-scientific) decimal text.
///
/// * `scale == 0`: the integer itself.
/// * `scale > 0`: the last `scale` digits form the fractional part; the value
///   is left-padded with zeros so at least one integer digit is present
///   (`-5` with scale 3 gives `"-0.005"`).
/// * `scale < 0`: the value is multiplied by `10^(-scale)`, i.e. `-scale`
///   zeros are appended (zero stays `"0"`).
///
/// The formatting works on the digit string, so it cannot overflow for
/// `i128::MIN`, for scales above 38, or for `i8::MIN`.
fn format_decimal128(value: i128, scale: i8) -> String {
    if scale == 0 {
        return value.to_string();
    }

    let sign = if value < 0 { "-" } else { "" };
    // `unsigned_abs` is defined for i128::MIN, unlike `abs`.
    let digits = value.unsigned_abs().to_string();

    if scale > 0 {
        let frac_len = scale as usize;
        // Guarantee at least one digit in front of the decimal point.
        let padded = format!("{:0>width$}", digits, width = frac_len + 1);
        let (integer_part, fractional_part) = padded.split_at(padded.len() - frac_len);
        format!("{}{}.{}", sign, integer_part, fractional_part)
    } else {
        if value == 0 {
            return "0".to_string();
        }
        let trailing_zeros = "0".repeat(usize::from(scale.unsigned_abs()));
        format!("{}{}{}", sign, digits, trailing_zeros)
    }
}
1673
+
1674
+ fn format_decimal256(value: &num::BigInt, scale: i8) -> String {
1675
+ use num::{BigInt, Signed};
1676
+
1677
+ if scale == 0 {
1678
+ return value.to_string();
1679
+ }
1680
+
1681
+ let abs_value = value.abs();
1682
+ let sign = if value.is_negative() { "-" } else { "" };
1683
+
1684
+ if scale > 0 {
1685
+ let ten = BigInt::from(10);
1686
+ let divisor = ten.pow(scale as u32);
1687
+ let integer_part = &abs_value / &divisor;
1688
+ let fractional_part = &abs_value % &divisor;
1689
+
1690
+ // Format fractional part with leading zeros
1691
+ let frac_str = fractional_part.to_string();
1692
+ let padding = scale as usize - frac_str.len();
1693
+ let zeros = "0".repeat(padding);
1694
+
1695
+ format!("{}{}.{}{}", sign, integer_part, zeros, frac_str)
1696
+ } else {
1697
+ // Negative scale means multiply by 10^(-scale)
1698
+ let ten = BigInt::from(10);
1699
+ let multiplier = ten.pow((-scale) as u32);
1700
+ format!("{}{}", sign, abs_value * multiplier)
1701
+ }
1702
+ }
1703
+
1704
+ /// Apply timezone when reading timestamp from Parquet file
1705
+ ///
1706
+ /// PARQUET SPEC COMPLIANCE:
1707
+ /// - If schema has ANY timezone -> values are UTC (isAdjustedToUTC = true)
1708
+ /// - If schema has NO timezone -> values are local/unzoned (isAdjustedToUTC = false)
1709
+ ///
1710
+ /// NOTE: The actual timezone string in the schema is irrelevant for reading.
1711
+ /// Whether it's "UTC", "+09:00", or "America/New_York", the stored values
1712
+ /// are ALWAYS UTC-normalized. We return them as UTC Time objects.
1713
+ fn apply_timezone(time: Value, tz: &Option<Arc<str>>) -> Result<Value> {
1714
+ let _ruby = Ruby::get()
1715
+ .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
1716
+
1717
+ match tz {
1718
+ Some(_) => {
1719
+ // ANY timezone = UTC storage (Parquet spec requirement)
1720
+ // Original timezone like "+09:00" is NOT preserved
1721
+ time.funcall("utc", ())
1722
+ .map_err(|e| ParquetError::Conversion(e.to_string()))
1723
+ }
1724
+ None => {
1725
+ // No timezone = local/unzoned timestamp
1726
+ // This is a "wall clock" time without timezone context
1727
+ Ok(time)
1728
+ }
1729
+ }
1730
+ }
1731
+
1732
+ // Note: These wrapper functions are needed because ValueConverter is not thread-safe
1733
+ // due to Ruby's GIL requirements. They are called from Ruby FFI functions where we know
1734
+ // we're in the correct thread context.