parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -1,1275 +0,0 @@
1
- use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};
2
-
3
- use super::record_types::{format_decimal_with_i8_scale, format_i256_decimal_with_scale};
4
- use super::*;
5
- use arrow_array::MapArray;
6
- use magnus::{RArray, RString};
7
-
8
- #[derive(Debug, Clone)]
9
- pub enum ParquetValue {
10
- Int8(i8),
11
- Int16(i16),
12
- Int32(i32),
13
- Int64(i64),
14
- UInt8(u8),
15
- UInt16(u16),
16
- UInt32(u32),
17
- UInt64(u64),
18
- Float16(f32), // f16 converted to f32
19
- Float32(f32),
20
- Float64(f64),
21
- Boolean(bool),
22
- String(String),
23
- Bytes(Vec<u8>),
24
- Date32(i32),
25
- Date64(i64),
26
- Decimal128(i128, i8),
27
- Decimal256(arrow_buffer::i256, i8),
28
- TimestampSecond(i64, Option<Arc<str>>),
29
- TimestampMillis(i64, Option<Arc<str>>),
30
- TimestampMicros(i64, Option<Arc<str>>),
31
- TimestampNanos(i64, Option<Arc<str>>),
32
- TimeMillis(i32), // Time of day in milliseconds since midnight
33
- TimeMicros(i64), // Time of day in microseconds since midnight
34
- List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
35
- // We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
36
- Map(HashMap<ParquetValue, ParquetValue>),
37
- Null,
38
- }
39
-
40
- impl PartialEq for ParquetValue {
41
- fn eq(&self, other: &Self) -> bool {
42
- match (self, other) {
43
- (ParquetValue::Int8(a), ParquetValue::Int8(b)) => a == b,
44
- (ParquetValue::Int16(a), ParquetValue::Int16(b)) => a == b,
45
- (ParquetValue::Int32(a), ParquetValue::Int32(b)) => a == b,
46
- (ParquetValue::Int64(a), ParquetValue::Int64(b)) => a == b,
47
- (ParquetValue::UInt8(a), ParquetValue::UInt8(b)) => a == b,
48
- (ParquetValue::UInt16(a), ParquetValue::UInt16(b)) => a == b,
49
- (ParquetValue::UInt32(a), ParquetValue::UInt32(b)) => a == b,
50
- (ParquetValue::UInt64(a), ParquetValue::UInt64(b)) => a == b,
51
- (ParquetValue::Float16(a), ParquetValue::Float16(b)) => a == b,
52
- (ParquetValue::Float32(a), ParquetValue::Float32(b)) => a == b,
53
- (ParquetValue::Float64(a), ParquetValue::Float64(b)) => a == b,
54
- (ParquetValue::Boolean(a), ParquetValue::Boolean(b)) => a == b,
55
- (ParquetValue::String(a), ParquetValue::String(b)) => a == b,
56
- (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
57
- (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
58
- (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
59
- (ParquetValue::Decimal128(a, scale_a), ParquetValue::Decimal128(b, scale_b)) => {
60
- if scale_a == scale_b {
61
- // Same scale, compare directly
62
- a == b
63
- } else {
64
- // Different scales, need to adjust for proper comparison
65
- let mut a_val = *a;
66
- let mut b_val = *b;
67
-
68
- // Adjust to the same scale for proper comparison
69
- if scale_a < scale_b {
70
- // Scale up a to match b's scale
71
- let scale_diff = (*scale_b - *scale_a) as u32;
72
- if scale_diff <= 38 {
73
- // Limit to avoid overflow
74
- a_val *= 10_i128.pow(scale_diff);
75
- } else {
76
- // For large scale differences, use BigInt for the comparison
77
- let a_big = num::BigInt::from(*a)
78
- * num::BigInt::from(10_i128.pow(scale_diff.min(38)));
79
- let b_big = num::BigInt::from(*b);
80
- return a_big == b_big;
81
- }
82
- } else {
83
- // Scale up b to match a's scale
84
- let scale_diff = (*scale_a - *scale_b) as u32;
85
- if scale_diff <= 38 {
86
- // Limit to avoid overflow
87
- b_val *= 10_i128.pow(scale_diff);
88
- } else {
89
- // For large scale differences, use BigInt for the comparison
90
- let a_big = num::BigInt::from(*a);
91
- let b_big = num::BigInt::from(*b)
92
- * num::BigInt::from(10_i128.pow(scale_diff.min(38)));
93
- return a_big == b_big;
94
- }
95
- }
96
-
97
- a_val == b_val
98
- }
99
- }
100
- (ParquetValue::Decimal256(a, scale_a), ParquetValue::Decimal256(b, scale_b)) => {
101
- if scale_a == scale_b {
102
- // Same scale, compare directly
103
- a == b
104
- } else {
105
- // TODO: Implement decimal256 comparison
106
- todo!("decimal256 comparison");
107
- }
108
- }
109
- (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
110
- (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
111
- (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
112
- (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
113
- (ParquetValue::TimeMillis(a), ParquetValue::TimeMillis(b)) => a == b,
114
- (ParquetValue::TimeMicros(a), ParquetValue::TimeMicros(b)) => a == b,
115
- (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
116
- (ParquetValue::Null, ParquetValue::Null) => true,
117
- _ => false,
118
- }
119
- }
120
- }
121
-
122
- impl Eq for ParquetValue {}
123
-
124
- impl std::hash::Hash for ParquetValue {
125
- fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
126
- match self {
127
- ParquetValue::Int8(i) => i.hash(state),
128
- ParquetValue::Int16(i) => i.hash(state),
129
- ParquetValue::Int32(i) => i.hash(state),
130
- ParquetValue::Int64(i) => i.hash(state),
131
- ParquetValue::UInt8(i) => i.hash(state),
132
- ParquetValue::UInt16(i) => i.hash(state),
133
- ParquetValue::UInt32(i) => i.hash(state),
134
- ParquetValue::UInt64(i) => i.hash(state),
135
- ParquetValue::Float16(f) => f.to_bits().hash(state),
136
- ParquetValue::Float32(f) => f.to_bits().hash(state),
137
- ParquetValue::Float64(f) => f.to_bits().hash(state),
138
- ParquetValue::Boolean(b) => b.hash(state),
139
- ParquetValue::String(s) => s.hash(state),
140
- ParquetValue::Bytes(b) => b.hash(state),
141
- ParquetValue::Date32(d) => d.hash(state),
142
- ParquetValue::Date64(d) => d.hash(state),
143
- ParquetValue::Decimal128(d, scale) => {
144
- d.hash(state);
145
- scale.hash(state);
146
- }
147
- ParquetValue::Decimal256(d, scale) => {
148
- d.hash(state);
149
- scale.hash(state);
150
- }
151
- ParquetValue::TimestampSecond(ts, tz) => {
152
- ts.hash(state);
153
- tz.hash(state);
154
- }
155
- ParquetValue::TimestampMillis(ts, tz) => {
156
- ts.hash(state);
157
- tz.hash(state);
158
- }
159
- ParquetValue::TimestampMicros(ts, tz) => {
160
- ts.hash(state);
161
- tz.hash(state);
162
- }
163
- ParquetValue::TimestampNanos(ts, tz) => {
164
- ts.hash(state);
165
- tz.hash(state);
166
- }
167
- ParquetValue::TimeMillis(t) => t.hash(state),
168
- ParquetValue::TimeMicros(t) => t.hash(state),
169
- ParquetValue::List(l) => l.hash(state),
170
- ParquetValue::Map(m) => {
171
- for (k, v) in m {
172
- k.hash(state);
173
- v.hash(state);
174
- }
175
- }
176
- ParquetValue::Null => 0_i32.hash(state),
177
- }
178
- }
179
- }
180
-
181
- impl TryIntoValue for ParquetValue {
182
- fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
183
- match self {
184
- ParquetValue::Int8(i) => Ok(i.into_value_with(handle)),
185
- ParquetValue::Int16(i) => Ok(i.into_value_with(handle)),
186
- ParquetValue::Int32(i) => Ok(i.into_value_with(handle)),
187
- ParquetValue::Int64(i) => Ok(i.into_value_with(handle)),
188
- ParquetValue::UInt8(i) => Ok(i.into_value_with(handle)),
189
- ParquetValue::UInt16(i) => Ok(i.into_value_with(handle)),
190
- ParquetValue::UInt32(i) => Ok(i.into_value_with(handle)),
191
- ParquetValue::UInt64(i) => Ok(i.into_value_with(handle)),
192
- ParquetValue::Float16(f) => Ok(f.into_value_with(handle)),
193
- ParquetValue::Float32(f) => Ok(f.into_value_with(handle)),
194
- ParquetValue::Float64(f) => Ok(f.into_value_with(handle)),
195
- ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
196
- ParquetValue::String(s) => Ok(s.into_value_with(handle)),
197
- ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
198
- ParquetValue::Decimal128(d, scale) => {
199
- // Load the bigdecimal gem if it's not already loaded
200
- LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
201
-
202
- // Format with proper scaling based on the sign of scale
203
- let value = format_decimal_with_i8_scale(d, scale);
204
-
205
- let kernel = handle.module_kernel();
206
- Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
207
- }
208
- ParquetValue::Decimal256(d, scale) => {
209
- // Load the bigdecimal gem if it's not already loaded
210
- LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
211
-
212
- // Format with proper scaling based on the sign of scale
213
- // Use specialized function to preserve full precision
214
- let value = format_i256_decimal_with_scale(d, scale)?;
215
-
216
- let kernel = handle.module_kernel();
217
- Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
218
- }
219
- ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
220
- ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
221
- timestamp @ ParquetValue::TimestampSecond(_, _) => {
222
- impl_timestamp_conversion!(timestamp, TimestampSecond, handle)
223
- }
224
- timestamp @ ParquetValue::TimestampMillis(_, _) => {
225
- impl_timestamp_conversion!(timestamp, TimestampMillis, handle)
226
- }
227
- timestamp @ ParquetValue::TimestampMicros(_, _) => {
228
- impl_timestamp_conversion!(timestamp, TimestampMicros, handle)
229
- }
230
- timestamp @ ParquetValue::TimestampNanos(_, _) => {
231
- impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
232
- }
233
- ParquetValue::TimeMillis(millis) => {
234
- // Convert time of day in milliseconds to a Ruby Time object
235
- // Use epoch date (1970-01-01) with the given time
236
- let total_seconds = millis / 1000;
237
- let ms = millis % 1000;
238
- let hours = total_seconds / 3600;
239
- let minutes = (total_seconds % 3600) / 60;
240
- let seconds = total_seconds % 60;
241
-
242
- // Create a Time object for 1970-01-01 with the given time
243
- let time_class = handle.class_time();
244
- let time = time_class.funcall::<_, _, Value>(
245
- "new",
246
- (1970, 1, 1, hours, minutes, seconds, ms * 1000), // Ruby expects microseconds
247
- )?;
248
- Ok(time.into_value_with(handle))
249
- }
250
- ParquetValue::TimeMicros(micros) => {
251
- // Convert time of day in microseconds to a Ruby Time object
252
- // Use epoch date (1970-01-01) with the given time
253
- let total_seconds = micros / 1_000_000;
254
- let us = micros % 1_000_000;
255
- let hours = total_seconds / 3600;
256
- let minutes = (total_seconds % 3600) / 60;
257
- let seconds = total_seconds % 60;
258
-
259
- // Create a Time object for 1970-01-01 with the given time
260
- let time_class = handle.class_time();
261
- let time = time_class
262
- .funcall::<_, _, Value>("new", (1970, 1, 1, hours, minutes, seconds, us))?;
263
- Ok(time.into_value_with(handle))
264
- }
265
- ParquetValue::List(l) => {
266
- // For lists, convert to Ruby array and check for specific cases
267
- // when we might need to return nil instead of an empty array
268
-
269
- // Normal case - convert list elements to a Ruby array
270
- let ary = handle.ary_new_capa(l.len());
271
- l.into_iter().try_for_each(|v| {
272
- ary.push(v.try_into_value_with(handle)?)?;
273
- Ok::<_, ParquetGemError>(())
274
- })?;
275
-
276
- // The complex_types test expects double_list to be nil when empty,
277
- // but it needs the context which we don't have directly.
278
- // We'll let List stay as an empty array, and in each_row.rs it can
279
- // be handled there with field name context.
280
- Ok(ary.into_value_with(handle))
281
- }
282
- ParquetValue::Map(m) => {
283
- #[cfg(ruby_lt_3_2)]
284
- let hash = handle.hash_new_capa(m.len());
285
-
286
- #[cfg(not(ruby_lt_3_2))]
287
- let hash = handle.hash_new();
288
-
289
- m.into_iter().try_for_each(|(k, v)| {
290
- hash.aset(
291
- k.try_into_value_with(handle)?,
292
- v.try_into_value_with(handle)?,
293
- )?;
294
- Ok::<_, ParquetGemError>(())
295
- })?;
296
- Ok(hash.into_value_with(handle))
297
- }
298
- ParquetValue::Null => Ok(handle.qnil().as_value()),
299
- }
300
- }
301
- }
302
-
303
- impl ParquetValue {
304
- pub fn from_value(
305
- ruby: &Ruby,
306
- value: Value,
307
- type_: &ParquetSchemaType,
308
- format: Option<&str>,
309
- ) -> Result<Self, MagnusError> {
310
- if value.is_nil() {
311
- return Ok(ParquetValue::Null);
312
- }
313
-
314
- match type_ {
315
- ParquetSchemaType::Primitive(primative) => match primative {
316
- PrimitiveType::Int8 => {
317
- let v = NumericConverter::<i8>::convert_with_string_fallback(ruby, value)?;
318
- Ok(ParquetValue::Int8(v))
319
- }
320
- PrimitiveType::Int16 => {
321
- let v = NumericConverter::<i16>::convert_with_string_fallback(ruby, value)?;
322
- Ok(ParquetValue::Int16(v))
323
- }
324
- PrimitiveType::Int32 => {
325
- let v = NumericConverter::<i32>::convert_with_string_fallback(ruby, value)?;
326
- Ok(ParquetValue::Int32(v))
327
- }
328
- PrimitiveType::Int64 => {
329
- let v = NumericConverter::<i64>::convert_with_string_fallback(ruby, value)?;
330
- Ok(ParquetValue::Int64(v))
331
- }
332
- PrimitiveType::UInt8 => {
333
- let v = NumericConverter::<u8>::convert_with_string_fallback(ruby, value)?;
334
- Ok(ParquetValue::UInt8(v))
335
- }
336
- PrimitiveType::UInt16 => {
337
- let v = NumericConverter::<u16>::convert_with_string_fallback(ruby, value)?;
338
- Ok(ParquetValue::UInt16(v))
339
- }
340
- PrimitiveType::UInt32 => {
341
- let v = NumericConverter::<u32>::convert_with_string_fallback(ruby, value)?;
342
- Ok(ParquetValue::UInt32(v))
343
- }
344
- PrimitiveType::UInt64 => {
345
- let v = NumericConverter::<u64>::convert_with_string_fallback(ruby, value)?;
346
- Ok(ParquetValue::UInt64(v))
347
- }
348
- PrimitiveType::Float32 => {
349
- let v = NumericConverter::<f32>::convert_with_string_fallback(ruby, value)?;
350
- Ok(ParquetValue::Float32(v))
351
- }
352
- PrimitiveType::Float64 => {
353
- let v = NumericConverter::<f64>::convert_with_string_fallback(ruby, value)?;
354
- Ok(ParquetValue::Float64(v))
355
- }
356
- PrimitiveType::Decimal128(_precision, scale) => {
357
- if value.is_kind_of(ruby.class_string()) {
358
- convert_to_decimal(value, *scale)
359
- } else if let Ok(s) = value.funcall::<_, _, RString>("to_s", ()) {
360
- convert_to_decimal(s.as_value(), *scale)
361
- } else {
362
- Err(MagnusError::new(
363
- magnus::exception::type_error(),
364
- "Expected a string for a decimal type",
365
- ))
366
- }
367
- }
368
- PrimitiveType::Decimal256(_precision, scale) => {
369
- if value.is_kind_of(ruby.class_string()) {
370
- convert_to_decimal(value, *scale)
371
- } else if let Ok(s) = value.funcall::<_, _, RString>("to_s", ()) {
372
- convert_to_decimal(s.as_value(), *scale)
373
- } else {
374
- Err(MagnusError::new(
375
- magnus::exception::type_error(),
376
- "Expected a string for a decimal type",
377
- ))
378
- }
379
- }
380
- PrimitiveType::String => {
381
- let v = convert_to_string(value)?;
382
- Ok(ParquetValue::String(v))
383
- }
384
- PrimitiveType::Binary => {
385
- let v = convert_to_binary(value)?;
386
- Ok(ParquetValue::Bytes(v))
387
- }
388
- PrimitiveType::Boolean => {
389
- let v = convert_to_boolean(ruby, value)?;
390
- Ok(ParquetValue::Boolean(v))
391
- }
392
- PrimitiveType::Date32 => {
393
- let v = convert_to_date32(ruby, value, format)?;
394
- Ok(ParquetValue::Date32(v))
395
- }
396
- PrimitiveType::TimestampMillis => {
397
- if value.is_kind_of(ruby.class_time()) {
398
- use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
399
- let (v, tz) = ruby_time_to_timestamp_with_tz(value, "millis")?;
400
- Ok(ParquetValue::TimestampMillis(v, tz))
401
- } else {
402
- let v = convert_to_timestamp_millis(ruby, value, format)?;
403
- Ok(ParquetValue::TimestampMillis(v, None))
404
- }
405
- }
406
- PrimitiveType::TimestampMicros => {
407
- if value.is_kind_of(ruby.class_time()) {
408
- use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
409
- let (v, tz) = ruby_time_to_timestamp_with_tz(value, "micros")?;
410
- Ok(ParquetValue::TimestampMicros(v, tz))
411
- } else {
412
- let v = convert_to_timestamp_micros(ruby, value, format)?;
413
- Ok(ParquetValue::TimestampMicros(v, None))
414
- }
415
- }
416
- PrimitiveType::TimeMillis => {
417
- let v = convert_to_time_millis(ruby, value, format)?;
418
- Ok(ParquetValue::TimeMillis(v))
419
- }
420
- PrimitiveType::TimeMicros => {
421
- let v = convert_to_time_micros(ruby, value, format)?;
422
- Ok(ParquetValue::TimeMicros(v))
423
- }
424
- },
425
- ParquetSchemaType::List(list_field) => {
426
- // We expect the Ruby object to be an Array, each item converting
427
- // to the item_type. We gather them into ParquetValue::List(...)
428
- let array = RArray::from_value(value).ok_or_else(|| {
429
- // Just get a simple string representation of the class
430
- let type_info = format!("{:?}", value.class());
431
-
432
- MagnusError::new(
433
- magnus::exception::type_error(),
434
- format!(
435
- "Value must be an Array for a list type, got {} instead",
436
- type_info
437
- ),
438
- )
439
- })?;
440
- let mut items = Vec::with_capacity(array.len());
441
- for (index, item_val) in array.into_iter().enumerate() {
442
- match ParquetValue::from_value(
443
- ruby,
444
- item_val,
445
- &list_field.item_type,
446
- list_field.format,
447
- ) {
448
- Ok(child_val) => items.push(child_val),
449
- Err(e) => {
450
- // Enhance the error with the item index
451
- return Err(MagnusError::new(
452
- magnus::exception::type_error(),
453
- format!("Failed to convert item at index {} of list: {}", index, e),
454
- ));
455
- }
456
- }
457
- }
458
- Ok(ParquetValue::List(items))
459
- }
460
- ParquetSchemaType::Map(map_field) => {
461
- // We expect the Ruby object to be a Hash
462
- let hash_pairs: Vec<(Value, Value)> = value.funcall("to_a", ())?;
463
- let mut result = HashMap::with_capacity(hash_pairs.len());
464
- for (k, v) in hash_pairs {
465
- let key_val = ParquetValue::from_value(
466
- ruby,
467
- k,
468
- &map_field.key_type,
469
- map_field.key_format,
470
- )?;
471
- let val_val = ParquetValue::from_value(
472
- ruby,
473
- v,
474
- &map_field.value_type,
475
- map_field.value_format,
476
- )?;
477
- result.insert(key_val, val_val);
478
- }
479
- Ok(ParquetValue::Map(result))
480
- }
481
- ParquetSchemaType::Struct(struct_field) => {
482
- // We expect a Ruby hash or object that responds to to_h
483
- let hash_obj = if value.respond_to("to_h", false)? {
484
- value.funcall::<_, _, Value>("to_h", ())?
485
- } else {
486
- return Err(MagnusError::new(
487
- magnus::exception::type_error(),
488
- "Value must be a Hash or respond to to_h for a struct type",
489
- ));
490
- };
491
-
492
- let mut result = HashMap::new();
493
-
494
- // For each field in the struct definition, try to find a matching key in the hash
495
- for field in &struct_field.fields {
496
- let field_name = ParquetValue::String(field.name.clone());
497
- let ruby_field_name = ruby.str_new(&field.name).as_value();
498
-
499
- // Try to get the field value using Ruby's [] method
500
- let field_value_obj =
501
- hash_obj.funcall::<_, _, Value>("[]", (ruby_field_name,))?;
502
-
503
- let field_value = if field_value_obj.is_nil() {
504
- ParquetValue::Null // Field not provided or nil, treat as null
505
- } else {
506
- ParquetValue::from_value(
507
- ruby,
508
- field_value_obj,
509
- &field.type_,
510
- field.format.as_deref(),
511
- )?
512
- };
513
-
514
- result.insert(field_name, field_value);
515
- }
516
-
517
- // Use Map to represent a struct since it's a collection of named values
518
- Ok(ParquetValue::Map(result))
519
- }
520
- }
521
- }
522
- }
523
-
524
- enum ParsedDecimal {
525
- Int128(i128),
526
- Int256(arrow_buffer::i256),
527
- }
528
-
529
- /// Unified helper to parse a decimal string and apply scaling
530
- fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<ParsedDecimal, MagnusError> {
531
- let s = input_str.trim();
532
-
533
- // 1. Handle scientific notation case (e.g., "0.12345e3")
534
- if let Some(e_pos) = s.to_lowercase().find('e') {
535
- let base = &s[0..e_pos];
536
- let exp = &s[e_pos + 1..];
537
-
538
- // Parse the exponent with detailed error message
539
- let exp_val = exp.parse::<i32>().map_err(|e| {
540
- MagnusError::new(
541
- magnus::exception::type_error(),
542
- format!(
543
- "Failed to parse exponent '{}' in decimal string '{}': {}",
544
- exp, s, e
545
- ),
546
- )
547
- })?;
548
-
549
- // For very large exponents, we'll need to use BigInt
550
- if exp_val.abs() > 38 {
551
- return parse_large_decimal_with_bigint(s, input_scale);
552
- }
553
-
554
- // Handle the base part which might contain a decimal point
555
- let (base_val, base_scale) = if let Some(decimal_pos) = base.find('.') {
556
- let mut base_without_point = base.to_string();
557
- base_without_point.remove(decimal_pos);
558
-
559
- let base_scale = base.len() - decimal_pos - 1;
560
-
561
- // Try to parse as i128 first
562
- match base_without_point.parse::<i128>() {
563
- Ok(v) => (v, base_scale as i32),
564
- Err(_) => {
565
- // Value too large for i128, use BigInt
566
- return parse_large_decimal_with_bigint(s, input_scale);
567
- }
568
- }
569
- } else {
570
- // No decimal point in base
571
- match base.parse::<i128>() {
572
- Ok(v) => (v, 0),
573
- Err(_) => {
574
- // Value too large for i128, use BigInt
575
- return parse_large_decimal_with_bigint(s, input_scale);
576
- }
577
- }
578
- };
579
-
580
- // Calculate the effective scale: base_scale - exp_val
581
- let effective_scale = base_scale - exp_val;
582
-
583
- // Adjust the value based on the difference between effective scale and requested scale
584
- match effective_scale.cmp(&(input_scale as i32)) {
585
- std::cmp::Ordering::Less => {
586
- // Need to multiply to increase scale
587
- let scale_diff = (input_scale as i32 - effective_scale) as u32;
588
- if scale_diff > 38 {
589
- return parse_large_decimal_with_bigint(s, input_scale);
590
- }
591
-
592
- // Check for overflow
593
- match base_val.checked_mul(10_i128.pow(scale_diff)) {
594
- Some(v) => Ok(ParsedDecimal::Int128(v)),
595
- None => parse_large_decimal_with_bigint(s, input_scale),
596
- }
597
- }
598
- std::cmp::Ordering::Greater => {
599
- // Need to divide to decrease scale
600
- let scale_diff = (effective_scale - input_scale as i32) as u32;
601
- if scale_diff > 38 {
602
- return Err(MagnusError::new(
603
- magnus::exception::range_error(),
604
- format!("Scale adjustment too large ({}) for decimal value '{}'. Consider using a larger scale.", scale_diff, s),
605
- ));
606
- }
607
- Ok(ParsedDecimal::Int128(base_val / 10_i128.pow(scale_diff)))
608
- }
609
- std::cmp::Ordering::Equal => Ok(ParsedDecimal::Int128(base_val)),
610
- }
611
- }
612
- // 2. Handle decimal point in the string (e.g., "123.456")
613
- else if let Some(decimal_pos) = s.find('.') {
614
- let mut s_without_point = s.to_string();
615
- s_without_point.remove(decimal_pos);
616
-
617
- // Calculate the actual scale from the decimal position
618
- let actual_scale = s.len() - decimal_pos - 1;
619
-
620
- // Try to parse as i128 first
621
- let v = match s_without_point.parse::<i128>() {
622
- Ok(v) => v,
623
- Err(_) => {
624
- // Value too large for i128, use BigInt
625
- return parse_large_decimal_with_bigint(s, input_scale);
626
- }
627
- };
628
-
629
- // Scale the value if needed based on the difference between
630
- // the actual scale and the requested scale
631
- match actual_scale.cmp(&(input_scale as usize)) {
632
- std::cmp::Ordering::Less => {
633
- // Need to multiply to increase scale
634
- let scale_diff = (input_scale - actual_scale as i8) as u32;
635
- if scale_diff > 38 {
636
- return parse_large_decimal_with_bigint(s, input_scale);
637
- }
638
-
639
- // Check for overflow
640
- match v.checked_mul(10_i128.pow(scale_diff)) {
641
- Some(v) => Ok(ParsedDecimal::Int128(v)),
642
- None => parse_large_decimal_with_bigint(s, input_scale),
643
- }
644
- }
645
- std::cmp::Ordering::Greater => {
646
- // Need to divide to decrease scale
647
- let scale_diff = (actual_scale as i8 - input_scale) as u32;
648
- if scale_diff > 38 {
649
- return Err(MagnusError::new(
650
- magnus::exception::range_error(),
651
- format!("Scale adjustment too large ({}) for decimal value '{}'. Consider using a larger scale.", scale_diff, s),
652
- ));
653
- }
654
- Ok(ParsedDecimal::Int128(v / 10_i128.pow(scale_diff)))
655
- }
656
- std::cmp::Ordering::Equal => Ok(ParsedDecimal::Int128(v)),
657
- }
658
- }
659
- // 3. Plain integer value (e.g., "12345")
660
- else {
661
- // No decimal point, try to parse as i128 first
662
- let v = match s.parse::<i128>() {
663
- Ok(v) => v,
664
- Err(_) => {
665
- // Value too large for i128, use BigInt
666
- return parse_large_decimal_with_bigint(s, input_scale);
667
- }
668
- };
669
-
670
- // Apply scale - make sure it's reasonable
671
- if input_scale > 38 {
672
- return parse_large_decimal_with_bigint(s, input_scale);
673
- } else if input_scale < -38 {
674
- return Err(MagnusError::new(
675
- magnus::exception::range_error(),
676
- format!(
677
- "Scale {} is too small for decimal value '{}'. Must be ≥ -38.",
678
- input_scale, s
679
- ),
680
- ));
681
- }
682
-
683
- // Apply positive scale (multiply)
684
- if input_scale >= 0 {
685
- match v.checked_mul(10_i128.pow(input_scale as u32)) {
686
- Some(v) => Ok(ParsedDecimal::Int128(v)),
687
- None => parse_large_decimal_with_bigint(s, input_scale),
688
- }
689
- } else {
690
- // Apply negative scale (divide)
691
- Ok(ParsedDecimal::Int128(
692
- v / 10_i128.pow((-input_scale) as u32),
693
- ))
694
- }
695
- }
696
- }
697
-
698
- /// Parse large decimal values using BigInt when they would overflow i128
699
- fn parse_large_decimal_with_bigint(s: &str, input_scale: i8) -> Result<ParsedDecimal, MagnusError> {
700
- use num::BigInt;
701
- use std::str::FromStr;
702
-
703
- // Parse the input string as a BigInt
704
- let bigint = if let Some(e_pos) = s.to_lowercase().find('e') {
705
- // Handle scientific notation
706
- let base = &s[0..e_pos];
707
- let exp = &s[e_pos + 1..];
708
-
709
- let exp_val = exp.parse::<i32>().map_err(|e| {
710
- MagnusError::new(
711
- magnus::exception::type_error(),
712
- format!("Failed to parse exponent '{}': {}", exp, e),
713
- )
714
- })?;
715
-
716
- // Parse base as BigInt
717
- let base_bigint = if let Some(decimal_pos) = base.find('.') {
718
- let mut base_without_point = base.to_string();
719
- base_without_point.remove(decimal_pos);
720
- let base_scale = base.len() - decimal_pos - 1;
721
-
722
- let bigint = BigInt::from_str(&base_without_point).map_err(|e| {
723
- MagnusError::new(
724
- magnus::exception::type_error(),
725
- format!("Failed to parse decimal base '{}': {}", base, e),
726
- )
727
- })?;
728
-
729
- // Adjust for the decimal point
730
- let effective_exp = exp_val - base_scale as i32;
731
-
732
- if effective_exp > 0 {
733
- bigint * BigInt::from(10).pow(effective_exp as u32)
734
- } else if effective_exp < 0 {
735
- bigint / BigInt::from(10).pow((-effective_exp) as u32)
736
- } else {
737
- bigint
738
- }
739
- } else {
740
- let bigint = BigInt::from_str(base).map_err(|e| {
741
- MagnusError::new(
742
- magnus::exception::type_error(),
743
- format!("Failed to parse decimal base '{}': {}", base, e),
744
- )
745
- })?;
746
-
747
- if exp_val > 0 {
748
- bigint * BigInt::from(10).pow(exp_val as u32)
749
- } else if exp_val < 0 {
750
- bigint / BigInt::from(10).pow((-exp_val) as u32)
751
- } else {
752
- bigint
753
- }
754
- };
755
-
756
- base_bigint
757
- } else if let Some(decimal_pos) = s.find('.') {
758
- // Handle decimal point
759
- let mut s_without_point = s.to_string();
760
- s_without_point.remove(decimal_pos);
761
-
762
- let actual_scale = s.len() - decimal_pos - 1;
763
- let bigint = BigInt::from_str(&s_without_point).map_err(|e| {
764
- MagnusError::new(
765
- magnus::exception::type_error(),
766
- format!("Failed to parse decimal string '{}': {}", s, e),
767
- )
768
- })?;
769
-
770
- // Adjust for scale difference
771
- let scale_diff = actual_scale as i8 - input_scale;
772
-
773
- if scale_diff > 0 {
774
- bigint / BigInt::from(10).pow(scale_diff as u32)
775
- } else if scale_diff < 0 {
776
- bigint * BigInt::from(10).pow((-scale_diff) as u32)
777
- } else {
778
- bigint
779
- }
780
- } else {
781
- // Plain integer
782
- let bigint = BigInt::from_str(s).map_err(|e| {
783
- MagnusError::new(
784
- magnus::exception::type_error(),
785
- format!("Failed to parse integer string '{}': {}", s, e),
786
- )
787
- })?;
788
-
789
- if input_scale > 0 {
790
- bigint * BigInt::from(10).pow(input_scale as u32)
791
- } else if input_scale < 0 {
792
- bigint / BigInt::from(10).pow((-input_scale) as u32)
793
- } else {
794
- bigint
795
- }
796
- };
797
-
798
- // Convert BigInt to bytes and then to i256
799
- let bytes = bigint.to_signed_bytes_le();
800
-
801
- if bytes.len() <= 16 {
802
- // Fits in i128
803
- let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
804
- [0xff; 16]
805
- } else {
806
- [0; 16]
807
- };
808
- buf[..bytes.len()].copy_from_slice(&bytes);
809
-
810
- Ok(ParsedDecimal::Int128(i128::from_le_bytes(buf)))
811
- } else if bytes.len() <= 32 {
812
- // Fits in i256
813
- let mut buf = if bigint.sign() == num::bigint::Sign::Minus {
814
- [0xff; 32]
815
- } else {
816
- [0; 32]
817
- };
818
- buf[..bytes.len()].copy_from_slice(&bytes);
819
-
820
- Ok(ParsedDecimal::Int256(arrow_buffer::i256::from_le_bytes(
821
- buf,
822
- )))
823
- } else {
824
- Err(MagnusError::new(
825
- magnus::exception::range_error(),
826
- format!("Decimal value '{}' is too large to fit in 256 bits", s),
827
- ))
828
- }
829
- }
830
-
831
- fn convert_to_decimal(value: Value, scale: i8) -> Result<ParquetValue, MagnusError> {
832
- // Get the decimal string based on the type of value
833
- let s = if unsafe { value.classname() } == "BigDecimal" {
834
- value
835
- .funcall::<_, _, RString>("to_s", ("F",))?
836
- .to_string()?
837
- } else {
838
- value.to_r_string()?.to_string()?
839
- };
840
-
841
- // Use our unified parser to convert the string to a decimal value with scaling
842
- match parse_decimal_string(&s, scale) {
843
- Ok(decimal_value) => match decimal_value {
844
- ParsedDecimal::Int128(v) => Ok(ParquetValue::Decimal128(v, scale)),
845
- ParsedDecimal::Int256(v) => Ok(ParquetValue::Decimal256(v, scale)),
846
- },
847
- Err(e) => Err(MagnusError::new(
848
- magnus::exception::type_error(),
849
- format!(
850
- "Failed to convert '{}' to decimal with scale {}: {}",
851
- s, scale, e
852
- ),
853
- )),
854
- }
855
- }
856
-
857
- #[derive(Debug)]
858
- pub struct ParquetValueVec(Vec<ParquetValue>);
859
-
860
- impl ParquetValueVec {
861
- pub fn into_inner(self) -> Vec<ParquetValue> {
862
- self.0
863
- }
864
- }
865
-
866
- impl IntoIterator for ParquetValueVec {
867
- type Item = ParquetValue;
868
- type IntoIter = std::vec::IntoIter<ParquetValue>;
869
-
870
- fn into_iter(self) -> Self::IntoIter {
871
- self.0.into_iter()
872
- }
873
- }
874
-
875
- impl std::cmp::PartialEq for ParquetValueVec {
876
- fn eq(&self, other: &Self) -> bool {
877
- self.0 == other.0
878
- }
879
- }
880
-
881
- impl std::cmp::Eq for ParquetValueVec {}
882
-
883
- macro_rules! impl_numeric_array_conversion {
884
- ($column:expr, $array_type:ty, $variant:ident) => {{
885
- let array = downcast_array::<$array_type>($column);
886
- Ok(ParquetValueVec(if array.is_nullable() {
887
- array
888
- .values()
889
- .iter()
890
- .enumerate()
891
- .map(|(i, x)| {
892
- if array.is_null(i) {
893
- ParquetValue::Null
894
- } else {
895
- ParquetValue::$variant(*x)
896
- }
897
- })
898
- .collect()
899
- } else {
900
- array
901
- .values()
902
- .iter()
903
- .map(|x| ParquetValue::$variant(*x))
904
- .collect()
905
- }))
906
- }};
907
- }
908
- macro_rules! impl_boolean_array_conversion {
909
- ($column:expr, $array_type:ty, $variant:ident) => {{
910
- let array = downcast_array::<$array_type>($column);
911
- Ok(ParquetValueVec(if array.is_nullable() {
912
- array
913
- .values()
914
- .iter()
915
- .enumerate()
916
- .map(|(i, x)| {
917
- if array.is_null(i) {
918
- ParquetValue::Null
919
- } else {
920
- ParquetValue::$variant(x)
921
- }
922
- })
923
- .collect()
924
- } else {
925
- array
926
- .values()
927
- .iter()
928
- .map(|x| ParquetValue::$variant(x))
929
- .collect()
930
- }))
931
- }};
932
- }
933
-
934
- pub struct ArrayWrapper<'a> {
935
- pub array: &'a dyn Array,
936
- pub strict: bool,
937
- }
938
-
939
- impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
940
- type Error = ParquetGemError;
941
-
942
- fn try_from(column: ArrayWrapper<'a>) -> Result<Self, Self::Error> {
943
- match column.array.data_type() {
944
- DataType::Boolean => {
945
- impl_boolean_array_conversion!(column.array, BooleanArray, Boolean)
946
- }
947
- DataType::Int8 => impl_numeric_array_conversion!(column.array, Int8Array, Int8),
948
- DataType::Int16 => impl_numeric_array_conversion!(column.array, Int16Array, Int16),
949
- DataType::Int32 => impl_numeric_array_conversion!(column.array, Int32Array, Int32),
950
- DataType::Int64 => impl_numeric_array_conversion!(column.array, Int64Array, Int64),
951
- DataType::UInt8 => impl_numeric_array_conversion!(column.array, UInt8Array, UInt8),
952
- DataType::UInt16 => impl_numeric_array_conversion!(column.array, UInt16Array, UInt16),
953
- DataType::UInt32 => impl_numeric_array_conversion!(column.array, UInt32Array, UInt32),
954
- DataType::UInt64 => impl_numeric_array_conversion!(column.array, UInt64Array, UInt64),
955
- DataType::Float32 => {
956
- impl_numeric_array_conversion!(column.array, Float32Array, Float32)
957
- }
958
- DataType::Float64 => {
959
- impl_numeric_array_conversion!(column.array, Float64Array, Float64)
960
- }
961
- DataType::Date32 => impl_numeric_array_conversion!(column.array, Date32Array, Date32),
962
- DataType::Date64 => impl_numeric_array_conversion!(column.array, Date64Array, Date64),
963
- DataType::Decimal128(_precision, scale) => {
964
- let array = downcast_array::<Decimal128Array>(column.array);
965
- Ok(ParquetValueVec(if array.is_nullable() {
966
- array
967
- .values()
968
- .iter()
969
- .enumerate()
970
- .map(|(i, x)| {
971
- if array.is_null(i) {
972
- ParquetValue::Null
973
- } else {
974
- ParquetValue::Decimal128(*x, *scale)
975
- }
976
- })
977
- .collect()
978
- } else {
979
- array
980
- .values()
981
- .iter()
982
- .map(|x| ParquetValue::Decimal128(*x, *scale))
983
- .collect()
984
- }))
985
- }
986
- DataType::Decimal256(_precision, scale) => {
987
- let array = downcast_array::<Decimal256Array>(column.array);
988
- Ok(ParquetValueVec(if array.is_nullable() {
989
- array
990
- .values()
991
- .iter()
992
- .enumerate()
993
- .map(|(i, x)| {
994
- if array.is_null(i) {
995
- ParquetValue::Null
996
- } else {
997
- ParquetValue::Decimal256(*x, *scale)
998
- }
999
- })
1000
- .collect()
1001
- } else {
1002
- array
1003
- .values()
1004
- .iter()
1005
- .map(|x| ParquetValue::Decimal256(*x, *scale))
1006
- .collect()
1007
- }))
1008
- }
1009
- DataType::Timestamp(TimeUnit::Second, tz) => {
1010
- impl_timestamp_array_conversion!(
1011
- column.array,
1012
- TimestampSecondArray,
1013
- TimestampSecond,
1014
- tz
1015
- )
1016
- }
1017
- DataType::Timestamp(TimeUnit::Millisecond, tz) => {
1018
- impl_timestamp_array_conversion!(
1019
- column.array,
1020
- TimestampMillisecondArray,
1021
- TimestampMillis,
1022
- tz
1023
- )
1024
- }
1025
- DataType::Timestamp(TimeUnit::Microsecond, tz) => {
1026
- impl_timestamp_array_conversion!(
1027
- column.array,
1028
- TimestampMicrosecondArray,
1029
- TimestampMicros,
1030
- tz
1031
- )
1032
- }
1033
- DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
1034
- impl_timestamp_array_conversion!(
1035
- column.array,
1036
- TimestampNanosecondArray,
1037
- TimestampNanos,
1038
- tz
1039
- )
1040
- }
1041
- DataType::Time32(TimeUnit::Millisecond) => {
1042
- let array = downcast_array::<Time32MillisecondArray>(column.array);
1043
- Ok(ParquetValueVec(if array.is_nullable() {
1044
- array
1045
- .values()
1046
- .iter()
1047
- .enumerate()
1048
- .map(|(i, x)| {
1049
- if array.is_null(i) {
1050
- ParquetValue::Null
1051
- } else {
1052
- ParquetValue::TimeMillis(*x)
1053
- }
1054
- })
1055
- .collect()
1056
- } else {
1057
- array
1058
- .values()
1059
- .iter()
1060
- .map(|x| ParquetValue::TimeMillis(*x))
1061
- .collect()
1062
- }))
1063
- }
1064
- DataType::Time64(TimeUnit::Microsecond) => {
1065
- let array = downcast_array::<Time64MicrosecondArray>(column.array);
1066
- Ok(ParquetValueVec(if array.is_nullable() {
1067
- array
1068
- .values()
1069
- .iter()
1070
- .enumerate()
1071
- .map(|(i, x)| {
1072
- if array.is_null(i) {
1073
- ParquetValue::Null
1074
- } else {
1075
- ParquetValue::TimeMicros(*x)
1076
- }
1077
- })
1078
- .collect()
1079
- } else {
1080
- array
1081
- .values()
1082
- .iter()
1083
- .map(|x| ParquetValue::TimeMicros(*x))
1084
- .collect()
1085
- }))
1086
- }
1087
- DataType::Float16 => {
1088
- let array = downcast_array::<Float16Array>(column.array);
1089
- if array.is_nullable() {
1090
- Ok(ParquetValueVec(
1091
- array
1092
- .values()
1093
- .iter()
1094
- .enumerate()
1095
- .map(|(i, x)| {
1096
- if array.is_null(i) {
1097
- ParquetValue::Null
1098
- } else {
1099
- ParquetValue::Float16(f32::from(*x))
1100
- }
1101
- })
1102
- .collect(),
1103
- ))
1104
- } else {
1105
- Ok(ParquetValueVec(
1106
- array
1107
- .values()
1108
- .iter()
1109
- .map(|x| ParquetValue::Float16(f32::from(*x)))
1110
- .collect(),
1111
- ))
1112
- }
1113
- }
1114
- DataType::Utf8 => {
1115
- let array = downcast_array::<StringArray>(column.array);
1116
- let mut tmp_vec = Vec::with_capacity(array.len());
1117
- let iter = array.iter().map(|opt_x| match opt_x {
1118
- Some(x) => {
1119
- if column.strict {
1120
- Ok::<_, ParquetGemError>(ParquetValue::String(
1121
- simdutf8::basic::from_utf8(x.as_bytes())?.to_string(),
1122
- ))
1123
- } else {
1124
- Ok::<_, ParquetGemError>(ParquetValue::String(x.to_string()))
1125
- }
1126
- }
1127
- None => Ok(ParquetValue::Null),
1128
- });
1129
- for x in iter {
1130
- tmp_vec.push(x?);
1131
- }
1132
- Ok(ParquetValueVec(tmp_vec))
1133
- }
1134
- DataType::Binary => {
1135
- let array = downcast_array::<BinaryArray>(column.array);
1136
- Ok(ParquetValueVec(
1137
- array
1138
- .iter()
1139
- .map(|opt_x| match opt_x {
1140
- Some(x) => ParquetValue::Bytes(x.to_vec()),
1141
- None => ParquetValue::Null,
1142
- })
1143
- .collect(),
1144
- ))
1145
- }
1146
- DataType::List(_field) => {
1147
- let list_array = downcast_array::<ListArray>(column.array);
1148
- let sub_list = list_array
1149
- .iter()
1150
- .map(|x| match x {
1151
- Some(values) => match ParquetValueVec::try_from(ArrayWrapper {
1152
- array: &*values,
1153
- strict: column.strict,
1154
- }) {
1155
- Ok(vec) => Ok(ParquetValue::List(vec.into_inner())),
1156
- Err(e) => Err(MagnusError::new(
1157
- magnus::exception::type_error(),
1158
- format!("Error converting list array to ParquetValueVec: {}", e),
1159
- ))?,
1160
- },
1161
- None => Ok(ParquetValue::Null),
1162
- })
1163
- .collect::<Result<Vec<ParquetValue>, Self::Error>>()?;
1164
- Ok(ParquetValueVec(sub_list))
1165
- }
1166
- DataType::Struct(_) => {
1167
- let struct_array = downcast_array::<StructArray>(column.array);
1168
- let mut values = Vec::with_capacity(struct_array.len());
1169
- for i in 0..struct_array.len() {
1170
- if struct_array.is_null(i) {
1171
- values.push(ParquetValue::Null);
1172
- continue;
1173
- }
1174
-
1175
- let mut map = std::collections::HashMap::new();
1176
- for (field_idx, field) in struct_array.fields().iter().enumerate() {
1177
- let c = struct_array.column(field_idx);
1178
- let field_values = match ParquetValueVec::try_from(ArrayWrapper {
1179
- array: &*c.slice(i, 1),
1180
- strict: column.strict,
1181
- }) {
1182
- Ok(vec) => vec.into_inner(),
1183
- Err(e) => {
1184
- return Err(MagnusError::new(
1185
- magnus::exception::type_error(),
1186
- format!(
1187
- "Error converting struct field to ParquetValueVec: {}",
1188
- e
1189
- ),
1190
- ))?;
1191
- }
1192
- };
1193
- map.insert(
1194
- ParquetValue::String(field.name().to_string()),
1195
- field_values.into_iter().next().ok_or_else(|| {
1196
- MagnusError::new(
1197
- magnus::exception::type_error(),
1198
- "Expected a single value for struct field".to_string(),
1199
- )
1200
- })?,
1201
- );
1202
- }
1203
- values.push(ParquetValue::Map(map));
1204
- }
1205
- Ok(ParquetValueVec(values))
1206
- }
1207
- DataType::Map(_field, _keys_sorted) => {
1208
- let map_array = downcast_array::<MapArray>(column.array);
1209
-
1210
- let mut result = Vec::with_capacity(map_array.len());
1211
-
1212
- let offsets = map_array.offsets();
1213
- let struct_array = map_array.entries();
1214
-
1215
- for i in 0..map_array.len() {
1216
- if map_array.is_null(i) {
1217
- result.push(ParquetValue::Null);
1218
- continue;
1219
- }
1220
-
1221
- let start = offsets[i] as usize;
1222
- let end = offsets[i + 1] as usize;
1223
-
1224
- let mut map_data =
1225
- HashMap::with_capacity_and_hasher(end - start, Default::default());
1226
-
1227
- // In Arrow's MapArray, the entries are a struct with fields named "keys" and "values"
1228
- // Get the columns directly by index since we know the structure
1229
- let key_array = struct_array.column(0); // First field is always keys
1230
- let val_array = struct_array.column(1); // Second field is always values
1231
-
1232
- for entry_index in start..end {
1233
- let key_value = if key_array.is_null(entry_index) {
1234
- ParquetValue::Null
1235
- } else {
1236
- let subarray = key_array.slice(entry_index, 1);
1237
- let subwrapper = ArrayWrapper {
1238
- array: &*subarray,
1239
- strict: column.strict,
1240
- };
1241
- let mut converted = ParquetValueVec::try_from(subwrapper)?.0;
1242
- converted.pop().unwrap_or(ParquetValue::Null)
1243
- };
1244
-
1245
- let val_value = if val_array.is_null(entry_index) {
1246
- ParquetValue::Null
1247
- } else {
1248
- let subarray = val_array.slice(entry_index, 1);
1249
- let subwrapper = ArrayWrapper {
1250
- array: &*subarray,
1251
- strict: column.strict,
1252
- };
1253
- let mut converted = ParquetValueVec::try_from(subwrapper)?.0;
1254
- converted.pop().unwrap_or(ParquetValue::Null)
1255
- };
1256
-
1257
- map_data.insert(key_value, val_value);
1258
- }
1259
-
1260
- result.push(ParquetValue::Map(map_data));
1261
- }
1262
-
1263
- Ok(ParquetValueVec(result))
1264
- }
1265
- DataType::Null => {
1266
- let x = downcast_array::<NullArray>(column.array);
1267
- Ok(ParquetValueVec(vec![ParquetValue::Null; x.len()]))
1268
- }
1269
- _ => Err(MagnusError::new(
1270
- magnus::exception::type_error(),
1271
- format!("Unsupported data type: {:?}", column.array.data_type()),
1272
- ))?,
1273
- }
1274
- }
1275
- }