parquet 0.5.11 → 0.5.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +8 -8
- data/README.md +1 -0
- data/ext/parquet/Cargo.toml +5 -5
- data/ext/parquet/src/types/core_types.rs +2 -0
- data/ext/parquet/src/types/mod.rs +3 -3
- data/ext/parquet/src/types/parquet_value.rs +108 -4
- data/ext/parquet/src/types/record_types.rs +4 -2
- data/ext/parquet/src/types/schema_node.rs +8 -0
- data/ext/parquet/src/types/timestamp.rs +222 -25
- data/ext/parquet/src/types/type_conversion.rs +204 -0
- data/ext/parquet/src/types/writer_types.rs +6 -1
- data/ext/parquet/src/writer/mod.rs +10 -0
- data/ext/parquet/src/writer/write_columns.rs +6 -0
- data/ext/parquet/src/writer/write_rows.rs +5 -3
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5f1b43212634fda95b699a725acd82761c5ab5fe4a51473c3bc1dc5eb9bf2226
|
4
|
+
data.tar.gz: 6dfdc2027a957673ad5591dc293624f138e1ef5ef6772ad06ef31b24ca8f360d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dcdbfdeb1be352af9aa1356f6d1a13070df84410d6f7b7eaa744d5b3dd62ba18004ee189be23566fc0b3a39e5c6c3548505a6783d65cdd69ff677f4cd2206d29
|
7
|
+
data.tar.gz: a9d4cad07d8d91edc48a9a7268ad53a24ffd9a1b83f0265c2f8194a73fed097d6c6f74398707f3c1f026978892a9f84273ae68bd619f41653341dbb39ef880ee
|
data/Cargo.lock
CHANGED
@@ -64,7 +64,7 @@ dependencies = [
|
|
64
64
|
[[package]]
|
65
65
|
name = "arrow-array"
|
66
66
|
version = "55.1.0"
|
67
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
67
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
68
68
|
dependencies = [
|
69
69
|
"ahash",
|
70
70
|
"arrow-buffer",
|
@@ -79,7 +79,7 @@ dependencies = [
|
|
79
79
|
[[package]]
|
80
80
|
name = "arrow-buffer"
|
81
81
|
version = "55.1.0"
|
82
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
82
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
83
83
|
dependencies = [
|
84
84
|
"bytes",
|
85
85
|
"half",
|
@@ -89,7 +89,7 @@ dependencies = [
|
|
89
89
|
[[package]]
|
90
90
|
name = "arrow-cast"
|
91
91
|
version = "55.1.0"
|
92
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
92
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
93
93
|
dependencies = [
|
94
94
|
"arrow-array",
|
95
95
|
"arrow-buffer",
|
@@ -108,7 +108,7 @@ dependencies = [
|
|
108
108
|
[[package]]
|
109
109
|
name = "arrow-data"
|
110
110
|
version = "55.1.0"
|
111
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
111
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
112
112
|
dependencies = [
|
113
113
|
"arrow-buffer",
|
114
114
|
"arrow-schema",
|
@@ -119,7 +119,7 @@ dependencies = [
|
|
119
119
|
[[package]]
|
120
120
|
name = "arrow-ipc"
|
121
121
|
version = "55.1.0"
|
122
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
122
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
123
123
|
dependencies = [
|
124
124
|
"arrow-array",
|
125
125
|
"arrow-buffer",
|
@@ -132,12 +132,12 @@ dependencies = [
|
|
132
132
|
[[package]]
|
133
133
|
name = "arrow-schema"
|
134
134
|
version = "55.1.0"
|
135
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
135
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
136
136
|
|
137
137
|
[[package]]
|
138
138
|
name = "arrow-select"
|
139
139
|
version = "55.1.0"
|
140
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
140
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
141
141
|
dependencies = [
|
142
142
|
"ahash",
|
143
143
|
"arrow-array",
|
@@ -867,7 +867,7 @@ dependencies = [
|
|
867
867
|
[[package]]
|
868
868
|
name = "parquet"
|
869
869
|
version = "55.1.0"
|
870
|
-
source = "git+https://github.com/njaremko/arrow-rs?branch=
|
870
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
|
871
871
|
dependencies = [
|
872
872
|
"ahash",
|
873
873
|
"arrow-array",
|
data/README.md
CHANGED
data/ext/parquet/Cargo.toml
CHANGED
@@ -11,16 +11,16 @@ rb-sys-env = "^0.2"
|
|
11
11
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
|
-
arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "
|
15
|
-
arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "
|
16
|
-
arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "
|
17
|
-
arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "
|
14
|
+
arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
|
15
|
+
arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
|
16
|
+
arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["lz4"] }
|
17
|
+
arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
|
18
18
|
bytes = "^1.9"
|
19
19
|
either = "1.9"
|
20
20
|
itertools = "^0.14"
|
21
21
|
jiff = "0.2"
|
22
22
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
23
|
-
parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "
|
23
|
+
parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["json"] }
|
24
24
|
rand = "0.9"
|
25
25
|
rb-sys = "^0.9"
|
26
26
|
simdutf8 = "0.1.5"
|
@@ -25,9 +25,9 @@ use arrow_array::cast::downcast_array;
|
|
25
25
|
use arrow_array::{
|
26
26
|
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
|
27
27
|
Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
|
28
|
-
ListArray, NullArray, StringArray, StructArray,
|
29
|
-
TimestampMillisecondArray, TimestampNanosecondArray,
|
30
|
-
UInt32Array, UInt64Array, UInt8Array,
|
28
|
+
ListArray, NullArray, StringArray, StructArray, Time32MillisecondArray, Time64MicrosecondArray,
|
29
|
+
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
|
30
|
+
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
|
31
31
|
};
|
32
32
|
use arrow_schema::{DataType, TimeUnit};
|
33
33
|
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
|
@@ -29,6 +29,8 @@ pub enum ParquetValue {
|
|
29
29
|
TimestampMillis(i64, Option<Arc<str>>),
|
30
30
|
TimestampMicros(i64, Option<Arc<str>>),
|
31
31
|
TimestampNanos(i64, Option<Arc<str>>),
|
32
|
+
TimeMillis(i32), // Time of day in milliseconds since midnight
|
33
|
+
TimeMicros(i64), // Time of day in microseconds since midnight
|
32
34
|
List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
|
33
35
|
// We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
|
34
36
|
Map(HashMap<ParquetValue, ParquetValue>),
|
@@ -108,6 +110,8 @@ impl PartialEq for ParquetValue {
|
|
108
110
|
(ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
|
109
111
|
(ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
|
110
112
|
(ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
|
113
|
+
(ParquetValue::TimeMillis(a), ParquetValue::TimeMillis(b)) => a == b,
|
114
|
+
(ParquetValue::TimeMicros(a), ParquetValue::TimeMicros(b)) => a == b,
|
111
115
|
(ParquetValue::List(a), ParquetValue::List(b)) => a == b,
|
112
116
|
(ParquetValue::Null, ParquetValue::Null) => true,
|
113
117
|
_ => false,
|
@@ -160,6 +164,8 @@ impl std::hash::Hash for ParquetValue {
|
|
160
164
|
ts.hash(state);
|
161
165
|
tz.hash(state);
|
162
166
|
}
|
167
|
+
ParquetValue::TimeMillis(t) => t.hash(state),
|
168
|
+
ParquetValue::TimeMicros(t) => t.hash(state),
|
163
169
|
ParquetValue::List(l) => l.hash(state),
|
164
170
|
ParquetValue::Map(m) => {
|
165
171
|
for (k, v) in m {
|
@@ -224,6 +230,38 @@ impl TryIntoValue for ParquetValue {
|
|
224
230
|
timestamp @ ParquetValue::TimestampNanos(_, _) => {
|
225
231
|
impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
|
226
232
|
}
|
233
|
+
ParquetValue::TimeMillis(millis) => {
|
234
|
+
// Convert time of day in milliseconds to a Ruby Time object
|
235
|
+
// Use epoch date (1970-01-01) with the given time
|
236
|
+
let total_seconds = millis / 1000;
|
237
|
+
let ms = millis % 1000;
|
238
|
+
let hours = total_seconds / 3600;
|
239
|
+
let minutes = (total_seconds % 3600) / 60;
|
240
|
+
let seconds = total_seconds % 60;
|
241
|
+
|
242
|
+
// Create a Time object for 1970-01-01 with the given time
|
243
|
+
let time_class = handle.class_time();
|
244
|
+
let time = time_class.funcall::<_, _, Value>(
|
245
|
+
"new",
|
246
|
+
(1970, 1, 1, hours, minutes, seconds, ms * 1000), // Ruby expects microseconds
|
247
|
+
)?;
|
248
|
+
Ok(time.into_value_with(handle))
|
249
|
+
}
|
250
|
+
ParquetValue::TimeMicros(micros) => {
|
251
|
+
// Convert time of day in microseconds to a Ruby Time object
|
252
|
+
// Use epoch date (1970-01-01) with the given time
|
253
|
+
let total_seconds = micros / 1_000_000;
|
254
|
+
let us = micros % 1_000_000;
|
255
|
+
let hours = total_seconds / 3600;
|
256
|
+
let minutes = (total_seconds % 3600) / 60;
|
257
|
+
let seconds = total_seconds % 60;
|
258
|
+
|
259
|
+
// Create a Time object for 1970-01-01 with the given time
|
260
|
+
let time_class = handle.class_time();
|
261
|
+
let time = time_class
|
262
|
+
.funcall::<_, _, Value>("new", (1970, 1, 1, hours, minutes, seconds, us))?;
|
263
|
+
Ok(time.into_value_with(handle))
|
264
|
+
}
|
227
265
|
ParquetValue::List(l) => {
|
228
266
|
// For lists, convert to Ruby array and check for specific cases
|
229
267
|
// when we might need to return nil instead of an empty array
|
@@ -356,12 +394,32 @@ impl ParquetValue {
|
|
356
394
|
Ok(ParquetValue::Date32(v))
|
357
395
|
}
|
358
396
|
PrimitiveType::TimestampMillis => {
|
359
|
-
|
360
|
-
|
397
|
+
if value.is_kind_of(ruby.class_time()) {
|
398
|
+
use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
|
399
|
+
let (v, tz) = ruby_time_to_timestamp_with_tz(value, "millis")?;
|
400
|
+
Ok(ParquetValue::TimestampMillis(v, tz))
|
401
|
+
} else {
|
402
|
+
let v = convert_to_timestamp_millis(ruby, value, format)?;
|
403
|
+
Ok(ParquetValue::TimestampMillis(v, None))
|
404
|
+
}
|
361
405
|
}
|
362
406
|
PrimitiveType::TimestampMicros => {
|
363
|
-
|
364
|
-
|
407
|
+
if value.is_kind_of(ruby.class_time()) {
|
408
|
+
use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
|
409
|
+
let (v, tz) = ruby_time_to_timestamp_with_tz(value, "micros")?;
|
410
|
+
Ok(ParquetValue::TimestampMicros(v, tz))
|
411
|
+
} else {
|
412
|
+
let v = convert_to_timestamp_micros(ruby, value, format)?;
|
413
|
+
Ok(ParquetValue::TimestampMicros(v, None))
|
414
|
+
}
|
415
|
+
}
|
416
|
+
PrimitiveType::TimeMillis => {
|
417
|
+
let v = convert_to_time_millis(ruby, value, format)?;
|
418
|
+
Ok(ParquetValue::TimeMillis(v))
|
419
|
+
}
|
420
|
+
PrimitiveType::TimeMicros => {
|
421
|
+
let v = convert_to_time_micros(ruby, value, format)?;
|
422
|
+
Ok(ParquetValue::TimeMicros(v))
|
365
423
|
}
|
366
424
|
},
|
367
425
|
ParquetSchemaType::List(list_field) => {
|
@@ -980,6 +1038,52 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
|
|
980
1038
|
tz
|
981
1039
|
)
|
982
1040
|
}
|
1041
|
+
DataType::Time32(TimeUnit::Millisecond) => {
|
1042
|
+
let array = downcast_array::<Time32MillisecondArray>(column.array);
|
1043
|
+
Ok(ParquetValueVec(if array.is_nullable() {
|
1044
|
+
array
|
1045
|
+
.values()
|
1046
|
+
.iter()
|
1047
|
+
.enumerate()
|
1048
|
+
.map(|(i, x)| {
|
1049
|
+
if array.is_null(i) {
|
1050
|
+
ParquetValue::Null
|
1051
|
+
} else {
|
1052
|
+
ParquetValue::TimeMillis(*x)
|
1053
|
+
}
|
1054
|
+
})
|
1055
|
+
.collect()
|
1056
|
+
} else {
|
1057
|
+
array
|
1058
|
+
.values()
|
1059
|
+
.iter()
|
1060
|
+
.map(|x| ParquetValue::TimeMillis(*x))
|
1061
|
+
.collect()
|
1062
|
+
}))
|
1063
|
+
}
|
1064
|
+
DataType::Time64(TimeUnit::Microsecond) => {
|
1065
|
+
let array = downcast_array::<Time64MicrosecondArray>(column.array);
|
1066
|
+
Ok(ParquetValueVec(if array.is_nullable() {
|
1067
|
+
array
|
1068
|
+
.values()
|
1069
|
+
.iter()
|
1070
|
+
.enumerate()
|
1071
|
+
.map(|(i, x)| {
|
1072
|
+
if array.is_null(i) {
|
1073
|
+
ParquetValue::Null
|
1074
|
+
} else {
|
1075
|
+
ParquetValue::TimeMicros(*x)
|
1076
|
+
}
|
1077
|
+
})
|
1078
|
+
.collect()
|
1079
|
+
} else {
|
1080
|
+
array
|
1081
|
+
.values()
|
1082
|
+
.iter()
|
1083
|
+
.map(|x| ParquetValue::TimeMicros(*x))
|
1084
|
+
.collect()
|
1085
|
+
}))
|
1086
|
+
}
|
983
1087
|
DataType::Float16 => {
|
984
1088
|
let array = downcast_array::<Float16Array>(column.array);
|
985
1089
|
if array.is_nullable() {
|
@@ -1,6 +1,7 @@
|
|
1
1
|
use std::sync::OnceLock;
|
2
2
|
|
3
3
|
use itertools::Itertools;
|
4
|
+
use jiff::ToSpan;
|
4
5
|
use parquet::{
|
5
6
|
basic::{ConvertedType, LogicalType},
|
6
7
|
data_type::AsBytes,
|
@@ -372,8 +373,9 @@ impl TryIntoValue for ParquetField {
|
|
372
373
|
}
|
373
374
|
}
|
374
375
|
Field::Date(d) => {
|
375
|
-
let
|
376
|
-
let
|
376
|
+
let epoch = jiff::civil::Date::new(1970, 1, 1)?;
|
377
|
+
let date = epoch.checked_add(d.days()).map_err(ParquetGemError::Jiff)?;
|
378
|
+
let formatted = date.to_string();
|
377
379
|
Ok(formatted.into_value_with(handle))
|
378
380
|
}
|
379
381
|
Field::TimeMillis(ts) => {
|
@@ -295,6 +295,8 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
|
|
295
295
|
"date" | "date32" => Some(PrimitiveType::Date32),
|
296
296
|
"timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
|
297
297
|
"timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
|
298
|
+
"time_millis" | "time_ms" => Some(PrimitiveType::TimeMillis),
|
299
|
+
"time_micros" | "time_us" => Some(PrimitiveType::TimeMicros),
|
298
300
|
"decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
|
299
301
|
"decimal256" => Some(PrimitiveType::Decimal256(38, 0)), // Maximum precision, scale 0
|
300
302
|
_ => None,
|
@@ -337,6 +339,12 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
|
|
337
339
|
PrimitiveType::TimestampMicros => {
|
338
340
|
ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
|
339
341
|
}
|
342
|
+
PrimitiveType::TimeMillis => {
|
343
|
+
ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond)
|
344
|
+
}
|
345
|
+
PrimitiveType::TimeMicros => {
|
346
|
+
ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond)
|
347
|
+
}
|
340
348
|
};
|
341
349
|
ArrowField::new(name, dt, *nullable)
|
342
350
|
}
|
@@ -1,4 +1,107 @@
|
|
1
1
|
use super::*;
|
2
|
+
use magnus::{TryConvert, Value};
|
3
|
+
|
4
|
+
/// Parses a fixed offset timezone string (e.g., "+09:00", "-05:30", "+0800")
|
5
|
+
/// Returns the offset in minutes from UTC
|
6
|
+
fn parse_fixed_offset(tz: &str) -> Result<i32, ParquetGemError> {
|
7
|
+
// Remove any whitespace
|
8
|
+
let tz = tz.trim();
|
9
|
+
|
10
|
+
// Check if it starts with + or -
|
11
|
+
if !tz.starts_with('+') && !tz.starts_with('-') {
|
12
|
+
return Err(MagnusError::new(
|
13
|
+
magnus::exception::arg_error(),
|
14
|
+
format!(
|
15
|
+
"Invalid timezone offset format: '{}'. Expected format like '+09:00' or '-0530'",
|
16
|
+
tz
|
17
|
+
),
|
18
|
+
))?;
|
19
|
+
}
|
20
|
+
|
21
|
+
let sign = if tz.starts_with('-') { -1 } else { 1 };
|
22
|
+
let offset_str = &tz[1..]; // Remove the sign
|
23
|
+
|
24
|
+
// Parse different formats: "+09:00", "+0900", "+09"
|
25
|
+
let (hours, minutes) = if offset_str.contains(':') {
|
26
|
+
// Format: "+09:00" or "+9:30"
|
27
|
+
let parts: Vec<&str> = offset_str.split(':').collect();
|
28
|
+
if parts.len() != 2 {
|
29
|
+
return Err(MagnusError::new(
|
30
|
+
magnus::exception::arg_error(),
|
31
|
+
format!("Invalid timezone offset format: '{}'. Expected HH:MM", tz),
|
32
|
+
))?;
|
33
|
+
}
|
34
|
+
|
35
|
+
let h = parts[0].parse::<i32>().map_err(|e| {
|
36
|
+
MagnusError::new(
|
37
|
+
magnus::exception::arg_error(),
|
38
|
+
format!("Invalid hour in timezone offset '{}': {}", tz, e),
|
39
|
+
)
|
40
|
+
})?;
|
41
|
+
|
42
|
+
let m = parts[1].parse::<i32>().map_err(|e| {
|
43
|
+
MagnusError::new(
|
44
|
+
magnus::exception::arg_error(),
|
45
|
+
format!("Invalid minute in timezone offset '{}': {}", tz, e),
|
46
|
+
)
|
47
|
+
})?;
|
48
|
+
|
49
|
+
(h, m)
|
50
|
+
} else if offset_str.len() == 4 {
|
51
|
+
// Format: "+0900"
|
52
|
+
let h = offset_str[0..2].parse::<i32>().map_err(|e| {
|
53
|
+
MagnusError::new(
|
54
|
+
magnus::exception::arg_error(),
|
55
|
+
format!("Invalid hour in timezone offset '{}': {}", tz, e),
|
56
|
+
)
|
57
|
+
})?;
|
58
|
+
|
59
|
+
let m = offset_str[2..4].parse::<i32>().map_err(|e| {
|
60
|
+
MagnusError::new(
|
61
|
+
magnus::exception::arg_error(),
|
62
|
+
format!("Invalid minute in timezone offset '{}': {}", tz, e),
|
63
|
+
)
|
64
|
+
})?;
|
65
|
+
|
66
|
+
(h, m)
|
67
|
+
} else if offset_str.len() == 2
|
68
|
+
|| (offset_str.len() == 1 && offset_str.chars().all(|c| c.is_numeric()))
|
69
|
+
{
|
70
|
+
// Format: "+09" or "+9"
|
71
|
+
let h = offset_str.parse::<i32>().map_err(|e| {
|
72
|
+
MagnusError::new(
|
73
|
+
magnus::exception::arg_error(),
|
74
|
+
format!("Invalid hour in timezone offset '{}': {}", tz, e),
|
75
|
+
)
|
76
|
+
})?;
|
77
|
+
(h, 0)
|
78
|
+
} else {
|
79
|
+
return Err(MagnusError::new(
|
80
|
+
magnus::exception::arg_error(),
|
81
|
+
format!("Invalid timezone offset format: '{}'. Expected formats: '+HH:MM', '+HHMM', or '+HH'", tz),
|
82
|
+
))?;
|
83
|
+
};
|
84
|
+
|
85
|
+
// Validate ranges
|
86
|
+
if hours < 0 || hours > 23 {
|
87
|
+
return Err(MagnusError::new(
|
88
|
+
magnus::exception::arg_error(),
|
89
|
+
format!("Invalid hour in timezone offset: {}. Must be 0-23", hours),
|
90
|
+
))?;
|
91
|
+
}
|
92
|
+
|
93
|
+
if minutes < 0 || minutes > 59 {
|
94
|
+
return Err(MagnusError::new(
|
95
|
+
magnus::exception::arg_error(),
|
96
|
+
format!(
|
97
|
+
"Invalid minute in timezone offset: {}. Must be 0-59",
|
98
|
+
minutes
|
99
|
+
),
|
100
|
+
))?;
|
101
|
+
}
|
102
|
+
|
103
|
+
Ok(sign * (hours * 60 + minutes))
|
104
|
+
}
|
2
105
|
|
3
106
|
pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, ParquetGemError> {
|
4
107
|
let (ts, tz) = match value {
|
@@ -18,37 +121,40 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, Pa
|
|
18
121
|
|
19
122
|
// If timezone is provided, convert to zoned timestamp
|
20
123
|
if let Some(tz) = tz {
|
21
|
-
// Handle fixed offset timezones
|
124
|
+
// Handle fixed offset timezones first
|
22
125
|
if tz.starts_with('+') || tz.starts_with('-') {
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
(h, m)
|
29
|
-
} else if tz.len() >= 3 {
|
30
|
-
// Format: "+09" or "-09"
|
31
|
-
let h = tz[1..3].parse::<i32>().unwrap_or(0);
|
32
|
-
(h, 0)
|
33
|
-
} else {
|
34
|
-
(0, 0)
|
35
|
-
};
|
126
|
+
let total_minutes = parse_fixed_offset(tz)?;
|
127
|
+
|
128
|
+
// Create fixed timezone using the parsed offset
|
129
|
+
let offset_hours = total_minutes / 60;
|
130
|
+
let offset_minutes = total_minutes % 60;
|
36
131
|
|
37
|
-
//
|
38
|
-
let
|
39
|
-
|
132
|
+
// jiff expects offset in hours, but we can be more precise
|
133
|
+
let tz = if offset_minutes == 0 {
|
134
|
+
jiff::tz::TimeZone::fixed(jiff::tz::offset(offset_hours as i8))
|
40
135
|
} else {
|
41
|
-
|
136
|
+
// For non-zero minutes, we need to create a custom offset
|
137
|
+
// jiff doesn't directly support minute-precision offsets in the simple API,
|
138
|
+
// so we'll use the timestamp directly with the offset applied
|
139
|
+
return Ok(ts);
|
42
140
|
};
|
43
141
|
|
44
|
-
// Create fixed timezone
|
45
|
-
let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
|
46
142
|
Ok(ts.to_zoned(tz).timestamp())
|
143
|
+
} else if tz.eq_ignore_ascii_case("UTC") || tz.eq_ignore_ascii_case("GMT") {
|
144
|
+
// Common UTC aliases
|
145
|
+
Ok(ts)
|
47
146
|
} else {
|
48
147
|
// Try IANA timezone
|
49
148
|
match ts.in_tz(tz) {
|
50
149
|
Ok(zoned) => Ok(zoned.timestamp()),
|
51
|
-
Err(
|
150
|
+
Err(e) => {
|
151
|
+
// Log the error but don't fail - fall back to UTC
|
152
|
+
eprintln!(
|
153
|
+
"Warning: Failed to parse timezone '{}': {}. Using UTC.",
|
154
|
+
tz, e
|
155
|
+
);
|
156
|
+
Ok(ts)
|
157
|
+
}
|
52
158
|
}
|
53
159
|
}
|
54
160
|
} else {
|
@@ -57,21 +163,112 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, Pa
|
|
57
163
|
}
|
58
164
|
}
|
59
165
|
|
166
|
+
/// Validates and normalizes a timezone string
|
167
|
+
/// Returns the normalized timezone string or None if invalid
|
168
|
+
pub fn validate_timezone(tz: &str) -> Option<String> {
|
169
|
+
let tz = tz.trim();
|
170
|
+
|
171
|
+
// Check for empty timezone
|
172
|
+
if tz.is_empty() {
|
173
|
+
return None;
|
174
|
+
}
|
175
|
+
|
176
|
+
// Fixed offset timezones
|
177
|
+
if tz.starts_with('+') || tz.starts_with('-') {
|
178
|
+
// Validate it can be parsed
|
179
|
+
if parse_fixed_offset(tz).is_ok() {
|
180
|
+
return Some(tz.to_string());
|
181
|
+
}
|
182
|
+
}
|
183
|
+
|
184
|
+
// Common UTC aliases
|
185
|
+
if tz.eq_ignore_ascii_case("UTC")
|
186
|
+
|| tz.eq_ignore_ascii_case("GMT")
|
187
|
+
|| tz.eq_ignore_ascii_case("Z")
|
188
|
+
{
|
189
|
+
return Some("UTC".to_string());
|
190
|
+
}
|
191
|
+
|
192
|
+
// Try to validate as IANA timezone by attempting to use it
|
193
|
+
// This is a bit expensive but ensures we only store valid timezones
|
194
|
+
if let Ok(tz_obj) = jiff::tz::TimeZone::get(tz) {
|
195
|
+
// Use the canonical name from jiff
|
196
|
+
return Some(
|
197
|
+
tz_obj
|
198
|
+
.iana_name()
|
199
|
+
.map(|s| s.to_string())
|
200
|
+
.unwrap_or_else(|| tz.to_string()),
|
201
|
+
);
|
202
|
+
}
|
203
|
+
|
204
|
+
None
|
205
|
+
}
|
206
|
+
|
207
|
+
/// Converts a Ruby Time object to a timestamp with timezone
|
208
|
+
pub fn ruby_time_to_timestamp_with_tz(
|
209
|
+
value: Value,
|
210
|
+
unit: &str,
|
211
|
+
) -> Result<(i64, Option<Arc<str>>), MagnusError> {
|
212
|
+
// Get seconds and microseconds
|
213
|
+
let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
|
214
|
+
let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
|
215
|
+
|
216
|
+
// Get timezone information from Ruby Time object
|
217
|
+
let tz_str = if let Ok(zone) = value.funcall::<_, _, Value>("zone", ()) {
|
218
|
+
if zone.is_nil() {
|
219
|
+
None
|
220
|
+
} else if let Ok(s) = String::try_convert(zone) {
|
221
|
+
validate_timezone(&s).map(|tz| Arc::from(tz.as_str()))
|
222
|
+
} else {
|
223
|
+
None
|
224
|
+
}
|
225
|
+
} else {
|
226
|
+
None
|
227
|
+
};
|
228
|
+
|
229
|
+
// Convert to appropriate unit
|
230
|
+
let timestamp = match unit {
|
231
|
+
"millis" => secs * 1000 + (usecs / 1000),
|
232
|
+
"micros" => secs * 1_000_000 + usecs,
|
233
|
+
"seconds" => secs,
|
234
|
+
"nanos" => secs * 1_000_000_000 + (usecs * 1000),
|
235
|
+
_ => {
|
236
|
+
return Err(MagnusError::new(
|
237
|
+
magnus::exception::arg_error(),
|
238
|
+
format!("Invalid timestamp unit: {}", unit),
|
239
|
+
))
|
240
|
+
}
|
241
|
+
};
|
242
|
+
|
243
|
+
Ok((timestamp, tz_str))
|
244
|
+
}
|
245
|
+
|
60
246
|
// Macro for handling timestamp conversions
|
61
247
|
#[macro_export]
|
62
248
|
macro_rules! impl_timestamp_conversion {
|
63
249
|
($value:expr, $unit:ident, $handle:expr) => {{
|
64
250
|
match $value {
|
65
251
|
ParquetValue::$unit(ts, tz) => {
|
66
|
-
let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz))?;
|
252
|
+
let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz.clone()))?;
|
67
253
|
let time_class = $handle.class_time();
|
68
|
-
|
254
|
+
|
255
|
+
// Convert timestamp to Time object
|
256
|
+
let time_obj = time_class
|
69
257
|
.funcall::<_, _, Value>("parse", (ts.to_string(),))?
|
70
|
-
.into_value_with($handle)
|
258
|
+
.into_value_with($handle);
|
259
|
+
|
260
|
+
// If we have timezone info, we've already handled it in parse_zoned_timestamp
|
261
|
+
// The resulting Time object will be in the correct timezone
|
262
|
+
|
263
|
+
Ok(time_obj)
|
71
264
|
}
|
72
265
|
_ => Err(MagnusError::new(
|
73
266
|
magnus::exception::type_error(),
|
74
|
-
|
267
|
+
format!(
|
268
|
+
"Invalid timestamp type. Expected {}, got {:?}",
|
269
|
+
stringify!($unit),
|
270
|
+
$value
|
271
|
+
),
|
75
272
|
))?,
|
76
273
|
}
|
77
274
|
}};
|
@@ -250,6 +250,8 @@ pub fn parquet_schema_type_to_arrow_data_type(
|
|
250
250
|
PrimitiveType::Date32 => DataType::Date32,
|
251
251
|
PrimitiveType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
|
252
252
|
PrimitiveType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
|
253
|
+
PrimitiveType::TimeMillis => DataType::Time32(TimeUnit::Millisecond),
|
254
|
+
PrimitiveType::TimeMicros => DataType::Time64(TimeUnit::Microsecond),
|
253
255
|
},
|
254
256
|
// For a List<T>, create a standard List in Arrow with nullable items
|
255
257
|
ParquetSchemaType::List(list_field) => {
|
@@ -416,6 +418,12 @@ fn create_arrow_builder_for_type(
|
|
416
418
|
ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => {
|
417
419
|
Ok(Box::new(TimestampMicrosecondBuilder::with_capacity(cap)))
|
418
420
|
}
|
421
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimeMillis) => {
|
422
|
+
Ok(Box::new(Time32MillisecondBuilder::with_capacity(cap)))
|
423
|
+
}
|
424
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimeMicros) => {
|
425
|
+
Ok(Box::new(Time64MicrosecondBuilder::with_capacity(cap)))
|
426
|
+
}
|
419
427
|
ParquetSchemaType::List(list_field) => {
|
420
428
|
// For a list, we create a ListBuilder whose child builder is determined by item_type.
|
421
429
|
// Pass through capacity to ensure consistent sizing
|
@@ -1165,6 +1173,44 @@ fn fill_builder(
|
|
1165
1173
|
}
|
1166
1174
|
Ok(())
|
1167
1175
|
}
|
1176
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimeMillis) => {
|
1177
|
+
let typed_builder = builder
|
1178
|
+
.as_any_mut()
|
1179
|
+
.downcast_mut::<Time32MillisecondBuilder>()
|
1180
|
+
.expect("Builder mismatch: expected Time32MillisecondBuilder");
|
1181
|
+
for val in values {
|
1182
|
+
match val {
|
1183
|
+
ParquetValue::TimeMillis(t) => typed_builder.append_value(*t),
|
1184
|
+
ParquetValue::Null => typed_builder.append_null(),
|
1185
|
+
other => {
|
1186
|
+
return Err(MagnusError::new(
|
1187
|
+
magnus::exception::type_error(),
|
1188
|
+
format!("Expected TimeMillis, got {:?}", other),
|
1189
|
+
))
|
1190
|
+
}
|
1191
|
+
}
|
1192
|
+
}
|
1193
|
+
Ok(())
|
1194
|
+
}
|
1195
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimeMicros) => {
|
1196
|
+
let typed_builder = builder
|
1197
|
+
.as_any_mut()
|
1198
|
+
.downcast_mut::<Time64MicrosecondBuilder>()
|
1199
|
+
.expect("Builder mismatch: expected Time64MicrosecondBuilder");
|
1200
|
+
for val in values {
|
1201
|
+
match val {
|
1202
|
+
ParquetValue::TimeMicros(t) => typed_builder.append_value(*t),
|
1203
|
+
ParquetValue::Null => typed_builder.append_null(),
|
1204
|
+
other => {
|
1205
|
+
return Err(MagnusError::new(
|
1206
|
+
magnus::exception::type_error(),
|
1207
|
+
format!("Expected TimeMicros, got {:?}", other),
|
1208
|
+
))
|
1209
|
+
}
|
1210
|
+
}
|
1211
|
+
}
|
1212
|
+
Ok(())
|
1213
|
+
}
|
1168
1214
|
|
1169
1215
|
// ------------------
|
1170
1216
|
// NESTED LIST - using helper function
|
@@ -1433,6 +1479,24 @@ fn fill_builder(
|
|
1433
1479
|
)
|
1434
1480
|
})?
|
1435
1481
|
.append_value(*x),
|
1482
|
+
ParquetValue::TimeMillis(x) => typed_builder
|
1483
|
+
.field_builder::<Time32MillisecondBuilder>(i)
|
1484
|
+
.ok_or_else(|| {
|
1485
|
+
MagnusError::new(
|
1486
|
+
magnus::exception::type_error(),
|
1487
|
+
"Failed to coerce into Time32MillisecondBuilder",
|
1488
|
+
)
|
1489
|
+
})?
|
1490
|
+
.append_value(*x),
|
1491
|
+
ParquetValue::TimeMicros(x) => typed_builder
|
1492
|
+
.field_builder::<Time64MicrosecondBuilder>(i)
|
1493
|
+
.ok_or_else(|| {
|
1494
|
+
MagnusError::new(
|
1495
|
+
magnus::exception::type_error(),
|
1496
|
+
"Failed to coerce into Time64MicrosecondBuilder",
|
1497
|
+
)
|
1498
|
+
})?
|
1499
|
+
.append_value(*x),
|
1436
1500
|
ParquetValue::List(items) => {
|
1437
1501
|
let list_builder = typed_builder
|
1438
1502
|
.field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
|
@@ -1647,6 +1711,24 @@ fn fill_builder(
|
|
1647
1711
|
)
|
1648
1712
|
})?
|
1649
1713
|
.append_null(),
|
1714
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimeMillis) => typed_builder
|
1715
|
+
.field_builder::<Time32MillisecondBuilder>(i)
|
1716
|
+
.ok_or_else(|| {
|
1717
|
+
MagnusError::new(
|
1718
|
+
magnus::exception::type_error(),
|
1719
|
+
"Failed to coerce into Time32MillisecondBuilder",
|
1720
|
+
)
|
1721
|
+
})?
|
1722
|
+
.append_null(),
|
1723
|
+
ParquetSchemaType::Primitive(PrimitiveType::TimeMicros) => typed_builder
|
1724
|
+
.field_builder::<Time64MicrosecondBuilder>(i)
|
1725
|
+
.ok_or_else(|| {
|
1726
|
+
MagnusError::new(
|
1727
|
+
magnus::exception::type_error(),
|
1728
|
+
"Failed to coerce into Time64MicrosecondBuilder",
|
1729
|
+
)
|
1730
|
+
})?
|
1731
|
+
.append_null(),
|
1650
1732
|
ParquetSchemaType::List(_) => typed_builder
|
1651
1733
|
.field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
|
1652
1734
|
.ok_or_else(|| {
|
@@ -1743,3 +1825,125 @@ pub fn convert_ruby_array_to_arrow(
|
|
1743
1825
|
}
|
1744
1826
|
convert_parquet_values_to_arrow(parquet_values, type_)
|
1745
1827
|
}
|
1828
|
+
|
1829
|
+
pub fn convert_to_time_millis(
|
1830
|
+
ruby: &Ruby,
|
1831
|
+
value: Value,
|
1832
|
+
format: Option<&str>,
|
1833
|
+
) -> Result<i32, MagnusError> {
|
1834
|
+
if value.is_kind_of(ruby.class_time()) {
|
1835
|
+
// Extract time components
|
1836
|
+
let hour = i32::try_convert(value.funcall::<_, _, Value>("hour", ())?)?;
|
1837
|
+
let min = i32::try_convert(value.funcall::<_, _, Value>("min", ())?)?;
|
1838
|
+
let sec = i32::try_convert(value.funcall::<_, _, Value>("sec", ())?)?;
|
1839
|
+
let usec = i32::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
|
1840
|
+
|
1841
|
+
// Convert to milliseconds since midnight
|
1842
|
+
Ok(hour * 3600000 + min * 60000 + sec * 1000 + usec / 1000)
|
1843
|
+
} else if value.is_kind_of(ruby.class_string()) {
|
1844
|
+
let s = String::try_convert(value)?;
|
1845
|
+
|
1846
|
+
if let Some(fmt) = format {
|
1847
|
+
// Parse using the provided format
|
1848
|
+
match jiff::civil::Time::strptime(fmt, &s) {
|
1849
|
+
Ok(time) => {
|
1850
|
+
let millis = time.hour() as i32 * 3600000
|
1851
|
+
+ time.minute() as i32 * 60000
|
1852
|
+
+ time.second() as i32 * 1000
|
1853
|
+
+ time.millisecond() as i32;
|
1854
|
+
Ok(millis)
|
1855
|
+
}
|
1856
|
+
Err(e) => Err(MagnusError::new(
|
1857
|
+
magnus::exception::type_error(),
|
1858
|
+
format!(
|
1859
|
+
"Failed to parse '{}' with format '{}' as time: {}",
|
1860
|
+
s, fmt, e
|
1861
|
+
),
|
1862
|
+
)),
|
1863
|
+
}
|
1864
|
+
} else {
|
1865
|
+
// Try to parse as standard time format
|
1866
|
+
match s.parse::<jiff::civil::Time>() {
|
1867
|
+
Ok(time) => {
|
1868
|
+
let millis = time.hour() as i32 * 3600000
|
1869
|
+
+ time.minute() as i32 * 60000
|
1870
|
+
+ time.second() as i32 * 1000
|
1871
|
+
+ time.millisecond() as i32;
|
1872
|
+
Ok(millis)
|
1873
|
+
}
|
1874
|
+
Err(e) => Err(MagnusError::new(
|
1875
|
+
magnus::exception::type_error(),
|
1876
|
+
format!("Failed to parse '{}' as time: {}", s, e),
|
1877
|
+
)),
|
1878
|
+
}
|
1879
|
+
}
|
1880
|
+
} else {
|
1881
|
+
Err(MagnusError::new(
|
1882
|
+
magnus::exception::type_error(),
|
1883
|
+
format!("Cannot convert {} to time_millis", unsafe {
|
1884
|
+
value.classname()
|
1885
|
+
}),
|
1886
|
+
))
|
1887
|
+
}
|
1888
|
+
}
|
1889
|
+
|
1890
|
+
pub fn convert_to_time_micros(
|
1891
|
+
ruby: &Ruby,
|
1892
|
+
value: Value,
|
1893
|
+
format: Option<&str>,
|
1894
|
+
) -> Result<i64, MagnusError> {
|
1895
|
+
if value.is_kind_of(ruby.class_time()) {
|
1896
|
+
// Extract time components
|
1897
|
+
let hour = i64::try_convert(value.funcall::<_, _, Value>("hour", ())?)?;
|
1898
|
+
let min = i64::try_convert(value.funcall::<_, _, Value>("min", ())?)?;
|
1899
|
+
let sec = i64::try_convert(value.funcall::<_, _, Value>("sec", ())?)?;
|
1900
|
+
let usec = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
|
1901
|
+
|
1902
|
+
// Convert to microseconds since midnight
|
1903
|
+
Ok(hour * 3600000000 + min * 60000000 + sec * 1000000 + usec)
|
1904
|
+
} else if value.is_kind_of(ruby.class_string()) {
|
1905
|
+
let s = String::try_convert(value)?;
|
1906
|
+
|
1907
|
+
if let Some(fmt) = format {
|
1908
|
+
// Parse using the provided format
|
1909
|
+
match jiff::civil::Time::strptime(fmt, &s) {
|
1910
|
+
Ok(time) => {
|
1911
|
+
let micros = time.hour() as i64 * 3600000000
|
1912
|
+
+ time.minute() as i64 * 60000000
|
1913
|
+
+ time.second() as i64 * 1000000
|
1914
|
+
+ time.microsecond() as i64;
|
1915
|
+
Ok(micros)
|
1916
|
+
}
|
1917
|
+
Err(e) => Err(MagnusError::new(
|
1918
|
+
magnus::exception::type_error(),
|
1919
|
+
format!(
|
1920
|
+
"Failed to parse '{}' with format '{}' as time: {}",
|
1921
|
+
s, fmt, e
|
1922
|
+
),
|
1923
|
+
)),
|
1924
|
+
}
|
1925
|
+
} else {
|
1926
|
+
// Try to parse as standard time format
|
1927
|
+
match s.parse::<jiff::civil::Time>() {
|
1928
|
+
Ok(time) => {
|
1929
|
+
let micros = time.hour() as i64 * 3600000000
|
1930
|
+
+ time.minute() as i64 * 60000000
|
1931
|
+
+ time.second() as i64 * 1000000
|
1932
|
+
+ time.microsecond() as i64;
|
1933
|
+
Ok(micros)
|
1934
|
+
}
|
1935
|
+
Err(e) => Err(MagnusError::new(
|
1936
|
+
magnus::exception::type_error(),
|
1937
|
+
format!("Failed to parse '{}' as time: {}", s, e),
|
1938
|
+
)),
|
1939
|
+
}
|
1940
|
+
}
|
1941
|
+
} else {
|
1942
|
+
Err(MagnusError::new(
|
1943
|
+
magnus::exception::type_error(),
|
1944
|
+
format!("Cannot convert {} to time_micros", unsafe {
|
1945
|
+
value.classname()
|
1946
|
+
}),
|
1947
|
+
))
|
1948
|
+
}
|
1949
|
+
}
|
@@ -146,7 +146,10 @@ impl FromStr for ParquetSchemaType<'_> {
|
|
146
146
|
}
|
147
147
|
|
148
148
|
// Check if it's a decimal256 type with precision and scale
|
149
|
-
if let Some(decimal_params) = s
|
149
|
+
if let Some(decimal_params) = s
|
150
|
+
.strip_prefix("decimal256(")
|
151
|
+
.and_then(|s| s.strip_suffix(")"))
|
152
|
+
{
|
150
153
|
let parts: Vec<&str> = decimal_params.split(',').collect();
|
151
154
|
|
152
155
|
// Handle both single parameter (precision only) and two parameters (precision and scale)
|
@@ -210,6 +213,8 @@ impl FromStr for ParquetSchemaType<'_> {
|
|
210
213
|
"date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
|
211
214
|
"timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
|
212
215
|
"timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
|
216
|
+
"time_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimeMillis)),
|
217
|
+
"time_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimeMicros)),
|
213
218
|
"decimal" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(
|
214
219
|
38, 0,
|
215
220
|
))),
|
@@ -230,6 +230,16 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
|
|
230
230
|
"TimestampNanos not supported, please adjust your schema or code.",
|
231
231
|
))
|
232
232
|
}
|
233
|
+
DataType::Time32(TimeUnit::Millisecond) => Ok(PST::Primitive(PrimitiveType::TimeMillis)),
|
234
|
+
DataType::Time64(TimeUnit::Microsecond) => Ok(PST::Primitive(PrimitiveType::TimeMicros)),
|
235
|
+
DataType::Time32(_) => Err(MagnusError::new(
|
236
|
+
magnus::exception::runtime_error(),
|
237
|
+
"Time32 only supports millisecond unit",
|
238
|
+
)),
|
239
|
+
DataType::Time64(_) => Err(MagnusError::new(
|
240
|
+
magnus::exception::runtime_error(),
|
241
|
+
"Time64 only supports microsecond unit",
|
242
|
+
)),
|
233
243
|
DataType::Utf8 => Ok(PST::Primitive(PrimitiveType::String)),
|
234
244
|
DataType::Binary => Ok(PST::Primitive(PrimitiveType::Binary)),
|
235
245
|
DataType::LargeUtf8 => {
|
@@ -170,6 +170,12 @@ fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemEr
|
|
170
170
|
PrimitiveType::TimestampMicros => {
|
171
171
|
PST::Primitive(PrimitiveType::TimestampMicros)
|
172
172
|
}
|
173
|
+
PrimitiveType::TimeMillis => {
|
174
|
+
PST::Primitive(PrimitiveType::TimeMillis)
|
175
|
+
}
|
176
|
+
PrimitiveType::TimeMicros => {
|
177
|
+
PST::Primitive(PrimitiveType::TimeMicros)
|
178
|
+
}
|
173
179
|
PrimitiveType::Decimal256(precision, scale) => {
|
174
180
|
PST::Primitive(PrimitiveType::Decimal256(precision, scale))
|
175
181
|
}
|
@@ -259,9 +259,11 @@ pub fn estimate_value_size(
|
|
259
259
|
PST::Primitive(PrimitiveType::Boolean) => Ok(1),
|
260
260
|
PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
|
261
261
|
PST::Primitive(PrimitiveType::Decimal256(_, _)) => Ok(32),
|
262
|
-
PST::Primitive(PrimitiveType::Date32)
|
263
|
-
|
264
|
-
| PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
|
262
|
+
PST::Primitive(PrimitiveType::Date32) => Ok(4), // Date32 is 4 bytes
|
263
|
+
PST::Primitive(PrimitiveType::TimestampMillis)
|
264
|
+
| PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8), // Timestamps are 8 bytes
|
265
|
+
PST::Primitive(PrimitiveType::TimeMillis) => Ok(4), // TimeMillis is 4 bytes
|
266
|
+
PST::Primitive(PrimitiveType::TimeMicros) => Ok(8), // TimeMicros is 8 bytes
|
265
267
|
PST::Primitive(PrimitiveType::String) | PST::Primitive(PrimitiveType::Binary) => {
|
266
268
|
if let Ok(s) = String::try_convert(value) {
|
267
269
|
// Account for string length plus Rust String's capacity+pointer overhead
|
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.13
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-06-
|
11
|
+
date: 2025-06-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|