parquet 0.5.11 → 0.5.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 82528b663c4a577262db90b6d17ba473a81d0ea725ceba486b63a3619040fa73
4
- data.tar.gz: 2e44daa9b4e36ef1503589daaa0815cbc3acee10c565d9942f6c0b6d35ced5f0
3
+ metadata.gz: 5f1b43212634fda95b699a725acd82761c5ab5fe4a51473c3bc1dc5eb9bf2226
4
+ data.tar.gz: 6dfdc2027a957673ad5591dc293624f138e1ef5ef6772ad06ef31b24ca8f360d
5
5
  SHA512:
6
- metadata.gz: 418951253384f5492385fcb30fa5b0113b85d9bc51346b6abad16105c124d8869266943c1a29bc0879cfee4270b94d32fb99004e233c6ebde4a70e1d329435af
7
- data.tar.gz: bc0db4ebb36add314253b5b9b946cc2c84f315d51ba7fefbead6c7de3b65a3f7752fa4e4cf0be19704405b390ae0106d8383e30791e7fac4a86a75141c214de1
6
+ metadata.gz: dcdbfdeb1be352af9aa1356f6d1a13070df84410d6f7b7eaa744d5b3dd62ba18004ee189be23566fc0b3a39e5c6c3548505a6783d65cdd69ff677f4cd2206d29
7
+ data.tar.gz: a9d4cad07d8d91edc48a9a7268ad53a24ffd9a1b83f0265c2f8194a73fed097d6c6f74398707f3c1f026978892a9f84273ae68bd619f41653341dbb39ef880ee
data/Cargo.lock CHANGED
@@ -64,7 +64,7 @@ dependencies = [
64
64
  [[package]]
65
65
  name = "arrow-array"
66
66
  version = "55.1.0"
67
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
67
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
68
68
  dependencies = [
69
69
  "ahash",
70
70
  "arrow-buffer",
@@ -79,7 +79,7 @@ dependencies = [
79
79
  [[package]]
80
80
  name = "arrow-buffer"
81
81
  version = "55.1.0"
82
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
82
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
83
83
  dependencies = [
84
84
  "bytes",
85
85
  "half",
@@ -89,7 +89,7 @@ dependencies = [
89
89
  [[package]]
90
90
  name = "arrow-cast"
91
91
  version = "55.1.0"
92
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
92
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
93
93
  dependencies = [
94
94
  "arrow-array",
95
95
  "arrow-buffer",
@@ -108,7 +108,7 @@ dependencies = [
108
108
  [[package]]
109
109
  name = "arrow-data"
110
110
  version = "55.1.0"
111
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
111
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
112
112
  dependencies = [
113
113
  "arrow-buffer",
114
114
  "arrow-schema",
@@ -119,7 +119,7 @@ dependencies = [
119
119
  [[package]]
120
120
  name = "arrow-ipc"
121
121
  version = "55.1.0"
122
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
122
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
123
123
  dependencies = [
124
124
  "arrow-array",
125
125
  "arrow-buffer",
@@ -132,12 +132,12 @@ dependencies = [
132
132
  [[package]]
133
133
  name = "arrow-schema"
134
134
  version = "55.1.0"
135
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
135
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
136
136
 
137
137
  [[package]]
138
138
  name = "arrow-select"
139
139
  version = "55.1.0"
140
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
140
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
141
141
  dependencies = [
142
142
  "ahash",
143
143
  "arrow-array",
@@ -867,7 +867,7 @@ dependencies = [
867
867
  [[package]]
868
868
  name = "parquet"
869
869
  version = "55.1.0"
870
- source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-time#a6a93f401a9a2aba72aa362822b50068f67ded42"
870
+ source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#2d8b0b99d70de81b96b76cfdc10faa20dc7aed4e"
871
871
  dependencies = [
872
872
  "ahash",
873
873
  "arrow-array",
data/README.md CHANGED
@@ -265,6 +265,7 @@ The following data types are supported in the schema:
265
265
  - `boolean`
266
266
  - `date32`
267
267
  - `timestamp_millis`, `timestamp_micros`
268
+ - `time_millis`, `time_micros`
268
269
 
269
270
  ### Schema DSL for Complex Data Types
270
271
 
@@ -11,16 +11,16 @@ rb-sys-env = "^0.2"
11
11
 
12
12
  [dependencies]
13
13
  ahash = "0.8"
14
- arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
15
- arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
16
- arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["lz4"] }
17
- arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
14
+ arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
15
+ arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
16
+ arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["lz4"] }
17
+ arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
18
18
  bytes = "^1.9"
19
19
  either = "1.9"
20
20
  itertools = "^0.14"
21
21
  jiff = "0.2"
22
22
  magnus = { version = "0.7", features = ["rb-sys"] }
23
- parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["json"] }
23
+ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["json"] }
24
24
  rand = "0.9"
25
25
  rb-sys = "^0.9"
26
26
  simdutf8 = "0.1.5"
@@ -115,4 +115,6 @@ pub enum PrimitiveType {
115
115
  Date32,
116
116
  TimestampMillis,
117
117
  TimestampMicros,
118
+ TimeMillis,
119
+ TimeMicros,
118
120
  }
@@ -25,9 +25,9 @@ use arrow_array::cast::downcast_array;
25
25
  use arrow_array::{
26
26
  Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
27
27
  Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
28
- ListArray, NullArray, StringArray, StructArray, TimestampMicrosecondArray,
29
- TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
30
- UInt32Array, UInt64Array, UInt8Array,
28
+ ListArray, NullArray, StringArray, StructArray, Time32MillisecondArray, Time64MicrosecondArray,
29
+ TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
30
+ TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
31
31
  };
32
32
  use arrow_schema::{DataType, TimeUnit};
33
33
  use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
@@ -29,6 +29,8 @@ pub enum ParquetValue {
29
29
  TimestampMillis(i64, Option<Arc<str>>),
30
30
  TimestampMicros(i64, Option<Arc<str>>),
31
31
  TimestampNanos(i64, Option<Arc<str>>),
32
+ TimeMillis(i32), // Time of day in milliseconds since midnight
33
+ TimeMicros(i64), // Time of day in microseconds since midnight
32
34
  List(Vec<ParquetValue>), // A list of values (can be empty or have null items)
33
35
  // We're not using a separate NilList type anymore - we'll handle nil lists elsewhere
34
36
  Map(HashMap<ParquetValue, ParquetValue>),
@@ -108,6 +110,8 @@ impl PartialEq for ParquetValue {
108
110
  (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
109
111
  (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
110
112
  (ParquetValue::TimestampNanos(a, _), ParquetValue::TimestampNanos(b, _)) => a == b,
113
+ (ParquetValue::TimeMillis(a), ParquetValue::TimeMillis(b)) => a == b,
114
+ (ParquetValue::TimeMicros(a), ParquetValue::TimeMicros(b)) => a == b,
111
115
  (ParquetValue::List(a), ParquetValue::List(b)) => a == b,
112
116
  (ParquetValue::Null, ParquetValue::Null) => true,
113
117
  _ => false,
@@ -160,6 +164,8 @@ impl std::hash::Hash for ParquetValue {
160
164
  ts.hash(state);
161
165
  tz.hash(state);
162
166
  }
167
+ ParquetValue::TimeMillis(t) => t.hash(state),
168
+ ParquetValue::TimeMicros(t) => t.hash(state),
163
169
  ParquetValue::List(l) => l.hash(state),
164
170
  ParquetValue::Map(m) => {
165
171
  for (k, v) in m {
@@ -224,6 +230,38 @@ impl TryIntoValue for ParquetValue {
224
230
  timestamp @ ParquetValue::TimestampNanos(_, _) => {
225
231
  impl_timestamp_conversion!(timestamp, TimestampNanos, handle)
226
232
  }
233
+ ParquetValue::TimeMillis(millis) => {
234
+ // Convert time of day in milliseconds to a Ruby Time object
235
+ // Use epoch date (1970-01-01) with the given time
236
+ let total_seconds = millis / 1000;
237
+ let ms = millis % 1000;
238
+ let hours = total_seconds / 3600;
239
+ let minutes = (total_seconds % 3600) / 60;
240
+ let seconds = total_seconds % 60;
241
+
242
+ // Create a Time object for 1970-01-01 with the given time
243
+ let time_class = handle.class_time();
244
+ let time = time_class.funcall::<_, _, Value>(
245
+ "new",
246
+ (1970, 1, 1, hours, minutes, seconds, ms * 1000), // Ruby expects microseconds
247
+ )?;
248
+ Ok(time.into_value_with(handle))
249
+ }
250
+ ParquetValue::TimeMicros(micros) => {
251
+ // Convert time of day in microseconds to a Ruby Time object
252
+ // Use epoch date (1970-01-01) with the given time
253
+ let total_seconds = micros / 1_000_000;
254
+ let us = micros % 1_000_000;
255
+ let hours = total_seconds / 3600;
256
+ let minutes = (total_seconds % 3600) / 60;
257
+ let seconds = total_seconds % 60;
258
+
259
+ // Create a Time object for 1970-01-01 with the given time
260
+ let time_class = handle.class_time();
261
+ let time = time_class
262
+ .funcall::<_, _, Value>("new", (1970, 1, 1, hours, minutes, seconds, us))?;
263
+ Ok(time.into_value_with(handle))
264
+ }
227
265
  ParquetValue::List(l) => {
228
266
  // For lists, convert to Ruby array and check for specific cases
229
267
  // when we might need to return nil instead of an empty array
@@ -356,12 +394,32 @@ impl ParquetValue {
356
394
  Ok(ParquetValue::Date32(v))
357
395
  }
358
396
  PrimitiveType::TimestampMillis => {
359
- let v = convert_to_timestamp_millis(ruby, value, format)?;
360
- Ok(ParquetValue::TimestampMillis(v, None))
397
+ if value.is_kind_of(ruby.class_time()) {
398
+ use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
399
+ let (v, tz) = ruby_time_to_timestamp_with_tz(value, "millis")?;
400
+ Ok(ParquetValue::TimestampMillis(v, tz))
401
+ } else {
402
+ let v = convert_to_timestamp_millis(ruby, value, format)?;
403
+ Ok(ParquetValue::TimestampMillis(v, None))
404
+ }
361
405
  }
362
406
  PrimitiveType::TimestampMicros => {
363
- let v = convert_to_timestamp_micros(ruby, value, format)?;
364
- Ok(ParquetValue::TimestampMicros(v, None))
407
+ if value.is_kind_of(ruby.class_time()) {
408
+ use crate::types::timestamp::ruby_time_to_timestamp_with_tz;
409
+ let (v, tz) = ruby_time_to_timestamp_with_tz(value, "micros")?;
410
+ Ok(ParquetValue::TimestampMicros(v, tz))
411
+ } else {
412
+ let v = convert_to_timestamp_micros(ruby, value, format)?;
413
+ Ok(ParquetValue::TimestampMicros(v, None))
414
+ }
415
+ }
416
+ PrimitiveType::TimeMillis => {
417
+ let v = convert_to_time_millis(ruby, value, format)?;
418
+ Ok(ParquetValue::TimeMillis(v))
419
+ }
420
+ PrimitiveType::TimeMicros => {
421
+ let v = convert_to_time_micros(ruby, value, format)?;
422
+ Ok(ParquetValue::TimeMicros(v))
365
423
  }
366
424
  },
367
425
  ParquetSchemaType::List(list_field) => {
@@ -980,6 +1038,52 @@ impl<'a> TryFrom<ArrayWrapper<'a>> for ParquetValueVec {
980
1038
  tz
981
1039
  )
982
1040
  }
1041
+ DataType::Time32(TimeUnit::Millisecond) => {
1042
+ let array = downcast_array::<Time32MillisecondArray>(column.array);
1043
+ Ok(ParquetValueVec(if array.is_nullable() {
1044
+ array
1045
+ .values()
1046
+ .iter()
1047
+ .enumerate()
1048
+ .map(|(i, x)| {
1049
+ if array.is_null(i) {
1050
+ ParquetValue::Null
1051
+ } else {
1052
+ ParquetValue::TimeMillis(*x)
1053
+ }
1054
+ })
1055
+ .collect()
1056
+ } else {
1057
+ array
1058
+ .values()
1059
+ .iter()
1060
+ .map(|x| ParquetValue::TimeMillis(*x))
1061
+ .collect()
1062
+ }))
1063
+ }
1064
+ DataType::Time64(TimeUnit::Microsecond) => {
1065
+ let array = downcast_array::<Time64MicrosecondArray>(column.array);
1066
+ Ok(ParquetValueVec(if array.is_nullable() {
1067
+ array
1068
+ .values()
1069
+ .iter()
1070
+ .enumerate()
1071
+ .map(|(i, x)| {
1072
+ if array.is_null(i) {
1073
+ ParquetValue::Null
1074
+ } else {
1075
+ ParquetValue::TimeMicros(*x)
1076
+ }
1077
+ })
1078
+ .collect()
1079
+ } else {
1080
+ array
1081
+ .values()
1082
+ .iter()
1083
+ .map(|x| ParquetValue::TimeMicros(*x))
1084
+ .collect()
1085
+ }))
1086
+ }
983
1087
  DataType::Float16 => {
984
1088
  let array = downcast_array::<Float16Array>(column.array);
985
1089
  if array.is_nullable() {
@@ -1,6 +1,7 @@
1
1
  use std::sync::OnceLock;
2
2
 
3
3
  use itertools::Itertools;
4
+ use jiff::ToSpan;
4
5
  use parquet::{
5
6
  basic::{ConvertedType, LogicalType},
6
7
  data_type::AsBytes,
@@ -372,8 +373,9 @@ impl TryIntoValue for ParquetField {
372
373
  }
373
374
  }
374
375
  Field::Date(d) => {
375
- let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
376
- let formatted = ts.strftime("%Y-%m-%d").to_string();
376
+ let epoch = jiff::civil::Date::new(1970, 1, 1)?;
377
+ let date = epoch.checked_add(d.days()).map_err(ParquetGemError::Jiff)?;
378
+ let formatted = date.to_string();
377
379
  Ok(formatted.into_value_with(handle))
378
380
  }
379
381
  Field::TimeMillis(ts) => {
@@ -295,6 +295,8 @@ fn parse_primitive_type(s: &str) -> Option<PrimitiveType> {
295
295
  "date" | "date32" => Some(PrimitiveType::Date32),
296
296
  "timestamp_millis" | "timestamp_ms" => Some(PrimitiveType::TimestampMillis),
297
297
  "timestamp_micros" | "timestamp_us" => Some(PrimitiveType::TimestampMicros),
298
+ "time_millis" | "time_ms" => Some(PrimitiveType::TimeMillis),
299
+ "time_micros" | "time_us" => Some(PrimitiveType::TimeMicros),
298
300
  "decimal" => Some(PrimitiveType::Decimal128(38, 0)), // Maximum precision, scale 0
299
301
  "decimal256" => Some(PrimitiveType::Decimal256(38, 0)), // Maximum precision, scale 0
300
302
  _ => None,
@@ -337,6 +339,12 @@ pub fn schema_node_to_arrow_field(node: &SchemaNode) -> ArrowField {
337
339
  PrimitiveType::TimestampMicros => {
338
340
  ArrowDataType::Timestamp(arrow_schema::TimeUnit::Microsecond, None)
339
341
  }
342
+ PrimitiveType::TimeMillis => {
343
+ ArrowDataType::Time32(arrow_schema::TimeUnit::Millisecond)
344
+ }
345
+ PrimitiveType::TimeMicros => {
346
+ ArrowDataType::Time64(arrow_schema::TimeUnit::Microsecond)
347
+ }
340
348
  };
341
349
  ArrowField::new(name, dt, *nullable)
342
350
  }
@@ -1,4 +1,107 @@
1
1
  use super::*;
2
+ use magnus::{TryConvert, Value};
3
+
4
+ /// Parses a fixed offset timezone string (e.g., "+09:00", "-05:30", "+0800")
5
+ /// Returns the offset in minutes from UTC
6
+ fn parse_fixed_offset(tz: &str) -> Result<i32, ParquetGemError> {
7
+ // Remove any whitespace
8
+ let tz = tz.trim();
9
+
10
+ // Check if it starts with + or -
11
+ if !tz.starts_with('+') && !tz.starts_with('-') {
12
+ return Err(MagnusError::new(
13
+ magnus::exception::arg_error(),
14
+ format!(
15
+ "Invalid timezone offset format: '{}'. Expected format like '+09:00' or '-0530'",
16
+ tz
17
+ ),
18
+ ))?;
19
+ }
20
+
21
+ let sign = if tz.starts_with('-') { -1 } else { 1 };
22
+ let offset_str = &tz[1..]; // Remove the sign
23
+
24
+ // Parse different formats: "+09:00", "+0900", "+09"
25
+ let (hours, minutes) = if offset_str.contains(':') {
26
+ // Format: "+09:00" or "+9:30"
27
+ let parts: Vec<&str> = offset_str.split(':').collect();
28
+ if parts.len() != 2 {
29
+ return Err(MagnusError::new(
30
+ magnus::exception::arg_error(),
31
+ format!("Invalid timezone offset format: '{}'. Expected HH:MM", tz),
32
+ ))?;
33
+ }
34
+
35
+ let h = parts[0].parse::<i32>().map_err(|e| {
36
+ MagnusError::new(
37
+ magnus::exception::arg_error(),
38
+ format!("Invalid hour in timezone offset '{}': {}", tz, e),
39
+ )
40
+ })?;
41
+
42
+ let m = parts[1].parse::<i32>().map_err(|e| {
43
+ MagnusError::new(
44
+ magnus::exception::arg_error(),
45
+ format!("Invalid minute in timezone offset '{}': {}", tz, e),
46
+ )
47
+ })?;
48
+
49
+ (h, m)
50
+ } else if offset_str.len() == 4 {
51
+ // Format: "+0900"
52
+ let h = offset_str[0..2].parse::<i32>().map_err(|e| {
53
+ MagnusError::new(
54
+ magnus::exception::arg_error(),
55
+ format!("Invalid hour in timezone offset '{}': {}", tz, e),
56
+ )
57
+ })?;
58
+
59
+ let m = offset_str[2..4].parse::<i32>().map_err(|e| {
60
+ MagnusError::new(
61
+ magnus::exception::arg_error(),
62
+ format!("Invalid minute in timezone offset '{}': {}", tz, e),
63
+ )
64
+ })?;
65
+
66
+ (h, m)
67
+ } else if offset_str.len() == 2
68
+ || (offset_str.len() == 1 && offset_str.chars().all(|c| c.is_numeric()))
69
+ {
70
+ // Format: "+09" or "+9"
71
+ let h = offset_str.parse::<i32>().map_err(|e| {
72
+ MagnusError::new(
73
+ magnus::exception::arg_error(),
74
+ format!("Invalid hour in timezone offset '{}': {}", tz, e),
75
+ )
76
+ })?;
77
+ (h, 0)
78
+ } else {
79
+ return Err(MagnusError::new(
80
+ magnus::exception::arg_error(),
81
+ format!("Invalid timezone offset format: '{}'. Expected formats: '+HH:MM', '+HHMM', or '+HH'", tz),
82
+ ))?;
83
+ };
84
+
85
+ // Validate ranges
86
+ if hours < 0 || hours > 23 {
87
+ return Err(MagnusError::new(
88
+ magnus::exception::arg_error(),
89
+ format!("Invalid hour in timezone offset: {}. Must be 0-23", hours),
90
+ ))?;
91
+ }
92
+
93
+ if minutes < 0 || minutes > 59 {
94
+ return Err(MagnusError::new(
95
+ magnus::exception::arg_error(),
96
+ format!(
97
+ "Invalid minute in timezone offset: {}. Must be 0-59",
98
+ minutes
99
+ ),
100
+ ))?;
101
+ }
102
+
103
+ Ok(sign * (hours * 60 + minutes))
104
+ }
2
105
 
3
106
  pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, ParquetGemError> {
4
107
  let (ts, tz) = match value {
@@ -18,37 +121,40 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, Pa
18
121
 
19
122
  // If timezone is provided, convert to zoned timestamp
20
123
  if let Some(tz) = tz {
21
- // Handle fixed offset timezones like "+09:00" first
124
+ // Handle fixed offset timezones first
22
125
  if tz.starts_with('+') || tz.starts_with('-') {
23
- // Parse the offset string into hours and minutes
24
- let (hours, minutes) = if tz.len() >= 5 && tz.contains(':') {
25
- // Format: "+09:00" or "-09:00"
26
- let h = tz[1..3].parse::<i32>().unwrap_or(0);
27
- let m = tz[4..6].parse::<i32>().unwrap_or(0);
28
- (h, m)
29
- } else if tz.len() >= 3 {
30
- // Format: "+09" or "-09"
31
- let h = tz[1..3].parse::<i32>().unwrap_or(0);
32
- (h, 0)
33
- } else {
34
- (0, 0)
35
- };
126
+ let total_minutes = parse_fixed_offset(tz)?;
127
+
128
+ // Create fixed timezone using the parsed offset
129
+ let offset_hours = total_minutes / 60;
130
+ let offset_minutes = total_minutes % 60;
36
131
 
37
- // Apply sign
38
- let total_minutes = if tz.starts_with('-') {
39
- -(hours * 60 + minutes)
132
+ // jiff expects offset in hours, but we can be more precise
133
+ let tz = if offset_minutes == 0 {
134
+ jiff::tz::TimeZone::fixed(jiff::tz::offset(offset_hours as i8))
40
135
  } else {
41
- hours * 60 + minutes
136
+ // For non-zero minutes, we need to create a custom offset
137
+ // jiff doesn't directly support minute-precision offsets in the simple API,
138
+ // so we'll use the timestamp directly with the offset applied
139
+ return Ok(ts);
42
140
  };
43
141
 
44
- // Create fixed timezone
45
- let tz = jiff::tz::TimeZone::fixed(jiff::tz::offset((total_minutes / 60) as i8));
46
142
  Ok(ts.to_zoned(tz).timestamp())
143
+ } else if tz.eq_ignore_ascii_case("UTC") || tz.eq_ignore_ascii_case("GMT") {
144
+ // Common UTC aliases
145
+ Ok(ts)
47
146
  } else {
48
147
  // Try IANA timezone
49
148
  match ts.in_tz(tz) {
50
149
  Ok(zoned) => Ok(zoned.timestamp()),
51
- Err(_) => Ok(ts), // Fall back to UTC if timezone is invalid
150
+ Err(e) => {
151
+ // Log the error but don't fail - fall back to UTC
152
+ eprintln!(
153
+ "Warning: Failed to parse timezone '{}': {}. Using UTC.",
154
+ tz, e
155
+ );
156
+ Ok(ts)
157
+ }
52
158
  }
53
159
  }
54
160
  } else {
@@ -57,21 +163,112 @@ pub fn parse_zoned_timestamp(value: &ParquetValue) -> Result<jiff::Timestamp, Pa
57
163
  }
58
164
  }
59
165
 
166
+ /// Validates and normalizes a timezone string
167
+ /// Returns the normalized timezone string or None if invalid
168
+ pub fn validate_timezone(tz: &str) -> Option<String> {
169
+ let tz = tz.trim();
170
+
171
+ // Check for empty timezone
172
+ if tz.is_empty() {
173
+ return None;
174
+ }
175
+
176
+ // Fixed offset timezones
177
+ if tz.starts_with('+') || tz.starts_with('-') {
178
+ // Validate it can be parsed
179
+ if parse_fixed_offset(tz).is_ok() {
180
+ return Some(tz.to_string());
181
+ }
182
+ }
183
+
184
+ // Common UTC aliases
185
+ if tz.eq_ignore_ascii_case("UTC")
186
+ || tz.eq_ignore_ascii_case("GMT")
187
+ || tz.eq_ignore_ascii_case("Z")
188
+ {
189
+ return Some("UTC".to_string());
190
+ }
191
+
192
+ // Try to validate as IANA timezone by attempting to use it
193
+ // This is a bit expensive but ensures we only store valid timezones
194
+ if let Ok(tz_obj) = jiff::tz::TimeZone::get(tz) {
195
+ // Use the canonical name from jiff
196
+ return Some(
197
+ tz_obj
198
+ .iana_name()
199
+ .map(|s| s.to_string())
200
+ .unwrap_or_else(|| tz.to_string()),
201
+ );
202
+ }
203
+
204
+ None
205
+ }
206
+
207
+ /// Converts a Ruby Time object to a timestamp with timezone
208
+ pub fn ruby_time_to_timestamp_with_tz(
209
+ value: Value,
210
+ unit: &str,
211
+ ) -> Result<(i64, Option<Arc<str>>), MagnusError> {
212
+ // Get seconds and microseconds
213
+ let secs = i64::try_convert(value.funcall::<_, _, Value>("to_i", ())?)?;
214
+ let usecs = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
215
+
216
+ // Get timezone information from Ruby Time object
217
+ let tz_str = if let Ok(zone) = value.funcall::<_, _, Value>("zone", ()) {
218
+ if zone.is_nil() {
219
+ None
220
+ } else if let Ok(s) = String::try_convert(zone) {
221
+ validate_timezone(&s).map(|tz| Arc::from(tz.as_str()))
222
+ } else {
223
+ None
224
+ }
225
+ } else {
226
+ None
227
+ };
228
+
229
+ // Convert to appropriate unit
230
+ let timestamp = match unit {
231
+ "millis" => secs * 1000 + (usecs / 1000),
232
+ "micros" => secs * 1_000_000 + usecs,
233
+ "seconds" => secs,
234
+ "nanos" => secs * 1_000_000_000 + (usecs * 1000),
235
+ _ => {
236
+ return Err(MagnusError::new(
237
+ magnus::exception::arg_error(),
238
+ format!("Invalid timestamp unit: {}", unit),
239
+ ))
240
+ }
241
+ };
242
+
243
+ Ok((timestamp, tz_str))
244
+ }
245
+
60
246
  // Macro for handling timestamp conversions
61
247
  #[macro_export]
62
248
  macro_rules! impl_timestamp_conversion {
63
249
  ($value:expr, $unit:ident, $handle:expr) => {{
64
250
  match $value {
65
251
  ParquetValue::$unit(ts, tz) => {
66
- let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz))?;
252
+ let ts = parse_zoned_timestamp(&ParquetValue::$unit(ts, tz.clone()))?;
67
253
  let time_class = $handle.class_time();
68
- Ok(time_class
254
+
255
+ // Convert timestamp to Time object
256
+ let time_obj = time_class
69
257
  .funcall::<_, _, Value>("parse", (ts.to_string(),))?
70
- .into_value_with($handle))
258
+ .into_value_with($handle);
259
+
260
+ // If we have timezone info, we've already handled it in parse_zoned_timestamp
261
+ // The resulting Time object will be in the correct timezone
262
+
263
+ Ok(time_obj)
71
264
  }
72
265
  _ => Err(MagnusError::new(
73
266
  magnus::exception::type_error(),
74
- "Invalid timestamp type".to_string(),
267
+ format!(
268
+ "Invalid timestamp type. Expected {}, got {:?}",
269
+ stringify!($unit),
270
+ $value
271
+ ),
75
272
  ))?,
76
273
  }
77
274
  }};
@@ -250,6 +250,8 @@ pub fn parquet_schema_type_to_arrow_data_type(
250
250
  PrimitiveType::Date32 => DataType::Date32,
251
251
  PrimitiveType::TimestampMillis => DataType::Timestamp(TimeUnit::Millisecond, None),
252
252
  PrimitiveType::TimestampMicros => DataType::Timestamp(TimeUnit::Microsecond, None),
253
+ PrimitiveType::TimeMillis => DataType::Time32(TimeUnit::Millisecond),
254
+ PrimitiveType::TimeMicros => DataType::Time64(TimeUnit::Microsecond),
253
255
  },
254
256
  // For a List<T>, create a standard List in Arrow with nullable items
255
257
  ParquetSchemaType::List(list_field) => {
@@ -416,6 +418,12 @@ fn create_arrow_builder_for_type(
416
418
  ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros) => {
417
419
  Ok(Box::new(TimestampMicrosecondBuilder::with_capacity(cap)))
418
420
  }
421
+ ParquetSchemaType::Primitive(PrimitiveType::TimeMillis) => {
422
+ Ok(Box::new(Time32MillisecondBuilder::with_capacity(cap)))
423
+ }
424
+ ParquetSchemaType::Primitive(PrimitiveType::TimeMicros) => {
425
+ Ok(Box::new(Time64MicrosecondBuilder::with_capacity(cap)))
426
+ }
419
427
  ParquetSchemaType::List(list_field) => {
420
428
  // For a list, we create a ListBuilder whose child builder is determined by item_type.
421
429
  // Pass through capacity to ensure consistent sizing
@@ -1165,6 +1173,44 @@ fn fill_builder(
1165
1173
  }
1166
1174
  Ok(())
1167
1175
  }
1176
+ ParquetSchemaType::Primitive(PrimitiveType::TimeMillis) => {
1177
+ let typed_builder = builder
1178
+ .as_any_mut()
1179
+ .downcast_mut::<Time32MillisecondBuilder>()
1180
+ .expect("Builder mismatch: expected Time32MillisecondBuilder");
1181
+ for val in values {
1182
+ match val {
1183
+ ParquetValue::TimeMillis(t) => typed_builder.append_value(*t),
1184
+ ParquetValue::Null => typed_builder.append_null(),
1185
+ other => {
1186
+ return Err(MagnusError::new(
1187
+ magnus::exception::type_error(),
1188
+ format!("Expected TimeMillis, got {:?}", other),
1189
+ ))
1190
+ }
1191
+ }
1192
+ }
1193
+ Ok(())
1194
+ }
1195
+ ParquetSchemaType::Primitive(PrimitiveType::TimeMicros) => {
1196
+ let typed_builder = builder
1197
+ .as_any_mut()
1198
+ .downcast_mut::<Time64MicrosecondBuilder>()
1199
+ .expect("Builder mismatch: expected Time64MicrosecondBuilder");
1200
+ for val in values {
1201
+ match val {
1202
+ ParquetValue::TimeMicros(t) => typed_builder.append_value(*t),
1203
+ ParquetValue::Null => typed_builder.append_null(),
1204
+ other => {
1205
+ return Err(MagnusError::new(
1206
+ magnus::exception::type_error(),
1207
+ format!("Expected TimeMicros, got {:?}", other),
1208
+ ))
1209
+ }
1210
+ }
1211
+ }
1212
+ Ok(())
1213
+ }
1168
1214
 
1169
1215
  // ------------------
1170
1216
  // NESTED LIST - using helper function
@@ -1433,6 +1479,24 @@ fn fill_builder(
1433
1479
  )
1434
1480
  })?
1435
1481
  .append_value(*x),
1482
+ ParquetValue::TimeMillis(x) => typed_builder
1483
+ .field_builder::<Time32MillisecondBuilder>(i)
1484
+ .ok_or_else(|| {
1485
+ MagnusError::new(
1486
+ magnus::exception::type_error(),
1487
+ "Failed to coerce into Time32MillisecondBuilder",
1488
+ )
1489
+ })?
1490
+ .append_value(*x),
1491
+ ParquetValue::TimeMicros(x) => typed_builder
1492
+ .field_builder::<Time64MicrosecondBuilder>(i)
1493
+ .ok_or_else(|| {
1494
+ MagnusError::new(
1495
+ magnus::exception::type_error(),
1496
+ "Failed to coerce into Time64MicrosecondBuilder",
1497
+ )
1498
+ })?
1499
+ .append_value(*x),
1436
1500
  ParquetValue::List(items) => {
1437
1501
  let list_builder = typed_builder
1438
1502
  .field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
@@ -1647,6 +1711,24 @@ fn fill_builder(
1647
1711
  )
1648
1712
  })?
1649
1713
  .append_null(),
1714
+ ParquetSchemaType::Primitive(PrimitiveType::TimeMillis) => typed_builder
1715
+ .field_builder::<Time32MillisecondBuilder>(i)
1716
+ .ok_or_else(|| {
1717
+ MagnusError::new(
1718
+ magnus::exception::type_error(),
1719
+ "Failed to coerce into Time32MillisecondBuilder",
1720
+ )
1721
+ })?
1722
+ .append_null(),
1723
+ ParquetSchemaType::Primitive(PrimitiveType::TimeMicros) => typed_builder
1724
+ .field_builder::<Time64MicrosecondBuilder>(i)
1725
+ .ok_or_else(|| {
1726
+ MagnusError::new(
1727
+ magnus::exception::type_error(),
1728
+ "Failed to coerce into Time64MicrosecondBuilder",
1729
+ )
1730
+ })?
1731
+ .append_null(),
1650
1732
  ParquetSchemaType::List(_) => typed_builder
1651
1733
  .field_builder::<ListBuilder<Box<dyn ArrayBuilder>>>(i)
1652
1734
  .ok_or_else(|| {
@@ -1743,3 +1825,125 @@ pub fn convert_ruby_array_to_arrow(
1743
1825
  }
1744
1826
  convert_parquet_values_to_arrow(parquet_values, type_)
1745
1827
  }
1828
+
1829
+ pub fn convert_to_time_millis(
1830
+ ruby: &Ruby,
1831
+ value: Value,
1832
+ format: Option<&str>,
1833
+ ) -> Result<i32, MagnusError> {
1834
+ if value.is_kind_of(ruby.class_time()) {
1835
+ // Extract time components
1836
+ let hour = i32::try_convert(value.funcall::<_, _, Value>("hour", ())?)?;
1837
+ let min = i32::try_convert(value.funcall::<_, _, Value>("min", ())?)?;
1838
+ let sec = i32::try_convert(value.funcall::<_, _, Value>("sec", ())?)?;
1839
+ let usec = i32::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
1840
+
1841
+ // Convert to milliseconds since midnight
1842
+ Ok(hour * 3600000 + min * 60000 + sec * 1000 + usec / 1000)
1843
+ } else if value.is_kind_of(ruby.class_string()) {
1844
+ let s = String::try_convert(value)?;
1845
+
1846
+ if let Some(fmt) = format {
1847
+ // Parse using the provided format
1848
+ match jiff::civil::Time::strptime(fmt, &s) {
1849
+ Ok(time) => {
1850
+ let millis = time.hour() as i32 * 3600000
1851
+ + time.minute() as i32 * 60000
1852
+ + time.second() as i32 * 1000
1853
+ + time.millisecond() as i32;
1854
+ Ok(millis)
1855
+ }
1856
+ Err(e) => Err(MagnusError::new(
1857
+ magnus::exception::type_error(),
1858
+ format!(
1859
+ "Failed to parse '{}' with format '{}' as time: {}",
1860
+ s, fmt, e
1861
+ ),
1862
+ )),
1863
+ }
1864
+ } else {
1865
+ // Try to parse as standard time format
1866
+ match s.parse::<jiff::civil::Time>() {
1867
+ Ok(time) => {
1868
+ let millis = time.hour() as i32 * 3600000
1869
+ + time.minute() as i32 * 60000
1870
+ + time.second() as i32 * 1000
1871
+ + time.millisecond() as i32;
1872
+ Ok(millis)
1873
+ }
1874
+ Err(e) => Err(MagnusError::new(
1875
+ magnus::exception::type_error(),
1876
+ format!("Failed to parse '{}' as time: {}", s, e),
1877
+ )),
1878
+ }
1879
+ }
1880
+ } else {
1881
+ Err(MagnusError::new(
1882
+ magnus::exception::type_error(),
1883
+ format!("Cannot convert {} to time_millis", unsafe {
1884
+ value.classname()
1885
+ }),
1886
+ ))
1887
+ }
1888
+ }
1889
+
1890
+ pub fn convert_to_time_micros(
1891
+ ruby: &Ruby,
1892
+ value: Value,
1893
+ format: Option<&str>,
1894
+ ) -> Result<i64, MagnusError> {
1895
+ if value.is_kind_of(ruby.class_time()) {
1896
+ // Extract time components
1897
+ let hour = i64::try_convert(value.funcall::<_, _, Value>("hour", ())?)?;
1898
+ let min = i64::try_convert(value.funcall::<_, _, Value>("min", ())?)?;
1899
+ let sec = i64::try_convert(value.funcall::<_, _, Value>("sec", ())?)?;
1900
+ let usec = i64::try_convert(value.funcall::<_, _, Value>("usec", ())?)?;
1901
+
1902
+ // Convert to microseconds since midnight
1903
+ Ok(hour * 3600000000 + min * 60000000 + sec * 1000000 + usec)
1904
+ } else if value.is_kind_of(ruby.class_string()) {
1905
+ let s = String::try_convert(value)?;
1906
+
1907
+ if let Some(fmt) = format {
1908
+ // Parse using the provided format
1909
+ match jiff::civil::Time::strptime(fmt, &s) {
1910
+ Ok(time) => {
1911
+ let micros = time.hour() as i64 * 3600000000
1912
+ + time.minute() as i64 * 60000000
1913
+ + time.second() as i64 * 1000000
1914
+ + time.microsecond() as i64;
1915
+ Ok(micros)
1916
+ }
1917
+ Err(e) => Err(MagnusError::new(
1918
+ magnus::exception::type_error(),
1919
+ format!(
1920
+ "Failed to parse '{}' with format '{}' as time: {}",
1921
+ s, fmt, e
1922
+ ),
1923
+ )),
1924
+ }
1925
+ } else {
1926
+ // Try to parse as standard time format
1927
+ match s.parse::<jiff::civil::Time>() {
1928
+ Ok(time) => {
1929
+ let micros = time.hour() as i64 * 3600000000
1930
+ + time.minute() as i64 * 60000000
1931
+ + time.second() as i64 * 1000000
1932
+ + time.microsecond() as i64;
1933
+ Ok(micros)
1934
+ }
1935
+ Err(e) => Err(MagnusError::new(
1936
+ magnus::exception::type_error(),
1937
+ format!("Failed to parse '{}' as time: {}", s, e),
1938
+ )),
1939
+ }
1940
+ }
1941
+ } else {
1942
+ Err(MagnusError::new(
1943
+ magnus::exception::type_error(),
1944
+ format!("Cannot convert {} to time_micros", unsafe {
1945
+ value.classname()
1946
+ }),
1947
+ ))
1948
+ }
1949
+ }
@@ -146,7 +146,10 @@ impl FromStr for ParquetSchemaType<'_> {
146
146
  }
147
147
 
148
148
  // Check if it's a decimal256 type with precision and scale
149
- if let Some(decimal_params) = s.strip_prefix("decimal256(").and_then(|s| s.strip_suffix(")")) {
149
+ if let Some(decimal_params) = s
150
+ .strip_prefix("decimal256(")
151
+ .and_then(|s| s.strip_suffix(")"))
152
+ {
150
153
  let parts: Vec<&str> = decimal_params.split(',').collect();
151
154
 
152
155
  // Handle both single parameter (precision only) and two parameters (precision and scale)
@@ -210,6 +213,8 @@ impl FromStr for ParquetSchemaType<'_> {
210
213
  "date32" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Date32)),
211
214
  "timestamp_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMillis)),
212
215
  "timestamp_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimestampMicros)),
216
+ "time_millis" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimeMillis)),
217
+ "time_micros" => Ok(ParquetSchemaType::Primitive(PrimitiveType::TimeMicros)),
213
218
  "decimal" => Ok(ParquetSchemaType::Primitive(PrimitiveType::Decimal128(
214
219
  38, 0,
215
220
  ))),
@@ -230,6 +230,16 @@ fn arrow_data_type_to_parquet_schema_type(dt: &DataType) -> Result<ParquetSchema
230
230
  "TimestampNanos not supported, please adjust your schema or code.",
231
231
  ))
232
232
  }
233
+ DataType::Time32(TimeUnit::Millisecond) => Ok(PST::Primitive(PrimitiveType::TimeMillis)),
234
+ DataType::Time64(TimeUnit::Microsecond) => Ok(PST::Primitive(PrimitiveType::TimeMicros)),
235
+ DataType::Time32(_) => Err(MagnusError::new(
236
+ magnus::exception::runtime_error(),
237
+ "Time32 only supports millisecond unit",
238
+ )),
239
+ DataType::Time64(_) => Err(MagnusError::new(
240
+ magnus::exception::runtime_error(),
241
+ "Time64 only supports microsecond unit",
242
+ )),
233
243
  DataType::Utf8 => Ok(PST::Primitive(PrimitiveType::String)),
234
244
  DataType::Binary => Ok(PST::Primitive(PrimitiveType::Binary)),
235
245
  DataType::LargeUtf8 => {
@@ -170,6 +170,12 @@ fn write_columns_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemEr
170
170
  PrimitiveType::TimestampMicros => {
171
171
  PST::Primitive(PrimitiveType::TimestampMicros)
172
172
  }
173
+ PrimitiveType::TimeMillis => {
174
+ PST::Primitive(PrimitiveType::TimeMillis)
175
+ }
176
+ PrimitiveType::TimeMicros => {
177
+ PST::Primitive(PrimitiveType::TimeMicros)
178
+ }
173
179
  PrimitiveType::Decimal256(precision, scale) => {
174
180
  PST::Primitive(PrimitiveType::Decimal256(precision, scale))
175
181
  }
@@ -259,9 +259,11 @@ pub fn estimate_value_size(
259
259
  PST::Primitive(PrimitiveType::Boolean) => Ok(1),
260
260
  PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
261
261
  PST::Primitive(PrimitiveType::Decimal256(_, _)) => Ok(32),
262
- PST::Primitive(PrimitiveType::Date32)
263
- | PST::Primitive(PrimitiveType::TimestampMillis)
264
- | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8),
262
+ PST::Primitive(PrimitiveType::Date32) => Ok(4), // Date32 is 4 bytes
263
+ PST::Primitive(PrimitiveType::TimestampMillis)
264
+ | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8), // Timestamps are 8 bytes
265
+ PST::Primitive(PrimitiveType::TimeMillis) => Ok(4), // TimeMillis is 4 bytes
266
+ PST::Primitive(PrimitiveType::TimeMicros) => Ok(8), // TimeMicros is 8 bytes
265
267
  PST::Primitive(PrimitiveType::String) | PST::Primitive(PrimitiveType::Binary) => {
266
268
  if let Ok(s) = String::try_convert(value) {
267
269
  // Account for string length plus Rust String's capacity+pointer overhead
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.5.11"
2
+ VERSION = "0.5.13"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.11
4
+ version: 0.5.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-06-18 00:00:00.000000000 Z
11
+ date: 2025-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys