parquet 0.6.1 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dd9295880f123b0ed979fb970d789bf63461007e8c087641b47391baa02bef29
-  data.tar.gz: e5ccd26a48e7b800412049e9d44e7f8c6fb5c12dbe25c74980467c822fb114c6
+  metadata.gz: de6b7f5c61eb1e796e7066790e2c1e0ada9ba4519140cee4e2378cd402db2586
+  data.tar.gz: 5b1dc2e442b1be17af82dd3a431b6f3a66254410229055cdbd8713aa1c009be2
 SHA512:
-  metadata.gz: f275d82733cd6b1658ed7b6212618830e2a12e6831bcfcc26beeae79691ddc972b22b0b0660f10db9b0bb7a03e2af29da15de28d037d5896891fbd678b3ecca9
-  data.tar.gz: 8fbfd33a92350c84409c0e879745fbc5a07d60f0a2862a2993a4780eb5c110be7b3918bfdfe6721e60e44635792108a1a3eeed460a189076bd03056a27764e50
+  metadata.gz: 7635247bc9627cdafe79ee9be1072c13b0f8ec11549506f9a8b6170d9b095883ede0f8a8165d0340572d89e1501c1d5f144c20f963ab960171dcb5813f15022c
+  data.tar.gz: abb59172a54c8d63ca39f24bdda4c64b98a60015622fc7f4a7a2a4c42ff03c3327f134de8ec66e006eb38a8cb38da90824a987b44f6e2fcc2af1c01bd4d85ee1
data/Cargo.lock CHANGED
@@ -225,6 +225,10 @@ dependencies = [
 name = "arrow-schema"
 version = "55.2.0"
 source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#54858bf019ff3faeb8f5b562da8c01012162aef0"
+dependencies = [
+ "serde",
+ "serde_json",
+]
 
 [[package]]
 name = "arrow-select"
@@ -1092,6 +1096,7 @@ dependencies = [
  "rb-sys-env 0.2.2",
  "tempfile",
  "thiserror",
+ "uuid",
 ]
 
 [[package]]
@@ -7,7 +7,7 @@ edition = "2021"
 arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
+arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["canonical_extension_types"] }
 bytes = "1.5"
 indexmap = "2.2"
 jiff = "0.2"
@@ -17,7 +17,7 @@ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24
 rand = "0.9.1"
 serde = { version = "1.0", features = ["derive"] }
 thiserror = "2.0"
+uuid = { version = "1.0", features = ["v4"] }
 
 [dev-dependencies]
-uuid = { version = "1.0", features = ["v4"] }
 tempfile = "3.8"
@@ -7,6 +7,7 @@
 
 use crate::{ParquetError, ParquetValue, Result};
 use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
+use arrow_schema::extension::Uuid as ArrowUuid;
 use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
@@ -14,7 +15,11 @@ use ordered_float::OrderedFloat;
 use std::sync::Arc;
 
 /// Convert a single value from an Arrow array at the given index to a ParquetValue
-pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
+pub fn arrow_to_parquet_value(
+    field: &Field,
+    array: &dyn Array,
+    index: usize,
+) -> Result<ParquetValue> {
     use arrow_array::*;
 
     if array.is_null(index) {
@@ -72,7 +77,6 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Float64Array>(array)?;
             Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
         }
-
         // String and binary types
         DataType::Utf8 => {
             let array = downcast_array::<StringArray>(array)?;
@@ -86,9 +90,15 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
-            Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
-                array.value(index),
-            )))
+            let value = array.value(index);
+            match field.try_extension_type::<ArrowUuid>() {
+                Ok(_) => {
+                    let uuid = uuid::Uuid::from_slice(value)
+                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                    Ok(ParquetValue::Uuid(uuid))
+                }
+                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
+            }
         }
 
         // Date and time types
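Note: the FixedSizeBinary branch above keys off arrow-schema's canonical Uuid extension type, which the Cargo.toml change enables via the `canonical_extension_types` feature. A minimal sketch (not part of the diff) of how that tagging and detection work, assuming an `arrow-schema` dependency built with that feature:

```rust
// Sketch: tagging a Field with the canonical Uuid extension type and
// detecting it the way the converter above does.
use arrow_schema::extension::Uuid as ArrowUuid;
use arrow_schema::{DataType, Field};

fn is_uuid_field(field: &Field) -> bool {
    // try_extension_type inspects the "ARROW:extension:name" metadata entry.
    field.try_extension_type::<ArrowUuid>().is_ok()
}

fn main() {
    let plain = Field::new("id", DataType::FixedSizeBinary(16), false);
    let tagged = plain.clone().with_extension_type(ArrowUuid);
    assert!(!is_uuid_field(&plain));
    assert!(is_uuid_field(&tagged));
}
```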
@@ -140,6 +150,10 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
                 let array = downcast_array::<Time64MicrosecondArray>(array)?;
                 Ok(ParquetValue::TimeMicros(array.value(index)))
             }
+            arrow_schema::TimeUnit::Nanosecond => {
+                let array = downcast_array::<Time64NanosecondArray>(array)?;
+                Ok(ParquetValue::TimeNanos(array.value(index)))
+            }
             _ => Err(ParquetError::Conversion(format!(
                 "Unsupported time64 unit: {:?}",
                 unit
@@ -173,13 +187,13 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
 
         // Complex types
-        DataType::List(_) => {
+        DataType::List(item_field) => {
             let array = downcast_array::<ListArray>(array)?;
             let list_values = array.value(index);
 
             let mut values = Vec::with_capacity(list_values.len());
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(&list_values, i)?);
+                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
             }
 
             Ok(ParquetValue::List(values))
@@ -192,10 +206,22 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let keys = map_value.column(0);
             let values = map_value.column(1);
 
+            let key_field = map_value
+                .fields()
+                .iter()
+                .find(|f| f.name() == "key")
+                .ok_or_else(|| ParquetError::Conversion("No key field found".to_string()))?;
+
+            let value_field = map_value
+                .fields()
+                .iter()
+                .find(|f| f.name() == "value")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
             let mut map_vec = Vec::with_capacity(keys.len());
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(keys, i)?;
-                let value = arrow_to_parquet_value(values, i)?;
+                let key = arrow_to_parquet_value(key_field, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, values, i)?;
                 map_vec.push((key, value));
             }
 
@@ -207,7 +233,7 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let mut map = IndexMap::new();
             for (col_idx, field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(column, index)?;
+                let value = arrow_to_parquet_value(field, column, index)?;
                 map.insert(Arc::from(field.name().as_str()), value);
             }
 
@@ -1108,7 +1134,7 @@ mod tests {
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
 
         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
             assert_eq!(&actual, expected);
         }
     }
@@ -163,8 +163,10 @@ where
         // Extract values from current row
         let mut row_values = Vec::with_capacity(batch.num_columns());
 
-        for column in batch.columns() {
-            let value = match arrow_to_parquet_value(column, self.current_row) {
+        let schema = batch.schema();
+        for (i, column) in batch.columns().iter().enumerate() {
+            let field = schema.field(i);
+            let value = match arrow_to_parquet_value(field, column, self.current_row) {
                 Ok(v) => v,
                 Err(e) => return Some(Err(e)),
             };
@@ -228,12 +230,13 @@ where
         let mut columns = Vec::with_capacity(batch.num_columns());
 
         for (idx, column) in batch.columns().iter().enumerate() {
-            let column_name = self.schema.field(idx).name().to_string();
+            let field = self.schema.field(idx);
+            let column_name = field.name().to_string();
 
             // Convert entire column to ParquetValues
             let mut values = Vec::with_capacity(column.len());
             for row_idx in 0..column.len() {
-                match arrow_to_parquet_value(column, row_idx) {
+                match arrow_to_parquet_value(field, column, row_idx) {
                     Ok(value) => values.push(value),
                     Err(e) => return Some(Err(e)),
                 }
@@ -72,6 +72,7 @@ pub enum PrimitiveType {
     TimestampNanos(Option<Arc<str>>),
     TimeMillis,
     TimeMicros,
+    TimeNanos,
 
     // Fixed-length byte array
     FixedLenByteArray(i32),
@@ -146,6 +147,7 @@ impl PrimitiveType {
             PrimitiveType::TimestampNanos(_) => "TimestampNanos",
             PrimitiveType::TimeMillis => "TimeMillis",
             PrimitiveType::TimeMicros => "TimeMicros",
+            PrimitiveType::TimeNanos => "TimeNanos",
             PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
         }
     }
@@ -2,6 +2,7 @@ use bytes::Bytes;
 use indexmap::IndexMap;
 use num::BigInt;
 use std::sync::Arc;
+use uuid::Uuid;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParquetValue {
@@ -22,6 +23,7 @@ pub enum ParquetValue {
     Boolean(bool),
     String(Arc<str>),
     Bytes(Bytes),
+    Uuid(Uuid),
 
     // Date/Time types
     Date32(i32), // Days since epoch
@@ -40,6 +42,7 @@ pub enum ParquetValue {
     // Time types
     TimeMillis(i32), // Time of day in milliseconds since midnight
     TimeMicros(i64), // Time of day in microseconds since midnight
+    TimeNanos(i64),  // Time of day in nanoseconds since midnight
 
     // Complex types
     List(Vec<ParquetValue>),
@@ -68,6 +71,7 @@ impl std::hash::Hash for ParquetValue {
             ParquetValue::Boolean(b) => b.hash(state),
             ParquetValue::String(s) => s.hash(state),
             ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Uuid(u) => u.hash(state),
             ParquetValue::Date32(d) => d.hash(state),
             ParquetValue::Date64(d) => d.hash(state),
             ParquetValue::Decimal128(d, scale) => {
@@ -96,6 +100,7 @@ impl std::hash::Hash for ParquetValue {
             }
             ParquetValue::TimeMillis(t) => t.hash(state),
             ParquetValue::TimeMicros(t) => t.hash(state),
+            ParquetValue::TimeNanos(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => m.hash(state),
             ParquetValue::Record(r) => {
@@ -133,6 +138,7 @@ impl ParquetValue {
             ParquetValue::Boolean(_) => "Boolean",
             ParquetValue::String(_) => "String",
             ParquetValue::Bytes(_) => "Bytes",
+            ParquetValue::Uuid(_) => "Uuid",
             ParquetValue::Date32(_) => "Date32",
             ParquetValue::Date64(_) => "Date64",
             ParquetValue::Decimal128(_, _) => "Decimal128",
@@ -143,6 +149,7 @@ impl ParquetValue {
             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
             ParquetValue::TimeMillis(_) => "TimeMillis",
             ParquetValue::TimeMicros(_) => "TimeMicros",
+            ParquetValue::TimeNanos(_) => "TimeNanos",
             ParquetValue::List(_) => "List",
             ParquetValue::Map(_) => "Map",
             ParquetValue::Record(_) => "Record",
@@ -235,6 +235,7 @@ where
             (Date64(_), DataType::Date64) => 8,
             (TimeMillis(_), DataType::Time32(_)) => 4,
             (TimeMicros(_), DataType::Time64(_)) => 8,
+            (TimeNanos(_), DataType::Time64(_)) => 8,
             (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
             (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
             (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
@@ -364,7 +365,9 @@ where
             writer.write(&batch)?;
 
             // Check if we need to flush based on memory usage
-            if writer.in_progress_size() >= self.memory_threshold {
+            if writer.in_progress_size() >= self.memory_threshold
+                || writer.memory_size() >= self.memory_threshold
+            {
                 writer.flush()?;
             }
         } else {
@@ -496,6 +499,7 @@ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str)
         (Date64(_), DataType::Date64) => Ok(()),
         (TimeMillis(_), DataType::Time32(_)) => Ok(()),
         (TimeMicros(_), DataType::Time64(_)) => Ok(()),
+        (TimeNanos(_), DataType::Time64(_)) => Ok(()),
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
@@ -591,10 +595,16 @@ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
             name,
             primitive_type,
             nullable,
-            ..
+            format,
         } => {
             let data_type = primitive_type_to_arrow(primitive_type)?;
-            Ok(Field::new(name, data_type, *nullable))
+            let field = Field::new(name, data_type, *nullable);
+            let extended_field = if format.as_deref() == Some("uuid") {
+                field.with_extension_type(arrow_schema::extension::Uuid)
+            } else {
+                field
+            };
+            Ok(extended_field)
         }
         SchemaNode::List {
             name,
@@ -671,6 +681,7 @@ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
         Date32 => DataType::Date32,
         TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
         TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
+        TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
         TimestampMillis(tz) => DataType::Timestamp(
             arrow_schema::TimeUnit::Millisecond,
             // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
@@ -99,7 +99,7 @@ fn test_decimal256_large_values() {
 
     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +173,7 @@ fn test_timestamp_with_timezone() {
 
     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +209,7 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);
 
     // Verify roundtrip
-    let value = arrow_to_parquet_value(array.as_ref(), 0).unwrap();
+    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +357,7 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();
 
-    let result = arrow_to_parquet_value(&array, 0);
+    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
@@ -20,3 +20,4 @@ rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
 tempfile = "^3.15"
 thiserror = "2.0"
 indexmap = "2.2"
+uuid = "*"
@@ -4,13 +4,14 @@ use indexmap::IndexMap;
 use magnus::r_hash::ForEach;
 use magnus::value::ReprValue;
 use magnus::{
-    Error as MagnusError, IntoValue, Module, RArray, RHash, RString, Ruby, Symbol, TryConvert,
-    Value,
+    kwargs, Error as MagnusError, IntoValue, Module, RArray, RHash, RString, Ruby, Symbol,
+    TryConvert, Value,
 };
 use ordered_float::OrderedFloat;
 use parquet_core::{ParquetError, ParquetValue, Result};
 use std::cell::RefCell;
 use std::sync::Arc;
+use uuid::Uuid;
 
 /// Ruby value converter
 ///
@@ -41,27 +42,6 @@ impl RubyValueConverter {
             .map(|cache| cache.stats())
     }
 
-    /// Convert a Ruby value to ParquetValue with type hint
-    /// This is the primary conversion method that handles all Ruby types
-    pub fn to_parquet_with_type_hint(
-        &mut self,
-        value: Value,
-        type_hint: Option<&parquet_core::PrimitiveType>,
-    ) -> Result<ParquetValue> {
-        // Handle nil values
-        if value.is_nil() {
-            return Ok(ParquetValue::Null);
-        }
-
-        // If we have a type hint, use it to guide conversion
-        if let Some(hint) = type_hint {
-            return self.convert_with_type_hint(value, hint);
-        }
-
-        // Otherwise, infer type from Ruby value
-        self.infer_and_convert(value)
-    }
-
     /// Convert a Ruby value to ParquetValue with schema hint
     /// This handles both primitive and complex types
     pub fn to_parquet_with_schema_hint(
@@ -115,7 +95,7 @@ impl RubyValueConverter {
         use parquet_core::PrimitiveType::*;
 
         // Special handling for UUID format
-        if let (Binary, Some("uuid")) = (type_hint, format) {
+        if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
             return self.convert_to_uuid_binary(value);
         }
 
@@ -156,6 +136,7 @@ impl RubyValueConverter {
             Date64 => self.convert_to_date64(value, None),
             TimeMillis => self.convert_to_time_millis(value),
             TimeMicros => self.convert_to_time_micros(value),
+            TimeNanos => self.convert_to_time_nanos(value),
             TimestampSecond(schema_tz) => {
                 self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
             }
@@ -484,32 +465,19 @@ impl RubyValueConverter {
 
         // Convert value to string
         let uuid_str: String = value
-            .funcall("to_s", ())
-            .and_then(TryConvert::try_convert)
+            .to_r_string()
+            .map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
+            })?
+            .to_string()
             .map_err(|e: MagnusError| {
                 ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
             })?;
 
-        // Remove hyphens and validate length
-        let clean_uuid = uuid_str.replace('-', "");
-        if clean_uuid.len() != 32 {
-            return Err(ParquetError::Conversion(format!(
-                "Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
-                clean_uuid.len()
-            )));
-        }
-
-        // Parse hex string to bytes
-        let mut bytes = Vec::with_capacity(16);
-        for i in 0..16 {
-            let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
-            let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
-                ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
-            })?;
-            bytes.push(byte);
-        }
-
-        Ok(ParquetValue::Bytes(bytes.into()))
+        let parsed = uuid::Uuid::parse_str(&uuid_str)
+            .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
+        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
+        Ok(ParquetValue::Bytes(bytes))
     }
 
     fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
@@ -692,6 +660,38 @@ impl RubyValueConverter {
         )))
     }
 
+    fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
+        // Convert to nanoseconds since midnight
+        let ruby = Ruby::get()
+            .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
+        if value.is_kind_of(ruby.class_time()) {
+            let hour: i64 = value
+                .funcall("hour", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let min: i64 = value
+                .funcall("min", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let sec: i64 = value
+                .funcall("sec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let nsec: i64 = value
+                .funcall("nsec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+
+            let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
+            return Ok(ParquetValue::TimeNanos(nanos));
+        }
+
+        Err(ParquetError::Conversion(format!(
+            "Cannot convert {} to time_nanos",
+            value.class()
+        )))
+    }
+
     // Timestamp conversion methods that respect schema timezone
     fn convert_to_timestamp_second_with_tz(
         &self,
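Note: the new conversion reduces a Ruby `Time` to nanoseconds since midnight with plain integer arithmetic. A quick self-contained check of that arithmetic (pure Rust, no Ruby runtime involved):

```rust
// Sketch: the nanoseconds-since-midnight formula used by convert_to_time_nanos.
fn time_of_day_nanos(hour: i64, min: i64, sec: i64, nsec: i64) -> i64 {
    (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec
}

fn main() {
    // 13:45:30.000000123 is 49_530 whole seconds plus 123 ns.
    assert_eq!(time_of_day_nanos(13, 45, 30, 123), 49_530_000_000_123);
    // A full day (~8.64e13 ns) fits comfortably in i64.
    assert!(time_of_day_nanos(23, 59, 59, 999_999_999) < i64::MAX);
}
```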
@@ -1399,21 +1399,11 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
         ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
         ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
         ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
-        ParquetValue::Bytes(b) => {
-            // Check if this is a UUID (16 bytes)
-            if b.len() == 16 {
-                // Format as UUID string
-                let uuid_str = format!(
-                    "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
-                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
-                    b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
-                );
-                Ok(uuid_str.into_value_with(&ruby))
-            } else {
-                // Regular bytes - convert to string
-                Ok(ruby.str_from_slice(&b).as_value())
-            }
-        }
+        ParquetValue::Uuid(u) => Ok(u
+            .hyphenated()
+            .encode_lower(&mut Uuid::encode_buffer())
+            .into_value_with(&ruby)),
+        ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
         ParquetValue::Date32(days) => {
             // Convert days since epoch to Date object
             let _ = ruby.require("date");
@@ -1503,10 +1493,26 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
                 .funcall("utc", (year, month, day, hours, minutes, seconds, us))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))
         }
+        ParquetValue::TimeNanos(nanos) => {
+            let time_class = ruby.class_time();
+            let secs = nanos / 1_000_000_000;
+            let nsec = nanos % 1_000_000_000;
+            time_class
+                .funcall(
+                    "at",
+                    (
+                        secs,
+                        nsec,
+                        Symbol::new("nanosecond"),
+                        kwargs!("in" => "UTC"),
+                    ),
+                )
+                .map_err(|e| ParquetError::Conversion(e.to_string()))
+        }
         ParquetValue::TimestampSecond(secs, tz) => {
             let time_class = ruby.class_time();
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs,))
+                .funcall::<_, _, Value>("at", (secs, kwargs!("in" => "UTC")))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -1515,7 +1521,7 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             let secs = millis / 1000;
             let usec = (millis % 1000) * 1000; // Convert millisecond remainder to microseconds
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs, usec))
+                .funcall::<_, _, Value>("at", (secs, usec, kwargs!("in" => "UTC")))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -1524,7 +1530,7 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             let secs = micros / 1_000_000;
             let usec = micros % 1_000_000; // Already in microseconds
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs, usec))
+                .funcall::<_, _, Value>("at", (secs, usec, kwargs!("in" => "UTC")))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -1534,7 +1540,15 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             let nsec = nanos % 1_000_000_000;
             // Use the nanosecond form of Time.at
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs, nsec, Symbol::new("nanosecond")))
+                .funcall::<_, _, Value>(
+                    "at",
+                    (
+                        secs,
+                        nsec,
+                        Symbol::new("nanosecond"),
+                        kwargs!("in" => "UTC"),
+                    ),
+                )
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -115,9 +115,111 @@ impl TryIntoValue for RubyParquetMetaData {
             .map_err(|e| {
                 RubyAdapterError::metadata(format!("Failed to set converted_type: {}", e))
             })?;
+
         if let Some(logical_type) = basic_info.logical_type() {
+            let logical_type_value = match logical_type {
+                parquet::basic::LogicalType::Decimal { scale, precision } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Decimal").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash.aset("scale", scale).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set scale: {}", e))
+                    })?;
+                    logical_hash.aset("precision", precision).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set precision: {}", e))
+                    })?;
+                    logical_hash.as_value()
+                }
+                parquet::basic::LogicalType::Time {
+                    is_adjusted_to_u_t_c,
+                    unit,
+                } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Time").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash
+                        .aset(
+                            "is_adjusted_to_utc",
+                            is_adjusted_to_u_t_c.to_string().as_str(),
+                        )
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!(
+                                "Failed to set is_adjusted_to_u_t_c: {}",
+                                e
+                            ))
+                        })?;
+
+                    let unit_str = match unit {
+                        parquet::basic::TimeUnit::MILLIS(_) => "millis",
+                        parquet::basic::TimeUnit::MICROS(_) => "micros",
+                        parquet::basic::TimeUnit::NANOS(_) => "nanos",
+                    };
+                    logical_hash.aset("unit", unit_str).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set unit: {}", e))
+                    })?;
+                    logical_hash.as_value()
+                }
+                parquet::basic::LogicalType::Timestamp {
+                    is_adjusted_to_u_t_c,
+                    unit,
+                } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Timestamp").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash
+                        .aset("is_adjusted_to_utc", is_adjusted_to_u_t_c)
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!(
+                                "Failed to set is_adjusted_to_u_t_c: {}",
+                                e
+                            ))
+                        })?;
+                    let unit_str = match unit {
+                        parquet::basic::TimeUnit::MILLIS(_) => "millis",
+                        parquet::basic::TimeUnit::MICROS(_) => "micros",
+                        parquet::basic::TimeUnit::NANOS(_) => "nanos",
+                    };
+                    logical_hash.aset("unit", unit_str).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set unit: {}", e))
+                    })?;
+                    logical_hash.as_value()
+                }
+                parquet::basic::LogicalType::Integer {
+                    bit_width,
+                    is_signed,
+                } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Integer").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash.aset("bit_width", bit_width).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set bit_width: {}", e))
+                    })?;
+                    logical_hash
+                        .aset("is_signed", is_signed.to_string().as_str())
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!(
+                                "Failed to set is_signed: {}",
+                                e
+                            ))
+                        })?;
+                    logical_hash.as_value()
+                }
+                _ => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash
+                        .aset("type", format!("{:?}", logical_type))
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                        })?;
+                    logical_hash.as_value()
+                }
+            };
             field_hash
-                .aset("logical_type", format!("{:?}", logical_type))
+                .aset("logical_type", logical_type_value)
                 .map_err(|e| {
                     RubyAdapterError::metadata(format!("Failed to set logical_type: {}", e))
                 })?;
@@ -2,6 +2,7 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
 use parquet_core::reader::Reader;
 
+use crate::StringCache;
 use crate::{
     converter::parquet_to_ruby,
     io::{RubyIOReader, ThreadSafeRubyIOReader},
@@ -101,6 +102,12 @@ pub fn each_row(
     })?;
     let mut row_count = 0u64;
 
+    let mut cache = StringCache::new(true);
+    let interned_column_names = column_names
+        .iter()
+        .map(|name| cache.intern(name.clone()))
+        .collect::<Vec<_>>();
+
     for row_result in row_iter {
         let row = row_result
             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
@@ -120,11 +127,11 @@ pub fn each_row(
             ParserResultType::Hash => {
                 let hash: RHash = ruby.hash_new();
                 for (idx, value) in row.into_iter().enumerate() {
-                    if idx < column_names.len() {
+                    if idx < interned_column_names.len() {
                         let ruby_value = parquet_to_ruby(value).map_err(|e| {
                             MagnusError::new(ruby.exception_runtime_error(), e.to_string())
                         })?;
-                        hash.aset(column_names[idx].as_str(), ruby_value)?;
+                        hash.aset(interned_column_names[idx].as_ref(), ruby_value)?;
                     }
                 }
                 hash.as_value()
@@ -1,8 +1,9 @@
 use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
-use parquet_core::{ParquetError, PrimitiveType, Result, Schema, SchemaNode};
+use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
 
 use crate::utils::parse_string_or_symbol;
+use crate::RubyAdapterError;
 
 /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
 pub struct RubySchemaBuilder;
@@ -13,18 +14,18 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a Ruby schema definition (hash) into a SchemaNode
-    fn parse_schema_node(&self, name: String, schema_def: Value) -> Result<SchemaNode> {
+    fn parse_schema_node(
+        &self,
+        name: String,
+        schema_def: Value,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // If it's a Hash, parse it as a complex type
         if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
             return self.parse_hash_schema_node(name, hash);
         }
 
         // Otherwise, try to parse as a simple type symbol
-        if let Ok(type_sym) = <Symbol as TryConvert>::try_convert(schema_def) {
-            let type_str = type_sym.name().map_err(|e: MagnusError| {
-                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
-            })?;
-
+        if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
             // Check if it's a complex type with angle brackets
             if type_str.contains('<') {
                 return self.parse_complex_type_string(name, type_str.to_string(), true);
@@ -40,22 +41,24 @@ impl RubySchemaBuilder {
             });
         }
 
-        Err(ParquetError::Schema(format!(
+        Err(RubyAdapterError::InvalidInput(format!(
             "Expected Hash or Symbol for schema definition, got {}",
             schema_def.class()
         )))
     }
 
     /// Parse a Ruby hash schema node
-    fn parse_hash_schema_node(&self, name: String, hash: RHash) -> Result<SchemaNode> {
+    fn parse_hash_schema_node(
+        &self,
+        name: String,
+        hash: RHash,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // Get the type field
-        let type_sym: Symbol = hash
-            .fetch::<_, Symbol>(Symbol::new("type"))
+        let type_sym: Value = hash
+            .fetch::<_, Value>(Symbol::new("type"))
             .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
 
-        let type_str = type_sym.name().map_err(|e: MagnusError| {
-            ParquetError::Conversion(format!("Failed to get type name: {}", e))
-        })?;
+        let type_str = type_sym.to_r_string()?.to_string()?;
 
         // Get nullable field (default to true)
         let nullable = hash
142
145
 
143
146
  // Primitive types
144
147
  primitive_type => {
148
+ if format.as_deref() == Some("uuid") {
149
+ return Ok(SchemaNode::Primitive {
150
+ name,
151
+ primitive_type: PrimitiveType::FixedLenByteArray(16),
152
+ nullable,
153
+ format,
154
+ });
155
+ }
156
+
145
157
  // Get precision and scale for decimal types
146
158
  let precision = hash
147
159
  .fetch::<_, Value>(Symbol::new("precision"))
@@ -196,7 +208,7 @@ impl RubySchemaBuilder {
196
208
  name: String,
197
209
  type_str: String,
198
210
  nullable: bool,
199
- ) -> Result<SchemaNode> {
211
+ ) -> Result<SchemaNode, RubyAdapterError> {
200
212
  if type_str.starts_with("list<") && type_str.ends_with('>') {
201
213
  let inner_type = &type_str[5..type_str.len() - 1];
202
214
  let item_name = format!("{}_item", name);
@@ -229,7 +241,7 @@ impl RubySchemaBuilder {
229
241
  let inner = &type_str[4..type_str.len() - 1];
230
242
  let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
231
243
  if parts.len() != 2 {
232
- return Err(ParquetError::Schema(format!(
244
+ return Err(RubyAdapterError::InvalidInput(format!(
233
245
  "Invalid map type: {}",
234
246
  type_str
235
247
  )));
@@ -255,7 +267,7 @@ impl RubySchemaBuilder {
255
267
  }),
256
268
  })
257
269
  } else {
258
- Err(ParquetError::Schema(format!(
270
+ Err(RubyAdapterError::InvalidInput(format!(
259
271
  "Unknown complex type: {}",
260
272
  type_str
261
273
  )))
@@ -263,7 +275,7 @@ impl RubySchemaBuilder {
263
275
  }
264
276
 
265
277
  /// Parse a field definition from a Ruby hash
266
- fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
278
+ fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
267
279
  let name: String = field_hash
268
280
  .fetch(Symbol::new("name"))
269
281
  .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
@@ -274,7 +286,7 @@ impl RubySchemaBuilder {
274
286
  self.parse_schema_node(name, field_hash.as_value())
275
287
  } else {
276
288
  // This might be a simplified definition - look for known field patterns
277
- Err(ParquetError::Schema(format!(
289
+ Err(RubyAdapterError::InvalidInput(format!(
278
290
  "Field '{}' missing 'type' definition",
279
291
  name
280
292
  )))
@@ -288,7 +300,7 @@ impl RubySchemaBuilder {
288
300
  precision: Option<u8>,
289
301
  scale: Option<i8>,
290
302
  timezone: Option<String>,
291
- ) -> Result<PrimitiveType> {
303
+ ) -> Result<PrimitiveType, RubyAdapterError> {
292
304
  // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
293
305
  if type_str.starts_with("decimal(") && type_str.ends_with(')') {
294
306
  let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
@@ -324,6 +336,14 @@ impl RubySchemaBuilder {
             }
         }
 
+        if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
+            // "fixed_len_byte_array(" is 21 characters, so the length starts at index 21
+            let params = &type_str[21..type_str.len() - 1];
+            let len = params.parse::<i32>().map_err(|_| {
+                ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
+            })?;
+            return Ok(PrimitiveType::FixedLenByteArray(len));
+        }
+
         match type_str.as_str() {
             "boolean" | "bool" => Ok(PrimitiveType::Boolean),
             "int8" => Ok(PrimitiveType::Int8),
@@ -356,8 +376,9 @@ impl RubySchemaBuilder {
             // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
             Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
         }
-        "time32" | "time_millis" => Ok(PrimitiveType::TimeMillis),
-        "time64" | "time_micros" => Ok(PrimitiveType::TimeMicros),
+        "time_millis" => Ok(PrimitiveType::TimeMillis),
+        "time_micros" => Ok(PrimitiveType::TimeMicros),
+        "time_nanos" => Ok(PrimitiveType::TimeNanos),
         "decimal" => {
             // Use provided precision/scale or defaults
             let p = precision.unwrap_or(38);
@@ -380,7 +401,7 @@ impl RubySchemaBuilder {
             let s = scale.unwrap_or(0);
             Ok(PrimitiveType::Decimal256(p, s))
         }
-        _ => Err(ParquetError::Schema(format!(
+        _ => Err(RubyAdapterError::InvalidInput(format!(
             "Unknown primitive type: {}",
             type_str
         ))),
@@ -396,7 +417,7 @@ impl Default for RubySchemaBuilder {
 
 /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
 /// and Ruby Value is not Send/Sync
-pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
     let builder = RubySchemaBuilder::new();
 
     // The Ruby schema should be a hash with a root struct
@@ -430,7 +451,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     let mut unique_names = std::collections::HashSet::new();
     for name in &field_names {
         if !unique_names.insert(name) {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Duplicate field names in root level schema: {:?}",
                 field_names
             )));
@@ -443,7 +464,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
             fields: field_nodes,
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema must have 'type' or 'fields' key".to_string(),
         ));
     };
@@ -452,18 +473,18 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     parquet_core::SchemaBuilder::new()
         .with_root(root_node)
         .build()
-        .map_err(|e| ParquetError::Schema(e.to_string()))
+        .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
 }
 
 /// Convert a Parquet schema back to Ruby representation
-pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
     let ruby = Ruby::get()
         .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
 
     schema_node_to_ruby(&schema.root, &ruby)
 }
 
-fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value, RubyAdapterError> {
     let hash = RHash::new();
 
     match node {
@@ -552,6 +573,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
             PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
             PrimitiveType::TimeMillis => Symbol::new("time_millis"),
             PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+            PrimitiveType::TimeNanos => Symbol::new("time_nanos"),
             PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
             PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
             PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
@@ -597,7 +619,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
 /// Convert old schema format to new format
 /// Old: [{ "column_name" => "type" }, ...]
 /// New: [{ name: "column_name", type: :type }, ...]
-pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
+pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
     let new_schema = RArray::new();
 
     for item in schema.into_iter() {
@@ -630,7 +652,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
     );
 
     if let Err(e) = process_result {
-        return Err(ParquetError::Schema(format!(
+        return Err(RubyAdapterError::InvalidInput(format!(
             "Failed to process field: {}",
             e
         )));
@@ -645,7 +667,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
 }
 
 /// Check if schema is in new DSL format (hash with type: :struct)
-pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
     if !schema_value.is_kind_of(ruby.class_hash()) {
         return Ok(false);
     }
@@ -678,7 +700,7 @@ pub fn process_schema_value(
     ruby: &Ruby,
     schema_value: Value,
     data_array: Option<&RArray>,
-) -> Result<Value> {
+) -> Result<Value, RubyAdapterError> {
     // Check if it's the new DSL format
     if is_dsl_schema(ruby, schema_value)? {
         // For DSL format, pass it directly to ruby_schema_to_parquet
@@ -716,7 +738,7 @@ pub fn process_schema_value(
             convert_legacy_schema(ruby, array)?
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema array must contain hashes".to_string(),
         ));
     }
@@ -733,13 +755,13 @@ pub fn process_schema_value(
             ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
         })?
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema hash must have 'fields' key or be in DSL format with 'type' key"
                 .to_string(),
         ));
     }
 } else {
-    return Err(ParquetError::Schema(
+    return Err(RubyAdapterError::InvalidInput(
         "schema must be nil, an array, or a hash".to_string(),
     ));
 };
@@ -748,7 +770,7 @@ pub fn process_schema_value(
     if schema_array.is_empty() {
         if let Some(data) = data_array {
             if data.is_empty() {
-                return Err(ParquetError::Schema(
+                return Err(RubyAdapterError::InvalidInput(
                     "Cannot infer schema from empty data".to_string(),
                 ));
             }
@@ -767,7 +789,7 @@ pub fn process_schema_value(
                 })?;
                 first_array.len()
             } else {
-                return Err(ParquetError::Schema(
+                return Err(RubyAdapterError::InvalidInput(
                     "First data item must be an array".to_string(),
                 ));
             };
@@ -793,7 +815,7 @@ pub fn process_schema_value(
 
             schema_array = new_schema;
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "Schema is required when data is not provided for inference".to_string(),
             ));
         }
@@ -1,15 +1,15 @@
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, LazyLock, Mutex};
 
 use magnus::RString;
 
+static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
 /// A cache for interning strings in the Ruby VM to reduce memory usage
 /// when there are many repeated strings
 #[derive(Debug)]
 pub struct StringCache {
-    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
-    /// while maintaining a single global cache
-    cache: Arc<Mutex<HashMap<String, &'static str>>>,
     enabled: bool,
     hits: Arc<Mutex<usize>>,
     misses: Arc<Mutex<usize>>,
@@ -19,7 +19,6 @@ impl StringCache {
     /// Create a new string cache
     pub fn new(enabled: bool) -> Self {
         Self {
-            cache: Arc::new(Mutex::new(HashMap::new())),
             enabled,
             hits: Arc::new(Mutex::new(0)),
             misses: Arc::new(Mutex::new(0)),
@@ -36,9 +35,9 @@ impl StringCache {
 
         // Try to get or create the interned string
         let result = (|| -> Result<(), String> {
-            let mut cache = self.cache.lock().map_err(|e| e.to_string())?;
+            let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
 
-            if cache.contains_key(&s) {
+            if cache.contains_key(s.as_str()) {
                 let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                 *hits += 1;
             } else {
@@ -65,7 +64,7 @@ impl StringCache {
 
     /// Get cache statistics
     pub fn stats(&self) -> CacheStats {
-        let cache_size = self.cache.lock().map(|c| c.len()).unwrap_or(0);
+        let cache_size = STRING_CACHE.lock().map(|c| c.len()).unwrap_or(0);
         let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
         let misses = self.misses.lock().map(|m| *m).unwrap_or(0);
 
@@ -84,7 +83,7 @@ impl StringCache {
 
     /// Clear the cache
     pub fn clear(&mut self) {
-        if let Ok(mut cache) = self.cache.lock() {
+        if let Ok(mut cache) = STRING_CACHE.lock() {
             cache.clear();
         }
         if let Ok(mut hits) = self.hits.lock() {
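Note: `StringCache` handles now share a single process-wide map behind a `LazyLock` instead of each holding their own `Arc<Mutex<HashMap>>`. A minimal sketch of the interning pattern; the `Box::leak` step is an assumption about how the `&'static str` values are produced, and leaking is deliberate since interned names live for the life of the process:

```rust
// Sketch: process-wide string interning with a LazyLock-guarded map.
use std::collections::HashMap;
use std::sync::{LazyLock, Mutex};

static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

fn intern(s: String) -> &'static str {
    let mut cache = STRING_CACHE.lock().expect("cache poisoned");
    if let Some(&interned) = cache.get(s.as_str()) {
        return interned;
    }
    // Leak one allocation per distinct string; repeats reuse it.
    let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
    cache.insert(s, leaked);
    leaked
}

fn main() {
    let a = intern("column_a".to_string());
    let b = intern("column_a".to_string());
    // Repeated strings return the same allocation.
    assert!(std::ptr::eq(a, b));
}
```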
@@ -26,6 +26,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
     ParquetValue::Float64(_) => 8,
     ParquetValue::String(s) => s.len() + 24, // String overhead
     ParquetValue::Bytes(b) => b.len() + 24,  // Vec overhead
+    ParquetValue::Uuid(_) => 16,
     ParquetValue::Date32(_) => 4,
     ParquetValue::Date64(_) => 8,
     ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
@@ -36,6 +37,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
     ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
     ParquetValue::TimeMillis(_) => 4,
     ParquetValue::TimeMicros(_) => 8,
+    ParquetValue::TimeNanos(_) => 8,
     ParquetValue::List(items) => {
         24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
     }
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.6.1"
+  VERSION = "0.7.0"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.7.0
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-03 00:00:00.000000000 Z
+date: 2025-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys