parquet 0.6.1 → 0.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: dd9295880f123b0ed979fb970d789bf63461007e8c087641b47391baa02bef29
-  data.tar.gz: e5ccd26a48e7b800412049e9d44e7f8c6fb5c12dbe25c74980467c822fb114c6
+  metadata.gz: dfd19103b2414e7feeaa6d1ec3c9a9c25ce42cf5c8362baa37e3b9d8d5245f82
+  data.tar.gz: c5c1170dbdc3635577738a568688c36adc9670710f4b0d570fae29294e337754
 SHA512:
-  metadata.gz: f275d82733cd6b1658ed7b6212618830e2a12e6831bcfcc26beeae79691ddc972b22b0b0660f10db9b0bb7a03e2af29da15de28d037d5896891fbd678b3ecca9
-  data.tar.gz: 8fbfd33a92350c84409c0e879745fbc5a07d60f0a2862a2993a4780eb5c110be7b3918bfdfe6721e60e44635792108a1a3eeed460a189076bd03056a27764e50
+  metadata.gz: c9bf72b4e708c750ab7ae30afd97aef7f456a4249904fe3eb74f916557e28ca1a53bc262a6492db38c38162ab3e3f684e30f0c70dabbaf8f8f4145ef4d9af259
+  data.tar.gz: 164c5b0569d3d13242bcff7c09d66edf67b279d8289f97def043000a508b4333dd1387a4f47517be09023cd270cf3b6dfd57fdf658ac1b52e25f3f5b2b5ca30c
data/Cargo.lock CHANGED
@@ -225,6 +225,10 @@ dependencies = [
 name = "arrow-schema"
 version = "55.2.0"
 source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#54858bf019ff3faeb8f5b562da8c01012162aef0"
+dependencies = [
+ "serde",
+ "serde_json",
+]
 
 [[package]]
 name = "arrow-select"
@@ -1092,6 +1096,7 @@ dependencies = [
  "rb-sys-env 0.2.2",
  "tempfile",
  "thiserror",
+ "uuid",
 ]
 
 [[package]]
@@ -7,7 +7,7 @@ edition = "2021"
 arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
+arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["canonical_extension_types"]}
 bytes = "1.5"
 indexmap = "2.2"
 jiff = "0.2"
@@ -17,7 +17,7 @@ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24
 rand = "0.9.1"
 serde = { version = "1.0", features = ["derive"] }
 thiserror = "2.0"
+uuid = { version = "1.0", features = ["v4"] }
 
 [dev-dependencies]
-uuid = { version = "1.0", features = ["v4"] }
 tempfile = "3.8"
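
Note: the new `canonical_extension_types` feature on arrow-schema is what makes `arrow_schema::extension::Uuid` available to the code changes below. A minimal sketch of tagging a field with it (not part of the diff; the field name is illustrative):

    use arrow_schema::{extension::Uuid, DataType, Field};

    // A UUID column is physically a 16-byte fixed-size binary; the
    // extension type records "arrow.uuid" in the field metadata.
    fn uuid_field(name: &str, nullable: bool) -> Field {
        Field::new(name, DataType::FixedSizeBinary(16), nullable).with_extension_type(Uuid)
    }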
@@ -7,6 +7,7 @@
 
 use crate::{ParquetError, ParquetValue, Result};
 use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
+use arrow_schema::extension::Uuid as ArrowUuid;
 use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
@@ -14,7 +15,11 @@ use ordered_float::OrderedFloat;
 use std::sync::Arc;
 
 /// Convert a single value from an Arrow array at the given index to a ParquetValue
-pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
+pub fn arrow_to_parquet_value(
+    field: &Field,
+    array: &dyn Array,
+    index: usize,
+) -> Result<ParquetValue> {
     use arrow_array::*;
 
     if array.is_null(index) {
@@ -72,7 +77,6 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Float64Array>(array)?;
             Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
         }
-
         // String and binary types
         DataType::Utf8 => {
             let array = downcast_array::<StringArray>(array)?;
@@ -86,9 +90,15 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
-            Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
-                array.value(index),
-            )))
+            let value = array.value(index);
+            match field.try_extension_type::<ArrowUuid>() {
+                Ok(_) => {
+                    let uuid = uuid::Uuid::from_slice(value)
+                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                    Ok(ParquetValue::Uuid(uuid))
+                }
+                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
+            }
         }
 
         // Date and time types
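
Note: the UUID branch above fires only for fields whose metadata carries the canonical extension type; an untagged FixedSizeBinary(16) still round-trips as plain bytes. A sketch of that dispatch (not part of the diff):

    use arrow_schema::{extension::Uuid as ArrowUuid, Field};

    // try_extension_type::<ArrowUuid>() succeeds only when the field
    // metadata names the "arrow.uuid" extension type.
    fn is_uuid_field(field: &Field) -> bool {
        field.try_extension_type::<ArrowUuid>().is_ok()
    }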
@@ -140,6 +150,10 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
                 let array = downcast_array::<Time64MicrosecondArray>(array)?;
                 Ok(ParquetValue::TimeMicros(array.value(index)))
             }
+            arrow_schema::TimeUnit::Nanosecond => {
+                let array = downcast_array::<Time64NanosecondArray>(array)?;
+                Ok(ParquetValue::TimeNanos(array.value(index)))
+            }
             _ => Err(ParquetError::Conversion(format!(
                 "Unsupported time64 unit: {:?}",
                 unit
@@ -173,13 +187,13 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
 
         // Complex types
-        DataType::List(_) => {
+        DataType::List(item_field) => {
             let array = downcast_array::<ListArray>(array)?;
             let list_values = array.value(index);
 
             let mut values = Vec::with_capacity(list_values.len());
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(&list_values, i)?);
+                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
             }
 
             Ok(ParquetValue::List(values))
@@ -192,10 +206,20 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let keys = map_value.column(0);
             let values = map_value.column(1);
 
+            let key_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "key")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
+            let value_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "value")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
             let mut map_vec = Vec::with_capacity(keys.len());
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(keys, i)?;
-                let value = arrow_to_parquet_value(values, i)?;
+                let key = arrow_to_parquet_value(key_field, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, values, i)?;
                 map_vec.push((key, value));
             }
 
@@ -207,7 +231,7 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let mut map = IndexMap::new();
             for (col_idx, field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(column, index)?;
+                let value = arrow_to_parquet_value(field, column, index)?;
                 map.insert(Arc::from(field.name().as_str()), value);
             }
 
@@ -1108,7 +1132,7 @@ mod tests {
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
 
         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
            assert_eq!(&actual, expected);
         }
     }
@@ -163,8 +163,10 @@ where
         // Extract values from current row
         let mut row_values = Vec::with_capacity(batch.num_columns());
 
-        for column in batch.columns() {
-            let value = match arrow_to_parquet_value(column, self.current_row) {
+        let schema = batch.schema();
+        for (i, column) in batch.columns().iter().enumerate() {
+            let field = schema.field(i);
+            let value = match arrow_to_parquet_value(field, column, self.current_row) {
                 Ok(v) => v,
                 Err(e) => return Some(Err(e)),
             };
@@ -228,12 +230,13 @@ where
         let mut columns = Vec::with_capacity(batch.num_columns());
 
         for (idx, column) in batch.columns().iter().enumerate() {
-            let column_name = self.schema.field(idx).name().to_string();
+            let field = self.schema.field(idx);
+            let column_name = field.name().to_string();
 
             // Convert entire column to ParquetValues
             let mut values = Vec::with_capacity(column.len());
             for row_idx in 0..column.len() {
-                match arrow_to_parquet_value(column, row_idx) {
+                match arrow_to_parquet_value(field, column, row_idx) {
                     Ok(value) => values.push(value),
                     Err(e) => return Some(Err(e)),
                 }
@@ -72,6 +72,7 @@ pub enum PrimitiveType {
     TimestampNanos(Option<Arc<str>>),
     TimeMillis,
     TimeMicros,
+    TimeNanos,
 
     // Fixed-length byte array
     FixedLenByteArray(i32),
@@ -146,6 +147,7 @@ impl PrimitiveType {
             PrimitiveType::TimestampNanos(_) => "TimestampNanos",
             PrimitiveType::TimeMillis => "TimeMillis",
             PrimitiveType::TimeMicros => "TimeMicros",
+            PrimitiveType::TimeNanos => "TimeNanos",
             PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
         }
     }
@@ -2,6 +2,7 @@ use bytes::Bytes;
 use indexmap::IndexMap;
 use num::BigInt;
 use std::sync::Arc;
+use uuid::Uuid;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParquetValue {
@@ -22,6 +23,7 @@ pub enum ParquetValue {
     Boolean(bool),
     String(Arc<str>),
     Bytes(Bytes),
+    Uuid(Uuid),
 
     // Date/Time types
     Date32(i32), // Days since epoch
@@ -40,6 +42,7 @@ pub enum ParquetValue {
     // Time types
     TimeMillis(i32), // Time of day in milliseconds since midnight
     TimeMicros(i64), // Time of day in microseconds since midnight
+    TimeNanos(i64), // Time of day in nanoseconds since midnight
 
     // Complex types
     List(Vec<ParquetValue>),
@@ -68,6 +71,7 @@ impl std::hash::Hash for ParquetValue {
             ParquetValue::Boolean(b) => b.hash(state),
             ParquetValue::String(s) => s.hash(state),
             ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Uuid(u) => u.hash(state),
             ParquetValue::Date32(d) => d.hash(state),
             ParquetValue::Date64(d) => d.hash(state),
             ParquetValue::Decimal128(d, scale) => {
@@ -96,6 +100,7 @@ impl std::hash::Hash for ParquetValue {
             }
             ParquetValue::TimeMillis(t) => t.hash(state),
             ParquetValue::TimeMicros(t) => t.hash(state),
+            ParquetValue::TimeNanos(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => m.hash(state),
             ParquetValue::Record(r) => {
@@ -133,6 +138,7 @@ impl ParquetValue {
             ParquetValue::Boolean(_) => "Boolean",
             ParquetValue::String(_) => "String",
             ParquetValue::Bytes(_) => "Bytes",
+            ParquetValue::Uuid(_) => "Uuid",
             ParquetValue::Date32(_) => "Date32",
             ParquetValue::Date64(_) => "Date64",
             ParquetValue::Decimal128(_, _) => "Decimal128",
@@ -143,6 +149,7 @@ impl ParquetValue {
             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
             ParquetValue::TimeMillis(_) => "TimeMillis",
             ParquetValue::TimeMicros(_) => "TimeMicros",
+            ParquetValue::TimeNanos(_) => "TimeNanos",
             ParquetValue::List(_) => "List",
             ParquetValue::Map(_) => "Map",
             ParquetValue::Record(_) => "Record",
@@ -235,6 +235,7 @@ where
         (Date64(_), DataType::Date64) => 8,
         (TimeMillis(_), DataType::Time32(_)) => 4,
         (TimeMicros(_), DataType::Time64(_)) => 8,
+        (TimeNanos(_), DataType::Time64(_)) => 8,
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
@@ -364,7 +365,9 @@ where
             writer.write(&batch)?;
 
             // Check if we need to flush based on memory usage
-            if writer.in_progress_size() >= self.memory_threshold {
+            if writer.in_progress_size() >= self.memory_threshold
+                || writer.memory_size() >= self.memory_threshold
+            {
                 writer.flush()?;
             }
         } else {
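
Note: in arrow-rs, the ArrowWriter's `in_progress_size` estimates the encoded size of the current row group, while `memory_size` estimates the total memory held by the writer, which can be much larger when data encodes compactly; checking both bounds peak memory. A sketch of the combined policy (not part of the diff):

    // Flush when either the encoded row group or the writer's retained
    // memory crosses the configured threshold.
    fn should_flush(in_progress_size: usize, memory_size: usize, threshold: usize) -> bool {
        in_progress_size >= threshold || memory_size >= threshold
    }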
@@ -496,6 +499,7 @@ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str)
         (Date64(_), DataType::Date64) => Ok(()),
         (TimeMillis(_), DataType::Time32(_)) => Ok(()),
         (TimeMicros(_), DataType::Time64(_)) => Ok(()),
+        (TimeNanos(_), DataType::Time64(_)) => Ok(()),
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
@@ -591,10 +595,16 @@ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
             name,
             primitive_type,
             nullable,
-            ..
+            format,
         } => {
             let data_type = primitive_type_to_arrow(primitive_type)?;
-            Ok(Field::new(name, data_type, *nullable))
+            let field = Field::new(name, data_type, *nullable);
+            let extended_field = if format.as_deref() == Some("uuid") {
+                field.with_extension_type(arrow_schema::extension::Uuid)
+            } else {
+                field
+            };
+            Ok(extended_field)
         }
         SchemaNode::List {
             name,
@@ -671,6 +681,7 @@ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
         Date32 => DataType::Date32,
         TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
         TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
+        TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
         TimestampMillis(tz) => DataType::Timestamp(
             arrow_schema::TimeUnit::Millisecond,
             // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
@@ -99,7 +99,7 @@ fn test_decimal256_large_values() {
 
     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +173,7 @@ fn test_timestamp_with_timezone() {
 
     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +209,7 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);
 
     // Verify roundtrip
-    let value = arrow_to_parquet_value(array.as_ref(), 0).unwrap();
+    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +357,7 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();
 
-    let result = arrow_to_parquet_value(&array, 0);
+    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
@@ -20,3 +20,4 @@ rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
 tempfile = "^3.15"
 thiserror = "2.0"
 indexmap = "2.2"
+uuid = "*"
@@ -41,27 +41,6 @@ impl RubyValueConverter {
             .map(|cache| cache.stats())
     }
 
-    /// Convert a Ruby value to ParquetValue with type hint
-    /// This is the primary conversion method that handles all Ruby types
-    pub fn to_parquet_with_type_hint(
-        &mut self,
-        value: Value,
-        type_hint: Option<&parquet_core::PrimitiveType>,
-    ) -> Result<ParquetValue> {
-        // Handle nil values
-        if value.is_nil() {
-            return Ok(ParquetValue::Null);
-        }
-
-        // If we have a type hint, use it to guide conversion
-        if let Some(hint) = type_hint {
-            return self.convert_with_type_hint(value, hint);
-        }
-
-        // Otherwise, infer type from Ruby value
-        self.infer_and_convert(value)
-    }
-
     /// Convert a Ruby value to ParquetValue with schema hint
     /// This handles both primitive and complex types
     pub fn to_parquet_with_schema_hint(
@@ -115,7 +94,7 @@ impl RubyValueConverter {
         use parquet_core::PrimitiveType::*;
 
         // Special handling for UUID format
-        if let (Binary, Some("uuid")) = (type_hint, format) {
+        if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
             return self.convert_to_uuid_binary(value);
         }
 
@@ -156,6 +135,7 @@ impl RubyValueConverter {
             Date64 => self.convert_to_date64(value, None),
             TimeMillis => self.convert_to_time_millis(value),
             TimeMicros => self.convert_to_time_micros(value),
+            TimeNanos => self.convert_to_time_nanos(value),
             TimestampSecond(schema_tz) => {
                 self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
             }
@@ -484,32 +464,19 @@ impl RubyValueConverter {
 
         // Convert value to string
         let uuid_str: String = value
-            .funcall("to_s", ())
-            .and_then(TryConvert::try_convert)
+            .to_r_string()
+            .map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
+            })?
+            .to_string()
             .map_err(|e: MagnusError| {
                 ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
             })?;
 
-        // Remove hyphens and validate length
-        let clean_uuid = uuid_str.replace('-', "");
-        if clean_uuid.len() != 32 {
-            return Err(ParquetError::Conversion(format!(
-                "Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
-                clean_uuid.len()
-            )));
-        }
-
-        // Parse hex string to bytes
-        let mut bytes = Vec::with_capacity(16);
-        for i in 0..16 {
-            let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
-            let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
-                ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
-            })?;
-            bytes.push(byte);
-        }
-
-        Ok(ParquetValue::Bytes(bytes.into()))
+        let parsed = uuid::Uuid::parse_str(&uuid_str)
+            .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
+        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
+        Ok(ParquetValue::Bytes(bytes))
     }
 
     fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
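
Note: `uuid::Uuid::parse_str` accepts both the hyphenated canonical form and the plain 32-hex-digit form, which is what allows the hand-rolled hyphen stripping and hex loop above to be dropped. A sketch (not part of the diff; the helper name is illustrative):

    // "550e8400-e29b-41d4-a716-446655440000" and
    // "550e8400e29b41d4a716446655440000" parse to the same 16 bytes.
    fn to_uuid_bytes(s: &str) -> Result<[u8; 16], uuid::Error> {
        uuid::Uuid::parse_str(s).map(|u| *u.as_bytes())
    }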
@@ -692,6 +659,38 @@ impl RubyValueConverter {
         )))
     }
 
+    fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
+        // Convert to microseconds since midnight
+        let ruby = Ruby::get()
+            .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
+        if value.is_kind_of(ruby.class_time()) {
+            let hour: i64 = value
+                .funcall("hour", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let min: i64 = value
+                .funcall("min", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let sec: i64 = value
+                .funcall("sec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let nsec: i64 = value
+                .funcall("nsec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+
+            let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
+            return Ok(ParquetValue::TimeNanos(nanos));
+        }
+
+        Err(ParquetError::Conversion(format!(
+            "Cannot convert {} to time_micros",
+            value.class()
+        )))
+    }
+
     // Timestamp conversion methods that respect schema timezone
     fn convert_to_timestamp_second_with_tz(
         &self,
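
Note: the new variant stores time-of-day as nanoseconds since midnight, mirroring the existing micros path. A worked example of the arithmetic used above (not part of the diff):

    // For 13:45:30.123456789:
    // (13 * 3600 + 45 * 60 + 30) * 1_000_000_000 + 123_456_789
    //   = 49_530 * 1_000_000_000 + 123_456_789
    //   = 49_530_123_456_789 nanoseconds since midnight
    fn nanos_since_midnight(hour: i64, min: i64, sec: i64, nsec: i64) -> i64 {
        (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec
    }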
@@ -1399,21 +1398,8 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
         ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
         ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
         ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
-        ParquetValue::Bytes(b) => {
-            // Check if this is a UUID (16 bytes)
-            if b.len() == 16 {
-                // Format as UUID string
-                let uuid_str = format!(
-                    "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
-                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
-                    b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
-                );
-                Ok(uuid_str.into_value_with(&ruby))
-            } else {
-                // Regular bytes - convert to string
-                Ok(ruby.str_from_slice(&b).as_value())
-            }
-        }
+        ParquetValue::Uuid(u) => Ok(u.to_string().into_value_with(&ruby)),
+        ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
         ParquetValue::Date32(days) => {
             // Convert days since epoch to Date object
             let _ = ruby.require("date");
@@ -1528,6 +1514,14 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
+        ParquetValue::TimeNanos(nanos) => {
+            let time_class = ruby.class_time();
+            let secs = nanos / 1_000_000_000;
+            let nsec = nanos % 1_000_000_000;
+            time_class
+                .funcall("at", (secs, nsec, Symbol::new("nanosecond")))
+                .map_err(|e| ParquetError::Conversion(e.to_string()))
+        }
         ParquetValue::TimestampNanos(nanos, tz) => {
             let time_class = ruby.class_time();
             let secs = nanos / 1_000_000_000;
@@ -2,6 +2,7 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
 use parquet_core::reader::Reader;
 
+use crate::StringCache;
 use crate::{
     converter::parquet_to_ruby,
     io::{RubyIOReader, ThreadSafeRubyIOReader},
@@ -101,6 +102,12 @@ pub fn each_row(
     })?;
     let mut row_count = 0u64;
 
+    let mut cache = StringCache::new(true);
+    let interned_column_names = column_names
+        .iter()
+        .map(|name| cache.intern(name.clone()))
+        .collect::<Vec<_>>();
+
     for row_result in row_iter {
         let row = row_result
             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
@@ -120,11 +127,11 @@ pub fn each_row(
             ParserResultType::Hash => {
                 let hash: RHash = ruby.hash_new();
                 for (idx, value) in row.into_iter().enumerate() {
-                    if idx < column_names.len() {
+                    if idx < interned_column_names.len() {
                         let ruby_value = parquet_to_ruby(value).map_err(|e| {
                             MagnusError::new(ruby.exception_runtime_error(), e.to_string())
                         })?;
-                        hash.aset(column_names[idx].as_str(), ruby_value)?;
+                        hash.aset(interned_column_names[idx].as_ref(), ruby_value)?;
                     }
                 }
                 hash.as_value()
@@ -1,8 +1,9 @@
 use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
-use parquet_core::{ParquetError, PrimitiveType, Result, Schema, SchemaNode};
+use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
 
 use crate::utils::parse_string_or_symbol;
+use crate::RubyAdapterError;
 
 /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
 pub struct RubySchemaBuilder;
@@ -13,18 +14,18 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a Ruby schema definition (hash) into a SchemaNode
-    fn parse_schema_node(&self, name: String, schema_def: Value) -> Result<SchemaNode> {
+    fn parse_schema_node(
+        &self,
+        name: String,
+        schema_def: Value,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // If it's a Hash, parse it as a complex type
         if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
             return self.parse_hash_schema_node(name, hash);
         }
 
         // Otherwise, try to parse as a simple type symbol
-        if let Ok(type_sym) = <Symbol as TryConvert>::try_convert(schema_def) {
-            let type_str = type_sym.name().map_err(|e: MagnusError| {
-                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
-            })?;
-
+        if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
             // Check if it's a complex type with angle brackets
             if type_str.contains('<') {
                 return self.parse_complex_type_string(name, type_str.to_string(), true);
@@ -40,22 +41,24 @@ impl RubySchemaBuilder {
             });
         }
 
-        Err(ParquetError::Schema(format!(
+        Err(RubyAdapterError::InvalidInput(format!(
             "Expected Hash or Symbol for schema definition, got {}",
             schema_def.class()
         )))
     }
 
     /// Parse a Ruby hash schema node
-    fn parse_hash_schema_node(&self, name: String, hash: RHash) -> Result<SchemaNode> {
+    fn parse_hash_schema_node(
+        &self,
+        name: String,
+        hash: RHash,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // Get the type field
-        let type_sym: Symbol = hash
-            .fetch::<_, Symbol>(Symbol::new("type"))
+        let type_sym: Value = hash
+            .fetch::<_, Value>(Symbol::new("type"))
             .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
 
-        let type_str = type_sym.name().map_err(|e: MagnusError| {
-            ParquetError::Conversion(format!("Failed to get type name: {}", e))
-        })?;
+        let type_str = type_sym.to_r_string()?.to_string()?;
 
         // Get nullable field (default to true)
         let nullable = hash
@@ -142,6 +145,15 @@ impl RubySchemaBuilder {
 
             // Primitive types
             primitive_type => {
+                if format.as_deref() == Some("uuid") {
+                    return Ok(SchemaNode::Primitive {
+                        name,
+                        primitive_type: PrimitiveType::FixedLenByteArray(16),
+                        nullable,
+                        format,
+                    });
+                }
+
                 // Get precision and scale for decimal types
                 let precision = hash
                     .fetch::<_, Value>(Symbol::new("precision"))
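
Note: the effect of the new branch above is that any primitive declared with format "uuid" is normalized to a 16-byte fixed-length binary before the usual type parsing runs. A sketch of the rule (not part of the diff; the helper is illustrative):

    use parquet_core::PrimitiveType;

    fn effective_type(declared: PrimitiveType, format: Option<&str>) -> PrimitiveType {
        match format {
            Some("uuid") => PrimitiveType::FixedLenByteArray(16),
            _ => declared,
        }
    }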
@@ -196,7 +208,7 @@ impl RubySchemaBuilder {
         name: String,
         type_str: String,
         nullable: bool,
-    ) -> Result<SchemaNode> {
+    ) -> Result<SchemaNode, RubyAdapterError> {
         if type_str.starts_with("list<") && type_str.ends_with('>') {
             let inner_type = &type_str[5..type_str.len() - 1];
             let item_name = format!("{}_item", name);
@@ -229,7 +241,7 @@ impl RubySchemaBuilder {
             let inner = &type_str[4..type_str.len() - 1];
             let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
             if parts.len() != 2 {
-                return Err(ParquetError::Schema(format!(
+                return Err(RubyAdapterError::InvalidInput(format!(
                     "Invalid map type: {}",
                     type_str
                 )));
@@ -255,7 +267,7 @@ impl RubySchemaBuilder {
                 }),
             })
         } else {
-            Err(ParquetError::Schema(format!(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown complex type: {}",
                 type_str
             )))
@@ -263,7 +275,7 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a field definition from a Ruby hash
-    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
+    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
         let name: String = field_hash
             .fetch(Symbol::new("name"))
             .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
@@ -274,7 +286,7 @@ impl RubySchemaBuilder {
             self.parse_schema_node(name, field_hash.as_value())
         } else {
             // This might be a simplified definition - look for known field patterns
-            Err(ParquetError::Schema(format!(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Field '{}' missing 'type' definition",
                 name
             )))
@@ -288,7 +300,7 @@ impl RubySchemaBuilder {
         precision: Option<u8>,
         scale: Option<i8>,
         timezone: Option<String>,
-    ) -> Result<PrimitiveType> {
+    ) -> Result<PrimitiveType, RubyAdapterError> {
         // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
         if type_str.starts_with("decimal(") && type_str.ends_with(')') {
             let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
@@ -324,6 +336,14 @@ impl RubySchemaBuilder {
             }
         }
 
+        if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
+            let params = &type_str[20..type_str.len() - 1];
+            let len = params.parse::<i32>().map_err(|_| {
+                ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
+            })?;
+            return Ok(PrimitiveType::FixedLenByteArray(len));
+        }
+
         match type_str.as_str() {
             "boolean" | "bool" => Ok(PrimitiveType::Boolean),
             "int8" => Ok(PrimitiveType::Int8),
@@ -356,8 +376,9 @@ impl RubySchemaBuilder {
                 // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
                 Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
             }
-            "time32" | "time_millis" => Ok(PrimitiveType::TimeMillis),
-            "time64" | "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_millis" => Ok(PrimitiveType::TimeMillis),
+            "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_nanos" => Ok(PrimitiveType::TimeNanos),
             "decimal" => {
                 // Use provided precision/scale or defaults
                 let p = precision.unwrap_or(38);
@@ -380,7 +401,7 @@ impl RubySchemaBuilder {
                 let s = scale.unwrap_or(0);
                 Ok(PrimitiveType::Decimal256(p, s))
             }
-            _ => Err(ParquetError::Schema(format!(
+            _ => Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown primitive type: {}",
                 type_str
             ))),
@@ -396,7 +417,7 @@ impl Default for RubySchemaBuilder {
 
 /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
 /// and Ruby Value is not Send/Sync
-pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
     let builder = RubySchemaBuilder::new();
 
     // The Ruby schema should be a hash with a root struct
@@ -430,7 +451,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     let mut unique_names = std::collections::HashSet::new();
     for name in &field_names {
         if !unique_names.insert(name) {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Duplicate field names in root level schema: {:?}",
                 field_names
             )));
@@ -443,7 +464,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
             fields: field_nodes,
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema must have 'type' or 'fields' key".to_string(),
         ));
     };
@@ -452,18 +473,18 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     parquet_core::SchemaBuilder::new()
         .with_root(root_node)
         .build()
-        .map_err(|e| ParquetError::Schema(e.to_string()))
+        .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
 }
 
 /// Convert a Parquet schema back to Ruby representation
-pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
     let ruby = Ruby::get()
         .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
 
     schema_node_to_ruby(&schema.root, &ruby)
 }
 
-fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value, RubyAdapterError> {
     let hash = RHash::new();
 
     match node {
@@ -552,6 +573,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
             PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
             PrimitiveType::TimeMillis => Symbol::new("time_millis"),
             PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+            PrimitiveType::TimeNanos => Symbol::new("time_nanos"),
             PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
             PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
             PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
@@ -597,7 +619,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
 /// Convert old schema format to new format
 /// Old: [{ "column_name" => "type" }, ...]
 /// New: [{ name: "column_name", type: :type }, ...]
-pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
+pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
     let new_schema = RArray::new();
 
     for item in schema.into_iter() {
@@ -630,7 +652,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
         );
 
         if let Err(e) = process_result {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Failed to process field: {}",
                 e
             )));
@@ -645,7 +667,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
 }
 
 /// Check if schema is in new DSL format (hash with type: :struct)
-pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
     if !schema_value.is_kind_of(ruby.class_hash()) {
         return Ok(false);
     }
@@ -678,7 +700,7 @@ pub fn process_schema_value(
     ruby: &Ruby,
     schema_value: Value,
     data_array: Option<&RArray>,
-) -> Result<Value> {
+) -> Result<Value, RubyAdapterError> {
     // Check if it's the new DSL format
     if is_dsl_schema(ruby, schema_value)? {
         // For DSL format, pass it directly to ruby_schema_to_parquet
@@ -716,7 +738,7 @@ pub fn process_schema_value(
             convert_legacy_schema(ruby, array)?
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema array must contain hashes".to_string(),
         ));
    }
@@ -733,13 +755,13 @@ pub fn process_schema_value(
                 ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
             })?
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "schema hash must have 'fields' key or be in DSL format with 'type' key"
                     .to_string(),
             ));
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema must be nil, an array, or a hash".to_string(),
         ));
     };
@@ -748,7 +770,7 @@ pub fn process_schema_value(
     if schema_array.is_empty() {
         if let Some(data) = data_array {
             if data.is_empty() {
-                return Err(ParquetError::Schema(
+                return Err(RubyAdapterError::InvalidInput(
                     "Cannot infer schema from empty data".to_string(),
                 ));
             }
@@ -767,7 +789,7 @@ pub fn process_schema_value(
             })?;
             first_array.len()
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "First data item must be an array".to_string(),
             ));
         };
@@ -793,7 +815,7 @@ pub fn process_schema_value(
 
         schema_array = new_schema;
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema is required when data is not provided for inference".to_string(),
         ));
     }
@@ -1,15 +1,15 @@
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, LazyLock, Mutex};
 
 use magnus::RString;
 
+static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
 /// A cache for interning strings in the Ruby VM to reduce memory usage
 /// when there are many repeated strings
 #[derive(Debug)]
 pub struct StringCache {
-    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
-    /// while maintaining a single global cache
-    cache: Arc<Mutex<HashMap<String, &'static str>>>,
     enabled: bool,
     hits: Arc<Mutex<usize>>,
     misses: Arc<Mutex<usize>>,
@@ -19,7 +19,6 @@ impl StringCache {
     /// Create a new string cache
     pub fn new(enabled: bool) -> Self {
         Self {
-            cache: Arc::new(Mutex::new(HashMap::new())),
             enabled,
             hits: Arc::new(Mutex::new(0)),
             misses: Arc::new(Mutex::new(0)),
@@ -36,9 +35,9 @@ impl StringCache {
 
         // Try to get or create the interned string
         let result = (|| -> Result<(), String> {
-            let mut cache = self.cache.lock().map_err(|e| e.to_string())?;
+            let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
 
-            if cache.contains_key(&s) {
+            if cache.contains_key(s.as_str()) {
                 let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                 *hits += 1;
             } else {
@@ -65,7 +64,7 @@ impl StringCache {
 
     /// Get cache statistics
     pub fn stats(&self) -> CacheStats {
-        let cache_size = self.cache.lock().map(|c| c.len()).unwrap_or(0);
+        let cache_size = STRING_CACHE.lock().map(|c| c.len()).unwrap_or(0);
         let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
         let misses = self.misses.lock().map(|m| *m).unwrap_or(0);
 
@@ -84,7 +83,7 @@ impl StringCache {
 
     /// Clear the cache
     pub fn clear(&mut self) {
-        if let Ok(mut cache) = self.cache.lock() {
+        if let Ok(mut cache) = STRING_CACHE.lock() {
             cache.clear();
         }
         if let Ok(mut hits) = self.hits.lock() {
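
Note: the refactor replaces the per-instance `cache` field with one process-wide static, so every `StringCache` now shares a single intern table. A minimal sketch of the `LazyLock` pattern (not part of the diff; it leaks to obtain `&'static str`, whereas the gem's actual `intern` goes through the Ruby VM):

    use std::collections::HashMap;
    use std::sync::{LazyLock, Mutex};

    static CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
        LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

    fn intern(s: String) -> &'static str {
        let mut cache = CACHE.lock().expect("cache poisoned");
        if let Some(&interned) = cache.get(s.as_str()) {
            return interned;
        }
        // Leaking is bounded here: only distinct strings (e.g. column
        // names) are ever inserted.
        let leaked: &'static str = Box::leak(s.clone().into_boxed_str());
        cache.insert(s, leaked);
        leaked
    }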
@@ -26,6 +26,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::Float64(_) => 8,
         ParquetValue::String(s) => s.len() + 24, // String overhead
         ParquetValue::Bytes(b) => b.len() + 24,  // Vec overhead
+        ParquetValue::Uuid(_) => 16,
         ParquetValue::Date32(_) => 4,
         ParquetValue::Date64(_) => 8,
         ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
@@ -36,6 +37,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
         ParquetValue::TimeMillis(_) => 4,
         ParquetValue::TimeMicros(_) => 8,
+        ParquetValue::TimeNanos(_) => 8,
         ParquetValue::List(items) => {
             24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
         }
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.6.1"
+  VERSION = "0.6.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.6.2
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-03 00:00:00.000000000 Z
+date: 2025-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys