parquet 0.6.0 → 0.6.2

This diff shows the changes between publicly available versions of the package as they were released to their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: ea1f639c9ac61eb47b4449999aacb95770a5b863dcf00358b616bdd57dcdf348
-  data.tar.gz: dc9480d6c4959e8f6ad4cc6775583f3ebf59ef5393eaeef801f84e25ca32fa3d
+  metadata.gz: dfd19103b2414e7feeaa6d1ec3c9a9c25ce42cf5c8362baa37e3b9d8d5245f82
+  data.tar.gz: c5c1170dbdc3635577738a568688c36adc9670710f4b0d570fae29294e337754
 SHA512:
-  metadata.gz: b1c17ae80a816a643ef47afc44172b6738baa9c28d671938f9c5781a466285199332d29ee96b4362c446f41c1ad7737255bab78c63a62626d6d982202da0b867
-  data.tar.gz: a91604cc16cc3f8eeb127da29d70e4ed9fca72a1c52613f4d7587db15809342710e9dc9d0c513c29ba6f112ace47d466b99a159629d5bb84969c587392f2468e
+  metadata.gz: c9bf72b4e708c750ab7ae30afd97aef7f456a4249904fe3eb74f916557e28ca1a53bc262a6492db38c38162ab3e3f684e30f0c70dabbaf8f8f4145ef4d9af259
+  data.tar.gz: 164c5b0569d3d13242bcff7c09d66edf67b279d8289f97def043000a508b4333dd1387a4f47517be09023cd270cf3b6dfd57fdf658ac1b52e25f3f5b2b5ca30c
data/Cargo.lock CHANGED
@@ -225,6 +225,10 @@ dependencies = [
 name = "arrow-schema"
 version = "55.2.0"
 source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#54858bf019ff3faeb8f5b562da8c01012162aef0"
+dependencies = [
+ "serde",
+ "serde_json",
+]

 [[package]]
 name = "arrow-select"
@@ -1092,6 +1096,7 @@ dependencies = [
  "rb-sys-env 0.2.2",
  "tempfile",
  "thiserror",
+ "uuid",
 ]

 [[package]]
@@ -1,6 +1,6 @@
 use magnus::scan_args::{get_kwargs, scan_args};
-use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, Ruby, Value};
+use parquet_ruby_adapter::utils::parse_string_or_symbol;
 use parquet_ruby_adapter::{
     logger::RubyLogger, types::ParserResultType, utils::parse_parquet_write_args,
 };
@@ -34,11 +34,14 @@ pub fn each_row(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     )?;

     let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
-        rt_value
-            .to_r_string()?
-            .to_string()?
+        parse_string_or_symbol(&ruby, rt_value)?
+            .ok_or_else(|| {
+                MagnusError::new(magnus::exception::arg_error(), "result_type cannot be nil")
+            })?
             .parse()
-            .map_err(|e| MagnusError::new(ruby.exception_arg_error(), e))?
+            .map_err(|_| {
+                MagnusError::new(magnus::exception::arg_error(), "Invalid result_type value")
+            })?
     } else {
         ParserResultType::Hash
     };
@@ -89,11 +92,14 @@ pub fn each_column(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError>
     )?;

     let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
-        rt_value
-            .to_r_string()?
-            .to_string()?
+        parse_string_or_symbol(&ruby, rt_value)?
+            .ok_or_else(|| {
+                MagnusError::new(magnus::exception::arg_error(), "result_type cannot be nil")
+            })?
             .parse()
-            .map_err(|e| MagnusError::new(ruby.exception_arg_error(), e))?
+            .map_err(|_| {
+                MagnusError::new(magnus::exception::arg_error(), "Invalid result_type value")
+            })?
     } else {
         ParserResultType::Hash
     };
@@ -101,7 +107,7 @@ pub fn each_column(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError>
     let batch_size = if let Some(bs) = kwargs.optional.2.flatten() {
         if bs == 0 {
             return Err(MagnusError::new(
-                ruby.exception_arg_error(),
+                magnus::exception::arg_error(),
                 "batch_size must be greater than 0",
            ));
        }
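The practical effect of the hunks above: `result_type` is now routed through `parse_string_or_symbol` (added later in this diff), so it accepts either a Symbol or a String, while any other type is rejected with a TypeError, and argument errors are raised via `magnus::exception::arg_error()` instead of a method on the Ruby handle. A minimal usage sketch, assuming the gem's documented `Parquet.each_row` entry point (file name and block are placeholders):

    # Both spellings are accepted after this change:
    Parquet.each_row("data.parquet", result_type: :hash)   { |row| p row }
    Parquet.each_row("data.parquet", result_type: "array") { |row| p row }
    # Omitting result_type still defaults to hash rows; a non-String/Symbol
    # value (e.g. result_type: 123) is rejected before parsing.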
@@ -7,7 +7,7 @@ edition = "2021"
 arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
+arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["canonical_extension_types"] }
 bytes = "1.5"
 indexmap = "2.2"
 jiff = "0.2"
@@ -17,7 +17,7 @@ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24
 rand = "0.9.1"
 serde = { version = "1.0", features = ["derive"] }
 thiserror = "2.0"
+uuid = { version = "1.0", features = ["v4"] }

 [dev-dependencies]
-uuid = { version = "1.0", features = ["v4"] }
 tempfile = "3.8"
@@ -7,6 +7,7 @@

 use crate::{ParquetError, ParquetValue, Result};
 use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
+use arrow_schema::extension::Uuid as ArrowUuid;
 use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
@@ -14,7 +15,11 @@ use ordered_float::OrderedFloat;
 use std::sync::Arc;

 /// Convert a single value from an Arrow array at the given index to a ParquetValue
-pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
+pub fn arrow_to_parquet_value(
+    field: &Field,
+    array: &dyn Array,
+    index: usize,
+) -> Result<ParquetValue> {
     use arrow_array::*;

     if array.is_null(index) {
@@ -72,7 +77,6 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Float64Array>(array)?;
             Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
         }
-
         // String and binary types
         DataType::Utf8 => {
             let array = downcast_array::<StringArray>(array)?;
@@ -86,9 +90,15 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
-            Ok(ParquetValue::Bytes(Bytes::copy_from_slice(
-                array.value(index),
-            )))
+            let value = array.value(index);
+            match field.try_extension_type::<ArrowUuid>() {
+                Ok(_) => {
+                    let uuid = uuid::Uuid::from_slice(value)
+                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                    Ok(ParquetValue::Uuid(uuid))
+                }
+                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
+            }
         }

         // Date and time types
@@ -140,6 +150,10 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
                 let array = downcast_array::<Time64MicrosecondArray>(array)?;
                 Ok(ParquetValue::TimeMicros(array.value(index)))
             }
+            arrow_schema::TimeUnit::Nanosecond => {
+                let array = downcast_array::<Time64NanosecondArray>(array)?;
+                Ok(ParquetValue::TimeNanos(array.value(index)))
+            }
             _ => Err(ParquetError::Conversion(format!(
                 "Unsupported time64 unit: {:?}",
                 unit
@@ -173,13 +187,13 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }

         // Complex types
-        DataType::List(_) => {
+        DataType::List(item_field) => {
             let array = downcast_array::<ListArray>(array)?;
             let list_values = array.value(index);

             let mut values = Vec::with_capacity(list_values.len());
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(&list_values, i)?);
+                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
             }

             Ok(ParquetValue::List(values))
@@ -192,10 +206,20 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let keys = map_value.column(0);
             let values = map_value.column(1);

+            let key_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "key")
+                .ok_or_else(|| ParquetError::Conversion("No key field found".to_string()))?;
+
+            let value_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "value")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
             let mut map_vec = Vec::with_capacity(keys.len());
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(keys, i)?;
-                let value = arrow_to_parquet_value(values, i)?;
+                let key = arrow_to_parquet_value(key_field, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, values, i)?;
                 map_vec.push((key, value));
             }

@@ -207,7 +231,7 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let mut map = IndexMap::new();
             for (col_idx, field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(column, index)?;
+                let value = arrow_to_parquet_value(field, column, index)?;
                 map.insert(Arc::from(field.name().as_str()), value);
             }

@@ -1108,7 +1132,7 @@ mod tests {
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();

         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
             assert_eq!(&actual, expected);
         }
     }
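Together with the `canonical_extension_types` feature enabled in Cargo.toml above, the FixedSizeBinary arm now consults the field's Arrow extension type: only columns annotated as canonical UUIDs decode to the new `ParquetValue::Uuid` variant; other fixed-size binary data still decodes to `Bytes`. A hedged round-trip sketch in Ruby, assuming the gem's `write_rows`/`each_row` entry points (file path, field name, and exact keyword names are illustrative, not confirmed by this diff):

    schema = { fields: [{ name: "id", type: :binary, format: "uuid" }] }
    Parquet.write_rows([["550e8400-e29b-41d4-a716-446655440000"]],
                       schema: schema, write_to: "ids.parquet")
    Parquet.each_row("ids.parquet") { |row| puts row["id"] }
    # => "550e8400-e29b-41d4-a716-446655440000" (a String, not 16 raw bytes)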
@@ -163,8 +163,10 @@ where
         // Extract values from current row
         let mut row_values = Vec::with_capacity(batch.num_columns());

-        for column in batch.columns() {
-            let value = match arrow_to_parquet_value(column, self.current_row) {
+        let schema = batch.schema();
+        for (i, column) in batch.columns().iter().enumerate() {
+            let field = schema.field(i);
+            let value = match arrow_to_parquet_value(field, column, self.current_row) {
                 Ok(v) => v,
                 Err(e) => return Some(Err(e)),
             };
@@ -228,12 +230,13 @@ where
         let mut columns = Vec::with_capacity(batch.num_columns());

         for (idx, column) in batch.columns().iter().enumerate() {
-            let column_name = self.schema.field(idx).name().to_string();
+            let field = self.schema.field(idx);
+            let column_name = field.name().to_string();

             // Convert entire column to ParquetValues
             let mut values = Vec::with_capacity(column.len());
             for row_idx in 0..column.len() {
-                match arrow_to_parquet_value(column, row_idx) {
+                match arrow_to_parquet_value(field, column, row_idx) {
                     Ok(value) => values.push(value),
                     Err(e) => return Some(Err(e)),
                 }
@@ -72,6 +72,7 @@ pub enum PrimitiveType {
     TimestampNanos(Option<Arc<str>>),
     TimeMillis,
     TimeMicros,
+    TimeNanos,

     // Fixed-length byte array
     FixedLenByteArray(i32),
@@ -146,6 +147,7 @@ impl PrimitiveType {
             PrimitiveType::TimestampNanos(_) => "TimestampNanos",
             PrimitiveType::TimeMillis => "TimeMillis",
             PrimitiveType::TimeMicros => "TimeMicros",
+            PrimitiveType::TimeNanos => "TimeNanos",
             PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
         }
     }
@@ -2,6 +2,7 @@ use bytes::Bytes;
 use indexmap::IndexMap;
 use num::BigInt;
 use std::sync::Arc;
+use uuid::Uuid;

 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParquetValue {
@@ -22,6 +23,7 @@ pub enum ParquetValue {
     Boolean(bool),
     String(Arc<str>),
     Bytes(Bytes),
+    Uuid(Uuid),

     // Date/Time types
     Date32(i32), // Days since epoch
@@ -40,6 +42,7 @@ pub enum ParquetValue {
     // Time types
     TimeMillis(i32), // Time of day in milliseconds since midnight
     TimeMicros(i64), // Time of day in microseconds since midnight
+    TimeNanos(i64),  // Time of day in nanoseconds since midnight

     // Complex types
     List(Vec<ParquetValue>),
@@ -68,6 +71,7 @@ impl std::hash::Hash for ParquetValue {
             ParquetValue::Boolean(b) => b.hash(state),
             ParquetValue::String(s) => s.hash(state),
             ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Uuid(u) => u.hash(state),
             ParquetValue::Date32(d) => d.hash(state),
             ParquetValue::Date64(d) => d.hash(state),
             ParquetValue::Decimal128(d, scale) => {
@@ -96,6 +100,7 @@ impl std::hash::Hash for ParquetValue {
             }
             ParquetValue::TimeMillis(t) => t.hash(state),
             ParquetValue::TimeMicros(t) => t.hash(state),
+            ParquetValue::TimeNanos(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => m.hash(state),
             ParquetValue::Record(r) => {
@@ -133,6 +138,7 @@ impl ParquetValue {
             ParquetValue::Boolean(_) => "Boolean",
             ParquetValue::String(_) => "String",
             ParquetValue::Bytes(_) => "Bytes",
+            ParquetValue::Uuid(_) => "Uuid",
             ParquetValue::Date32(_) => "Date32",
             ParquetValue::Date64(_) => "Date64",
             ParquetValue::Decimal128(_, _) => "Decimal128",
@@ -143,6 +149,7 @@ impl ParquetValue {
             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
             ParquetValue::TimeMillis(_) => "TimeMillis",
             ParquetValue::TimeMicros(_) => "TimeMicros",
+            ParquetValue::TimeNanos(_) => "TimeNanos",
             ParquetValue::List(_) => "List",
             ParquetValue::Map(_) => "Map",
             ParquetValue::Record(_) => "Record",
@@ -235,6 +235,7 @@ where
         (Date64(_), DataType::Date64) => 8,
         (TimeMillis(_), DataType::Time32(_)) => 4,
         (TimeMicros(_), DataType::Time64(_)) => 8,
+        (TimeNanos(_), DataType::Time64(_)) => 8,
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
@@ -364,7 +365,9 @@ where
             writer.write(&batch)?;

             // Check if we need to flush based on memory usage
-            if writer.in_progress_size() >= self.memory_threshold {
+            if writer.in_progress_size() >= self.memory_threshold
+                || writer.memory_size() >= self.memory_threshold
+            {
                 writer.flush()?;
             }
         } else {
@@ -496,6 +499,7 @@ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str)
         (Date64(_), DataType::Date64) => Ok(()),
         (TimeMillis(_), DataType::Time32(_)) => Ok(()),
         (TimeMicros(_), DataType::Time64(_)) => Ok(()),
+        (TimeNanos(_), DataType::Time64(_)) => Ok(()),
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
@@ -591,10 +595,16 @@ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
             name,
             primitive_type,
             nullable,
-            ..
+            format,
         } => {
             let data_type = primitive_type_to_arrow(primitive_type)?;
-            Ok(Field::new(name, data_type, *nullable))
+            let field = Field::new(name, data_type, *nullable);
+            let extended_field = if format.as_deref() == Some("uuid") {
+                field.with_extension_type(arrow_schema::extension::Uuid)
+            } else {
+                field
+            };
+            Ok(extended_field)
         }
         SchemaNode::List {
             name,
@@ -671,6 +681,7 @@ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
         Date32 => DataType::Date32,
         TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
         TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
+        TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
         TimestampMillis(tz) => DataType::Timestamp(
             arrow_schema::TimeUnit::Millisecond,
             // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
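The new TimeNanos primitive maps onto Arrow's Time64 with nanosecond units, alongside the existing millisecond and microsecond mappings. A hedged Ruby sketch of declaring such a column (writer entry point and keyword names are assumed from the gem's public API; field and file names are placeholders):

    schema = { fields: [{ name: "t", type: :time_nanos }] }
    Parquet.write_rows([[Time.now]], schema: schema, write_to: "times.parquet")
    # On read, time_nanos values come back as Ruby Time objects; see the
    # TimeNanos branch added to parquet_to_ruby later in this diff.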
@@ -99,7 +99,7 @@ fn test_decimal256_large_values() {

     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +173,7 @@ fn test_timestamp_with_timezone() {

     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +209,7 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);

     // Verify roundtrip
-    let value = arrow_to_parquet_value(array.as_ref(), 0).unwrap();
+    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +357,7 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();

-    let result = arrow_to_parquet_value(&array, 0);
+    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
@@ -20,3 +20,4 @@ rb-sys = { version = "0.9", features = ["stable-api-compiled-fallback"] }
 tempfile = "^3.15"
 thiserror = "2.0"
 indexmap = "2.2"
+uuid = "*"
@@ -41,27 +41,6 @@ impl RubyValueConverter {
             .map(|cache| cache.stats())
     }

-    /// Convert a Ruby value to ParquetValue with type hint
-    /// This is the primary conversion method that handles all Ruby types
-    pub fn to_parquet_with_type_hint(
-        &mut self,
-        value: Value,
-        type_hint: Option<&parquet_core::PrimitiveType>,
-    ) -> Result<ParquetValue> {
-        // Handle nil values
-        if value.is_nil() {
-            return Ok(ParquetValue::Null);
-        }
-
-        // If we have a type hint, use it to guide conversion
-        if let Some(hint) = type_hint {
-            return self.convert_with_type_hint(value, hint);
-        }
-
-        // Otherwise, infer type from Ruby value
-        self.infer_and_convert(value)
-    }
-
     /// Convert a Ruby value to ParquetValue with schema hint
     /// This handles both primitive and complex types
     pub fn to_parquet_with_schema_hint(
@@ -115,7 +94,7 @@ impl RubyValueConverter {
         use parquet_core::PrimitiveType::*;

         // Special handling for UUID format
-        if let (Binary, Some("uuid")) = (type_hint, format) {
+        if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
             return self.convert_to_uuid_binary(value);
         }

@@ -156,6 +135,7 @@ impl RubyValueConverter {
             Date64 => self.convert_to_date64(value, None),
             TimeMillis => self.convert_to_time_millis(value),
             TimeMicros => self.convert_to_time_micros(value),
+            TimeNanos => self.convert_to_time_nanos(value),
             TimestampSecond(schema_tz) => {
                 self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
             }
@@ -484,32 +464,19 @@ impl RubyValueConverter {

         // Convert value to string
         let uuid_str: String = value
-            .funcall("to_s", ())
-            .and_then(TryConvert::try_convert)
+            .to_r_string()
+            .map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
+            })?
+            .to_string()
             .map_err(|e: MagnusError| {
                 ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
             })?;

-        // Remove hyphens and validate length
-        let clean_uuid = uuid_str.replace('-', "");
-        if clean_uuid.len() != 32 {
-            return Err(ParquetError::Conversion(format!(
-                "Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
-                clean_uuid.len()
-            )));
-        }
-
-        // Parse hex string to bytes
-        let mut bytes = Vec::with_capacity(16);
-        for i in 0..16 {
-            let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
-            let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
-                ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
-            })?;
-            bytes.push(byte);
-        }
-
-        Ok(ParquetValue::Bytes(bytes.into()))
+        let parsed = uuid::Uuid::parse_str(&uuid_str)
+            .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
+        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
+        Ok(ParquetValue::Bytes(bytes))
     }

     fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
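The hand-rolled hex loop is replaced by `uuid::Uuid::parse_str`, which accepts the common textual encodings. An illustrative Ruby view of what a `format: "uuid"` column now tolerates on write (values only; the surrounding write call is as sketched earlier):

    "550e8400-e29b-41d4-a716-446655440000"           # hyphenated
    "550e8400e29b41d4a716446655440000"               # simple, no hyphens
    "urn:uuid:550e8400-e29b-41d4-a716-446655440000"  # URN form
    # All parse to the same 16-byte fixed-length value; malformed input now
    # fails with "Failed to parse UUID: ..." rather than a hex-digit error.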
@@ -692,6 +659,38 @@ impl RubyValueConverter {
         )))
     }

+    fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
+        // Convert to nanoseconds since midnight
+        let ruby = Ruby::get()
+            .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
+        if value.is_kind_of(ruby.class_time()) {
+            let hour: i64 = value
+                .funcall("hour", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let min: i64 = value
+                .funcall("min", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let sec: i64 = value
+                .funcall("sec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let nsec: i64 = value
+                .funcall("nsec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+
+            let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
+            return Ok(ParquetValue::TimeNanos(nanos));
+        }
+
+        Err(ParquetError::Conversion(format!(
+            "Cannot convert {} to time_nanos",
+            value.class()
+        )))
+    }
+
     // Timestamp conversion methods that respect schema timezone
     fn convert_to_timestamp_second_with_tz(
         &self,
@@ -1399,21 +1398,8 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
         ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
         ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
         ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
-        ParquetValue::Bytes(b) => {
-            // Check if this is a UUID (16 bytes)
-            if b.len() == 16 {
-                // Format as UUID string
-                let uuid_str = format!(
-                    "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
-                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
-                    b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
-                );
-                Ok(uuid_str.into_value_with(&ruby))
-            } else {
-                // Regular bytes - convert to string
-                Ok(ruby.str_from_slice(&b).as_value())
-            }
-        }
+        ParquetValue::Uuid(u) => Ok(u.to_string().into_value_with(&ruby)),
+        ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
         ParquetValue::Date32(days) => {
             // Convert days since epoch to Date object
             let _ = ruby.require("date");
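This removes a long-standing heuristic: previously any 16-byte binary value was guessed to be a UUID and stringified on read. Now only values of the dedicated Uuid variant become UUID strings, and all other binary data is returned as a binary-encoded (ASCII-8BIT) Ruby string. A hedged sketch of the observable difference (column names are placeholders):

    row["id"]            # uuid-typed column => "550e8400-e29b-41d4-a716-446655440000"
    row["raw"]           # plain 16-byte binary column => raw bytes, no UUID formatting
    row["raw"].encoding  # => #<Encoding:ASCII-8BIT>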
@@ -1528,6 +1514,14 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
+        ParquetValue::TimeNanos(nanos) => {
+            let time_class = ruby.class_time();
+            let secs = nanos / 1_000_000_000;
+            let nsec = nanos % 1_000_000_000;
+            time_class
+                .funcall("at", (secs, nsec, Symbol::new("nanosecond")))
+                .map_err(|e| ParquetError::Conversion(e.to_string()))
+        }
         ParquetValue::TimestampNanos(nanos, tz) => {
             let time_class = ruby.class_time();
             let secs = nanos / 1_000_000_000;
@@ -2,6 +2,7 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
 use parquet_core::reader::Reader;

+use crate::StringCache;
 use crate::{
     converter::parquet_to_ruby,
     io::{RubyIOReader, ThreadSafeRubyIOReader},
@@ -101,6 +102,12 @@ pub fn each_row(
     })?;
     let mut row_count = 0u64;

+    let mut cache = StringCache::new(true);
+    let interned_column_names = column_names
+        .iter()
+        .map(|name| cache.intern(name.clone()))
+        .collect::<Vec<_>>();
+
     for row_result in row_iter {
         let row = row_result
             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
@@ -120,11 +127,11 @@ pub fn each_row(
             ParserResultType::Hash => {
                 let hash: RHash = ruby.hash_new();
                 for (idx, value) in row.into_iter().enumerate() {
-                    if idx < column_names.len() {
+                    if idx < interned_column_names.len() {
                         let ruby_value = parquet_to_ruby(value).map_err(|e| {
                             MagnusError::new(ruby.exception_runtime_error(), e.to_string())
                         })?;
-                        hash.aset(column_names[idx].as_str(), ruby_value)?;
+                        hash.aset(interned_column_names[idx].as_ref(), ruby_value)?;
                     }
                 }
                 hash.as_value()
@@ -1,6 +1,9 @@
 use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
-use parquet_core::{ParquetError, PrimitiveType, Result, Schema, SchemaNode};
+use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
+
+use crate::utils::parse_string_or_symbol;
+use crate::RubyAdapterError;

 /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
 pub struct RubySchemaBuilder;
@@ -11,18 +14,18 @@ impl RubySchemaBuilder {
     }

     /// Parse a Ruby schema definition (hash) into a SchemaNode
-    fn parse_schema_node(&self, name: String, schema_def: Value) -> Result<SchemaNode> {
+    fn parse_schema_node(
+        &self,
+        name: String,
+        schema_def: Value,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // If it's a Hash, parse it as a complex type
         if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
             return self.parse_hash_schema_node(name, hash);
         }

         // Otherwise, try to parse as a simple type symbol
-        if let Ok(type_sym) = <Symbol as TryConvert>::try_convert(schema_def) {
-            let type_str = type_sym.name().map_err(|e: MagnusError| {
-                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
-            })?;
-
+        if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
             // Check if it's a complex type with angle brackets
             if type_str.contains('<') {
                 return self.parse_complex_type_string(name, type_str.to_string(), true);
@@ -38,22 +41,24 @@ impl RubySchemaBuilder {
             });
         }

-        Err(ParquetError::Schema(format!(
+        Err(RubyAdapterError::InvalidInput(format!(
             "Expected Hash or Symbol for schema definition, got {}",
             schema_def.class()
         )))
     }

     /// Parse a Ruby hash schema node
-    fn parse_hash_schema_node(&self, name: String, hash: RHash) -> Result<SchemaNode> {
+    fn parse_hash_schema_node(
+        &self,
+        name: String,
+        hash: RHash,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // Get the type field
-        let type_sym: Symbol = hash
-            .fetch::<_, Symbol>(Symbol::new("type"))
+        let type_sym: Value = hash
+            .fetch::<_, Value>(Symbol::new("type"))
             .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;

-        let type_str = type_sym.name().map_err(|e: MagnusError| {
-            ParquetError::Conversion(format!("Failed to get type name: {}", e))
-        })?;
+        let type_str = type_sym.to_r_string()?.to_string()?;

         // Get nullable field (default to true)
         let nullable = hash
@@ -140,6 +145,15 @@ impl RubySchemaBuilder {

             // Primitive types
             primitive_type => {
+                if format.as_deref() == Some("uuid") {
+                    return Ok(SchemaNode::Primitive {
+                        name,
+                        primitive_type: PrimitiveType::FixedLenByteArray(16),
+                        nullable,
+                        format,
+                    });
+                }
+
                 // Get precision and scale for decimal types
                 let precision = hash
                     .fetch::<_, Value>(Symbol::new("precision"))
@@ -194,7 +208,7 @@ impl RubySchemaBuilder {
         name: String,
         type_str: String,
         nullable: bool,
-    ) -> Result<SchemaNode> {
+    ) -> Result<SchemaNode, RubyAdapterError> {
         if type_str.starts_with("list<") && type_str.ends_with('>') {
             let inner_type = &type_str[5..type_str.len() - 1];
             let item_name = format!("{}_item", name);
@@ -227,7 +241,7 @@ impl RubySchemaBuilder {
             let inner = &type_str[4..type_str.len() - 1];
             let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
             if parts.len() != 2 {
-                return Err(ParquetError::Schema(format!(
+                return Err(RubyAdapterError::InvalidInput(format!(
                     "Invalid map type: {}",
                     type_str
                 )));
@@ -253,7 +267,7 @@ impl RubySchemaBuilder {
                 }),
             })
         } else {
-            Err(ParquetError::Schema(format!(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown complex type: {}",
                 type_str
             )))
@@ -261,7 +275,7 @@ impl RubySchemaBuilder {
     }

     /// Parse a field definition from a Ruby hash
-    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
+    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
         let name: String = field_hash
             .fetch(Symbol::new("name"))
             .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
@@ -272,7 +286,7 @@ impl RubySchemaBuilder {
             self.parse_schema_node(name, field_hash.as_value())
         } else {
             // This might be a simplified definition - look for known field patterns
-            Err(ParquetError::Schema(format!(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Field '{}' missing 'type' definition",
                 name
             )))
@@ -286,7 +300,7 @@ impl RubySchemaBuilder {
         precision: Option<u8>,
         scale: Option<i8>,
         timezone: Option<String>,
-    ) -> Result<PrimitiveType> {
+    ) -> Result<PrimitiveType, RubyAdapterError> {
         // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
         if type_str.starts_with("decimal(") && type_str.ends_with(')') {
             let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
@@ -322,6 +336,14 @@ impl RubySchemaBuilder {
             }
         }

+        if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
+            // Skip the 21-character "fixed_len_byte_array(" prefix to extract the length
+            let params = &type_str[21..type_str.len() - 1];
+            let len = params.parse::<i32>().map_err(|_| {
+                ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
+            })?;
+            return Ok(PrimitiveType::FixedLenByteArray(len));
+        }
+
         match type_str.as_str() {
             "boolean" | "bool" => Ok(PrimitiveType::Boolean),
             "int8" => Ok(PrimitiveType::Int8),
@@ -354,8 +376,9 @@ impl RubySchemaBuilder {
                 // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
                 Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
             }
-            "time32" | "time_millis" => Ok(PrimitiveType::TimeMillis),
-            "time64" | "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_millis" => Ok(PrimitiveType::TimeMillis),
+            "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_nanos" => Ok(PrimitiveType::TimeNanos),
             "decimal" => {
                 // Use provided precision/scale or defaults
                 let p = precision.unwrap_or(38);
@@ -378,7 +401,7 @@ impl RubySchemaBuilder {
                 let s = scale.unwrap_or(0);
                 Ok(PrimitiveType::Decimal256(p, s))
             }
-            _ => Err(ParquetError::Schema(format!(
+            _ => Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown primitive type: {}",
                 type_str
             ))),
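Note the breaking change in the first hunk above: the generic :time32 and :time64 aliases are gone, and schemas must name the unit explicitly. Illustrative field definitions (field names are placeholders):

    { name: "t_ms", type: :time_millis } # 32-bit, milliseconds since midnight
    { name: "t_us", type: :time_micros } # 64-bit, microseconds since midnight
    { name: "t_ns", type: :time_nanos }  # 64-bit, nanoseconds since midnight
    # :time32 / :time64 now fall through to the "Unknown primitive type" error.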
@@ -394,7 +417,7 @@ impl Default for RubySchemaBuilder {

 /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
 /// and Ruby Value is not Send/Sync
-pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
     let builder = RubySchemaBuilder::new();

     // The Ruby schema should be a hash with a root struct
@@ -428,7 +451,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     let mut unique_names = std::collections::HashSet::new();
     for name in &field_names {
         if !unique_names.insert(name) {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Duplicate field names in root level schema: {:?}",
                 field_names
             )));
@@ -441,7 +464,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
             fields: field_nodes,
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema must have 'type' or 'fields' key".to_string(),
         ));
     };
@@ -450,18 +473,18 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     parquet_core::SchemaBuilder::new()
         .with_root(root_node)
         .build()
-        .map_err(|e| ParquetError::Schema(e.to_string()))
+        .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
 }

 /// Convert a Parquet schema back to Ruby representation
-pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
     let ruby = Ruby::get()
         .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;

     schema_node_to_ruby(&schema.root, &ruby)
 }

-fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value, RubyAdapterError> {
     let hash = RHash::new();

     match node {
@@ -550,6 +573,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
             PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
             PrimitiveType::TimeMillis => Symbol::new("time_millis"),
             PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+            PrimitiveType::TimeNanos => Symbol::new("time_nanos"),
             PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
             PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
             PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
@@ -595,7 +619,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
 /// Convert old schema format to new format
 /// Old: [{ "column_name" => "type" }, ...]
 /// New: [{ name: "column_name", type: :type }, ...]
-pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
+pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
     let new_schema = RArray::new();

     for item in schema.into_iter() {
@@ -609,7 +633,12 @@ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
             |key: Value,
              value: Value|
              -> std::result::Result<magnus::r_hash::ForEach, MagnusError> {
-                let key_str: String = TryConvert::try_convert(key)?;
+                let key_str: String = parse_string_or_symbol(ruby, key)?.ok_or_else(|| {
+                    MagnusError::new(
+                        magnus::exception::arg_error(),
+                        "Nil keys not allowed in schema",
+                    )
+                })?;
                 let type_str: String = TryConvert::try_convert(value)?;

                 new_field.aset(Symbol::new("name"), key_str)?;
@@ -623,7 +652,7 @@ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
         );

         if let Err(e) = process_result {
-            return Err(ParquetError::Schema(format!(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Failed to process field: {}",
                 e
             )));
@@ -638,7 +667,7 @@ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
 }

 /// Check if schema is in new DSL format (hash with type: :struct)
-pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
     if !schema_value.is_kind_of(ruby.class_hash()) {
         return Ok(false);
     }
@@ -671,7 +700,7 @@ pub fn process_schema_value(
     ruby: &Ruby,
     schema_value: Value,
     data_array: Option<&RArray>,
-) -> Result<Value> {
+) -> Result<Value, RubyAdapterError> {
     // Check if it's the new DSL format
     if is_dsl_schema(ruby, schema_value)? {
         // For DSL format, pass it directly to ruby_schema_to_parquet
@@ -709,7 +738,7 @@ pub fn process_schema_value(
             convert_legacy_schema(ruby, array)?
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema array must contain hashes".to_string(),
         ));
     }
@@ -726,13 +755,13 @@ pub fn process_schema_value(
                 ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
             })?
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "schema hash must have 'fields' key or be in DSL format with 'type' key"
                     .to_string(),
             ));
         }
     } else {
-        return Err(ParquetError::Schema(
+        return Err(RubyAdapterError::InvalidInput(
             "schema must be nil, an array, or a hash".to_string(),
         ));
     };
@@ -741,7 +770,7 @@ pub fn process_schema_value(
     if schema_array.is_empty() {
         if let Some(data) = data_array {
             if data.is_empty() {
-                return Err(ParquetError::Schema(
+                return Err(RubyAdapterError::InvalidInput(
                     "Cannot infer schema from empty data".to_string(),
                 ));
             }
@@ -760,7 +789,7 @@ pub fn process_schema_value(
             })?;
             first_array.len()
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "First data item must be an array".to_string(),
             ));
         };
@@ -786,7 +815,7 @@ pub fn process_schema_value(

             schema_array = new_schema;
         } else {
-            return Err(ParquetError::Schema(
+            return Err(RubyAdapterError::InvalidInput(
                 "Schema is required when data is not provided for inference".to_string(),
             ));
         }
@@ -1,15 +1,15 @@
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, LazyLock, Mutex};

 use magnus::RString;

+static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
 /// A cache for interning strings in the Ruby VM to reduce memory usage
 /// when there are many repeated strings
 #[derive(Debug)]
 pub struct StringCache {
-    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
-    /// while maintaining a single global cache
-    cache: Arc<Mutex<HashMap<String, &'static str>>>,
     enabled: bool,
     hits: Arc<Mutex<usize>>,
     misses: Arc<Mutex<usize>>,
@@ -19,7 +19,6 @@ impl StringCache {
     /// Create a new string cache
     pub fn new(enabled: bool) -> Self {
         Self {
-            cache: Arc::new(Mutex::new(HashMap::new())),
             enabled,
             hits: Arc::new(Mutex::new(0)),
             misses: Arc::new(Mutex::new(0)),
@@ -36,9 +35,9 @@ impl StringCache {

         // Try to get or create the interned string
         let result = (|| -> Result<(), String> {
-            let mut cache = self.cache.lock().map_err(|e| e.to_string())?;
+            let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;

-            if cache.contains_key(&s) {
+            if cache.contains_key(s.as_str()) {
                 let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                 *hits += 1;
             } else {
@@ -65,7 +64,7 @@ impl StringCache {

     /// Get cache statistics
     pub fn stats(&self) -> CacheStats {
-        let cache_size = self.cache.lock().map(|c| c.len()).unwrap_or(0);
+        let cache_size = STRING_CACHE.lock().map(|c| c.len()).unwrap_or(0);
         let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
         let misses = self.misses.lock().map(|m| *m).unwrap_or(0);

@@ -84,7 +83,7 @@ impl StringCache {

     /// Clear the cache
     pub fn clear(&mut self) {
-        if let Ok(mut cache) = STRING_CACHE.lock() {
+        if let Ok(mut cache) = STRING_CACHE.lock() {
             cache.clear();
         }
         if let Ok(mut hits) = self.hits.lock() {
@@ -26,6 +26,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::Float64(_) => 8,
         ParquetValue::String(s) => s.len() + 24, // String overhead
         ParquetValue::Bytes(b) => b.len() + 24,  // Vec overhead
+        ParquetValue::Uuid(_) => 16,
         ParquetValue::Date32(_) => 4,
         ParquetValue::Date64(_) => 8,
         ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
@@ -36,6 +37,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
         ParquetValue::TimeMillis(_) => 4,
         ParquetValue::TimeMicros(_) => 8,
+        ParquetValue::TimeNanos(_) => 8,
         ParquetValue::List(items) => {
             24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
         }
@@ -122,6 +124,21 @@ pub fn parse_parquet_write_args(
     })
 }

+/// Convert a Ruby Value to a String, handling both String and Symbol types
+pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
+    if value.is_nil() {
+        Ok(None)
+    } else if value.is_kind_of(ruby.class_string()) || value.is_kind_of(ruby.class_symbol()) {
+        let stringed = value.to_r_string()?.to_string()?;
+        Ok(Some(stringed))
+    } else {
+        Err(MagnusError::new(
+            magnus::exception::type_error(),
+            "Value must be a String or Symbol",
+        ))
+    }
+}
+
 /// Handle block or enumerator creation
 pub fn handle_block_or_enum<F, T>(
     block_given: bool,
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.6.0"
+  VERSION = "0.6.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.6.0
+  version: 0.6.2
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-03 00:00:00.000000000 Z
+date: 2025-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys