parquet 0.6.1 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +5 -0
- data/ext/parquet-core/Cargo.toml +2 -2
- data/ext/parquet-core/src/arrow_conversion.rs +37 -11
- data/ext/parquet-core/src/reader.rs +7 -4
- data/ext/parquet-core/src/schema.rs +2 -0
- data/ext/parquet-core/src/value.rs +7 -0
- data/ext/parquet-core/src/writer.rs +14 -3
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +4 -4
- data/ext/parquet-ruby-adapter/Cargo.toml +1 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +79 -65
- data/ext/parquet-ruby-adapter/src/metadata.rs +103 -1
- data/ext/parquet-ruby-adapter/src/reader.rs +9 -2
- data/ext/parquet-ruby-adapter/src/schema.rs +61 -39
- data/ext/parquet-ruby-adapter/src/string_cache.rs +8 -9
- data/ext/parquet-ruby-adapter/src/utils.rs +2 -0
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: de6b7f5c61eb1e796e7066790e2c1e0ada9ba4519140cee4e2378cd402db2586
+  data.tar.gz: 5b1dc2e442b1be17af82dd3a431b6f3a66254410229055cdbd8713aa1c009be2
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7635247bc9627cdafe79ee9be1072c13b0f8ec11549506f9a8b6170d9b095883ede0f8a8165d0340572d89e1501c1d5f144c20f963ab960171dcb5813f15022c
+  data.tar.gz: abb59172a54c8d63ca39f24bdda4c64b98a60015622fc7f4a7a2a4c42ff03c3327f134de8ec66e006eb38a8cb38da90824a987b44f6e2fcc2af1c01bd4d85ee1
data/Cargo.lock
CHANGED
@@ -225,6 +225,10 @@ dependencies = [
 name = "arrow-schema"
 version = "55.2.0"
 source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#54858bf019ff3faeb8f5b562da8c01012162aef0"
+dependencies = [
+ "serde",
+ "serde_json",
+]
 
 [[package]]
 name = "arrow-select"
@@ -1092,6 +1096,7 @@ dependencies = [
 "rb-sys-env 0.2.2",
 "tempfile",
 "thiserror",
+ "uuid",
 ]
 
 [[package]]
data/ext/parquet-core/Cargo.toml
CHANGED
@@ -7,7 +7,7 @@ edition = "2021"
 arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
+arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["canonical_extension_types"]}
 bytes = "1.5"
 indexmap = "2.2"
 jiff = "0.2"
@@ -17,7 +17,7 @@ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24
 rand = "0.9.1"
 serde = { version = "1.0", features = ["derive"] }
 thiserror = "2.0"
+uuid = { version = "1.0", features = ["v4"] }
 
 [dev-dependencies]
-uuid = { version = "1.0", features = ["v4"] }
 tempfile = "3.8"
data/ext/parquet-core/src/arrow_conversion.rs
CHANGED
@@ -7,6 +7,7 @@
 
 use crate::{ParquetError, ParquetValue, Result};
 use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
+use arrow_schema::extension::Uuid as ArrowUuid;
 use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
@@ -14,7 +15,11 @@ use ordered_float::OrderedFloat;
 use std::sync::Arc;
 
 /// Convert a single value from an Arrow array at the given index to a ParquetValue
-pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<ParquetValue> {
+pub fn arrow_to_parquet_value(
+    field: &Field,
+    array: &dyn Array,
+    index: usize,
+) -> Result<ParquetValue> {
     use arrow_array::*;
 
     if array.is_null(index) {
@@ -72,7 +77,6 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Float64Array>(array)?;
             Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
         }
-
         // String and binary types
         DataType::Utf8 => {
             let array = downcast_array::<StringArray>(array)?;
@@ -86,9 +90,15 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
         DataType::FixedSizeBinary(_) => {
             let array = downcast_array::<FixedSizeBinaryArray>(array)?;
-
-
-
+            let value = array.value(index);
+            match field.try_extension_type::<ArrowUuid>() {
+                Ok(_) => {
+                    let uuid = uuid::Uuid::from_slice(value)
+                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                    Ok(ParquetValue::Uuid(uuid))
+                }
+                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
+            }
         }
 
         // Date and time types
@@ -140,6 +150,10 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
                 let array = downcast_array::<Time64MicrosecondArray>(array)?;
                 Ok(ParquetValue::TimeMicros(array.value(index)))
             }
+            arrow_schema::TimeUnit::Nanosecond => {
+                let array = downcast_array::<Time64NanosecondArray>(array)?;
+                Ok(ParquetValue::TimeNanos(array.value(index)))
+            }
             _ => Err(ParquetError::Conversion(format!(
                 "Unsupported time64 unit: {:?}",
                 unit
@@ -173,13 +187,13 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
 
         // Complex types
-        DataType::List(
+        DataType::List(item_field) => {
             let array = downcast_array::<ListArray>(array)?;
             let list_values = array.value(index);
 
             let mut values = Vec::with_capacity(list_values.len());
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(&list_values, i)?);
+                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
             }
 
             Ok(ParquetValue::List(values))
@@ -192,10 +206,22 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let keys = map_value.column(0);
             let values = map_value.column(1);
 
+            let key_field = map_value
+                .fields()
+                .iter()
+                .find(|f| f.name() == "key")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
+            let value_field = map_value
+                .fields()
+                .iter()
+                .find(|f| f.name() == "value")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
             let mut map_vec = Vec::with_capacity(keys.len());
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(keys, i)?;
-                let value = arrow_to_parquet_value(values, i)?;
+                let key = arrow_to_parquet_value(key_field, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, values, i)?;
                 map_vec.push((key, value));
             }
 
@@ -207,7 +233,7 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let mut map = IndexMap::new();
             for (col_idx, field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(column, index)?;
+                let value = arrow_to_parquet_value(field, column, index)?;
                 map.insert(Arc::from(field.name().as_str()), value);
             }
 
@@ -1108,7 +1134,7 @@ mod tests {
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
 
         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
             assert_eq!(&actual, expected);
         }
     }
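The key change in this file is the new field: &Field parameter threaded through every recursive call, which lets field-level extension metadata steer decoding: a FixedSizeBinary(16) column tagged with the canonical arrow.uuid extension type now decodes to ParquetValue::Uuid instead of raw bytes. A minimal sketch of that dispatch (not from the package; it only assumes the arrow-schema canonical_extension_types feature enabled in the Cargo.toml change above):

use arrow_schema::extension::Uuid as ArrowUuid;
use arrow_schema::{DataType, Field};

fn main() {
    // A 16-byte binary field tagged as a UUID, and an untagged one.
    let uuid_field = Field::new("id", DataType::FixedSizeBinary(16), false)
        .with_extension_type(ArrowUuid);
    let bytes_field = Field::new("blob", DataType::FixedSizeBinary(16), false);

    // The converter performs this check per value:
    // Ok(_)  -> decode as ParquetValue::Uuid, Err(_) -> fall back to ParquetValue::Bytes.
    assert!(uuid_field.try_extension_type::<ArrowUuid>().is_ok());
    assert!(bytes_field.try_extension_type::<ArrowUuid>().is_err());
}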
data/ext/parquet-core/src/reader.rs
CHANGED
@@ -163,8 +163,10 @@ where
         // Extract values from current row
         let mut row_values = Vec::with_capacity(batch.num_columns());
 
-
-
+        let schema = batch.schema();
+        for (i, column) in batch.columns().iter().enumerate() {
+            let field = schema.field(i);
+            let value = match arrow_to_parquet_value(field, column, self.current_row) {
                 Ok(v) => v,
                 Err(e) => return Some(Err(e)),
             };
@@ -228,12 +230,13 @@ where
         let mut columns = Vec::with_capacity(batch.num_columns());
 
         for (idx, column) in batch.columns().iter().enumerate() {
-            let
+            let field = self.schema.field(idx);
+            let column_name = field.name().to_string();
 
             // Convert entire column to ParquetValues
             let mut values = Vec::with_capacity(column.len());
             for row_idx in 0..column.len() {
-                match arrow_to_parquet_value(column, row_idx) {
+                match arrow_to_parquet_value(field, column, row_idx) {
                     Ok(value) => values.push(value),
                     Err(e) => return Some(Err(e)),
                 }
data/ext/parquet-core/src/schema.rs
CHANGED
@@ -72,6 +72,7 @@ pub enum PrimitiveType {
     TimestampNanos(Option<Arc<str>>),
     TimeMillis,
     TimeMicros,
+    TimeNanos,
 
     // Fixed-length byte array
     FixedLenByteArray(i32),
@@ -146,6 +147,7 @@ impl PrimitiveType {
             PrimitiveType::TimestampNanos(_) => "TimestampNanos",
             PrimitiveType::TimeMillis => "TimeMillis",
             PrimitiveType::TimeMicros => "TimeMicros",
+            PrimitiveType::TimeNanos => "TimeNanos",
             PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
         }
     }
data/ext/parquet-core/src/value.rs
CHANGED
@@ -2,6 +2,7 @@ use bytes::Bytes;
 use indexmap::IndexMap;
 use num::BigInt;
 use std::sync::Arc;
+use uuid::Uuid;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParquetValue {
@@ -22,6 +23,7 @@ pub enum ParquetValue {
     Boolean(bool),
     String(Arc<str>),
     Bytes(Bytes),
+    Uuid(Uuid),
 
     // Date/Time types
     Date32(i32), // Days since epoch
@@ -40,6 +42,7 @@ pub enum ParquetValue {
     // Time types
     TimeMillis(i32), // Time of day in milliseconds since midnight
     TimeMicros(i64), // Time of day in microseconds since midnight
+    TimeNanos(i64), // Time of day in nanoseconds since midnight
 
     // Complex types
     List(Vec<ParquetValue>),
@@ -68,6 +71,7 @@ impl std::hash::Hash for ParquetValue {
             ParquetValue::Boolean(b) => b.hash(state),
             ParquetValue::String(s) => s.hash(state),
             ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Uuid(u) => u.hash(state),
             ParquetValue::Date32(d) => d.hash(state),
             ParquetValue::Date64(d) => d.hash(state),
             ParquetValue::Decimal128(d, scale) => {
@@ -96,6 +100,7 @@ impl std::hash::Hash for ParquetValue {
             }
             ParquetValue::TimeMillis(t) => t.hash(state),
             ParquetValue::TimeMicros(t) => t.hash(state),
+            ParquetValue::TimeNanos(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => m.hash(state),
             ParquetValue::Record(r) => {
@@ -133,6 +138,7 @@ impl ParquetValue {
             ParquetValue::Boolean(_) => "Boolean",
             ParquetValue::String(_) => "String",
             ParquetValue::Bytes(_) => "Bytes",
+            ParquetValue::Uuid(_) => "Uuid",
             ParquetValue::Date32(_) => "Date32",
             ParquetValue::Date64(_) => "Date64",
             ParquetValue::Decimal128(_, _) => "Decimal128",
@@ -143,6 +149,7 @@ impl ParquetValue {
             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
             ParquetValue::TimeMillis(_) => "TimeMillis",
             ParquetValue::TimeMicros(_) => "TimeMicros",
+            ParquetValue::TimeNanos(_) => "TimeNanos",
             ParquetValue::List(_) => "List",
             ParquetValue::Map(_) => "Map",
             ParquetValue::Record(_) => "Record",
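Both additions are plain data variants; a hypothetical construction showing their units (crate paths as in this diff):

use parquet_core::ParquetValue;
use uuid::Uuid;

fn main() {
    // UUIDs now travel as a parsed uuid::Uuid rather than 16 opaque bytes.
    let id = ParquetValue::Uuid(Uuid::new_v4());

    // TimeNanos is a time of day: nanoseconds since midnight (12:34:56.000000789 here).
    let t = ParquetValue::TimeNanos((12i64 * 3600 + 34 * 60 + 56) * 1_000_000_000 + 789);
    let _ = (id, t);
}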
data/ext/parquet-core/src/writer.rs
CHANGED
@@ -235,6 +235,7 @@ where
             (Date64(_), DataType::Date64) => 8,
             (TimeMillis(_), DataType::Time32(_)) => 4,
             (TimeMicros(_), DataType::Time64(_)) => 8,
+            (TimeNanos(_), DataType::Time64(_)) => 8,
             (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
             (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
             (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
@@ -364,7 +365,9 @@ where
             writer.write(&batch)?;
 
             // Check if we need to flush based on memory usage
-            if writer.in_progress_size() >= self.memory_threshold
+            if writer.in_progress_size() >= self.memory_threshold
+                || writer.memory_size() >= self.memory_threshold
+            {
                 writer.flush()?;
             }
         } else {
@@ -496,6 +499,7 @@ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str)
             (Date64(_), DataType::Date64) => Ok(()),
             (TimeMillis(_), DataType::Time32(_)) => Ok(()),
             (TimeMicros(_), DataType::Time64(_)) => Ok(()),
+            (TimeNanos(_), DataType::Time64(_)) => Ok(()),
             (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
             (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
             (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
@@ -591,10 +595,16 @@ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
             name,
             primitive_type,
             nullable,
-
+            format,
         } => {
             let data_type = primitive_type_to_arrow(primitive_type)?;
-
+            let field = Field::new(name, data_type, *nullable);
+            let extended_field = if format.as_deref() == Some("uuid") {
+                field.with_extension_type(arrow_schema::extension::Uuid)
+            } else {
+                field
+            };
+            Ok(extended_field)
         }
         SchemaNode::List {
             name,
@@ -671,6 +681,7 @@ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
         Date32 => DataType::Date32,
         TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
         TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
+        TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
         TimestampMillis(tz) => DataType::Timestamp(
             arrow_schema::TimeUnit::Millisecond,
             // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
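The widened flush condition consults two different accounting methods: in_progress_size estimates the encoded size of the unflushed row group, while memory_size also counts buffered, not-yet-encoded data, so either one crossing the threshold now triggers a flush. A standalone sketch of the policy, assuming the writer is parquet's ArrowWriter and the threshold is in bytes:

// Sketch only; mirrors the condition in the diff rather than the package's
// actual writer wrapper.
fn maybe_flush<W: std::io::Write + Send>(
    writer: &mut parquet::arrow::ArrowWriter<W>,
    memory_threshold: usize,
) -> parquet::errors::Result<()> {
    if writer.in_progress_size() >= memory_threshold
        || writer.memory_size() >= memory_threshold
    {
        writer.flush()?; // closes the current row group
    }
    Ok(())
}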
data/ext/parquet-core/tests/arrow_conversion_tests.rs
CHANGED
@@ -99,7 +99,7 @@ fn test_decimal256_large_values() {
 
     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +173,7 @@ fn test_timestamp_with_timezone() {
 
     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +209,7 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);
 
     // Verify roundtrip
-    let value = arrow_to_parquet_value(array.as_ref(), 0).unwrap();
+    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +357,7 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();
 
-    let result = arrow_to_parquet_value(&array, 0);
+    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
data/ext/parquet-ruby-adapter/src/converter.rs
CHANGED
@@ -4,13 +4,14 @@ use indexmap::IndexMap;
 use magnus::r_hash::ForEach;
 use magnus::value::ReprValue;
 use magnus::{
-    Error as MagnusError, IntoValue, Module, RArray, RHash, RString, Ruby, Symbol,
-    Value,
+    kwargs, Error as MagnusError, IntoValue, Module, RArray, RHash, RString, Ruby, Symbol,
+    TryConvert, Value,
 };
 use ordered_float::OrderedFloat;
 use parquet_core::{ParquetError, ParquetValue, Result};
 use std::cell::RefCell;
 use std::sync::Arc;
+use uuid::Uuid;
 
 /// Ruby value converter
 ///
@@ -41,27 +42,6 @@ impl RubyValueConverter {
             .map(|cache| cache.stats())
     }
 
-    /// Convert a Ruby value to ParquetValue with type hint
-    /// This is the primary conversion method that handles all Ruby types
-    pub fn to_parquet_with_type_hint(
-        &mut self,
-        value: Value,
-        type_hint: Option<&parquet_core::PrimitiveType>,
-    ) -> Result<ParquetValue> {
-        // Handle nil values
-        if value.is_nil() {
-            return Ok(ParquetValue::Null);
-        }
-
-        // If we have a type hint, use it to guide conversion
-        if let Some(hint) = type_hint {
-            return self.convert_with_type_hint(value, hint);
-        }
-
-        // Otherwise, infer type from Ruby value
-        self.infer_and_convert(value)
-    }
-
     /// Convert a Ruby value to ParquetValue with schema hint
     /// This handles both primitive and complex types
     pub fn to_parquet_with_schema_hint(
@@ -115,7 +95,7 @@ impl RubyValueConverter {
         use parquet_core::PrimitiveType::*;
 
         // Special handling for UUID format
-        if let (
+        if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
             return self.convert_to_uuid_binary(value);
         }
 
@@ -156,6 +136,7 @@ impl RubyValueConverter {
             Date64 => self.convert_to_date64(value, None),
             TimeMillis => self.convert_to_time_millis(value),
             TimeMicros => self.convert_to_time_micros(value),
+            TimeNanos => self.convert_to_time_nanos(value),
             TimestampSecond(schema_tz) => {
                 self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
             }
@@ -484,32 +465,19 @@ impl RubyValueConverter {
 
         // Convert value to string
         let uuid_str: String = value
-            .
-            .
+            .to_r_string()
+            .map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
+            })?
+            .to_string()
             .map_err(|e: MagnusError| {
                 ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
             })?;
 
-
-
-
-
-                "Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
-                clean_uuid.len()
-            )));
-        }
-
-        // Parse hex string to bytes
-        let mut bytes = Vec::with_capacity(16);
-        for i in 0..16 {
-            let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
-            let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
-                ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
-            })?;
-            bytes.push(byte);
-        }
-
-        Ok(ParquetValue::Bytes(bytes.into()))
+        let parsed = uuid::Uuid::parse_str(&uuid_str)
+            .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
+        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
+        Ok(ParquetValue::Bytes(bytes))
     }
 
     fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
@@ -692,6 +660,38 @@ impl RubyValueConverter {
         )))
     }
 
+    fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
+        // Convert to microseconds since midnight
+        let ruby = Ruby::get()
+            .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
+        if value.is_kind_of(ruby.class_time()) {
+            let hour: i64 = value
+                .funcall("hour", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let min: i64 = value
+                .funcall("min", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let sec: i64 = value
+                .funcall("sec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let nsec: i64 = value
+                .funcall("nsec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+
+            let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
+            return Ok(ParquetValue::TimeNanos(nanos));
+        }
+
+        Err(ParquetError::Conversion(format!(
+            "Cannot convert {} to time_micros",
+            value.class()
+        )))
+    }
+
     // Timestamp conversion methods that respect schema timezone
     fn convert_to_timestamp_second_with_tz(
         &self,
@@ -1399,21 +1399,11 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
         ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
         ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
         ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
-        ParquetValue::
-
-
-
-
-                "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
-                b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
-                b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
-            );
-            Ok(uuid_str.into_value_with(&ruby))
-        } else {
-            // Regular bytes - convert to string
-            Ok(ruby.str_from_slice(&b).as_value())
-        }
-        }
+        ParquetValue::Uuid(u) => Ok(u
+            .hyphenated()
+            .encode_lower(&mut Uuid::encode_buffer())
+            .into_value_with(&ruby)),
+        ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
         ParquetValue::Date32(days) => {
             // Convert days since epoch to Date object
            let _ = ruby.require("date");
@@ -1503,10 +1493,26 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             .funcall("utc", (year, month, day, hours, minutes, seconds, us))
             .map_err(|e| ParquetError::Conversion(e.to_string()))
         }
+        ParquetValue::TimeNanos(nanos) => {
+            let time_class = ruby.class_time();
+            let secs = nanos / 1_000_000_000;
+            let nsec = nanos % 1_000_000_000;
+            time_class
+                .funcall(
+                    "at",
+                    (
+                        secs,
+                        nsec,
+                        Symbol::new("nanosecond"),
+                        kwargs!("in" => "UTC"),
+                    ),
+                )
+                .map_err(|e| ParquetError::Conversion(e.to_string()))
+        }
         ParquetValue::TimestampSecond(secs, tz) => {
             let time_class = ruby.class_time();
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs,))
+                .funcall::<_, _, Value>("at", (secs, kwargs!("in" => "UTC")))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -1515,7 +1521,7 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             let secs = millis / 1000;
             let usec = (millis % 1000) * 1000; // Convert millisecond remainder to microseconds
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs, usec))
+                .funcall::<_, _, Value>("at", (secs, usec, kwargs!("in" => "UTC")))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -1524,7 +1530,7 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             let secs = micros / 1_000_000;
             let usec = micros % 1_000_000; // Already in microseconds
             let time = time_class
-                .funcall::<_, _, Value>("at", (secs, usec))
+                .funcall::<_, _, Value>("at", (secs, usec, kwargs!("in" => "UTC")))
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
@@ -1534,7 +1540,15 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
             let nsec = nanos % 1_000_000_000;
             // Use the nanosecond form of Time.at
             let time = time_class
-                .funcall::<_, _, Value>(
+                .funcall::<_, _, Value>(
+                    "at",
+                    (
+                        secs,
+                        nsec,
+                        Symbol::new("nanosecond"),
+                        kwargs!("in" => "UTC"),
+                    ),
+                )
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
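The UUID write path now delegates parsing to the uuid crate instead of hand-rolled hex decoding, and the read path renders ParquetValue::Uuid lowercase-hyphenated without allocating. A self-contained sketch of that round trip using only uuid-crate calls that appear in the diff:

use uuid::Uuid;

fn main() {
    // Write path: any spelling uuid accepts (uppercase, hyphenated, ...) -> 16 bytes.
    let parsed = Uuid::parse_str("550E8400-E29B-41D4-A716-446655440000").unwrap();
    let stored: [u8; 16] = *parsed.as_bytes(); // persisted as FixedLenByteArray(16)

    // Read path: bytes -> lowercase hyphenated string, as parquet_to_ruby now does.
    let back = Uuid::from_slice(&stored).unwrap();
    let mut buf = Uuid::encode_buffer();
    let s = back.hyphenated().encode_lower(&mut buf);
    assert_eq!(s, "550e8400-e29b-41d4-a716-446655440000"); // normalized to lowercase
}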
data/ext/parquet-ruby-adapter/src/metadata.rs
CHANGED
@@ -115,9 +115,111 @@ impl TryIntoValue for RubyParquetMetaData {
             .map_err(|e| {
                 RubyAdapterError::metadata(format!("Failed to set converted_type: {}", e))
             })?;
+
         if let Some(logical_type) = basic_info.logical_type() {
+            let logical_type_value = match logical_type {
+                parquet::basic::LogicalType::Decimal { scale, precision } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Decimal").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash.aset("scale", scale).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set scale: {}", e))
+                    })?;
+                    logical_hash.aset("precision", precision).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set precision: {}", e))
+                    })?;
+                    logical_hash.as_value()
+                }
+                parquet::basic::LogicalType::Time {
+                    is_adjusted_to_u_t_c,
+                    unit,
+                } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Time").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash
+                        .aset(
+                            "is_adjusted_to_utc",
+                            is_adjusted_to_u_t_c.to_string().as_str(),
+                        )
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!(
+                                "Failed to set is_adjusted_to_u_t_c: {}",
+                                e
+                            ))
+                        })?;
+
+                    let unit_str = match unit {
+                        parquet::basic::TimeUnit::MILLIS(_) => "millis",
+                        parquet::basic::TimeUnit::MICROS(_) => "micros",
+                        parquet::basic::TimeUnit::NANOS(_) => "nanos",
+                    };
+                    logical_hash.aset("unit", unit_str).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set unit: {}", e))
+                    })?;
+                    logical_hash.as_value()
+                }
+                parquet::basic::LogicalType::Timestamp {
+                    is_adjusted_to_u_t_c,
+                    unit,
+                } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Timestamp").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash
+                        .aset("is_adjusted_to_utc", is_adjusted_to_u_t_c)
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!(
+                                "Failed to set is_adjusted_to_u_t_c: {}",
+                                e
+                            ))
+                        })?;
+                    let unit_str = match unit {
+                        parquet::basic::TimeUnit::MILLIS(_) => "millis",
+                        parquet::basic::TimeUnit::MICROS(_) => "micros",
+                        parquet::basic::TimeUnit::NANOS(_) => "nanos",
+                    };
+                    logical_hash.aset("unit", unit_str).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set unit: {}", e))
+                    })?;
+                    logical_hash.as_value()
+                }
+                parquet::basic::LogicalType::Integer {
+                    bit_width,
+                    is_signed,
+                } => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash.aset("type", "Integer").map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                    })?;
+                    logical_hash.aset("bit_width", bit_width).map_err(|e| {
+                        RubyAdapterError::metadata(format!("Failed to set bit_width: {}", e))
+                    })?;
+                    logical_hash
+                        .aset("is_signed", is_signed.to_string().as_str())
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!(
+                                "Failed to set is_signed: {}",
+                                e
+                            ))
+                        })?;
+                    logical_hash.as_value()
+                }
+                _ => {
+                    let logical_hash = handle.hash_new();
+                    logical_hash
+                        .aset("type", format!("{:?}", logical_type))
+                        .map_err(|e| {
+                            RubyAdapterError::metadata(format!("Failed to set type: {}", e))
+                        })?;
+                    logical_hash.as_value()
+                }
+            };
             field_hash
-                .aset("logical_type",
+                .aset("logical_type", logical_type_value)
                 .map_err(|e| {
                     RubyAdapterError::metadata(format!("Failed to set logical_type: {}", e))
                 })?;
data/ext/parquet-ruby-adapter/src/reader.rs
CHANGED
@@ -2,6 +2,7 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
 use parquet_core::reader::Reader;
 
+use crate::StringCache;
 use crate::{
     converter::parquet_to_ruby,
     io::{RubyIOReader, ThreadSafeRubyIOReader},
@@ -101,6 +102,12 @@ pub fn each_row(
     })?;
     let mut row_count = 0u64;
 
+    let mut cache = StringCache::new(true);
+    let interned_column_names = column_names
+        .iter()
+        .map(|name| cache.intern(name.clone()))
+        .collect::<Vec<_>>();
+
     for row_result in row_iter {
         let row = row_result
             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
@@ -120,11 +127,11 @@ pub fn each_row(
                 ParserResultType::Hash => {
                     let hash: RHash = ruby.hash_new();
                     for (idx, value) in row.into_iter().enumerate() {
-                        if idx <
+                        if idx < interned_column_names.len() {
                             let ruby_value = parquet_to_ruby(value).map_err(|e| {
                                 MagnusError::new(ruby.exception_runtime_error(), e.to_string())
                             })?;
-                            hash.aset(
+                            hash.aset(interned_column_names[idx].as_ref(), ruby_value)?;
                         }
                     }
                     hash.as_value()
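Interning the column names once, before the row loop, means each row's Hash keys reuse one cached string per column instead of allocating a fresh one per cell. The same idea in standalone form (a sketch; the gem's StringCache, shown in the string_cache.rs diff below, interns into a global table rather than into Arc<str>):

use std::sync::Arc;

fn main() {
    let column_names = vec!["id".to_string(), "name".to_string()];

    // One shared allocation per column, created before iterating rows...
    let interned: Vec<Arc<str>> =
        column_names.iter().map(|n| Arc::from(n.as_str())).collect();

    // ...then every row clones a pointer, not the string data.
    for _row in 0..3 {
        for key in &interned {
            let _k = Arc::clone(key);
        }
    }
}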
data/ext/parquet-ruby-adapter/src/schema.rs
CHANGED
@@ -1,8 +1,9 @@
 use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
-use parquet_core::{ParquetError, PrimitiveType,
+use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
 
 use crate::utils::parse_string_or_symbol;
+use crate::RubyAdapterError;
 
 /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
 pub struct RubySchemaBuilder;
@@ -13,18 +14,18 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a Ruby schema definition (hash) into a SchemaNode
-    fn parse_schema_node(
+    fn parse_schema_node(
+        &self,
+        name: String,
+        schema_def: Value,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // If it's a Hash, parse it as a complex type
         if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
             return self.parse_hash_schema_node(name, hash);
         }
 
         // Otherwise, try to parse as a simple type symbol
-        if let Ok(
-            let type_str = type_sym.name().map_err(|e: MagnusError| {
-                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
-            })?;
-
+        if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
             // Check if it's a complex type with angle brackets
             if type_str.contains('<') {
                 return self.parse_complex_type_string(name, type_str.to_string(), true);
@@ -40,22 +41,24 @@ impl RubySchemaBuilder {
             });
         }
 
-        Err(
+        Err(RubyAdapterError::InvalidInput(format!(
             "Expected Hash or Symbol for schema definition, got {}",
             schema_def.class()
         )))
     }
 
     /// Parse a Ruby hash schema node
-    fn parse_hash_schema_node(
+    fn parse_hash_schema_node(
+        &self,
+        name: String,
+        hash: RHash,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // Get the type field
-        let type_sym:
-            .fetch::<_,
+        let type_sym: Value = hash
+            .fetch::<_, Value>(Symbol::new("type"))
             .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
 
-        let type_str = type_sym.
-            ParquetError::Conversion(format!("Failed to get type name: {}", e))
-        })?;
+        let type_str = type_sym.to_r_string()?.to_string()?;
 
         // Get nullable field (default to true)
         let nullable = hash
@@ -142,6 +145,15 @@ impl RubySchemaBuilder {
 
             // Primitive types
             primitive_type => {
+                if format.as_deref() == Some("uuid") {
+                    return Ok(SchemaNode::Primitive {
+                        name,
+                        primitive_type: PrimitiveType::FixedLenByteArray(16),
+                        nullable,
+                        format,
+                    });
+                }
+
                 // Get precision and scale for decimal types
                 let precision = hash
                     .fetch::<_, Value>(Symbol::new("precision"))
@@ -196,7 +208,7 @@ impl RubySchemaBuilder {
         name: String,
         type_str: String,
         nullable: bool,
-    ) -> Result<SchemaNode> {
+    ) -> Result<SchemaNode, RubyAdapterError> {
         if type_str.starts_with("list<") && type_str.ends_with('>') {
             let inner_type = &type_str[5..type_str.len() - 1];
             let item_name = format!("{}_item", name);
@@ -229,7 +241,7 @@ impl RubySchemaBuilder {
             let inner = &type_str[4..type_str.len() - 1];
             let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
             if parts.len() != 2 {
-                return Err(
+                return Err(RubyAdapterError::InvalidInput(format!(
                     "Invalid map type: {}",
                     type_str
                 )));
@@ -255,7 +267,7 @@ impl RubySchemaBuilder {
                 }),
             })
         } else {
-            Err(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown complex type: {}",
                 type_str
             )))
@@ -263,7 +275,7 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a field definition from a Ruby hash
-    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
+    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
         let name: String = field_hash
             .fetch(Symbol::new("name"))
             .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
@@ -274,7 +286,7 @@ impl RubySchemaBuilder {
             self.parse_schema_node(name, field_hash.as_value())
         } else {
             // This might be a simplified definition - look for known field patterns
-            Err(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Field '{}' missing 'type' definition",
                 name
             )))
@@ -288,7 +300,7 @@ impl RubySchemaBuilder {
         precision: Option<u8>,
         scale: Option<i8>,
         timezone: Option<String>,
-    ) -> Result<PrimitiveType> {
+    ) -> Result<PrimitiveType, RubyAdapterError> {
         // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
         if type_str.starts_with("decimal(") && type_str.ends_with(')') {
             let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
@@ -324,6 +336,14 @@ impl RubySchemaBuilder {
             }
         }
 
+        if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
+            let params = &type_str[20..type_str.len() - 1];
+            let len = params.parse::<i32>().map_err(|_| {
+                ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
+            })?;
+            return Ok(PrimitiveType::FixedLenByteArray(len));
+        }
+
         match type_str.as_str() {
             "boolean" | "bool" => Ok(PrimitiveType::Boolean),
             "int8" => Ok(PrimitiveType::Int8),
@@ -356,8 +376,9 @@ impl RubySchemaBuilder {
                 // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
                 Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
             }
-            "
-            "
+            "time_millis" => Ok(PrimitiveType::TimeMillis),
+            "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_nanos" => Ok(PrimitiveType::TimeNanos),
             "decimal" => {
                 // Use provided precision/scale or defaults
                 let p = precision.unwrap_or(38);
@@ -380,7 +401,7 @@ impl RubySchemaBuilder {
                 let s = scale.unwrap_or(0);
                 Ok(PrimitiveType::Decimal256(p, s))
             }
-            _ => Err(
+            _ => Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown primitive type: {}",
                 type_str
             ))),
@@ -396,7 +417,7 @@ impl Default for RubySchemaBuilder {
 
 /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
 /// and Ruby Value is not Send/Sync
-pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
     let builder = RubySchemaBuilder::new();
 
     // The Ruby schema should be a hash with a root struct
@@ -430,7 +451,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     let mut unique_names = std::collections::HashSet::new();
     for name in &field_names {
         if !unique_names.insert(name) {
-            return Err(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Duplicate field names in root level schema: {:?}",
                 field_names
             )));
@@ -443,7 +464,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
             fields: field_nodes,
         }
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
            "Schema must have 'type' or 'fields' key".to_string(),
        ));
    };
@@ -452,18 +473,18 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     parquet_core::SchemaBuilder::new()
         .with_root(root_node)
         .build()
-        .map_err(|e|
+        .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
 }
 
 /// Convert a Parquet schema back to Ruby representation
-pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
     let ruby = Ruby::get()
         .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
 
     schema_node_to_ruby(&schema.root, &ruby)
 }
 
-fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value, RubyAdapterError> {
     let hash = RHash::new();
 
     match node {
@@ -552,6 +573,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
             PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
             PrimitiveType::TimeMillis => Symbol::new("time_millis"),
             PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+            PrimitiveType::TimeNanos => Symbol::new("time_nanos"),
             PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
             PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
             PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
@@ -597,7 +619,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
 /// Convert old schema format to new format
 /// Old: [{ "column_name" => "type" }, ...]
 /// New: [{ name: "column_name", type: :type }, ...]
-pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
+pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
     let new_schema = RArray::new();
 
     for item in schema.into_iter() {
@@ -630,7 +652,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
         );
 
         if let Err(e) = process_result {
-            return Err(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Failed to process field: {}",
                 e
             )));
@@ -645,7 +667,7 @@ pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray> {
 }
 
 /// Check if schema is in new DSL format (hash with type: :struct)
-pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
     if !schema_value.is_kind_of(ruby.class_hash()) {
         return Ok(false);
     }
@@ -678,7 +700,7 @@ pub fn process_schema_value(
     ruby: &Ruby,
     schema_value: Value,
     data_array: Option<&RArray>,
-) -> Result<Value> {
+) -> Result<Value, RubyAdapterError> {
     // Check if it's the new DSL format
     if is_dsl_schema(ruby, schema_value)? {
         // For DSL format, pass it directly to ruby_schema_to_parquet
@@ -716,7 +738,7 @@ pub fn process_schema_value(
             convert_legacy_schema(ruby, array)?
         }
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
            "schema array must contain hashes".to_string(),
        ));
    }
@@ -733,13 +755,13 @@ pub fn process_schema_value(
             ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
         })?
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
            "schema hash must have 'fields' key or be in DSL format with 'type' key"
                .to_string(),
        ));
    }
    } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
            "schema must be nil, an array, or a hash".to_string(),
        ));
    };
@@ -748,7 +770,7 @@ pub fn process_schema_value(
     if schema_array.is_empty() {
         if let Some(data) = data_array {
             if data.is_empty() {
-                return Err(
+                return Err(RubyAdapterError::InvalidInput(
                     "Cannot infer schema from empty data".to_string(),
                 ));
             }
@@ -767,7 +789,7 @@ pub fn process_schema_value(
             })?;
             first_array.len()
         } else {
-            return Err(
+            return Err(RubyAdapterError::InvalidInput(
                 "First data item must be an array".to_string(),
             ));
         };
@@ -793,7 +815,7 @@ pub fn process_schema_value(
 
         schema_array = new_schema;
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema is required when data is not provided for inference".to_string(),
         ));
     }
data/ext/parquet-ruby-adapter/src/string_cache.rs
CHANGED
@@ -1,15 +1,15 @@
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, LazyLock, Mutex};
 
 use magnus::RString;
 
+static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
 /// A cache for interning strings in the Ruby VM to reduce memory usage
 /// when there are many repeated strings
 #[derive(Debug)]
 pub struct StringCache {
-    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
-    /// while maintaining a single global cache
-    cache: Arc<Mutex<HashMap<String, &'static str>>>,
     enabled: bool,
     hits: Arc<Mutex<usize>>,
     misses: Arc<Mutex<usize>>,
@@ -19,7 +19,6 @@ impl StringCache {
     /// Create a new string cache
     pub fn new(enabled: bool) -> Self {
         Self {
-            cache: Arc::new(Mutex::new(HashMap::new())),
             enabled,
             hits: Arc::new(Mutex::new(0)),
             misses: Arc::new(Mutex::new(0)),
@@ -36,9 +35,9 @@ impl StringCache {
 
         // Try to get or create the interned string
         let result = (|| -> Result<(), String> {
-            let mut cache =
+            let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
 
-            if cache.contains_key(
+            if cache.contains_key(s.as_str()) {
                 let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                 *hits += 1;
             } else {
@@ -65,7 +64,6 @@ impl StringCache {
 
     /// Get cache statistics
     pub fn stats(&self) -> CacheStats {
-        let cache_size =
+        let cache_size = STRING_CACHE.lock().map(|c| c.len()).unwrap_or(0);
         let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
         let misses = self.misses.lock().map(|m| *m).unwrap_or(0);
 
@@ -84,7 +83,7 @@ impl StringCache {
 
     /// Clear the cache
     pub fn clear(&mut self) {
-        if let Ok(mut cache) =
+        if let Ok(mut cache) = STRING_CACHE.lock() {
             cache.clear();
         }
         if let Ok(mut hits) = self.hits.lock() {
data/ext/parquet-ruby-adapter/src/utils.rs
CHANGED
@@ -26,6 +26,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
     ParquetValue::Float64(_) => 8,
     ParquetValue::String(s) => s.len() + 24, // String overhead
     ParquetValue::Bytes(b) => b.len() + 24,  // Vec overhead
+    ParquetValue::Uuid(_) => 16,
     ParquetValue::Date32(_) => 4,
     ParquetValue::Date64(_) => 8,
     ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
@@ -36,6 +37,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
     ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
     ParquetValue::TimeMillis(_) => 4,
     ParquetValue::TimeMicros(_) => 8,
+    ParquetValue::TimeNanos(_) => 8,
     ParquetValue::List(items) => {
         24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
     }
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.6.1
+  version: 0.7.0
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-
+date: 2025-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys