parquet 0.6.0 → 0.6.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +5 -0
- data/ext/parquet/src/adapter_ffi.rs +16 -10
- data/ext/parquet-core/Cargo.toml +2 -2
- data/ext/parquet-core/src/arrow_conversion.rs +35 -11
- data/ext/parquet-core/src/reader.rs +7 -4
- data/ext/parquet-core/src/schema.rs +2 -0
- data/ext/parquet-core/src/value.rs +7 -0
- data/ext/parquet-core/src/writer.rs +14 -3
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +4 -4
- data/ext/parquet-ruby-adapter/Cargo.toml +1 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +53 -59
- data/ext/parquet-ruby-adapter/src/reader.rs +9 -2
- data/ext/parquet-ruby-adapter/src/schema.rs +69 -40
- data/ext/parquet-ruby-adapter/src/string_cache.rs +8 -9
- data/ext/parquet-ruby-adapter/src/utils.rs +17 -0
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: dfd19103b2414e7feeaa6d1ec3c9a9c25ce42cf5c8362baa37e3b9d8d5245f82
+  data.tar.gz: c5c1170dbdc3635577738a568688c36adc9670710f4b0d570fae29294e337754
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: c9bf72b4e708c750ab7ae30afd97aef7f456a4249904fe3eb74f916557e28ca1a53bc262a6492db38c38162ab3e3f684e30f0c70dabbaf8f8f4145ef4d9af259
+  data.tar.gz: 164c5b0569d3d13242bcff7c09d66edf67b279d8289f97def043000a508b4333dd1387a4f47517be09023cd270cf3b6dfd57fdf658ac1b52e25f3f5b2b5ca30c
data/Cargo.lock
CHANGED
@@ -225,6 +225,10 @@ dependencies = [
 name = "arrow-schema"
 version = "55.2.0"
 source = "git+https://github.com/njaremko/arrow-rs?branch=nathan_06-24-remove_primitive_map_key_assertion_on_record_reader#54858bf019ff3faeb8f5b562da8c01012162aef0"
+dependencies = [
+ "serde",
+ "serde_json",
+]
 
 [[package]]
 name = "arrow-select"
@@ -1092,6 +1096,7 @@ dependencies = [
  "rb-sys-env 0.2.2",
  "tempfile",
  "thiserror",
+ "uuid",
 ]
 
 [[package]]
data/ext/parquet/src/adapter_ffi.rs
CHANGED
@@ -1,6 +1,6 @@
 use magnus::scan_args::{get_kwargs, scan_args};
-use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, Ruby, Value};
+use parquet_ruby_adapter::utils::parse_string_or_symbol;
 use parquet_ruby_adapter::{
     logger::RubyLogger, types::ParserResultType, utils::parse_parquet_write_args,
 };
@@ -34,11 +34,14 @@ pub fn each_row(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     )?;
 
     let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
-        rt_value
-            .
-
+        parse_string_or_symbol(&ruby, rt_value)?
+            .ok_or_else(|| {
+                MagnusError::new(magnus::exception::arg_error(), "result_type cannot be nil")
+            })?
             .parse()
-            .map_err(|
+            .map_err(|_| {
+                MagnusError::new(magnus::exception::arg_error(), "Invalid result_type value")
+            })?
     } else {
         ParserResultType::Hash
     };
@@ -89,11 +92,14 @@ pub fn each_column(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError>
     )?;
 
     let result_type: ParserResultType = if let Some(rt_value) = kwargs.optional.0.flatten() {
-        rt_value
-            .
-
+        parse_string_or_symbol(&ruby, rt_value)?
+            .ok_or_else(|| {
+                MagnusError::new(magnus::exception::arg_error(), "result_type cannot be nil")
+            })?
            .parse()
-            .map_err(|
+            .map_err(|_| {
+                MagnusError::new(magnus::exception::arg_error(), "Invalid result_type value")
+            })?
     } else {
         ParserResultType::Hash
     };
@@ -101,7 +107,7 @@ pub fn each_column(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError>
     let batch_size = if let Some(bs) = kwargs.optional.2.flatten() {
         if bs == 0 {
             return Err(MagnusError::new(
-
+                magnus::exception::arg_error(),
                 "batch_size must be greater than 0",
             ));
         }
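
Note: both each_row and each_column now route result_type through parse_string_or_symbol (added to utils.rs, below), so the option accepts a Ruby String or Symbol and surfaces nil or unrecognized values as ArgumentError instead of failing a hard Symbol cast. A minimal pure-Rust sketch of that parse-then-default control flow, using a stand-in enum for ParserResultType and String for the error type (both assumptions, not the gem's definitions):

    use std::str::FromStr;

    #[derive(Debug, PartialEq)]
    enum ResultType { Hash, Array } // stand-in for the gem's ParserResultType

    impl FromStr for ResultType {
        type Err = ();
        fn from_str(s: &str) -> Result<Self, ()> {
            match s {
                "hash" => Ok(ResultType::Hash),
                "array" => Ok(ResultType::Array),
                _ => Err(()),
            }
        }
    }

    // Absent option -> default Hash; present but invalid -> descriptive error.
    fn parse_result_type(raw: Option<&str>) -> Result<ResultType, String> {
        match raw {
            None => Ok(ResultType::Hash),
            Some(s) => s.parse().map_err(|()| "Invalid result_type value".to_string()),
        }
    }

    fn main() {
        assert_eq!(parse_result_type(None), Ok(ResultType::Hash));
        assert_eq!(parse_result_type(Some("array")), Ok(ResultType::Array));
        assert!(parse_result_type(Some("bogus")).is_err());
    }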
data/ext/parquet-core/Cargo.toml
CHANGED
@@ -7,7 +7,7 @@ edition = "2021"
 arrow = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
 arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
-arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader" }
+arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24-remove_primitive_map_key_assertion_on_record_reader", features = ["canonical_extension_types"]}
 bytes = "1.5"
 indexmap = "2.2"
 jiff = "0.2"
@@ -17,7 +17,7 @@ parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan_06-24
 rand = "0.9.1"
 serde = { version = "1.0", features = ["derive"] }
 thiserror = "2.0"
+uuid = { version = "1.0", features = ["v4"] }
 
 [dev-dependencies]
-uuid = { version = "1.0", features = ["v4"] }
 tempfile = "3.8"
data/ext/parquet-core/src/arrow_conversion.rs
CHANGED
@@ -7,6 +7,7 @@
 
 use crate::{ParquetError, ParquetValue, Result};
 use arrow_array::{builder::*, Array, ArrayRef, ListArray, MapArray, StructArray};
+use arrow_schema::extension::Uuid as ArrowUuid;
 use arrow_schema::{DataType, Field};
 use bytes::Bytes;
 use indexmap::IndexMap;
@@ -14,7 +15,11 @@ use ordered_float::OrderedFloat;
 use std::sync::Arc;
 
 /// Convert a single value from an Arrow array at the given index to a ParquetValue
-pub fn arrow_to_parquet_value(
+pub fn arrow_to_parquet_value(
+    field: &Field,
+    array: &dyn Array,
+    index: usize,
+) -> Result<ParquetValue> {
     use arrow_array::*;
 
     if array.is_null(index) {
@@ -72,7 +77,6 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Float64Array>(array)?;
             Ok(ParquetValue::Float64(OrderedFloat(array.value(index))))
         }
-
         // String and binary types
         DataType::Utf8 => {
             let array = downcast_array::<StringArray>(array)?;
@@ -86,9 +90,15 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
         DataType::FixedSizeBinary(_) => {
            let array = downcast_array::<FixedSizeBinaryArray>(array)?;
-
-
-
+            let value = array.value(index);
+            match field.try_extension_type::<ArrowUuid>() {
+                Ok(_) => {
+                    let uuid = uuid::Uuid::from_slice(value)
+                        .map_err(|e| ParquetError::Conversion(format!("Invalid UUID: {}", e)))?;
+                    Ok(ParquetValue::Uuid(uuid))
+                }
+                Err(_) => Ok(ParquetValue::Bytes(Bytes::copy_from_slice(value))),
+            }
         }
 
         // Date and time types
@@ -140,6 +150,10 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let array = downcast_array::<Time64MicrosecondArray>(array)?;
             Ok(ParquetValue::TimeMicros(array.value(index)))
         }
+        arrow_schema::TimeUnit::Nanosecond => {
+            let array = downcast_array::<Time64NanosecondArray>(array)?;
+            Ok(ParquetValue::TimeNanos(array.value(index)))
+        }
         _ => Err(ParquetError::Conversion(format!(
             "Unsupported time64 unit: {:?}",
             unit
@@ -173,13 +187,13 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
         }
 
         // Complex types
-        DataType::List(
+        DataType::List(item_field) => {
             let array = downcast_array::<ListArray>(array)?;
             let list_values = array.value(index);
 
             let mut values = Vec::with_capacity(list_values.len());
             for i in 0..list_values.len() {
-                values.push(arrow_to_parquet_value(&list_values, i)?);
+                values.push(arrow_to_parquet_value(item_field, &list_values, i)?);
             }
 
             Ok(ParquetValue::List(values))
@@ -192,10 +206,20 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let keys = map_value.column(0);
             let values = map_value.column(1);
 
+            let key_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "key")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
+            let value_field = map_value
+                .fields()
+                .iter().find(|f| f.name() == "value")
+                .ok_or_else(|| ParquetError::Conversion("No value field found".to_string()))?;
+
             let mut map_vec = Vec::with_capacity(keys.len());
             for i in 0..keys.len() {
-                let key = arrow_to_parquet_value(keys, i)?;
-                let value = arrow_to_parquet_value(values, i)?;
+                let key = arrow_to_parquet_value(key_field, keys, i)?;
+                let value = arrow_to_parquet_value(value_field, values, i)?;
                 map_vec.push((key, value));
             }
 
@@ -207,7 +231,7 @@ pub fn arrow_to_parquet_value(array: &dyn Array, index: usize) -> Result<Parquet
             let mut map = IndexMap::new();
             for (col_idx, field) in array.fields().iter().enumerate() {
                 let column = array.column(col_idx);
-                let value = arrow_to_parquet_value(column, index)?;
+                let value = arrow_to_parquet_value(field, column, index)?;
                 map.insert(Arc::from(field.name().as_str()), value);
             }
 
@@ -1108,7 +1132,7 @@ mod tests {
         let array = parquet_values_to_arrow_array(values.clone(), &field).unwrap();
 
         for (i, expected) in values.iter().enumerate() {
-            let actual = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+            let actual = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
            assert_eq!(&actual, expected);
         }
     }
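
Note: the signature change above threads the Arrow Field into arrow_to_parquet_value so the FixedSizeBinary branch can consult extension-type metadata. A standalone sketch (not gem code) of that gating, assuming arrow-schema is built with the canonical_extension_types feature that the Cargo.toml change above enables:

    use arrow_schema::extension::Uuid as ArrowUuid;
    use arrow_schema::{DataType, Field};

    fn main() {
        // A bare 16-byte binary field carries no extension metadata,
        // so its values would decode as ParquetValue::Bytes.
        let plain = Field::new("blob", DataType::FixedSizeBinary(16), false);
        assert!(plain.try_extension_type::<ArrowUuid>().is_err());

        // Tagging the field with the canonical UUID extension type is what
        // flips the branch above to ParquetValue::Uuid.
        let tagged = plain.with_extension_type(ArrowUuid);
        assert!(tagged.try_extension_type::<ArrowUuid>().is_ok());
    }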
data/ext/parquet-core/src/reader.rs
CHANGED
@@ -163,8 +163,10 @@ where
         // Extract values from current row
         let mut row_values = Vec::with_capacity(batch.num_columns());
 
-
-
+        let schema = batch.schema();
+        for (i, column) in batch.columns().iter().enumerate() {
+            let field = schema.field(i);
+            let value = match arrow_to_parquet_value(field, column, self.current_row) {
                 Ok(v) => v,
                 Err(e) => return Some(Err(e)),
             };
@@ -228,12 +230,13 @@ where
         let mut columns = Vec::with_capacity(batch.num_columns());
 
         for (idx, column) in batch.columns().iter().enumerate() {
-            let
+            let field = self.schema.field(idx);
+            let column_name = field.name().to_string();
 
             // Convert entire column to ParquetValues
             let mut values = Vec::with_capacity(column.len());
             for row_idx in 0..column.len() {
-                match arrow_to_parquet_value(column, row_idx) {
+                match arrow_to_parquet_value(field, column, row_idx) {
                     Ok(value) => values.push(value),
                     Err(e) => return Some(Err(e)),
                 }
data/ext/parquet-core/src/schema.rs
CHANGED
@@ -72,6 +72,7 @@ pub enum PrimitiveType {
     TimestampNanos(Option<Arc<str>>),
     TimeMillis,
     TimeMicros,
+    TimeNanos,
 
     // Fixed-length byte array
     FixedLenByteArray(i32),
@@ -146,6 +147,7 @@ impl PrimitiveType {
             PrimitiveType::TimestampNanos(_) => "TimestampNanos",
             PrimitiveType::TimeMillis => "TimeMillis",
             PrimitiveType::TimeMicros => "TimeMicros",
+            PrimitiveType::TimeNanos => "TimeNanos",
             PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
         }
     }
data/ext/parquet-core/src/value.rs
CHANGED
@@ -2,6 +2,7 @@ use bytes::Bytes;
 use indexmap::IndexMap;
 use num::BigInt;
 use std::sync::Arc;
+use uuid::Uuid;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub enum ParquetValue {
@@ -22,6 +23,7 @@ pub enum ParquetValue {
     Boolean(bool),
     String(Arc<str>),
     Bytes(Bytes),
+    Uuid(Uuid),
 
     // Date/Time types
     Date32(i32), // Days since epoch
@@ -40,6 +42,7 @@ pub enum ParquetValue {
     // Time types
     TimeMillis(i32), // Time of day in milliseconds since midnight
     TimeMicros(i64), // Time of day in microseconds since midnight
+    TimeNanos(i64), // Time of day in nanoseconds since midnight
 
     // Complex types
     List(Vec<ParquetValue>),
@@ -68,6 +71,7 @@ impl std::hash::Hash for ParquetValue {
             ParquetValue::Boolean(b) => b.hash(state),
             ParquetValue::String(s) => s.hash(state),
             ParquetValue::Bytes(b) => b.hash(state),
+            ParquetValue::Uuid(u) => u.hash(state),
             ParquetValue::Date32(d) => d.hash(state),
             ParquetValue::Date64(d) => d.hash(state),
             ParquetValue::Decimal128(d, scale) => {
@@ -96,6 +100,7 @@ impl std::hash::Hash for ParquetValue {
             }
             ParquetValue::TimeMillis(t) => t.hash(state),
             ParquetValue::TimeMicros(t) => t.hash(state),
+            ParquetValue::TimeNanos(t) => t.hash(state),
             ParquetValue::List(l) => l.hash(state),
             ParquetValue::Map(m) => m.hash(state),
             ParquetValue::Record(r) => {
@@ -133,6 +138,7 @@ impl ParquetValue {
             ParquetValue::Boolean(_) => "Boolean",
             ParquetValue::String(_) => "String",
             ParquetValue::Bytes(_) => "Bytes",
+            ParquetValue::Uuid(_) => "Uuid",
             ParquetValue::Date32(_) => "Date32",
             ParquetValue::Date64(_) => "Date64",
             ParquetValue::Decimal128(_, _) => "Decimal128",
@@ -143,6 +149,7 @@ impl ParquetValue {
             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
             ParquetValue::TimeMillis(_) => "TimeMillis",
             ParquetValue::TimeMicros(_) => "TimeMicros",
+            ParquetValue::TimeNanos(_) => "TimeNanos",
             ParquetValue::List(_) => "List",
             ParquetValue::Map(_) => "Map",
             ParquetValue::Record(_) => "Record",
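
Note: the new TimeNanos variant stores nanoseconds since midnight in an i64; a full day is 86,400 x 10^9 ns, comfortably within range. A small self-contained sketch of the arithmetic the adapter uses in both directions (composing the value from a Ruby Time's components, then splitting it back for Time.at(secs, nsec, :nanosecond)):

    fn main() {
        // Forward: hour/min/sec/nsec -> nanoseconds since midnight.
        let (hour, min, sec, nsec): (i64, i64, i64, i64) = (13, 37, 42, 123_456_789);
        let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;

        // Backward: whole seconds plus a nanosecond remainder.
        let secs = nanos / 1_000_000_000;
        let rem = nanos % 1_000_000_000;
        assert_eq!(secs, 13 * 3600 + 37 * 60 + 42);
        assert_eq!(rem, 123_456_789);
    }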
data/ext/parquet-core/src/writer.rs
CHANGED
@@ -235,6 +235,7 @@ where
         (Date64(_), DataType::Date64) => 8,
         (TimeMillis(_), DataType::Time32(_)) => 4,
         (TimeMicros(_), DataType::Time64(_)) => 8,
+        (TimeNanos(_), DataType::Time64(_)) => 8,
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => 8,
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => 8,
@@ -364,7 +365,9 @@ where
             writer.write(&batch)?;
 
             // Check if we need to flush based on memory usage
-            if writer.in_progress_size() >= self.memory_threshold
+            if writer.in_progress_size() >= self.memory_threshold
+                || writer.memory_size() >= self.memory_threshold
+            {
                 writer.flush()?;
             }
         } else {
@@ -496,6 +499,7 @@ fn validate_value_against_field(value: &ParquetValue, field: &Field, path: &str)
         (Date64(_), DataType::Date64) => Ok(()),
         (TimeMillis(_), DataType::Time32(_)) => Ok(()),
         (TimeMicros(_), DataType::Time64(_)) => Ok(()),
+        (TimeNanos(_), DataType::Time64(_)) => Ok(()),
         (TimestampSecond(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMillis(_, _), DataType::Timestamp(_, _)) => Ok(()),
         (TimestampMicros(_, _), DataType::Timestamp(_, _)) => Ok(()),
@@ -591,10 +595,16 @@ fn schema_node_to_arrow_field(node: &SchemaNode) -> Result<Field> {
             name,
             primitive_type,
             nullable,
-
+            format,
         } => {
             let data_type = primitive_type_to_arrow(primitive_type)?;
-
+            let field = Field::new(name, data_type, *nullable);
+            let extended_field = if format.as_deref() == Some("uuid") {
+                field.with_extension_type(arrow_schema::extension::Uuid)
+            } else {
+                field
+            };
+            Ok(extended_field)
         }
         SchemaNode::List {
             name,
@@ -671,6 +681,7 @@ fn primitive_type_to_arrow(ptype: &crate::PrimitiveType) -> Result<DataType> {
         Date32 => DataType::Date32,
         TimeMillis => DataType::Time32(arrow_schema::TimeUnit::Millisecond),
         TimeMicros => DataType::Time64(arrow_schema::TimeUnit::Microsecond),
+        TimeNanos => DataType::Time64(arrow_schema::TimeUnit::Nanosecond),
         TimestampMillis(tz) => DataType::Timestamp(
             arrow_schema::TimeUnit::Millisecond,
             // PARQUET SPEC: ANY timezone (e.g., "+09:00", "America/New_York") means
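
Note: the widened flush condition now consults two gauges and flushes when either crosses the threshold. A stub sketch of the heuristic (this StubWriter is an assumption standing in for parquet's ArrowWriter, whose in_progress_size and memory_size methods the real code calls):

    struct StubWriter {
        in_progress: usize, // encoded-but-unflushed row-group bytes
        memory: usize,      // total buffered memory
    }

    impl StubWriter {
        fn in_progress_size(&self) -> usize { self.in_progress }
        fn memory_size(&self) -> usize { self.memory }
    }

    fn should_flush(w: &StubWriter, threshold: usize) -> bool {
        w.in_progress_size() >= threshold || w.memory_size() >= threshold
    }

    fn main() {
        // Previously only in_progress_size was checked; now high buffered
        // memory alone is enough to trigger a flush.
        let w = StubWriter { in_progress: 10, memory: 900 };
        assert!(should_flush(&w, 512));
    }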
data/ext/parquet-core/tests/arrow_conversion_tests.rs
CHANGED
@@ -99,7 +99,7 @@ fn test_decimal256_large_values() {
 
     // Verify roundtrip
     for i in 0..4 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match (i, value) {
             (0, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_positive.clone()),
             (1, ParquetValue::Decimal256(v, _)) => assert_eq!(v, large_negative.clone()),
@@ -173,7 +173,7 @@ fn test_timestamp_with_timezone() {
 
     // Verify roundtrip preserves timezone
     for i in 0..3 {
-        let value = arrow_to_parquet_value(array.as_ref(), i).unwrap();
+        let value = arrow_to_parquet_value(&field, array.as_ref(), i).unwrap();
         match value {
             ParquetValue::TimestampMillis(_, Some(tz)) => {
                 assert_eq!(tz.as_ref(), "America/New_York");
@@ -209,7 +209,7 @@ fn test_nested_list_of_lists() {
     assert_eq!(array.len(), 1);
 
     // Verify roundtrip
-    let value = arrow_to_parquet_value(array.as_ref(), 0).unwrap();
+    let value = arrow_to_parquet_value(&outer_field, array.as_ref(), 0).unwrap();
     match value {
         ParquetValue::List(items) => assert_eq!(items.len(), 5),
         _ => panic!("Expected list"),
@@ -357,7 +357,7 @@ fn test_unsupported_arrow_types() {
     )
     .unwrap();
 
-    let result = arrow_to_parquet_value(&array, 0);
+    let result = arrow_to_parquet_value(&Field::new("int", DataType::Int32, false), &array, 0);
     assert!(result.is_err());
     assert!(result
         .unwrap_err()
data/ext/parquet-ruby-adapter/src/converter.rs
CHANGED
@@ -41,27 +41,6 @@ impl RubyValueConverter {
             .map(|cache| cache.stats())
     }
 
-    /// Convert a Ruby value to ParquetValue with type hint
-    /// This is the primary conversion method that handles all Ruby types
-    pub fn to_parquet_with_type_hint(
-        &mut self,
-        value: Value,
-        type_hint: Option<&parquet_core::PrimitiveType>,
-    ) -> Result<ParquetValue> {
-        // Handle nil values
-        if value.is_nil() {
-            return Ok(ParquetValue::Null);
-        }
-
-        // If we have a type hint, use it to guide conversion
-        if let Some(hint) = type_hint {
-            return self.convert_with_type_hint(value, hint);
-        }
-
-        // Otherwise, infer type from Ruby value
-        self.infer_and_convert(value)
-    }
-
     /// Convert a Ruby value to ParquetValue with schema hint
     /// This handles both primitive and complex types
     pub fn to_parquet_with_schema_hint(
@@ -115,7 +94,7 @@ impl RubyValueConverter {
         use parquet_core::PrimitiveType::*;
 
         // Special handling for UUID format
-        if let (
+        if let (FixedLenByteArray(16), Some("uuid")) = (type_hint, format) {
             return self.convert_to_uuid_binary(value);
         }
 
@@ -156,6 +135,7 @@ impl RubyValueConverter {
             Date64 => self.convert_to_date64(value, None),
             TimeMillis => self.convert_to_time_millis(value),
             TimeMicros => self.convert_to_time_micros(value),
+            TimeNanos => self.convert_to_time_nanos(value),
             TimestampSecond(schema_tz) => {
                 self.convert_to_timestamp_second_with_tz(value, schema_tz.as_deref())
             }
@@ -484,32 +464,19 @@ impl RubyValueConverter {
 
         // Convert value to string
         let uuid_str: String = value
-            .
-            .
+            .to_r_string()
+            .map_err(|e: MagnusError| {
+                ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
+            })?
+            .to_string()
             .map_err(|e: MagnusError| {
                 ParquetError::Conversion(format!("Failed to convert to UUID string: {}", e))
             })?;
 
-
-
-
-
-                "Invalid UUID format: expected 32 hex characters (ignoring hyphens), got {}",
-                clean_uuid.len()
-            )));
-        }
-
-        // Parse hex string to bytes
-        let mut bytes = Vec::with_capacity(16);
-        for i in 0..16 {
-            let hex_byte = &clean_uuid[i * 2..i * 2 + 2];
-            let byte = u8::from_str_radix(hex_byte, 16).map_err(|_| {
-                ParquetError::Conversion(format!("Invalid hex character in UUID: {}", hex_byte))
-            })?;
-            bytes.push(byte);
-        }
-
-        Ok(ParquetValue::Bytes(bytes.into()))
+        let parsed = uuid::Uuid::parse_str(&uuid_str)
+            .map_err(|e| ParquetError::Conversion(format!("Failed to parse UUID: {}", e)))?;
+        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
+        Ok(ParquetValue::Bytes(bytes))
     }
 
     fn convert_to_date32(&self, value: Value, date_format: Option<&str>) -> Result<ParquetValue> {
@@ -692,6 +659,38 @@ impl RubyValueConverter {
         )))
     }
 
+    fn convert_to_time_nanos(&self, value: Value) -> Result<ParquetValue> {
+        if value.is_nil() {
+            return Ok(ParquetValue::Null);
+        }
+
+        // Convert to microseconds since midnight
+        let ruby = Ruby::get()
+            .map_err(|_| ParquetError::Conversion("Failed to get Ruby runtime".to_string()))?;
+        if value.is_kind_of(ruby.class_time()) {
+            let hour: i64 = value
+                .funcall("hour", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let min: i64 = value
+                .funcall("min", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let sec: i64 = value
+                .funcall("sec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+            let nsec: i64 = value
+                .funcall("nsec", ())
+                .map_err(|e| ParquetError::Conversion(e.to_string()))?;
+
+            let nanos = (hour * 3600 + min * 60 + sec) * 1_000_000_000 + nsec;
+            return Ok(ParquetValue::TimeNanos(nanos));
+        }
+
+        Err(ParquetError::Conversion(format!(
+            "Cannot convert {} to time_micros",
+            value.class()
+        )))
+    }
+
     // Timestamp conversion methods that respect schema timezone
     fn convert_to_timestamp_second_with_tz(
         &self,
@@ -1399,21 +1398,8 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
         ParquetValue::Float32(OrderedFloat(f)) => Ok((f as f64).into_value_with(&ruby)),
         ParquetValue::Float64(OrderedFloat(f)) => Ok(f.into_value_with(&ruby)),
         ParquetValue::String(s) => Ok(s.into_value_with(&ruby)),
-        ParquetValue::
-
-            if b.len() == 16 {
-                // Format as UUID string
-                let uuid_str = format!(
-                    "{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}",
-                    b[0], b[1], b[2], b[3], b[4], b[5], b[6], b[7],
-                    b[8], b[9], b[10], b[11], b[12], b[13], b[14], b[15]
-                );
-                Ok(uuid_str.into_value_with(&ruby))
-            } else {
-                // Regular bytes - convert to string
-                Ok(ruby.str_from_slice(&b).as_value())
-            }
-        }
+        ParquetValue::Uuid(u) => Ok(u.to_string().into_value_with(&ruby)),
+        ParquetValue::Bytes(b) => Ok(ruby.enc_str_new(&b, ruby.ascii8bit_encoding()).as_value()),
         ParquetValue::Date32(days) => {
             // Convert days since epoch to Date object
             let _ = ruby.require("date");
@@ -1528,6 +1514,14 @@ pub fn parquet_to_ruby(value: ParquetValue) -> Result<Value> {
                 .map_err(|e| ParquetError::Conversion(e.to_string()))?;
             apply_timezone(time, &tz)
         }
+        ParquetValue::TimeNanos(nanos) => {
+            let time_class = ruby.class_time();
+            let secs = nanos / 1_000_000_000;
+            let nsec = nanos % 1_000_000_000;
+            time_class
+                .funcall("at", (secs, nsec, Symbol::new("nanosecond")))
+                .map_err(|e| ParquetError::Conversion(e.to_string()))
+        }
         ParquetValue::TimestampNanos(nanos, tz) => {
             let time_class = ruby.class_time();
             let secs = nanos / 1_000_000_000;
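
Note: convert_to_uuid_binary now delegates validation to the uuid crate instead of the removed hand-rolled hex loop, and parquet_to_ruby renders ParquetValue::Uuid via Uuid::to_string. A standalone sketch of that round trip using the same crate calls the diff relies on:

    use bytes::Bytes;
    use uuid::Uuid;

    fn main() -> Result<(), uuid::Error> {
        // parse_str accepts the usual hyphenated form (and also braced and
        // URN forms, which the removed hex parser did not).
        let parsed = Uuid::parse_str("550e8400-e29b-41d4-a716-446655440000")?;

        // Write side: 16 raw bytes for the FixedLenByteArray(16) column.
        let bytes = Bytes::copy_from_slice(parsed.as_bytes());
        assert_eq!(bytes.len(), 16);

        // Read side: to_string restores the canonical lowercase form.
        assert_eq!(parsed.to_string(), "550e8400-e29b-41d4-a716-446655440000");
        Ok(())
    }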
data/ext/parquet-ruby-adapter/src/reader.rs
CHANGED
@@ -2,6 +2,7 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
 use parquet_core::reader::Reader;
 
+use crate::StringCache;
 use crate::{
     converter::parquet_to_ruby,
     io::{RubyIOReader, ThreadSafeRubyIOReader},
@@ -101,6 +102,12 @@ pub fn each_row(
     })?;
     let mut row_count = 0u64;
 
+    let mut cache = StringCache::new(true);
+    let interned_column_names = column_names
+        .iter()
+        .map(|name| cache.intern(name.clone()))
+        .collect::<Vec<_>>();
+
     for row_result in row_iter {
         let row = row_result
             .map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
@@ -120,11 +127,11 @@ pub fn each_row(
             ParserResultType::Hash => {
                 let hash: RHash = ruby.hash_new();
                 for (idx, value) in row.into_iter().enumerate() {
-                    if idx <
+                    if idx < interned_column_names.len() {
                         let ruby_value = parquet_to_ruby(value).map_err(|e| {
                             MagnusError::new(ruby.exception_runtime_error(), e.to_string())
                         })?;
-                        hash.aset(
+                        hash.aset(interned_column_names[idx].as_ref(), ruby_value)?;
                     }
                 }
                 hash.as_value()
data/ext/parquet-ruby-adapter/src/schema.rs
CHANGED
@@ -1,6 +1,9 @@
 use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value};
-use parquet_core::{ParquetError, PrimitiveType,
+use parquet_core::{ParquetError, PrimitiveType, Schema, SchemaNode};
+
+use crate::utils::parse_string_or_symbol;
+use crate::RubyAdapterError;
 
 /// Ruby schema builder that converts Ruby hash/array representations to Parquet schemas
 pub struct RubySchemaBuilder;
@@ -11,18 +14,18 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a Ruby schema definition (hash) into a SchemaNode
-    fn parse_schema_node(
+    fn parse_schema_node(
+        &self,
+        name: String,
+        schema_def: Value,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // If it's a Hash, parse it as a complex type
         if let Ok(hash) = <RHash as TryConvert>::try_convert(schema_def) {
             return self.parse_hash_schema_node(name, hash);
         }
 
         // Otherwise, try to parse as a simple type symbol
-        if let Ok(
-            let type_str = type_sym.name().map_err(|e: MagnusError| {
-                ParquetError::Conversion(format!("Failed to get symbol name: {}", e))
-            })?;
-
+        if let Ok(type_str) = schema_def.to_r_string()?.to_string() {
             // Check if it's a complex type with angle brackets
             if type_str.contains('<') {
                 return self.parse_complex_type_string(name, type_str.to_string(), true);
@@ -38,22 +41,24 @@ impl RubySchemaBuilder {
             });
         }
 
-        Err(
+        Err(RubyAdapterError::InvalidInput(format!(
             "Expected Hash or Symbol for schema definition, got {}",
             schema_def.class()
         )))
     }
 
     /// Parse a Ruby hash schema node
-    fn parse_hash_schema_node(
+    fn parse_hash_schema_node(
+        &self,
+        name: String,
+        hash: RHash,
+    ) -> Result<SchemaNode, RubyAdapterError> {
         // Get the type field
-        let type_sym:
-            .fetch::<_,
+        let type_sym: Value = hash
+            .fetch::<_, Value>(Symbol::new("type"))
             .map_err(|e| ParquetError::Schema(format!("Schema missing 'type' field: {}", e)))?;
 
-        let type_str = type_sym.
-            ParquetError::Conversion(format!("Failed to get type name: {}", e))
-        })?;
+        let type_str = type_sym.to_r_string()?.to_string()?;
 
         // Get nullable field (default to true)
         let nullable = hash
@@ -140,6 +145,15 @@ impl RubySchemaBuilder {
 
             // Primitive types
             primitive_type => {
+                if format.as_deref() == Some("uuid") {
+                    return Ok(SchemaNode::Primitive {
+                        name,
+                        primitive_type: PrimitiveType::FixedLenByteArray(16),
+                        nullable,
+                        format,
+                    });
+                }
+
                 // Get precision and scale for decimal types
                 let precision = hash
                     .fetch::<_, Value>(Symbol::new("precision"))
@@ -194,7 +208,7 @@ impl RubySchemaBuilder {
         name: String,
         type_str: String,
         nullable: bool,
-    ) -> Result<SchemaNode> {
+    ) -> Result<SchemaNode, RubyAdapterError> {
         if type_str.starts_with("list<") && type_str.ends_with('>') {
             let inner_type = &type_str[5..type_str.len() - 1];
             let item_name = format!("{}_item", name);
@@ -227,7 +241,7 @@ impl RubySchemaBuilder {
             let inner = &type_str[4..type_str.len() - 1];
             let parts: Vec<&str> = inner.split(',').map(|s| s.trim()).collect();
             if parts.len() != 2 {
-                return Err(
+                return Err(RubyAdapterError::InvalidInput(format!(
                     "Invalid map type: {}",
                     type_str
                 )));
@@ -253,7 +267,7 @@ impl RubySchemaBuilder {
                 }),
             })
         } else {
-            Err(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown complex type: {}",
                 type_str
             )))
@@ -261,7 +275,7 @@ impl RubySchemaBuilder {
     }
 
     /// Parse a field definition from a Ruby hash
-    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode> {
+    fn parse_field_definition(&self, field_hash: RHash) -> Result<SchemaNode, RubyAdapterError> {
         let name: String = field_hash
             .fetch(Symbol::new("name"))
             .map_err(|e| ParquetError::Schema(format!("Field missing 'name': {}", e)))?;
@@ -272,7 +286,7 @@ impl RubySchemaBuilder {
             self.parse_schema_node(name, field_hash.as_value())
         } else {
             // This might be a simplified definition - look for known field patterns
-            Err(
+            Err(RubyAdapterError::InvalidInput(format!(
                 "Field '{}' missing 'type' definition",
                 name
             )))
@@ -286,7 +300,7 @@ impl RubySchemaBuilder {
         precision: Option<u8>,
         scale: Option<i8>,
         timezone: Option<String>,
-    ) -> Result<PrimitiveType> {
+    ) -> Result<PrimitiveType, RubyAdapterError> {
         // Check if it's a decimal type with parentheses notation like "decimal(5,2)"
         if type_str.starts_with("decimal(") && type_str.ends_with(')') {
             let params = &type_str[8..type_str.len() - 1]; // Extract "5,2" from "decimal(5,2)"
@@ -322,6 +336,14 @@ impl RubySchemaBuilder {
             }
         }
 
+        if type_str.starts_with("fixed_len_byte_array(") && type_str.ends_with(')') {
+            let params = &type_str[20..type_str.len() - 1];
+            let len = params.parse::<i32>().map_err(|_| {
+                ParquetError::Schema(format!("Invalid fixed_len_byte_array length: {}", params))
+            })?;
+            return Ok(PrimitiveType::FixedLenByteArray(len));
+        }
+
         match type_str.as_str() {
             "boolean" | "bool" => Ok(PrimitiveType::Boolean),
             "int8" => Ok(PrimitiveType::Int8),
@@ -354,8 +376,9 @@ impl RubySchemaBuilder {
                 // PARQUET SPEC: timezone presence means UTC storage (isAdjustedToUTC = true)
                 Ok(PrimitiveType::TimestampNanos(timezone.map(Into::into)))
             }
-            "
-            "
+            "time_millis" => Ok(PrimitiveType::TimeMillis),
+            "time_micros" => Ok(PrimitiveType::TimeMicros),
+            "time_nanos" => Ok(PrimitiveType::TimeNanos),
             "decimal" => {
                 // Use provided precision/scale or defaults
                 let p = precision.unwrap_or(38);
@@ -378,7 +401,7 @@ impl RubySchemaBuilder {
                 let s = scale.unwrap_or(0);
                 Ok(PrimitiveType::Decimal256(p, s))
             }
-            _ => Err(
+            _ => Err(RubyAdapterError::InvalidInput(format!(
                 "Unknown primitive type: {}",
                 type_str
             ))),
@@ -394,7 +417,7 @@ impl Default for RubySchemaBuilder {
 
 /// Wrapper functions for Ruby FFI since SchemaBuilderTrait requires Send + Sync
 /// and Ruby Value is not Send/Sync
-pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
+pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema, RubyAdapterError> {
     let builder = RubySchemaBuilder::new();
 
     // The Ruby schema should be a hash with a root struct
@@ -428,7 +451,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     let mut unique_names = std::collections::HashSet::new();
     for name in &field_names {
         if !unique_names.insert(name) {
-            return Err(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Duplicate field names in root level schema: {:?}",
                 field_names
             )));
@@ -441,7 +464,7 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
             fields: field_nodes,
         }
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
             "Schema must have 'type' or 'fields' key".to_string(),
         ));
     };
@@ -450,18 +473,18 @@ pub fn ruby_schema_to_parquet(schema_def: Value) -> Result<Schema> {
     parquet_core::SchemaBuilder::new()
         .with_root(root_node)
         .build()
-        .map_err(|e|
+        .map_err(|e| RubyAdapterError::InvalidInput(e.to_string()))
 }
 
 /// Convert a Parquet schema back to Ruby representation
-pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value> {
+pub fn parquet_schema_to_ruby(schema: &Schema) -> Result<Value, RubyAdapterError> {
     let ruby = Ruby::get()
         .map_err(|e| ParquetError::Conversion(format!("Failed to get Ruby runtime: {}", e)))?;
 
     schema_node_to_ruby(&schema.root, &ruby)
 }
 
-fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
+fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value, RubyAdapterError> {
     let hash = RHash::new();
 
     match node {
@@ -550,6 +573,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
             PrimitiveType::TimestampNanos(_) => Symbol::new("timestamp_nanos"),
             PrimitiveType::TimeMillis => Symbol::new("time_millis"),
             PrimitiveType::TimeMicros => Symbol::new("time_micros"),
+            PrimitiveType::TimeNanos => Symbol::new("time_nanos"),
             PrimitiveType::Decimal128(_, _) => Symbol::new("decimal128"),
             PrimitiveType::Decimal256(_, _) => Symbol::new("decimal256"),
             PrimitiveType::FixedLenByteArray(_) => Symbol::new("fixed_len_byte_array"),
@@ -595,7 +619,7 @@ fn schema_node_to_ruby(node: &SchemaNode, _ruby: &Ruby) -> Result<Value> {
 /// Convert old schema format to new format
 /// Old: [{ "column_name" => "type" }, ...]
 /// New: [{ name: "column_name", type: :type }, ...]
-pub fn convert_legacy_schema(
+pub fn convert_legacy_schema(ruby: &Ruby, schema: RArray) -> Result<RArray, RubyAdapterError> {
     let new_schema = RArray::new();
 
     for item in schema.into_iter() {
@@ -609,7 +633,12 @@ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
             |key: Value,
              value: Value|
             -> std::result::Result<magnus::r_hash::ForEach, MagnusError> {
-                let key_str: String =
+                let key_str: String = parse_string_or_symbol(ruby, key)?.ok_or_else(|| {
+                    MagnusError::new(
+                        magnus::exception::arg_error(),
+                        "Nil keys not allowed in schema",
+                    )
+                })?;
                 let type_str: String = TryConvert::try_convert(value)?;
 
                 new_field.aset(Symbol::new("name"), key_str)?;
@@ -623,7 +652,7 @@ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
         );
 
         if let Err(e) = process_result {
-            return Err(
+            return Err(RubyAdapterError::InvalidInput(format!(
                 "Failed to process field: {}",
                 e
            )));
@@ -638,7 +667,7 @@ pub fn convert_legacy_schema(_ruby: &Ruby, schema: RArray) -> Result<RArray> {
 }
 
 /// Check if schema is in new DSL format (hash with type: :struct)
-pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool> {
+pub fn is_dsl_schema(ruby: &Ruby, schema_value: Value) -> Result<bool, RubyAdapterError> {
     if !schema_value.is_kind_of(ruby.class_hash()) {
         return Ok(false);
     }
@@ -671,7 +700,7 @@ pub fn process_schema_value(
     ruby: &Ruby,
     schema_value: Value,
     data_array: Option<&RArray>,
-) -> Result<Value> {
+) -> Result<Value, RubyAdapterError> {
     // Check if it's the new DSL format
     if is_dsl_schema(ruby, schema_value)? {
         // For DSL format, pass it directly to ruby_schema_to_parquet
@@ -709,7 +738,7 @@ pub fn process_schema_value(
             convert_legacy_schema(ruby, array)?
         }
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
            "schema array must contain hashes".to_string(),
         ));
     }
@@ -726,13 +755,13 @@ pub fn process_schema_value(
             ParquetError::Schema(format!("Failed to convert fields to array: {}", e))
         })?
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
             "schema hash must have 'fields' key or be in DSL format with 'type' key"
                 .to_string(),
         ));
     }
 } else {
-    return Err(
+    return Err(RubyAdapterError::InvalidInput(
         "schema must be nil, an array, or a hash".to_string(),
     ));
 };
@@ -741,7 +770,7 @@ pub fn process_schema_value(
     if schema_array.is_empty() {
         if let Some(data) = data_array {
             if data.is_empty() {
-                return Err(
+                return Err(RubyAdapterError::InvalidInput(
                     "Cannot infer schema from empty data".to_string(),
                 ));
             }
@@ -760,7 +789,7 @@ pub fn process_schema_value(
         })?;
         first_array.len()
     } else {
-        return Err(
+        return Err(RubyAdapterError::InvalidInput(
            "First data item must be an array".to_string(),
         ));
     };
@@ -786,7 +815,7 @@ pub fn process_schema_value(
 
     schema_array = new_schema;
 } else {
-    return Err(
+    return Err(RubyAdapterError::InvalidInput(
         "Schema is required when data is not provided for inference".to_string(),
     ));
 }
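
Note: a column declared with format: "uuid" now short-circuits to FixedLenByteArray(16) regardless of the accompanying primitive type, and writer.rs (above) later tags the resulting Arrow field with the canonical UUID extension type. A sketch of what such a node collapses to, using stand-in types that mirror SchemaNode::Primitive and PrimitiveType (assumptions, not the gem's definitions):

    #[derive(Debug, PartialEq)]
    enum PrimitiveType { FixedLenByteArray(i32) }

    #[derive(Debug, PartialEq)]
    struct Primitive {
        name: String,
        primitive_type: PrimitiveType,
        nullable: bool,
        format: Option<String>,
    }

    fn uuid_node(name: &str, nullable: bool) -> Primitive {
        Primitive {
            name: name.to_string(),
            // Any primitive declared with format: "uuid" becomes a
            // 16-byte fixed-length binary, matching the short-circuit above.
            primitive_type: PrimitiveType::FixedLenByteArray(16),
            nullable,
            format: Some("uuid".to_string()),
        }
    }

    fn main() {
        let node = uuid_node("id", false);
        assert_eq!(node.primitive_type, PrimitiveType::FixedLenByteArray(16));
    }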
data/ext/parquet-ruby-adapter/src/string_cache.rs
CHANGED
@@ -1,15 +1,15 @@
 use std::collections::HashMap;
-use std::sync::{Arc, Mutex};
+use std::sync::{Arc, LazyLock, Mutex};
 
 use magnus::RString;
 
+static STRING_CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
 /// A cache for interning strings in the Ruby VM to reduce memory usage
 /// when there are many repeated strings
 #[derive(Debug)]
 pub struct StringCache {
-    /// The actual cache is shared behind an Arc<Mutex> to allow cloning
-    /// while maintaining a single global cache
-    cache: Arc<Mutex<HashMap<String, &'static str>>>,
     enabled: bool,
     hits: Arc<Mutex<usize>>,
     misses: Arc<Mutex<usize>>,
@@ -19,7 +19,6 @@ impl StringCache {
     /// Create a new string cache
     pub fn new(enabled: bool) -> Self {
         Self {
-            cache: Arc::new(Mutex::new(HashMap::new())),
             enabled,
             hits: Arc::new(Mutex::new(0)),
             misses: Arc::new(Mutex::new(0)),
@@ -36,9 +35,9 @@ impl StringCache {
 
         // Try to get or create the interned string
         let result = (|| -> Result<(), String> {
-            let mut cache =
+            let mut cache = STRING_CACHE.lock().map_err(|e| e.to_string())?;
 
-            if cache.contains_key(
+            if cache.contains_key(s.as_str()) {
                 let mut hits = self.hits.lock().map_err(|e| e.to_string())?;
                 *hits += 1;
             } else {
@@ -65,7 +64,7 @@ impl StringCache {
 
     /// Get cache statistics
     pub fn stats(&self) -> CacheStats {
-        let cache_size =
+        let cache_size = STRING_CACHE.lock().map(|c| c.len()).unwrap_or(0);
         let hits = self.hits.lock().map(|h| *h).unwrap_or(0);
         let misses = self.misses.lock().map(|m| *m).unwrap_or(0);
 
@@ -84,7 +83,7 @@ impl StringCache {
 
     /// Clear the cache
     pub fn clear(&mut self) {
-        if let Ok(mut cache) =
+        if let Ok(mut cache) = STRING_CACHE.lock() {
            cache.clear();
        }
        if let Ok(mut hits) = self.hits.lock() {
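
Note: the cache moves from a per-instance Arc<Mutex<HashMap>> field to a process-wide LazyLock static, so every StringCache handle now shares one map while hit/miss counters stay per instance. A minimal sketch of the pattern (usize values stand in for the interned &'static str pointers):

    use std::collections::HashMap;
    use std::sync::{LazyLock, Mutex};

    // One process-wide map, lazily created on first use - the same shape
    // as the new STRING_CACHE static.
    static CACHE: LazyLock<Mutex<HashMap<String, usize>>> =
        LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));

    fn intern(s: &str) -> usize {
        let mut cache = CACHE.lock().expect("cache poisoned");
        let next_id = cache.len();
        *cache.entry(s.to_string()).or_insert(next_id)
    }

    fn main() {
        // Distinct call sites (analogous to separate StringCache handles)
        // observe the same underlying entries.
        assert_eq!(intern("column_a"), intern("column_a"));
    }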
data/ext/parquet-ruby-adapter/src/utils.rs
CHANGED
@@ -26,6 +26,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::Float64(_) => 8,
         ParquetValue::String(s) => s.len() + 24, // String overhead
         ParquetValue::Bytes(b) => b.len() + 24,  // Vec overhead
+        ParquetValue::Uuid(_) => 16,
         ParquetValue::Date32(_) => 4,
         ParquetValue::Date64(_) => 8,
         ParquetValue::Decimal128(_, _) => 16 + 1, // value + scale
@@ -36,6 +37,7 @@ pub fn estimate_parquet_value_size(value: &ParquetValue) -> usize {
         ParquetValue::TimestampNanos(_, tz) => 8 + tz.as_ref().map_or(0, |s| s.len() + 24),
         ParquetValue::TimeMillis(_) => 4,
         ParquetValue::TimeMicros(_) => 8,
+        ParquetValue::TimeNanos(_) => 8,
         ParquetValue::List(items) => {
             24 + items.iter().map(estimate_parquet_value_size).sum::<usize>()
         }
@@ -122,6 +124,21 @@ pub fn parse_parquet_write_args(
     })
 }
 
+/// Convert a Ruby Value to a String, handling both String and Symbol types
+pub fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, MagnusError> {
+    if value.is_nil() {
+        Ok(None)
+    } else if value.is_kind_of(ruby.class_string()) || value.is_kind_of(ruby.class_symbol()) {
+        let stringed = value.to_r_string()?.to_string()?;
+        Ok(Some(stringed))
+    } else {
+        Err(MagnusError::new(
+            magnus::exception::type_error(),
+            "Value must be a String or Symbol",
+        ))
+    }
+}
+
 /// Handle block or enumerator creation
 pub fn handle_block_or_enum<F, T>(
     block_given: bool,
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.6.
+  version: 0.6.2
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-07-
+date: 2025-07-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys