parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,151 @@
|
|
1
|
+
use crate::SchemaNode;
|
2
|
+
|
3
|
+
/// Trait for schema introspection
|
4
|
+
///
|
5
|
+
/// This trait provides methods for examining and querying schemas
|
6
|
+
/// without modifying them.
|
7
|
+
pub trait SchemaInspector {
|
8
|
+
/// Get the total number of fields (including nested)
|
9
|
+
fn field_count(&self) -> usize;
|
10
|
+
|
11
|
+
/// Get field by path (e.g., "address.city")
|
12
|
+
fn get_field_by_path(&self, path: &str) -> Option<&SchemaNode>;
|
13
|
+
|
14
|
+
/// Check if schema contains a specific field
|
15
|
+
fn has_field(&self, name: &str) -> bool;
|
16
|
+
|
17
|
+
/// Get all field paths in the schema
|
18
|
+
fn all_field_paths(&self) -> Vec<String>;
|
19
|
+
}
|
20
|
+
|
21
|
+
impl SchemaInspector for crate::Schema {
|
22
|
+
fn field_count(&self) -> usize {
|
23
|
+
count_fields(&self.root)
|
24
|
+
}
|
25
|
+
|
26
|
+
fn get_field_by_path(&self, path: &str) -> Option<&SchemaNode> {
|
27
|
+
get_field_by_path(&self.root, path)
|
28
|
+
}
|
29
|
+
|
30
|
+
fn has_field(&self, name: &str) -> bool {
|
31
|
+
self.get_field_by_path(name).is_some()
|
32
|
+
}
|
33
|
+
|
34
|
+
fn all_field_paths(&self) -> Vec<String> {
|
35
|
+
let mut paths = Vec::new();
|
36
|
+
collect_field_paths(&self.root, String::new(), &mut paths);
|
37
|
+
paths
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
// Helper functions for schema inspection
|
42
|
+
fn count_fields(node: &SchemaNode) -> usize {
|
43
|
+
match node {
|
44
|
+
SchemaNode::Struct { fields, .. } => 1 + fields.iter().map(count_fields).sum::<usize>(),
|
45
|
+
SchemaNode::List { item, .. } => 1 + count_fields(item),
|
46
|
+
SchemaNode::Map { key, value, .. } => 1 + count_fields(key) + count_fields(value),
|
47
|
+
SchemaNode::Primitive { .. } => 1,
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
fn get_field_by_path<'a>(node: &'a SchemaNode, path: &str) -> Option<&'a SchemaNode> {
|
52
|
+
let parts: Vec<&str> = path.split('.').collect();
|
53
|
+
get_field_by_path_parts(node, &parts)
|
54
|
+
}
|
55
|
+
|
56
|
+
fn get_field_by_path_parts<'a>(node: &'a SchemaNode, parts: &[&str]) -> Option<&'a SchemaNode> {
|
57
|
+
if parts.is_empty() {
|
58
|
+
return Some(node);
|
59
|
+
}
|
60
|
+
|
61
|
+
let first = parts[0];
|
62
|
+
let rest = &parts[1..];
|
63
|
+
|
64
|
+
match node {
|
65
|
+
SchemaNode::Struct { fields, .. } => fields
|
66
|
+
.iter()
|
67
|
+
.find(|f| f.name() == first)
|
68
|
+
.and_then(|f| get_field_by_path_parts(f, rest)),
|
69
|
+
SchemaNode::List { item, .. } if first == "item" => get_field_by_path_parts(item, rest),
|
70
|
+
SchemaNode::Map { key, value, .. } => match first {
|
71
|
+
"key" => get_field_by_path_parts(key, rest),
|
72
|
+
"value" => get_field_by_path_parts(value, rest),
|
73
|
+
_ => None,
|
74
|
+
},
|
75
|
+
_ => None,
|
76
|
+
}
|
77
|
+
}
|
78
|
+
|
79
|
+
fn collect_field_paths(node: &SchemaNode, prefix: String, paths: &mut Vec<String>) {
|
80
|
+
let current_path = if prefix.is_empty() {
|
81
|
+
node.name().to_string()
|
82
|
+
} else {
|
83
|
+
format!("{}.{}", prefix, node.name())
|
84
|
+
};
|
85
|
+
|
86
|
+
paths.push(current_path.clone());
|
87
|
+
|
88
|
+
match node {
|
89
|
+
SchemaNode::Struct { fields, .. } => {
|
90
|
+
for field in fields {
|
91
|
+
collect_field_paths(field, current_path.clone(), paths);
|
92
|
+
}
|
93
|
+
}
|
94
|
+
SchemaNode::List { item, .. } => {
|
95
|
+
collect_field_paths(item, format!("{}.item", current_path), paths);
|
96
|
+
}
|
97
|
+
SchemaNode::Map { key, value, .. } => {
|
98
|
+
collect_field_paths(key, format!("{}.key", current_path), paths);
|
99
|
+
collect_field_paths(value, format!("{}.value", current_path), paths);
|
100
|
+
}
|
101
|
+
SchemaNode::Primitive { .. } => {}
|
102
|
+
}
|
103
|
+
}
|
104
|
+
|
105
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{PrimitiveType, SchemaBuilder as CoreSchemaBuilder};

    /// Builds `root { id: Int64, address { city: String } }`.
    fn sample_root() -> SchemaNode {
        let id = SchemaNode::Primitive {
            name: "id".to_string(),
            primitive_type: PrimitiveType::Int64,
            nullable: false,
            format: None,
        };
        let city = SchemaNode::Primitive {
            name: "city".to_string(),
            primitive_type: PrimitiveType::String,
            nullable: true,
            format: None,
        };
        let address = SchemaNode::Struct {
            name: "address".to_string(),
            nullable: true,
            fields: vec![city],
        };

        SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![id, address],
        }
    }

    #[test]
    fn test_schema_inspector() {
        let schema = CoreSchemaBuilder::new()
            .with_root(sample_root())
            .build()
            .unwrap();

        // Every node is counted: root, id, address, city.
        assert_eq!(schema.field_count(), 4);

        // Lookups by top-level name and by nested path.
        assert!(schema.has_field("id"));
        assert!(schema.has_field("address"));
        assert!(schema.has_field("address.city"));
        assert!(!schema.has_field("missing"));

        // A nested lookup returns the leaf node itself.
        let city = schema.get_field_by_path("address.city").unwrap();
        assert_eq!(city.name(), "city");
    }
}
|
@@ -0,0 +1,209 @@
|
|
1
|
+
use bytes::Bytes;
|
2
|
+
use indexmap::IndexMap;
|
3
|
+
use num::BigInt;
|
4
|
+
use std::sync::Arc;
|
5
|
+
|
6
|
+
#[derive(Debug, Clone, PartialEq, Eq)]
|
7
|
+
pub enum ParquetValue {
|
8
|
+
// Numeric types
|
9
|
+
Int8(i8),
|
10
|
+
Int16(i16),
|
11
|
+
Int32(i32),
|
12
|
+
Int64(i64),
|
13
|
+
UInt8(u8),
|
14
|
+
UInt16(u16),
|
15
|
+
UInt32(u32),
|
16
|
+
UInt64(u64),
|
17
|
+
Float16(ordered_float::OrderedFloat<f32>), // f16 converted to f32
|
18
|
+
Float32(ordered_float::OrderedFloat<f32>),
|
19
|
+
Float64(ordered_float::OrderedFloat<f64>),
|
20
|
+
|
21
|
+
// Basic types
|
22
|
+
Boolean(bool),
|
23
|
+
String(Arc<str>),
|
24
|
+
Bytes(Bytes),
|
25
|
+
|
26
|
+
// Date/Time types
|
27
|
+
Date32(i32), // Days since epoch
|
28
|
+
Date64(i64), // Milliseconds since epoch
|
29
|
+
|
30
|
+
// Decimal types
|
31
|
+
Decimal128(i128, i8), // value, scale
|
32
|
+
Decimal256(BigInt, i8), // Using BigInt instead of arrow_buffer::i256 for pure Rust
|
33
|
+
|
34
|
+
// Timestamp types - all store microseconds since epoch with optional timezone
|
35
|
+
TimestampSecond(i64, Option<Arc<str>>),
|
36
|
+
TimestampMillis(i64, Option<Arc<str>>),
|
37
|
+
TimestampMicros(i64, Option<Arc<str>>),
|
38
|
+
TimestampNanos(i64, Option<Arc<str>>),
|
39
|
+
|
40
|
+
// Time types
|
41
|
+
TimeMillis(i32), // Time of day in milliseconds since midnight
|
42
|
+
TimeMicros(i64), // Time of day in microseconds since midnight
|
43
|
+
|
44
|
+
// Complex types
|
45
|
+
List(Vec<ParquetValue>),
|
46
|
+
Map(Vec<(ParquetValue, ParquetValue)>), // Using Vec of tuples for deterministic ordering
|
47
|
+
Record(IndexMap<Arc<str>, ParquetValue>), // For struct/record types, preserves field order
|
48
|
+
|
49
|
+
// Null value
|
50
|
+
Null,
|
51
|
+
}
|
52
|
+
|
53
|
+
impl std::hash::Hash for ParquetValue {
|
54
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
55
|
+
std::mem::discriminant(self).hash(state);
|
56
|
+
match self {
|
57
|
+
ParquetValue::Int8(i) => i.hash(state),
|
58
|
+
ParquetValue::Int16(i) => i.hash(state),
|
59
|
+
ParquetValue::Int32(i) => i.hash(state),
|
60
|
+
ParquetValue::Int64(i) => i.hash(state),
|
61
|
+
ParquetValue::UInt8(i) => i.hash(state),
|
62
|
+
ParquetValue::UInt16(i) => i.hash(state),
|
63
|
+
ParquetValue::UInt32(i) => i.hash(state),
|
64
|
+
ParquetValue::UInt64(i) => i.hash(state),
|
65
|
+
ParquetValue::Float16(f) => f.hash(state),
|
66
|
+
ParquetValue::Float32(f) => f.hash(state),
|
67
|
+
ParquetValue::Float64(f) => f.hash(state),
|
68
|
+
ParquetValue::Boolean(b) => b.hash(state),
|
69
|
+
ParquetValue::String(s) => s.hash(state),
|
70
|
+
ParquetValue::Bytes(b) => b.hash(state),
|
71
|
+
ParquetValue::Date32(d) => d.hash(state),
|
72
|
+
ParquetValue::Date64(d) => d.hash(state),
|
73
|
+
ParquetValue::Decimal128(d, scale) => {
|
74
|
+
d.hash(state);
|
75
|
+
scale.hash(state);
|
76
|
+
}
|
77
|
+
ParquetValue::Decimal256(d, scale) => {
|
78
|
+
d.hash(state);
|
79
|
+
scale.hash(state);
|
80
|
+
}
|
81
|
+
ParquetValue::TimestampSecond(ts, tz) => {
|
82
|
+
ts.hash(state);
|
83
|
+
tz.hash(state);
|
84
|
+
}
|
85
|
+
ParquetValue::TimestampMillis(ts, tz) => {
|
86
|
+
ts.hash(state);
|
87
|
+
tz.hash(state);
|
88
|
+
}
|
89
|
+
ParquetValue::TimestampMicros(ts, tz) => {
|
90
|
+
ts.hash(state);
|
91
|
+
tz.hash(state);
|
92
|
+
}
|
93
|
+
ParquetValue::TimestampNanos(ts, tz) => {
|
94
|
+
ts.hash(state);
|
95
|
+
tz.hash(state);
|
96
|
+
}
|
97
|
+
ParquetValue::TimeMillis(t) => t.hash(state),
|
98
|
+
ParquetValue::TimeMicros(t) => t.hash(state),
|
99
|
+
ParquetValue::List(l) => l.hash(state),
|
100
|
+
ParquetValue::Map(m) => m.hash(state),
|
101
|
+
ParquetValue::Record(r) => {
|
102
|
+
// IndexMap preserves insertion order, so hash is deterministic
|
103
|
+
for (k, v) in r {
|
104
|
+
k.hash(state);
|
105
|
+
v.hash(state);
|
106
|
+
}
|
107
|
+
}
|
108
|
+
ParquetValue::Null => 0_i32.hash(state),
|
109
|
+
}
|
110
|
+
}
|
111
|
+
}
|
112
|
+
|
113
|
+
impl ParquetValue {
|
114
|
+
/// Check if the value is null
|
115
|
+
pub fn is_null(&self) -> bool {
|
116
|
+
matches!(self, ParquetValue::Null)
|
117
|
+
}
|
118
|
+
|
119
|
+
/// Get the type name of the value
|
120
|
+
pub fn type_name(&self) -> &'static str {
|
121
|
+
match self {
|
122
|
+
ParquetValue::Int8(_) => "Int8",
|
123
|
+
ParquetValue::Int16(_) => "Int16",
|
124
|
+
ParquetValue::Int32(_) => "Int32",
|
125
|
+
ParquetValue::Int64(_) => "Int64",
|
126
|
+
ParquetValue::UInt8(_) => "UInt8",
|
127
|
+
ParquetValue::UInt16(_) => "UInt16",
|
128
|
+
ParquetValue::UInt32(_) => "UInt32",
|
129
|
+
ParquetValue::UInt64(_) => "UInt64",
|
130
|
+
ParquetValue::Float16(_) => "Float16",
|
131
|
+
ParquetValue::Float32(_) => "Float32",
|
132
|
+
ParquetValue::Float64(_) => "Float64",
|
133
|
+
ParquetValue::Boolean(_) => "Boolean",
|
134
|
+
ParquetValue::String(_) => "String",
|
135
|
+
ParquetValue::Bytes(_) => "Bytes",
|
136
|
+
ParquetValue::Date32(_) => "Date32",
|
137
|
+
ParquetValue::Date64(_) => "Date64",
|
138
|
+
ParquetValue::Decimal128(_, _) => "Decimal128",
|
139
|
+
ParquetValue::Decimal256(_, _) => "Decimal256",
|
140
|
+
ParquetValue::TimestampSecond(_, _) => "TimestampSecond",
|
141
|
+
ParquetValue::TimestampMillis(_, _) => "TimestampMillis",
|
142
|
+
ParquetValue::TimestampMicros(_, _) => "TimestampMicros",
|
143
|
+
ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
|
144
|
+
ParquetValue::TimeMillis(_) => "TimeMillis",
|
145
|
+
ParquetValue::TimeMicros(_) => "TimeMicros",
|
146
|
+
ParquetValue::List(_) => "List",
|
147
|
+
ParquetValue::Map(_) => "Map",
|
148
|
+
ParquetValue::Record(_) => "Record",
|
149
|
+
ParquetValue::Null => "Null",
|
150
|
+
}
|
151
|
+
}
|
152
|
+
}
|
153
|
+
|
154
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use ordered_float::OrderedFloat;

    #[test]
    fn test_value_creation() {
        let value = ParquetValue::Int32(42);
        assert_eq!(value, ParquetValue::Int32(42));
        assert!(!value.is_null());
        assert_eq!(value.type_name(), "Int32");
    }

    #[test]
    fn test_null_value() {
        let value = ParquetValue::Null;
        assert!(value.is_null());
        assert_eq!(value.type_name(), "Null");
    }

    #[test]
    fn test_float_equality() {
        let make = || ParquetValue::Float32(OrderedFloat(3.5));
        let (left, right) = (make(), make());
        assert_eq!(left, right);
    }

    #[test]
    fn test_complex_types() {
        let items: Vec<ParquetValue> = (1..=3).map(ParquetValue::Int32).collect();
        let list = ParquetValue::List(items);
        assert_eq!(list.type_name(), "List");

        let entry = (
            ParquetValue::String(Arc::from("key")),
            ParquetValue::Int32(42),
        );
        let map = ParquetValue::Map(vec![entry]);
        assert_eq!(map.type_name(), "Map");
    }

    #[test]
    fn test_hash_consistency() {
        use std::collections::HashSet;

        let set: HashSet<ParquetValue> = vec![
            ParquetValue::Int32(42),
            ParquetValue::String(Arc::from("hello")),
        ]
        .into_iter()
        .collect();

        // Values equal to the inserted ones are found; a different one is not.
        assert!(set.contains(&ParquetValue::Int32(42)));
        assert!(set.contains(&ParquetValue::String(Arc::from("hello"))));
        assert!(!set.contains(&ParquetValue::Int32(43)));
    }
}
|