parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,283 @@
|
|
1
|
+
use std::sync::Arc;
|
2
|
+
|
3
|
+
/// Core schema representation for Parquet files
|
4
|
+
#[derive(Debug, Clone, PartialEq)]
|
5
|
+
pub struct Schema {
|
6
|
+
pub root: SchemaNode,
|
7
|
+
}
|
8
|
+
|
9
|
+
/// Represents a node in the Parquet schema tree
|
10
|
+
#[derive(Debug, Clone, PartialEq)]
|
11
|
+
pub enum SchemaNode {
|
12
|
+
/// A struct with named fields
|
13
|
+
Struct {
|
14
|
+
name: String,
|
15
|
+
nullable: bool,
|
16
|
+
fields: Vec<SchemaNode>,
|
17
|
+
},
|
18
|
+
/// A list containing items of a single type
|
19
|
+
List {
|
20
|
+
name: String,
|
21
|
+
nullable: bool,
|
22
|
+
item: Box<SchemaNode>,
|
23
|
+
},
|
24
|
+
/// A map with key-value pairs
|
25
|
+
Map {
|
26
|
+
name: String,
|
27
|
+
nullable: bool,
|
28
|
+
key: Box<SchemaNode>,
|
29
|
+
value: Box<SchemaNode>,
|
30
|
+
},
|
31
|
+
/// A primitive/leaf type
|
32
|
+
Primitive {
|
33
|
+
name: String,
|
34
|
+
primitive_type: PrimitiveType,
|
35
|
+
nullable: bool,
|
36
|
+
format: Option<String>,
|
37
|
+
},
|
38
|
+
}
|
39
|
+
|
40
|
+
/// The leaf (non-nested) data types a schema node can hold.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum PrimitiveType {
    // ---- Signed and unsigned integers ----
    Int8,
    Int16,
    Int32,
    Int64,
    UInt8,
    UInt16,
    UInt32,
    UInt64,

    // ---- Floating point ----
    Float32,
    Float64,

    // ---- Decimals, as (precision, scale) ----
    Decimal128(u8, i8),
    Decimal256(u8, i8),

    // ---- Other basic types ----
    Boolean,
    String,
    Binary,

    // ---- Dates, timestamps and times ----
    // NOTE(review): the optional string on the timestamp variants is
    // presumably a time-zone identifier - confirm against the converters.
    Date32,
    Date64,
    TimestampSecond(Option<Arc<str>>),
    TimestampMillis(Option<Arc<str>>),
    TimestampMicros(Option<Arc<str>>),
    TimestampNanos(Option<Arc<str>>),
    TimeMillis,
    TimeMicros,

    // ---- Fixed-length byte array ----
    // NOTE(review): the payload is presumably the length in bytes - confirm.
    FixedLenByteArray(i32),
}
|
79
|
+
|
80
|
+
/// How many values a field may hold in Parquet.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Repetition {
    /// Exactly one value is present.
    Required,
    /// Zero or one value is present.
    Optional,
    /// Zero or more values are present.
    Repeated,
}
|
90
|
+
|
91
|
+
impl SchemaNode {
|
92
|
+
/// Get the name of this schema node
|
93
|
+
pub fn name(&self) -> &str {
|
94
|
+
match self {
|
95
|
+
SchemaNode::Struct { name, .. } => name,
|
96
|
+
SchemaNode::List { name, .. } => name,
|
97
|
+
SchemaNode::Map { name, .. } => name,
|
98
|
+
SchemaNode::Primitive { name, .. } => name,
|
99
|
+
}
|
100
|
+
}
|
101
|
+
|
102
|
+
/// Check if this node is nullable
|
103
|
+
pub fn is_nullable(&self) -> bool {
|
104
|
+
match self {
|
105
|
+
SchemaNode::Struct { nullable, .. } => *nullable,
|
106
|
+
SchemaNode::List { nullable, .. } => *nullable,
|
107
|
+
SchemaNode::Map { nullable, .. } => *nullable,
|
108
|
+
SchemaNode::Primitive { nullable, .. } => *nullable,
|
109
|
+
}
|
110
|
+
}
|
111
|
+
|
112
|
+
/// Get the repetition level based on nullability
|
113
|
+
pub fn repetition(&self) -> Repetition {
|
114
|
+
if self.is_nullable() {
|
115
|
+
Repetition::Optional
|
116
|
+
} else {
|
117
|
+
Repetition::Required
|
118
|
+
}
|
119
|
+
}
|
120
|
+
}
|
121
|
+
|
122
|
+
impl PrimitiveType {
|
123
|
+
/// Get the logical type name for display
|
124
|
+
pub fn type_name(&self) -> &'static str {
|
125
|
+
match self {
|
126
|
+
PrimitiveType::Int8 => "Int8",
|
127
|
+
PrimitiveType::Int16 => "Int16",
|
128
|
+
PrimitiveType::Int32 => "Int32",
|
129
|
+
PrimitiveType::Int64 => "Int64",
|
130
|
+
PrimitiveType::UInt8 => "UInt8",
|
131
|
+
PrimitiveType::UInt16 => "UInt16",
|
132
|
+
PrimitiveType::UInt32 => "UInt32",
|
133
|
+
PrimitiveType::UInt64 => "UInt64",
|
134
|
+
PrimitiveType::Float32 => "Float32",
|
135
|
+
PrimitiveType::Float64 => "Float64",
|
136
|
+
PrimitiveType::Decimal128(_, _) => "Decimal128",
|
137
|
+
PrimitiveType::Decimal256(_, _) => "Decimal256",
|
138
|
+
PrimitiveType::Boolean => "Boolean",
|
139
|
+
PrimitiveType::String => "String",
|
140
|
+
PrimitiveType::Binary => "Binary",
|
141
|
+
PrimitiveType::Date32 => "Date32",
|
142
|
+
PrimitiveType::Date64 => "Date64",
|
143
|
+
PrimitiveType::TimestampSecond(_) => "TimestampSecond",
|
144
|
+
PrimitiveType::TimestampMillis(_) => "TimestampMillis",
|
145
|
+
PrimitiveType::TimestampMicros(_) => "TimestampMicros",
|
146
|
+
PrimitiveType::TimestampNanos(_) => "TimestampNanos",
|
147
|
+
PrimitiveType::TimeMillis => "TimeMillis",
|
148
|
+
PrimitiveType::TimeMicros => "TimeMicros",
|
149
|
+
PrimitiveType::FixedLenByteArray(_) => "FixedLenByteArray",
|
150
|
+
}
|
151
|
+
}
|
152
|
+
|
153
|
+
/// Check if this type requires a format specifier
|
154
|
+
pub fn requires_format(&self) -> bool {
|
155
|
+
matches!(
|
156
|
+
self,
|
157
|
+
PrimitiveType::Date32
|
158
|
+
| PrimitiveType::Date64
|
159
|
+
| PrimitiveType::TimestampSecond(_)
|
160
|
+
| PrimitiveType::TimestampMillis(_)
|
161
|
+
| PrimitiveType::TimestampMicros(_)
|
162
|
+
| PrimitiveType::TimestampNanos(_)
|
163
|
+
| PrimitiveType::TimeMillis
|
164
|
+
| PrimitiveType::TimeMicros
|
165
|
+
)
|
166
|
+
}
|
167
|
+
}
|
168
|
+
|
169
|
+
/// Builder for creating schemas
|
170
|
+
pub struct SchemaBuilder {
|
171
|
+
root: Option<SchemaNode>,
|
172
|
+
}
|
173
|
+
|
174
|
+
impl SchemaBuilder {
|
175
|
+
pub fn new() -> Self {
|
176
|
+
Self { root: None }
|
177
|
+
}
|
178
|
+
|
179
|
+
pub fn with_root(mut self, root: SchemaNode) -> Self {
|
180
|
+
self.root = Some(root);
|
181
|
+
self
|
182
|
+
}
|
183
|
+
|
184
|
+
pub fn build(self) -> Result<Schema, &'static str> {
|
185
|
+
match self.root {
|
186
|
+
Some(root) => Ok(Schema { root }),
|
187
|
+
None => Err("Schema must have a root node"),
|
188
|
+
}
|
189
|
+
}
|
190
|
+
}
|
191
|
+
|
192
|
+
impl Default for SchemaBuilder {
|
193
|
+
fn default() -> Self {
|
194
|
+
Self::new()
|
195
|
+
}
|
196
|
+
}
|
197
|
+
|
198
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a primitive node with no format string.
    fn leaf(name: &str, primitive_type: PrimitiveType, nullable: bool) -> SchemaNode {
        SchemaNode::Primitive {
            name: name.to_string(),
            primitive_type,
            nullable,
            format: None,
        }
    }

    /// A schema built from a struct root exposes that root's name and nullability.
    #[test]
    fn test_schema_creation() {
        let root = SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![
                leaf("id", PrimitiveType::Int64, false),
                leaf("name", PrimitiveType::String, true),
            ],
        };
        let schema = SchemaBuilder::new().with_root(root).build().unwrap();

        assert_eq!(schema.root.name(), "root");
        assert!(!schema.root.is_nullable());
    }

    /// `type_name` and `requires_format` behave as expected for sample types.
    #[test]
    fn test_primitive_types() {
        assert_eq!(PrimitiveType::Decimal128(10, 2).type_name(), "Decimal128");
        assert!(PrimitiveType::TimestampMicros(None).requires_format());
        assert!(!PrimitiveType::Int32.requires_format());
    }

    /// A nullable list node reports `Optional` repetition.
    #[test]
    fn test_nested_schema() {
        let list_node = SchemaNode::List {
            name: "items".to_string(),
            nullable: true,
            item: Box::new(leaf("item", PrimitiveType::String, false)),
        };

        assert_eq!(list_node.name(), "items");
        assert!(list_node.is_nullable());
        assert_eq!(list_node.repetition(), Repetition::Optional);
    }

    /// A non-nullable map node reports `Required` repetition.
    #[test]
    fn test_map_schema() {
        let map_node = SchemaNode::Map {
            name: "metadata".to_string(),
            nullable: false,
            key: Box::new(leaf("key", PrimitiveType::String, false)),
            value: Box::new(leaf("value", PrimitiveType::String, true)),
        };

        assert_eq!(map_node.name(), "metadata");
        assert!(!map_node.is_nullable());
        assert_eq!(map_node.repetition(), Repetition::Required);
    }
}
|
@@ -0,0 +1,308 @@
|
|
1
|
+
//! Test utilities for parquet-core
|
2
|
+
|
3
|
+
#[cfg(test)]
|
4
|
+
pub mod test {
|
5
|
+
use crate::{ParquetValue, PrimitiveType, Schema, SchemaBuilder, SchemaNode};
|
6
|
+
use indexmap::IndexMap;
|
7
|
+
use ordered_float::OrderedFloat;
|
8
|
+
use std::sync::Arc;
|
9
|
+
|
10
|
+
/// Create a simple schema for testing
|
11
|
+
pub fn sample_schema() -> Schema {
|
12
|
+
SchemaBuilder::new()
|
13
|
+
.with_root(SchemaNode::Struct {
|
14
|
+
name: "root".to_string(),
|
15
|
+
nullable: false,
|
16
|
+
fields: vec![
|
17
|
+
SchemaNode::Primitive {
|
18
|
+
name: "id".to_string(),
|
19
|
+
primitive_type: PrimitiveType::Int64,
|
20
|
+
nullable: false,
|
21
|
+
format: None,
|
22
|
+
},
|
23
|
+
SchemaNode::Primitive {
|
24
|
+
name: "name".to_string(),
|
25
|
+
primitive_type: PrimitiveType::String,
|
26
|
+
nullable: true,
|
27
|
+
format: None,
|
28
|
+
},
|
29
|
+
SchemaNode::Primitive {
|
30
|
+
name: "age".to_string(),
|
31
|
+
primitive_type: PrimitiveType::Int32,
|
32
|
+
nullable: true,
|
33
|
+
format: None,
|
34
|
+
},
|
35
|
+
SchemaNode::Primitive {
|
36
|
+
name: "salary".to_string(),
|
37
|
+
primitive_type: PrimitiveType::Float64,
|
38
|
+
nullable: true,
|
39
|
+
format: None,
|
40
|
+
},
|
41
|
+
],
|
42
|
+
})
|
43
|
+
.build()
|
44
|
+
.unwrap()
|
45
|
+
}
|
46
|
+
|
47
|
+
/// Create a complex schema with nested types
|
48
|
+
pub fn complex_schema() -> Schema {
|
49
|
+
SchemaBuilder::new()
|
50
|
+
.with_root(SchemaNode::Struct {
|
51
|
+
name: "root".to_string(),
|
52
|
+
nullable: false,
|
53
|
+
fields: vec![
|
54
|
+
SchemaNode::Primitive {
|
55
|
+
name: "id".to_string(),
|
56
|
+
primitive_type: PrimitiveType::Int64,
|
57
|
+
nullable: false,
|
58
|
+
format: None,
|
59
|
+
},
|
60
|
+
SchemaNode::Struct {
|
61
|
+
name: "person".to_string(),
|
62
|
+
nullable: true,
|
63
|
+
fields: vec![
|
64
|
+
SchemaNode::Primitive {
|
65
|
+
name: "name".to_string(),
|
66
|
+
primitive_type: PrimitiveType::String,
|
67
|
+
nullable: false,
|
68
|
+
format: None,
|
69
|
+
},
|
70
|
+
SchemaNode::Primitive {
|
71
|
+
name: "age".to_string(),
|
72
|
+
primitive_type: PrimitiveType::Int32,
|
73
|
+
nullable: true,
|
74
|
+
format: None,
|
75
|
+
},
|
76
|
+
],
|
77
|
+
},
|
78
|
+
SchemaNode::List {
|
79
|
+
name: "scores".to_string(),
|
80
|
+
nullable: true,
|
81
|
+
item: Box::new(SchemaNode::Primitive {
|
82
|
+
name: "item".to_string(),
|
83
|
+
primitive_type: PrimitiveType::Float32,
|
84
|
+
nullable: false,
|
85
|
+
format: None,
|
86
|
+
}),
|
87
|
+
},
|
88
|
+
],
|
89
|
+
})
|
90
|
+
.build()
|
91
|
+
.unwrap()
|
92
|
+
}
|
93
|
+
|
94
|
+
/// Create sample row values matching the simple schema
|
95
|
+
pub fn sample_values() -> Vec<ParquetValue> {
|
96
|
+
vec![
|
97
|
+
ParquetValue::Int64(1),
|
98
|
+
ParquetValue::String(Arc::from("Alice")),
|
99
|
+
ParquetValue::Int32(30),
|
100
|
+
ParquetValue::Float64(OrderedFloat(75000.0)),
|
101
|
+
]
|
102
|
+
}
|
103
|
+
|
104
|
+
/// Create multiple sample rows
|
105
|
+
pub fn sample_rows(count: usize) -> Vec<Vec<ParquetValue>> {
|
106
|
+
(0..count)
|
107
|
+
.map(|i| {
|
108
|
+
vec![
|
109
|
+
ParquetValue::Int64(i as i64),
|
110
|
+
ParquetValue::String(Arc::from(format!("Person{}", i))),
|
111
|
+
ParquetValue::Int32((20 + i % 50) as i32),
|
112
|
+
ParquetValue::Float64(OrderedFloat(50000.0 + (i as f64 * 1000.0))),
|
113
|
+
]
|
114
|
+
})
|
115
|
+
.collect()
|
116
|
+
}
|
117
|
+
|
118
|
+
/// Create sample values with nulls
|
119
|
+
pub fn sample_values_with_nulls() -> Vec<ParquetValue> {
|
120
|
+
vec![
|
121
|
+
ParquetValue::Int64(2),
|
122
|
+
ParquetValue::Null,
|
123
|
+
ParquetValue::Int32(25),
|
124
|
+
ParquetValue::Null,
|
125
|
+
]
|
126
|
+
}
|
127
|
+
|
128
|
+
/// Create complex values matching the complex schema
|
129
|
+
pub fn complex_values() -> Vec<ParquetValue> {
|
130
|
+
let mut person = IndexMap::new();
|
131
|
+
person.insert(Arc::from("name"), ParquetValue::String(Arc::from("Bob")));
|
132
|
+
person.insert(Arc::from("age"), ParquetValue::Int32(35));
|
133
|
+
|
134
|
+
vec![
|
135
|
+
ParquetValue::Int64(1),
|
136
|
+
ParquetValue::Record(person),
|
137
|
+
ParquetValue::List(vec![
|
138
|
+
ParquetValue::Float32(OrderedFloat(90.5)),
|
139
|
+
ParquetValue::Float32(OrderedFloat(87.3)),
|
140
|
+
ParquetValue::Float32(OrderedFloat(92.1)),
|
141
|
+
]),
|
142
|
+
]
|
143
|
+
}
|
144
|
+
|
145
|
+
/// Test data for all primitive types
|
146
|
+
pub fn all_primitive_values() -> Vec<(PrimitiveType, ParquetValue)> {
|
147
|
+
vec![
|
148
|
+
(PrimitiveType::Boolean, ParquetValue::Boolean(true)),
|
149
|
+
(PrimitiveType::Int8, ParquetValue::Int8(42)),
|
150
|
+
(PrimitiveType::Int16, ParquetValue::Int16(1000)),
|
151
|
+
(PrimitiveType::Int32, ParquetValue::Int32(100000)),
|
152
|
+
(PrimitiveType::Int64, ParquetValue::Int64(1000000000)),
|
153
|
+
(PrimitiveType::UInt8, ParquetValue::UInt8(200)),
|
154
|
+
(PrimitiveType::UInt16, ParquetValue::UInt16(50000)),
|
155
|
+
(PrimitiveType::UInt32, ParquetValue::UInt32(3000000000)),
|
156
|
+
(PrimitiveType::UInt64, ParquetValue::UInt64(10000000000)),
|
157
|
+
(
|
158
|
+
PrimitiveType::Float32,
|
159
|
+
ParquetValue::Float32(OrderedFloat(3.75)),
|
160
|
+
),
|
161
|
+
(
|
162
|
+
PrimitiveType::Float64,
|
163
|
+
ParquetValue::Float64(OrderedFloat(2.625)),
|
164
|
+
),
|
165
|
+
(
|
166
|
+
PrimitiveType::String,
|
167
|
+
ParquetValue::String(Arc::from("test string")),
|
168
|
+
),
|
169
|
+
(
|
170
|
+
PrimitiveType::Binary,
|
171
|
+
ParquetValue::Bytes(bytes::Bytes::from(vec![0x01, 0x02, 0x03])),
|
172
|
+
),
|
173
|
+
(PrimitiveType::Date32, ParquetValue::Date32(18628)), // 2021-01-01
|
174
|
+
(
|
175
|
+
PrimitiveType::TimeMillis,
|
176
|
+
ParquetValue::TimeMillis(43200000),
|
177
|
+
), // 12:00:00
|
178
|
+
(
|
179
|
+
PrimitiveType::TimeMicros,
|
180
|
+
ParquetValue::TimeMicros(43200000000),
|
181
|
+
), // 12:00:00
|
182
|
+
(
|
183
|
+
PrimitiveType::TimestampMillis(None),
|
184
|
+
ParquetValue::TimestampMillis(1609459200000, None),
|
185
|
+
), // 2021-01-01 00:00:00
|
186
|
+
(
|
187
|
+
PrimitiveType::TimestampMicros(None),
|
188
|
+
ParquetValue::TimestampMicros(1609459200000000, None),
|
189
|
+
), // 2021-01-01 00:00:00
|
190
|
+
(
|
191
|
+
PrimitiveType::Decimal128(10, 2),
|
192
|
+
ParquetValue::Decimal128(12345, 2),
|
193
|
+
), // 123.45
|
194
|
+
]
|
195
|
+
}
|
196
|
+
|
197
|
+
/// Create a temporary file path for testing
|
198
|
+
pub fn temp_file_path() -> String {
|
199
|
+
format!("/tmp/parquet_test_{}.parquet", uuid::Uuid::new_v4())
|
200
|
+
}
|
201
|
+
|
202
|
+
/// Compare two ParquetValues for equality, handling floating point comparison
|
203
|
+
pub fn values_equal(a: &ParquetValue, b: &ParquetValue) -> bool {
|
204
|
+
match (a, b) {
|
205
|
+
(ParquetValue::Float32(OrderedFloat(a)), ParquetValue::Float32(OrderedFloat(b))) => {
|
206
|
+
(a - b).abs() < f32::EPSILON
|
207
|
+
}
|
208
|
+
(ParquetValue::Float64(OrderedFloat(a)), ParquetValue::Float64(OrderedFloat(b))) => {
|
209
|
+
(a - b).abs() < f64::EPSILON
|
210
|
+
}
|
211
|
+
(ParquetValue::List(a), ParquetValue::List(b)) => {
|
212
|
+
a.len() == b.len() && a.iter().zip(b.iter()).all(|(a, b)| values_equal(a, b))
|
213
|
+
}
|
214
|
+
(ParquetValue::Map(a), ParquetValue::Map(b)) => {
|
215
|
+
a.len() == b.len()
|
216
|
+
&& a.iter()
|
217
|
+
.zip(b.iter())
|
218
|
+
.all(|((k1, v1), (k2, v2))| values_equal(k1, k2) && values_equal(v1, v2))
|
219
|
+
}
|
220
|
+
(ParquetValue::Record(a), ParquetValue::Record(b)) => {
|
221
|
+
a.len() == b.len()
|
222
|
+
&& a.iter()
|
223
|
+
.all(|(k, v)| b.get(k).map_or(false, |v2| values_equal(v, v2)))
|
224
|
+
}
|
225
|
+
_ => a == b,
|
226
|
+
}
|
227
|
+
}
|
228
|
+
|
229
|
+
/// Assert that two vectors of ParquetValues are equal
|
230
|
+
pub fn assert_values_equal(expected: &[ParquetValue], actual: &[ParquetValue]) {
|
231
|
+
assert_eq!(
|
232
|
+
expected.len(),
|
233
|
+
actual.len(),
|
234
|
+
"Value vectors have different lengths: expected {}, got {}",
|
235
|
+
expected.len(),
|
236
|
+
actual.len()
|
237
|
+
);
|
238
|
+
|
239
|
+
for (i, (e, a)) in expected.iter().zip(actual.iter()).enumerate() {
|
240
|
+
assert!(
|
241
|
+
values_equal(e, a),
|
242
|
+
"Values at index {} are not equal:\nExpected: {:?}\nActual: {:?}",
|
243
|
+
i,
|
244
|
+
e,
|
245
|
+
a
|
246
|
+
);
|
247
|
+
}
|
248
|
+
}
|
249
|
+
}
|
250
|
+
|
251
|
+
#[cfg(test)]
mod test_utils_tests {
    use super::test::*;
    use crate::{ParquetValue, SchemaNode};
    use ordered_float::OrderedFloat;

    /// The sample schema has a struct root with the four expected columns.
    #[test]
    fn test_sample_schema() {
        let schema = sample_schema();
        assert_eq!(schema.root.name(), "root");

        match &schema.root {
            SchemaNode::Struct { fields, .. } => {
                assert_eq!(fields.len(), 4);
                assert_eq!(fields[0].name(), "id");
                assert_eq!(fields[1].name(), "name");
                assert_eq!(fields[2].name(), "age");
                assert_eq!(fields[3].name(), "salary");
            }
            _ => panic!("Expected struct schema"),
        }
    }

    /// The sample row has four values, starting with id 1 and name "Alice".
    #[test]
    fn test_sample_values() {
        let values = sample_values();
        assert_eq!(values.len(), 4);
        assert!(matches!(values[0], ParquetValue::Int64(1)));
        assert!(matches!(&values[1], ParquetValue::String(s) if s.as_ref() == "Alice"));
    }

    /// `values_equal` handles exact, near-equal float, list and unequal cases.
    #[test]
    fn test_values_equal() {
        // Exact equality
        let forty_two = ParquetValue::Int32(42);
        assert!(values_equal(&forty_two, &ParquetValue::Int32(42)));

        // Floating point equality within tolerance
        let one = ParquetValue::Float32(OrderedFloat(1.0));
        let nearly_one = ParquetValue::Float32(OrderedFloat(1.0 + f32::EPSILON / 2.0));
        assert!(values_equal(&one, &nearly_one));

        // List equality
        let left = ParquetValue::List(vec![ParquetValue::Int32(1), ParquetValue::Int32(2)]);
        let right = ParquetValue::List(vec![ParquetValue::Int32(1), ParquetValue::Int32(2)]);
        assert!(values_equal(&left, &right));

        // Inequality
        assert!(!values_equal(&forty_two, &ParquetValue::Int32(43)));
    }
}
|