parquet 0.5.12 → 0.6.0

This diff covers the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-core/src/traits/schema.rs
@@ -0,0 +1,151 @@
+ use crate::SchemaNode;
+
+ /// Trait for schema introspection
+ ///
+ /// This trait provides methods for examining and querying schemas
+ /// without modifying them.
+ pub trait SchemaInspector {
+     /// Get the total number of fields (including nested)
+     fn field_count(&self) -> usize;
+
+     /// Get field by path (e.g., "address.city")
+     fn get_field_by_path(&self, path: &str) -> Option<&SchemaNode>;
+
+     /// Check if schema contains a specific field
+     fn has_field(&self, name: &str) -> bool;
+
+     /// Get all field paths in the schema
+     fn all_field_paths(&self) -> Vec<String>;
+ }
+
+ impl SchemaInspector for crate::Schema {
+     fn field_count(&self) -> usize {
+         count_fields(&self.root)
+     }
+
+     fn get_field_by_path(&self, path: &str) -> Option<&SchemaNode> {
+         get_field_by_path(&self.root, path)
+     }
+
+     fn has_field(&self, name: &str) -> bool {
+         self.get_field_by_path(name).is_some()
+     }
+
+     fn all_field_paths(&self) -> Vec<String> {
+         let mut paths = Vec::new();
+         collect_field_paths(&self.root, String::new(), &mut paths);
+         paths
+     }
+ }
+
+ // Helper functions for schema inspection
+ fn count_fields(node: &SchemaNode) -> usize {
+     match node {
+         SchemaNode::Struct { fields, .. } => 1 + fields.iter().map(count_fields).sum::<usize>(),
+         SchemaNode::List { item, .. } => 1 + count_fields(item),
+         SchemaNode::Map { key, value, .. } => 1 + count_fields(key) + count_fields(value),
+         SchemaNode::Primitive { .. } => 1,
+     }
+ }
+
+ fn get_field_by_path<'a>(node: &'a SchemaNode, path: &str) -> Option<&'a SchemaNode> {
+     let parts: Vec<&str> = path.split('.').collect();
+     get_field_by_path_parts(node, &parts)
+ }
+
+ fn get_field_by_path_parts<'a>(node: &'a SchemaNode, parts: &[&str]) -> Option<&'a SchemaNode> {
+     if parts.is_empty() {
+         return Some(node);
+     }
+
+     let first = parts[0];
+     let rest = &parts[1..];
+
+     match node {
+         SchemaNode::Struct { fields, .. } => fields
+             .iter()
+             .find(|f| f.name() == first)
+             .and_then(|f| get_field_by_path_parts(f, rest)),
+         SchemaNode::List { item, .. } if first == "item" => get_field_by_path_parts(item, rest),
+         SchemaNode::Map { key, value, .. } => match first {
+             "key" => get_field_by_path_parts(key, rest),
+             "value" => get_field_by_path_parts(value, rest),
+             _ => None,
+         },
+         _ => None,
+     }
+ }
+
+ fn collect_field_paths(node: &SchemaNode, prefix: String, paths: &mut Vec<String>) {
+     let current_path = if prefix.is_empty() {
+         node.name().to_string()
+     } else {
+         format!("{}.{}", prefix, node.name())
+     };
+
+     paths.push(current_path.clone());
+
+     match node {
+         SchemaNode::Struct { fields, .. } => {
+             for field in fields {
+                 collect_field_paths(field, current_path.clone(), paths);
+             }
+         }
+         SchemaNode::List { item, .. } => {
+             collect_field_paths(item, format!("{}.item", current_path), paths);
+         }
+         SchemaNode::Map { key, value, .. } => {
+             collect_field_paths(key, format!("{}.key", current_path), paths);
+             collect_field_paths(value, format!("{}.value", current_path), paths);
+         }
+         SchemaNode::Primitive { .. } => {}
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+     use crate::{PrimitiveType, SchemaBuilder as CoreSchemaBuilder};
+
+     #[test]
+     fn test_schema_inspector() {
+         let schema = CoreSchemaBuilder::new()
+             .with_root(SchemaNode::Struct {
+                 name: "root".to_string(),
+                 nullable: false,
+                 fields: vec![
+                     SchemaNode::Primitive {
+                         name: "id".to_string(),
+                         primitive_type: PrimitiveType::Int64,
+                         nullable: false,
+                         format: None,
+                     },
+                     SchemaNode::Struct {
+                         name: "address".to_string(),
+                         nullable: true,
+                         fields: vec![SchemaNode::Primitive {
+                             name: "city".to_string(),
+                             primitive_type: PrimitiveType::String,
+                             nullable: true,
+                             format: None,
+                         }],
+                     },
+                 ],
+             })
+             .build()
+             .unwrap();
+
+         // Test field count
+         assert_eq!(schema.field_count(), 4); // root, id, address, city
+
+         // Test field lookup
+         assert!(schema.has_field("id"));
+         assert!(schema.has_field("address"));
+         assert!(schema.has_field("address.city"));
+         assert!(!schema.has_field("missing"));
+
+         // Test get field by path
+         let city = schema.get_field_by_path("address.city").unwrap();
+         assert_eq!(city.name(), "city");
+     }
+ }
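For orientation, here is a minimal usage sketch of the new SchemaInspector trait, written against the API visible in the hunk above. It assumes the parquet-core crate is named parquet_core and re-exports Schema, SchemaBuilder, SchemaNode, PrimitiveType, and SchemaInspector from its root, as the test module's crate:: imports suggest; those paths are not confirmed by this diff.

// Hypothetical example; crate name and re-export paths are assumptions.
use parquet_core::{PrimitiveType, SchemaBuilder, SchemaInspector, SchemaNode};

fn main() {
    // Build a small schema: root { id: Int64 }
    let schema = SchemaBuilder::new()
        .with_root(SchemaNode::Struct {
            name: "root".to_string(),
            nullable: false,
            fields: vec![SchemaNode::Primitive {
                name: "id".to_string(),
                primitive_type: PrimitiveType::Int64,
                nullable: false,
                format: None,
            }],
        })
        .build()
        .unwrap();

    // field_count() counts every node, including the root struct itself.
    assert_eq!(schema.field_count(), 2); // root + id

    // Dot-separated paths address nested fields; a flat field is just its name.
    assert!(schema.has_field("id"));
    assert!(schema.get_field_by_path("missing").is_none());

    // all_field_paths() walks structs, lists (".item") and maps (".key"/".value").
    for path in schema.all_field_paths() {
        println!("{path}");
    }
}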
data/ext/parquet-core/src/value.rs
@@ -0,0 +1,209 @@
+ use bytes::Bytes;
+ use indexmap::IndexMap;
+ use num::BigInt;
+ use std::sync::Arc;
+
+ #[derive(Debug, Clone, PartialEq, Eq)]
+ pub enum ParquetValue {
+     // Numeric types
+     Int8(i8),
+     Int16(i16),
+     Int32(i32),
+     Int64(i64),
+     UInt8(u8),
+     UInt16(u16),
+     UInt32(u32),
+     UInt64(u64),
+     Float16(ordered_float::OrderedFloat<f32>), // f16 converted to f32
+     Float32(ordered_float::OrderedFloat<f32>),
+     Float64(ordered_float::OrderedFloat<f64>),
+
+     // Basic types
+     Boolean(bool),
+     String(Arc<str>),
+     Bytes(Bytes),
+
+     // Date/Time types
+     Date32(i32), // Days since epoch
+     Date64(i64), // Milliseconds since epoch
+
+     // Decimal types
+     Decimal128(i128, i8),   // value, scale
+     Decimal256(BigInt, i8), // Using BigInt instead of arrow_buffer::i256 for pure Rust
+
+     // Timestamp types - each stores a count in its own unit since epoch, with optional timezone
+     TimestampSecond(i64, Option<Arc<str>>),
+     TimestampMillis(i64, Option<Arc<str>>),
+     TimestampMicros(i64, Option<Arc<str>>),
+     TimestampNanos(i64, Option<Arc<str>>),
+
+     // Time types
+     TimeMillis(i32), // Time of day in milliseconds since midnight
+     TimeMicros(i64), // Time of day in microseconds since midnight
+
+     // Complex types
+     List(Vec<ParquetValue>),
+     Map(Vec<(ParquetValue, ParquetValue)>), // Using Vec of tuples for deterministic ordering
+     Record(IndexMap<Arc<str>, ParquetValue>), // For struct/record types, preserves field order
+
+     // Null value
+     Null,
+ }
+
+ impl std::hash::Hash for ParquetValue {
+     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+         std::mem::discriminant(self).hash(state);
+         match self {
+             ParquetValue::Int8(i) => i.hash(state),
+             ParquetValue::Int16(i) => i.hash(state),
+             ParquetValue::Int32(i) => i.hash(state),
+             ParquetValue::Int64(i) => i.hash(state),
+             ParquetValue::UInt8(i) => i.hash(state),
+             ParquetValue::UInt16(i) => i.hash(state),
+             ParquetValue::UInt32(i) => i.hash(state),
+             ParquetValue::UInt64(i) => i.hash(state),
+             ParquetValue::Float16(f) => f.hash(state),
+             ParquetValue::Float32(f) => f.hash(state),
+             ParquetValue::Float64(f) => f.hash(state),
+             ParquetValue::Boolean(b) => b.hash(state),
+             ParquetValue::String(s) => s.hash(state),
+             ParquetValue::Bytes(b) => b.hash(state),
+             ParquetValue::Date32(d) => d.hash(state),
+             ParquetValue::Date64(d) => d.hash(state),
+             ParquetValue::Decimal128(d, scale) => {
+                 d.hash(state);
+                 scale.hash(state);
+             }
+             ParquetValue::Decimal256(d, scale) => {
+                 d.hash(state);
+                 scale.hash(state);
+             }
+             ParquetValue::TimestampSecond(ts, tz) => {
+                 ts.hash(state);
+                 tz.hash(state);
+             }
+             ParquetValue::TimestampMillis(ts, tz) => {
+                 ts.hash(state);
+                 tz.hash(state);
+             }
+             ParquetValue::TimestampMicros(ts, tz) => {
+                 ts.hash(state);
+                 tz.hash(state);
+             }
+             ParquetValue::TimestampNanos(ts, tz) => {
+                 ts.hash(state);
+                 tz.hash(state);
+             }
+             ParquetValue::TimeMillis(t) => t.hash(state),
+             ParquetValue::TimeMicros(t) => t.hash(state),
+             ParquetValue::List(l) => l.hash(state),
+             ParquetValue::Map(m) => m.hash(state),
+             ParquetValue::Record(r) => {
+                 // IndexMap preserves insertion order, so hash is deterministic
+                 for (k, v) in r {
+                     k.hash(state);
+                     v.hash(state);
+                 }
+             }
+             ParquetValue::Null => 0_i32.hash(state),
+         }
+     }
+ }
+
+ impl ParquetValue {
+     /// Check if the value is null
+     pub fn is_null(&self) -> bool {
+         matches!(self, ParquetValue::Null)
+     }
+
+     /// Get the type name of the value
+     pub fn type_name(&self) -> &'static str {
+         match self {
+             ParquetValue::Int8(_) => "Int8",
+             ParquetValue::Int16(_) => "Int16",
+             ParquetValue::Int32(_) => "Int32",
+             ParquetValue::Int64(_) => "Int64",
+             ParquetValue::UInt8(_) => "UInt8",
+             ParquetValue::UInt16(_) => "UInt16",
+             ParquetValue::UInt32(_) => "UInt32",
+             ParquetValue::UInt64(_) => "UInt64",
+             ParquetValue::Float16(_) => "Float16",
+             ParquetValue::Float32(_) => "Float32",
+             ParquetValue::Float64(_) => "Float64",
+             ParquetValue::Boolean(_) => "Boolean",
+             ParquetValue::String(_) => "String",
+             ParquetValue::Bytes(_) => "Bytes",
+             ParquetValue::Date32(_) => "Date32",
+             ParquetValue::Date64(_) => "Date64",
+             ParquetValue::Decimal128(_, _) => "Decimal128",
+             ParquetValue::Decimal256(_, _) => "Decimal256",
+             ParquetValue::TimestampSecond(_, _) => "TimestampSecond",
+             ParquetValue::TimestampMillis(_, _) => "TimestampMillis",
+             ParquetValue::TimestampMicros(_, _) => "TimestampMicros",
+             ParquetValue::TimestampNanos(_, _) => "TimestampNanos",
+             ParquetValue::TimeMillis(_) => "TimeMillis",
+             ParquetValue::TimeMicros(_) => "TimeMicros",
+             ParquetValue::List(_) => "List",
+             ParquetValue::Map(_) => "Map",
+             ParquetValue::Record(_) => "Record",
+             ParquetValue::Null => "Null",
+         }
+     }
+ }
+
+ #[cfg(test)]
+ mod tests {
+     use super::*;
+     use ordered_float::OrderedFloat;
+
+     #[test]
+     fn test_value_creation() {
+         let v = ParquetValue::Int32(42);
+         assert_eq!(v, ParquetValue::Int32(42));
+         assert!(!v.is_null());
+         assert_eq!(v.type_name(), "Int32");
+     }
+
+     #[test]
+     fn test_null_value() {
+         let v = ParquetValue::Null;
+         assert!(v.is_null());
+         assert_eq!(v.type_name(), "Null");
+     }
+
+     #[test]
+     fn test_float_equality() {
+         let v1 = ParquetValue::Float32(OrderedFloat(3.5));
+         let v2 = ParquetValue::Float32(OrderedFloat(3.5));
+         assert_eq!(v1, v2);
+     }
+
+     #[test]
+     fn test_complex_types() {
+         let list = ParquetValue::List(vec![
+             ParquetValue::Int32(1),
+             ParquetValue::Int32(2),
+             ParquetValue::Int32(3),
+         ]);
+         assert_eq!(list.type_name(), "List");
+
+         let map = ParquetValue::Map(vec![(
+             ParquetValue::String(Arc::from("key")),
+             ParquetValue::Int32(42),
+         )]);
+         assert_eq!(map.type_name(), "Map");
+     }
+
+     #[test]
+     fn test_hash_consistency() {
+         use std::collections::HashSet;
+
+         let mut set = HashSet::new();
+         set.insert(ParquetValue::Int32(42));
+         set.insert(ParquetValue::String(Arc::from("hello")));
+
+         assert!(set.contains(&ParquetValue::Int32(42)));
+         assert!(set.contains(&ParquetValue::String(Arc::from("hello"))));
+         assert!(!set.contains(&ParquetValue::Int32(43)));
+     }
+ }
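As a rough illustration of how these values compose, the sketch below builds one nested row and leans on the Eq + Hash guarantees the enum provides (floats via OrderedFloat, maps as ordered key/value pairs, records via insertion-ordered IndexMap). The parquet_core crate name and the re-export of ParquetValue are assumptions; only the enum itself appears in this diff.

// Hypothetical example; the parquet_core import path is an assumption.
use indexmap::IndexMap;
use ordered_float::OrderedFloat;
use parquet_core::ParquetValue;
use std::collections::HashSet;
use std::sync::Arc;

fn main() {
    // A nested row: { "id": 1, "scores": [0.5, 0.75], "tags": { "env" => "prod" } }
    let mut fields: IndexMap<Arc<str>, ParquetValue> = IndexMap::new();
    fields.insert(Arc::from("id"), ParquetValue::Int64(1));
    fields.insert(
        Arc::from("scores"),
        ParquetValue::List(vec![
            ParquetValue::Float64(OrderedFloat(0.5)),
            ParquetValue::Float64(OrderedFloat(0.75)),
        ]),
    );
    fields.insert(
        Arc::from("tags"),
        ParquetValue::Map(vec![(
            ParquetValue::String(Arc::from("env")),
            ParquetValue::String(Arc::from("prod")),
        )]),
    );
    let row = ParquetValue::Record(fields);

    assert_eq!(row.type_name(), "Record");
    assert!(!row.is_null());

    // Because every variant is Eq + Hash, whole rows can be deduplicated directly.
    let mut seen: HashSet<ParquetValue> = HashSet::new();
    assert!(seen.insert(row.clone())); // first insert succeeds
    assert!(!seen.insert(row)); // duplicate is rejected
}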