parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,279 @@
1
+ use bytes::Bytes;
2
+ use num::BigInt;
3
+ use ordered_float::OrderedFloat;
4
+ use parquet_core::*;
5
+ use std::sync::Arc;
6
+
7
+ #[test]
8
+ fn test_all_primitive_types_roundtrip() {
9
+ // Round-trip check: write a single row holding one value for each of the 20 non-nullable primitive columns declared below to an in-memory buffer, read it back, and assert the rows match
10
+ let schema = SchemaBuilder::new()
11
+ .with_root(SchemaNode::Struct {
12
+ name: "root".to_string(),
13
+ nullable: false,
14
+ fields: vec![
15
+ SchemaNode::Primitive {
16
+ name: "bool_val".to_string(),
17
+ primitive_type: PrimitiveType::Boolean,
18
+ nullable: false,
19
+ format: None,
20
+ },
21
+ SchemaNode::Primitive {
22
+ name: "int8_val".to_string(),
23
+ primitive_type: PrimitiveType::Int8,
24
+ nullable: false,
25
+ format: None,
26
+ },
27
+ SchemaNode::Primitive {
28
+ name: "int16_val".to_string(),
29
+ primitive_type: PrimitiveType::Int16,
30
+ nullable: false,
31
+ format: None,
32
+ },
33
+ SchemaNode::Primitive {
34
+ name: "int32_val".to_string(),
35
+ primitive_type: PrimitiveType::Int32,
36
+ nullable: false,
37
+ format: None,
38
+ },
39
+ SchemaNode::Primitive {
40
+ name: "int64_val".to_string(),
41
+ primitive_type: PrimitiveType::Int64,
42
+ nullable: false,
43
+ format: None,
44
+ },
45
+ SchemaNode::Primitive {
46
+ name: "uint8_val".to_string(),
47
+ primitive_type: PrimitiveType::UInt8,
48
+ nullable: false,
49
+ format: None,
50
+ },
51
+ SchemaNode::Primitive {
52
+ name: "uint16_val".to_string(),
53
+ primitive_type: PrimitiveType::UInt16,
54
+ nullable: false,
55
+ format: None,
56
+ },
57
+ SchemaNode::Primitive {
58
+ name: "uint32_val".to_string(),
59
+ primitive_type: PrimitiveType::UInt32,
60
+ nullable: false,
61
+ format: None,
62
+ },
63
+ SchemaNode::Primitive {
64
+ name: "uint64_val".to_string(),
65
+ primitive_type: PrimitiveType::UInt64,
66
+ nullable: false,
67
+ format: None,
68
+ },
69
+ SchemaNode::Primitive {
70
+ name: "float32_val".to_string(),
71
+ primitive_type: PrimitiveType::Float32,
72
+ nullable: false,
73
+ format: None,
74
+ },
75
+ SchemaNode::Primitive {
76
+ name: "float64_val".to_string(),
77
+ primitive_type: PrimitiveType::Float64,
78
+ nullable: false,
79
+ format: None,
80
+ },
81
+ SchemaNode::Primitive {
82
+ name: "string_val".to_string(),
83
+ primitive_type: PrimitiveType::String,
84
+ nullable: false,
85
+ format: None,
86
+ },
87
+ SchemaNode::Primitive {
88
+ name: "binary_val".to_string(),
89
+ primitive_type: PrimitiveType::Binary,
90
+ nullable: false,
91
+ format: None,
92
+ },
93
+ SchemaNode::Primitive {
94
+ name: "date32_val".to_string(),
95
+ primitive_type: PrimitiveType::Date32,
96
+ nullable: false,
97
+ format: None,
98
+ },
99
+ SchemaNode::Primitive {
100
+ name: "date64_val".to_string(),
101
+ primitive_type: PrimitiveType::Date64,
102
+ nullable: false,
103
+ format: None,
104
+ },
105
+ SchemaNode::Primitive {
106
+ name: "time_millis_val".to_string(),
107
+ primitive_type: PrimitiveType::TimeMillis,
108
+ nullable: false,
109
+ format: None,
110
+ },
111
+ SchemaNode::Primitive {
112
+ name: "time_micros_val".to_string(),
113
+ primitive_type: PrimitiveType::TimeMicros,
114
+ nullable: false,
115
+ format: None,
116
+ },
117
+ SchemaNode::Primitive {
118
+ name: "timestamp_millis_val".to_string(),
119
+ primitive_type: PrimitiveType::TimestampMillis(None),
120
+ nullable: false,
121
+ format: None,
122
+ },
123
+ SchemaNode::Primitive {
124
+ name: "decimal128_val".to_string(),
125
+ primitive_type: PrimitiveType::Decimal128(10, 2),
126
+ nullable: false,
127
+ format: None,
128
+ },
129
+ SchemaNode::Primitive {
130
+ name: "decimal256_val".to_string(),
131
+ primitive_type: PrimitiveType::Decimal256(50, 10),
132
+ nullable: false,
133
+ format: None,
134
+ },
135
+ ],
136
+ })
137
+ .build()
138
+ .unwrap();
139
+
140
+ let rows = vec![vec![
141
+ ParquetValue::Boolean(true),
142
+ ParquetValue::Int8(42),
143
+ ParquetValue::Int16(1000),
144
+ ParquetValue::Int32(100000),
145
+ ParquetValue::Int64(1000000000),
146
+ ParquetValue::UInt8(200),
147
+ ParquetValue::UInt16(50000),
148
+ ParquetValue::UInt32(3000000000),
149
+ ParquetValue::UInt64(10000000000),
150
+ ParquetValue::Float32(OrderedFloat(std::f32::consts::PI)),
151
+ ParquetValue::Float64(OrderedFloat(std::f64::consts::E)),
152
+ ParquetValue::String(Arc::from("Test string 🦀")),
153
+ ParquetValue::Bytes(Bytes::from(vec![0xDE, 0xAD, 0xBE, 0xEF])),
154
+ ParquetValue::Date32(19000),
155
+ ParquetValue::Date64(1640995200000),
156
+ ParquetValue::TimeMillis(43200000),
157
+ ParquetValue::TimeMicros(43200000000),
158
+ ParquetValue::TimestampMillis(1640995200000, None),
159
+ ParquetValue::Decimal128(12345, 2),
160
+ ParquetValue::Decimal256(
161
+ BigInt::parse_bytes(b"1234567890123456789012345678901234567890", 10).unwrap(),
162
+ 10,
163
+ ),
164
+ ]];
165
+
166
+ let mut buffer = Vec::new();
167
+ {
168
+ let mut writer = Writer::new(&mut buffer, schema).unwrap();
169
+ writer.write_rows(rows.clone()).unwrap();
170
+ writer.close().unwrap();
171
+ }
172
+
173
+ // Read the written buffer back, then check the row count and per-row equality against the input rows
174
+ let bytes = Bytes::from(buffer);
175
+ let reader = Reader::new(bytes);
176
+
177
+ let read_rows: Vec<_> = reader
178
+ .read_rows()
179
+ .unwrap()
180
+ .collect::<Result<Vec<_>>>()
181
+ .unwrap();
182
+
183
+ assert_eq!(rows.len(), read_rows.len());
184
+ for (expected, actual) in rows.iter().zip(read_rows.iter()) {
185
+ assert_eq!(expected, actual);
186
+ }
187
+ }
188
+
189
+ #[test]
190
+ fn test_empty_collections_roundtrip() {
191
+ // Test that an empty string, empty binary value, empty list, and empty map roundtrip correctly, alongside a second row with non-empty values in the same columns
192
+ let schema = SchemaBuilder::new()
193
+ .with_root(SchemaNode::Struct {
194
+ name: "root".to_string(),
195
+ nullable: false,
196
+ fields: vec![
197
+ SchemaNode::Primitive {
198
+ name: "empty_string".to_string(),
199
+ primitive_type: PrimitiveType::String,
200
+ nullable: false,
201
+ format: None,
202
+ },
203
+ SchemaNode::Primitive {
204
+ name: "empty_binary".to_string(),
205
+ primitive_type: PrimitiveType::Binary,
206
+ nullable: false,
207
+ format: None,
208
+ },
209
+ SchemaNode::List {
210
+ name: "empty_list".to_string(),
211
+ nullable: false,
212
+ item: Box::new(SchemaNode::Primitive {
213
+ name: "item".to_string(),
214
+ primitive_type: PrimitiveType::Int32,
215
+ nullable: false,
216
+ format: None,
217
+ }),
218
+ },
219
+ SchemaNode::Map {
220
+ name: "empty_map".to_string(),
221
+ nullable: false,
222
+ key: Box::new(SchemaNode::Primitive {
223
+ name: "key".to_string(),
224
+ primitive_type: PrimitiveType::String,
225
+ nullable: false,
226
+ format: None,
227
+ }),
228
+ value: Box::new(SchemaNode::Primitive {
229
+ name: "value".to_string(),
230
+ primitive_type: PrimitiveType::Int32,
231
+ nullable: false,
232
+ format: None,
233
+ }),
234
+ },
235
+ ],
236
+ })
237
+ .build()
238
+ .unwrap();
239
+
240
+ let rows = vec![
241
+ vec![
242
+ ParquetValue::String(Arc::from("")),
243
+ ParquetValue::Bytes(Bytes::from(vec![])),
244
+ ParquetValue::List(vec![]),
245
+ ParquetValue::Map(vec![]),
246
+ ],
247
+ vec![
248
+ ParquetValue::String(Arc::from("not empty")),
249
+ ParquetValue::Bytes(Bytes::from(vec![1, 2, 3])),
250
+ ParquetValue::List(vec![ParquetValue::Int32(42)]),
251
+ ParquetValue::Map(vec![(
252
+ ParquetValue::String(Arc::from("key")),
253
+ ParquetValue::Int32(100),
254
+ )]),
255
+ ],
256
+ ];
257
+
258
+ let mut buffer = Vec::new();
259
+ {
260
+ let mut writer = Writer::new(&mut buffer, schema).unwrap();
261
+ writer.write_rows(rows.clone()).unwrap();
262
+ writer.close().unwrap();
263
+ }
264
+
265
+ // Read the written buffer back, then check the row count and per-row equality against the input rows
266
+ let bytes = Bytes::from(buffer);
267
+ let reader = Reader::new(bytes);
268
+
269
+ let read_rows: Vec<_> = reader
270
+ .read_rows()
271
+ .unwrap()
272
+ .collect::<Result<Vec<_>>>()
273
+ .unwrap();
274
+
275
+ assert_eq!(rows.len(), read_rows.len());
276
+ for (expected, actual) in rows.iter().zip(read_rows.iter()) {
277
+ assert_eq!(expected, actual);
278
+ }
279
+ }