parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,322 @@
1
+ use bytes::Bytes;
2
+ use parquet_core::*;
3
+ use std::sync::Arc;
4
+
5
/// Round-trips a file that holds exactly one row with a single required
/// string column and checks the row comes back unchanged.
#[test]
fn test_single_row_file() {
    // Build the schema bottom-up: one non-nullable string field under the root struct.
    let value_field = SchemaNode::Primitive {
        name: "value".to_string(),
        primitive_type: PrimitiveType::String,
        nullable: false,
        format: None,
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![value_field],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let rows = vec![vec![ParquetValue::String(Arc::from("single"))]];

    // The writer holds `&mut buffer`; the inner scope ends that borrow before
    // the buffer is moved into `Bytes` below.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    // Read everything back from the in-memory bytes.
    let reader = Reader::new(Bytes::from(buffer));
    let row_iter = reader.read_rows().unwrap();
    let read_rows = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(read_rows.len(), 1);
    assert_eq!(read_rows[0], rows[0]);
}
42
+
43
/// Round-trips a set of tricky Unicode strings (empty, multi-byte, ZWJ and tag
/// sequences, combining marks, embedded NUL) through a required string column
/// and checks each one is read back equal to what was written.
#[test]
fn test_unicode_edge_cases() {
    let text_field = SchemaNode::Primitive {
        name: "text".to_string(),
        primitive_type: PrimitiveType::String,
        nullable: false,
        format: None,
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![text_field],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    // Various Unicode edge cases, one per row.
    let test_strings = [
        "",                          // Empty
        "A",                         // ASCII
        "Ω",                         // Greek
        "中文",                      // Chinese
        "🦀",                        // Emoji (4-byte UTF-8)
        "👨‍👩‍👧‍👦",                        // Family emoji (ZWJ sequence)
        "\u{0000}",                  // Null character
        "\u{FFFD}",                  // Replacement character
        "A\u{0301}",                 // Combining character (A with accent)
        "\u{200B}invisible\u{200B}", // Zero-width space
        "🏴󠁧󠁢󠁳󠁣󠁴󠁿",                        // Flag (tag sequence)
        "\u{1F1FA}\u{1F1F8}",        // US flag (regional indicators)
    ];

    // Each string becomes a one-column row.
    let rows: Vec<Vec<ParquetValue>> = test_strings
        .iter()
        .map(|text| vec![ParquetValue::String(Arc::from(*text))])
        .collect();

    // Scope the writer so its mutable borrow of `buffer` ends before the move below.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    let reader = Reader::new(Bytes::from(buffer));
    let row_iter = reader.read_rows().unwrap();
    let read_rows = row_iter.collect::<Result<Vec<_>>>().unwrap();

    // Same row count, then row-by-row equality in written order.
    assert_eq!(read_rows.len(), rows.len());
    rows.iter()
        .zip(&read_rows)
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}
101
+
102
/// Round-trips decimal values sitting at the extremes of three
/// precision/scale combinations — (5, 2), (38, 0) and (38, 38) — and checks
/// every row is read back equal to what was written.
#[test]
fn test_decimal_precision_boundaries() {
    // Largest magnitude that fits in 38 decimal digits (thirty-eight 9s).
    const MAX_38_DIGITS: i128 = 99999999999999999999999999999999999999;

    // All three columns are required primitives that differ only in name and type.
    let decimal_field = |name: &str, primitive_type: PrimitiveType| SchemaNode::Primitive {
        name: name.to_string(),
        primitive_type,
        nullable: false,
        format: None,
    };

    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![
            decimal_field("dec_5_2", PrimitiveType::Decimal128(5, 2)),
            decimal_field("dec_38_0", PrimitiveType::Decimal128(38, 0)),
            decimal_field("dec_38_38", PrimitiveType::Decimal128(38, 38)),
        ],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    let rows = vec![
        // Maximum values
        vec![
            ParquetValue::Decimal128(99999, 2),          // 999.99 (max for 5,2)
            ParquetValue::Decimal128(MAX_38_DIGITS, 0),  // Max 38 digits
            ParquetValue::Decimal128(MAX_38_DIGITS, 38), // 0.99999... (38 9s after decimal)
        ],
        // Minimum values
        vec![
            ParquetValue::Decimal128(-99999, 2),          // -999.99 (min for 5,2)
            ParquetValue::Decimal128(-MAX_38_DIGITS, 0),  // Min 38 digits
            ParquetValue::Decimal128(-MAX_38_DIGITS, 38), // -0.99999...
        ],
        // Zero at every scale
        vec![
            ParquetValue::Decimal128(0, 2),  // 0.00
            ParquetValue::Decimal128(0, 0),  // 0
            ParquetValue::Decimal128(0, 38), // 0.00000... (38 zeros)
        ],
        // Smallest positive unscaled value at every scale
        vec![
            ParquetValue::Decimal128(1, 2),  // 0.01 (smallest positive for 5,2)
            ParquetValue::Decimal128(1, 0),  // 1
            ParquetValue::Decimal128(1, 38), // 0.00000...01 (37 zeros then 1)
        ],
    ];

    // Scope the writer so its mutable borrow of `buffer` ends before the move below.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    let reader = Reader::new(Bytes::from(buffer));
    let row_iter = reader.read_rows().unwrap();
    let read_rows = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(read_rows.len(), rows.len());
    rows.iter()
        .zip(&read_rows)
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}
177
+
178
/// Round-trips map values through a required `map<string, int32>` column,
/// including a map in which the same key appears several times.
///
/// The test covers three rows: a map with repeated keys, an empty map, and a
/// single-entry map. It checks that no entry is dropped from the map with
/// repeated keys, and that every row is read back equal to what was written
/// (keys, values and entry order), matching the per-row comparison used by the
/// other round-trip tests in this file.
#[test]
fn test_map_with_duplicate_keys() {
    let key_field = SchemaNode::Primitive {
        name: "key".to_string(),
        primitive_type: PrimitiveType::String,
        nullable: false,
        format: None,
    };
    let value_field = SchemaNode::Primitive {
        name: "value".to_string(),
        primitive_type: PrimitiveType::Int32,
        nullable: false,
        format: None,
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![SchemaNode::Map {
            name: "map_field".to_string(),
            nullable: false,
            key: Box::new(key_field),
            value: Box::new(value_field),
        }],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    // Builds one (key, value) map entry.
    let entry = |key: &str, value: i32| {
        (
            ParquetValue::String(Arc::from(key)),
            ParquetValue::Int32(value),
        )
    };

    let rows = vec![
        // Map with duplicate keys: "key1" appears three times.
        vec![ParquetValue::Map(vec![
            entry("key1", 1),
            entry("key2", 2),
            entry("key1", 3), // Duplicate key
            entry("key1", 4), // Another duplicate
        ])],
        // Empty map
        vec![ParquetValue::Map(vec![])],
        // Single entry
        vec![ParquetValue::Map(vec![entry("only", 42)])],
    ];

    // Scope the writer so its mutable borrow of `buffer` ends before the move below.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    let reader = Reader::new(Bytes::from(buffer));
    let read_rows: Vec<_> = reader
        .read_rows()
        .unwrap()
        .collect::<Result<Vec<_>>>()
        .unwrap();

    assert_eq!(read_rows.len(), rows.len());

    // No entry may be dropped or merged: all 4 entries, duplicates included.
    match &read_rows[0][0] {
        ParquetValue::Map(entries) => {
            assert_eq!(entries.len(), 4);
        }
        _ => panic!("Expected map"),
    }

    // The count alone would not catch a wrong key, value or order, and would
    // leave the empty and single-entry rows unchecked, so compare every row.
    for (expected, actual) in rows.iter().zip(read_rows.iter()) {
        assert_eq!(expected, actual);
    }
}
260
+
261
/// Round-trips a required `list<list<int32>>` column whose rows mix empty
/// inner lists, non-empty inner lists and an empty outer list, and checks each
/// row is read back equal to what was written.
#[test]
fn test_list_of_empty_lists() {
    // Schema built bottom-up: int32 item -> inner list -> outer list -> root struct.
    let int_item = SchemaNode::Primitive {
        name: "value".to_string(),
        primitive_type: PrimitiveType::Int32,
        nullable: false,
        format: None,
    };
    let inner_list = SchemaNode::List {
        name: "inner_list".to_string(),
        nullable: false,
        item: Box::new(int_item),
    };
    let outer_list = SchemaNode::List {
        name: "nested_lists".to_string(),
        nullable: false,
        item: Box::new(inner_list),
    };
    let root = SchemaNode::Struct {
        name: "root".to_string(),
        nullable: false,
        fields: vec![outer_list],
    };
    let schema = SchemaBuilder::new().with_root(root).build().unwrap();

    // Helpers for the inner lists.
    let empty = || ParquetValue::List(Vec::new());
    let ints = |values: &[i32]| {
        ParquetValue::List(values.iter().copied().map(ParquetValue::Int32).collect())
    };

    let rows = vec![
        // List containing empty lists
        vec![ParquetValue::List(vec![empty(), empty(), empty()])],
        // List with mix of empty and non-empty
        vec![ParquetValue::List(vec![ints(&[1, 2]), empty(), ints(&[3])])],
        // Empty outer list
        vec![empty()],
    ];

    // Scope the writer so its mutable borrow of `buffer` ends before the move below.
    let mut buffer = Vec::new();
    {
        let mut writer = Writer::new(&mut buffer, schema).unwrap();
        writer.write_rows(rows.clone()).unwrap();
        writer.close().unwrap();
    }

    let reader = Reader::new(Bytes::from(buffer));
    let row_iter = reader.read_rows().unwrap();
    let read_rows = row_iter.collect::<Result<Vec<_>>>().unwrap();

    assert_eq!(read_rows.len(), rows.len());
    rows.iter()
        .zip(&read_rows)
        .for_each(|(expected, actual)| assert_eq!(expected, actual));
}