parquet 0.5.5 → 0.5.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +58 -44
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/reader/unified/mod.rs +55 -20
- data/ext/parquet/src/types/mod.rs +2 -0
- data/ext/parquet/src/types/record_types.rs +186 -13
- data/ext/parquet/src/types/schema_converter.rs +14 -75
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8a79e74af0419282904a0041c09509520f64ce1e504e133237f4b87697dce14
|
4
|
+
data.tar.gz: 63391ffff73907caccc142f37550e85c12826f302f00ac726f826af391f8d8cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: cddb7c6711e7e49ea785f6c0ab5ae3c40181756ad0e3fc23f298c291b725b178fdfbe5a8430fd9be10591b09e1b963255cb50637743054fe2173c9798e1e8bcc
|
7
|
+
data.tar.gz: 927a112ff1994800b3ed989f5000ed2a43438cebff886a545d0dd22018731b042b9052ead5b14983faf50fffc593cdd7512dd764bfed4de8ffa7781e6f2fda1a
|
data/Cargo.lock
CHANGED
@@ -63,9 +63,8 @@ dependencies = [
|
|
63
63
|
|
64
64
|
[[package]]
|
65
65
|
name = "arrow-array"
|
66
|
-
version = "
|
67
|
-
source = "
|
68
|
-
checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a"
|
66
|
+
version = "55.1.0"
|
67
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
69
68
|
dependencies = [
|
70
69
|
"ahash",
|
71
70
|
"arrow-buffer",
|
@@ -79,9 +78,8 @@ dependencies = [
|
|
79
78
|
|
80
79
|
[[package]]
|
81
80
|
name = "arrow-buffer"
|
82
|
-
version = "
|
83
|
-
source = "
|
84
|
-
checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a"
|
81
|
+
version = "55.1.0"
|
82
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
85
83
|
dependencies = [
|
86
84
|
"bytes",
|
87
85
|
"half",
|
@@ -90,9 +88,8 @@ dependencies = [
|
|
90
88
|
|
91
89
|
[[package]]
|
92
90
|
name = "arrow-cast"
|
93
|
-
version = "
|
94
|
-
source = "
|
95
|
-
checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee"
|
91
|
+
version = "55.1.0"
|
92
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
96
93
|
dependencies = [
|
97
94
|
"arrow-array",
|
98
95
|
"arrow-buffer",
|
@@ -110,9 +107,8 @@ dependencies = [
|
|
110
107
|
|
111
108
|
[[package]]
|
112
109
|
name = "arrow-data"
|
113
|
-
version = "
|
114
|
-
source = "
|
115
|
-
checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83"
|
110
|
+
version = "55.1.0"
|
111
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
116
112
|
dependencies = [
|
117
113
|
"arrow-buffer",
|
118
114
|
"arrow-schema",
|
@@ -122,9 +118,8 @@ dependencies = [
|
|
122
118
|
|
123
119
|
[[package]]
|
124
120
|
name = "arrow-ipc"
|
125
|
-
version = "
|
126
|
-
source = "
|
127
|
-
checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6"
|
121
|
+
version = "55.1.0"
|
122
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
128
123
|
dependencies = [
|
129
124
|
"arrow-array",
|
130
125
|
"arrow-buffer",
|
@@ -135,15 +130,13 @@ dependencies = [
|
|
135
130
|
|
136
131
|
[[package]]
|
137
132
|
name = "arrow-schema"
|
138
|
-
version = "
|
139
|
-
source = "
|
140
|
-
checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735"
|
133
|
+
version = "55.1.0"
|
134
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
141
135
|
|
142
136
|
[[package]]
|
143
137
|
name = "arrow-select"
|
144
|
-
version = "
|
145
|
-
source = "
|
146
|
-
checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539"
|
138
|
+
version = "55.1.0"
|
139
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
147
140
|
dependencies = [
|
148
141
|
"ahash",
|
149
142
|
"arrow-array",
|
@@ -180,7 +173,7 @@ version = "0.69.5"
|
|
180
173
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
181
174
|
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
|
182
175
|
dependencies = [
|
183
|
-
"bitflags
|
176
|
+
"bitflags",
|
184
177
|
"cexpr",
|
185
178
|
"clang-sys",
|
186
179
|
"itertools 0.12.1",
|
@@ -194,12 +187,6 @@ dependencies = [
|
|
194
187
|
"syn",
|
195
188
|
]
|
196
189
|
|
197
|
-
[[package]]
|
198
|
-
name = "bitflags"
|
199
|
-
version = "1.3.2"
|
200
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
201
|
-
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
202
|
-
|
203
190
|
[[package]]
|
204
191
|
name = "bitflags"
|
205
192
|
version = "2.8.0"
|
@@ -208,9 +195,9 @@ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
|
|
208
195
|
|
209
196
|
[[package]]
|
210
197
|
name = "brotli"
|
211
|
-
version = "
|
198
|
+
version = "8.0.1"
|
212
199
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
213
|
-
checksum = "
|
200
|
+
checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d"
|
214
201
|
dependencies = [
|
215
202
|
"alloc-no-stdlib",
|
216
203
|
"alloc-stdlib",
|
@@ -219,9 +206,9 @@ dependencies = [
|
|
219
206
|
|
220
207
|
[[package]]
|
221
208
|
name = "brotli-decompressor"
|
222
|
-
version = "
|
209
|
+
version = "5.0.0"
|
223
210
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
224
|
-
checksum = "
|
211
|
+
checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
|
225
212
|
dependencies = [
|
226
213
|
"alloc-no-stdlib",
|
227
214
|
"alloc-stdlib",
|
@@ -359,11 +346,11 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
|
359
346
|
|
360
347
|
[[package]]
|
361
348
|
name = "flatbuffers"
|
362
|
-
version = "
|
349
|
+
version = "25.2.10"
|
363
350
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
364
|
-
checksum = "
|
351
|
+
checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1"
|
365
352
|
dependencies = [
|
366
|
-
"bitflags
|
353
|
+
"bitflags",
|
367
354
|
"rustc_version",
|
368
355
|
]
|
369
356
|
|
@@ -374,6 +361,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
374
361
|
checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
|
375
362
|
dependencies = [
|
376
363
|
"crc32fast",
|
364
|
+
"libz-rs-sys",
|
377
365
|
"miniz_oxide",
|
378
366
|
]
|
379
367
|
|
@@ -652,6 +640,15 @@ dependencies = [
|
|
652
640
|
"libc",
|
653
641
|
]
|
654
642
|
|
643
|
+
[[package]]
|
644
|
+
name = "libz-rs-sys"
|
645
|
+
version = "0.4.2"
|
646
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
647
|
+
checksum = "902bc563b5d65ad9bba616b490842ef0651066a1a1dc3ce1087113ffcb873c8d"
|
648
|
+
dependencies = [
|
649
|
+
"zlib-rs",
|
650
|
+
]
|
651
|
+
|
655
652
|
[[package]]
|
656
653
|
name = "linux-raw-sys"
|
657
654
|
version = "0.4.15"
|
@@ -670,7 +667,7 @@ version = "0.11.3"
|
|
670
667
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
671
668
|
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
|
672
669
|
dependencies = [
|
673
|
-
"twox-hash",
|
670
|
+
"twox-hash 1.6.3",
|
674
671
|
]
|
675
672
|
|
676
673
|
[[package]]
|
@@ -840,20 +837,20 @@ dependencies = [
|
|
840
837
|
"magnus",
|
841
838
|
"mimalloc",
|
842
839
|
"num",
|
843
|
-
"parquet
|
840
|
+
"parquet 55.1.0",
|
844
841
|
"rand",
|
845
842
|
"rb-sys",
|
846
843
|
"rb-sys-env 0.2.2",
|
847
844
|
"simdutf8",
|
848
845
|
"tempfile",
|
849
846
|
"thiserror",
|
847
|
+
"uuid",
|
850
848
|
]
|
851
849
|
|
852
850
|
[[package]]
|
853
851
|
name = "parquet"
|
854
|
-
version = "
|
855
|
-
source = "
|
856
|
-
checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb"
|
852
|
+
version = "55.1.0"
|
853
|
+
source = "git+https://github.com/apache/arrow-rs?branch=main#e9df239980baa6d0f7eb4384eb01078bdd9b1701"
|
857
854
|
dependencies = [
|
858
855
|
"ahash",
|
859
856
|
"arrow-array",
|
@@ -879,9 +876,8 @@ dependencies = [
|
|
879
876
|
"simdutf8",
|
880
877
|
"snap",
|
881
878
|
"thrift",
|
882
|
-
"twox-hash",
|
879
|
+
"twox-hash 2.1.0",
|
883
880
|
"zstd",
|
884
|
-
"zstd-sys",
|
885
881
|
]
|
886
882
|
|
887
883
|
[[package]]
|
@@ -1055,7 +1051,7 @@ version = "0.38.44"
|
|
1055
1051
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1056
1052
|
checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
|
1057
1053
|
dependencies = [
|
1058
|
-
"bitflags
|
1054
|
+
"bitflags",
|
1059
1055
|
"errno",
|
1060
1056
|
"libc",
|
1061
1057
|
"linux-raw-sys",
|
@@ -1223,12 +1219,24 @@ dependencies = [
|
|
1223
1219
|
"static_assertions",
|
1224
1220
|
]
|
1225
1221
|
|
1222
|
+
[[package]]
|
1223
|
+
name = "twox-hash"
|
1224
|
+
version = "2.1.0"
|
1225
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1226
|
+
checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
|
1227
|
+
|
1226
1228
|
[[package]]
|
1227
1229
|
name = "unicode-ident"
|
1228
1230
|
version = "1.0.17"
|
1229
1231
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1230
1232
|
checksum = "00e2473a93778eb0bad35909dff6a10d28e63f792f16ed15e404fca9d5eeedbe"
|
1231
1233
|
|
1234
|
+
[[package]]
|
1235
|
+
name = "uuid"
|
1236
|
+
version = "1.16.0"
|
1237
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1238
|
+
checksum = "458f7a779bf54acc9f347480ac654f68407d3aab21269a6e3c9f922acd9e2da9"
|
1239
|
+
|
1232
1240
|
[[package]]
|
1233
1241
|
name = "version_check"
|
1234
1242
|
version = "0.9.5"
|
@@ -1402,7 +1410,7 @@ version = "0.33.0"
|
|
1402
1410
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1403
1411
|
checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
|
1404
1412
|
dependencies = [
|
1405
|
-
"bitflags
|
1413
|
+
"bitflags",
|
1406
1414
|
]
|
1407
1415
|
|
1408
1416
|
[[package]]
|
@@ -1446,6 +1454,12 @@ dependencies = [
|
|
1446
1454
|
"syn",
|
1447
1455
|
]
|
1448
1456
|
|
1457
|
+
[[package]]
|
1458
|
+
name = "zlib-rs"
|
1459
|
+
version = "0.4.2"
|
1460
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1461
|
+
checksum = "8b20717f0917c908dc63de2e44e97f1e6b126ca58d0e391cee86d504eb8fbd05"
|
1462
|
+
|
1449
1463
|
[[package]]
|
1450
1464
|
name = "zstd"
|
1451
1465
|
version = "0.13.3"
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -11,20 +11,21 @@ rb-sys-env = "^0.2"
|
|
11
11
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
|
-
arrow-array = "
|
15
|
-
arrow-schema = "
|
14
|
+
arrow-array = { git = "https://github.com/apache/arrow-rs", branch = "main" }
|
15
|
+
arrow-schema = { git = "https://github.com/apache/arrow-rs", branch = "main" }
|
16
16
|
bytes = "^1.9"
|
17
17
|
either = "1.9"
|
18
18
|
itertools = "^0.14"
|
19
19
|
jiff = "0.2"
|
20
20
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
21
|
-
parquet = {
|
21
|
+
parquet = { git = "https://github.com/apache/arrow-rs", branch = "main", features = ["json"] }
|
22
22
|
rand = "0.9"
|
23
23
|
rb-sys = "^0.9"
|
24
24
|
simdutf8 = "0.1.5"
|
25
25
|
tempfile = "^3.15"
|
26
26
|
thiserror = "2.0"
|
27
27
|
num = "0.4.3"
|
28
|
+
uuid = "1.16.0"
|
28
29
|
|
29
30
|
[target.'cfg(target_os = "linux")'.dependencies]
|
30
31
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
@@ -2,8 +2,8 @@ use crate::header_cache::StringCache;
|
|
2
2
|
use crate::logger::RubyLogger;
|
3
3
|
use crate::types::TryIntoValue;
|
4
4
|
use crate::{
|
5
|
-
create_column_enumerator, create_row_enumerator,
|
6
|
-
|
5
|
+
create_column_enumerator, create_row_enumerator, ColumnEnumeratorArgs, ColumnRecord,
|
6
|
+
ParquetField, ParquetGemError, ParquetValueVec, ParserResultType, RowEnumeratorArgs, RowRecord,
|
7
7
|
};
|
8
8
|
use ahash::RandomState;
|
9
9
|
use either::Either;
|
@@ -13,10 +13,10 @@ use std::collections::HashMap;
|
|
13
13
|
use std::rc::Rc;
|
14
14
|
use std::sync::OnceLock;
|
15
15
|
|
16
|
-
use crate::types::ArrayWrapper;
|
17
16
|
use super::common::{
|
18
17
|
create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
|
19
18
|
};
|
19
|
+
use crate::types::ArrayWrapper;
|
20
20
|
|
21
21
|
/// A unified parser configuration that can be used for both row and column parsing
|
22
22
|
pub enum ParserType {
|
@@ -53,11 +53,11 @@ pub fn parse_parquet_unified(
|
|
53
53
|
} = args;
|
54
54
|
|
55
55
|
// Initialize the logger if provided
|
56
|
-
let ruby_logger = RubyLogger::new(&ruby, logger
|
57
|
-
|
56
|
+
let ruby_logger = RubyLogger::new(&ruby, logger)?;
|
57
|
+
|
58
58
|
// Clone values for the closure to avoid move issues
|
59
59
|
let columns_clone = columns.clone();
|
60
|
-
|
60
|
+
|
61
61
|
// Determine if we're handling rows or columns for enumerator creation
|
62
62
|
match &parser_type {
|
63
63
|
ParserType::Row { strict } => {
|
@@ -75,13 +75,13 @@ pub fn parse_parquet_unified(
|
|
75
75
|
})? {
|
76
76
|
return Ok(enum_value);
|
77
77
|
}
|
78
|
-
}
|
78
|
+
}
|
79
79
|
ParserType::Column { batch_size, strict } => {
|
80
80
|
// For column-based parsing, log the batch size if present
|
81
81
|
if let Some(ref bs) = batch_size {
|
82
82
|
ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
|
83
83
|
}
|
84
|
-
|
84
|
+
|
85
85
|
// Handle block or create column enumerator
|
86
86
|
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
87
87
|
create_column_enumerator(ColumnEnumeratorArgs {
|
@@ -102,19 +102,34 @@ pub fn parse_parquet_unified(
|
|
102
102
|
|
103
103
|
// Open the Parquet source
|
104
104
|
let source = open_parquet_source(ruby.clone(), to_read)?;
|
105
|
-
|
105
|
+
|
106
106
|
// Based on the parser type, handle the data differently
|
107
107
|
match parser_type {
|
108
108
|
ParserType::Row { strict } => {
|
109
109
|
// Handle row-based parsing
|
110
|
-
process_row_data(
|
111
|
-
|
110
|
+
process_row_data(
|
111
|
+
ruby.clone(),
|
112
|
+
source,
|
113
|
+
&columns,
|
114
|
+
result_type,
|
115
|
+
strict,
|
116
|
+
&ruby_logger,
|
117
|
+
)?;
|
118
|
+
}
|
112
119
|
ParserType::Column { batch_size, strict } => {
|
113
120
|
// Handle column-based parsing
|
114
|
-
process_column_data(
|
121
|
+
process_column_data(
|
122
|
+
ruby.clone(),
|
123
|
+
source,
|
124
|
+
&columns,
|
125
|
+
result_type,
|
126
|
+
batch_size,
|
127
|
+
strict,
|
128
|
+
&ruby_logger,
|
129
|
+
)?;
|
115
130
|
}
|
116
131
|
}
|
117
|
-
|
132
|
+
|
118
133
|
Ok(ruby.qnil().into_value_with(&ruby))
|
119
134
|
}
|
120
135
|
|
@@ -129,7 +144,7 @@ fn process_row_data(
|
|
129
144
|
) -> Result<(), ParquetGemError> {
|
130
145
|
use parquet::file::reader::{FileReader, SerializedFileReader};
|
131
146
|
use parquet::record::reader::RowIter as ParquetRowIter;
|
132
|
-
|
147
|
+
|
133
148
|
// Create the row-based reader
|
134
149
|
let reader: Box<dyn FileReader> = match source {
|
135
150
|
Either::Left(file) => {
|
@@ -174,8 +189,19 @@ fn process_row_data(
|
|
174
189
|
|
175
190
|
let mut map =
|
176
191
|
HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
|
177
|
-
for (i, (_, v)
|
178
|
-
|
192
|
+
for (i, ((_, v), t)) in
|
193
|
+
row.get_column_iter().zip(schema.get_fields()).enumerate()
|
194
|
+
{
|
195
|
+
let type_info = t.get_basic_info();
|
196
|
+
map.insert(
|
197
|
+
headers[i],
|
198
|
+
ParquetField {
|
199
|
+
field: v.clone(),
|
200
|
+
converted_type: type_info.converted_type(),
|
201
|
+
logical_type: type_info.logical_type().clone(),
|
202
|
+
strict,
|
203
|
+
},
|
204
|
+
);
|
179
205
|
}
|
180
206
|
map
|
181
207
|
})
|
@@ -193,8 +219,14 @@ fn process_row_data(
|
|
193
219
|
row.map(|row| {
|
194
220
|
let column_count = row.get_column_iter().count();
|
195
221
|
let mut vec = Vec::with_capacity(column_count);
|
196
|
-
for (_, v) in row.get_column_iter() {
|
197
|
-
|
222
|
+
for ((_, v), t) in row.get_column_iter().zip(schema.get_fields()) {
|
223
|
+
let type_info = t.get_basic_info();
|
224
|
+
vec.push(ParquetField {
|
225
|
+
field: v.clone(),
|
226
|
+
converted_type: type_info.converted_type(),
|
227
|
+
logical_type: type_info.logical_type().clone(),
|
228
|
+
strict,
|
229
|
+
});
|
198
230
|
}
|
199
231
|
vec
|
200
232
|
})
|
@@ -309,7 +341,10 @@ fn process_column_data(
|
|
309
341
|
}
|
310
342
|
|
311
343
|
/// Helper function to create a projection schema
|
312
|
-
fn create_projection_schema(
|
344
|
+
fn create_projection_schema(
|
345
|
+
schema: &parquet::schema::types::Type,
|
346
|
+
columns: &[String],
|
347
|
+
) -> parquet::schema::types::Type {
|
313
348
|
if let parquet::schema::types::Type::GroupType { fields, .. } = schema {
|
314
349
|
let projected_fields: Vec<std::sync::Arc<parquet::schema::types::Type>> = fields
|
315
350
|
.iter()
|
@@ -325,4 +360,4 @@ fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[St
|
|
325
360
|
// Return original schema if not a group type
|
326
361
|
schema.clone()
|
327
362
|
}
|
328
|
-
}
|
363
|
+
}
|
@@ -1,7 +1,10 @@
|
|
1
1
|
use std::sync::OnceLock;
|
2
2
|
|
3
3
|
use itertools::Itertools;
|
4
|
-
use parquet::
|
4
|
+
use parquet::{
|
5
|
+
basic::{ConvertedType, LogicalType},
|
6
|
+
data_type::AsBytes,
|
7
|
+
};
|
5
8
|
|
6
9
|
use super::*;
|
7
10
|
|
@@ -44,7 +47,13 @@ pub enum ColumnRecord<S: BuildHasher + Default> {
|
|
44
47
|
}
|
45
48
|
|
46
49
|
#[derive(Debug)]
|
47
|
-
pub struct ParquetField
|
50
|
+
pub struct ParquetField {
|
51
|
+
pub field: Field,
|
52
|
+
#[allow(dead_code)]
|
53
|
+
pub converted_type: ConvertedType,
|
54
|
+
pub logical_type: Option<LogicalType>,
|
55
|
+
pub strict: bool,
|
56
|
+
}
|
48
57
|
|
49
58
|
impl<S: BuildHasher + Default> TryIntoValue for RowRecord<S> {
|
50
59
|
fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
|
@@ -158,7 +167,7 @@ pub trait TryIntoValue {
|
|
158
167
|
|
159
168
|
impl TryIntoValue for ParquetField {
|
160
169
|
fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
|
161
|
-
match self.
|
170
|
+
match self.field {
|
162
171
|
Field::Null => Ok(handle.qnil().as_value()),
|
163
172
|
Field::Bool(b) => Ok(b.into_value_with(handle)),
|
164
173
|
Field::Short(s) => Ok(s.into_value_with(handle)),
|
@@ -172,7 +181,7 @@ impl TryIntoValue for ParquetField {
|
|
172
181
|
Field::Float(f) => Ok(f.into_value_with(handle)),
|
173
182
|
Field::Double(d) => Ok(d.into_value_with(handle)),
|
174
183
|
Field::Str(s) => {
|
175
|
-
if self.
|
184
|
+
if self.strict {
|
176
185
|
Ok(simdutf8::basic::from_utf8(s.as_bytes())
|
177
186
|
.map_err(ParquetGemError::Utf8Error)
|
178
187
|
.map(|s| s.into_value_with(handle))?)
|
@@ -182,7 +191,15 @@ impl TryIntoValue for ParquetField {
|
|
182
191
|
}
|
183
192
|
}
|
184
193
|
Field::Byte(b) => Ok(b.into_value_with(handle)),
|
185
|
-
Field::Bytes(b) =>
|
194
|
+
Field::Bytes(b) => {
|
195
|
+
if matches!(self.logical_type, Some(parquet::basic::LogicalType::Uuid)) {
|
196
|
+
let bytes = b.as_bytes();
|
197
|
+
let uuid = uuid::Uuid::from_slice(bytes)?;
|
198
|
+
Ok(uuid.to_string().into_value_with(handle))
|
199
|
+
} else {
|
200
|
+
Ok(handle.str_from_slice(b.data()).as_value())
|
201
|
+
}
|
202
|
+
}
|
186
203
|
Field::Date(d) => {
|
187
204
|
let ts = jiff::Timestamp::from_second((d as i64) * 86400)?;
|
188
205
|
let formatted = ts.strftime("%Y-%m-%d").to_string();
|
@@ -206,7 +223,15 @@ impl TryIntoValue for ParquetField {
|
|
206
223
|
let elements = list.elements();
|
207
224
|
let ary = handle.ary_new_capa(elements.len());
|
208
225
|
elements.iter().try_for_each(|e| {
|
209
|
-
ary.push(
|
226
|
+
ary.push(
|
227
|
+
ParquetField {
|
228
|
+
field: e.clone(),
|
229
|
+
logical_type: e.to_logical_type(),
|
230
|
+
converted_type: e.to_converted_type(),
|
231
|
+
strict: self.strict,
|
232
|
+
}
|
233
|
+
.try_into_value_with(handle)?,
|
234
|
+
)?;
|
210
235
|
Ok::<_, ParquetGemError>(())
|
211
236
|
})?;
|
212
237
|
Ok(ary.into_value_with(handle))
|
@@ -220,8 +245,20 @@ impl TryIntoValue for ParquetField {
|
|
220
245
|
|
221
246
|
map.entries().iter().try_for_each(|(k, v)| {
|
222
247
|
hash.aset(
|
223
|
-
ParquetField
|
224
|
-
|
248
|
+
ParquetField {
|
249
|
+
field: k.clone(),
|
250
|
+
converted_type: k.to_converted_type(),
|
251
|
+
logical_type: k.to_logical_type(),
|
252
|
+
strict: self.strict,
|
253
|
+
}
|
254
|
+
.try_into_value_with(handle)?,
|
255
|
+
ParquetField {
|
256
|
+
field: v.clone(),
|
257
|
+
converted_type: v.to_converted_type(),
|
258
|
+
logical_type: v.to_logical_type(),
|
259
|
+
strict: self.strict,
|
260
|
+
}
|
261
|
+
.try_into_value_with(handle)?,
|
225
262
|
)?;
|
226
263
|
Ok::<_, ParquetGemError>(())
|
227
264
|
})?;
|
@@ -238,10 +275,32 @@ impl TryIntoValue for ParquetField {
|
|
238
275
|
format_decimal_with_i32_scale(unscaled, scale)
|
239
276
|
}
|
240
277
|
Decimal::Bytes { value, scale, .. } => {
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
278
|
+
match value.len() {
|
279
|
+
4 => {
|
280
|
+
// value is a byte array containing the bytes for an i32 value in big endian order
|
281
|
+
let casted = value.as_bytes()[..4].try_into()?;
|
282
|
+
let unscaled = i32::from_be_bytes(casted);
|
283
|
+
format_decimal_with_i32_scale(unscaled, scale)
|
284
|
+
}
|
285
|
+
8 => {
|
286
|
+
// value is a byte array containing the bytes for an i64 value in big endian order
|
287
|
+
let casted = value.as_bytes()[..8].try_into()?;
|
288
|
+
let unscaled = i64::from_be_bytes(casted);
|
289
|
+
format_decimal_with_i32_scale(unscaled, scale)
|
290
|
+
}
|
291
|
+
16 => {
|
292
|
+
// value is a byte array containing the bytes for an i128 value in big endian order
|
293
|
+
let casted = value.as_bytes()[..16].try_into()?;
|
294
|
+
let unscaled = i128::from_be_bytes(casted);
|
295
|
+
format_decimal_with_i32_scale(unscaled, scale)
|
296
|
+
}
|
297
|
+
_ => {
|
298
|
+
unimplemented!(
|
299
|
+
"Unsupported decimal byte array size: {}",
|
300
|
+
value.len()
|
301
|
+
);
|
302
|
+
}
|
303
|
+
}
|
245
304
|
}
|
246
305
|
};
|
247
306
|
|
@@ -256,7 +315,13 @@ impl TryIntoValue for ParquetField {
|
|
256
315
|
row.get_column_iter().try_for_each(|(k, v)| {
|
257
316
|
hash.aset(
|
258
317
|
k.clone().into_value_with(handle),
|
259
|
-
ParquetField
|
318
|
+
ParquetField {
|
319
|
+
field: v.clone(),
|
320
|
+
converted_type: v.to_converted_type(),
|
321
|
+
logical_type: v.to_logical_type(),
|
322
|
+
strict: self.strict,
|
323
|
+
}
|
324
|
+
.try_into_value_with(handle)?,
|
260
325
|
)?;
|
261
326
|
Ok::<_, ParquetGemError>(())
|
262
327
|
})?;
|
@@ -265,3 +330,111 @@ impl TryIntoValue for ParquetField {
|
|
265
330
|
}
|
266
331
|
}
|
267
332
|
}
|
333
|
+
|
334
|
+
trait ToTypeInfo {
|
335
|
+
fn to_converted_type(&self) -> ConvertedType;
|
336
|
+
fn to_logical_type(&self) -> Option<LogicalType>;
|
337
|
+
}
|
338
|
+
|
339
|
+
impl ToTypeInfo for &parquet::record::Field {
|
340
|
+
fn to_converted_type(&self) -> ConvertedType {
|
341
|
+
match self {
|
342
|
+
Field::Null => ConvertedType::NONE,
|
343
|
+
Field::Bool(_) => ConvertedType::INT_8,
|
344
|
+
Field::Byte(_) => ConvertedType::INT_8,
|
345
|
+
Field::Short(_) => ConvertedType::INT_16,
|
346
|
+
Field::Int(_) => ConvertedType::INT_32,
|
347
|
+
Field::Long(_) => ConvertedType::INT_64,
|
348
|
+
Field::UByte(_) => ConvertedType::UINT_8,
|
349
|
+
Field::UShort(_) => ConvertedType::UINT_16,
|
350
|
+
Field::UInt(_) => ConvertedType::UINT_32,
|
351
|
+
Field::ULong(_) => ConvertedType::UINT_64,
|
352
|
+
Field::Float16(_) => ConvertedType::NONE,
|
353
|
+
Field::Float(_) => ConvertedType::NONE,
|
354
|
+
Field::Double(_) => ConvertedType::NONE,
|
355
|
+
Field::Decimal(_) => ConvertedType::DECIMAL,
|
356
|
+
Field::Str(_) => ConvertedType::UTF8,
|
357
|
+
Field::Bytes(_) => ConvertedType::LIST,
|
358
|
+
Field::Date(_) => ConvertedType::DATE,
|
359
|
+
Field::TimestampMillis(_) => ConvertedType::TIMESTAMP_MILLIS,
|
360
|
+
Field::TimestampMicros(_) => ConvertedType::TIMESTAMP_MICROS,
|
361
|
+
Field::Group(_) => ConvertedType::NONE,
|
362
|
+
Field::ListInternal(_) => ConvertedType::LIST,
|
363
|
+
Field::MapInternal(_) => ConvertedType::MAP,
|
364
|
+
}
|
365
|
+
}
|
366
|
+
fn to_logical_type(&self) -> Option<LogicalType> {
|
367
|
+
Some(match self {
|
368
|
+
Field::Null => LogicalType::Unknown,
|
369
|
+
Field::Bool(_) => LogicalType::Integer {
|
370
|
+
bit_width: 1,
|
371
|
+
is_signed: false,
|
372
|
+
},
|
373
|
+
Field::Byte(_) => LogicalType::Integer {
|
374
|
+
bit_width: 8,
|
375
|
+
is_signed: false,
|
376
|
+
},
|
377
|
+
Field::Short(_) => LogicalType::Integer {
|
378
|
+
bit_width: 16,
|
379
|
+
is_signed: true,
|
380
|
+
},
|
381
|
+
Field::Int(_) => LogicalType::Integer {
|
382
|
+
bit_width: 32,
|
383
|
+
is_signed: true,
|
384
|
+
},
|
385
|
+
Field::Long(_) => LogicalType::Integer {
|
386
|
+
bit_width: 64,
|
387
|
+
is_signed: true,
|
388
|
+
},
|
389
|
+
Field::UByte(_) => LogicalType::Integer {
|
390
|
+
bit_width: 8,
|
391
|
+
is_signed: false,
|
392
|
+
},
|
393
|
+
Field::UShort(_) => LogicalType::Integer {
|
394
|
+
bit_width: 16,
|
395
|
+
is_signed: false,
|
396
|
+
},
|
397
|
+
Field::UInt(_) => LogicalType::Integer {
|
398
|
+
bit_width: 32,
|
399
|
+
is_signed: false,
|
400
|
+
},
|
401
|
+
Field::ULong(_) => LogicalType::Integer {
|
402
|
+
bit_width: 64,
|
403
|
+
is_signed: false,
|
404
|
+
},
|
405
|
+
Field::Float16(_) => LogicalType::Float16,
|
406
|
+
Field::Float(_) => LogicalType::Decimal {
|
407
|
+
scale: 7,
|
408
|
+
precision: 7,
|
409
|
+
},
|
410
|
+
Field::Double(_) => LogicalType::Decimal {
|
411
|
+
scale: 15,
|
412
|
+
precision: 15,
|
413
|
+
},
|
414
|
+
Field::Decimal(decimal) => LogicalType::Decimal {
|
415
|
+
scale: decimal.scale(),
|
416
|
+
precision: decimal.precision(),
|
417
|
+
},
|
418
|
+
Field::Str(_) => LogicalType::String,
|
419
|
+
Field::Bytes(b) => {
|
420
|
+
if b.data().len() == 16 && uuid::Uuid::from_slice(b.as_bytes()).is_ok() {
|
421
|
+
LogicalType::Uuid
|
422
|
+
} else {
|
423
|
+
LogicalType::Unknown
|
424
|
+
}
|
425
|
+
}
|
426
|
+
Field::Date(_) => LogicalType::Date,
|
427
|
+
Field::TimestampMillis(_) => LogicalType::Timestamp {
|
428
|
+
is_adjusted_to_u_t_c: true,
|
429
|
+
unit: parquet::basic::TimeUnit::MILLIS(parquet::format::MilliSeconds {}),
|
430
|
+
},
|
431
|
+
Field::TimestampMicros(_) => LogicalType::Timestamp {
|
432
|
+
is_adjusted_to_u_t_c: true,
|
433
|
+
unit: parquet::basic::TimeUnit::MICROS(parquet::format::MicroSeconds {}),
|
434
|
+
},
|
435
|
+
Field::Group(_) => LogicalType::Unknown,
|
436
|
+
Field::ListInternal(_) => LogicalType::List,
|
437
|
+
Field::MapInternal(_) => LogicalType::Map,
|
438
|
+
})
|
439
|
+
}
|
440
|
+
}
|
@@ -121,7 +121,8 @@ pub fn parse_legacy_schema(
|
|
121
121
|
ruby.exception_type_error(),
|
122
122
|
"Schema must be an array of field definitions or nil",
|
123
123
|
)
|
124
|
-
})
|
124
|
+
})?
|
125
|
+
.is_empty())
|
125
126
|
{
|
126
127
|
// If schema is nil or an empty array, we'll handle this in the caller
|
127
128
|
return Ok(Vec::new());
|
@@ -206,101 +207,39 @@ pub fn parse_legacy_schema(
|
|
206
207
|
|
207
208
|
// Handle decimal type with precision and scale
|
208
209
|
let mut type_result = PST::try_convert(type_str)?;
|
209
|
-
|
210
|
+
|
210
211
|
// If it's a decimal type and we have precision and scale, override the type
|
211
212
|
if let PST::Primitive(PrimitiveType::Decimal128(_, _)) = type_result {
|
212
|
-
|
213
|
-
let val: u8 = 18;
|
214
|
-
val.into_value_with(ruby)
|
215
|
-
});
|
216
|
-
let scale_value = scale.unwrap_or_else(|| {
|
217
|
-
let val: i8 = 2;
|
218
|
-
val.into_value_with(ruby)
|
219
|
-
});
|
220
|
-
|
221
|
-
let precision_u8 = u8::try_convert(precision_value).map_err(|_| {
|
222
|
-
MagnusError::new(
|
223
|
-
ruby.exception_type_error(),
|
224
|
-
"Invalid precision value for decimal type, expected a positive integer".to_string(),
|
225
|
-
)
|
226
|
-
})?;
|
227
|
-
|
228
|
-
// Validate precision is in a valid range
|
229
|
-
if precision_u8 < 1 {
|
230
|
-
return Err(MagnusError::new(
|
231
|
-
ruby.exception_arg_error(),
|
232
|
-
format!(
|
233
|
-
"Precision for decimal type must be at least 1, got {}",
|
234
|
-
precision_u8
|
235
|
-
),
|
236
|
-
));
|
237
|
-
}
|
238
|
-
|
239
|
-
if precision_u8 > 38 {
|
240
|
-
return Err(MagnusError::new(
|
241
|
-
ruby.exception_arg_error(),
|
242
|
-
format!(
|
243
|
-
"Precision for decimal type cannot exceed 38, got {}",
|
244
|
-
precision_u8
|
245
|
-
),
|
246
|
-
));
|
247
|
-
}
|
248
|
-
|
249
|
-
let scale_i8 = i8::try_convert(scale_value).map_err(|_| {
|
250
|
-
MagnusError::new(
|
251
|
-
ruby.exception_type_error(),
|
252
|
-
"Invalid scale value for decimal type, expected an integer".to_string(),
|
253
|
-
)
|
254
|
-
})?;
|
255
|
-
|
256
|
-
// Validate scale is in a valid range relative to precision
|
257
|
-
if scale_i8 < 0 {
|
258
|
-
return Err(MagnusError::new(
|
259
|
-
ruby.exception_arg_error(),
|
260
|
-
format!(
|
261
|
-
"Scale for decimal type cannot be negative, got {}",
|
262
|
-
scale_i8
|
263
|
-
),
|
264
|
-
));
|
265
|
-
}
|
266
|
-
|
267
|
-
if scale_i8 as u8 > precision_u8 {
|
268
|
-
return Err(MagnusError::new(
|
269
|
-
ruby.exception_arg_error(),
|
270
|
-
format!(
|
271
|
-
"Scale ({}) cannot be larger than precision ({}) for decimal type",
|
272
|
-
scale_i8, precision_u8
|
273
|
-
),
|
274
|
-
));
|
275
|
-
}
|
276
|
-
|
277
|
-
type_result = PST::Primitive(PrimitiveType::Decimal128(precision_u8, scale_i8));
|
213
|
+
// Do nothing
|
278
214
|
} else if let Some(type_name) = parse_string_or_symbol(ruby, type_str)? {
|
279
215
|
if type_name == "decimal" {
|
280
216
|
let precision_value = precision.unwrap_or_else(|| {
|
281
|
-
let val: u8 =
|
217
|
+
let val: u8 = 38;
|
282
218
|
val.into_value_with(ruby)
|
283
219
|
});
|
220
|
+
|
284
221
|
let scale_value = scale.unwrap_or_else(|| {
|
285
|
-
let val: i8 =
|
222
|
+
let val: i8 = 0;
|
286
223
|
val.into_value_with(ruby)
|
287
224
|
});
|
288
|
-
|
225
|
+
|
289
226
|
let precision_u8 = u8::try_convert(precision_value).map_err(|_| {
|
290
227
|
MagnusError::new(
|
291
228
|
ruby.exception_type_error(),
|
292
229
|
"Invalid precision value for decimal type, expected a positive integer".to_string(),
|
293
230
|
)
|
294
231
|
})?;
|
295
|
-
|
232
|
+
|
296
233
|
let scale_i8 = i8::try_convert(scale_value).map_err(|_| {
|
297
234
|
MagnusError::new(
|
298
235
|
ruby.exception_type_error(),
|
299
|
-
"Invalid scale value for decimal type, expected an integer"
|
236
|
+
"Invalid scale value for decimal type, expected an integer"
|
237
|
+
.to_string(),
|
300
238
|
)
|
301
239
|
})?;
|
302
|
-
|
303
|
-
type_result =
|
240
|
+
|
241
|
+
type_result =
|
242
|
+
PST::Primitive(PrimitiveType::Decimal128(precision_u8, scale_i8));
|
304
243
|
}
|
305
244
|
}
|
306
245
|
|
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-
|
11
|
+
date: 2025-05-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|