parquet 0.5.4 → 0.5.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +51 -44
- data/ext/parquet/Cargo.toml +3 -3
- data/ext/parquet/src/reader/mod.rs +2 -1
- data/ext/parquet/src/reader/parquet_column_reader.rs +15 -127
- data/ext/parquet/src/reader/parquet_row_reader.rs +14 -134
- data/ext/parquet/src/reader/unified/mod.rs +328 -0
- data/ext/parquet/src/types/parquet_value.rs +90 -16
- data/ext/parquet/src/types/record_types.rs +53 -7
- data/ext/parquet/src/types/schema_converter.rs +14 -75
- data/ext/parquet/src/types/type_conversion.rs +13 -11
- data/lib/parquet/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dc1d1eda7d71aa6336fbf6cc94789517439df3fab1852ec7d2e9d265e0c016c4
|
4
|
+
data.tar.gz: 6fff5321a31d3fe19a59a4f47add56222dbeb274bef7a068163b48757d65252d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ddd50f82df2b42cf844e379a7f07c0214e9aef925e7c43ec566b6b9f27be311676b6f887c163aa5d41d4523cd1d506266b15623205453bc8e08467c88e7c2b63
|
7
|
+
data.tar.gz: afb235ad09338d8c4cd59588dded3d312890c5d5d879b77040fcbf960be69653981fe5176cc591969a80ba54214d4c6a63cff96c36ceda7b9e00c75ba8e9e913
|
data/Cargo.lock
CHANGED
@@ -63,9 +63,8 @@ dependencies = [
|
|
63
63
|
|
64
64
|
[[package]]
|
65
65
|
name = "arrow-array"
|
66
|
-
version = "
|
67
|
-
source = "
|
68
|
-
checksum = "57a4a6d2896083cfbdf84a71a863b22460d0708f8206a8373c52e326cc72ea1a"
|
66
|
+
version = "55.1.0"
|
67
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
69
68
|
dependencies = [
|
70
69
|
"ahash",
|
71
70
|
"arrow-buffer",
|
@@ -79,9 +78,8 @@ dependencies = [
|
|
79
78
|
|
80
79
|
[[package]]
|
81
80
|
name = "arrow-buffer"
|
82
|
-
version = "
|
83
|
-
source = "
|
84
|
-
checksum = "cef870583ce5e4f3b123c181706f2002fb134960f9a911900f64ba4830c7a43a"
|
81
|
+
version = "55.1.0"
|
82
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
85
83
|
dependencies = [
|
86
84
|
"bytes",
|
87
85
|
"half",
|
@@ -90,9 +88,8 @@ dependencies = [
|
|
90
88
|
|
91
89
|
[[package]]
|
92
90
|
name = "arrow-cast"
|
93
|
-
version = "
|
94
|
-
source = "
|
95
|
-
checksum = "1ac7eba5a987f8b4a7d9629206ba48e19a1991762795bbe5d08497b7736017ee"
|
91
|
+
version = "55.1.0"
|
92
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
96
93
|
dependencies = [
|
97
94
|
"arrow-array",
|
98
95
|
"arrow-buffer",
|
@@ -110,9 +107,8 @@ dependencies = [
|
|
110
107
|
|
111
108
|
[[package]]
|
112
109
|
name = "arrow-data"
|
113
|
-
version = "
|
114
|
-
source = "
|
115
|
-
checksum = "b095e8a4f3c309544935d53e04c3bfe4eea4e71c3de6fe0416d1f08bb4441a83"
|
110
|
+
version = "55.1.0"
|
111
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
116
112
|
dependencies = [
|
117
113
|
"arrow-buffer",
|
118
114
|
"arrow-schema",
|
@@ -122,9 +118,8 @@ dependencies = [
|
|
122
118
|
|
123
119
|
[[package]]
|
124
120
|
name = "arrow-ipc"
|
125
|
-
version = "
|
126
|
-
source = "
|
127
|
-
checksum = "65c63da4afedde2b25ef69825cd4663ca76f78f79ffe2d057695742099130ff6"
|
121
|
+
version = "55.1.0"
|
122
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
128
123
|
dependencies = [
|
129
124
|
"arrow-array",
|
130
125
|
"arrow-buffer",
|
@@ -135,15 +130,13 @@ dependencies = [
|
|
135
130
|
|
136
131
|
[[package]]
|
137
132
|
name = "arrow-schema"
|
138
|
-
version = "
|
139
|
-
source = "
|
140
|
-
checksum = "0f40f6be8f78af1ab610db7d9b236e21d587b7168e368a36275d2e5670096735"
|
133
|
+
version = "55.1.0"
|
134
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
141
135
|
|
142
136
|
[[package]]
|
143
137
|
name = "arrow-select"
|
144
|
-
version = "
|
145
|
-
source = "
|
146
|
-
checksum = "ac265273864a820c4a179fc67182ccc41ea9151b97024e1be956f0f2369c2539"
|
138
|
+
version = "55.1.0"
|
139
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
147
140
|
dependencies = [
|
148
141
|
"ahash",
|
149
142
|
"arrow-array",
|
@@ -180,7 +173,7 @@ version = "0.69.5"
|
|
180
173
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
181
174
|
checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
|
182
175
|
dependencies = [
|
183
|
-
"bitflags
|
176
|
+
"bitflags",
|
184
177
|
"cexpr",
|
185
178
|
"clang-sys",
|
186
179
|
"itertools 0.12.1",
|
@@ -194,12 +187,6 @@ dependencies = [
|
|
194
187
|
"syn",
|
195
188
|
]
|
196
189
|
|
197
|
-
[[package]]
|
198
|
-
name = "bitflags"
|
199
|
-
version = "1.3.2"
|
200
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
201
|
-
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
202
|
-
|
203
190
|
[[package]]
|
204
191
|
name = "bitflags"
|
205
192
|
version = "2.8.0"
|
@@ -208,9 +195,9 @@ checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
|
|
208
195
|
|
209
196
|
[[package]]
|
210
197
|
name = "brotli"
|
211
|
-
version = "
|
198
|
+
version = "8.0.1"
|
212
199
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
213
|
-
checksum = "
|
200
|
+
checksum = "9991eea70ea4f293524138648e41ee89b0b2b12ddef3b255effa43c8056e0e0d"
|
214
201
|
dependencies = [
|
215
202
|
"alloc-no-stdlib",
|
216
203
|
"alloc-stdlib",
|
@@ -219,9 +206,9 @@ dependencies = [
|
|
219
206
|
|
220
207
|
[[package]]
|
221
208
|
name = "brotli-decompressor"
|
222
|
-
version = "
|
209
|
+
version = "5.0.0"
|
223
210
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
224
|
-
checksum = "
|
211
|
+
checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03"
|
225
212
|
dependencies = [
|
226
213
|
"alloc-no-stdlib",
|
227
214
|
"alloc-stdlib",
|
@@ -359,11 +346,11 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
|
359
346
|
|
360
347
|
[[package]]
|
361
348
|
name = "flatbuffers"
|
362
|
-
version = "
|
349
|
+
version = "25.2.10"
|
363
350
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
364
|
-
checksum = "
|
351
|
+
checksum = "1045398c1bfd89168b5fd3f1fc11f6e70b34f6f66300c87d44d3de849463abf1"
|
365
352
|
dependencies = [
|
366
|
-
"bitflags
|
353
|
+
"bitflags",
|
367
354
|
"rustc_version",
|
368
355
|
]
|
369
356
|
|
@@ -374,6 +361,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
374
361
|
checksum = "11faaf5a5236997af9848be0bef4db95824b1d534ebc64d0f0c6cf3e67bd38dc"
|
375
362
|
dependencies = [
|
376
363
|
"crc32fast",
|
364
|
+
"libz-rs-sys",
|
377
365
|
"miniz_oxide",
|
378
366
|
]
|
379
367
|
|
@@ -652,6 +640,15 @@ dependencies = [
|
|
652
640
|
"libc",
|
653
641
|
]
|
654
642
|
|
643
|
+
[[package]]
|
644
|
+
name = "libz-rs-sys"
|
645
|
+
version = "0.4.2"
|
646
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
647
|
+
checksum = "902bc563b5d65ad9bba616b490842ef0651066a1a1dc3ce1087113ffcb873c8d"
|
648
|
+
dependencies = [
|
649
|
+
"zlib-rs",
|
650
|
+
]
|
651
|
+
|
655
652
|
[[package]]
|
656
653
|
name = "linux-raw-sys"
|
657
654
|
version = "0.4.15"
|
@@ -670,7 +667,7 @@ version = "0.11.3"
|
|
670
667
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
671
668
|
checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
|
672
669
|
dependencies = [
|
673
|
-
"twox-hash",
|
670
|
+
"twox-hash 1.6.3",
|
674
671
|
]
|
675
672
|
|
676
673
|
[[package]]
|
@@ -840,7 +837,7 @@ dependencies = [
|
|
840
837
|
"magnus",
|
841
838
|
"mimalloc",
|
842
839
|
"num",
|
843
|
-
"parquet
|
840
|
+
"parquet 55.1.0",
|
844
841
|
"rand",
|
845
842
|
"rb-sys",
|
846
843
|
"rb-sys-env 0.2.2",
|
@@ -851,9 +848,8 @@ dependencies = [
|
|
851
848
|
|
852
849
|
[[package]]
|
853
850
|
name = "parquet"
|
854
|
-
version = "
|
855
|
-
source = "
|
856
|
-
checksum = "761c44d824fe83106e0600d2510c07bf4159a4985bf0569b513ea4288dc1b4fb"
|
851
|
+
version = "55.1.0"
|
852
|
+
source = "git+https://github.com/njaremko/arrow-rs?branch=nathan%2Ffix-reading-int32-timestamp-records#f791b78a67cb5d9a0b4ec0fcab80780dcb61c346"
|
857
853
|
dependencies = [
|
858
854
|
"ahash",
|
859
855
|
"arrow-array",
|
@@ -879,9 +875,8 @@ dependencies = [
|
|
879
875
|
"simdutf8",
|
880
876
|
"snap",
|
881
877
|
"thrift",
|
882
|
-
"twox-hash",
|
878
|
+
"twox-hash 2.1.0",
|
883
879
|
"zstd",
|
884
|
-
"zstd-sys",
|
885
880
|
]
|
886
881
|
|
887
882
|
[[package]]
|
@@ -1055,7 +1050,7 @@ version = "0.38.44"
|
|
1055
1050
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1056
1051
|
checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154"
|
1057
1052
|
dependencies = [
|
1058
|
-
"bitflags
|
1053
|
+
"bitflags",
|
1059
1054
|
"errno",
|
1060
1055
|
"libc",
|
1061
1056
|
"linux-raw-sys",
|
@@ -1223,6 +1218,12 @@ dependencies = [
|
|
1223
1218
|
"static_assertions",
|
1224
1219
|
]
|
1225
1220
|
|
1221
|
+
[[package]]
|
1222
|
+
name = "twox-hash"
|
1223
|
+
version = "2.1.0"
|
1224
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1225
|
+
checksum = "e7b17f197b3050ba473acf9181f7b1d3b66d1cf7356c6cc57886662276e65908"
|
1226
|
+
|
1226
1227
|
[[package]]
|
1227
1228
|
name = "unicode-ident"
|
1228
1229
|
version = "1.0.17"
|
@@ -1402,7 +1403,7 @@ version = "0.33.0"
|
|
1402
1403
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1403
1404
|
checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c"
|
1404
1405
|
dependencies = [
|
1405
|
-
"bitflags
|
1406
|
+
"bitflags",
|
1406
1407
|
]
|
1407
1408
|
|
1408
1409
|
[[package]]
|
@@ -1446,6 +1447,12 @@ dependencies = [
|
|
1446
1447
|
"syn",
|
1447
1448
|
]
|
1448
1449
|
|
1450
|
+
[[package]]
|
1451
|
+
name = "zlib-rs"
|
1452
|
+
version = "0.4.2"
|
1453
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1454
|
+
checksum = "8b20717f0917c908dc63de2e44e97f1e6b126ca58d0e391cee86d504eb8fbd05"
|
1455
|
+
|
1449
1456
|
[[package]]
|
1450
1457
|
name = "zstd"
|
1451
1458
|
version = "0.13.3"
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -11,14 +11,14 @@ rb-sys-env = "^0.2"
|
|
11
11
|
|
12
12
|
[dependencies]
|
13
13
|
ahash = "0.8"
|
14
|
-
arrow-array = "
|
15
|
-
arrow-schema = "
|
14
|
+
arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records" }
|
15
|
+
arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records" }
|
16
16
|
bytes = "^1.9"
|
17
17
|
either = "1.9"
|
18
18
|
itertools = "^0.14"
|
19
19
|
jiff = "0.2"
|
20
20
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
21
|
-
parquet = {
|
21
|
+
parquet = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-reading-int32-timestamp-records", features = ["json"] }
|
22
22
|
rand = "0.9"
|
23
23
|
rb-sys = "^0.9"
|
24
24
|
simdutf8 = "0.1.5"
|
data/ext/parquet/src/reader/mod.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
mod common;
|
2
2
|
mod parquet_column_reader;
|
3
3
|
mod parquet_row_reader;
|
4
|
+
mod unified;
|
4
5
|
use std::{fs::File, rc::Rc};
|
5
6
|
|
6
7
|
use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
|
@@ -207,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
|
|
207
208
|
let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
|
208
209
|
|
209
210
|
Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
|
210
|
-
}
|
211
|
+
}
|
data/ext/parquet/src/reader/parquet_column_reader.rs
CHANGED
@@ -1,21 +1,9 @@
|
|
1
|
-
use crate::
|
2
|
-
use crate::
|
3
|
-
use crate::
|
4
|
-
|
5
|
-
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
|
6
|
-
ParserResultType,
|
7
|
-
};
|
8
|
-
use ahash::RandomState;
|
9
|
-
use either::Either;
|
10
|
-
use magnus::IntoValue;
|
1
|
+
use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
|
2
|
+
use crate::utils::*;
|
3
|
+
use crate::ParquetGemError;
|
4
|
+
|
11
5
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
|
-
use std::collections::HashMap;
|
13
6
|
use std::rc::Rc;
|
14
|
-
use std::sync::OnceLock;
|
15
|
-
|
16
|
-
use super::common::{
|
17
|
-
create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
|
18
|
-
};
|
19
7
|
|
20
8
|
#[inline]
|
21
9
|
pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
@@ -41,116 +29,16 @@ fn parse_parquet_columns_impl(
|
|
41
29
|
logger,
|
42
30
|
} = parse_parquet_columns_args(&ruby, args)?;
|
43
31
|
|
44
|
-
//
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
// Clone values for the closure to avoid move issues
|
51
|
-
let columns_clone = columns.clone();
|
52
|
-
|
53
|
-
// Handle block or create enumerator
|
54
|
-
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
55
|
-
create_column_enumerator(ColumnEnumeratorArgs {
|
56
|
-
rb_self,
|
32
|
+
// Use the unified parsing implementation
|
33
|
+
parse_parquet_unified(
|
34
|
+
ruby,
|
35
|
+
rb_self,
|
36
|
+
UnifiedParserArgs {
|
57
37
|
to_read,
|
58
38
|
result_type,
|
59
|
-
columns
|
60
|
-
batch_size,
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
})? {
|
66
|
-
return Ok(enum_value);
|
67
|
-
}
|
68
|
-
|
69
|
-
let source = open_parquet_source(ruby.clone(), to_read)?;
|
70
|
-
|
71
|
-
// Use the common function to create the batch reader
|
72
|
-
|
73
|
-
let (batch_reader, schema, num_rows) = match source {
|
74
|
-
Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
|
75
|
-
Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
|
76
|
-
};
|
77
|
-
|
78
|
-
match result_type {
|
79
|
-
ParserResultType::Hash => {
|
80
|
-
// For hash return type, we need to return a hash with column names pointing at empty arrays
|
81
|
-
if handle_empty_file(&ruby, &schema, num_rows)? {
|
82
|
-
return Ok(ruby.qnil().into_value_with(&ruby));
|
83
|
-
}
|
84
|
-
|
85
|
-
let headers = OnceLock::new();
|
86
|
-
let headers_clone = headers.clone();
|
87
|
-
let iter = batch_reader.map(move |batch| {
|
88
|
-
batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
|
89
|
-
let local_headers = headers_clone
|
90
|
-
.get_or_init(|| {
|
91
|
-
let schema = batch.schema();
|
92
|
-
let fields = schema.fields();
|
93
|
-
let mut header_string = Vec::with_capacity(fields.len());
|
94
|
-
for field in fields {
|
95
|
-
header_string.push(field.name().to_owned());
|
96
|
-
}
|
97
|
-
StringCache::intern_many(&header_string)
|
98
|
-
})
|
99
|
-
.as_ref()
|
100
|
-
.map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
|
101
|
-
|
102
|
-
let mut map = HashMap::with_capacity_and_hasher(
|
103
|
-
local_headers.len(),
|
104
|
-
RandomState::default(),
|
105
|
-
);
|
106
|
-
|
107
|
-
batch
|
108
|
-
.columns()
|
109
|
-
.iter()
|
110
|
-
.enumerate()
|
111
|
-
.try_for_each(|(i, column)| {
|
112
|
-
let header = local_headers[i];
|
113
|
-
let values = ParquetValueVec::try_from(ArrayWrapper {
|
114
|
-
array: column,
|
115
|
-
strict,
|
116
|
-
})?;
|
117
|
-
map.insert(header, values.into_inner());
|
118
|
-
Ok::<_, ParquetGemError>(())
|
119
|
-
})?;
|
120
|
-
|
121
|
-
Ok(ColumnRecord::Map::<RandomState>(map))
|
122
|
-
})
|
123
|
-
});
|
124
|
-
|
125
|
-
for result in iter {
|
126
|
-
let record = result?;
|
127
|
-
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
128
|
-
}
|
129
|
-
}
|
130
|
-
ParserResultType::Array => {
|
131
|
-
let iter = batch_reader.map(|batch| {
|
132
|
-
batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
|
133
|
-
let vec = batch
|
134
|
-
.columns()
|
135
|
-
.iter()
|
136
|
-
.map(|column| {
|
137
|
-
let values = ParquetValueVec::try_from(ArrayWrapper {
|
138
|
-
array: column,
|
139
|
-
strict,
|
140
|
-
})?;
|
141
|
-
Ok::<_, ParquetGemError>(values.into_inner())
|
142
|
-
})
|
143
|
-
.collect::<Result<Vec<_>, _>>()?;
|
144
|
-
Ok(ColumnRecord::Vec::<RandomState>(vec))
|
145
|
-
})
|
146
|
-
});
|
147
|
-
|
148
|
-
for result in iter {
|
149
|
-
let record = result?;
|
150
|
-
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
151
|
-
}
|
152
|
-
}
|
153
|
-
}
|
154
|
-
|
155
|
-
Ok(ruby.qnil().into_value_with(&ruby))
|
156
|
-
}
|
39
|
+
columns,
|
40
|
+
parser_type: ParserType::Column { batch_size, strict },
|
41
|
+
logger,
|
42
|
+
},
|
43
|
+
)
|
44
|
+
}
|
data/ext/parquet/src/reader/parquet_row_reader.rs
CHANGED
@@ -1,22 +1,9 @@
|
|
1
|
-
use crate::
|
2
|
-
use crate::
|
3
|
-
use crate::
|
4
|
-
|
5
|
-
create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
|
6
|
-
RowEnumeratorArgs, RowRecord,
|
7
|
-
};
|
8
|
-
use ahash::RandomState;
|
9
|
-
use either::Either;
|
10
|
-
use magnus::IntoValue;
|
1
|
+
use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
|
2
|
+
use crate::utils::*;
|
3
|
+
use crate::ParquetGemError;
|
4
|
+
|
11
5
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
|
-
use parquet::file::reader::{FileReader, SerializedFileReader};
|
13
|
-
use parquet::record::reader::RowIter as ParquetRowIter;
|
14
|
-
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
15
|
-
use std::collections::HashMap;
|
16
6
|
use std::rc::Rc;
|
17
|
-
use std::sync::OnceLock;
|
18
|
-
|
19
|
-
use super::common::{handle_block_or_enum, open_parquet_source};
|
20
7
|
|
21
8
|
#[inline]
|
22
9
|
pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
@@ -41,123 +28,16 @@ fn parse_parquet_rows_impl(
|
|
41
28
|
logger,
|
42
29
|
} = parse_parquet_rows_args(&ruby, args)?;
|
43
30
|
|
44
|
-
//
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
// Handle block or create enumerator
|
51
|
-
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
52
|
-
create_row_enumerator(RowEnumeratorArgs {
|
53
|
-
rb_self,
|
31
|
+
// Use the unified parsing implementation
|
32
|
+
parse_parquet_unified(
|
33
|
+
ruby,
|
34
|
+
rb_self,
|
35
|
+
UnifiedParserArgs {
|
54
36
|
to_read,
|
55
37
|
result_type,
|
56
|
-
columns
|
57
|
-
strict,
|
38
|
+
columns,
|
39
|
+
parser_type: ParserType::Row { strict },
|
58
40
|
logger,
|
59
|
-
}
|
60
|
-
|
61
|
-
|
62
|
-
return Ok(enum_value);
|
63
|
-
}
|
64
|
-
|
65
|
-
let source = open_parquet_source(ruby.clone(), to_read)?;
|
66
|
-
let reader: Box<dyn FileReader> = match source {
|
67
|
-
Either::Left(file) => {
|
68
|
-
Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
|
69
|
-
}
|
70
|
-
Either::Right(readable) => {
|
71
|
-
Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
|
72
|
-
}
|
73
|
-
};
|
74
|
-
|
75
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
76
|
-
ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
|
77
|
-
|
78
|
-
let mut iter = ParquetRowIter::from_file_into(reader);
|
79
|
-
if let Some(cols) = columns {
|
80
|
-
ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
|
81
|
-
let projection = create_projection_schema(&schema, &cols);
|
82
|
-
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
83
|
-
MagnusError::new(
|
84
|
-
ruby.exception_runtime_error(),
|
85
|
-
format!("Failed to create projection: {}", e),
|
86
|
-
)
|
87
|
-
})?;
|
88
|
-
}
|
89
|
-
|
90
|
-
match result_type {
|
91
|
-
ParserResultType::Hash => {
|
92
|
-
let headers = OnceLock::new();
|
93
|
-
let headers_clone = headers.clone();
|
94
|
-
let iter = iter.map(move |row| {
|
95
|
-
row.map(|row| {
|
96
|
-
let headers = headers_clone.get_or_init(|| {
|
97
|
-
let column_count = row.get_column_iter().count();
|
98
|
-
|
99
|
-
let mut header_string = Vec::with_capacity(column_count);
|
100
|
-
for (k, _) in row.get_column_iter() {
|
101
|
-
header_string.push(k.to_owned());
|
102
|
-
}
|
103
|
-
|
104
|
-
StringCache::intern_many(&header_string).expect("Failed to intern headers")
|
105
|
-
});
|
106
|
-
|
107
|
-
let mut map =
|
108
|
-
HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
|
109
|
-
for (i, (_, v)) in row.get_column_iter().enumerate() {
|
110
|
-
map.insert(headers[i], ParquetField(v.clone(), strict));
|
111
|
-
}
|
112
|
-
map
|
113
|
-
})
|
114
|
-
.map(RowRecord::Map::<RandomState>)
|
115
|
-
.map_err(ParquetGemError::from)
|
116
|
-
});
|
117
|
-
|
118
|
-
for result in iter {
|
119
|
-
let record = result?;
|
120
|
-
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
121
|
-
}
|
122
|
-
}
|
123
|
-
ParserResultType::Array => {
|
124
|
-
let iter = iter.map(|row| {
|
125
|
-
row.map(|row| {
|
126
|
-
let column_count = row.get_column_iter().count();
|
127
|
-
let mut vec = Vec::with_capacity(column_count);
|
128
|
-
for (_, v) in row.get_column_iter() {
|
129
|
-
vec.push(ParquetField(v.clone(), strict));
|
130
|
-
}
|
131
|
-
vec
|
132
|
-
})
|
133
|
-
.map(RowRecord::Vec::<RandomState>)
|
134
|
-
.map_err(ParquetGemError::from)
|
135
|
-
});
|
136
|
-
|
137
|
-
for result in iter {
|
138
|
-
let record = result?;
|
139
|
-
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
140
|
-
}
|
141
|
-
}
|
142
|
-
}
|
143
|
-
|
144
|
-
Ok(ruby.qnil().into_value_with(&ruby))
|
145
|
-
}
|
146
|
-
|
147
|
-
fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
|
148
|
-
if let SchemaType::GroupType { fields, .. } = schema {
|
149
|
-
let projected_fields: Vec<TypePtr> = fields
|
150
|
-
.iter()
|
151
|
-
.filter(|field| columns.contains(&field.name().to_string()))
|
152
|
-
.cloned()
|
153
|
-
.collect();
|
154
|
-
|
155
|
-
SchemaType::GroupType {
|
156
|
-
basic_info: schema.get_basic_info().clone(),
|
157
|
-
fields: projected_fields,
|
158
|
-
}
|
159
|
-
} else {
|
160
|
-
// Return original schema if not a group type
|
161
|
-
schema.clone()
|
162
|
-
}
|
163
|
-
}
|
41
|
+
},
|
42
|
+
)
|
43
|
+
}
|