parquet 0.0.4 → 0.2.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
|
4
|
+
data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
|
7
|
+
data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
|
data/Cargo.lock
CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
|
|
387
387
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
388
388
|
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
389
389
|
|
390
|
+
[[package]]
|
391
|
+
name = "errno"
|
392
|
+
version = "0.3.10"
|
393
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
394
|
+
checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
|
395
|
+
dependencies = [
|
396
|
+
"libc",
|
397
|
+
"windows-sys",
|
398
|
+
]
|
399
|
+
|
400
|
+
[[package]]
|
401
|
+
name = "fastrand"
|
402
|
+
version = "2.3.0"
|
403
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
404
|
+
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
405
|
+
|
390
406
|
[[package]]
|
391
407
|
name = "flatbuffers"
|
392
408
|
version = "24.12.23"
|
@@ -826,16 +842,6 @@ dependencies = [
|
|
826
842
|
"wasm-bindgen",
|
827
843
|
]
|
828
844
|
|
829
|
-
[[package]]
|
830
|
-
name = "kanal"
|
831
|
-
version = "0.1.0-pre8"
|
832
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
833
|
-
checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
|
834
|
-
dependencies = [
|
835
|
-
"futures-core",
|
836
|
-
"lock_api",
|
837
|
-
]
|
838
|
-
|
839
845
|
[[package]]
|
840
846
|
name = "lazy_static"
|
841
847
|
version = "1.5.0"
|
@@ -944,6 +950,12 @@ dependencies = [
|
|
944
950
|
"libc",
|
945
951
|
]
|
946
952
|
|
953
|
+
[[package]]
|
954
|
+
name = "linux-raw-sys"
|
955
|
+
version = "0.4.15"
|
956
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
957
|
+
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
|
958
|
+
|
947
959
|
[[package]]
|
948
960
|
name = "litemap"
|
949
961
|
version = "0.7.4"
|
@@ -975,18 +987,6 @@ dependencies = [
|
|
975
987
|
"twox-hash",
|
976
988
|
]
|
977
989
|
|
978
|
-
[[package]]
|
979
|
-
name = "magnus"
|
980
|
-
version = "0.6.4"
|
981
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
982
|
-
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
983
|
-
dependencies = [
|
984
|
-
"magnus-macros",
|
985
|
-
"rb-sys",
|
986
|
-
"rb-sys-env",
|
987
|
-
"seq-macro",
|
988
|
-
]
|
989
|
-
|
990
990
|
[[package]]
|
991
991
|
name = "magnus"
|
992
992
|
version = "0.7.1"
|
@@ -1203,13 +1203,11 @@ dependencies = [
|
|
1203
1203
|
"itertools 0.14.0",
|
1204
1204
|
"jemallocator",
|
1205
1205
|
"jiff",
|
1206
|
-
"
|
1207
|
-
"magnus 0.7.1",
|
1206
|
+
"magnus",
|
1208
1207
|
"mimalloc",
|
1209
1208
|
"parquet 54.0.0",
|
1210
1209
|
"rb-sys",
|
1211
|
-
"
|
1212
|
-
"serde_magnus",
|
1210
|
+
"tempfile",
|
1213
1211
|
"thiserror",
|
1214
1212
|
]
|
1215
1213
|
|
@@ -1402,6 +1400,19 @@ dependencies = [
|
|
1402
1400
|
"semver",
|
1403
1401
|
]
|
1404
1402
|
|
1403
|
+
[[package]]
|
1404
|
+
name = "rustix"
|
1405
|
+
version = "0.38.43"
|
1406
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1407
|
+
checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
|
1408
|
+
dependencies = [
|
1409
|
+
"bitflags 2.6.0",
|
1410
|
+
"errno",
|
1411
|
+
"libc",
|
1412
|
+
"linux-raw-sys",
|
1413
|
+
"windows-sys",
|
1414
|
+
]
|
1415
|
+
|
1405
1416
|
[[package]]
|
1406
1417
|
name = "ryu"
|
1407
1418
|
version = "1.0.18"
|
@@ -1467,17 +1478,6 @@ dependencies = [
|
|
1467
1478
|
"serde",
|
1468
1479
|
]
|
1469
1480
|
|
1470
|
-
[[package]]
|
1471
|
-
name = "serde_magnus"
|
1472
|
-
version = "0.8.1"
|
1473
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1474
|
-
checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
|
1475
|
-
dependencies = [
|
1476
|
-
"magnus 0.6.4",
|
1477
|
-
"serde",
|
1478
|
-
"tap",
|
1479
|
-
]
|
1480
|
-
|
1481
1481
|
[[package]]
|
1482
1482
|
name = "shell-words"
|
1483
1483
|
version = "1.1.0"
|
@@ -1567,10 +1567,18 @@ dependencies = [
|
|
1567
1567
|
]
|
1568
1568
|
|
1569
1569
|
[[package]]
|
1570
|
-
name = "
|
1571
|
-
version = "
|
1570
|
+
name = "tempfile"
|
1571
|
+
version = "3.15.0"
|
1572
1572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1573
|
-
checksum = "
|
1573
|
+
checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
|
1574
|
+
dependencies = [
|
1575
|
+
"cfg-if",
|
1576
|
+
"fastrand",
|
1577
|
+
"getrandom",
|
1578
|
+
"once_cell",
|
1579
|
+
"rustix",
|
1580
|
+
"windows-sys",
|
1581
|
+
]
|
1574
1582
|
|
1575
1583
|
[[package]]
|
1576
1584
|
name = "thiserror"
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -4,8 +4,6 @@
|
|
4
4
|
|
5
5
|
This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
|
6
6
|
|
7
|
-
At the moment, it only supports iterating rows as either a hash or an array.
|
8
|
-
|
9
7
|
## Usage
|
10
8
|
|
11
9
|
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
|
|
83
81
|
- `batch_size`: Number of rows per batch (defaults to implementation-defined value)
|
84
82
|
|
85
83
|
When no block is given, both methods return an Enumerator.
|
84
|
+
|
85
|
+
### Writing Row-wise Data
|
86
|
+
|
87
|
+
The `write_rows` method allows you to write data row by row:
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
require "parquet"
|
91
|
+
|
92
|
+
# Define the schema for your data
|
93
|
+
schema = [
|
94
|
+
{ "id" => "int64" },
|
95
|
+
{ "name" => "string" },
|
96
|
+
{ "score" => "double" }
|
97
|
+
]
|
98
|
+
|
99
|
+
# Create an enumerator that yields arrays of row values
|
100
|
+
rows = [
|
101
|
+
[1, "Alice", 95.5],
|
102
|
+
[2, "Bob", 82.3],
|
103
|
+
[3, "Charlie", 88.7]
|
104
|
+
].each
|
105
|
+
|
106
|
+
# Write to a file
|
107
|
+
Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
|
108
|
+
|
109
|
+
# Write to an IO object
|
110
|
+
File.open("data.parquet", "wb") do |file|
|
111
|
+
Parquet.write_rows(rows, schema: schema, write_to: file)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Optionally specify batch size (default is 1000)
|
115
|
+
Parquet.write_rows(rows,
|
116
|
+
schema: schema,
|
117
|
+
write_to: "data.parquet",
|
118
|
+
batch_size: 500
|
119
|
+
)
|
120
|
+
```
|
121
|
+
|
122
|
+
### Writing Column-wise Data
|
123
|
+
|
124
|
+
The `write_columns` method provides a more efficient way to write data in column-oriented batches:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
require "parquet"
|
128
|
+
|
129
|
+
# Define the schema
|
130
|
+
schema = [
|
131
|
+
{ "id" => "int64" },
|
132
|
+
{ "name" => "string" },
|
133
|
+
{ "score" => "double" }
|
134
|
+
]
|
135
|
+
|
136
|
+
# Create batches of column data
|
137
|
+
batches = [
|
138
|
+
# First batch
|
139
|
+
[
|
140
|
+
[1, 2], # id column
|
141
|
+
["Alice", "Bob"], # name column
|
142
|
+
[95.5, 82.3] # score column
|
143
|
+
],
|
144
|
+
# Second batch
|
145
|
+
[
|
146
|
+
[3], # id column
|
147
|
+
["Charlie"], # name column
|
148
|
+
[88.7] # score column
|
149
|
+
]
|
150
|
+
]
|
151
|
+
|
152
|
+
# Create an enumerator from the batches
|
153
|
+
columns = batches.each
|
154
|
+
|
155
|
+
# Write to a parquet file
|
156
|
+
Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
|
157
|
+
|
158
|
+
# Write to an IO object
|
159
|
+
File.open("data.parquet", "wb") do |file|
|
160
|
+
Parquet.write_columns(columns, schema: schema, write_to: file)
|
161
|
+
end
|
162
|
+
```
|
163
|
+
|
164
|
+
The following data types are supported in the schema:
|
165
|
+
|
166
|
+
- `int8`, `int16`, `int32`, `int64`
|
167
|
+
- `uint8`, `uint16`, `uint32`, `uint64`
|
168
|
+
- `float`, `double`
|
169
|
+
- `string`
|
170
|
+
- `binary`
|
171
|
+
- `boolean`
|
172
|
+
- `date32`
|
173
|
+
- `timestamp_millis`, `timestamp_micros`
|
174
|
+
|
175
|
+
Note: List and Map types are currently not supported.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -8,19 +8,16 @@ crate-type = ["cdylib"]
|
|
8
8
|
|
9
9
|
[dependencies]
|
10
10
|
ahash = "0.8"
|
11
|
-
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
12
|
-
arrow-schema = "54.0.0"
|
13
11
|
arrow-array = "54.0.0"
|
12
|
+
arrow-schema = "54.0.0"
|
14
13
|
bytes = "^1.9"
|
15
|
-
|
14
|
+
itertools = "^0.14"
|
15
|
+
jiff = "0.1.19"
|
16
16
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
17
|
+
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
17
18
|
rb-sys = "^0.9"
|
18
|
-
serde = { version = "1.0", features = ["derive"] }
|
19
|
-
serde_magnus = "0.8.1"
|
20
19
|
thiserror = "2.0"
|
21
|
-
|
22
|
-
jiff = "0.1.19"
|
23
|
-
|
20
|
+
tempfile = "^3.15"
|
24
21
|
|
25
22
|
[target.'cfg(target_os = "linux")'.dependencies]
|
26
23
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
@@ -3,12 +3,12 @@ use magnus::{
|
|
3
3
|
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
4
|
};
|
5
5
|
|
6
|
-
use crate::{ColumnRecord, RowRecord};
|
6
|
+
use crate::{ColumnRecord, ParserResultType, RowRecord};
|
7
7
|
|
8
8
|
pub struct RowEnumeratorArgs {
|
9
9
|
pub rb_self: Value,
|
10
10
|
pub to_read: Value,
|
11
|
-
pub result_type:
|
11
|
+
pub result_type: ParserResultType,
|
12
12
|
pub columns: Option<Vec<String>>,
|
13
13
|
}
|
14
14
|
|
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
|
|
17
17
|
args: RowEnumeratorArgs,
|
18
18
|
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
19
19
|
let kwargs = RHash::new();
|
20
|
-
kwargs.aset(
|
20
|
+
kwargs.aset(
|
21
|
+
Symbol::new("result_type"),
|
22
|
+
Symbol::new(args.result_type.to_string()),
|
23
|
+
)?;
|
21
24
|
if let Some(columns) = args.columns {
|
22
25
|
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
23
26
|
}
|
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
|
|
30
33
|
pub struct ColumnEnumeratorArgs {
|
31
34
|
pub rb_self: Value,
|
32
35
|
pub to_read: Value,
|
33
|
-
pub result_type:
|
36
|
+
pub result_type: ParserResultType,
|
34
37
|
pub columns: Option<Vec<String>>,
|
35
38
|
pub batch_size: Option<usize>,
|
36
39
|
}
|
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
|
|
40
43
|
args: ColumnEnumeratorArgs,
|
41
44
|
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
42
45
|
let kwargs = RHash::new();
|
43
|
-
kwargs.aset(
|
46
|
+
kwargs.aset(
|
47
|
+
Symbol::new("result_type"),
|
48
|
+
Symbol::new(args.result_type.to_string()),
|
49
|
+
)?;
|
44
50
|
if let Some(columns) = args.columns {
|
45
51
|
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
46
52
|
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -6,6 +6,7 @@ mod ruby_integration;
|
|
6
6
|
mod ruby_reader;
|
7
7
|
mod types;
|
8
8
|
mod utils;
|
9
|
+
mod writer;
|
9
10
|
|
10
11
|
use crate::enumerator::*;
|
11
12
|
use crate::reader::*;
|
@@ -13,6 +14,8 @@ use crate::ruby_integration::*;
|
|
13
14
|
use crate::types::*;
|
14
15
|
|
15
16
|
use magnus::{Error, Ruby};
|
17
|
+
use writer::write_columns;
|
18
|
+
use writer::write_rows;
|
16
19
|
|
17
20
|
/// Initializes the Ruby extension and defines methods.
|
18
21
|
#[magnus::init]
|
@@ -20,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
20
23
|
let module = ruby.define_module("Parquet")?;
|
21
24
|
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
22
25
|
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
26
|
+
module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
|
27
|
+
module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
|
23
28
|
Ok(())
|
24
29
|
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
mod parquet_column_reader;
|
2
|
+
mod parquet_row_reader;
|
3
|
+
|
4
|
+
use std::io;
|
5
|
+
|
6
|
+
use magnus::{Error as MagnusError, Ruby};
|
7
|
+
use thiserror::Error;
|
8
|
+
|
9
|
+
use crate::header_cache::CacheError;
|
10
|
+
pub use parquet_column_reader::parse_parquet_columns;
|
11
|
+
pub use parquet_row_reader::parse_parquet_rows;
|
12
|
+
|
13
|
+
#[derive(Error, Debug)]
|
14
|
+
pub enum ReaderError {
|
15
|
+
#[error("Failed to get file descriptor: {0}")]
|
16
|
+
FileDescriptor(String),
|
17
|
+
#[error("Invalid file descriptor")]
|
18
|
+
InvalidFileDescriptor,
|
19
|
+
#[error("Failed to open file: {0}")]
|
20
|
+
FileOpen(#[from] io::Error),
|
21
|
+
#[error("Failed to intern headers: {0}")]
|
22
|
+
HeaderIntern(#[from] CacheError),
|
23
|
+
#[error("Ruby error: {0}")]
|
24
|
+
Ruby(String),
|
25
|
+
#[error("Parquet error: {0}")]
|
26
|
+
Parquet(#[from] parquet::errors::ParquetError),
|
27
|
+
}
|
28
|
+
|
29
|
+
impl From<MagnusError> for ReaderError {
|
30
|
+
fn from(err: MagnusError) -> Self {
|
31
|
+
Self::Ruby(err.to_string())
|
32
|
+
}
|
33
|
+
}
|
34
|
+
|
35
|
+
impl From<ReaderError> for MagnusError {
|
36
|
+
fn from(err: ReaderError) -> Self {
|
37
|
+
MagnusError::new(
|
38
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
39
|
+
err.to_string(),
|
40
|
+
)
|
41
|
+
}
|
42
|
+
}
|
@@ -1,11 +1,7 @@
|
|
1
|
-
// =============================================================================
|
2
|
-
// Imports and Dependencies
|
3
|
-
// =============================================================================
|
4
1
|
use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
|
5
2
|
use crate::{
|
6
|
-
create_column_enumerator,
|
7
|
-
|
8
|
-
SeekableRubyValue,
|
3
|
+
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
|
4
|
+
ParquetValueVec, ParserResultType, SeekableRubyValue,
|
9
5
|
};
|
10
6
|
use ahash::RandomState;
|
11
7
|
use magnus::rb_sys::AsRawValue;
|
@@ -14,149 +10,20 @@ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
|
|
14
10
|
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
15
11
|
use parquet::arrow::ProjectionMask;
|
16
12
|
use parquet::errors::ParquetError;
|
17
|
-
use parquet::file::reader::FileReader;
|
18
|
-
use parquet::file::reader::SerializedFileReader;
|
19
|
-
use parquet::record::reader::RowIter as ParquetRowIter;
|
20
|
-
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
21
13
|
use std::collections::HashMap;
|
22
14
|
use std::fs::File;
|
23
|
-
use std::io
|
15
|
+
use std::io;
|
24
16
|
use std::mem::ManuallyDrop;
|
25
17
|
use std::os::fd::FromRawFd;
|
26
18
|
use std::sync::OnceLock;
|
27
19
|
use thiserror::Error;
|
28
20
|
|
29
|
-
#[inline]
|
30
|
-
pub fn parse_parquet_rows<'a>(
|
31
|
-
rb_self: Value,
|
32
|
-
args: &[Value],
|
33
|
-
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
34
|
-
let original = unsafe { Ruby::get_unchecked() };
|
35
|
-
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
36
|
-
|
37
|
-
let ParquetRowsArgs {
|
38
|
-
to_read,
|
39
|
-
result_type,
|
40
|
-
columns,
|
41
|
-
} = parse_parquet_rows_args(&ruby, args)?;
|
42
|
-
|
43
|
-
if !ruby.block_given() {
|
44
|
-
return create_row_enumerator(RowEnumeratorArgs {
|
45
|
-
rb_self,
|
46
|
-
to_read,
|
47
|
-
result_type,
|
48
|
-
columns,
|
49
|
-
});
|
50
|
-
}
|
51
|
-
|
52
|
-
let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
|
53
|
-
let path_string = to_read.to_r_string()?;
|
54
|
-
let file_path = unsafe { path_string.as_str()? };
|
55
|
-
let file = File::open(file_path).unwrap();
|
56
|
-
let reader = SerializedFileReader::new(file).unwrap();
|
57
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
58
|
-
|
59
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
60
|
-
} else if to_read.is_kind_of(ruby.class_io()) {
|
61
|
-
let raw_value = to_read.as_raw();
|
62
|
-
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
63
|
-
.map_err(|_| {
|
64
|
-
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
65
|
-
})?;
|
66
|
-
|
67
|
-
if fd < 0 {
|
68
|
-
return Err(ReaderError::InvalidFileDescriptor.into());
|
69
|
-
}
|
70
|
-
|
71
|
-
let file = unsafe { File::from_raw_fd(fd) };
|
72
|
-
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
73
|
-
let reader = SerializedFileReader::new(file).unwrap();
|
74
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
75
|
-
|
76
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
77
|
-
} else {
|
78
|
-
let readable = SeekableRubyValue(Opaque::from(to_read));
|
79
|
-
let reader = SerializedFileReader::new(readable).unwrap();
|
80
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
81
|
-
|
82
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
83
|
-
};
|
84
|
-
|
85
|
-
if let Some(cols) = columns {
|
86
|
-
let projection = create_projection_schema(&schema, &cols);
|
87
|
-
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
88
|
-
MagnusError::new(
|
89
|
-
ruby.exception_runtime_error(),
|
90
|
-
format!("Failed to create projection: {}", e),
|
91
|
-
)
|
92
|
-
})?;
|
93
|
-
}
|
94
|
-
|
95
|
-
let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
|
96
|
-
"hash" => {
|
97
|
-
let headers = OnceLock::new();
|
98
|
-
let headers_clone = headers.clone();
|
99
|
-
let iter = iter
|
100
|
-
.filter_map(move |row| {
|
101
|
-
row.ok().map(|row| {
|
102
|
-
let headers = headers_clone.get_or_init(|| {
|
103
|
-
let column_count = row.get_column_iter().count();
|
104
|
-
|
105
|
-
let mut header_string = Vec::with_capacity(column_count);
|
106
|
-
for (k, _) in row.get_column_iter() {
|
107
|
-
header_string.push(k.to_owned());
|
108
|
-
}
|
109
|
-
|
110
|
-
let headers = StringCache::intern_many(&header_string).unwrap();
|
111
|
-
|
112
|
-
headers
|
113
|
-
});
|
114
|
-
|
115
|
-
let mut map =
|
116
|
-
HashMap::with_capacity_and_hasher(headers.len(), Default::default());
|
117
|
-
row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
|
118
|
-
map.insert(headers[i], ParquetField(v.clone()));
|
119
|
-
});
|
120
|
-
map
|
121
|
-
})
|
122
|
-
})
|
123
|
-
.map(RowRecord::Map);
|
124
|
-
|
125
|
-
Box::new(HeaderCacheCleanupIter {
|
126
|
-
inner: iter,
|
127
|
-
headers,
|
128
|
-
})
|
129
|
-
}
|
130
|
-
"array" => Box::new(
|
131
|
-
iter.filter_map(|row| {
|
132
|
-
row.ok().map(|row| {
|
133
|
-
let column_count = row.get_column_iter().count();
|
134
|
-
let mut vec = Vec::with_capacity(column_count);
|
135
|
-
row.get_column_iter()
|
136
|
-
.for_each(|(_, v)| vec.push(ParquetField(v.clone())));
|
137
|
-
vec
|
138
|
-
})
|
139
|
-
})
|
140
|
-
.map(RowRecord::Vec),
|
141
|
-
),
|
142
|
-
_ => {
|
143
|
-
return Err(MagnusError::new(
|
144
|
-
ruby.exception_runtime_error(),
|
145
|
-
"Invalid result type",
|
146
|
-
))
|
147
|
-
}
|
148
|
-
};
|
149
|
-
|
150
|
-
Ok(Yield::Iter(iter))
|
151
|
-
}
|
152
|
-
|
153
21
|
#[inline]
|
154
22
|
pub fn parse_parquet_columns<'a>(
|
155
23
|
rb_self: Value,
|
156
24
|
args: &[Value],
|
157
25
|
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
158
|
-
let
|
159
|
-
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
26
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
160
27
|
|
161
28
|
let ParquetColumnsArgs {
|
162
29
|
to_read,
|
@@ -282,8 +149,8 @@ pub fn parse_parquet_columns<'a>(
|
|
282
149
|
return Ok(Yield::Iter(Box::new(column_record.into_iter())));
|
283
150
|
}
|
284
151
|
|
285
|
-
let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type
|
286
|
-
|
152
|
+
let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
|
153
|
+
ParserResultType::Hash => {
|
287
154
|
let headers = OnceLock::new();
|
288
155
|
let headers_clone = headers.clone();
|
289
156
|
let iter = batch_reader
|
@@ -318,7 +185,7 @@ pub fn parse_parquet_columns<'a>(
|
|
318
185
|
headers,
|
319
186
|
})
|
320
187
|
}
|
321
|
-
|
188
|
+
ParserResultType::Array => Box::new(
|
322
189
|
batch_reader
|
323
190
|
.filter_map(|batch| {
|
324
191
|
batch.ok().map(|batch| {
|
@@ -334,35 +201,11 @@ pub fn parse_parquet_columns<'a>(
|
|
334
201
|
})
|
335
202
|
.map(ColumnRecord::Vec),
|
336
203
|
),
|
337
|
-
_ => {
|
338
|
-
return Err(MagnusError::new(
|
339
|
-
ruby.exception_runtime_error(),
|
340
|
-
"Invalid result type",
|
341
|
-
))
|
342
|
-
}
|
343
204
|
};
|
344
205
|
|
345
206
|
Ok(Yield::Iter(iter))
|
346
207
|
}
|
347
208
|
|
348
|
-
fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
|
349
|
-
if let SchemaType::GroupType { fields, .. } = schema {
|
350
|
-
let projected_fields: Vec<TypePtr> = fields
|
351
|
-
.iter()
|
352
|
-
.filter(|field| columns.contains(&field.name().to_string()))
|
353
|
-
.cloned()
|
354
|
-
.collect();
|
355
|
-
|
356
|
-
SchemaType::GroupType {
|
357
|
-
basic_info: schema.get_basic_info().clone(),
|
358
|
-
fields: projected_fields,
|
359
|
-
}
|
360
|
-
} else {
|
361
|
-
// Return original schema if not a group type
|
362
|
-
schema.clone()
|
363
|
-
}
|
364
|
-
}
|
365
|
-
|
366
209
|
#[derive(Error, Debug)]
|
367
210
|
pub enum ReaderError {
|
368
211
|
#[error("Failed to get file descriptor: {0}")]
|