parquet 0.0.4 → 0.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
|
4
|
+
data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
|
7
|
+
data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
|
data/Cargo.lock
CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
|
|
387
387
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
388
388
|
checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
|
389
389
|
|
390
|
+
[[package]]
|
391
|
+
name = "errno"
|
392
|
+
version = "0.3.10"
|
393
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
394
|
+
checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
|
395
|
+
dependencies = [
|
396
|
+
"libc",
|
397
|
+
"windows-sys",
|
398
|
+
]
|
399
|
+
|
400
|
+
[[package]]
|
401
|
+
name = "fastrand"
|
402
|
+
version = "2.3.0"
|
403
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
404
|
+
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
405
|
+
|
390
406
|
[[package]]
|
391
407
|
name = "flatbuffers"
|
392
408
|
version = "24.12.23"
|
@@ -826,16 +842,6 @@ dependencies = [
|
|
826
842
|
"wasm-bindgen",
|
827
843
|
]
|
828
844
|
|
829
|
-
[[package]]
|
830
|
-
name = "kanal"
|
831
|
-
version = "0.1.0-pre8"
|
832
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
833
|
-
checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
|
834
|
-
dependencies = [
|
835
|
-
"futures-core",
|
836
|
-
"lock_api",
|
837
|
-
]
|
838
|
-
|
839
845
|
[[package]]
|
840
846
|
name = "lazy_static"
|
841
847
|
version = "1.5.0"
|
@@ -944,6 +950,12 @@ dependencies = [
|
|
944
950
|
"libc",
|
945
951
|
]
|
946
952
|
|
953
|
+
[[package]]
|
954
|
+
name = "linux-raw-sys"
|
955
|
+
version = "0.4.15"
|
956
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
957
|
+
checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
|
958
|
+
|
947
959
|
[[package]]
|
948
960
|
name = "litemap"
|
949
961
|
version = "0.7.4"
|
@@ -975,18 +987,6 @@ dependencies = [
|
|
975
987
|
"twox-hash",
|
976
988
|
]
|
977
989
|
|
978
|
-
[[package]]
|
979
|
-
name = "magnus"
|
980
|
-
version = "0.6.4"
|
981
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
982
|
-
checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
|
983
|
-
dependencies = [
|
984
|
-
"magnus-macros",
|
985
|
-
"rb-sys",
|
986
|
-
"rb-sys-env",
|
987
|
-
"seq-macro",
|
988
|
-
]
|
989
|
-
|
990
990
|
[[package]]
|
991
991
|
name = "magnus"
|
992
992
|
version = "0.7.1"
|
@@ -1203,13 +1203,11 @@ dependencies = [
|
|
1203
1203
|
"itertools 0.14.0",
|
1204
1204
|
"jemallocator",
|
1205
1205
|
"jiff",
|
1206
|
-
"
|
1207
|
-
"magnus 0.7.1",
|
1206
|
+
"magnus",
|
1208
1207
|
"mimalloc",
|
1209
1208
|
"parquet 54.0.0",
|
1210
1209
|
"rb-sys",
|
1211
|
-
"
|
1212
|
-
"serde_magnus",
|
1210
|
+
"tempfile",
|
1213
1211
|
"thiserror",
|
1214
1212
|
]
|
1215
1213
|
|
@@ -1402,6 +1400,19 @@ dependencies = [
|
|
1402
1400
|
"semver",
|
1403
1401
|
]
|
1404
1402
|
|
1403
|
+
[[package]]
|
1404
|
+
name = "rustix"
|
1405
|
+
version = "0.38.43"
|
1406
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1407
|
+
checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
|
1408
|
+
dependencies = [
|
1409
|
+
"bitflags 2.6.0",
|
1410
|
+
"errno",
|
1411
|
+
"libc",
|
1412
|
+
"linux-raw-sys",
|
1413
|
+
"windows-sys",
|
1414
|
+
]
|
1415
|
+
|
1405
1416
|
[[package]]
|
1406
1417
|
name = "ryu"
|
1407
1418
|
version = "1.0.18"
|
@@ -1467,17 +1478,6 @@ dependencies = [
|
|
1467
1478
|
"serde",
|
1468
1479
|
]
|
1469
1480
|
|
1470
|
-
[[package]]
|
1471
|
-
name = "serde_magnus"
|
1472
|
-
version = "0.8.1"
|
1473
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1474
|
-
checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
|
1475
|
-
dependencies = [
|
1476
|
-
"magnus 0.6.4",
|
1477
|
-
"serde",
|
1478
|
-
"tap",
|
1479
|
-
]
|
1480
|
-
|
1481
1481
|
[[package]]
|
1482
1482
|
name = "shell-words"
|
1483
1483
|
version = "1.1.0"
|
@@ -1567,10 +1567,18 @@ dependencies = [
|
|
1567
1567
|
]
|
1568
1568
|
|
1569
1569
|
[[package]]
|
1570
|
-
name = "
|
1571
|
-
version = "
|
1570
|
+
name = "tempfile"
|
1571
|
+
version = "3.15.0"
|
1572
1572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1573
|
-
checksum = "
|
1573
|
+
checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
|
1574
|
+
dependencies = [
|
1575
|
+
"cfg-if",
|
1576
|
+
"fastrand",
|
1577
|
+
"getrandom",
|
1578
|
+
"once_cell",
|
1579
|
+
"rustix",
|
1580
|
+
"windows-sys",
|
1581
|
+
]
|
1574
1582
|
|
1575
1583
|
[[package]]
|
1576
1584
|
name = "thiserror"
|
data/Gemfile
CHANGED
data/README.md
CHANGED
@@ -4,8 +4,6 @@
|
|
4
4
|
|
5
5
|
This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
|
6
6
|
|
7
|
-
At the moment, it only supports iterating rows as either a hash or an array.
|
8
|
-
|
9
7
|
## Usage
|
10
8
|
|
11
9
|
This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
|
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
|
|
83
81
|
- `batch_size`: Number of rows per batch (defaults to implementation-defined value)
|
84
82
|
|
85
83
|
When no block is given, both methods return an Enumerator.
|
84
|
+
|
85
|
+
### Writing Row-wise Data
|
86
|
+
|
87
|
+
The `write_rows` method allows you to write data row by row:
|
88
|
+
|
89
|
+
```ruby
|
90
|
+
require "parquet"
|
91
|
+
|
92
|
+
# Define the schema for your data
|
93
|
+
schema = [
|
94
|
+
{ "id" => "int64" },
|
95
|
+
{ "name" => "string" },
|
96
|
+
{ "score" => "double" }
|
97
|
+
]
|
98
|
+
|
99
|
+
# Create an enumerator that yields arrays of row values
|
100
|
+
rows = [
|
101
|
+
[1, "Alice", 95.5],
|
102
|
+
[2, "Bob", 82.3],
|
103
|
+
[3, "Charlie", 88.7]
|
104
|
+
].each
|
105
|
+
|
106
|
+
# Write to a file
|
107
|
+
Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
|
108
|
+
|
109
|
+
# Write to an IO object
|
110
|
+
File.open("data.parquet", "wb") do |file|
|
111
|
+
Parquet.write_rows(rows, schema: schema, write_to: file)
|
112
|
+
end
|
113
|
+
|
114
|
+
# Optionally specify batch size (default is 1000)
|
115
|
+
Parquet.write_rows(rows,
|
116
|
+
schema: schema,
|
117
|
+
write_to: "data.parquet",
|
118
|
+
batch_size: 500
|
119
|
+
)
|
120
|
+
```
|
121
|
+
|
122
|
+
### Writing Column-wise Data
|
123
|
+
|
124
|
+
The `write_columns` method provides a more efficient way to write data in column-oriented batches:
|
125
|
+
|
126
|
+
```ruby
|
127
|
+
require "parquet"
|
128
|
+
|
129
|
+
# Define the schema
|
130
|
+
schema = [
|
131
|
+
{ "id" => "int64" },
|
132
|
+
{ "name" => "string" },
|
133
|
+
{ "score" => "double" }
|
134
|
+
]
|
135
|
+
|
136
|
+
# Create batches of column data
|
137
|
+
batches = [
|
138
|
+
# First batch
|
139
|
+
[
|
140
|
+
[1, 2], # id column
|
141
|
+
["Alice", "Bob"], # name column
|
142
|
+
[95.5, 82.3] # score column
|
143
|
+
],
|
144
|
+
# Second batch
|
145
|
+
[
|
146
|
+
[3], # id column
|
147
|
+
["Charlie"], # name column
|
148
|
+
[88.7] # score column
|
149
|
+
]
|
150
|
+
]
|
151
|
+
|
152
|
+
# Create an enumerator from the batches
|
153
|
+
columns = batches.each
|
154
|
+
|
155
|
+
# Write to a parquet file
|
156
|
+
Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
|
157
|
+
|
158
|
+
# Write to an IO object
|
159
|
+
File.open("data.parquet", "wb") do |file|
|
160
|
+
Parquet.write_columns(columns, schema: schema, write_to: file)
|
161
|
+
end
|
162
|
+
```
|
163
|
+
|
164
|
+
The following data types are supported in the schema:
|
165
|
+
|
166
|
+
- `int8`, `int16`, `int32`, `int64`
|
167
|
+
- `uint8`, `uint16`, `uint32`, `uint64`
|
168
|
+
- `float`, `double`
|
169
|
+
- `string`
|
170
|
+
- `binary`
|
171
|
+
- `boolean`
|
172
|
+
- `date32`
|
173
|
+
- `timestamp_millis`, `timestamp_micros`
|
174
|
+
|
175
|
+
Note: List and Map types are currently not supported.
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -8,19 +8,16 @@ crate-type = ["cdylib"]
|
|
8
8
|
|
9
9
|
[dependencies]
|
10
10
|
ahash = "0.8"
|
11
|
-
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
12
|
-
arrow-schema = "54.0.0"
|
13
11
|
arrow-array = "54.0.0"
|
12
|
+
arrow-schema = "54.0.0"
|
14
13
|
bytes = "^1.9"
|
15
|
-
|
14
|
+
itertools = "^0.14"
|
15
|
+
jiff = "0.1.19"
|
16
16
|
magnus = { version = "0.7", features = ["rb-sys"] }
|
17
|
+
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
17
18
|
rb-sys = "^0.9"
|
18
|
-
serde = { version = "1.0", features = ["derive"] }
|
19
|
-
serde_magnus = "0.8.1"
|
20
19
|
thiserror = "2.0"
|
21
|
-
|
22
|
-
jiff = "0.1.19"
|
23
|
-
|
20
|
+
tempfile = "^3.15"
|
24
21
|
|
25
22
|
[target.'cfg(target_os = "linux")'.dependencies]
|
26
23
|
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
@@ -3,12 +3,12 @@ use magnus::{
|
|
3
3
|
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
4
|
};
|
5
5
|
|
6
|
-
use crate::{ColumnRecord, RowRecord};
|
6
|
+
use crate::{ColumnRecord, ParserResultType, RowRecord};
|
7
7
|
|
8
8
|
pub struct RowEnumeratorArgs {
|
9
9
|
pub rb_self: Value,
|
10
10
|
pub to_read: Value,
|
11
|
-
pub result_type:
|
11
|
+
pub result_type: ParserResultType,
|
12
12
|
pub columns: Option<Vec<String>>,
|
13
13
|
}
|
14
14
|
|
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
|
|
17
17
|
args: RowEnumeratorArgs,
|
18
18
|
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
19
19
|
let kwargs = RHash::new();
|
20
|
-
kwargs.aset(
|
20
|
+
kwargs.aset(
|
21
|
+
Symbol::new("result_type"),
|
22
|
+
Symbol::new(args.result_type.to_string()),
|
23
|
+
)?;
|
21
24
|
if let Some(columns) = args.columns {
|
22
25
|
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
23
26
|
}
|
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
|
|
30
33
|
pub struct ColumnEnumeratorArgs {
|
31
34
|
pub rb_self: Value,
|
32
35
|
pub to_read: Value,
|
33
|
-
pub result_type:
|
36
|
+
pub result_type: ParserResultType,
|
34
37
|
pub columns: Option<Vec<String>>,
|
35
38
|
pub batch_size: Option<usize>,
|
36
39
|
}
|
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
|
|
40
43
|
args: ColumnEnumeratorArgs,
|
41
44
|
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
42
45
|
let kwargs = RHash::new();
|
43
|
-
kwargs.aset(
|
46
|
+
kwargs.aset(
|
47
|
+
Symbol::new("result_type"),
|
48
|
+
Symbol::new(args.result_type.to_string()),
|
49
|
+
)?;
|
44
50
|
if let Some(columns) = args.columns {
|
45
51
|
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
46
52
|
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -6,6 +6,7 @@ mod ruby_integration;
|
|
6
6
|
mod ruby_reader;
|
7
7
|
mod types;
|
8
8
|
mod utils;
|
9
|
+
mod writer;
|
9
10
|
|
10
11
|
use crate::enumerator::*;
|
11
12
|
use crate::reader::*;
|
@@ -13,6 +14,8 @@ use crate::ruby_integration::*;
|
|
13
14
|
use crate::types::*;
|
14
15
|
|
15
16
|
use magnus::{Error, Ruby};
|
17
|
+
use writer::write_columns;
|
18
|
+
use writer::write_rows;
|
16
19
|
|
17
20
|
/// Initializes the Ruby extension and defines methods.
|
18
21
|
#[magnus::init]
|
@@ -20,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
20
23
|
let module = ruby.define_module("Parquet")?;
|
21
24
|
module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
|
22
25
|
module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
|
26
|
+
module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
|
27
|
+
module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
|
23
28
|
Ok(())
|
24
29
|
}
|
@@ -0,0 +1,42 @@
|
|
1
|
+
mod parquet_column_reader;
|
2
|
+
mod parquet_row_reader;
|
3
|
+
|
4
|
+
use std::io;
|
5
|
+
|
6
|
+
use magnus::{Error as MagnusError, Ruby};
|
7
|
+
use thiserror::Error;
|
8
|
+
|
9
|
+
use crate::header_cache::CacheError;
|
10
|
+
pub use parquet_column_reader::parse_parquet_columns;
|
11
|
+
pub use parquet_row_reader::parse_parquet_rows;
|
12
|
+
|
13
|
+
#[derive(Error, Debug)]
|
14
|
+
pub enum ReaderError {
|
15
|
+
#[error("Failed to get file descriptor: {0}")]
|
16
|
+
FileDescriptor(String),
|
17
|
+
#[error("Invalid file descriptor")]
|
18
|
+
InvalidFileDescriptor,
|
19
|
+
#[error("Failed to open file: {0}")]
|
20
|
+
FileOpen(#[from] io::Error),
|
21
|
+
#[error("Failed to intern headers: {0}")]
|
22
|
+
HeaderIntern(#[from] CacheError),
|
23
|
+
#[error("Ruby error: {0}")]
|
24
|
+
Ruby(String),
|
25
|
+
#[error("Parquet error: {0}")]
|
26
|
+
Parquet(#[from] parquet::errors::ParquetError),
|
27
|
+
}
|
28
|
+
|
29
|
+
impl From<MagnusError> for ReaderError {
|
30
|
+
fn from(err: MagnusError) -> Self {
|
31
|
+
Self::Ruby(err.to_string())
|
32
|
+
}
|
33
|
+
}
|
34
|
+
|
35
|
+
impl From<ReaderError> for MagnusError {
|
36
|
+
fn from(err: ReaderError) -> Self {
|
37
|
+
MagnusError::new(
|
38
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
39
|
+
err.to_string(),
|
40
|
+
)
|
41
|
+
}
|
42
|
+
}
|
@@ -1,11 +1,7 @@
|
|
1
|
-
// =============================================================================
|
2
|
-
// Imports and Dependencies
|
3
|
-
// =============================================================================
|
4
1
|
use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
|
5
2
|
use crate::{
|
6
|
-
create_column_enumerator,
|
7
|
-
|
8
|
-
SeekableRubyValue,
|
3
|
+
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
|
4
|
+
ParquetValueVec, ParserResultType, SeekableRubyValue,
|
9
5
|
};
|
10
6
|
use ahash::RandomState;
|
11
7
|
use magnus::rb_sys::AsRawValue;
|
@@ -14,149 +10,20 @@ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
|
|
14
10
|
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
15
11
|
use parquet::arrow::ProjectionMask;
|
16
12
|
use parquet::errors::ParquetError;
|
17
|
-
use parquet::file::reader::FileReader;
|
18
|
-
use parquet::file::reader::SerializedFileReader;
|
19
|
-
use parquet::record::reader::RowIter as ParquetRowIter;
|
20
|
-
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
21
13
|
use std::collections::HashMap;
|
22
14
|
use std::fs::File;
|
23
|
-
use std::io
|
15
|
+
use std::io;
|
24
16
|
use std::mem::ManuallyDrop;
|
25
17
|
use std::os::fd::FromRawFd;
|
26
18
|
use std::sync::OnceLock;
|
27
19
|
use thiserror::Error;
|
28
20
|
|
29
|
-
#[inline]
|
30
|
-
pub fn parse_parquet_rows<'a>(
|
31
|
-
rb_self: Value,
|
32
|
-
args: &[Value],
|
33
|
-
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
34
|
-
let original = unsafe { Ruby::get_unchecked() };
|
35
|
-
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
36
|
-
|
37
|
-
let ParquetRowsArgs {
|
38
|
-
to_read,
|
39
|
-
result_type,
|
40
|
-
columns,
|
41
|
-
} = parse_parquet_rows_args(&ruby, args)?;
|
42
|
-
|
43
|
-
if !ruby.block_given() {
|
44
|
-
return create_row_enumerator(RowEnumeratorArgs {
|
45
|
-
rb_self,
|
46
|
-
to_read,
|
47
|
-
result_type,
|
48
|
-
columns,
|
49
|
-
});
|
50
|
-
}
|
51
|
-
|
52
|
-
let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
|
53
|
-
let path_string = to_read.to_r_string()?;
|
54
|
-
let file_path = unsafe { path_string.as_str()? };
|
55
|
-
let file = File::open(file_path).unwrap();
|
56
|
-
let reader = SerializedFileReader::new(file).unwrap();
|
57
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
58
|
-
|
59
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
60
|
-
} else if to_read.is_kind_of(ruby.class_io()) {
|
61
|
-
let raw_value = to_read.as_raw();
|
62
|
-
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
63
|
-
.map_err(|_| {
|
64
|
-
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
65
|
-
})?;
|
66
|
-
|
67
|
-
if fd < 0 {
|
68
|
-
return Err(ReaderError::InvalidFileDescriptor.into());
|
69
|
-
}
|
70
|
-
|
71
|
-
let file = unsafe { File::from_raw_fd(fd) };
|
72
|
-
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
73
|
-
let reader = SerializedFileReader::new(file).unwrap();
|
74
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
75
|
-
|
76
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
77
|
-
} else {
|
78
|
-
let readable = SeekableRubyValue(Opaque::from(to_read));
|
79
|
-
let reader = SerializedFileReader::new(readable).unwrap();
|
80
|
-
let schema = reader.metadata().file_metadata().schema().clone();
|
81
|
-
|
82
|
-
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
83
|
-
};
|
84
|
-
|
85
|
-
if let Some(cols) = columns {
|
86
|
-
let projection = create_projection_schema(&schema, &cols);
|
87
|
-
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
88
|
-
MagnusError::new(
|
89
|
-
ruby.exception_runtime_error(),
|
90
|
-
format!("Failed to create projection: {}", e),
|
91
|
-
)
|
92
|
-
})?;
|
93
|
-
}
|
94
|
-
|
95
|
-
let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
|
96
|
-
"hash" => {
|
97
|
-
let headers = OnceLock::new();
|
98
|
-
let headers_clone = headers.clone();
|
99
|
-
let iter = iter
|
100
|
-
.filter_map(move |row| {
|
101
|
-
row.ok().map(|row| {
|
102
|
-
let headers = headers_clone.get_or_init(|| {
|
103
|
-
let column_count = row.get_column_iter().count();
|
104
|
-
|
105
|
-
let mut header_string = Vec::with_capacity(column_count);
|
106
|
-
for (k, _) in row.get_column_iter() {
|
107
|
-
header_string.push(k.to_owned());
|
108
|
-
}
|
109
|
-
|
110
|
-
let headers = StringCache::intern_many(&header_string).unwrap();
|
111
|
-
|
112
|
-
headers
|
113
|
-
});
|
114
|
-
|
115
|
-
let mut map =
|
116
|
-
HashMap::with_capacity_and_hasher(headers.len(), Default::default());
|
117
|
-
row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
|
118
|
-
map.insert(headers[i], ParquetField(v.clone()));
|
119
|
-
});
|
120
|
-
map
|
121
|
-
})
|
122
|
-
})
|
123
|
-
.map(RowRecord::Map);
|
124
|
-
|
125
|
-
Box::new(HeaderCacheCleanupIter {
|
126
|
-
inner: iter,
|
127
|
-
headers,
|
128
|
-
})
|
129
|
-
}
|
130
|
-
"array" => Box::new(
|
131
|
-
iter.filter_map(|row| {
|
132
|
-
row.ok().map(|row| {
|
133
|
-
let column_count = row.get_column_iter().count();
|
134
|
-
let mut vec = Vec::with_capacity(column_count);
|
135
|
-
row.get_column_iter()
|
136
|
-
.for_each(|(_, v)| vec.push(ParquetField(v.clone())));
|
137
|
-
vec
|
138
|
-
})
|
139
|
-
})
|
140
|
-
.map(RowRecord::Vec),
|
141
|
-
),
|
142
|
-
_ => {
|
143
|
-
return Err(MagnusError::new(
|
144
|
-
ruby.exception_runtime_error(),
|
145
|
-
"Invalid result type",
|
146
|
-
))
|
147
|
-
}
|
148
|
-
};
|
149
|
-
|
150
|
-
Ok(Yield::Iter(iter))
|
151
|
-
}
|
152
|
-
|
153
21
|
#[inline]
|
154
22
|
pub fn parse_parquet_columns<'a>(
|
155
23
|
rb_self: Value,
|
156
24
|
args: &[Value],
|
157
25
|
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
158
|
-
let
|
159
|
-
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
26
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
160
27
|
|
161
28
|
let ParquetColumnsArgs {
|
162
29
|
to_read,
|
@@ -282,8 +149,8 @@ pub fn parse_parquet_columns<'a>(
|
|
282
149
|
return Ok(Yield::Iter(Box::new(column_record.into_iter())));
|
283
150
|
}
|
284
151
|
|
285
|
-
let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type
|
286
|
-
|
152
|
+
let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
|
153
|
+
ParserResultType::Hash => {
|
287
154
|
let headers = OnceLock::new();
|
288
155
|
let headers_clone = headers.clone();
|
289
156
|
let iter = batch_reader
|
@@ -318,7 +185,7 @@ pub fn parse_parquet_columns<'a>(
|
|
318
185
|
headers,
|
319
186
|
})
|
320
187
|
}
|
321
|
-
|
188
|
+
ParserResultType::Array => Box::new(
|
322
189
|
batch_reader
|
323
190
|
.filter_map(|batch| {
|
324
191
|
batch.ok().map(|batch| {
|
@@ -334,35 +201,11 @@ pub fn parse_parquet_columns<'a>(
|
|
334
201
|
})
|
335
202
|
.map(ColumnRecord::Vec),
|
336
203
|
),
|
337
|
-
_ => {
|
338
|
-
return Err(MagnusError::new(
|
339
|
-
ruby.exception_runtime_error(),
|
340
|
-
"Invalid result type",
|
341
|
-
))
|
342
|
-
}
|
343
204
|
};
|
344
205
|
|
345
206
|
Ok(Yield::Iter(iter))
|
346
207
|
}
|
347
208
|
|
348
|
-
fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
|
349
|
-
if let SchemaType::GroupType { fields, .. } = schema {
|
350
|
-
let projected_fields: Vec<TypePtr> = fields
|
351
|
-
.iter()
|
352
|
-
.filter(|field| columns.contains(&field.name().to_string()))
|
353
|
-
.cloned()
|
354
|
-
.collect();
|
355
|
-
|
356
|
-
SchemaType::GroupType {
|
357
|
-
basic_info: schema.get_basic_info().clone(),
|
358
|
-
fields: projected_fields,
|
359
|
-
}
|
360
|
-
} else {
|
361
|
-
// Return original schema if not a group type
|
362
|
-
schema.clone()
|
363
|
-
}
|
364
|
-
}
|
365
|
-
|
366
209
|
#[derive(Error, Debug)]
|
367
210
|
pub enum ReaderError {
|
368
211
|
#[error("Failed to get file descriptor: {0}")]
|