parquet 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +42 -7
- data/Gemfile +7 -2
- data/ext/parquet/Cargo.toml +7 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +28 -0
- data/ext/parquet/src/header_cache.rs +23 -1
- data/ext/parquet/src/lib.rs +7 -0
- data/ext/parquet/src/reader.rs +92 -239
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +90 -0
- data/ext/parquet/src/utils.rs +7 -2
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +4 -2
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 70d9932bf622cd2647423e2519013d3a9f9256217effe9610e9aeaaebbcf1778
|
4
|
+
data.tar.gz: fae3767ce0d950c91b17f77b740159d863293e1288063ed15d9b9c1f82e87fe1
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a03e75bcd377ce5a61cd5f17685995c420601ac5917bd3d4a99dc082686423729ee5f0913bb032fe826dd1a8bac9b52c152cfb2037a376751258c17f3b0e63b1
|
7
|
+
data.tar.gz: ddfbb0ee14a6b7dcce47caf41962afe9610ab175d2b829c2744d62bed67cc746e64d214f64318220f2301a9ce8dcdecf9f9f9e90786df3d18f244716724abef8
|
data/Cargo.lock
CHANGED
@@ -749,6 +749,26 @@ version = "1.0.14"
|
|
749
749
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
750
750
|
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
751
751
|
|
752
|
+
[[package]]
|
753
|
+
name = "jemalloc-sys"
|
754
|
+
version = "0.5.4+5.3.0-patched"
|
755
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
756
|
+
checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
|
757
|
+
dependencies = [
|
758
|
+
"cc",
|
759
|
+
"libc",
|
760
|
+
]
|
761
|
+
|
762
|
+
[[package]]
|
763
|
+
name = "jemallocator"
|
764
|
+
version = "0.5.4"
|
765
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
766
|
+
checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
|
767
|
+
dependencies = [
|
768
|
+
"jemalloc-sys",
|
769
|
+
"libc",
|
770
|
+
]
|
771
|
+
|
752
772
|
[[package]]
|
753
773
|
name = "jobserver"
|
754
774
|
version = "0.1.32"
|
@@ -876,6 +896,16 @@ version = "0.2.11"
|
|
876
896
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
877
897
|
checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
|
878
898
|
|
899
|
+
[[package]]
|
900
|
+
name = "libmimalloc-sys"
|
901
|
+
version = "0.1.39"
|
902
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
903
|
+
checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
|
904
|
+
dependencies = [
|
905
|
+
"cc",
|
906
|
+
"libc",
|
907
|
+
]
|
908
|
+
|
879
909
|
[[package]]
|
880
910
|
name = "litemap"
|
881
911
|
version = "0.7.4"
|
@@ -948,6 +978,15 @@ version = "2.7.4"
|
|
948
978
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
949
979
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
950
980
|
|
981
|
+
[[package]]
|
982
|
+
name = "mimalloc"
|
983
|
+
version = "0.1.43"
|
984
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
985
|
+
checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
|
986
|
+
dependencies = [
|
987
|
+
"libmimalloc-sys",
|
988
|
+
]
|
989
|
+
|
951
990
|
[[package]]
|
952
991
|
name = "minimal-lexical"
|
953
992
|
version = "0.2.1"
|
@@ -1119,15 +1158,17 @@ dependencies = [
|
|
1119
1158
|
name = "parquet"
|
1120
1159
|
version = "0.1.0"
|
1121
1160
|
dependencies = [
|
1161
|
+
"ahash",
|
1122
1162
|
"bytes",
|
1163
|
+
"jemallocator",
|
1123
1164
|
"kanal",
|
1124
1165
|
"magnus 0.7.1",
|
1166
|
+
"mimalloc",
|
1125
1167
|
"parquet 54.0.0",
|
1126
1168
|
"rb-sys",
|
1127
1169
|
"serde",
|
1128
1170
|
"serde_magnus",
|
1129
1171
|
"thiserror",
|
1130
|
-
"xxhash-rust",
|
1131
1172
|
]
|
1132
1173
|
|
1133
1174
|
[[package]]
|
@@ -1796,12 +1837,6 @@ version = "0.5.5"
|
|
1796
1837
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1797
1838
|
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
1798
1839
|
|
1799
|
-
[[package]]
|
1800
|
-
name = "xxhash-rust"
|
1801
|
-
version = "0.8.14"
|
1802
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1803
|
-
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
1804
|
-
|
1805
1840
|
[[package]]
|
1806
1841
|
name = "yoke"
|
1807
1842
|
version = "0.7.5"
|
data/Gemfile
CHANGED
@@ -6,7 +6,12 @@ gem "rake"
|
|
6
6
|
# Use local version of parquet
|
7
7
|
gemspec
|
8
8
|
|
9
|
-
group :development
|
10
|
-
gem "minitest", "~> 5.0"
|
9
|
+
group :development do
|
11
10
|
gem "benchmark-ips", "~> 2.12"
|
11
|
+
# gem "polars-df"
|
12
|
+
# gem "duckdb"
|
13
|
+
end
|
14
|
+
|
15
|
+
group :test do
|
16
|
+
gem "minitest", "~> 5.0"
|
12
17
|
end
|
data/ext/parquet/Cargo.toml
CHANGED
@@ -7,6 +7,7 @@ edition = "2021"
|
|
7
7
|
crate-type = ["cdylib"]
|
8
8
|
|
9
9
|
[dependencies]
|
10
|
+
ahash = "0.8"
|
10
11
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
11
12
|
bytes = "^1.9"
|
12
13
|
kanal = "0.1.0-pre8"
|
@@ -15,4 +16,9 @@ rb-sys = "^0.9"
|
|
15
16
|
serde = { version = "1.0", features = ["derive"] }
|
16
17
|
serde_magnus = "0.8.1"
|
17
18
|
thiserror = "2.0"
|
18
|
-
|
19
|
+
|
20
|
+
[target.'cfg(target_os = "linux")'.dependencies]
|
21
|
+
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
22
|
+
|
23
|
+
[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
|
24
|
+
mimalloc = { version = "0.1", default-features = false }
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#[cfg(target_os = "linux")]
|
2
|
+
use jemallocator::Jemalloc;
|
3
|
+
|
4
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
5
|
+
use mimalloc::MiMalloc;
|
6
|
+
|
7
|
+
#[global_allocator]
|
8
|
+
#[cfg(target_os = "linux")]
|
9
|
+
static ALLOC: Jemalloc = Jemalloc;
|
10
|
+
|
11
|
+
#[global_allocator]
|
12
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
13
|
+
static ALLOC: MiMalloc = MiMalloc;
|
@@ -0,0 +1,28 @@
|
|
1
|
+
use ahash::RandomState;
|
2
|
+
use magnus::{
|
3
|
+
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
4
|
+
};
|
5
|
+
|
6
|
+
use crate::Record;
|
7
|
+
|
8
|
+
pub struct EnumeratorArgs {
|
9
|
+
pub rb_self: Value,
|
10
|
+
pub to_read: Value,
|
11
|
+
pub result_type: String,
|
12
|
+
pub columns: Option<Vec<String>>,
|
13
|
+
}
|
14
|
+
|
15
|
+
#[inline]
|
16
|
+
pub fn create_enumerator(
|
17
|
+
args: EnumeratorArgs,
|
18
|
+
) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
|
19
|
+
let kwargs = RHash::new();
|
20
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
21
|
+
if let Some(columns) = args.columns {
|
22
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
23
|
+
}
|
24
|
+
let enumerator = args
|
25
|
+
.rb_self
|
26
|
+
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
|
27
|
+
Ok(Yield::Enumerator(enumerator))
|
28
|
+
}
|
@@ -6,7 +6,7 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{atomic::AtomicU32, LazyLock, Mutex},
|
9
|
+
sync::{atomic::AtomicU32, LazyLock, Mutex, OnceLock},
|
10
10
|
};
|
11
11
|
use thiserror::Error;
|
12
12
|
|
@@ -79,3 +79,25 @@ impl StringCache {
|
|
79
79
|
Ok(())
|
80
80
|
}
|
81
81
|
}
|
82
|
+
|
83
|
+
pub struct HeaderCacheCleanupIter<I> {
|
84
|
+
pub inner: I,
|
85
|
+
pub headers: OnceLock<Vec<&'static str>>,
|
86
|
+
}
|
87
|
+
|
88
|
+
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
89
|
+
type Item = I::Item;
|
90
|
+
|
91
|
+
#[inline(always)]
|
92
|
+
fn next(&mut self) -> Option<Self::Item> {
|
93
|
+
self.inner.next()
|
94
|
+
}
|
95
|
+
}
|
96
|
+
|
97
|
+
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
98
|
+
fn drop(&mut self) {
|
99
|
+
if let Some(headers) = self.headers.get() {
|
100
|
+
StringCache::clear(&headers).unwrap();
|
101
|
+
}
|
102
|
+
}
|
103
|
+
}
|
data/ext/parquet/src/lib.rs
CHANGED
@@ -1,9 +1,16 @@
|
|
1
|
+
mod allocator;
|
2
|
+
mod enumerator;
|
1
3
|
pub mod header_cache;
|
2
4
|
mod reader;
|
5
|
+
mod ruby_integration;
|
3
6
|
mod ruby_reader;
|
7
|
+
mod types;
|
4
8
|
mod utils;
|
5
9
|
|
10
|
+
use crate::enumerator::*;
|
6
11
|
use crate::reader::*;
|
12
|
+
use crate::ruby_integration::*;
|
13
|
+
use crate::types::*;
|
7
14
|
|
8
15
|
use magnus::{Error, Ruby};
|
9
16
|
|
data/ext/parquet/src/reader.rs
CHANGED
@@ -1,106 +1,39 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
use
|
1
|
+
// =============================================================================
|
2
|
+
// Imports and Dependencies
|
3
|
+
// =============================================================================
|
4
|
+
use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
|
5
|
+
use crate::{
|
6
|
+
create_enumerator, utils::*, EnumeratorArgs, ForgottenFileHandle, ParquetField, Record,
|
7
|
+
SeekableRubyValue,
|
8
|
+
};
|
9
|
+
use ahash::RandomState;
|
5
10
|
use magnus::rb_sys::AsRawValue;
|
6
11
|
use magnus::value::{Opaque, ReprValue};
|
7
|
-
use magnus::
|
8
|
-
use
|
9
|
-
use parquet::
|
10
|
-
use parquet::
|
11
|
-
use parquet::
|
12
|
+
use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
|
13
|
+
use parquet::file::reader::FileReader;
|
14
|
+
use parquet::file::reader::SerializedFileReader;
|
15
|
+
use parquet::record::reader::RowIter as ParquetRowIter;
|
16
|
+
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
12
17
|
use std::collections::HashMap;
|
13
18
|
use std::fs::File;
|
14
|
-
use std::io::{self
|
19
|
+
use std::io::{self};
|
15
20
|
use std::mem::ManuallyDrop;
|
16
21
|
use std::os::fd::FromRawFd;
|
17
22
|
use std::sync::OnceLock;
|
18
|
-
use std::{borrow::Cow, hash::BuildHasher};
|
19
23
|
use thiserror::Error;
|
20
|
-
use xxhash_rust::xxh3::Xxh3Builder;
|
21
|
-
|
22
|
-
use parquet::record::reader::RowIter as ParquetRowIter;
|
23
|
-
|
24
|
-
#[derive(Error, Debug)]
|
25
|
-
pub enum ReaderError {
|
26
|
-
#[error("Failed to get file descriptor: {0}")]
|
27
|
-
FileDescriptor(String),
|
28
|
-
#[error("Invalid file descriptor")]
|
29
|
-
InvalidFileDescriptor,
|
30
|
-
#[error("Failed to open file: {0}")]
|
31
|
-
FileOpen(#[from] io::Error),
|
32
|
-
#[error("Failed to intern headers: {0}")]
|
33
|
-
HeaderIntern(#[from] CacheError),
|
34
|
-
#[error("Ruby error: {0}")]
|
35
|
-
Ruby(String),
|
36
|
-
}
|
37
|
-
|
38
|
-
impl From<MagnusError> for ReaderError {
|
39
|
-
fn from(err: MagnusError) -> Self {
|
40
|
-
Self::Ruby(err.to_string())
|
41
|
-
}
|
42
|
-
}
|
43
|
-
|
44
|
-
impl From<ReaderError> for MagnusError {
|
45
|
-
fn from(err: ReaderError) -> Self {
|
46
|
-
MagnusError::new(
|
47
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
48
|
-
err.to_string(),
|
49
|
-
)
|
50
|
-
}
|
51
|
-
}
|
52
|
-
|
53
|
-
struct ForgottenFileHandle(ManuallyDrop<File>);
|
54
|
-
|
55
|
-
impl Length for ForgottenFileHandle {
|
56
|
-
fn len(&self) -> u64 {
|
57
|
-
self.0.len()
|
58
|
-
}
|
59
|
-
}
|
60
|
-
|
61
|
-
impl ChunkReader for ForgottenFileHandle {
|
62
|
-
type T = BufReader<File>;
|
63
|
-
|
64
|
-
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
65
|
-
self.0.get_read(start)
|
66
|
-
}
|
67
|
-
|
68
|
-
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
69
|
-
self.0.get_bytes(start, length)
|
70
|
-
}
|
71
|
-
}
|
72
|
-
|
73
|
-
struct HeaderCacheCleanupIter<I> {
|
74
|
-
inner: I,
|
75
|
-
headers: OnceLock<Vec<&'static str>>,
|
76
|
-
}
|
77
|
-
|
78
|
-
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
79
|
-
type Item = I::Item;
|
80
|
-
|
81
|
-
fn next(&mut self) -> Option<Self::Item> {
|
82
|
-
self.inner.next()
|
83
|
-
}
|
84
|
-
}
|
85
|
-
|
86
|
-
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
87
|
-
fn drop(&mut self) {
|
88
|
-
if let Some(headers) = self.headers.get() {
|
89
|
-
StringCache::clear(&headers).unwrap();
|
90
|
-
}
|
91
|
-
}
|
92
|
-
}
|
93
24
|
|
25
|
+
#[inline]
|
94
26
|
pub fn parse_parquet<'a>(
|
95
27
|
rb_self: Value,
|
96
28
|
args: &[Value],
|
97
|
-
) -> Result<Yield<Box<dyn Iterator<Item = Record<
|
29
|
+
) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
|
98
30
|
let original = unsafe { Ruby::get_unchecked() };
|
99
31
|
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
100
32
|
|
101
33
|
let ParquetArgs {
|
102
34
|
to_read,
|
103
35
|
result_type,
|
36
|
+
columns,
|
104
37
|
} = parse_parquet_args(&ruby, args)?;
|
105
38
|
|
106
39
|
if !ruby.block_given() {
|
@@ -108,15 +41,18 @@ pub fn parse_parquet<'a>(
|
|
108
41
|
rb_self,
|
109
42
|
to_read,
|
110
43
|
result_type,
|
44
|
+
columns,
|
111
45
|
});
|
112
46
|
}
|
113
47
|
|
114
|
-
let iter = if to_read.is_kind_of(ruby.class_string()) {
|
48
|
+
let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
|
115
49
|
let path_string = to_read.to_r_string()?;
|
116
50
|
let file_path = unsafe { path_string.as_str()? };
|
117
51
|
let file = File::open(file_path).unwrap();
|
118
52
|
let reader = SerializedFileReader::new(file).unwrap();
|
119
|
-
|
53
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
54
|
+
|
55
|
+
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
120
56
|
} else if to_read.is_kind_of(ruby.class_io()) {
|
121
57
|
let raw_value = to_read.as_raw();
|
122
58
|
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
@@ -131,14 +67,28 @@ pub fn parse_parquet<'a>(
|
|
131
67
|
let file = unsafe { File::from_raw_fd(fd) };
|
132
68
|
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
133
69
|
let reader = SerializedFileReader::new(file).unwrap();
|
134
|
-
|
70
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
71
|
+
|
72
|
+
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
135
73
|
} else {
|
136
74
|
let readable = SeekableRubyValue(Opaque::from(to_read));
|
137
75
|
let reader = SerializedFileReader::new(readable).unwrap();
|
138
|
-
|
76
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
77
|
+
|
78
|
+
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
139
79
|
};
|
140
80
|
|
141
|
-
let
|
81
|
+
if let Some(cols) = columns {
|
82
|
+
let projection = create_projection_schema(&schema, &cols);
|
83
|
+
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
84
|
+
MagnusError::new(
|
85
|
+
ruby.exception_runtime_error(),
|
86
|
+
format!("Failed to create projection: {}", e),
|
87
|
+
)
|
88
|
+
})?;
|
89
|
+
}
|
90
|
+
|
91
|
+
let iter: Box<dyn Iterator<Item = Record<RandomState>>> = match result_type.as_str() {
|
142
92
|
"hash" => {
|
143
93
|
let headers = OnceLock::new();
|
144
94
|
let headers_clone = headers.clone();
|
@@ -146,21 +96,23 @@ pub fn parse_parquet<'a>(
|
|
146
96
|
.filter_map(move |row| {
|
147
97
|
row.ok().map(|row| {
|
148
98
|
let headers = headers_clone.get_or_init(|| {
|
149
|
-
row.get_column_iter()
|
150
|
-
|
151
|
-
|
99
|
+
let column_count = row.get_column_iter().count();
|
100
|
+
let mut headers = Vec::with_capacity(column_count);
|
101
|
+
row.get_column_iter().for_each(|(k, _)| {
|
102
|
+
headers.push(StringCache::intern(k.to_owned()).unwrap())
|
103
|
+
});
|
104
|
+
headers
|
152
105
|
});
|
153
106
|
|
154
|
-
|
155
|
-
.
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
.collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
|
107
|
+
let mut map =
|
108
|
+
HashMap::with_capacity_and_hasher(headers.len(), Default::default());
|
109
|
+
row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
|
110
|
+
map.insert(headers[i], ParquetField(v.clone()));
|
111
|
+
});
|
112
|
+
map
|
161
113
|
})
|
162
114
|
})
|
163
|
-
.map(
|
115
|
+
.map(Record::Map);
|
164
116
|
|
165
117
|
Box::new(HeaderCacheCleanupIter {
|
166
118
|
inner: iter,
|
@@ -170,12 +122,14 @@ pub fn parse_parquet<'a>(
|
|
170
122
|
"array" => Box::new(
|
171
123
|
iter.filter_map(|row| {
|
172
124
|
row.ok().map(|row| {
|
125
|
+
let column_count = row.get_column_iter().count();
|
126
|
+
let mut vec = Vec::with_capacity(column_count);
|
173
127
|
row.get_column_iter()
|
174
|
-
.
|
175
|
-
|
128
|
+
.for_each(|(_, v)| vec.push(ParquetField(v.clone())));
|
129
|
+
vec
|
176
130
|
})
|
177
131
|
})
|
178
|
-
.map(
|
132
|
+
.map(Record::Vec),
|
179
133
|
),
|
180
134
|
_ => {
|
181
135
|
return Err(MagnusError::new(
|
@@ -188,150 +142,49 @@ pub fn parse_parquet<'a>(
|
|
188
142
|
Ok(Yield::Iter(iter))
|
189
143
|
}
|
190
144
|
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
args: EnumeratorArgs,
|
199
|
-
) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
|
200
|
-
let kwargs = RHash::new();
|
201
|
-
|
202
|
-
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
203
|
-
|
204
|
-
let enumerator = args
|
205
|
-
.rb_self
|
206
|
-
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
|
207
|
-
Ok(Yield::Enumerator(enumerator))
|
208
|
-
}
|
209
|
-
|
210
|
-
#[derive(Debug)]
|
211
|
-
pub enum Record<S: BuildHasher + Default> {
|
212
|
-
Vec(Vec<ParquetField>),
|
213
|
-
Map(HashMap<&'static str, ParquetField, S>),
|
214
|
-
}
|
145
|
+
fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
|
146
|
+
if let SchemaType::GroupType { fields, .. } = schema {
|
147
|
+
let projected_fields: Vec<TypePtr> = fields
|
148
|
+
.iter()
|
149
|
+
.filter(|field| columns.contains(&field.name().to_string()))
|
150
|
+
.cloned()
|
151
|
+
.collect();
|
215
152
|
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
match self {
|
220
|
-
Record::Vec(vec) => {
|
221
|
-
let ary = handle.ary_new_capa(vec.len());
|
222
|
-
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
223
|
-
ary.into_value_with(handle)
|
224
|
-
}
|
225
|
-
Record::Map(map) => {
|
226
|
-
// Pre-allocate the hash with the known size
|
227
|
-
let hash = handle.hash_new_capa(map.len());
|
228
|
-
map.into_iter()
|
229
|
-
.try_for_each(|(k, v)| hash.aset(k, v))
|
230
|
-
.unwrap();
|
231
|
-
hash.into_value_with(handle)
|
232
|
-
}
|
153
|
+
SchemaType::GroupType {
|
154
|
+
basic_info: schema.get_basic_info().clone(),
|
155
|
+
fields: projected_fields,
|
233
156
|
}
|
157
|
+
} else {
|
158
|
+
// Return original schema if not a group type
|
159
|
+
schema.clone()
|
234
160
|
}
|
235
161
|
}
|
236
162
|
|
237
|
-
#[derive(
|
238
|
-
pub
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
}
|
244
|
-
|
245
|
-
|
246
|
-
#[
|
247
|
-
|
248
|
-
|
249
|
-
impl<'a> IntoValue for ParquetField {
|
250
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
251
|
-
match self.0 {
|
252
|
-
Field::Byte(b) => b.into_value_with(handle),
|
253
|
-
Field::Bool(b) => b.into_value_with(handle),
|
254
|
-
Field::Short(s) => s.into_value_with(handle),
|
255
|
-
Field::Int(i) => i.into_value_with(handle),
|
256
|
-
Field::Long(l) => l.into_value_with(handle),
|
257
|
-
Field::UByte(ub) => ub.into_value_with(handle),
|
258
|
-
Field::UShort(us) => us.into_value_with(handle),
|
259
|
-
Field::UInt(ui) => ui.into_value_with(handle),
|
260
|
-
Field::ULong(ul) => ul.into_value_with(handle),
|
261
|
-
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
262
|
-
Field::Float(f) => f.into_value_with(handle),
|
263
|
-
Field::Double(d) => d.into_value_with(handle),
|
264
|
-
|
265
|
-
Field::Str(s) => s.into_value_with(handle),
|
266
|
-
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
267
|
-
Field::Date(d) => d.into_value_with(handle),
|
268
|
-
Field::TimestampMillis(ts) => ts.into_value_with(handle),
|
269
|
-
Field::TimestampMicros(ts) => ts.into_value_with(handle),
|
270
|
-
Field::ListInternal(list) => {
|
271
|
-
let ary = handle.ary_new_capa(list.elements().len());
|
272
|
-
list.elements()
|
273
|
-
.iter()
|
274
|
-
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
275
|
-
.unwrap();
|
276
|
-
ary.into_value_with(handle)
|
277
|
-
}
|
278
|
-
Field::MapInternal(map) => {
|
279
|
-
let hash = handle.hash_new_capa(map.entries().len());
|
280
|
-
map.entries()
|
281
|
-
.iter()
|
282
|
-
.try_for_each(|(k, v)| {
|
283
|
-
hash.aset(
|
284
|
-
ParquetField(k.clone()).into_value_with(handle),
|
285
|
-
ParquetField(v.clone()).into_value_with(handle),
|
286
|
-
)
|
287
|
-
})
|
288
|
-
.unwrap();
|
289
|
-
hash.into_value_with(handle)
|
290
|
-
}
|
291
|
-
// Field::Decimal(d) => d.to_string().into_value_with(handle),
|
292
|
-
// Field::Group(row) => row.into_value_with(handle),
|
293
|
-
Field::Null => handle.qnil().as_value(),
|
294
|
-
_ => panic!("Unsupported field type"),
|
295
|
-
}
|
296
|
-
}
|
163
|
+
#[derive(Error, Debug)]
|
164
|
+
pub enum ReaderError {
|
165
|
+
#[error("Failed to get file descriptor: {0}")]
|
166
|
+
FileDescriptor(String),
|
167
|
+
#[error("Invalid file descriptor")]
|
168
|
+
InvalidFileDescriptor,
|
169
|
+
#[error("Failed to open file: {0}")]
|
170
|
+
FileOpen(#[from] io::Error),
|
171
|
+
#[error("Failed to intern headers: {0}")]
|
172
|
+
HeaderIntern(#[from] CacheError),
|
173
|
+
#[error("Ruby error: {0}")]
|
174
|
+
Ruby(String),
|
297
175
|
}
|
298
176
|
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
fn len(&self) -> u64 {
|
303
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
304
|
-
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
305
|
-
let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
|
306
|
-
let file_len = reader.seek(SeekFrom::End(0)).unwrap();
|
307
|
-
reader.seek(SeekFrom::Start(current_pos)).unwrap();
|
308
|
-
file_len
|
177
|
+
impl From<MagnusError> for ReaderError {
|
178
|
+
fn from(err: MagnusError) -> Self {
|
179
|
+
Self::Ruby(err.to_string())
|
309
180
|
}
|
310
181
|
}
|
311
182
|
|
312
|
-
impl
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
reader.seek(SeekFrom::Start(start))?;
|
319
|
-
Ok(BufReader::new(reader))
|
320
|
-
}
|
321
|
-
|
322
|
-
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
323
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
324
|
-
let mut buffer = Vec::with_capacity(length);
|
325
|
-
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
326
|
-
reader.seek(SeekFrom::Start(start))?;
|
327
|
-
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
328
|
-
|
329
|
-
if read != length {
|
330
|
-
return Err(ParquetError::EOF(format!(
|
331
|
-
"Expected to read {} bytes, read only {}",
|
332
|
-
length, read
|
333
|
-
)));
|
334
|
-
}
|
335
|
-
Ok(buffer.into())
|
183
|
+
impl From<ReaderError> for MagnusError {
|
184
|
+
fn from(err: ReaderError) -> Self {
|
185
|
+
MagnusError::new(
|
186
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
187
|
+
err.to_string(),
|
188
|
+
)
|
336
189
|
}
|
337
190
|
}
|
@@ -0,0 +1,77 @@
|
|
1
|
+
use std::{
|
2
|
+
fs::File,
|
3
|
+
io::{BufReader, SeekFrom},
|
4
|
+
mem::ManuallyDrop,
|
5
|
+
};
|
6
|
+
|
7
|
+
use bytes::Bytes;
|
8
|
+
use magnus::{value::Opaque, Ruby, Value};
|
9
|
+
use parquet::{
|
10
|
+
errors::ParquetError,
|
11
|
+
file::reader::{ChunkReader, Length},
|
12
|
+
};
|
13
|
+
use std::io::Read;
|
14
|
+
|
15
|
+
use crate::ruby_reader::{build_ruby_reader, SeekableRead};
|
16
|
+
|
17
|
+
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
18
|
+
|
19
|
+
pub struct SeekableRubyValue(pub Opaque<Value>);
|
20
|
+
|
21
|
+
impl Length for SeekableRubyValue {
|
22
|
+
fn len(&self) -> u64 {
|
23
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
24
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
25
|
+
let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
|
26
|
+
let file_len = reader.seek(SeekFrom::End(0)).unwrap();
|
27
|
+
reader.seek(SeekFrom::Start(current_pos)).unwrap();
|
28
|
+
file_len
|
29
|
+
}
|
30
|
+
}
|
31
|
+
|
32
|
+
impl ChunkReader for SeekableRubyValue {
|
33
|
+
type T = BufReader<Box<dyn SeekableRead>>;
|
34
|
+
|
35
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
36
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
37
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
38
|
+
reader.seek(SeekFrom::Start(start))?;
|
39
|
+
Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
|
40
|
+
}
|
41
|
+
|
42
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
43
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
44
|
+
let mut buffer = Vec::with_capacity(length);
|
45
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
46
|
+
reader.seek(SeekFrom::Start(start))?;
|
47
|
+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
48
|
+
|
49
|
+
if read != length {
|
50
|
+
return Err(ParquetError::EOF(format!(
|
51
|
+
"Expected to read {} bytes, read only {}",
|
52
|
+
length, read
|
53
|
+
)));
|
54
|
+
}
|
55
|
+
Ok(buffer.into())
|
56
|
+
}
|
57
|
+
}
|
58
|
+
|
59
|
+
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
|
60
|
+
|
61
|
+
impl Length for ForgottenFileHandle {
|
62
|
+
fn len(&self) -> u64 {
|
63
|
+
self.0.len()
|
64
|
+
}
|
65
|
+
}
|
66
|
+
|
67
|
+
impl ChunkReader for ForgottenFileHandle {
|
68
|
+
type T = BufReader<File>;
|
69
|
+
|
70
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
71
|
+
self.0.get_read(start)
|
72
|
+
}
|
73
|
+
|
74
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
75
|
+
self.0.get_bytes(start, length)
|
76
|
+
}
|
77
|
+
}
|
@@ -2,30 +2,24 @@ use magnus::{
|
|
2
2
|
value::{Opaque, ReprValue},
|
3
3
|
RClass, RString, Ruby, Value,
|
4
4
|
};
|
5
|
-
use std::io::{self, Read, Seek};
|
5
|
+
use std::io::{self, Read, Seek, SeekFrom, Write};
|
6
6
|
use std::sync::OnceLock;
|
7
7
|
|
8
8
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
9
9
|
|
10
|
-
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
11
|
-
|
12
10
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
13
11
|
/// and provide a standard Read implementation for them.
|
14
12
|
pub struct RubyReader<T> {
|
15
13
|
inner: T,
|
16
|
-
buffer: Option<Vec<u8>>,
|
17
14
|
offset: usize,
|
18
|
-
// Number of bytes that have been read into the buffer
|
19
|
-
// Used as an upper bound for offset
|
20
|
-
buffered_bytes: usize,
|
21
15
|
}
|
22
16
|
|
23
17
|
pub trait SeekableRead: std::io::Read + Seek {}
|
24
18
|
impl SeekableRead for RubyReader<Value> {}
|
25
19
|
impl SeekableRead for RubyReader<RString> {}
|
26
20
|
|
27
|
-
pub fn build_ruby_reader
|
28
|
-
ruby: &
|
21
|
+
pub fn build_ruby_reader(
|
22
|
+
ruby: &Ruby,
|
29
23
|
input: Value,
|
30
24
|
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
31
25
|
if RubyReader::is_string_io(ruby, &input) {
|
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
|
|
39
33
|
|
40
34
|
impl Seek for RubyReader<Value> {
|
41
35
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
42
|
-
let
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
}
|
47
|
-
io::SeekFrom::End(offset) => {
|
48
|
-
// SEEK_END - from end of stream
|
49
|
-
offset
|
50
|
-
}
|
51
|
-
io::SeekFrom::Current(offset) => {
|
52
|
-
// SEEK_CUR - relative to current
|
53
|
-
offset
|
54
|
-
}
|
36
|
+
let (whence, offset) = match pos {
|
37
|
+
SeekFrom::Start(i) => (0, i as i64),
|
38
|
+
SeekFrom::Current(i) => (1, i),
|
39
|
+
SeekFrom::End(i) => (2, i),
|
55
40
|
};
|
56
41
|
|
57
|
-
let
|
58
|
-
|
59
|
-
|
60
|
-
io::
|
61
|
-
|
42
|
+
let new_position = self
|
43
|
+
.inner
|
44
|
+
.funcall("seek", (offset, whence))
|
45
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
46
|
+
|
47
|
+
Ok(new_position)
|
48
|
+
}
|
49
|
+
}
|
62
50
|
|
63
|
-
|
64
|
-
|
51
|
+
impl Write for RubyReader<Value> {
|
52
|
+
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
53
|
+
let ruby_bytes = RString::from_slice(buf);
|
65
54
|
|
66
|
-
|
67
|
-
|
55
|
+
let bytes_written = self
|
56
|
+
.inner
|
57
|
+
.funcall::<_, _, usize>("write", (ruby_bytes,))
|
58
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
68
59
|
|
69
|
-
Ok(
|
60
|
+
Ok(bytes_written)
|
61
|
+
}
|
62
|
+
|
63
|
+
fn flush(&mut self) -> Result<(), io::Error> {
|
64
|
+
self.inner
|
65
|
+
.funcall::<_, _, Value>("flush", ())
|
66
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
67
|
+
|
68
|
+
Ok(())
|
70
69
|
}
|
71
70
|
}
|
72
71
|
|
73
72
|
impl Seek for RubyReader<RString> {
|
74
73
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
75
74
|
match pos {
|
76
|
-
io::SeekFrom::Start(offset) =>
|
77
|
-
|
78
|
-
|
79
|
-
io::SeekFrom::End(offset) => {
|
80
|
-
self.offset = (self.inner.len() - offset as usize) as usize;
|
81
|
-
}
|
82
|
-
io::SeekFrom::Current(offset) => {
|
83
|
-
self.offset = (self.offset as i64 + offset) as usize;
|
84
|
-
}
|
75
|
+
io::SeekFrom::Start(offset) => self.offset = offset as usize,
|
76
|
+
io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
|
77
|
+
io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
|
85
78
|
}
|
86
79
|
Ok(self.offset as u64)
|
87
80
|
}
|
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
|
|
106
99
|
fn from_io_like(input: Value) -> Self {
|
107
100
|
Self {
|
108
101
|
inner: input,
|
109
|
-
buffer: Some(vec![0; READ_BUFFER_SIZE]),
|
110
102
|
offset: 0,
|
111
|
-
buffered_bytes: 0,
|
112
103
|
}
|
113
104
|
}
|
114
|
-
|
115
|
-
fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
|
116
|
-
if let Some(from_buf) = &self.buffer {
|
117
|
-
// If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
|
118
|
-
if self.offset < self.buffered_bytes {
|
119
|
-
let remaining = self.buffered_bytes - self.offset;
|
120
|
-
let copy_size = remaining.min(to_buf.len());
|
121
|
-
to_buf[..copy_size]
|
122
|
-
.copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
|
123
|
-
self.offset += copy_size;
|
124
|
-
Some(Ok(copy_size))
|
125
|
-
} else {
|
126
|
-
None
|
127
|
-
}
|
128
|
-
} else {
|
129
|
-
None
|
130
|
-
}
|
131
|
-
}
|
132
|
-
|
133
|
-
fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
134
|
-
let buffer = self.buffer.as_mut().unwrap();
|
135
|
-
let result = self
|
136
|
-
.inner
|
137
|
-
.funcall::<_, _, RString>("read", (buffer.capacity(),))
|
138
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
139
|
-
|
140
|
-
if result.is_nil() {
|
141
|
-
return Ok(0); // EOF
|
142
|
-
}
|
143
|
-
|
144
|
-
let bytes = unsafe { result.as_slice() };
|
145
|
-
|
146
|
-
// Update internal buffer
|
147
|
-
let bytes_len = bytes.len();
|
148
|
-
if bytes_len == 0 {
|
149
|
-
return Ok(0);
|
150
|
-
}
|
151
|
-
|
152
|
-
// Only copy what we actually read
|
153
|
-
buffer[..bytes_len].copy_from_slice(bytes);
|
154
|
-
self.buffered_bytes = bytes_len;
|
155
|
-
|
156
|
-
// Copy to output buffer
|
157
|
-
let copy_size = bytes_len.min(buf.len());
|
158
|
-
buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
|
159
|
-
self.offset = copy_size;
|
160
|
-
Ok(copy_size)
|
161
|
-
}
|
162
105
|
}
|
163
106
|
|
164
107
|
impl RubyReader<RString> {
|
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
|
|
176
119
|
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
177
120
|
Ok(Box::new(Self {
|
178
121
|
inner: string_content,
|
179
|
-
buffer: None,
|
180
122
|
offset: 0,
|
181
|
-
buffered_bytes: 0,
|
182
123
|
}))
|
183
124
|
}
|
184
125
|
|
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
|
|
197
138
|
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
198
139
|
Ok(Box::new(Self {
|
199
140
|
inner: string_content,
|
200
|
-
buffer: None,
|
201
141
|
offset: 0,
|
202
|
-
buffered_bytes: 0,
|
203
142
|
}))
|
204
143
|
}
|
205
144
|
}
|
206
145
|
|
207
146
|
impl Read for RubyReader<Value> {
|
208
|
-
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
}
|
147
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
148
|
+
let bytes = self
|
149
|
+
.inner
|
150
|
+
.funcall::<_, _, RString>("read", (buf.len(),))
|
151
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
152
|
+
|
153
|
+
buf.write_all(unsafe { bytes.as_slice() })?;
|
154
|
+
|
155
|
+
Ok(bytes.len())
|
215
156
|
}
|
216
157
|
}
|
217
158
|
|
@@ -0,0 +1,90 @@
|
|
1
|
+
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
2
|
+
|
3
|
+
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
4
|
+
use parquet::record::Field;
|
5
|
+
|
6
|
+
#[derive(Debug)]
|
7
|
+
pub enum Record<S: BuildHasher + Default> {
|
8
|
+
Vec(Vec<ParquetField>),
|
9
|
+
Map(HashMap<&'static str, ParquetField, S>),
|
10
|
+
}
|
11
|
+
|
12
|
+
impl<S: BuildHasher + Default> IntoValue for Record<S> {
|
13
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
14
|
+
match self {
|
15
|
+
Record::Vec(vec) => {
|
16
|
+
let ary = handle.ary_new_capa(vec.len());
|
17
|
+
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
18
|
+
ary.into_value_with(handle)
|
19
|
+
}
|
20
|
+
Record::Map(map) => {
|
21
|
+
let hash = handle.hash_new_capa(map.len());
|
22
|
+
map.into_iter()
|
23
|
+
.try_for_each(|(k, v)| hash.aset(k, v))
|
24
|
+
.unwrap();
|
25
|
+
hash.into_value_with(handle)
|
26
|
+
}
|
27
|
+
}
|
28
|
+
}
|
29
|
+
}
|
30
|
+
|
31
|
+
#[derive(Debug, Clone)]
|
32
|
+
pub struct CowValue<'a>(pub Cow<'a, str>);
|
33
|
+
|
34
|
+
impl<'a> IntoValue for CowValue<'a> {
|
35
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
36
|
+
self.0.into_value_with(handle)
|
37
|
+
}
|
38
|
+
}
|
39
|
+
|
40
|
+
#[derive(Debug)]
|
41
|
+
pub struct ParquetField(pub Field);
|
42
|
+
|
43
|
+
impl IntoValue for ParquetField {
|
44
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
45
|
+
match self.0 {
|
46
|
+
Field::Byte(b) => b.into_value_with(handle),
|
47
|
+
Field::Bool(b) => b.into_value_with(handle),
|
48
|
+
Field::Short(s) => s.into_value_with(handle),
|
49
|
+
Field::Int(i) => i.into_value_with(handle),
|
50
|
+
Field::Long(l) => l.into_value_with(handle),
|
51
|
+
Field::UByte(ub) => ub.into_value_with(handle),
|
52
|
+
Field::UShort(us) => us.into_value_with(handle),
|
53
|
+
Field::UInt(ui) => ui.into_value_with(handle),
|
54
|
+
Field::ULong(ul) => ul.into_value_with(handle),
|
55
|
+
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
56
|
+
Field::Float(f) => f.into_value_with(handle),
|
57
|
+
Field::Double(d) => d.into_value_with(handle),
|
58
|
+
Field::Str(s) => s.into_value_with(handle),
|
59
|
+
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
60
|
+
Field::Date(d) => d.into_value_with(handle),
|
61
|
+
Field::TimestampMillis(ts) => ts.into_value_with(handle),
|
62
|
+
Field::TimestampMicros(ts) => ts.into_value_with(handle),
|
63
|
+
Field::ListInternal(list) => {
|
64
|
+
let elements = list.elements();
|
65
|
+
let ary = handle.ary_new_capa(elements.len());
|
66
|
+
elements
|
67
|
+
.iter()
|
68
|
+
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
69
|
+
.unwrap();
|
70
|
+
ary.into_value_with(handle)
|
71
|
+
}
|
72
|
+
Field::MapInternal(map) => {
|
73
|
+
let entries = map.entries();
|
74
|
+
let hash = handle.hash_new_capa(entries.len());
|
75
|
+
entries
|
76
|
+
.iter()
|
77
|
+
.try_for_each(|(k, v)| {
|
78
|
+
hash.aset(
|
79
|
+
ParquetField(k.clone()).into_value_with(handle),
|
80
|
+
ParquetField(v.clone()).into_value_with(handle),
|
81
|
+
)
|
82
|
+
})
|
83
|
+
.unwrap();
|
84
|
+
hash.into_value_with(handle)
|
85
|
+
}
|
86
|
+
Field::Null => handle.qnil().as_value(),
|
87
|
+
_ => panic!("Unsupported field type"),
|
88
|
+
}
|
89
|
+
}
|
90
|
+
}
|
data/ext/parquet/src/utils.rs
CHANGED
@@ -29,6 +29,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
|
|
29
29
|
pub struct ParquetArgs {
|
30
30
|
pub to_read: Value,
|
31
31
|
pub result_type: String,
|
32
|
+
pub columns: Option<Vec<String>>,
|
32
33
|
}
|
33
34
|
|
34
35
|
/// Parse common arguments for Parquet parsing
|
@@ -36,8 +37,11 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
|
|
36
37
|
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
37
38
|
let (to_read,) = parsed_args.required;
|
38
39
|
|
39
|
-
let kwargs =
|
40
|
-
|
40
|
+
let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
|
41
|
+
parsed_args.keywords,
|
42
|
+
&[],
|
43
|
+
&["result_type", "columns"],
|
44
|
+
)?;
|
41
45
|
|
42
46
|
let result_type = match kwargs
|
43
47
|
.optional
|
@@ -66,5 +70,6 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
|
|
66
70
|
Ok(ParquetArgs {
|
67
71
|
to_read,
|
68
72
|
result_type,
|
73
|
+
columns: kwargs.optional.1,
|
69
74
|
})
|
70
75
|
}
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
@@ -1,17 +1,19 @@
|
|
1
1
|
# typed: strict
|
2
|
-
|
3
2
|
module Parquet
|
4
3
|
# Options:
|
5
4
|
# - `input`: String or IO specifying the input to read
|
6
5
|
# - `result_type`: String specifying the output format
|
7
6
|
# ("hash" or "array" or :hash or :array)
|
7
|
+
# - `columns`: When present, only the specified columns will be included in the output.
|
8
|
+
# This is useful for reducing how much data is read and improving performance.
|
8
9
|
sig do
|
9
10
|
params(
|
10
11
|
input: T.any(String, IO),
|
11
12
|
result_type: T.nilable(T.any(String, Symbol)),
|
13
|
+
columns: T.nilable(T::Array[String]),
|
12
14
|
blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
|
13
15
|
).returns(T.any(Enumerator, T.untyped))
|
14
16
|
end
|
15
|
-
def self.each_row(input, result_type: nil, &blk)
|
17
|
+
def self.each_row(input, result_type: nil, columns: nil, &blk)
|
16
18
|
end
|
17
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: parquet
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Nathan Jaremko
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-01-
|
11
|
+
date: 2025-01-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -56,10 +56,14 @@ files:
|
|
56
56
|
- Rakefile
|
57
57
|
- ext/parquet/Cargo.toml
|
58
58
|
- ext/parquet/extconf.rb
|
59
|
+
- ext/parquet/src/allocator.rs
|
60
|
+
- ext/parquet/src/enumerator.rs
|
59
61
|
- ext/parquet/src/header_cache.rs
|
60
62
|
- ext/parquet/src/lib.rs
|
61
63
|
- ext/parquet/src/reader.rs
|
64
|
+
- ext/parquet/src/ruby_integration.rs
|
62
65
|
- ext/parquet/src/ruby_reader.rs
|
66
|
+
- ext/parquet/src/types.rs
|
63
67
|
- ext/parquet/src/utils.rs
|
64
68
|
- lib/parquet.rb
|
65
69
|
- lib/parquet.rbi
|