parquet 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +42 -7
- data/Gemfile +7 -2
- data/ext/parquet/Cargo.toml +7 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +28 -0
- data/ext/parquet/src/header_cache.rs +23 -1
- data/ext/parquet/src/lib.rs +7 -0
- data/ext/parquet/src/reader.rs +92 -239
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +90 -0
- data/ext/parquet/src/utils.rs +7 -2
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +4 -2
- metadata +6 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 70d9932bf622cd2647423e2519013d3a9f9256217effe9610e9aeaaebbcf1778
|
|
4
|
+
data.tar.gz: fae3767ce0d950c91b17f77b740159d863293e1288063ed15d9b9c1f82e87fe1
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: a03e75bcd377ce5a61cd5f17685995c420601ac5917bd3d4a99dc082686423729ee5f0913bb032fe826dd1a8bac9b52c152cfb2037a376751258c17f3b0e63b1
|
|
7
|
+
data.tar.gz: ddfbb0ee14a6b7dcce47caf41962afe9610ab175d2b829c2744d62bed67cc746e64d214f64318220f2301a9ce8dcdecf9f9f9e90786df3d18f244716724abef8
|
data/Cargo.lock
CHANGED
|
@@ -749,6 +749,26 @@ version = "1.0.14"
|
|
|
749
749
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
750
750
|
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
|
751
751
|
|
|
752
|
+
[[package]]
|
|
753
|
+
name = "jemalloc-sys"
|
|
754
|
+
version = "0.5.4+5.3.0-patched"
|
|
755
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
756
|
+
checksum = "ac6c1946e1cea1788cbfde01c993b52a10e2da07f4bac608228d1bed20bfebf2"
|
|
757
|
+
dependencies = [
|
|
758
|
+
"cc",
|
|
759
|
+
"libc",
|
|
760
|
+
]
|
|
761
|
+
|
|
762
|
+
[[package]]
|
|
763
|
+
name = "jemallocator"
|
|
764
|
+
version = "0.5.4"
|
|
765
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
766
|
+
checksum = "a0de374a9f8e63150e6f5e8a60cc14c668226d7a347d8aee1a45766e3c4dd3bc"
|
|
767
|
+
dependencies = [
|
|
768
|
+
"jemalloc-sys",
|
|
769
|
+
"libc",
|
|
770
|
+
]
|
|
771
|
+
|
|
752
772
|
[[package]]
|
|
753
773
|
name = "jobserver"
|
|
754
774
|
version = "0.1.32"
|
|
@@ -876,6 +896,16 @@ version = "0.2.11"
|
|
|
876
896
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
877
897
|
checksum = "8355be11b20d696c8f18f6cc018c4e372165b1fa8126cef092399c9951984ffa"
|
|
878
898
|
|
|
899
|
+
[[package]]
|
|
900
|
+
name = "libmimalloc-sys"
|
|
901
|
+
version = "0.1.39"
|
|
902
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
903
|
+
checksum = "23aa6811d3bd4deb8a84dde645f943476d13b248d818edcf8ce0b2f37f036b44"
|
|
904
|
+
dependencies = [
|
|
905
|
+
"cc",
|
|
906
|
+
"libc",
|
|
907
|
+
]
|
|
908
|
+
|
|
879
909
|
[[package]]
|
|
880
910
|
name = "litemap"
|
|
881
911
|
version = "0.7.4"
|
|
@@ -948,6 +978,15 @@ version = "2.7.4"
|
|
|
948
978
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
949
979
|
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
|
950
980
|
|
|
981
|
+
[[package]]
|
|
982
|
+
name = "mimalloc"
|
|
983
|
+
version = "0.1.43"
|
|
984
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
985
|
+
checksum = "68914350ae34959d83f732418d51e2427a794055d0b9529f48259ac07af65633"
|
|
986
|
+
dependencies = [
|
|
987
|
+
"libmimalloc-sys",
|
|
988
|
+
]
|
|
989
|
+
|
|
951
990
|
[[package]]
|
|
952
991
|
name = "minimal-lexical"
|
|
953
992
|
version = "0.2.1"
|
|
@@ -1119,15 +1158,17 @@ dependencies = [
|
|
|
1119
1158
|
name = "parquet"
|
|
1120
1159
|
version = "0.1.0"
|
|
1121
1160
|
dependencies = [
|
|
1161
|
+
"ahash",
|
|
1122
1162
|
"bytes",
|
|
1163
|
+
"jemallocator",
|
|
1123
1164
|
"kanal",
|
|
1124
1165
|
"magnus 0.7.1",
|
|
1166
|
+
"mimalloc",
|
|
1125
1167
|
"parquet 54.0.0",
|
|
1126
1168
|
"rb-sys",
|
|
1127
1169
|
"serde",
|
|
1128
1170
|
"serde_magnus",
|
|
1129
1171
|
"thiserror",
|
|
1130
|
-
"xxhash-rust",
|
|
1131
1172
|
]
|
|
1132
1173
|
|
|
1133
1174
|
[[package]]
|
|
@@ -1796,12 +1837,6 @@ version = "0.5.5"
|
|
|
1796
1837
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1797
1838
|
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
|
1798
1839
|
|
|
1799
|
-
[[package]]
|
|
1800
|
-
name = "xxhash-rust"
|
|
1801
|
-
version = "0.8.14"
|
|
1802
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
1803
|
-
checksum = "d7d48f1b18be023c95e7b75f481cac649d74be7c507ff4a407c55cfb957f7934"
|
|
1804
|
-
|
|
1805
1840
|
[[package]]
|
|
1806
1841
|
name = "yoke"
|
|
1807
1842
|
version = "0.7.5"
|
data/Gemfile
CHANGED
|
@@ -6,7 +6,12 @@ gem "rake"
|
|
|
6
6
|
# Use local version of parquet
|
|
7
7
|
gemspec
|
|
8
8
|
|
|
9
|
-
group :development
|
|
10
|
-
gem "minitest", "~> 5.0"
|
|
9
|
+
group :development do
|
|
11
10
|
gem "benchmark-ips", "~> 2.12"
|
|
11
|
+
# gem "polars-df"
|
|
12
|
+
# gem "duckdb"
|
|
13
|
+
end
|
|
14
|
+
|
|
15
|
+
group :test do
|
|
16
|
+
gem "minitest", "~> 5.0"
|
|
12
17
|
end
|
data/ext/parquet/Cargo.toml
CHANGED
|
@@ -7,6 +7,7 @@ edition = "2021"
|
|
|
7
7
|
crate-type = ["cdylib"]
|
|
8
8
|
|
|
9
9
|
[dependencies]
|
|
10
|
+
ahash = "0.8"
|
|
10
11
|
parquet = { version = "^54.0", features = ["json", "object_store"] }
|
|
11
12
|
bytes = "^1.9"
|
|
12
13
|
kanal = "0.1.0-pre8"
|
|
@@ -15,4 +16,9 @@ rb-sys = "^0.9"
|
|
|
15
16
|
serde = { version = "1.0", features = ["derive"] }
|
|
16
17
|
serde_magnus = "0.8.1"
|
|
17
18
|
thiserror = "2.0"
|
|
18
|
-
|
|
19
|
+
|
|
20
|
+
[target.'cfg(target_os = "linux")'.dependencies]
|
|
21
|
+
jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
|
|
22
|
+
|
|
23
|
+
[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
|
|
24
|
+
mimalloc = { version = "0.1", default-features = false }
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#[cfg(target_os = "linux")]
|
|
2
|
+
use jemallocator::Jemalloc;
|
|
3
|
+
|
|
4
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
|
5
|
+
use mimalloc::MiMalloc;
|
|
6
|
+
|
|
7
|
+
#[global_allocator]
|
|
8
|
+
#[cfg(target_os = "linux")]
|
|
9
|
+
static ALLOC: Jemalloc = Jemalloc;
|
|
10
|
+
|
|
11
|
+
#[global_allocator]
|
|
12
|
+
#[cfg(not(any(target_os = "linux", target_os = "windows")))]
|
|
13
|
+
static ALLOC: MiMalloc = MiMalloc;
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
use ahash::RandomState;
|
|
2
|
+
use magnus::{
|
|
3
|
+
block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
|
|
4
|
+
};
|
|
5
|
+
|
|
6
|
+
use crate::Record;
|
|
7
|
+
|
|
8
|
+
pub struct EnumeratorArgs {
|
|
9
|
+
pub rb_self: Value,
|
|
10
|
+
pub to_read: Value,
|
|
11
|
+
pub result_type: String,
|
|
12
|
+
pub columns: Option<Vec<String>>,
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
#[inline]
|
|
16
|
+
pub fn create_enumerator(
|
|
17
|
+
args: EnumeratorArgs,
|
|
18
|
+
) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
|
|
19
|
+
let kwargs = RHash::new();
|
|
20
|
+
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
|
21
|
+
if let Some(columns) = args.columns {
|
|
22
|
+
kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
|
|
23
|
+
}
|
|
24
|
+
let enumerator = args
|
|
25
|
+
.rb_self
|
|
26
|
+
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
|
|
27
|
+
Ok(Yield::Enumerator(enumerator))
|
|
28
|
+
}
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
|
7
7
|
use std::{
|
|
8
8
|
collections::HashMap,
|
|
9
|
-
sync::{atomic::AtomicU32, LazyLock, Mutex},
|
|
9
|
+
sync::{atomic::AtomicU32, LazyLock, Mutex, OnceLock},
|
|
10
10
|
};
|
|
11
11
|
use thiserror::Error;
|
|
12
12
|
|
|
@@ -79,3 +79,25 @@ impl StringCache {
|
|
|
79
79
|
Ok(())
|
|
80
80
|
}
|
|
81
81
|
}
|
|
82
|
+
|
|
83
|
+
pub struct HeaderCacheCleanupIter<I> {
|
|
84
|
+
pub inner: I,
|
|
85
|
+
pub headers: OnceLock<Vec<&'static str>>,
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
|
89
|
+
type Item = I::Item;
|
|
90
|
+
|
|
91
|
+
#[inline(always)]
|
|
92
|
+
fn next(&mut self) -> Option<Self::Item> {
|
|
93
|
+
self.inner.next()
|
|
94
|
+
}
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
|
98
|
+
fn drop(&mut self) {
|
|
99
|
+
if let Some(headers) = self.headers.get() {
|
|
100
|
+
StringCache::clear(&headers).unwrap();
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
}
|
data/ext/parquet/src/lib.rs
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
1
|
+
mod allocator;
|
|
2
|
+
mod enumerator;
|
|
1
3
|
pub mod header_cache;
|
|
2
4
|
mod reader;
|
|
5
|
+
mod ruby_integration;
|
|
3
6
|
mod ruby_reader;
|
|
7
|
+
mod types;
|
|
4
8
|
mod utils;
|
|
5
9
|
|
|
10
|
+
use crate::enumerator::*;
|
|
6
11
|
use crate::reader::*;
|
|
12
|
+
use crate::ruby_integration::*;
|
|
13
|
+
use crate::types::*;
|
|
7
14
|
|
|
8
15
|
use magnus::{Error, Ruby};
|
|
9
16
|
|
data/ext/parquet/src/reader.rs
CHANGED
|
@@ -1,106 +1,39 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
use
|
|
1
|
+
// =============================================================================
|
|
2
|
+
// Imports and Dependencies
|
|
3
|
+
// =============================================================================
|
|
4
|
+
use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
|
|
5
|
+
use crate::{
|
|
6
|
+
create_enumerator, utils::*, EnumeratorArgs, ForgottenFileHandle, ParquetField, Record,
|
|
7
|
+
SeekableRubyValue,
|
|
8
|
+
};
|
|
9
|
+
use ahash::RandomState;
|
|
5
10
|
use magnus::rb_sys::AsRawValue;
|
|
6
11
|
use magnus::value::{Opaque, ReprValue};
|
|
7
|
-
use magnus::
|
|
8
|
-
use
|
|
9
|
-
use parquet::
|
|
10
|
-
use parquet::
|
|
11
|
-
use parquet::
|
|
12
|
+
use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
|
|
13
|
+
use parquet::file::reader::FileReader;
|
|
14
|
+
use parquet::file::reader::SerializedFileReader;
|
|
15
|
+
use parquet::record::reader::RowIter as ParquetRowIter;
|
|
16
|
+
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
|
12
17
|
use std::collections::HashMap;
|
|
13
18
|
use std::fs::File;
|
|
14
|
-
use std::io::{self
|
|
19
|
+
use std::io::{self};
|
|
15
20
|
use std::mem::ManuallyDrop;
|
|
16
21
|
use std::os::fd::FromRawFd;
|
|
17
22
|
use std::sync::OnceLock;
|
|
18
|
-
use std::{borrow::Cow, hash::BuildHasher};
|
|
19
23
|
use thiserror::Error;
|
|
20
|
-
use xxhash_rust::xxh3::Xxh3Builder;
|
|
21
|
-
|
|
22
|
-
use parquet::record::reader::RowIter as ParquetRowIter;
|
|
23
|
-
|
|
24
|
-
#[derive(Error, Debug)]
|
|
25
|
-
pub enum ReaderError {
|
|
26
|
-
#[error("Failed to get file descriptor: {0}")]
|
|
27
|
-
FileDescriptor(String),
|
|
28
|
-
#[error("Invalid file descriptor")]
|
|
29
|
-
InvalidFileDescriptor,
|
|
30
|
-
#[error("Failed to open file: {0}")]
|
|
31
|
-
FileOpen(#[from] io::Error),
|
|
32
|
-
#[error("Failed to intern headers: {0}")]
|
|
33
|
-
HeaderIntern(#[from] CacheError),
|
|
34
|
-
#[error("Ruby error: {0}")]
|
|
35
|
-
Ruby(String),
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
impl From<MagnusError> for ReaderError {
|
|
39
|
-
fn from(err: MagnusError) -> Self {
|
|
40
|
-
Self::Ruby(err.to_string())
|
|
41
|
-
}
|
|
42
|
-
}
|
|
43
|
-
|
|
44
|
-
impl From<ReaderError> for MagnusError {
|
|
45
|
-
fn from(err: ReaderError) -> Self {
|
|
46
|
-
MagnusError::new(
|
|
47
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
|
48
|
-
err.to_string(),
|
|
49
|
-
)
|
|
50
|
-
}
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
struct ForgottenFileHandle(ManuallyDrop<File>);
|
|
54
|
-
|
|
55
|
-
impl Length for ForgottenFileHandle {
|
|
56
|
-
fn len(&self) -> u64 {
|
|
57
|
-
self.0.len()
|
|
58
|
-
}
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
impl ChunkReader for ForgottenFileHandle {
|
|
62
|
-
type T = BufReader<File>;
|
|
63
|
-
|
|
64
|
-
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
|
65
|
-
self.0.get_read(start)
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
|
69
|
-
self.0.get_bytes(start, length)
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
struct HeaderCacheCleanupIter<I> {
|
|
74
|
-
inner: I,
|
|
75
|
-
headers: OnceLock<Vec<&'static str>>,
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
|
|
79
|
-
type Item = I::Item;
|
|
80
|
-
|
|
81
|
-
fn next(&mut self) -> Option<Self::Item> {
|
|
82
|
-
self.inner.next()
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
impl<I> Drop for HeaderCacheCleanupIter<I> {
|
|
87
|
-
fn drop(&mut self) {
|
|
88
|
-
if let Some(headers) = self.headers.get() {
|
|
89
|
-
StringCache::clear(&headers).unwrap();
|
|
90
|
-
}
|
|
91
|
-
}
|
|
92
|
-
}
|
|
93
24
|
|
|
25
|
+
#[inline]
|
|
94
26
|
pub fn parse_parquet<'a>(
|
|
95
27
|
rb_self: Value,
|
|
96
28
|
args: &[Value],
|
|
97
|
-
) -> Result<Yield<Box<dyn Iterator<Item = Record<
|
|
29
|
+
) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
|
|
98
30
|
let original = unsafe { Ruby::get_unchecked() };
|
|
99
31
|
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
|
100
32
|
|
|
101
33
|
let ParquetArgs {
|
|
102
34
|
to_read,
|
|
103
35
|
result_type,
|
|
36
|
+
columns,
|
|
104
37
|
} = parse_parquet_args(&ruby, args)?;
|
|
105
38
|
|
|
106
39
|
if !ruby.block_given() {
|
|
@@ -108,15 +41,18 @@ pub fn parse_parquet<'a>(
|
|
|
108
41
|
rb_self,
|
|
109
42
|
to_read,
|
|
110
43
|
result_type,
|
|
44
|
+
columns,
|
|
111
45
|
});
|
|
112
46
|
}
|
|
113
47
|
|
|
114
|
-
let iter = if to_read.is_kind_of(ruby.class_string()) {
|
|
48
|
+
let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
|
|
115
49
|
let path_string = to_read.to_r_string()?;
|
|
116
50
|
let file_path = unsafe { path_string.as_str()? };
|
|
117
51
|
let file = File::open(file_path).unwrap();
|
|
118
52
|
let reader = SerializedFileReader::new(file).unwrap();
|
|
119
|
-
|
|
53
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
|
54
|
+
|
|
55
|
+
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
|
120
56
|
} else if to_read.is_kind_of(ruby.class_io()) {
|
|
121
57
|
let raw_value = to_read.as_raw();
|
|
122
58
|
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
|
@@ -131,14 +67,28 @@ pub fn parse_parquet<'a>(
|
|
|
131
67
|
let file = unsafe { File::from_raw_fd(fd) };
|
|
132
68
|
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
|
133
69
|
let reader = SerializedFileReader::new(file).unwrap();
|
|
134
|
-
|
|
70
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
|
71
|
+
|
|
72
|
+
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
|
135
73
|
} else {
|
|
136
74
|
let readable = SeekableRubyValue(Opaque::from(to_read));
|
|
137
75
|
let reader = SerializedFileReader::new(readable).unwrap();
|
|
138
|
-
|
|
76
|
+
let schema = reader.metadata().file_metadata().schema().clone();
|
|
77
|
+
|
|
78
|
+
(schema, ParquetRowIter::from_file_into(Box::new(reader)))
|
|
139
79
|
};
|
|
140
80
|
|
|
141
|
-
let
|
|
81
|
+
if let Some(cols) = columns {
|
|
82
|
+
let projection = create_projection_schema(&schema, &cols);
|
|
83
|
+
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
|
84
|
+
MagnusError::new(
|
|
85
|
+
ruby.exception_runtime_error(),
|
|
86
|
+
format!("Failed to create projection: {}", e),
|
|
87
|
+
)
|
|
88
|
+
})?;
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
let iter: Box<dyn Iterator<Item = Record<RandomState>>> = match result_type.as_str() {
|
|
142
92
|
"hash" => {
|
|
143
93
|
let headers = OnceLock::new();
|
|
144
94
|
let headers_clone = headers.clone();
|
|
@@ -146,21 +96,23 @@ pub fn parse_parquet<'a>(
|
|
|
146
96
|
.filter_map(move |row| {
|
|
147
97
|
row.ok().map(|row| {
|
|
148
98
|
let headers = headers_clone.get_or_init(|| {
|
|
149
|
-
row.get_column_iter()
|
|
150
|
-
|
|
151
|
-
|
|
99
|
+
let column_count = row.get_column_iter().count();
|
|
100
|
+
let mut headers = Vec::with_capacity(column_count);
|
|
101
|
+
row.get_column_iter().for_each(|(k, _)| {
|
|
102
|
+
headers.push(StringCache::intern(k.to_owned()).unwrap())
|
|
103
|
+
});
|
|
104
|
+
headers
|
|
152
105
|
});
|
|
153
106
|
|
|
154
|
-
|
|
155
|
-
.
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
.collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
|
|
107
|
+
let mut map =
|
|
108
|
+
HashMap::with_capacity_and_hasher(headers.len(), Default::default());
|
|
109
|
+
row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
|
|
110
|
+
map.insert(headers[i], ParquetField(v.clone()));
|
|
111
|
+
});
|
|
112
|
+
map
|
|
161
113
|
})
|
|
162
114
|
})
|
|
163
|
-
.map(
|
|
115
|
+
.map(Record::Map);
|
|
164
116
|
|
|
165
117
|
Box::new(HeaderCacheCleanupIter {
|
|
166
118
|
inner: iter,
|
|
@@ -170,12 +122,14 @@ pub fn parse_parquet<'a>(
|
|
|
170
122
|
"array" => Box::new(
|
|
171
123
|
iter.filter_map(|row| {
|
|
172
124
|
row.ok().map(|row| {
|
|
125
|
+
let column_count = row.get_column_iter().count();
|
|
126
|
+
let mut vec = Vec::with_capacity(column_count);
|
|
173
127
|
row.get_column_iter()
|
|
174
|
-
.
|
|
175
|
-
|
|
128
|
+
.for_each(|(_, v)| vec.push(ParquetField(v.clone())));
|
|
129
|
+
vec
|
|
176
130
|
})
|
|
177
131
|
})
|
|
178
|
-
.map(
|
|
132
|
+
.map(Record::Vec),
|
|
179
133
|
),
|
|
180
134
|
_ => {
|
|
181
135
|
return Err(MagnusError::new(
|
|
@@ -188,150 +142,49 @@ pub fn parse_parquet<'a>(
|
|
|
188
142
|
Ok(Yield::Iter(iter))
|
|
189
143
|
}
|
|
190
144
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
args: EnumeratorArgs,
|
|
199
|
-
) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
|
|
200
|
-
let kwargs = RHash::new();
|
|
201
|
-
|
|
202
|
-
kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
|
|
203
|
-
|
|
204
|
-
let enumerator = args
|
|
205
|
-
.rb_self
|
|
206
|
-
.enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
|
|
207
|
-
Ok(Yield::Enumerator(enumerator))
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
#[derive(Debug)]
|
|
211
|
-
pub enum Record<S: BuildHasher + Default> {
|
|
212
|
-
Vec(Vec<ParquetField>),
|
|
213
|
-
Map(HashMap<&'static str, ParquetField, S>),
|
|
214
|
-
}
|
|
145
|
+
fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
|
|
146
|
+
if let SchemaType::GroupType { fields, .. } = schema {
|
|
147
|
+
let projected_fields: Vec<TypePtr> = fields
|
|
148
|
+
.iter()
|
|
149
|
+
.filter(|field| columns.contains(&field.name().to_string()))
|
|
150
|
+
.cloned()
|
|
151
|
+
.collect();
|
|
215
152
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
match self {
|
|
220
|
-
Record::Vec(vec) => {
|
|
221
|
-
let ary = handle.ary_new_capa(vec.len());
|
|
222
|
-
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
|
223
|
-
ary.into_value_with(handle)
|
|
224
|
-
}
|
|
225
|
-
Record::Map(map) => {
|
|
226
|
-
// Pre-allocate the hash with the known size
|
|
227
|
-
let hash = handle.hash_new_capa(map.len());
|
|
228
|
-
map.into_iter()
|
|
229
|
-
.try_for_each(|(k, v)| hash.aset(k, v))
|
|
230
|
-
.unwrap();
|
|
231
|
-
hash.into_value_with(handle)
|
|
232
|
-
}
|
|
153
|
+
SchemaType::GroupType {
|
|
154
|
+
basic_info: schema.get_basic_info().clone(),
|
|
155
|
+
fields: projected_fields,
|
|
233
156
|
}
|
|
157
|
+
} else {
|
|
158
|
+
// Return original schema if not a group type
|
|
159
|
+
schema.clone()
|
|
234
160
|
}
|
|
235
161
|
}
|
|
236
162
|
|
|
237
|
-
#[derive(
|
|
238
|
-
pub
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
#[
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
impl<'a> IntoValue for ParquetField {
|
|
250
|
-
fn into_value_with(self, handle: &Ruby) -> Value {
|
|
251
|
-
match self.0 {
|
|
252
|
-
Field::Byte(b) => b.into_value_with(handle),
|
|
253
|
-
Field::Bool(b) => b.into_value_with(handle),
|
|
254
|
-
Field::Short(s) => s.into_value_with(handle),
|
|
255
|
-
Field::Int(i) => i.into_value_with(handle),
|
|
256
|
-
Field::Long(l) => l.into_value_with(handle),
|
|
257
|
-
Field::UByte(ub) => ub.into_value_with(handle),
|
|
258
|
-
Field::UShort(us) => us.into_value_with(handle),
|
|
259
|
-
Field::UInt(ui) => ui.into_value_with(handle),
|
|
260
|
-
Field::ULong(ul) => ul.into_value_with(handle),
|
|
261
|
-
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
|
262
|
-
Field::Float(f) => f.into_value_with(handle),
|
|
263
|
-
Field::Double(d) => d.into_value_with(handle),
|
|
264
|
-
|
|
265
|
-
Field::Str(s) => s.into_value_with(handle),
|
|
266
|
-
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
|
267
|
-
Field::Date(d) => d.into_value_with(handle),
|
|
268
|
-
Field::TimestampMillis(ts) => ts.into_value_with(handle),
|
|
269
|
-
Field::TimestampMicros(ts) => ts.into_value_with(handle),
|
|
270
|
-
Field::ListInternal(list) => {
|
|
271
|
-
let ary = handle.ary_new_capa(list.elements().len());
|
|
272
|
-
list.elements()
|
|
273
|
-
.iter()
|
|
274
|
-
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
|
275
|
-
.unwrap();
|
|
276
|
-
ary.into_value_with(handle)
|
|
277
|
-
}
|
|
278
|
-
Field::MapInternal(map) => {
|
|
279
|
-
let hash = handle.hash_new_capa(map.entries().len());
|
|
280
|
-
map.entries()
|
|
281
|
-
.iter()
|
|
282
|
-
.try_for_each(|(k, v)| {
|
|
283
|
-
hash.aset(
|
|
284
|
-
ParquetField(k.clone()).into_value_with(handle),
|
|
285
|
-
ParquetField(v.clone()).into_value_with(handle),
|
|
286
|
-
)
|
|
287
|
-
})
|
|
288
|
-
.unwrap();
|
|
289
|
-
hash.into_value_with(handle)
|
|
290
|
-
}
|
|
291
|
-
// Field::Decimal(d) => d.to_string().into_value_with(handle),
|
|
292
|
-
// Field::Group(row) => row.into_value_with(handle),
|
|
293
|
-
Field::Null => handle.qnil().as_value(),
|
|
294
|
-
_ => panic!("Unsupported field type"),
|
|
295
|
-
}
|
|
296
|
-
}
|
|
163
|
+
#[derive(Error, Debug)]
|
|
164
|
+
pub enum ReaderError {
|
|
165
|
+
#[error("Failed to get file descriptor: {0}")]
|
|
166
|
+
FileDescriptor(String),
|
|
167
|
+
#[error("Invalid file descriptor")]
|
|
168
|
+
InvalidFileDescriptor,
|
|
169
|
+
#[error("Failed to open file: {0}")]
|
|
170
|
+
FileOpen(#[from] io::Error),
|
|
171
|
+
#[error("Failed to intern headers: {0}")]
|
|
172
|
+
HeaderIntern(#[from] CacheError),
|
|
173
|
+
#[error("Ruby error: {0}")]
|
|
174
|
+
Ruby(String),
|
|
297
175
|
}
|
|
298
176
|
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
fn len(&self) -> u64 {
|
|
303
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
|
304
|
-
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
|
305
|
-
let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
|
|
306
|
-
let file_len = reader.seek(SeekFrom::End(0)).unwrap();
|
|
307
|
-
reader.seek(SeekFrom::Start(current_pos)).unwrap();
|
|
308
|
-
file_len
|
|
177
|
+
impl From<MagnusError> for ReaderError {
|
|
178
|
+
fn from(err: MagnusError) -> Self {
|
|
179
|
+
Self::Ruby(err.to_string())
|
|
309
180
|
}
|
|
310
181
|
}
|
|
311
182
|
|
|
312
|
-
impl
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
reader.seek(SeekFrom::Start(start))?;
|
|
319
|
-
Ok(BufReader::new(reader))
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
|
323
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
|
324
|
-
let mut buffer = Vec::with_capacity(length);
|
|
325
|
-
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
|
326
|
-
reader.seek(SeekFrom::Start(start))?;
|
|
327
|
-
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
|
328
|
-
|
|
329
|
-
if read != length {
|
|
330
|
-
return Err(ParquetError::EOF(format!(
|
|
331
|
-
"Expected to read {} bytes, read only {}",
|
|
332
|
-
length, read
|
|
333
|
-
)));
|
|
334
|
-
}
|
|
335
|
-
Ok(buffer.into())
|
|
183
|
+
impl From<ReaderError> for MagnusError {
|
|
184
|
+
fn from(err: ReaderError) -> Self {
|
|
185
|
+
MagnusError::new(
|
|
186
|
+
Ruby::get().unwrap().exception_runtime_error(),
|
|
187
|
+
err.to_string(),
|
|
188
|
+
)
|
|
336
189
|
}
|
|
337
190
|
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
use std::{
|
|
2
|
+
fs::File,
|
|
3
|
+
io::{BufReader, SeekFrom},
|
|
4
|
+
mem::ManuallyDrop,
|
|
5
|
+
};
|
|
6
|
+
|
|
7
|
+
use bytes::Bytes;
|
|
8
|
+
use magnus::{value::Opaque, Ruby, Value};
|
|
9
|
+
use parquet::{
|
|
10
|
+
errors::ParquetError,
|
|
11
|
+
file::reader::{ChunkReader, Length},
|
|
12
|
+
};
|
|
13
|
+
use std::io::Read;
|
|
14
|
+
|
|
15
|
+
use crate::ruby_reader::{build_ruby_reader, SeekableRead};
|
|
16
|
+
|
|
17
|
+
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
|
18
|
+
|
|
19
|
+
pub struct SeekableRubyValue(pub Opaque<Value>);
|
|
20
|
+
|
|
21
|
+
impl Length for SeekableRubyValue {
|
|
22
|
+
fn len(&self) -> u64 {
|
|
23
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
|
24
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
|
25
|
+
let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
|
|
26
|
+
let file_len = reader.seek(SeekFrom::End(0)).unwrap();
|
|
27
|
+
reader.seek(SeekFrom::Start(current_pos)).unwrap();
|
|
28
|
+
file_len
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
impl ChunkReader for SeekableRubyValue {
|
|
33
|
+
type T = BufReader<Box<dyn SeekableRead>>;
|
|
34
|
+
|
|
35
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
|
36
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
|
37
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
|
38
|
+
reader.seek(SeekFrom::Start(start))?;
|
|
39
|
+
Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
|
43
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
|
44
|
+
let mut buffer = Vec::with_capacity(length);
|
|
45
|
+
let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
|
|
46
|
+
reader.seek(SeekFrom::Start(start))?;
|
|
47
|
+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
|
48
|
+
|
|
49
|
+
if read != length {
|
|
50
|
+
return Err(ParquetError::EOF(format!(
|
|
51
|
+
"Expected to read {} bytes, read only {}",
|
|
52
|
+
length, read
|
|
53
|
+
)));
|
|
54
|
+
}
|
|
55
|
+
Ok(buffer.into())
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
pub struct ForgottenFileHandle(pub ManuallyDrop<File>);
|
|
60
|
+
|
|
61
|
+
impl Length for ForgottenFileHandle {
|
|
62
|
+
fn len(&self) -> u64 {
|
|
63
|
+
self.0.len()
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
impl ChunkReader for ForgottenFileHandle {
|
|
68
|
+
type T = BufReader<File>;
|
|
69
|
+
|
|
70
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
|
71
|
+
self.0.get_read(start)
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
|
75
|
+
self.0.get_bytes(start, length)
|
|
76
|
+
}
|
|
77
|
+
}
|
|
@@ -2,30 +2,24 @@ use magnus::{
|
|
|
2
2
|
value::{Opaque, ReprValue},
|
|
3
3
|
RClass, RString, Ruby, Value,
|
|
4
4
|
};
|
|
5
|
-
use std::io::{self, Read, Seek};
|
|
5
|
+
use std::io::{self, Read, Seek, SeekFrom, Write};
|
|
6
6
|
use std::sync::OnceLock;
|
|
7
7
|
|
|
8
8
|
static STRING_IO_CLASS: OnceLock<Opaque<RClass>> = OnceLock::new();
|
|
9
9
|
|
|
10
|
-
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
|
11
|
-
|
|
12
10
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
|
13
11
|
/// and provide a standard Read implementation for them.
|
|
14
12
|
pub struct RubyReader<T> {
|
|
15
13
|
inner: T,
|
|
16
|
-
buffer: Option<Vec<u8>>,
|
|
17
14
|
offset: usize,
|
|
18
|
-
// Number of bytes that have been read into the buffer
|
|
19
|
-
// Used as an upper bound for offset
|
|
20
|
-
buffered_bytes: usize,
|
|
21
15
|
}
|
|
22
16
|
|
|
23
17
|
pub trait SeekableRead: std::io::Read + Seek {}
|
|
24
18
|
impl SeekableRead for RubyReader<Value> {}
|
|
25
19
|
impl SeekableRead for RubyReader<RString> {}
|
|
26
20
|
|
|
27
|
-
pub fn build_ruby_reader
|
|
28
|
-
ruby: &
|
|
21
|
+
pub fn build_ruby_reader(
|
|
22
|
+
ruby: &Ruby,
|
|
29
23
|
input: Value,
|
|
30
24
|
) -> Result<Box<dyn SeekableRead>, magnus::Error> {
|
|
31
25
|
if RubyReader::is_string_io(ruby, &input) {
|
|
@@ -39,49 +33,48 @@ pub fn build_ruby_reader<'a>(
|
|
|
39
33
|
|
|
40
34
|
impl Seek for RubyReader<Value> {
|
|
41
35
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
|
42
|
-
let
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
}
|
|
47
|
-
io::SeekFrom::End(offset) => {
|
|
48
|
-
// SEEK_END - from end of stream
|
|
49
|
-
offset
|
|
50
|
-
}
|
|
51
|
-
io::SeekFrom::Current(offset) => {
|
|
52
|
-
// SEEK_CUR - relative to current
|
|
53
|
-
offset
|
|
54
|
-
}
|
|
36
|
+
let (whence, offset) = match pos {
|
|
37
|
+
SeekFrom::Start(i) => (0, i as i64),
|
|
38
|
+
SeekFrom::Current(i) => (1, i),
|
|
39
|
+
SeekFrom::End(i) => (2, i),
|
|
55
40
|
};
|
|
56
41
|
|
|
57
|
-
let
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
io::
|
|
61
|
-
|
|
42
|
+
let new_position = self
|
|
43
|
+
.inner
|
|
44
|
+
.funcall("seek", (offset, whence))
|
|
45
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
46
|
+
|
|
47
|
+
Ok(new_position)
|
|
48
|
+
}
|
|
49
|
+
}
|
|
62
50
|
|
|
63
|
-
|
|
64
|
-
|
|
51
|
+
impl Write for RubyReader<Value> {
|
|
52
|
+
fn write(&mut self, buf: &[u8]) -> Result<usize, io::Error> {
|
|
53
|
+
let ruby_bytes = RString::from_slice(buf);
|
|
65
54
|
|
|
66
|
-
|
|
67
|
-
|
|
55
|
+
let bytes_written = self
|
|
56
|
+
.inner
|
|
57
|
+
.funcall::<_, _, usize>("write", (ruby_bytes,))
|
|
58
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
68
59
|
|
|
69
|
-
Ok(
|
|
60
|
+
Ok(bytes_written)
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
fn flush(&mut self) -> Result<(), io::Error> {
|
|
64
|
+
self.inner
|
|
65
|
+
.funcall::<_, _, Value>("flush", ())
|
|
66
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
67
|
+
|
|
68
|
+
Ok(())
|
|
70
69
|
}
|
|
71
70
|
}
|
|
72
71
|
|
|
73
72
|
impl Seek for RubyReader<RString> {
|
|
74
73
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
|
75
74
|
match pos {
|
|
76
|
-
io::SeekFrom::Start(offset) =>
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
io::SeekFrom::End(offset) => {
|
|
80
|
-
self.offset = (self.inner.len() - offset as usize) as usize;
|
|
81
|
-
}
|
|
82
|
-
io::SeekFrom::Current(offset) => {
|
|
83
|
-
self.offset = (self.offset as i64 + offset) as usize;
|
|
84
|
-
}
|
|
75
|
+
io::SeekFrom::Start(offset) => self.offset = offset as usize,
|
|
76
|
+
io::SeekFrom::Current(offset) => self.offset = (self.offset as i64 + offset) as usize,
|
|
77
|
+
io::SeekFrom::End(offset) => self.offset = self.inner.len() - offset as usize,
|
|
85
78
|
}
|
|
86
79
|
Ok(self.offset as u64)
|
|
87
80
|
}
|
|
@@ -106,59 +99,9 @@ impl RubyReader<Value> {
|
|
|
106
99
|
fn from_io_like(input: Value) -> Self {
|
|
107
100
|
Self {
|
|
108
101
|
inner: input,
|
|
109
|
-
buffer: Some(vec![0; READ_BUFFER_SIZE]),
|
|
110
102
|
offset: 0,
|
|
111
|
-
buffered_bytes: 0,
|
|
112
103
|
}
|
|
113
104
|
}
|
|
114
|
-
|
|
115
|
-
fn read_from_buffer(&mut self, to_buf: &mut [u8]) -> Option<io::Result<usize>> {
|
|
116
|
-
if let Some(from_buf) = &self.buffer {
|
|
117
|
-
// If the offset is within the buffered bytes, copy the remaining bytes to the output buffer
|
|
118
|
-
if self.offset < self.buffered_bytes {
|
|
119
|
-
let remaining = self.buffered_bytes - self.offset;
|
|
120
|
-
let copy_size = remaining.min(to_buf.len());
|
|
121
|
-
to_buf[..copy_size]
|
|
122
|
-
.copy_from_slice(&from_buf[self.offset..self.offset + copy_size]);
|
|
123
|
-
self.offset += copy_size;
|
|
124
|
-
Some(Ok(copy_size))
|
|
125
|
-
} else {
|
|
126
|
-
None
|
|
127
|
-
}
|
|
128
|
-
} else {
|
|
129
|
-
None
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
fn read_from_ruby(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
|
134
|
-
let buffer = self.buffer.as_mut().unwrap();
|
|
135
|
-
let result = self
|
|
136
|
-
.inner
|
|
137
|
-
.funcall::<_, _, RString>("read", (buffer.capacity(),))
|
|
138
|
-
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
139
|
-
|
|
140
|
-
if result.is_nil() {
|
|
141
|
-
return Ok(0); // EOF
|
|
142
|
-
}
|
|
143
|
-
|
|
144
|
-
let bytes = unsafe { result.as_slice() };
|
|
145
|
-
|
|
146
|
-
// Update internal buffer
|
|
147
|
-
let bytes_len = bytes.len();
|
|
148
|
-
if bytes_len == 0 {
|
|
149
|
-
return Ok(0);
|
|
150
|
-
}
|
|
151
|
-
|
|
152
|
-
// Only copy what we actually read
|
|
153
|
-
buffer[..bytes_len].copy_from_slice(bytes);
|
|
154
|
-
self.buffered_bytes = bytes_len;
|
|
155
|
-
|
|
156
|
-
// Copy to output buffer
|
|
157
|
-
let copy_size = bytes_len.min(buf.len());
|
|
158
|
-
buf[..copy_size].copy_from_slice(&buffer[..copy_size]);
|
|
159
|
-
self.offset = copy_size;
|
|
160
|
-
Ok(copy_size)
|
|
161
|
-
}
|
|
162
105
|
}
|
|
163
106
|
|
|
164
107
|
impl RubyReader<RString> {
|
|
@@ -176,9 +119,7 @@ impl RubyReader<RString> {
|
|
|
176
119
|
let string_content = input.funcall::<_, _, RString>("string", ()).unwrap();
|
|
177
120
|
Ok(Box::new(Self {
|
|
178
121
|
inner: string_content,
|
|
179
|
-
buffer: None,
|
|
180
122
|
offset: 0,
|
|
181
|
-
buffered_bytes: 0,
|
|
182
123
|
}))
|
|
183
124
|
}
|
|
184
125
|
|
|
@@ -197,21 +138,21 @@ impl RubyReader<RString> {
|
|
|
197
138
|
.or_else(|_| input.funcall::<_, _, RString>("to_s", ()))?;
|
|
198
139
|
Ok(Box::new(Self {
|
|
199
140
|
inner: string_content,
|
|
200
|
-
buffer: None,
|
|
201
141
|
offset: 0,
|
|
202
|
-
buffered_bytes: 0,
|
|
203
142
|
}))
|
|
204
143
|
}
|
|
205
144
|
}
|
|
206
145
|
|
|
207
146
|
impl Read for RubyReader<Value> {
|
|
208
|
-
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
}
|
|
147
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
|
148
|
+
let bytes = self
|
|
149
|
+
.inner
|
|
150
|
+
.funcall::<_, _, RString>("read", (buf.len(),))
|
|
151
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
152
|
+
|
|
153
|
+
buf.write_all(unsafe { bytes.as_slice() })?;
|
|
154
|
+
|
|
155
|
+
Ok(bytes.len())
|
|
215
156
|
}
|
|
216
157
|
}
|
|
217
158
|
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
use std::{borrow::Cow, collections::HashMap, hash::BuildHasher};
|
|
2
|
+
|
|
3
|
+
use magnus::{value::ReprValue, IntoValue, Ruby, Value};
|
|
4
|
+
use parquet::record::Field;
|
|
5
|
+
|
|
6
|
+
#[derive(Debug)]
|
|
7
|
+
pub enum Record<S: BuildHasher + Default> {
|
|
8
|
+
Vec(Vec<ParquetField>),
|
|
9
|
+
Map(HashMap<&'static str, ParquetField, S>),
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
impl<S: BuildHasher + Default> IntoValue for Record<S> {
|
|
13
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
|
14
|
+
match self {
|
|
15
|
+
Record::Vec(vec) => {
|
|
16
|
+
let ary = handle.ary_new_capa(vec.len());
|
|
17
|
+
vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
|
|
18
|
+
ary.into_value_with(handle)
|
|
19
|
+
}
|
|
20
|
+
Record::Map(map) => {
|
|
21
|
+
let hash = handle.hash_new_capa(map.len());
|
|
22
|
+
map.into_iter()
|
|
23
|
+
.try_for_each(|(k, v)| hash.aset(k, v))
|
|
24
|
+
.unwrap();
|
|
25
|
+
hash.into_value_with(handle)
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/// Newtype around `Cow<str>` so that borrowed-or-owned text can be given its
/// own `IntoValue` implementation.
#[derive(Debug, Clone)]
pub struct CowValue<'a>(
    /// The wrapped text.
    pub Cow<'a, str>,
);
|
|
33
|
+
|
|
34
|
+
impl<'a> IntoValue for CowValue<'a> {
|
|
35
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
|
36
|
+
self.0.into_value_with(handle)
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
#[derive(Debug)]
|
|
41
|
+
pub struct ParquetField(pub Field);
|
|
42
|
+
|
|
43
|
+
impl IntoValue for ParquetField {
|
|
44
|
+
fn into_value_with(self, handle: &Ruby) -> Value {
|
|
45
|
+
match self.0 {
|
|
46
|
+
Field::Byte(b) => b.into_value_with(handle),
|
|
47
|
+
Field::Bool(b) => b.into_value_with(handle),
|
|
48
|
+
Field::Short(s) => s.into_value_with(handle),
|
|
49
|
+
Field::Int(i) => i.into_value_with(handle),
|
|
50
|
+
Field::Long(l) => l.into_value_with(handle),
|
|
51
|
+
Field::UByte(ub) => ub.into_value_with(handle),
|
|
52
|
+
Field::UShort(us) => us.into_value_with(handle),
|
|
53
|
+
Field::UInt(ui) => ui.into_value_with(handle),
|
|
54
|
+
Field::ULong(ul) => ul.into_value_with(handle),
|
|
55
|
+
Field::Float16(f) => f32::from(f).into_value_with(handle),
|
|
56
|
+
Field::Float(f) => f.into_value_with(handle),
|
|
57
|
+
Field::Double(d) => d.into_value_with(handle),
|
|
58
|
+
Field::Str(s) => s.into_value_with(handle),
|
|
59
|
+
Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
|
|
60
|
+
Field::Date(d) => d.into_value_with(handle),
|
|
61
|
+
Field::TimestampMillis(ts) => ts.into_value_with(handle),
|
|
62
|
+
Field::TimestampMicros(ts) => ts.into_value_with(handle),
|
|
63
|
+
Field::ListInternal(list) => {
|
|
64
|
+
let elements = list.elements();
|
|
65
|
+
let ary = handle.ary_new_capa(elements.len());
|
|
66
|
+
elements
|
|
67
|
+
.iter()
|
|
68
|
+
.try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
|
|
69
|
+
.unwrap();
|
|
70
|
+
ary.into_value_with(handle)
|
|
71
|
+
}
|
|
72
|
+
Field::MapInternal(map) => {
|
|
73
|
+
let entries = map.entries();
|
|
74
|
+
let hash = handle.hash_new_capa(entries.len());
|
|
75
|
+
entries
|
|
76
|
+
.iter()
|
|
77
|
+
.try_for_each(|(k, v)| {
|
|
78
|
+
hash.aset(
|
|
79
|
+
ParquetField(k.clone()).into_value_with(handle),
|
|
80
|
+
ParquetField(v.clone()).into_value_with(handle),
|
|
81
|
+
)
|
|
82
|
+
})
|
|
83
|
+
.unwrap();
|
|
84
|
+
hash.into_value_with(handle)
|
|
85
|
+
}
|
|
86
|
+
Field::Null => handle.qnil().as_value(),
|
|
87
|
+
_ => panic!("Unsupported field type"),
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
}
|
data/ext/parquet/src/utils.rs
CHANGED
|
@@ -29,6 +29,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
|
|
|
29
29
|
pub struct ParquetArgs {
|
|
30
30
|
pub to_read: Value,
|
|
31
31
|
pub result_type: String,
|
|
32
|
+
pub columns: Option<Vec<String>>,
|
|
32
33
|
}
|
|
33
34
|
|
|
34
35
|
/// Parse common arguments for Parquet parsing
|
|
@@ -36,8 +37,11 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
|
|
|
36
37
|
let parsed_args = scan_args::<(Value,), (), (), (), _, ()>(args)?;
|
|
37
38
|
let (to_read,) = parsed_args.required;
|
|
38
39
|
|
|
39
|
-
let kwargs =
|
|
40
|
-
|
|
40
|
+
let kwargs = get_kwargs::<_, (), (Option<Value>, Option<Vec<String>>), ()>(
|
|
41
|
+
parsed_args.keywords,
|
|
42
|
+
&[],
|
|
43
|
+
&["result_type", "columns"],
|
|
44
|
+
)?;
|
|
41
45
|
|
|
42
46
|
let result_type = match kwargs
|
|
43
47
|
.optional
|
|
@@ -66,5 +70,6 @@ pub fn parse_parquet_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Er
|
|
|
66
70
|
Ok(ParquetArgs {
|
|
67
71
|
to_read,
|
|
68
72
|
result_type,
|
|
73
|
+
columns: kwargs.optional.1,
|
|
69
74
|
})
|
|
70
75
|
}
|
data/lib/parquet/version.rb
CHANGED
data/lib/parquet.rbi
CHANGED
|
@@ -1,17 +1,19 @@
|
|
|
1
1
|
# typed: strict
|
|
2
|
-
|
|
3
2
|
module Parquet
  # Iterates over the rows of a Parquet input.
  #
  # Options:
  #   - `input`: String specifying the input file, or an IO to read from
  #   - `result_type`: String specifying the output format
  #                    ("hash" or "array" or :hash or :array)
  #   - `columns`: When present, only the specified columns will be included in the output.
  #                This is useful for reducing how much data is read and improving performance.
  sig do
    params(
      input: T.any(String, IO),
      result_type: T.nilable(T.any(String, Symbol)),
      columns: T.nilable(T::Array[String]),
      blk: T.nilable(T.proc.params(row: T.any(T::Hash[String, T.untyped], T::Array[T.untyped])).void)
    ).returns(T.any(Enumerator, T.untyped))
  end
  def self.each_row(input, result_type: nil, columns: nil, &blk); end
end
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: parquet
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.1
|
|
4
|
+
version: 0.0.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Nathan Jaremko
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2025-01-
|
|
11
|
+
date: 2025-01-03 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -56,10 +56,14 @@ files:
|
|
|
56
56
|
- Rakefile
|
|
57
57
|
- ext/parquet/Cargo.toml
|
|
58
58
|
- ext/parquet/extconf.rb
|
|
59
|
+
- ext/parquet/src/allocator.rs
|
|
60
|
+
- ext/parquet/src/enumerator.rs
|
|
59
61
|
- ext/parquet/src/header_cache.rs
|
|
60
62
|
- ext/parquet/src/lib.rs
|
|
61
63
|
- ext/parquet/src/reader.rs
|
|
64
|
+
- ext/parquet/src/ruby_integration.rs
|
|
62
65
|
- ext/parquet/src/ruby_reader.rs
|
|
66
|
+
- ext/parquet/src/types.rs
|
|
63
67
|
- ext/parquet/src/utils.rs
|
|
64
68
|
- lib/parquet.rb
|
|
65
69
|
- lib/parquet.rbi
|