parquet 0.0.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1918 -0
- data/Cargo.toml +3 -0
- data/Gemfile +12 -0
- data/LICENSE +21 -0
- data/README.md +29 -0
- data/Rakefile +27 -0
- data/ext/parquet/Cargo.toml +18 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/header_cache.rs +81 -0
- data/ext/parquet/src/lib.rs +16 -0
- data/ext/parquet/src/reader.rs +337 -0
- data/ext/parquet/src/ruby_reader.rs +231 -0
- data/ext/parquet/src/utils.rs +70 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +5 -0
- data/lib/parquet.rbi +17 -0
- metadata +96 -0
data/Cargo.toml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Nathan Jaremko
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,29 @@
+# parquet-ruby
+
+[![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
+
+This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) rust crate.
+
+At the moment, it only supports iterating rows as either a hash or an array.
+
+## Usage
+
+```ruby
+require "parquet"
+
+# Read each row as a hash
+Parquet.each_row("test/data.parquet") { |row| puts row.inspect }
+
+# Read each row as an array
+Parquet.each_row("test/data.parquet", result_type: :array) { |row| puts row.inspect }
+
+# Read from an IO object (like File or StringIO)
+File.open("test/data.parquet", "rb") do |file|
+  Parquet.each_row(file) { |row| puts row.inspect }
+end
+
+# Or with StringIO
+io = StringIO.new(File.binread("test/data.parquet"))
+Parquet.each_row(io) { |row| puts row.inspect }
+
+```
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+require "rake/testtask"
+require "rb_sys/extensiontask"
+
+task default: :test
+
+GEMSPEC = Gem::Specification.load("parquet.gemspec")
+
+RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
+  ext.lib_dir = "lib/parquet"
+  ext.ext_dir = "ext/parquet"
+end
+
+Rake::TestTask.new do |t|
+  t.deps << :compile
+  t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
+  t.libs << "lib"
+  t.libs << "test"
+end
+
+task :release do
+  sh "bundle exec rake test"
+  sh "mkdir -p pkg"
+  sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
+  sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
+end
data/ext/parquet/Cargo.toml
ADDED
@@ -0,0 +1,18 @@
+[package]
+name = "parquet"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+parquet = { version = "^54.0", features = ["json", "object_store"] }
+bytes = "^1.9"
+kanal = "0.1.0-pre8"
+magnus = { version = "0.7", features = ["rb-sys"] }
+rb-sys = "^0.9"
+serde = { version = "1.0", features = ["derive"] }
+serde_magnus = "0.8.1"
+thiserror = "2.0"
+xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
data/ext/parquet/src/header_cache.rs
ADDED
@@ -0,0 +1,81 @@
+/// This module exists to avoid cloning header keys in returned HashMaps.
+/// Since the underlying RString creation already involves cloning,
+/// this caching layer aims to reduce redundant allocations.
+///
+/// Note: Performance testing on macOS showed minimal speed improvements,
+/// so this optimization could be removed if any issues arise.
+use std::{
+    collections::HashMap,
+    sync::{atomic::AtomicU32, LazyLock, Mutex},
+};
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum CacheError {
+    #[error("Failed to acquire lock: {0}")]
+    LockError(String),
+}
+
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
+pub struct StringCache;
+
+impl StringCache {
+    #[allow(dead_code)]
+    pub fn intern(string: String) -> Result<&'static str, CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+        if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            Ok(existing)
+        } else {
+            let leaked = Box::leak(string.into_boxed_str());
+            cache.insert(leaked, AtomicU32::new(1));
+            Ok(leaked)
+        }
+    }
+
+    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+        let mut result = Vec::with_capacity(strings.len());
+        for string in strings {
+            if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+                count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                result.push(existing);
+            } else {
+                let leaked = Box::leak(string.clone().into_boxed_str());
+                cache.insert(leaked, AtomicU32::new(1));
+                result.push(leaked);
+            }
+        }
+        Ok(result)
+    }
+
+    pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+        for header in headers {
+            if let Some(count) = cache.get(header) {
+                // Returns the previous value of the counter
+                let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
+                if was == 1 {
+                    cache.remove(header);
+                    let ptr = *header as *const str as *mut str;
+                    unsafe {
+                        let _ = Box::from_raw(ptr);
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
data/ext/parquet/src/lib.rs
ADDED
@@ -0,0 +1,16 @@
+pub mod header_cache;
+mod reader;
+mod ruby_reader;
+mod utils;
+
+use crate::reader::*;
+
+use magnus::{Error, Ruby};
+
+/// Initializes the Ruby extension and defines methods.
+#[magnus::init]
+fn init(ruby: &Ruby) -> Result<(), Error> {
+    let module = ruby.define_module("Parquet")?;
+    module.define_module_function("each_row", magnus::method!(parse_parquet, -1))?;
+    Ok(())
+}
data/ext/parquet/src/reader.rs
ADDED
@@ -0,0 +1,337 @@
+use crate::header_cache::{CacheError, StringCache};
+use crate::ruby_reader::{build_ruby_reader, SeekableRead};
+use crate::utils::*;
+use bytes::Bytes;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::IntoValue;
+use magnus::{block::Yield, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};
+use parquet::errors::ParquetError;
+use parquet::file::reader::{ChunkReader, Length, SerializedFileReader};
+use parquet::record::Field;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{self, BufReader, Read, Seek, SeekFrom};
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+use std::{borrow::Cow, hash::BuildHasher};
+use thiserror::Error;
+use xxhash_rust::xxh3::Xxh3Builder;
+
+use parquet::record::reader::RowIter as ParquetRowIter;
+
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+}
+
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
+    }
+}
+
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
+    }
+}
+
+struct ForgottenFileHandle(ManuallyDrop<File>);
+
+impl Length for ForgottenFileHandle {
+    fn len(&self) -> u64 {
+        self.0.len()
+    }
+}
+
+impl ChunkReader for ForgottenFileHandle {
+    type T = BufReader<File>;
+
+    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+        self.0.get_read(start)
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+        self.0.get_bytes(start, length)
+    }
+}
+
+struct HeaderCacheCleanupIter<I> {
+    inner: I,
+    headers: OnceLock<Vec<&'static str>>,
+}
+
+impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+    type Item = I::Item;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next()
+    }
+}
+
+impl<I> Drop for HeaderCacheCleanupIter<I> {
+    fn drop(&mut self) {
+        if let Some(headers) = self.headers.get() {
+            StringCache::clear(&headers).unwrap();
+        }
+    }
+}
+
+pub fn parse_parquet<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+    let original = unsafe { Ruby::get_unchecked() };
+    let ruby: &'static Ruby = Box::leak(Box::new(original));
+
+    let ParquetArgs {
+        to_read,
+        result_type,
+    } = parse_parquet_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_enumerator(EnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+        });
+    }
+
+    let iter = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        ParquetRowIter::from_file_into(Box::new(reader))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        ParquetRowIter::from_file_into(Box::new(reader))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        ParquetRowIter::from_file_into(Box::new(reader))
+    };
+
+    let iter: Box<dyn Iterator<Item = Record<Xxh3Builder>>> = match result_type.as_str() {
+        "hash" => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter
+                .filter_map(move |row| {
+                    row.ok().map(|row| {
+                        let headers = headers_clone.get_or_init(|| {
+                            row.get_column_iter()
+                                .map(|(k, _)| StringCache::intern(k.to_owned()).unwrap())
+                                .collect::<Vec<_>>()
+                        });
+
+                        row.get_column_iter()
+                            .enumerate()
+                            .map(|(i, (_, v))| {
+                                let key = headers[i];
+                                (key, ParquetField(v.clone()))
+                            })
+                            .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
+                    })
+                })
+                .map(|row| Record::Map(row));
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        "array" => Box::new(
+            iter.filter_map(|row| {
+                row.ok().map(|row| {
+                    row.get_column_iter()
+                        .map(|(_, v)| ParquetField(v.clone()))
+                        .collect::<Vec<ParquetField>>()
+                })
+            })
+            .map(|row| Record::Vec(row)),
+        ),
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
+        }
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+struct EnumeratorArgs {
+    rb_self: Value,
+    to_read: Value,
+    result_type: String,
+}
+
+fn create_enumerator(
+    args: EnumeratorArgs,
+) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+    let kwargs = RHash::new();
+
+    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+
+    let enumerator = args
+        .rb_self
+        .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
+    Ok(Yield::Enumerator(enumerator))
+}
+
+#[derive(Debug)]
+pub enum Record<S: BuildHasher + Default> {
+    Vec(Vec<ParquetField>),
+    Map(HashMap<&'static str, ParquetField, S>),
+}
+
+impl<S: BuildHasher + Default> IntoValue for Record<S> {
+    #[inline]
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self {
+            Record::Vec(vec) => {
+                let ary = handle.ary_new_capa(vec.len());
+                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                ary.into_value_with(handle)
+            }
+            Record::Map(map) => {
+                // Pre-allocate the hash with the known size
+                let hash = handle.hash_new_capa(map.len());
+                map.into_iter()
+                    .try_for_each(|(k, v)| hash.aset(k, v))
+                    .unwrap();
+                hash.into_value_with(handle)
+            }
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct CowValue<'a>(pub Cow<'a, str>);
+
+impl<'a> IntoValue for CowValue<'a> {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        self.0.into_value_with(handle)
+    }
+}
+
+#[derive(Debug)]
+pub struct ParquetField(Field);
+
+impl IntoValue for ParquetField {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self.0 {
+            Field::Byte(b) => b.into_value_with(handle),
+            Field::Bool(b) => b.into_value_with(handle),
+            Field::Short(s) => s.into_value_with(handle),
+            Field::Int(i) => i.into_value_with(handle),
+            Field::Long(l) => l.into_value_with(handle),
+            Field::UByte(ub) => ub.into_value_with(handle),
+            Field::UShort(us) => us.into_value_with(handle),
+            Field::UInt(ui) => ui.into_value_with(handle),
+            Field::ULong(ul) => ul.into_value_with(handle),
+            Field::Float16(f) => f32::from(f).into_value_with(handle),
+            Field::Float(f) => f.into_value_with(handle),
+            Field::Double(d) => d.into_value_with(handle),
+
+            Field::Str(s) => s.into_value_with(handle),
+            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+            Field::Date(d) => d.into_value_with(handle),
+            Field::TimestampMillis(ts) => ts.into_value_with(handle),
+            Field::TimestampMicros(ts) => ts.into_value_with(handle),
+            Field::ListInternal(list) => {
+                let ary = handle.ary_new_capa(list.elements().len());
+                list.elements()
+                    .iter()
+                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
+                    .unwrap();
+                ary.into_value_with(handle)
+            }
+            Field::MapInternal(map) => {
+                let hash = handle.hash_new_capa(map.entries().len());
+                map.entries()
+                    .iter()
+                    .try_for_each(|(k, v)| {
+                        hash.aset(
+                            ParquetField(k.clone()).into_value_with(handle),
+                            ParquetField(v.clone()).into_value_with(handle),
+                        )
+                    })
+                    .unwrap();
+                hash.into_value_with(handle)
+            }
+            // Field::Decimal(d) => d.to_string().into_value_with(handle),
+            // Field::Group(row) => row.into_value_with(handle),
+            Field::Null => handle.qnil().as_value(),
+            _ => panic!("Unsupported field type"),
+        }
+    }
+}
+
+struct SeekableRubyValue(Opaque<Value>);
+
+impl Length for SeekableRubyValue {
+    fn len(&self) -> u64 {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
+        let file_len = reader.seek(SeekFrom::End(0)).unwrap();
+        reader.seek(SeekFrom::Start(current_pos)).unwrap();
+        file_len
+    }
+}
+
+impl ChunkReader for SeekableRubyValue {
+    type T = BufReader<Box<dyn SeekableRead>>;
+
+    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        reader.seek(SeekFrom::Start(start))?;
+        Ok(BufReader::new(reader))
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut buffer = Vec::with_capacity(length);
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        reader.seek(SeekFrom::Start(start))?;
+        let read = reader.take(length as _).read_to_end(&mut buffer)?;
+
+        if read != length {
+            return Err(ParquetError::EOF(format!(
+                "Expected to read {} bytes, read only {}",
+                length, read
+            )));
+        }
+        Ok(buffer.into())
+    }
+}
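One detail in reader.rs worth calling out: `SeekableRubyValue::len` measures the underlying stream without reading it, by saving the current position, seeking to the end, and seeking back. Here is a self-contained sketch of that pattern over any `std::io::Seek`; the `stream_len` helper and the `PAR1` filler bytes are illustrative, not part of the gem.

```rust
use std::io::{Cursor, Result, Seek, SeekFrom};

// Measure a seekable stream's total length without consuming it:
// remember the current position, seek to the end, then seek back.
fn stream_len<R: Seek>(reader: &mut R) -> Result<u64> {
    let current = reader.seek(SeekFrom::Current(0))?;
    let end = reader.seek(SeekFrom::End(0))?;
    reader.seek(SeekFrom::Start(current))?;
    Ok(end)
}

fn main() -> Result<()> {
    let mut cursor = Cursor::new(b"PAR1....PAR1".to_vec());
    cursor.seek(SeekFrom::Start(4))?; // simulate a reader that is mid-stream
    assert_eq!(stream_len(&mut cursor)?, 12);
    assert_eq!(cursor.stream_position()?, 4); // position was restored
    Ok(())
}
```

Restoring the position matters because the parquet `ChunkReader` may ask for the length at any point while other reads are interleaved on the same Ruby IO object.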