parquet 0.0.1
- checksums.yaml +7 -0
- data/Cargo.lock +1918 -0
- data/Cargo.toml +3 -0
- data/Gemfile +12 -0
- data/LICENSE +21 -0
- data/README.md +29 -0
- data/Rakefile +27 -0
- data/ext/parquet/Cargo.toml +18 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/header_cache.rs +81 -0
- data/ext/parquet/src/lib.rs +16 -0
- data/ext/parquet/src/reader.rs +337 -0
- data/ext/parquet/src/ruby_reader.rs +231 -0
- data/ext/parquet/src/utils.rs +70 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +5 -0
- data/lib/parquet.rbi +17 -0
- metadata +96 -0
data/Cargo.toml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Nathan Jaremko
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,29 @@
+# parquet-ruby
+
+[![Gem Version](https://badge.fury.io/rb/parquet.svg)](https://badge.fury.io/rb/parquet)
+
+This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
+
+At the moment, it only supports iterating rows as either a hash or an array.
+
+## Usage
+
+```ruby
+require "parquet"
+
+# Read each row as a hash
+Parquet.each_row("test/data.parquet") { |row| puts row.inspect }
+
+# Read each row as an array
+Parquet.each_row("test/data.parquet", result_type: :array) { |row| puts row.inspect }
+
+# Read from an IO object (like File or StringIO)
+File.open("test/data.parquet", "rb") do |file|
+  Parquet.each_row(file) { |row| puts row.inspect }
+end
+
+# Or with StringIO
+io = StringIO.new(File.binread("test/data.parquet"))
+Parquet.each_row(io) { |row| puts row.inspect }
+
+```
data/Rakefile
ADDED
@@ -0,0 +1,27 @@
+# frozen_string_literal: true
+
+require "rake/testtask"
+require "rb_sys/extensiontask"
+
+task default: :test
+
+GEMSPEC = Gem::Specification.load("parquet.gemspec")
+
+RbSys::ExtensionTask.new("parquet", GEMSPEC) do |ext|
+  ext.lib_dir = "lib/parquet"
+  ext.ext_dir = "ext/parquet"
+end
+
+Rake::TestTask.new do |t|
+  t.deps << :compile
+  t.test_files = FileList[File.expand_path("test/*_test.rb", __dir__)]
+  t.libs << "lib"
+  t.libs << "test"
+end
+
+task :release do
+  sh "bundle exec rake test"
+  sh "mkdir -p pkg"
+  sh "gem build parquet.gemspec -o pkg/parquet-#{Parquet::VERSION}.gem"
+  sh "gem push pkg/parquet-#{Parquet::VERSION}.gem"
+end
data/ext/parquet/Cargo.toml
ADDED
@@ -0,0 +1,18 @@
+[package]
+name = "parquet"
+version = "0.1.0"
+edition = "2021"
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+parquet = { version = "^54.0", features = ["json", "object_store"] }
+bytes = "^1.9"
+kanal = "0.1.0-pre8"
+magnus = { version = "0.7", features = ["rb-sys"] }
+rb-sys = "^0.9"
+serde = { version = "1.0", features = ["derive"] }
+serde_magnus = "0.8.1"
+thiserror = "2.0"
+xxhash-rust = { version = "0.8.12", features = ["xxh3"] }
data/ext/parquet/src/header_cache.rs
ADDED
@@ -0,0 +1,81 @@
+/// This module exists to avoid cloning header keys in returned HashMaps.
+/// Since the underlying RString creation already involves cloning,
+/// this caching layer aims to reduce redundant allocations.
+///
+/// Note: Performance testing on macOS showed minimal speed improvements,
+/// so this optimization could be removed if any issues arise.
+use std::{
+    collections::HashMap,
+    sync::{atomic::AtomicU32, LazyLock, Mutex},
+};
+use thiserror::Error;
+
+#[derive(Debug, Error)]
+pub enum CacheError {
+    #[error("Failed to acquire lock: {0}")]
+    LockError(String),
+}
+
+static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, AtomicU32>>> =
+    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
+
+pub struct StringCache;
+
+impl StringCache {
+    #[allow(dead_code)]
+    pub fn intern(string: String) -> Result<&'static str, CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+        if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+            count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            Ok(existing)
+        } else {
+            let leaked = Box::leak(string.into_boxed_str());
+            cache.insert(leaked, AtomicU32::new(1));
+            Ok(leaked)
+        }
+    }
+
+    pub fn intern_many(strings: &[String]) -> Result<Vec<&'static str>, CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+        let mut result = Vec::with_capacity(strings.len());
+        for string in strings {
+            if let Some((&existing, count)) = cache.get_key_value(string.as_str()) {
+                count.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+                result.push(existing);
+            } else {
+                let leaked = Box::leak(string.clone().into_boxed_str());
+                cache.insert(leaked, AtomicU32::new(1));
+                result.push(leaked);
+            }
+        }
+        Ok(result)
+    }
+
+    pub fn clear(headers: &[&'static str]) -> Result<(), CacheError> {
+        let mut cache = STRING_CACHE
+            .lock()
+            .map_err(|e| CacheError::LockError(e.to_string()))?;
+
+        for header in headers {
+            if let Some(count) = cache.get(header) {
+                // Returns the previous value of the counter
+                let was = count.fetch_sub(1, std::sync::atomic::Ordering::Relaxed);
+                if was == 1 {
+                    cache.remove(header);
+                    let ptr = *header as *const str as *mut str;
+                    unsafe {
+                        let _ = Box::from_raw(ptr);
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
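The cache pairs each interned `&'static str` with an `AtomicU32` reference count: `intern` and `intern_many` leak the `String` on first sight and bump the count on repeats, while `clear` decrements it and reclaims the leaked allocation via `Box::from_raw` only when the count reaches zero. A minimal sketch of that round-trip (the standalone function and asserts are illustrative, not part of the gem):

```rust
// Illustrative exercise of the StringCache contract above; assumes it runs
// inside the same crate so `header_cache` is importable.
use crate::header_cache::{CacheError, StringCache};

fn cache_round_trip() -> Result<(), CacheError> {
    let headers = vec!["id".to_string(), "name".to_string()];

    // First interning leaks each String and sets its refcount to 1.
    let first = StringCache::intern_many(&headers)?;
    // Interning again returns the same leaked pointers and bumps the count to 2.
    let second = StringCache::intern_many(&headers)?;
    assert!(std::ptr::eq(first[0], second[0]));

    // Each clear decrements the counts; only the second one frees the memory.
    StringCache::clear(&first)?;
    StringCache::clear(&second)?;
    Ok(())
}
```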
data/ext/parquet/src/lib.rs
ADDED
@@ -0,0 +1,16 @@
+pub mod header_cache;
+mod reader;
+mod ruby_reader;
+mod utils;
+
+use crate::reader::*;
+
+use magnus::{Error, Ruby};
+
+/// Initializes the Ruby extension and defines methods.
+#[magnus::init]
+fn init(ruby: &Ruby) -> Result<(), Error> {
+    let module = ruby.define_module("Parquet")?;
+    module.define_module_function("each_row", magnus::method!(parse_parquet, -1))?;
+    Ok(())
+}
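`each_row` is registered with arity -1, so `parse_parquet` receives its arguments as a raw `&[Value]` slice and hands them to `parse_parquet_args` from `utils.rs`, whose hunk is not shown in this diff. A hypothetical sketch of what that helper could look like, built on magnus's `scan_args`/`get_kwargs`; the body and the `"hash"` default are assumptions, and only the `ParquetArgs` shape comes from `reader.rs` below:

```rust
// Hypothetical reconstruction of utils.rs's parse_parquet_args; the real
// file is not shown in this diff. Built on magnus's scan_args helpers.
use magnus::{
    scan_args::{get_kwargs, scan_args},
    Error, RHash, Ruby, Symbol, Value,
};

pub struct ParquetArgs {
    pub to_read: Value,
    pub result_type: String,
}

pub fn parse_parquet_args(_ruby: &Ruby, args: &[Value]) -> Result<ParquetArgs, Error> {
    // One required positional argument: the path String or IO-like object.
    let parsed = scan_args::<(Value,), (), (), (), RHash, ()>(args)?;
    let (to_read,) = parsed.required;

    // Optional result_type: keyword; assumed to default to :hash.
    let kwargs = get_kwargs::<_, (), (Option<Symbol>,), ()>(
        parsed.keywords,
        &[],
        &["result_type"],
    )?;
    let result_type = match kwargs.optional.0 {
        Some(sym) => sym.name()?.into_owned(),
        None => String::from("hash"),
    };

    Ok(ParquetArgs { to_read, result_type })
}
```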
data/ext/parquet/src/reader.rs
ADDED
@@ -0,0 +1,337 @@
+use crate::header_cache::{CacheError, StringCache};
+use crate::ruby_reader::{build_ruby_reader, SeekableRead};
+use crate::utils::*;
+use bytes::Bytes;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::IntoValue;
+use magnus::{block::Yield, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};
+use parquet::errors::ParquetError;
+use parquet::file::reader::{ChunkReader, Length, SerializedFileReader};
+use parquet::record::Field;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::{self, BufReader, Read, Seek, SeekFrom};
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+use std::{borrow::Cow, hash::BuildHasher};
+use thiserror::Error;
+use xxhash_rust::xxh3::Xxh3Builder;
+
+use parquet::record::reader::RowIter as ParquetRowIter;
+
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+}
+
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
+    }
+}
+
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
+    }
+}
+
+struct ForgottenFileHandle(ManuallyDrop<File>);
+
+impl Length for ForgottenFileHandle {
+    fn len(&self) -> u64 {
+        self.0.len()
+    }
+}
+
+impl ChunkReader for ForgottenFileHandle {
+    type T = BufReader<File>;
+
+    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+        self.0.get_read(start)
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+        self.0.get_bytes(start, length)
+    }
+}
+
+struct HeaderCacheCleanupIter<I> {
+    inner: I,
+    headers: OnceLock<Vec<&'static str>>,
+}
+
+impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
+    type Item = I::Item;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.inner.next()
+    }
+}
+
+impl<I> Drop for HeaderCacheCleanupIter<I> {
+    fn drop(&mut self) {
+        if let Some(headers) = self.headers.get() {
+            StringCache::clear(&headers).unwrap();
+        }
+    }
+}
+
+pub fn parse_parquet<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+    let original = unsafe { Ruby::get_unchecked() };
+    let ruby: &'static Ruby = Box::leak(Box::new(original));
+
+    let ParquetArgs {
+        to_read,
+        result_type,
+    } = parse_parquet_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_enumerator(EnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+        });
+    }
+
+    let iter = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        ParquetRowIter::from_file_into(Box::new(reader))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        ParquetRowIter::from_file_into(Box::new(reader))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        ParquetRowIter::from_file_into(Box::new(reader))
+    };
+
+    let iter: Box<dyn Iterator<Item = Record<Xxh3Builder>>> = match result_type.as_str() {
+        "hash" => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter
+                .filter_map(move |row| {
+                    row.ok().map(|row| {
+                        let headers = headers_clone.get_or_init(|| {
+                            row.get_column_iter()
+                                .map(|(k, _)| StringCache::intern(k.to_owned()).unwrap())
+                                .collect::<Vec<_>>()
+                        });
+
+                        row.get_column_iter()
+                            .enumerate()
+                            .map(|(i, (_, v))| {
+                                let key = headers[i];
+                                (key, ParquetField(v.clone()))
+                            })
+                            .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
+                    })
+                })
+                .map(|row| Record::Map(row));
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        "array" => Box::new(
+            iter.filter_map(|row| {
+                row.ok().map(|row| {
+                    row.get_column_iter()
+                        .map(|(_, v)| ParquetField(v.clone()))
+                        .collect::<Vec<ParquetField>>()
+                })
+            })
+            .map(|row| Record::Vec(row)),
+        ),
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
+        }
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+struct EnumeratorArgs {
+    rb_self: Value,
+    to_read: Value,
+    result_type: String,
+}
+
+fn create_enumerator(
+    args: EnumeratorArgs,
+) -> Result<Yield<Box<dyn Iterator<Item = Record<Xxh3Builder>>>>, MagnusError> {
+    let kwargs = RHash::new();
+
+    kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
+
+    let enumerator = args
+        .rb_self
+        .enumeratorize("for_each", (args.to_read, KwArgs(kwargs)));
+    Ok(Yield::Enumerator(enumerator))
+}
+
+#[derive(Debug)]
+pub enum Record<S: BuildHasher + Default> {
+    Vec(Vec<ParquetField>),
+    Map(HashMap<&'static str, ParquetField, S>),
+}
+
+impl<S: BuildHasher + Default> IntoValue for Record<S> {
+    #[inline]
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self {
+            Record::Vec(vec) => {
+                let ary = handle.ary_new_capa(vec.len());
+                vec.into_iter().try_for_each(|v| ary.push(v)).unwrap();
+                ary.into_value_with(handle)
+            }
+            Record::Map(map) => {
+                // Pre-allocate the hash with the known size
+                let hash = handle.hash_new_capa(map.len());
+                map.into_iter()
+                    .try_for_each(|(k, v)| hash.aset(k, v))
+                    .unwrap();
+                hash.into_value_with(handle)
+            }
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct CowValue<'a>(pub Cow<'a, str>);
+
+impl<'a> IntoValue for CowValue<'a> {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        self.0.into_value_with(handle)
+    }
+}
+
+#[derive(Debug)]
+pub struct ParquetField(Field);
+
+impl<'a> IntoValue for ParquetField {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        match self.0 {
+            Field::Byte(b) => b.into_value_with(handle),
+            Field::Bool(b) => b.into_value_with(handle),
+            Field::Short(s) => s.into_value_with(handle),
+            Field::Int(i) => i.into_value_with(handle),
+            Field::Long(l) => l.into_value_with(handle),
+            Field::UByte(ub) => ub.into_value_with(handle),
+            Field::UShort(us) => us.into_value_with(handle),
+            Field::UInt(ui) => ui.into_value_with(handle),
+            Field::ULong(ul) => ul.into_value_with(handle),
+            Field::Float16(f) => f32::from(f).into_value_with(handle),
+            Field::Float(f) => f.into_value_with(handle),
+            Field::Double(d) => d.into_value_with(handle),
+
+            Field::Str(s) => s.into_value_with(handle),
+            Field::Bytes(b) => handle.str_from_slice(b.data()).as_value(),
+            Field::Date(d) => d.into_value_with(handle),
+            Field::TimestampMillis(ts) => ts.into_value_with(handle),
+            Field::TimestampMicros(ts) => ts.into_value_with(handle),
+            Field::ListInternal(list) => {
+                let ary = handle.ary_new_capa(list.elements().len());
+                list.elements()
+                    .iter()
+                    .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
+                    .unwrap();
+                ary.into_value_with(handle)
+            }
+            Field::MapInternal(map) => {
+                let hash = handle.hash_new_capa(map.entries().len());
+                map.entries()
+                    .iter()
+                    .try_for_each(|(k, v)| {
+                        hash.aset(
+                            ParquetField(k.clone()).into_value_with(handle),
+                            ParquetField(v.clone()).into_value_with(handle),
+                        )
+                    })
+                    .unwrap();
+                hash.into_value_with(handle)
+            }
+            // Field::Decimal(d) => d.to_string().into_value_with(handle),
+            // Field::Group(row) => row.into_value_with(handle),
+            Field::Null => handle.qnil().as_value(),
+            _ => panic!("Unsupported field type"),
+        }
+    }
+}
+
+struct SeekableRubyValue(Opaque<Value>);
+
+impl Length for SeekableRubyValue {
+    fn len(&self) -> u64 {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        let current_pos = reader.seek(SeekFrom::Current(0)).unwrap();
+        let file_len = reader.seek(SeekFrom::End(0)).unwrap();
+        reader.seek(SeekFrom::Start(current_pos)).unwrap();
+        file_len
+    }
+}
+
+impl ChunkReader for SeekableRubyValue {
+    type T = BufReader<Box<dyn SeekableRead>>;
+
+    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        reader.seek(SeekFrom::Start(start))?;
+        Ok(BufReader::new(reader))
+    }
+
+    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+        let ruby = unsafe { Ruby::get_unchecked() };
+        let mut buffer = Vec::with_capacity(length);
+        let mut reader = build_ruby_reader(&ruby, ruby.get_inner(self.0)).unwrap();
+        reader.seek(SeekFrom::Start(start))?;
+        let read = reader.take(length as _).read_to_end(&mut buffer)?;
+
+        if read != length {
+            return Err(ParquetError::EOF(format!(
+                "Expected to read {} bytes, read only {}",
+                length, read
+            )));
+        }
+        Ok(buffer.into())
+    }
+}
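All three input branches in `parse_parquet` funnel into the same parquet-rs primitives: anything implementing `Length` and `ChunkReader` can back a `SerializedFileReader`, and `RowIter::from_file_into` walks its rows. A minimal standalone sketch of that underlying API, stripped of the Ruby glue (the file path is illustrative):

```rust
// Minimal sketch of the parquet-rs row iteration that reader.rs wraps;
// "example.parquet" is an illustrative path, not a file from this gem.
use parquet::file::reader::SerializedFileReader;
use parquet::record::reader::RowIter;
use std::fs::File;

fn dump_rows() -> Result<(), Box<dyn std::error::Error>> {
    // File implements Length + ChunkReader, just like the gem's
    // ForgottenFileHandle and SeekableRubyValue adapters.
    let file = File::open("example.parquet")?;
    let reader = SerializedFileReader::new(file)?;

    // Same entry point as parse_parquet: each item is a Result<Row, ParquetError>.
    for row in RowIter::from_file_into(Box::new(reader)) {
        let row = row?;
        for (name, field) in row.get_column_iter() {
            println!("{name}: {field:?}");
        }
    }
    Ok(())
}
```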