parquet 0.0.1 → 0.0.3
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
data/ext/parquet/src/reader.rs
CHANGED
@@ -1,122 +1,62 @@
-
-
-
-use
+// =============================================================================
+// Imports and Dependencies
+// =============================================================================
+use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
+    ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
+    SeekableRubyValue,
+};
+use ahash::RandomState;
 use magnus::rb_sys::AsRawValue;
 use magnus::value::{Opaque, ReprValue};
-use magnus::
-use
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
 use parquet::errors::ParquetError;
-use parquet::file::reader::
-use parquet::
+use parquet::file::reader::FileReader;
+use parquet::file::reader::SerializedFileReader;
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
 use std::collections::HashMap;
 use std::fs::File;
-use std::io::{self
+use std::io::{self};
 use std::mem::ManuallyDrop;
 use std::os::fd::FromRawFd;
 use std::sync::OnceLock;
-use std::{borrow::Cow, hash::BuildHasher};
 use thiserror::Error;
-use xxhash_rust::xxh3::Xxh3Builder;
-
-use parquet::record::reader::RowIter as ParquetRowIter;
-
-#[derive(Error, Debug)]
-pub enum ReaderError {
-    #[error("Failed to get file descriptor: {0}")]
-    FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(String),
-}
-
-impl From<MagnusError> for ReaderError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(err.to_string())
-    }
-}
-
-impl From<ReaderError> for MagnusError {
-    fn from(err: ReaderError) -> Self {
-        MagnusError::new(
-            Ruby::get().unwrap().exception_runtime_error(),
-            err.to_string(),
-        )
-    }
-}
-
-struct ForgottenFileHandle(ManuallyDrop<File>);
-
-impl Length for ForgottenFileHandle {
-    fn len(&self) -> u64 {
-        self.0.len()
-    }
-}
-
-impl ChunkReader for ForgottenFileHandle {
-    type T = BufReader<File>;
-
-    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
-        self.0.get_read(start)
-    }
-
-    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
-        self.0.get_bytes(start, length)
-    }
-}
-
-struct HeaderCacheCleanupIter<I> {
-    inner: I,
-    headers: OnceLock<Vec<&'static str>>,
-}
-
-impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
-    type Item = I::Item;
 
-
-
-    }
-}
-
-impl<I> Drop for HeaderCacheCleanupIter<I> {
-    fn drop(&mut self) {
-        if let Some(headers) = self.headers.get() {
-            StringCache::clear(&headers).unwrap();
-        }
-    }
-}
-
-pub fn parse_parquet<'a>(
+#[inline]
+pub fn parse_parquet_rows<'a>(
     rb_self: Value,
     args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item =
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
     let original = unsafe { Ruby::get_unchecked() };
     let ruby: &'static Ruby = Box::leak(Box::new(original));
 
-    let
+    let ParquetRowsArgs {
         to_read,
         result_type,
-
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
 
     if !ruby.block_given() {
-        return
+        return create_row_enumerator(RowEnumeratorArgs {
            rb_self,
            to_read,
            result_type,
+            columns,
        });
    }
 
-    let iter = if to_read.is_kind_of(ruby.class_string()) {
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
        let path_string = to_read.to_r_string()?;
        let file_path = unsafe { path_string.as_str()? };
        let file = File::open(file_path).unwrap();
        let reader = SerializedFileReader::new(file).unwrap();
-
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
    } else if to_read.is_kind_of(ruby.class_io()) {
        let raw_value = to_read.as_raw();
        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
@@ -131,14 +71,28 @@ pub fn parse_parquet<'a>(
        let file = unsafe { File::from_raw_fd(fd) };
        let file = ForgottenFileHandle(ManuallyDrop::new(file));
        let reader = SerializedFileReader::new(file).unwrap();
-
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
    } else {
        let readable = SeekableRubyValue(Opaque::from(to_read));
        let reader = SerializedFileReader::new(readable).unwrap();
-
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
    };
 
-    let
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
        "hash" => {
            let headers = OnceLock::new();
            let headers_clone = headers.clone();
@@ -146,21 +100,27 @@ pub fn parse_parquet<'a>(
                .filter_map(move |row| {
                    row.ok().map(|row| {
                        let headers = headers_clone.get_or_init(|| {
-                            row.get_column_iter()
-
-
+                            let column_count = row.get_column_iter().count();
+
+                            let mut header_string = Vec::with_capacity(column_count);
+                            for (k, _) in row.get_column_iter() {
+                                header_string.push(k.to_owned());
+                            }
+
+                            let headers = StringCache::intern_many(&header_string).unwrap();
+
+                            headers
                        });
 
-
-                            .
-
-
-
-
-                            .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                            map.insert(headers[i], ParquetField(v.clone()));
+                        });
+                        map
                    })
                })
-                .map(
+                .map(RowRecord::Map);
 
            Box::new(HeaderCacheCleanupIter {
                inner: iter,
@@ -170,12 +130,14 @@ pub fn parse_parquet<'a>(
        "array" => Box::new(
            iter.filter_map(|row| {
                row.ok().map(|row| {
+                    let column_count = row.get_column_iter().count();
+                    let mut vec = Vec::with_capacity(column_count);
                    row.get_column_iter()
-                        .
-
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    vec
                })
            })
-            .map(
+            .map(RowRecord::Vec),
        ),
        _ => {
            return Err(MagnusError::new(
@@ -188,150 +150,246 @@ pub fn parse_parquet<'a>(
    Ok(Yield::Iter(iter))
 }
 
-
+#[inline]
+pub fn parse_parquet_columns<'a>(
    rb_self: Value,
-
-
-}
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+    let original = unsafe { Ruby::get_unchecked() };
+    let ruby: &'static Ruby = Box::leak(Box::new(original));
 
-
-
-
-
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
 
-
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        });
+    }
 
-    let
-        .
-
-
-}
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
 
-
-
-
-
-}
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        .unwrap();
-        hash.into_value_with(handle)
-    }
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
        }
-    }
-}
 
-
-pub struct CowValue<'a>(pub Cow<'a, str>);
+        let reader = builder.build().unwrap();
 
-
-
-
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        for field in schema.fields() {
+            map.insert(
+                StringCache::intern(field.name().to_string()).unwrap(),
+                vec![],
+            );
+        }
+        let column_record = vec![ColumnRecord::Map(map)];
+        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
    }
-}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            .iter()
-            .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
-            .unwrap();
-        ary.into_value_with(handle)
-    }
-    Field::MapInternal(map) => {
-        let hash = handle.hash_new_capa(map.entries().len());
-        map.entries()
-            .iter()
-            .try_for_each(|(k, v)| {
-                hash.aset(
-                    ParquetField(k.clone()).into_value_with(handle),
-                    ParquetField(v.clone()).into_value_with(handle),
-                )
+    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
+        "hash" => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader
+                .filter_map(move |batch| {
+                    batch.ok().map(|batch| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string).unwrap()
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+
+                        batch.columns().iter().enumerate().for_each(|(i, column)| {
+                            let header = headers[i];
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            map.insert(header, values.into_inner());
+                        });
+
+                        map
                    })
-
-
-
-
-
-
+                })
+                .map(ColumnRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
        }
-
-
+        "array" => Box::new(
+            batch_reader
+                .filter_map(|batch| {
+                    batch.ok().map(|batch| {
+                        batch
+                            .columns()
+                            .into_iter()
+                            .map(|column| {
+                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                                values.into_inner()
+                            })
+                            .collect()
+                    })
+                })
+                .map(ColumnRecord::Vec),
+        ),
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
+        }
+    };
 
-
+    Ok(Yield::Iter(iter))
+}
 
-
-
-    let
-
-
-
-
-
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
    }
 }
 
-
-
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+    #[error("Parquet error: {0}")]
+    Parquet(#[from] ParquetError),
+}
 
-
-
-
-    reader.seek(SeekFrom::Start(start))?;
-    Ok(BufReader::new(reader))
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
    }
+}
 
-
-
-
-
-
-
-
-    if read != length {
-        return Err(ParquetError::EOF(format!(
-            "Expected to read {} bytes, read only {}",
-            length, read
-        )));
-    }
-    Ok(buffer.into())
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
    }
 }
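
For context on the row-oriented path above: the new `create_projection_schema` helper prunes the file's root `GroupType` down to the requested columns, and `parse_parquet_rows` hands the pruned schema to `ParquetRowIter::project`. The sketch below shows that pattern in isolation. It is not code from the gem; the file name `data.parquet`, the `wanted` column list, and the `prune_schema` name are assumptions for illustration.

```rust
// Minimal sketch of row-level column projection with the parquet crate,
// mirroring the create_projection_schema + RowIter::project pattern in the
// diff. Assumes a local "data.parquet" with columns "id" and "name".
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::reader::RowIter as ParquetRowIter;
use parquet::schema::types::{Type as SchemaType, TypePtr};
use std::fs::File;

// Keep only the named fields of a GroupType schema (same shape as the
// diff's create_projection_schema helper).
fn prune_schema(schema: &SchemaType, wanted: &[String]) -> SchemaType {
    if let SchemaType::GroupType { fields, .. } = schema {
        let projected: Vec<TypePtr> = fields
            .iter()
            .filter(|f| wanted.contains(&f.name().to_string()))
            .cloned()
            .collect();
        SchemaType::GroupType {
            basic_info: schema.get_basic_info().clone(),
            fields: projected,
        }
    } else {
        // Non-group schemas pass through unchanged.
        schema.clone()
    }
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let reader = SerializedFileReader::new(file)?;
    let schema = reader.metadata().file_metadata().schema().clone();

    let wanted = vec!["id".to_string(), "name".to_string()];
    let projection = prune_schema(&schema, &wanted);

    // project() errors if the pruned schema is not a subset of the file schema.
    let iter = ParquetRowIter::from_file_into(Box::new(reader)).project(Some(projection))?;
    for row in iter {
        println!("{:?}", row?);
    }
    Ok(())
}
```

The row API takes a pruned parquet schema rather than a column mask, which is why the gem keeps a schema-rewriting helper for this path only.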
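
The column-oriented path (`parse_parquet_columns`) goes through Arrow instead: `ProjectionMask::columns` selects columns by name on a `ParquetRecordBatchReaderBuilder`, and `with_batch_size` bounds how many rows each `RecordBatch` carries. Below is a minimal standalone sketch of the same builder calls, again with an assumed `data.parquet`, made-up column names, and an arbitrary batch size of 1024.

```rust
// Minimal sketch of the Arrow-based column path used by parse_parquet_columns:
// project by name via ProjectionMask, optionally cap the batch size, then
// iterate RecordBatches.
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Project only the requested columns, as the reader does when the caller
    // passes a columns list.
    let projection = ProjectionMask::columns(builder.parquet_schema(), ["id", "name"]);
    builder = builder.with_projection(projection);

    // Optional batch size, mirroring the batch_size argument.
    builder = builder.with_batch_size(1024);

    // Each item is a Result<RecordBatch, ArrowError>.
    for batch in builder.build()? {
        let batch = batch?;
        println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}
```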