parquet 0.0.1 → 0.0.3
- checksums.yaml +4 -4
- data/Cargo.lock +99 -7
- data/Gemfile +7 -2
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +12 -1
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/enumerator.rs +54 -0
- data/ext/parquet/src/header_cache.rs +105 -26
- data/ext/parquet/src/lib.rs +9 -1
- data/ext/parquet/src/reader.rs +289 -231
- data/ext/parquet/src/ruby_integration.rs +77 -0
- data/ext/parquet/src/ruby_reader.rs +43 -102
- data/ext/parquet/src/types.rs +722 -0
- data/ext/parquet/src/utils.rs +64 -5
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +26 -5
- metadata +6 -2
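The bulk of this release is the rewrite of `reader.rs` below: the old single `parse_parquet` entry point is split into `parse_parquet_rows` and a new Arrow-based `parse_parquet_columns`, and both paths gain column projection (plus a batch-size option on the column path). For orientation, here is a minimal standalone sketch of the parquet-crate pattern the column path uses; the file name `data.parquet` and the columns `id` and `name` are placeholders for illustration, not part of the gem:

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical input file, standing in for whatever path/IO the gem receives.
    let file = File::open("data.parquet")?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Keep only the named columns, as the new `columns` option does.
    let projection = ProjectionMask::columns(builder.parquet_schema(), ["id", "name"]);

    // The batch size bounds how many rows each yielded RecordBatch holds.
    let reader = builder
        .with_projection(projection)
        .with_batch_size(1024)
        .build()?;

    for batch in reader {
        let batch = batch?;
        println!("{} rows x {} cols", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}

Because the projection is applied before build(), unselected columns are never decoded at all.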
data/ext/parquet/src/reader.rs
CHANGED
@@ -1,122 +1,62 @@
-
-
-
-use
+// =============================================================================
+// Imports and Dependencies
+// =============================================================================
+use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
+    ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
+    SeekableRubyValue,
+};
+use ahash::RandomState;
 use magnus::rb_sys::AsRawValue;
 use magnus::value::{Opaque, ReprValue};
-use magnus::
-use
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
 use parquet::errors::ParquetError;
-use parquet::file::reader::
-use parquet::
+use parquet::file::reader::FileReader;
+use parquet::file::reader::SerializedFileReader;
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
 use std::collections::HashMap;
 use std::fs::File;
-use std::io::{self
+use std::io::{self};
 use std::mem::ManuallyDrop;
 use std::os::fd::FromRawFd;
 use std::sync::OnceLock;
-use std::{borrow::Cow, hash::BuildHasher};
 use thiserror::Error;
-use xxhash_rust::xxh3::Xxh3Builder;
-
-use parquet::record::reader::RowIter as ParquetRowIter;
-
-#[derive(Error, Debug)]
-pub enum ReaderError {
-    #[error("Failed to get file descriptor: {0}")]
-    FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(String),
-}
-
-impl From<MagnusError> for ReaderError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(err.to_string())
-    }
-}
-
-impl From<ReaderError> for MagnusError {
-    fn from(err: ReaderError) -> Self {
-        MagnusError::new(
-            Ruby::get().unwrap().exception_runtime_error(),
-            err.to_string(),
-        )
-    }
-}
-
-struct ForgottenFileHandle(ManuallyDrop<File>);
-
-impl Length for ForgottenFileHandle {
-    fn len(&self) -> u64 {
-        self.0.len()
-    }
-}
-
-impl ChunkReader for ForgottenFileHandle {
-    type T = BufReader<File>;
-
-    fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
-        self.0.get_read(start)
-    }
-
-    fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
-        self.0.get_bytes(start, length)
-    }
-}
-
-struct HeaderCacheCleanupIter<I> {
-    inner: I,
-    headers: OnceLock<Vec<&'static str>>,
-}
-
-impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
-    type Item = I::Item;
 
-
-
-    }
-}
-
-impl<I> Drop for HeaderCacheCleanupIter<I> {
-    fn drop(&mut self) {
-        if let Some(headers) = self.headers.get() {
-            StringCache::clear(&headers).unwrap();
-
-
-}
-
-pub fn parse_parquet<'a>(
+#[inline]
+pub fn parse_parquet_rows<'a>(
     rb_self: Value,
     args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item =
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
     let original = unsafe { Ruby::get_unchecked() };
     let ruby: &'static Ruby = Box::leak(Box::new(original));
 
-    let
+    let ParquetRowsArgs {
         to_read,
         result_type,
-
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
 
     if !ruby.block_given() {
-        return
+        return create_row_enumerator(RowEnumeratorArgs {
             rb_self,
             to_read,
             result_type,
+            columns,
         });
     }
 
-    let iter = if to_read.is_kind_of(ruby.class_string()) {
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
         let path_string = to_read.to_r_string()?;
         let file_path = unsafe { path_string.as_str()? };
         let file = File::open(file_path).unwrap();
         let reader = SerializedFileReader::new(file).unwrap();
-
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
     } else if to_read.is_kind_of(ruby.class_io()) {
         let raw_value = to_read.as_raw();
         let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
@@ -131,14 +71,28 @@ pub fn parse_parquet<'a>(
         let file = unsafe { File::from_raw_fd(fd) };
         let file = ForgottenFileHandle(ManuallyDrop::new(file));
         let reader = SerializedFileReader::new(file).unwrap();
-
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
     } else {
         let readable = SeekableRubyValue(Opaque::from(to_read));
         let reader = SerializedFileReader::new(readable).unwrap();
-
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
     };
 
-    let
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
         "hash" => {
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
@@ -146,21 +100,27 @@ pub fn parse_parquet<'a>(
                 .filter_map(move |row| {
                     row.ok().map(|row| {
                         let headers = headers_clone.get_or_init(|| {
-                            row.get_column_iter()
-
-
+                            let column_count = row.get_column_iter().count();
+
+                            let mut header_string = Vec::with_capacity(column_count);
+                            for (k, _) in row.get_column_iter() {
+                                header_string.push(k.to_owned());
+                            }
+
+                            let headers = StringCache::intern_many(&header_string).unwrap();
+
+                            headers
                         });
 
-
-                            .
-
-
-
-
-                            .collect::<HashMap<&'static str, ParquetField, Xxh3Builder>>()
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                            map.insert(headers[i], ParquetField(v.clone()));
+                        });
+                        map
                     })
                 })
-                .map(
+                .map(RowRecord::Map);
 
             Box::new(HeaderCacheCleanupIter {
                 inner: iter,
@@ -170,12 +130,14 @@ pub fn parse_parquet<'a>(
             "array" => Box::new(
                 iter.filter_map(|row| {
                     row.ok().map(|row| {
+                        let column_count = row.get_column_iter().count();
+                        let mut vec = Vec::with_capacity(column_count);
                         row.get_column_iter()
-                            .
-
+                            .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                        vec
                     })
                 })
-                .map(
+                .map(RowRecord::Vec),
             ),
             _ => {
                 return Err(MagnusError::new(
@@ -188,150 +150,246 @@ pub fn parse_parquet<'a>(
     Ok(Yield::Iter(iter))
 }
 
-
+#[inline]
+pub fn parse_parquet_columns<'a>(
     rb_self: Value,
-
-
-}
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+    let original = unsafe { Ruby::get_unchecked() };
+    let ruby: &'static Ruby = Box::leak(Box::new(original));
 
-
-
-
-
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
 
-
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        });
+    }
 
-    let
-        .
-
-
-}
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
 
-
-
-
-
-}
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            .unwrap();
-        hash.into_value_with(handle)
-    }
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
         }
-    }
-}
 
-
-pub struct CowValue<'a>(pub Cow<'a, str>);
+        let reader = builder.build().unwrap();
 
-
-
-
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        for field in schema.fields() {
+            map.insert(
+                StringCache::intern(field.name().to_string()).unwrap(),
+                vec![],
+            );
+        }
+        let column_record = vec![ColumnRecord::Map(map)];
+        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
     }
-}
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            .iter()
-            .try_for_each(|e| ary.push(ParquetField(e.clone()).into_value_with(handle)))
-            .unwrap();
-        ary.into_value_with(handle)
-    }
-    Field::MapInternal(map) => {
-        let hash = handle.hash_new_capa(map.entries().len());
-        map.entries()
-            .iter()
-            .try_for_each(|(k, v)| {
-                hash.aset(
-                    ParquetField(k.clone()).into_value_with(handle),
-                    ParquetField(v.clone()).into_value_with(handle),
-                )
+    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
+        "hash" => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader
+                .filter_map(move |batch| {
+                    batch.ok().map(|batch| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string).unwrap()
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+
+                        batch.columns().iter().enumerate().for_each(|(i, column)| {
+                            let header = headers[i];
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            map.insert(header, values.into_inner());
+                        });
+
+                        map
                     })
-
-
-
-
-
-
-
+                })
+                .map(ColumnRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
         }
-
-
+        "array" => Box::new(
+            batch_reader
+                .filter_map(|batch| {
+                    batch.ok().map(|batch| {
+                        batch
+                            .columns()
+                            .into_iter()
+                            .map(|column| {
+                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                                values.into_inner()
+                            })
+                            .collect()
+                    })
+                })
+                .map(ColumnRecord::Vec),
+        ),
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
+        }
+    };
 
-
+    Ok(Yield::Iter(iter))
+}
 
-
-
-    let
-
-
-
-
-
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
     }
 }
 
-
-
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+    #[error("Parquet error: {0}")]
+    Parquet(#[from] ParquetError),
+}
 
-
-
-
-        reader.seek(SeekFrom::Start(start))?;
-        Ok(BufReader::new(reader))
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
     }
+}
 
-
-
-
-
-
-
-
-        if read != length {
-            return Err(ParquetError::EOF(format!(
-                "Expected to read {} bytes, read only {}",
-                length, read
-            )));
-        }
-        Ok(buffer.into())
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
     }
 }