parquet 0.0.2 → 0.0.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Cargo.lock +57 -0
- data/Gemfile +1 -1
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +5 -0
- data/ext/parquet/src/enumerator.rs +32 -6
- data/ext/parquet/src/header_cache.rs +85 -28
- data/ext/parquet/src/lib.rs +2 -1
- data/ext/parquet/src/reader.rs +218 -13
- data/ext/parquet/src/types.rs +647 -15
- data/ext/parquet/src/utils.rs +57 -3
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +22 -3
- metadata +4 -4
data/ext/parquet/src/reader.rs
CHANGED
@@ -3,13 +3,17 @@
|
|
3
3
|
// =============================================================================
|
4
4
|
use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
|
5
5
|
use crate::{
|
6
|
-
|
6
|
+
create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
|
7
|
+
ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
|
7
8
|
SeekableRubyValue,
|
8
9
|
};
|
9
10
|
use ahash::RandomState;
|
10
11
|
use magnus::rb_sys::AsRawValue;
|
11
12
|
use magnus::value::{Opaque, ReprValue};
|
12
13
|
use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
|
14
|
+
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
15
|
+
use parquet::arrow::ProjectionMask;
|
16
|
+
use parquet::errors::ParquetError;
|
13
17
|
use parquet::file::reader::FileReader;
|
14
18
|
use parquet::file::reader::SerializedFileReader;
|
15
19
|
use parquet::record::reader::RowIter as ParquetRowIter;
|
@@ -23,21 +27,21 @@ use std::sync::OnceLock;
|
|
23
27
|
use thiserror::Error;
|
24
28
|
|
25
29
|
#[inline]
|
26
|
-
pub fn
|
30
|
+
pub fn parse_parquet_rows<'a>(
|
27
31
|
rb_self: Value,
|
28
32
|
args: &[Value],
|
29
|
-
) -> Result<Yield<Box<dyn Iterator<Item =
|
33
|
+
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
30
34
|
let original = unsafe { Ruby::get_unchecked() };
|
31
35
|
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
32
36
|
|
33
|
-
let
|
37
|
+
let ParquetRowsArgs {
|
34
38
|
to_read,
|
35
39
|
result_type,
|
36
40
|
columns,
|
37
|
-
} =
|
41
|
+
} = parse_parquet_rows_args(&ruby, args)?;
|
38
42
|
|
39
43
|
if !ruby.block_given() {
|
40
|
-
return
|
44
|
+
return create_row_enumerator(RowEnumeratorArgs {
|
41
45
|
rb_self,
|
42
46
|
to_read,
|
43
47
|
result_type,
|
@@ -88,7 +92,7 @@ pub fn parse_parquet<'a>(
|
|
88
92
|
})?;
|
89
93
|
}
|
90
94
|
|
91
|
-
let iter: Box<dyn Iterator<Item =
|
95
|
+
let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
|
92
96
|
"hash" => {
|
93
97
|
let headers = OnceLock::new();
|
94
98
|
let headers_clone = headers.clone();
|
@@ -97,10 +101,14 @@ pub fn parse_parquet<'a>(
|
|
97
101
|
row.ok().map(|row| {
|
98
102
|
let headers = headers_clone.get_or_init(|| {
|
99
103
|
let column_count = row.get_column_iter().count();
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
+
|
105
|
+
let mut header_string = Vec::with_capacity(column_count);
|
106
|
+
for (k, _) in row.get_column_iter() {
|
107
|
+
header_string.push(k.to_owned());
|
108
|
+
}
|
109
|
+
|
110
|
+
let headers = StringCache::intern_many(&header_string).unwrap();
|
111
|
+
|
104
112
|
headers
|
105
113
|
});
|
106
114
|
|
@@ -112,7 +120,7 @@ pub fn parse_parquet<'a>(
|
|
112
120
|
map
|
113
121
|
})
|
114
122
|
})
|
115
|
-
.map(
|
123
|
+
.map(RowRecord::Map);
|
116
124
|
|
117
125
|
Box::new(HeaderCacheCleanupIter {
|
118
126
|
inner: iter,
|
@@ -129,7 +137,202 @@ pub fn parse_parquet<'a>(
|
|
129
137
|
vec
|
130
138
|
})
|
131
139
|
})
|
132
|
-
.map(
|
140
|
+
.map(RowRecord::Vec),
|
141
|
+
),
|
142
|
+
_ => {
|
143
|
+
return Err(MagnusError::new(
|
144
|
+
ruby.exception_runtime_error(),
|
145
|
+
"Invalid result type",
|
146
|
+
))
|
147
|
+
}
|
148
|
+
};
|
149
|
+
|
150
|
+
Ok(Yield::Iter(iter))
|
151
|
+
}
|
152
|
+
|
153
|
+
#[inline]
|
154
|
+
pub fn parse_parquet_columns<'a>(
|
155
|
+
rb_self: Value,
|
156
|
+
args: &[Value],
|
157
|
+
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
158
|
+
let original = unsafe { Ruby::get_unchecked() };
|
159
|
+
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
160
|
+
|
161
|
+
let ParquetColumnsArgs {
|
162
|
+
to_read,
|
163
|
+
result_type,
|
164
|
+
columns,
|
165
|
+
batch_size,
|
166
|
+
} = parse_parquet_columns_args(&ruby, args)?;
|
167
|
+
|
168
|
+
if !ruby.block_given() {
|
169
|
+
return create_column_enumerator(ColumnEnumeratorArgs {
|
170
|
+
rb_self,
|
171
|
+
to_read,
|
172
|
+
result_type,
|
173
|
+
columns,
|
174
|
+
batch_size,
|
175
|
+
});
|
176
|
+
}
|
177
|
+
|
178
|
+
let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
|
179
|
+
let path_string = to_read.to_r_string()?;
|
180
|
+
let file_path = unsafe { path_string.as_str()? };
|
181
|
+
let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
|
182
|
+
|
183
|
+
let mut builder =
|
184
|
+
ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
|
185
|
+
let schema = builder.schema().clone();
|
186
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
187
|
+
|
188
|
+
// If columns are specified, project only those columns
|
189
|
+
if let Some(cols) = &columns {
|
190
|
+
// Get the parquet schema
|
191
|
+
let parquet_schema = builder.parquet_schema();
|
192
|
+
|
193
|
+
// Create a projection mask from column names
|
194
|
+
let projection =
|
195
|
+
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
196
|
+
|
197
|
+
builder = builder.with_projection(projection);
|
198
|
+
}
|
199
|
+
|
200
|
+
if let Some(batch_size) = batch_size {
|
201
|
+
builder = builder.with_batch_size(batch_size);
|
202
|
+
}
|
203
|
+
|
204
|
+
let reader = builder.build().unwrap();
|
205
|
+
|
206
|
+
(reader, schema, num_rows)
|
207
|
+
} else if to_read.is_kind_of(ruby.class_io()) {
|
208
|
+
let raw_value = to_read.as_raw();
|
209
|
+
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
210
|
+
.map_err(|_| {
|
211
|
+
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
212
|
+
})?;
|
213
|
+
|
214
|
+
if fd < 0 {
|
215
|
+
return Err(ReaderError::InvalidFileDescriptor.into());
|
216
|
+
}
|
217
|
+
|
218
|
+
let file = unsafe { File::from_raw_fd(fd) };
|
219
|
+
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
220
|
+
|
221
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
|
222
|
+
let schema = builder.schema().clone();
|
223
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
224
|
+
|
225
|
+
if let Some(batch_size) = batch_size {
|
226
|
+
builder = builder.with_batch_size(batch_size);
|
227
|
+
}
|
228
|
+
|
229
|
+
// If columns are specified, project only those columns
|
230
|
+
if let Some(cols) = &columns {
|
231
|
+
// Get the parquet schema
|
232
|
+
let parquet_schema = builder.parquet_schema();
|
233
|
+
|
234
|
+
// Create a projection mask from column names
|
235
|
+
let projection =
|
236
|
+
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
237
|
+
|
238
|
+
builder = builder.with_projection(projection);
|
239
|
+
}
|
240
|
+
|
241
|
+
let reader = builder.build().unwrap();
|
242
|
+
|
243
|
+
(reader, schema, num_rows)
|
244
|
+
} else {
|
245
|
+
let readable = SeekableRubyValue(Opaque::from(to_read));
|
246
|
+
|
247
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
|
248
|
+
let schema = builder.schema().clone();
|
249
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
250
|
+
|
251
|
+
if let Some(batch_size) = batch_size {
|
252
|
+
builder = builder.with_batch_size(batch_size);
|
253
|
+
}
|
254
|
+
|
255
|
+
// If columns are specified, project only those columns
|
256
|
+
if let Some(cols) = &columns {
|
257
|
+
// Get the parquet schema
|
258
|
+
let parquet_schema = builder.parquet_schema();
|
259
|
+
|
260
|
+
// Create a projection mask from column names
|
261
|
+
let projection =
|
262
|
+
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
263
|
+
|
264
|
+
builder = builder.with_projection(projection);
|
265
|
+
}
|
266
|
+
|
267
|
+
let reader = builder.build().unwrap();
|
268
|
+
|
269
|
+
(reader, schema, num_rows)
|
270
|
+
};
|
271
|
+
|
272
|
+
if num_rows == 0 {
|
273
|
+
let mut map =
|
274
|
+
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
275
|
+
for field in schema.fields() {
|
276
|
+
map.insert(
|
277
|
+
StringCache::intern(field.name().to_string()).unwrap(),
|
278
|
+
vec![],
|
279
|
+
);
|
280
|
+
}
|
281
|
+
let column_record = vec![ColumnRecord::Map(map)];
|
282
|
+
return Ok(Yield::Iter(Box::new(column_record.into_iter())));
|
283
|
+
}
|
284
|
+
|
285
|
+
let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
|
286
|
+
"hash" => {
|
287
|
+
let headers = OnceLock::new();
|
288
|
+
let headers_clone = headers.clone();
|
289
|
+
let iter = batch_reader
|
290
|
+
.filter_map(move |batch| {
|
291
|
+
batch.ok().map(|batch| {
|
292
|
+
let headers = headers_clone.get_or_init(|| {
|
293
|
+
let schema = batch.schema();
|
294
|
+
let fields = schema.fields();
|
295
|
+
let mut header_string = Vec::with_capacity(fields.len());
|
296
|
+
for field in fields {
|
297
|
+
header_string.push(field.name().to_owned());
|
298
|
+
}
|
299
|
+
StringCache::intern_many(&header_string).unwrap()
|
300
|
+
});
|
301
|
+
|
302
|
+
let mut map =
|
303
|
+
HashMap::with_capacity_and_hasher(headers.len(), Default::default());
|
304
|
+
|
305
|
+
batch.columns().iter().enumerate().for_each(|(i, column)| {
|
306
|
+
let header = headers[i];
|
307
|
+
let values = ParquetValueVec::try_from(column.clone()).unwrap();
|
308
|
+
map.insert(header, values.into_inner());
|
309
|
+
});
|
310
|
+
|
311
|
+
map
|
312
|
+
})
|
313
|
+
})
|
314
|
+
.map(ColumnRecord::Map);
|
315
|
+
|
316
|
+
Box::new(HeaderCacheCleanupIter {
|
317
|
+
inner: iter,
|
318
|
+
headers,
|
319
|
+
})
|
320
|
+
}
|
321
|
+
"array" => Box::new(
|
322
|
+
batch_reader
|
323
|
+
.filter_map(|batch| {
|
324
|
+
batch.ok().map(|batch| {
|
325
|
+
batch
|
326
|
+
.columns()
|
327
|
+
.into_iter()
|
328
|
+
.map(|column| {
|
329
|
+
let values = ParquetValueVec::try_from(column.clone()).unwrap();
|
330
|
+
values.into_inner()
|
331
|
+
})
|
332
|
+
.collect()
|
333
|
+
})
|
334
|
+
})
|
335
|
+
.map(ColumnRecord::Vec),
|
133
336
|
),
|
134
337
|
_ => {
|
135
338
|
return Err(MagnusError::new(
|
@@ -172,6 +375,8 @@ pub enum ReaderError {
|
|
172
375
|
HeaderIntern(#[from] CacheError),
|
173
376
|
#[error("Ruby error: {0}")]
|
174
377
|
Ruby(String),
|
378
|
+
#[error("Parquet error: {0}")]
|
379
|
+
Parquet(#[from] ParquetError),
|
175
380
|
}
|
176
381
|
|
177
382
|
impl From<MagnusError> for ReaderError {
|