parquet 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +57 -0
- data/Gemfile +1 -1
- data/README.md +66 -10
- data/ext/parquet/Cargo.toml +5 -0
- data/ext/parquet/src/enumerator.rs +32 -6
- data/ext/parquet/src/header_cache.rs +85 -28
- data/ext/parquet/src/lib.rs +2 -1
- data/ext/parquet/src/reader.rs +218 -13
- data/ext/parquet/src/types.rs +647 -15
- data/ext/parquet/src/utils.rs +57 -3
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +22 -3
- metadata +2 -2
data/ext/parquet/src/reader.rs
CHANGED
@@ -3,13 +3,17 @@
|
|
3
3
|
// =============================================================================
|
4
4
|
use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
|
5
5
|
use crate::{
|
6
|
-
|
6
|
+
create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
|
7
|
+
ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
|
7
8
|
SeekableRubyValue,
|
8
9
|
};
|
9
10
|
use ahash::RandomState;
|
10
11
|
use magnus::rb_sys::AsRawValue;
|
11
12
|
use magnus::value::{Opaque, ReprValue};
|
12
13
|
use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
|
14
|
+
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
15
|
+
use parquet::arrow::ProjectionMask;
|
16
|
+
use parquet::errors::ParquetError;
|
13
17
|
use parquet::file::reader::FileReader;
|
14
18
|
use parquet::file::reader::SerializedFileReader;
|
15
19
|
use parquet::record::reader::RowIter as ParquetRowIter;
|
@@ -23,21 +27,21 @@ use std::sync::OnceLock;
|
|
23
27
|
use thiserror::Error;
|
24
28
|
|
25
29
|
#[inline]
|
26
|
-
pub fn
|
30
|
+
pub fn parse_parquet_rows<'a>(
|
27
31
|
rb_self: Value,
|
28
32
|
args: &[Value],
|
29
|
-
) -> Result<Yield<Box<dyn Iterator<Item =
|
33
|
+
) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
|
30
34
|
let original = unsafe { Ruby::get_unchecked() };
|
31
35
|
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
32
36
|
|
33
|
-
let
|
37
|
+
let ParquetRowsArgs {
|
34
38
|
to_read,
|
35
39
|
result_type,
|
36
40
|
columns,
|
37
|
-
} =
|
41
|
+
} = parse_parquet_rows_args(&ruby, args)?;
|
38
42
|
|
39
43
|
if !ruby.block_given() {
|
40
|
-
return
|
44
|
+
return create_row_enumerator(RowEnumeratorArgs {
|
41
45
|
rb_self,
|
42
46
|
to_read,
|
43
47
|
result_type,
|
@@ -88,7 +92,7 @@ pub fn parse_parquet<'a>(
|
|
88
92
|
})?;
|
89
93
|
}
|
90
94
|
|
91
|
-
let iter: Box<dyn Iterator<Item =
|
95
|
+
let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
|
92
96
|
"hash" => {
|
93
97
|
let headers = OnceLock::new();
|
94
98
|
let headers_clone = headers.clone();
|
@@ -97,10 +101,14 @@ pub fn parse_parquet<'a>(
|
|
97
101
|
row.ok().map(|row| {
|
98
102
|
let headers = headers_clone.get_or_init(|| {
|
99
103
|
let column_count = row.get_column_iter().count();
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
+
|
105
|
+
let mut header_string = Vec::with_capacity(column_count);
|
106
|
+
for (k, _) in row.get_column_iter() {
|
107
|
+
header_string.push(k.to_owned());
|
108
|
+
}
|
109
|
+
|
110
|
+
let headers = StringCache::intern_many(&header_string).unwrap();
|
111
|
+
|
104
112
|
headers
|
105
113
|
});
|
106
114
|
|
@@ -112,7 +120,7 @@ pub fn parse_parquet<'a>(
|
|
112
120
|
map
|
113
121
|
})
|
114
122
|
})
|
115
|
-
.map(
|
123
|
+
.map(RowRecord::Map);
|
116
124
|
|
117
125
|
Box::new(HeaderCacheCleanupIter {
|
118
126
|
inner: iter,
|
@@ -129,7 +137,202 @@ pub fn parse_parquet<'a>(
|
|
129
137
|
vec
|
130
138
|
})
|
131
139
|
})
|
132
|
-
.map(
|
140
|
+
.map(RowRecord::Vec),
|
141
|
+
),
|
142
|
+
_ => {
|
143
|
+
return Err(MagnusError::new(
|
144
|
+
ruby.exception_runtime_error(),
|
145
|
+
"Invalid result type",
|
146
|
+
))
|
147
|
+
}
|
148
|
+
};
|
149
|
+
|
150
|
+
Ok(Yield::Iter(iter))
|
151
|
+
}
|
152
|
+
|
153
|
+
#[inline]
|
154
|
+
pub fn parse_parquet_columns<'a>(
|
155
|
+
rb_self: Value,
|
156
|
+
args: &[Value],
|
157
|
+
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
158
|
+
let original = unsafe { Ruby::get_unchecked() };
|
159
|
+
let ruby: &'static Ruby = Box::leak(Box::new(original));
|
160
|
+
|
161
|
+
let ParquetColumnsArgs {
|
162
|
+
to_read,
|
163
|
+
result_type,
|
164
|
+
columns,
|
165
|
+
batch_size,
|
166
|
+
} = parse_parquet_columns_args(&ruby, args)?;
|
167
|
+
|
168
|
+
if !ruby.block_given() {
|
169
|
+
return create_column_enumerator(ColumnEnumeratorArgs {
|
170
|
+
rb_self,
|
171
|
+
to_read,
|
172
|
+
result_type,
|
173
|
+
columns,
|
174
|
+
batch_size,
|
175
|
+
});
|
176
|
+
}
|
177
|
+
|
178
|
+
let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
|
179
|
+
let path_string = to_read.to_r_string()?;
|
180
|
+
let file_path = unsafe { path_string.as_str()? };
|
181
|
+
let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
|
182
|
+
|
183
|
+
let mut builder =
|
184
|
+
ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
|
185
|
+
let schema = builder.schema().clone();
|
186
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
187
|
+
|
188
|
+
// If columns are specified, project only those columns
|
189
|
+
if let Some(cols) = &columns {
|
190
|
+
// Get the parquet schema
|
191
|
+
let parquet_schema = builder.parquet_schema();
|
192
|
+
|
193
|
+
// Create a projection mask from column names
|
194
|
+
let projection =
|
195
|
+
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
196
|
+
|
197
|
+
builder = builder.with_projection(projection);
|
198
|
+
}
|
199
|
+
|
200
|
+
if let Some(batch_size) = batch_size {
|
201
|
+
builder = builder.with_batch_size(batch_size);
|
202
|
+
}
|
203
|
+
|
204
|
+
let reader = builder.build().unwrap();
|
205
|
+
|
206
|
+
(reader, schema, num_rows)
|
207
|
+
} else if to_read.is_kind_of(ruby.class_io()) {
|
208
|
+
let raw_value = to_read.as_raw();
|
209
|
+
let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
|
210
|
+
.map_err(|_| {
|
211
|
+
ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
|
212
|
+
})?;
|
213
|
+
|
214
|
+
if fd < 0 {
|
215
|
+
return Err(ReaderError::InvalidFileDescriptor.into());
|
216
|
+
}
|
217
|
+
|
218
|
+
let file = unsafe { File::from_raw_fd(fd) };
|
219
|
+
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
220
|
+
|
221
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
|
222
|
+
let schema = builder.schema().clone();
|
223
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
224
|
+
|
225
|
+
if let Some(batch_size) = batch_size {
|
226
|
+
builder = builder.with_batch_size(batch_size);
|
227
|
+
}
|
228
|
+
|
229
|
+
// If columns are specified, project only those columns
|
230
|
+
if let Some(cols) = &columns {
|
231
|
+
// Get the parquet schema
|
232
|
+
let parquet_schema = builder.parquet_schema();
|
233
|
+
|
234
|
+
// Create a projection mask from column names
|
235
|
+
let projection =
|
236
|
+
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
237
|
+
|
238
|
+
builder = builder.with_projection(projection);
|
239
|
+
}
|
240
|
+
|
241
|
+
let reader = builder.build().unwrap();
|
242
|
+
|
243
|
+
(reader, schema, num_rows)
|
244
|
+
} else {
|
245
|
+
let readable = SeekableRubyValue(Opaque::from(to_read));
|
246
|
+
|
247
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
|
248
|
+
let schema = builder.schema().clone();
|
249
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
250
|
+
|
251
|
+
if let Some(batch_size) = batch_size {
|
252
|
+
builder = builder.with_batch_size(batch_size);
|
253
|
+
}
|
254
|
+
|
255
|
+
// If columns are specified, project only those columns
|
256
|
+
if let Some(cols) = &columns {
|
257
|
+
// Get the parquet schema
|
258
|
+
let parquet_schema = builder.parquet_schema();
|
259
|
+
|
260
|
+
// Create a projection mask from column names
|
261
|
+
let projection =
|
262
|
+
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
263
|
+
|
264
|
+
builder = builder.with_projection(projection);
|
265
|
+
}
|
266
|
+
|
267
|
+
let reader = builder.build().unwrap();
|
268
|
+
|
269
|
+
(reader, schema, num_rows)
|
270
|
+
};
|
271
|
+
|
272
|
+
if num_rows == 0 {
|
273
|
+
let mut map =
|
274
|
+
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
275
|
+
for field in schema.fields() {
|
276
|
+
map.insert(
|
277
|
+
StringCache::intern(field.name().to_string()).unwrap(),
|
278
|
+
vec![],
|
279
|
+
);
|
280
|
+
}
|
281
|
+
let column_record = vec![ColumnRecord::Map(map)];
|
282
|
+
return Ok(Yield::Iter(Box::new(column_record.into_iter())));
|
283
|
+
}
|
284
|
+
|
285
|
+
let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
|
286
|
+
"hash" => {
|
287
|
+
let headers = OnceLock::new();
|
288
|
+
let headers_clone = headers.clone();
|
289
|
+
let iter = batch_reader
|
290
|
+
.filter_map(move |batch| {
|
291
|
+
batch.ok().map(|batch| {
|
292
|
+
let headers = headers_clone.get_or_init(|| {
|
293
|
+
let schema = batch.schema();
|
294
|
+
let fields = schema.fields();
|
295
|
+
let mut header_string = Vec::with_capacity(fields.len());
|
296
|
+
for field in fields {
|
297
|
+
header_string.push(field.name().to_owned());
|
298
|
+
}
|
299
|
+
StringCache::intern_many(&header_string).unwrap()
|
300
|
+
});
|
301
|
+
|
302
|
+
let mut map =
|
303
|
+
HashMap::with_capacity_and_hasher(headers.len(), Default::default());
|
304
|
+
|
305
|
+
batch.columns().iter().enumerate().for_each(|(i, column)| {
|
306
|
+
let header = headers[i];
|
307
|
+
let values = ParquetValueVec::try_from(column.clone()).unwrap();
|
308
|
+
map.insert(header, values.into_inner());
|
309
|
+
});
|
310
|
+
|
311
|
+
map
|
312
|
+
})
|
313
|
+
})
|
314
|
+
.map(ColumnRecord::Map);
|
315
|
+
|
316
|
+
Box::new(HeaderCacheCleanupIter {
|
317
|
+
inner: iter,
|
318
|
+
headers,
|
319
|
+
})
|
320
|
+
}
|
321
|
+
"array" => Box::new(
|
322
|
+
batch_reader
|
323
|
+
.filter_map(|batch| {
|
324
|
+
batch.ok().map(|batch| {
|
325
|
+
batch
|
326
|
+
.columns()
|
327
|
+
.into_iter()
|
328
|
+
.map(|column| {
|
329
|
+
let values = ParquetValueVec::try_from(column.clone()).unwrap();
|
330
|
+
values.into_inner()
|
331
|
+
})
|
332
|
+
.collect()
|
333
|
+
})
|
334
|
+
})
|
335
|
+
.map(ColumnRecord::Vec),
|
133
336
|
),
|
134
337
|
_ => {
|
135
338
|
return Err(MagnusError::new(
|
@@ -172,6 +375,8 @@ pub enum ReaderError {
|
|
172
375
|
HeaderIntern(#[from] CacheError),
|
173
376
|
#[error("Ruby error: {0}")]
|
174
377
|
Ruby(String),
|
378
|
+
#[error("Parquet error: {0}")]
|
379
|
+
Parquet(#[from] ParquetError),
|
175
380
|
}
|
176
381
|
|
177
382
|
impl From<MagnusError> for ReaderError {
|