parquet 0.0.2 → 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -3,13 +3,17 @@
3
3
  // =============================================================================
4
4
  use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
5
5
  use crate::{
6
- create_enumerator, utils::*, EnumeratorArgs, ForgottenFileHandle, ParquetField, Record,
6
+ create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
7
+ ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
7
8
  SeekableRubyValue,
8
9
  };
9
10
  use ahash::RandomState;
10
11
  use magnus::rb_sys::AsRawValue;
11
12
  use magnus::value::{Opaque, ReprValue};
12
13
  use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
14
+ use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
15
+ use parquet::arrow::ProjectionMask;
16
+ use parquet::errors::ParquetError;
13
17
  use parquet::file::reader::FileReader;
14
18
  use parquet::file::reader::SerializedFileReader;
15
19
  use parquet::record::reader::RowIter as ParquetRowIter;
@@ -23,21 +27,21 @@ use std::sync::OnceLock;
23
27
  use thiserror::Error;
24
28
 
25
29
  #[inline]
26
- pub fn parse_parquet<'a>(
30
+ pub fn parse_parquet_rows<'a>(
27
31
  rb_self: Value,
28
32
  args: &[Value],
29
- ) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
33
+ ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
30
34
  let original = unsafe { Ruby::get_unchecked() };
31
35
  let ruby: &'static Ruby = Box::leak(Box::new(original));
32
36
 
33
- let ParquetArgs {
37
+ let ParquetRowsArgs {
34
38
  to_read,
35
39
  result_type,
36
40
  columns,
37
- } = parse_parquet_args(&ruby, args)?;
41
+ } = parse_parquet_rows_args(&ruby, args)?;
38
42
 
39
43
  if !ruby.block_given() {
40
- return create_enumerator(EnumeratorArgs {
44
+ return create_row_enumerator(RowEnumeratorArgs {
41
45
  rb_self,
42
46
  to_read,
43
47
  result_type,
@@ -88,7 +92,7 @@ pub fn parse_parquet<'a>(
88
92
  })?;
89
93
  }
90
94
 
91
- let iter: Box<dyn Iterator<Item = Record<RandomState>>> = match result_type.as_str() {
95
+ let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
92
96
  "hash" => {
93
97
  let headers = OnceLock::new();
94
98
  let headers_clone = headers.clone();
@@ -97,10 +101,14 @@ pub fn parse_parquet<'a>(
97
101
  row.ok().map(|row| {
98
102
  let headers = headers_clone.get_or_init(|| {
99
103
  let column_count = row.get_column_iter().count();
100
- let mut headers = Vec::with_capacity(column_count);
101
- row.get_column_iter().for_each(|(k, _)| {
102
- headers.push(StringCache::intern(k.to_owned()).unwrap())
103
- });
104
+
105
+ let mut header_string = Vec::with_capacity(column_count);
106
+ for (k, _) in row.get_column_iter() {
107
+ header_string.push(k.to_owned());
108
+ }
109
+
110
+ let headers = StringCache::intern_many(&header_string).unwrap();
111
+
104
112
  headers
105
113
  });
106
114
 
@@ -112,7 +120,7 @@ pub fn parse_parquet<'a>(
112
120
  map
113
121
  })
114
122
  })
115
- .map(Record::Map);
123
+ .map(RowRecord::Map);
116
124
 
117
125
  Box::new(HeaderCacheCleanupIter {
118
126
  inner: iter,
@@ -129,7 +137,202 @@ pub fn parse_parquet<'a>(
129
137
  vec
130
138
  })
131
139
  })
132
- .map(Record::Vec),
140
+ .map(RowRecord::Vec),
141
+ ),
142
+ _ => {
143
+ return Err(MagnusError::new(
144
+ ruby.exception_runtime_error(),
145
+ "Invalid result type",
146
+ ))
147
+ }
148
+ };
149
+
150
+ Ok(Yield::Iter(iter))
151
+ }
152
+
153
+ #[inline]
154
+ pub fn parse_parquet_columns<'a>(
155
+ rb_self: Value,
156
+ args: &[Value],
157
+ ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
158
+ let original = unsafe { Ruby::get_unchecked() };
159
+ let ruby: &'static Ruby = Box::leak(Box::new(original));
160
+
161
+ let ParquetColumnsArgs {
162
+ to_read,
163
+ result_type,
164
+ columns,
165
+ batch_size,
166
+ } = parse_parquet_columns_args(&ruby, args)?;
167
+
168
+ if !ruby.block_given() {
169
+ return create_column_enumerator(ColumnEnumeratorArgs {
170
+ rb_self,
171
+ to_read,
172
+ result_type,
173
+ columns,
174
+ batch_size,
175
+ });
176
+ }
177
+
178
+ let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
179
+ let path_string = to_read.to_r_string()?;
180
+ let file_path = unsafe { path_string.as_str()? };
181
+ let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
182
+
183
+ let mut builder =
184
+ ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
185
+ let schema = builder.schema().clone();
186
+ let num_rows = builder.metadata().file_metadata().num_rows();
187
+
188
+ // If columns are specified, project only those columns
189
+ if let Some(cols) = &columns {
190
+ // Get the parquet schema
191
+ let parquet_schema = builder.parquet_schema();
192
+
193
+ // Create a projection mask from column names
194
+ let projection =
195
+ ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
196
+
197
+ builder = builder.with_projection(projection);
198
+ }
199
+
200
+ if let Some(batch_size) = batch_size {
201
+ builder = builder.with_batch_size(batch_size);
202
+ }
203
+
204
+ let reader = builder.build().unwrap();
205
+
206
+ (reader, schema, num_rows)
207
+ } else if to_read.is_kind_of(ruby.class_io()) {
208
+ let raw_value = to_read.as_raw();
209
+ let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
210
+ .map_err(|_| {
211
+ ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
212
+ })?;
213
+
214
+ if fd < 0 {
215
+ return Err(ReaderError::InvalidFileDescriptor.into());
216
+ }
217
+
218
+ let file = unsafe { File::from_raw_fd(fd) };
219
+ let file = ForgottenFileHandle(ManuallyDrop::new(file));
220
+
221
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
222
+ let schema = builder.schema().clone();
223
+ let num_rows = builder.metadata().file_metadata().num_rows();
224
+
225
+ if let Some(batch_size) = batch_size {
226
+ builder = builder.with_batch_size(batch_size);
227
+ }
228
+
229
+ // If columns are specified, project only those columns
230
+ if let Some(cols) = &columns {
231
+ // Get the parquet schema
232
+ let parquet_schema = builder.parquet_schema();
233
+
234
+ // Create a projection mask from column names
235
+ let projection =
236
+ ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
237
+
238
+ builder = builder.with_projection(projection);
239
+ }
240
+
241
+ let reader = builder.build().unwrap();
242
+
243
+ (reader, schema, num_rows)
244
+ } else {
245
+ let readable = SeekableRubyValue(Opaque::from(to_read));
246
+
247
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
248
+ let schema = builder.schema().clone();
249
+ let num_rows = builder.metadata().file_metadata().num_rows();
250
+
251
+ if let Some(batch_size) = batch_size {
252
+ builder = builder.with_batch_size(batch_size);
253
+ }
254
+
255
+ // If columns are specified, project only those columns
256
+ if let Some(cols) = &columns {
257
+ // Get the parquet schema
258
+ let parquet_schema = builder.parquet_schema();
259
+
260
+ // Create a projection mask from column names
261
+ let projection =
262
+ ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
263
+
264
+ builder = builder.with_projection(projection);
265
+ }
266
+
267
+ let reader = builder.build().unwrap();
268
+
269
+ (reader, schema, num_rows)
270
+ };
271
+
272
+ if num_rows == 0 {
273
+ let mut map =
274
+ HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
275
+ for field in schema.fields() {
276
+ map.insert(
277
+ StringCache::intern(field.name().to_string()).unwrap(),
278
+ vec![],
279
+ );
280
+ }
281
+ let column_record = vec![ColumnRecord::Map(map)];
282
+ return Ok(Yield::Iter(Box::new(column_record.into_iter())));
283
+ }
284
+
285
+ let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
286
+ "hash" => {
287
+ let headers = OnceLock::new();
288
+ let headers_clone = headers.clone();
289
+ let iter = batch_reader
290
+ .filter_map(move |batch| {
291
+ batch.ok().map(|batch| {
292
+ let headers = headers_clone.get_or_init(|| {
293
+ let schema = batch.schema();
294
+ let fields = schema.fields();
295
+ let mut header_string = Vec::with_capacity(fields.len());
296
+ for field in fields {
297
+ header_string.push(field.name().to_owned());
298
+ }
299
+ StringCache::intern_many(&header_string).unwrap()
300
+ });
301
+
302
+ let mut map =
303
+ HashMap::with_capacity_and_hasher(headers.len(), Default::default());
304
+
305
+ batch.columns().iter().enumerate().for_each(|(i, column)| {
306
+ let header = headers[i];
307
+ let values = ParquetValueVec::try_from(column.clone()).unwrap();
308
+ map.insert(header, values.into_inner());
309
+ });
310
+
311
+ map
312
+ })
313
+ })
314
+ .map(ColumnRecord::Map);
315
+
316
+ Box::new(HeaderCacheCleanupIter {
317
+ inner: iter,
318
+ headers,
319
+ })
320
+ }
321
+ "array" => Box::new(
322
+ batch_reader
323
+ .filter_map(|batch| {
324
+ batch.ok().map(|batch| {
325
+ batch
326
+ .columns()
327
+ .into_iter()
328
+ .map(|column| {
329
+ let values = ParquetValueVec::try_from(column.clone()).unwrap();
330
+ values.into_inner()
331
+ })
332
+ .collect()
333
+ })
334
+ })
335
+ .map(ColumnRecord::Vec),
133
336
  ),
134
337
  _ => {
135
338
  return Err(MagnusError::new(
@@ -172,6 +375,8 @@ pub enum ReaderError {
172
375
  HeaderIntern(#[from] CacheError),
173
376
  #[error("Ruby error: {0}")]
174
377
  Ruby(String),
378
+ #[error("Parquet error: {0}")]
379
+ Parquet(#[from] ParquetError),
175
380
  }
176
381
 
177
382
  impl From<MagnusError> for ReaderError {