parquet 0.0.2 → 0.0.3

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
@@ -3,13 +3,17 @@
 // =============================================================================
 use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
 use crate::{
-    create_enumerator, utils::*, EnumeratorArgs, ForgottenFileHandle, ParquetField, Record,
+    create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
+    ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
     SeekableRubyValue,
 };
 use ahash::RandomState;
 use magnus::rb_sys::AsRawValue;
 use magnus::value::{Opaque, ReprValue};
 use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
+use parquet::errors::ParquetError;
 use parquet::file::reader::FileReader;
 use parquet::file::reader::SerializedFileReader;
 use parquet::record::reader::RowIter as ParquetRowIter;
@@ -23,21 +27,21 @@ use std::sync::OnceLock;
 use thiserror::Error;
 
 #[inline]
-pub fn parse_parquet<'a>(
+pub fn parse_parquet_rows<'a>(
     rb_self: Value,
     args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = Record<RandomState>>>>, MagnusError> {
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
     let original = unsafe { Ruby::get_unchecked() };
     let ruby: &'static Ruby = Box::leak(Box::new(original));
 
-    let ParquetArgs {
+    let ParquetRowsArgs {
         to_read,
         result_type,
         columns,
-    } = parse_parquet_args(&ruby, args)?;
+    } = parse_parquet_rows_args(&ruby, args)?;
 
     if !ruby.block_given() {
-        return create_enumerator(EnumeratorArgs {
+        return create_row_enumerator(RowEnumeratorArgs {
            rb_self,
            to_read,
            result_type,
@@ -88,7 +92,7 @@ pub fn parse_parquet<'a>(
         })?;
     }
 
-    let iter: Box<dyn Iterator<Item = Record<RandomState>>> = match result_type.as_str() {
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
         "hash" => {
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
@@ -97,10 +101,14 @@ pub fn parse_parquet<'a>(
                 row.ok().map(|row| {
                     let headers = headers_clone.get_or_init(|| {
                         let column_count = row.get_column_iter().count();
-                        let mut headers = Vec::with_capacity(column_count);
-                        row.get_column_iter().for_each(|(k, _)| {
-                            headers.push(StringCache::intern(k.to_owned()).unwrap())
-                        });
+
+                        let mut header_string = Vec::with_capacity(column_count);
+                        for (k, _) in row.get_column_iter() {
+                            header_string.push(k.to_owned());
+                        }
+
+                        let headers = StringCache::intern_many(&header_string).unwrap();
+
                         headers
                     });
 
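Note: the switch from per-key StringCache::intern to a single StringCache::intern_many call interns the whole header row in one batch rather than one cache access per column. The gem's real StringCache lives in header_cache.rs and is not shown in this diff; the following is only a hypothetical sketch of what a batched interner of this shape could look like, assuming a mutex-guarded set of leaked strings:

use std::collections::HashSet;
use std::sync::Mutex;

// Hypothetical cache for illustration only; the gem's actual StringCache
// (with its CacheError and cleanup behavior) is defined elsewhere.
struct StringCacheSketch {
    cache: Mutex<HashSet<&'static str>>,
}

impl StringCacheSketch {
    // Interning a whole header row at once takes the lock a single time,
    // where per-key interning would pay one lock round-trip per column.
    fn intern_many(&self, keys: &[String]) -> Vec<&'static str> {
        let mut guard = self.cache.lock().unwrap();
        keys.iter()
            .map(|k| match guard.get(k.as_str()) {
                Some(&interned) => interned,
                None => {
                    let leaked: &'static str = Box::leak(k.clone().into_boxed_str());
                    guard.insert(leaked);
                    leaked
                }
            })
            .collect()
    }
}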
@@ -112,7 +120,7 @@ pub fn parse_parquet<'a>(
                     map
                 })
             })
-            .map(Record::Map);
+            .map(RowRecord::Map);
 
             Box::new(HeaderCacheCleanupIter {
                 inner: iter,
@@ -129,7 +137,202 @@ pub fn parse_parquet<'a>(
                     vec
                 })
             })
-            .map(Record::Vec),
+            .map(RowRecord::Vec),
+        ),
+        _ => {
+            return Err(MagnusError::new(
+                ruby.exception_runtime_error(),
+                "Invalid result type",
+            ))
+        }
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+#[inline]
+pub fn parse_parquet_columns<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+    let original = unsafe { Ruby::get_unchecked() };
+    let ruby: &'static Ruby = Box::leak(Box::new(original));
+
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        });
+    }
+
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
+
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        for field in schema.fields() {
+            map.insert(
+                StringCache::intern(field.name().to_string()).unwrap(),
+                vec![],
+            );
+        }
+        let column_record = vec![ColumnRecord::Map(map)];
+        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
+    }
+
+    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
+        "hash" => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader
+                .filter_map(move |batch| {
+                    batch.ok().map(|batch| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string).unwrap()
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+
+                        batch.columns().iter().enumerate().for_each(|(i, column)| {
+                            let header = headers[i];
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            map.insert(header, values.into_inner());
+                        });
+
+                        map
+                    })
+                })
+                .map(ColumnRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        "array" => Box::new(
+            batch_reader
+                .filter_map(|batch| {
+                    batch.ok().map(|batch| {
+                        batch
+                            .columns()
+                            .into_iter()
+                            .map(|column| {
+                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                                values.into_inner()
+                            })
+                            .collect()
+                    })
+                })
+                .map(ColumnRecord::Vec),
         ),
         _ => {
             return Err(MagnusError::new(
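Note: the new column path reads Arrow record batches via the parquet crate's ParquetRecordBatchReaderBuilder, instead of the row-oriented SerializedFileReader used by the rows path. A standalone sketch of the same builder pattern, with a placeholder file path and column names:

use std::fs::File;

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // "data.parquet", "id", and "name" are placeholders.
    let file = File::open("data.parquet")?;
    let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Project a subset of columns by name, as the `columns` option does above.
    let projection = ProjectionMask::columns(builder.parquet_schema(), ["id", "name"]);
    builder = builder.with_projection(projection).with_batch_size(1024);

    // Each item is an arrow RecordBatch holding at most `batch_size` rows.
    for batch in builder.build()? {
        let batch = batch?;
        println!("columns: {}, rows: {}", batch.num_columns(), batch.num_rows());
    }
    Ok(())
}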
@@ -172,6 +375,8 @@ pub enum ReaderError {
     HeaderIntern(#[from] CacheError),
     #[error("Ruby error: {0}")]
     Ruby(String),
+    #[error("Parquet error: {0}")]
+    Parquet(#[from] ParquetError),
 }
 
 impl From<MagnusError> for ReaderError {
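Note: with the #[from] attribute, thiserror derives impl From<ParquetError> for ReaderError, so `?` can perform the conversion that the reader code above spells out with `.map_err(|e| ReaderError::Parquet(e))`. A minimal sketch; the FileOpen variant wrapping std::io::Error is an assumption inferred from the map_err call in this diff:

use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
use parquet::errors::ParquetError;
use thiserror::Error;

#[derive(Error, Debug)]
pub enum ReaderError {
    // Assumed shape of the existing variant, for illustration.
    #[error("Failed to open file: {0}")]
    FileOpen(#[from] std::io::Error),
    #[error("Parquet error: {0}")]
    Parquet(#[from] ParquetError),
}

// `?` now converts both error types into ReaderError automatically.
fn open_reader(path: &str) -> Result<ParquetRecordBatchReader, ReaderError> {
    let file = std::fs::File::open(path)?; // io::Error -> ReaderError::FileOpen
    Ok(ParquetRecordBatchReaderBuilder::try_new(file)?.build()?) // ParquetError -> ReaderError::Parquet
}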