parquet 0.5.9 → 0.5.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: f1a1f7c250b960dbe334145a537e865889fbc759e7b8bfbafcbadc77689972cd
-   data.tar.gz: 22116ec0b9fe89f0ad08a3674267bd00b141170b889091b476f3aab0d6be88a8
+   metadata.gz: 114891cfa5fa190e1f00d44803327f1c90cc11f64ba23f7f2a9cc9f9379da787
+   data.tar.gz: 9168b2be960faa93ce9c84d170c6e8f73819535bcedbf3d3b26869ff9829ecc6
  SHA512:
-   metadata.gz: ef8485d03247dd0d31993a774117669c1aaef5b875e7cb5c6f4c53e237a72fb81113ea35251426a21ea1ba24f8ae568bd2c3a158e6a45ce2416a308251d0f467
-   data.tar.gz: 672f38dfbf703dae996283fba8d137529e3089f569797df87feaac32fb0f956ea7c4d7ae57032715d1a21bd5bfa4dd728c2a3fe80174fb5fc0abdef51c73110a
+   metadata.gz: f07f99a188ac5fa0663616fba00b1990a2cbd6bb14462383915f0e1617c26c5ca481840c16179958f2b3760b334f176e2e4542d95e3cc922379948ac2b0bfa61
+   data.tar.gz: 42c7b0779d6e3fa46addc5fa92420f326418a54962d391e9b063db8378f8a5f8c2916b43f356649fc127e8fc582aa1e98d7afd71f0bc5f9700a0664ed46313f6
data/Cargo.lock CHANGED
@@ -126,6 +126,7 @@ dependencies = [
  "arrow-data",
  "arrow-schema",
  "flatbuffers",
+ "lz4_flex",
  ]

  [[package]]
@@ -842,6 +843,8 @@ version = "0.1.0"
  dependencies = [
  "ahash",
  "arrow-array",
+ "arrow-buffer",
+ "arrow-ipc",
  "arrow-schema",
  "bytes",
  "either",
@@ -12,6 +12,8 @@ rb-sys-env = "^0.2"
  [dependencies]
  ahash = "0.8"
  arrow-array = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
+ arrow-buffer = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
+ arrow-ipc = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time", features = ["lz4"] }
  arrow-schema = { git = "https://github.com/njaremko/arrow-rs", branch = "nathan/fix-time" }
  bytes = "^1.9"
  either = "1.9"
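The new arrow-ipc dependency is built with its "lz4" feature (which is why lz4_flex now appears in Cargo.lock); that feature is what gives arrow-ipc LZ4_FRAME (de)compression support, needed by the Arrow IPC readers added later in this diff when input batches are compressed. As a rough, stand-alone sketch of the write side with stock arrow-rs (not code from the gem; the schema, values, and "example.arrow" path are invented for illustration):

use std::fs::File;
use std::sync::Arc;

use arrow_array::{ArrayRef, Int64Array, RecordBatch};
use arrow_ipc::writer::{FileWriter, IpcWriteOptions};
use arrow_ipc::CompressionType;
use arrow_schema::{DataType, Field, Schema};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Hypothetical one-column schema and batch, purely for illustration.
    let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int64, false)]));
    let column: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
    let batch = RecordBatch::try_new(schema.clone(), vec![column])?;

    // LZ4_FRAME-compressed batches require arrow-ipc to be compiled with the "lz4" feature.
    let options =
        IpcWriteOptions::default().try_with_compression(Some(CompressionType::LZ4_FRAME))?;
    let mut writer =
        FileWriter::try_new_with_options(File::create("example.arrow")?, &schema, options)?;
    writer.write(&batch)?;
    writer.finish()?;
    Ok(())
}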
data/ext/parquet/build.rs CHANGED
@@ -1,5 +1,5 @@
  pub fn main() -> Result<(), Box<dyn std::error::Error>> {
-     let _rb_env = rb_sys_env::activate()?;
+     rb_sys_env::activate()?;

      Ok(())
  }
@@ -19,6 +19,9 @@ use writer::write_rows;
  /// Initializes the Ruby extension and defines methods.
  #[magnus::init]
  fn init(ruby: &Ruby) -> Result<(), Error> {
+     // Require 'time' for Time.parse method
+     ruby.require("time")?;
+
      let module = ruby.define_module("Parquet")?;
      module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
      module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
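The registration calls above follow magnus's usual pattern for wiring Rust functions into a Ruby module. A minimal stand-alone skeleton of that pattern is sketched below; it is not the gem's code (the "Example" module and add function are invented), and it uses magnus::function! for a plain function where the gem uses magnus::method! because its entry points also receive the Ruby receiver:

use magnus::{function, prelude::*, Error, Ruby};

// Hypothetical function exposed to Ruby, purely for illustration.
fn add(a: i64, b: i64) -> i64 {
    a + b
}

#[magnus::init]
fn init(ruby: &Ruby) -> Result<(), Error> {
    // Load Ruby's stdlib 'time' so Time.parse is available to the extension.
    ruby.require("time")?;

    let module = ruby.define_module("Example")?;
    module.define_module_function("add", function!(add, 2))?;
    Ok(())
}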
@@ -0,0 +1,579 @@
+ use crate::header_cache::StringCache;
+ use crate::logger::RubyLogger;
+ use crate::types::ArrayWrapper;
+ use crate::types::{
+     ColumnRecord, ParquetGemError, ParquetValueVec, ParserResultType, RowRecord, TryIntoValue,
+ };
+ use ahash::RandomState;
+ use arrow_array::RecordBatch;
+ use arrow_ipc::reader::{FileReader, StreamReader};
+ use arrow_schema::Schema;
+ use magnus::{Ruby, Value};
+ use std::collections::HashMap;
+ use std::fs::File;
+ use std::io::Read;
+ use std::rc::Rc;
+ use std::sync::{Arc, OnceLock};
+
+ /// Process Arrow IPC file data for column-based parsing
+ pub fn process_arrow_column_data<R: Read>(
+     ruby: Rc<Ruby>,
+     reader: StreamReader<R>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     _batch_size: Option<usize>,
+     strict: bool,
+     ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     let schema = reader.schema();
+     ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
+
+     // Filter schema if columns are specified
+     let _filtered_schema = if let Some(cols) = columns {
+         let mut fields = Vec::new();
+         for field in schema.fields() {
+             if cols.contains(&field.name().to_string()) {
+                 fields.push(field.clone());
+             }
+         }
+         Arc::new(Schema::new(fields))
+     } else {
+         schema.clone()
+     };
+
+     match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 let local_headers = headers
+                     .get_or_init(|| {
+                         let schema = batch.schema();
+                         let fields = schema.fields();
+                         let mut header_string = Vec::with_capacity(fields.len());
+                         for field in fields {
+                             header_string.push(field.name().to_owned());
+                         }
+                         StringCache::intern_many(&header_string)
+                     })
+                     .as_ref()
+                     .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                 let mut map =
+                     HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
+
+                 batch
+                     .columns()
+                     .iter()
+                     .enumerate()
+                     .try_for_each(|(i, column)| {
+                         let header = local_headers[i];
+                         let values = ParquetValueVec::try_from(ArrayWrapper {
+                             array: column,
+                             strict,
+                         })?;
+                         map.insert(header, values.into_inner());
+                         Ok::<_, ParquetGemError>(())
+                     })?;
+
+                 let record = ColumnRecord::Map::<RandomState>(map);
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+         ParserResultType::Array => {
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 let vec = batch
+                     .columns()
+                     .iter()
+                     .map(|column| {
+                         let values = ParquetValueVec::try_from(ArrayWrapper {
+                             array: column,
+                             strict,
+                         })?;
+                         Ok::<_, ParquetGemError>(values.into_inner())
+                     })
+                     .collect::<Result<Vec<_>, _>>()?;
+
+                 let record = ColumnRecord::Vec::<RandomState>(vec);
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Process Arrow IPC file data for row-based parsing
+ pub fn process_arrow_row_data<R: Read>(
+     ruby: Rc<Ruby>,
+     reader: StreamReader<R>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     strict: bool,
+     ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     let schema = reader.schema();
+     ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
+
+     match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 let local_headers = headers
+                     .get_or_init(|| {
+                         let schema = batch.schema();
+                         let fields = schema.fields();
+                         let mut header_string = Vec::with_capacity(fields.len());
+                         for field in fields {
+                             header_string.push(field.name().to_owned());
+                         }
+                         StringCache::intern_many(&header_string)
+                     })
+                     .as_ref()
+                     .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                 // Convert columnar data to rows
+                 for row_idx in 0..batch.num_rows() {
+                     let mut map = HashMap::with_capacity_and_hasher(
+                         local_headers.len(),
+                         RandomState::default(),
+                     );
+
+                     for (col_idx, column) in batch.columns().iter().enumerate() {
+                         let header = local_headers[col_idx];
+                         let value = extract_value_at_index(column, row_idx, strict)?;
+                         map.insert(header, value);
+                     }
+
+                     let record = RowRecord::Map::<RandomState>(map);
+                     let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                 }
+             }
+         }
+         ParserResultType::Array => {
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 // Convert columnar data to rows
+                 for row_idx in 0..batch.num_rows() {
+                     let mut row_vec = Vec::with_capacity(batch.num_columns());
+
+                     for column in batch.columns() {
+                         let value = extract_value_at_index(column, row_idx, strict)?;
+                         row_vec.push(value);
+                     }
+
+                     let record = RowRecord::Vec::<RandomState>(row_vec);
+                     let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                 }
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Process Arrow IPC file with FileReader for row-based parsing
+ pub fn process_arrow_file_row_data(
+     ruby: Rc<Ruby>,
+     reader: FileReader<File>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     strict: bool,
+     ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     let schema = reader.schema();
+     ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
+
+     match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 let local_headers = headers
+                     .get_or_init(|| {
+                         let schema = batch.schema();
+                         let fields = schema.fields();
+                         let mut header_string = Vec::with_capacity(fields.len());
+                         for field in fields {
+                             header_string.push(field.name().to_owned());
+                         }
+                         StringCache::intern_many(&header_string)
+                     })
+                     .as_ref()
+                     .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                 // Convert columnar data to rows
+                 for row_idx in 0..batch.num_rows() {
+                     let mut map = HashMap::with_capacity_and_hasher(
+                         local_headers.len(),
+                         RandomState::default(),
+                     );
+
+                     for (col_idx, column) in batch.columns().iter().enumerate() {
+                         let header = local_headers[col_idx];
+                         let value = extract_value_at_index(column, row_idx, strict)?;
+                         map.insert(header, value);
+                     }
+
+                     let record = RowRecord::Map::<RandomState>(map);
+                     let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                 }
+             }
+         }
+         ParserResultType::Array => {
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 // Convert columnar data to rows
+                 for row_idx in 0..batch.num_rows() {
+                     let mut row_vec = Vec::with_capacity(batch.num_columns());
+
+                     for column in batch.columns() {
+                         let value = extract_value_at_index(column, row_idx, strict)?;
+                         row_vec.push(value);
+                     }
+
+                     let record = RowRecord::Vec::<RandomState>(row_vec);
+                     let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+                 }
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Process Arrow IPC file with FileReader (for seekable sources)
+ pub fn process_arrow_file_column_data(
+     ruby: Rc<Ruby>,
+     file: File,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     _batch_size: Option<usize>,
+     strict: bool,
+     ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     let reader =
+         FileReader::try_new(file, None).map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+     let schema = reader.schema();
+     ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
+
+     // FileReader implements Iterator<Item = Result<RecordBatch, ArrowError>>
+     match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 let local_headers = headers
+                     .get_or_init(|| {
+                         let schema = batch.schema();
+                         let fields = schema.fields();
+                         let mut header_string = Vec::with_capacity(fields.len());
+                         for field in fields {
+                             header_string.push(field.name().to_owned());
+                         }
+                         StringCache::intern_many(&header_string)
+                     })
+                     .as_ref()
+                     .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                 let mut map =
+                     HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
+
+                 batch
+                     .columns()
+                     .iter()
+                     .enumerate()
+                     .try_for_each(|(i, column)| {
+                         let header = local_headers[i];
+                         let values = ParquetValueVec::try_from(ArrayWrapper {
+                             array: column,
+                             strict,
+                         })?;
+                         map.insert(header, values.into_inner());
+                         Ok::<_, ParquetGemError>(())
+                     })?;
+
+                 let record = ColumnRecord::Map::<RandomState>(map);
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+         ParserResultType::Array => {
+             for batch_result in reader {
+                 let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
+
+                 // Filter columns if needed
+                 let batch = if let Some(cols) = columns {
+                     filter_record_batch(&batch, cols)?
+                 } else {
+                     batch
+                 };
+
+                 let vec = batch
+                     .columns()
+                     .iter()
+                     .map(|column| {
+                         let values = ParquetValueVec::try_from(ArrayWrapper {
+                             array: column,
+                             strict,
+                         })?;
+                         Ok::<_, ParquetGemError>(values.into_inner())
+                     })
+                     .collect::<Result<Vec<_>, _>>()?;
+
+                 let record = ColumnRecord::Vec::<RandomState>(vec);
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Extract a single value from an Arrow array at a specific index
+ fn extract_value_at_index(
+     array: &Arc<dyn arrow_array::Array>,
+     index: usize,
+     strict: bool,
+ ) -> Result<crate::types::ParquetField, ParquetGemError> {
+     use crate::types::ParquetField;
+     use arrow_array::*;
+     use arrow_schema::DataType;
+     use parquet::record::Field;
+
+     // Convert Arrow array value at index to Parquet Field
+     let field = match array.data_type() {
+         DataType::Boolean => {
+             let arr = array.as_any().downcast_ref::<BooleanArray>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Bool(arr.value(index))
+             }
+         }
+         DataType::Int8 => {
+             let arr = array.as_any().downcast_ref::<Int8Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Byte(arr.value(index) as i8)
+             }
+         }
+         DataType::Int16 => {
+             let arr = array.as_any().downcast_ref::<Int16Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Short(arr.value(index))
+             }
+         }
+         DataType::Int32 => {
+             let arr = array.as_any().downcast_ref::<Int32Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Int(arr.value(index))
+             }
+         }
+         DataType::Int64 => {
+             let arr = array.as_any().downcast_ref::<Int64Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Long(arr.value(index))
+             }
+         }
+         DataType::UInt8 => {
+             let arr = array.as_any().downcast_ref::<UInt8Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::UByte(arr.value(index))
+             }
+         }
+         DataType::UInt16 => {
+             let arr = array.as_any().downcast_ref::<UInt16Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::UShort(arr.value(index))
+             }
+         }
+         DataType::UInt32 => {
+             let arr = array.as_any().downcast_ref::<UInt32Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::UInt(arr.value(index))
+             }
+         }
+         DataType::UInt64 => {
+             let arr = array.as_any().downcast_ref::<UInt64Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::ULong(arr.value(index))
+             }
+         }
+         DataType::Float32 => {
+             let arr = array.as_any().downcast_ref::<Float32Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Float(arr.value(index))
+             }
+         }
+         DataType::Float64 => {
+             let arr = array.as_any().downcast_ref::<Float64Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Double(arr.value(index))
+             }
+         }
+         DataType::Utf8 => {
+             let arr = array.as_any().downcast_ref::<StringArray>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Str(arr.value(index).to_string())
+             }
+         }
+         DataType::Binary => {
+             let arr = array.as_any().downcast_ref::<BinaryArray>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Bytes(arr.value(index).into())
+             }
+         }
+         DataType::Date32 => {
+             let arr = array.as_any().downcast_ref::<Date32Array>().unwrap();
+             if arr.is_null(index) {
+                 Field::Null
+             } else {
+                 Field::Date(arr.value(index))
+             }
+         }
+         DataType::Timestamp(unit, _tz) => match unit {
+             arrow_schema::TimeUnit::Millisecond => {
+                 let arr = array
+                     .as_any()
+                     .downcast_ref::<TimestampMillisecondArray>()
+                     .unwrap();
+                 if arr.is_null(index) {
+                     Field::Null
+                 } else {
+                     Field::TimestampMillis(arr.value(index))
+                 }
+             }
+             arrow_schema::TimeUnit::Microsecond => {
+                 let arr = array
+                     .as_any()
+                     .downcast_ref::<TimestampMicrosecondArray>()
+                     .unwrap();
+                 if arr.is_null(index) {
+                     Field::Null
+                 } else {
+                     Field::TimestampMicros(arr.value(index))
+                 }
+             }
+             _ => Field::Null,
+         },
+         // Add more type handling as needed
+         _ => Field::Null,
+     };
+
+     // For Arrow files, we don't have Parquet logical types, so we use defaults
+     Ok(ParquetField {
+         field,
+         converted_type: parquet::basic::ConvertedType::NONE,
+         logical_type: None,
+         strict,
+     })
+ }
+
+ /// Filter a RecordBatch to only include specified columns
+ fn filter_record_batch(
+     batch: &RecordBatch,
+     columns: &[String],
+ ) -> Result<RecordBatch, ParquetGemError> {
+     let schema = batch.schema();
+     let mut indices = Vec::new();
+     let mut fields = Vec::new();
+
+     for (i, field) in schema.fields().iter().enumerate() {
+         if columns.contains(&field.name().to_string()) {
+             indices.push(i);
+             fields.push(field.clone());
+         }
+     }
+
+     let new_schema = Arc::new(Schema::new(fields));
+     let new_columns: Vec<_> = indices.iter().map(|&i| batch.column(i).clone()).collect();
+
+     RecordBatch::try_new(new_schema, new_columns)
+         .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))
+ }
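Taken together, this new module reads Arrow IPC data through arrow-ipc's two readers: StreamReader for non-seekable input and FileReader for seekable files. A minimal stand-alone sketch of the FileReader iteration pattern the functions above rely on, using plain arrow-rs with no gem types (the "example.arrow" path is hypothetical):

use std::fs::File;

use arrow_ipc::reader::FileReader;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("example.arrow")?;
    // The second argument is an optional column projection (by index); None keeps every column.
    let reader = FileReader::try_new(file, None)?;
    println!("schema: {:?}", reader.schema());

    // FileReader implements Iterator<Item = Result<RecordBatch, ArrowError>>.
    for batch in reader {
        let batch = batch?;
        println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}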