parquet 0.0.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b88d6751418f21c4ec032d05b6d0a6e9dbd37304983ed80e1a290508c787d118
4
- data.tar.gz: 948702f38cad3c4d4e76efccbd9d7d8ad4c81366c4dcba2c71058cc4d013c237
3
+ metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
4
+ data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
5
5
  SHA512:
6
- metadata.gz: 30f90ee2f597aa6e2d5a84b8ab9780af3d71fa41d3a1152f47d7a12b34bc203b8ff06b04c3f929f689c93be9e962186a3e6c305f61724b36ad4e6ad551c11f49
7
- data.tar.gz: 5a83b007e0c4789c6cfde1f8037228b0b00f2f0ef7ea0f932d7eaafefb91669db422450bbfd923f4388e2bfc644cae57f514828a2e4a2868ee6a20b492af428e
6
+ metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
7
+ data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
data/Cargo.lock CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
387
387
  source = "registry+https://github.com/rust-lang/crates.io-index"
388
388
  checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
389
389
 
390
+ [[package]]
391
+ name = "errno"
392
+ version = "0.3.10"
393
+ source = "registry+https://github.com/rust-lang/crates.io-index"
394
+ checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
395
+ dependencies = [
396
+ "libc",
397
+ "windows-sys",
398
+ ]
399
+
400
+ [[package]]
401
+ name = "fastrand"
402
+ version = "2.3.0"
403
+ source = "registry+https://github.com/rust-lang/crates.io-index"
404
+ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
405
+
390
406
  [[package]]
391
407
  name = "flatbuffers"
392
408
  version = "24.12.23"
@@ -826,16 +842,6 @@ dependencies = [
826
842
  "wasm-bindgen",
827
843
  ]
828
844
 
829
- [[package]]
830
- name = "kanal"
831
- version = "0.1.0-pre8"
832
- source = "registry+https://github.com/rust-lang/crates.io-index"
833
- checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
834
- dependencies = [
835
- "futures-core",
836
- "lock_api",
837
- ]
838
-
839
845
  [[package]]
840
846
  name = "lazy_static"
841
847
  version = "1.5.0"
@@ -944,6 +950,12 @@ dependencies = [
944
950
  "libc",
945
951
  ]
946
952
 
953
+ [[package]]
954
+ name = "linux-raw-sys"
955
+ version = "0.4.15"
956
+ source = "registry+https://github.com/rust-lang/crates.io-index"
957
+ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
958
+
947
959
  [[package]]
948
960
  name = "litemap"
949
961
  version = "0.7.4"
@@ -975,18 +987,6 @@ dependencies = [
975
987
  "twox-hash",
976
988
  ]
977
989
 
978
- [[package]]
979
- name = "magnus"
980
- version = "0.6.4"
981
- source = "registry+https://github.com/rust-lang/crates.io-index"
982
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
983
- dependencies = [
984
- "magnus-macros",
985
- "rb-sys",
986
- "rb-sys-env",
987
- "seq-macro",
988
- ]
989
-
990
990
  [[package]]
991
991
  name = "magnus"
992
992
  version = "0.7.1"
@@ -1203,13 +1203,11 @@ dependencies = [
1203
1203
  "itertools 0.14.0",
1204
1204
  "jemallocator",
1205
1205
  "jiff",
1206
- "kanal",
1207
- "magnus 0.7.1",
1206
+ "magnus",
1208
1207
  "mimalloc",
1209
1208
  "parquet 54.0.0",
1210
1209
  "rb-sys",
1211
- "serde",
1212
- "serde_magnus",
1210
+ "tempfile",
1213
1211
  "thiserror",
1214
1212
  ]
1215
1213
 
@@ -1402,6 +1400,19 @@ dependencies = [
1402
1400
  "semver",
1403
1401
  ]
1404
1402
 
1403
+ [[package]]
1404
+ name = "rustix"
1405
+ version = "0.38.43"
1406
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1407
+ checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
1408
+ dependencies = [
1409
+ "bitflags 2.6.0",
1410
+ "errno",
1411
+ "libc",
1412
+ "linux-raw-sys",
1413
+ "windows-sys",
1414
+ ]
1415
+
1405
1416
  [[package]]
1406
1417
  name = "ryu"
1407
1418
  version = "1.0.18"
@@ -1467,17 +1478,6 @@ dependencies = [
1467
1478
  "serde",
1468
1479
  ]
1469
1480
 
1470
- [[package]]
1471
- name = "serde_magnus"
1472
- version = "0.8.1"
1473
- source = "registry+https://github.com/rust-lang/crates.io-index"
1474
- checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
1475
- dependencies = [
1476
- "magnus 0.6.4",
1477
- "serde",
1478
- "tap",
1479
- ]
1480
-
1481
1481
  [[package]]
1482
1482
  name = "shell-words"
1483
1483
  version = "1.1.0"
@@ -1567,10 +1567,18 @@ dependencies = [
1567
1567
  ]
1568
1568
 
1569
1569
  [[package]]
1570
- name = "tap"
1571
- version = "1.0.1"
1570
+ name = "tempfile"
1571
+ version = "3.15.0"
1572
1572
  source = "registry+https://github.com/rust-lang/crates.io-index"
1573
- checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
1573
+ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
1574
+ dependencies = [
1575
+ "cfg-if",
1576
+ "fastrand",
1577
+ "getrandom",
1578
+ "once_cell",
1579
+ "rustix",
1580
+ "windows-sys",
1581
+ ]
1574
1582
 
1575
1583
  [[package]]
1576
1584
  name = "thiserror"
data/Gemfile CHANGED
@@ -8,7 +8,7 @@ gemspec
8
8
 
9
9
  group :development do
10
10
  gem "benchmark-ips", "~> 2.12"
11
- # gem "polars-df"
11
+ gem "polars-df"
12
12
  gem "duckdb"
13
13
  end
14
14
 
data/README.md CHANGED
@@ -4,8 +4,6 @@
4
4
 
5
5
  This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) rust crate.
6
6
 
7
- At the moment, it only supports iterating rows as either a hash or an array.
8
-
9
7
  ## Usage
10
8
 
11
9
  This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
83
81
  - `batch_size`: Number of rows per batch (defaults to implementation-defined value)
84
82
 
85
83
  When no block is given, both methods return an Enumerator.
84
+
85
+ ### Writing Row-wise Data
86
+
87
+ The `write_rows` method allows you to write data row by row:
88
+
89
+ ```ruby
90
+ require "parquet"
91
+
92
+ # Define the schema for your data
93
+ schema = [
94
+ { "id" => "int64" },
95
+ { "name" => "string" },
96
+ { "score" => "double" }
97
+ ]
98
+
99
+ # Create an enumerator that yields arrays of row values
100
+ rows = [
101
+ [1, "Alice", 95.5],
102
+ [2, "Bob", 82.3],
103
+ [3, "Charlie", 88.7]
104
+ ].each
105
+
106
+ # Write to a file
107
+ Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
108
+
109
+ # Write to an IO object
110
+ File.open("data.parquet", "wb") do |file|
111
+ Parquet.write_rows(rows, schema: schema, write_to: file)
112
+ end
113
+
114
+ # Optionally specify batch size (default is 1000)
115
+ Parquet.write_rows(rows,
116
+ schema: schema,
117
+ write_to: "data.parquet",
118
+ batch_size: 500
119
+ )
120
+ ```
121
+
122
+ ### Writing Column-wise Data
123
+
124
+ The `write_columns` method provides a more efficient way to write data in column-oriented batches:
125
+
126
+ ```ruby
127
+ require "parquet"
128
+
129
+ # Define the schema
130
+ schema = [
131
+ { "id" => "int64" },
132
+ { "name" => "string" },
133
+ { "score" => "double" }
134
+ ]
135
+
136
+ # Create batches of column data
137
+ batches = [
138
+ # First batch
139
+ [
140
+ [1, 2], # id column
141
+ ["Alice", "Bob"], # name column
142
+ [95.5, 82.3] # score column
143
+ ],
144
+ # Second batch
145
+ [
146
+ [3], # id column
147
+ ["Charlie"], # name column
148
+ [88.7] # score column
149
+ ]
150
+ ]
151
+
152
+ # Create an enumerator from the batches
153
+ columns = batches.each
154
+
155
+ # Write to a parquet file
156
+ Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
157
+
158
+ # Write to an IO object
159
+ File.open("data.parquet", "wb") do |file|
160
+ Parquet.write_columns(columns, schema: schema, write_to: file)
161
+ end
162
+ ```
163
+
164
+ The following data types are supported in the schema:
165
+
166
+ - `int8`, `int16`, `int32`, `int64`
167
+ - `uint8`, `uint16`, `uint32`, `uint64`
168
+ - `float`, `double`
169
+ - `string`
170
+ - `binary`
171
+ - `boolean`
172
+ - `date32`
173
+ - `timestamp_millis`, `timestamp_micros`
174
+
175
+ Note: List and Map types are currently not supported.
@@ -8,19 +8,16 @@ crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
10
  ahash = "0.8"
11
- parquet = { version = "^54.0", features = ["json", "object_store"] }
12
- arrow-schema = "54.0.0"
13
11
  arrow-array = "54.0.0"
12
+ arrow-schema = "54.0.0"
14
13
  bytes = "^1.9"
15
- kanal = "0.1.0-pre8"
14
+ itertools = "^0.14"
15
+ jiff = "0.1.19"
16
16
  magnus = { version = "0.7", features = ["rb-sys"] }
17
+ parquet = { version = "^54.0", features = ["json", "object_store"] }
17
18
  rb-sys = "^0.9"
18
- serde = { version = "1.0", features = ["derive"] }
19
- serde_magnus = "0.8.1"
20
19
  thiserror = "2.0"
21
- itertools = "^0.14"
22
- jiff = "0.1.19"
23
-
20
+ tempfile = "^3.15"
24
21
 
25
22
  [target.'cfg(target_os = "linux")'.dependencies]
26
23
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -3,12 +3,12 @@ use magnus::{
3
3
  block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
4
  };
5
5
 
6
- use crate::{ColumnRecord, RowRecord};
6
+ use crate::{ColumnRecord, ParserResultType, RowRecord};
7
7
 
8
8
  pub struct RowEnumeratorArgs {
9
9
  pub rb_self: Value,
10
10
  pub to_read: Value,
11
- pub result_type: String,
11
+ pub result_type: ParserResultType,
12
12
  pub columns: Option<Vec<String>>,
13
13
  }
14
14
 
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
17
17
  args: RowEnumeratorArgs,
18
18
  ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
19
19
  let kwargs = RHash::new();
20
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
20
+ kwargs.aset(
21
+ Symbol::new("result_type"),
22
+ Symbol::new(args.result_type.to_string()),
23
+ )?;
21
24
  if let Some(columns) = args.columns {
22
25
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
23
26
  }
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
30
33
  pub struct ColumnEnumeratorArgs {
31
34
  pub rb_self: Value,
32
35
  pub to_read: Value,
33
- pub result_type: String,
36
+ pub result_type: ParserResultType,
34
37
  pub columns: Option<Vec<String>>,
35
38
  pub batch_size: Option<usize>,
36
39
  }
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
40
43
  args: ColumnEnumeratorArgs,
41
44
  ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
42
45
  let kwargs = RHash::new();
43
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
46
+ kwargs.aset(
47
+ Symbol::new("result_type"),
48
+ Symbol::new(args.result_type.to_string()),
49
+ )?;
44
50
  if let Some(columns) = args.columns {
45
51
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
46
52
  }
@@ -6,6 +6,7 @@ mod ruby_integration;
6
6
  mod ruby_reader;
7
7
  mod types;
8
8
  mod utils;
9
+ mod writer;
9
10
 
10
11
  use crate::enumerator::*;
11
12
  use crate::reader::*;
@@ -13,6 +14,8 @@ use crate::ruby_integration::*;
13
14
  use crate::types::*;
14
15
 
15
16
  use magnus::{Error, Ruby};
17
+ use writer::write_columns;
18
+ use writer::write_rows;
16
19
 
17
20
  /// Initializes the Ruby extension and defines methods.
18
21
  #[magnus::init]
@@ -20,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
20
23
  let module = ruby.define_module("Parquet")?;
21
24
  module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
22
25
  module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
26
+ module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
27
+ module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
23
28
  Ok(())
24
29
  }
@@ -0,0 +1,42 @@
1
+ mod parquet_column_reader;
2
+ mod parquet_row_reader;
3
+
4
+ use std::io;
5
+
6
+ use magnus::{Error as MagnusError, Ruby};
7
+ use thiserror::Error;
8
+
9
+ use crate::header_cache::CacheError;
10
+ pub use parquet_column_reader::parse_parquet_columns;
11
+ pub use parquet_row_reader::parse_parquet_rows;
12
+
13
+ #[derive(Error, Debug)]
14
+ pub enum ReaderError {
15
+ #[error("Failed to get file descriptor: {0}")]
16
+ FileDescriptor(String),
17
+ #[error("Invalid file descriptor")]
18
+ InvalidFileDescriptor,
19
+ #[error("Failed to open file: {0}")]
20
+ FileOpen(#[from] io::Error),
21
+ #[error("Failed to intern headers: {0}")]
22
+ HeaderIntern(#[from] CacheError),
23
+ #[error("Ruby error: {0}")]
24
+ Ruby(String),
25
+ #[error("Parquet error: {0}")]
26
+ Parquet(#[from] parquet::errors::ParquetError),
27
+ }
28
+
29
+ impl From<MagnusError> for ReaderError {
30
+ fn from(err: MagnusError) -> Self {
31
+ Self::Ruby(err.to_string())
32
+ }
33
+ }
34
+
35
+ impl From<ReaderError> for MagnusError {
36
+ fn from(err: ReaderError) -> Self {
37
+ MagnusError::new(
38
+ Ruby::get().unwrap().exception_runtime_error(),
39
+ err.to_string(),
40
+ )
41
+ }
42
+ }
@@ -1,11 +1,7 @@
1
- // =============================================================================
2
- // Imports and Dependencies
3
- // =============================================================================
4
1
  use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
5
2
  use crate::{
6
- create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
7
- ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
8
- SeekableRubyValue,
3
+ create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
4
+ ParquetValueVec, ParserResultType, SeekableRubyValue,
9
5
  };
10
6
  use ahash::RandomState;
11
7
  use magnus::rb_sys::AsRawValue;
@@ -14,149 +10,20 @@ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
14
10
  use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
15
11
  use parquet::arrow::ProjectionMask;
16
12
  use parquet::errors::ParquetError;
17
- use parquet::file::reader::FileReader;
18
- use parquet::file::reader::SerializedFileReader;
19
- use parquet::record::reader::RowIter as ParquetRowIter;
20
- use parquet::schema::types::{Type as SchemaType, TypePtr};
21
13
  use std::collections::HashMap;
22
14
  use std::fs::File;
23
- use std::io::{self};
15
+ use std::io;
24
16
  use std::mem::ManuallyDrop;
25
17
  use std::os::fd::FromRawFd;
26
18
  use std::sync::OnceLock;
27
19
  use thiserror::Error;
28
20
 
29
- #[inline]
30
- pub fn parse_parquet_rows<'a>(
31
- rb_self: Value,
32
- args: &[Value],
33
- ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
34
- let original = unsafe { Ruby::get_unchecked() };
35
- let ruby: &'static Ruby = Box::leak(Box::new(original));
36
-
37
- let ParquetRowsArgs {
38
- to_read,
39
- result_type,
40
- columns,
41
- } = parse_parquet_rows_args(&ruby, args)?;
42
-
43
- if !ruby.block_given() {
44
- return create_row_enumerator(RowEnumeratorArgs {
45
- rb_self,
46
- to_read,
47
- result_type,
48
- columns,
49
- });
50
- }
51
-
52
- let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
53
- let path_string = to_read.to_r_string()?;
54
- let file_path = unsafe { path_string.as_str()? };
55
- let file = File::open(file_path).unwrap();
56
- let reader = SerializedFileReader::new(file).unwrap();
57
- let schema = reader.metadata().file_metadata().schema().clone();
58
-
59
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
60
- } else if to_read.is_kind_of(ruby.class_io()) {
61
- let raw_value = to_read.as_raw();
62
- let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
63
- .map_err(|_| {
64
- ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
65
- })?;
66
-
67
- if fd < 0 {
68
- return Err(ReaderError::InvalidFileDescriptor.into());
69
- }
70
-
71
- let file = unsafe { File::from_raw_fd(fd) };
72
- let file = ForgottenFileHandle(ManuallyDrop::new(file));
73
- let reader = SerializedFileReader::new(file).unwrap();
74
- let schema = reader.metadata().file_metadata().schema().clone();
75
-
76
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
77
- } else {
78
- let readable = SeekableRubyValue(Opaque::from(to_read));
79
- let reader = SerializedFileReader::new(readable).unwrap();
80
- let schema = reader.metadata().file_metadata().schema().clone();
81
-
82
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
83
- };
84
-
85
- if let Some(cols) = columns {
86
- let projection = create_projection_schema(&schema, &cols);
87
- iter = iter.project(Some(projection.to_owned())).map_err(|e| {
88
- MagnusError::new(
89
- ruby.exception_runtime_error(),
90
- format!("Failed to create projection: {}", e),
91
- )
92
- })?;
93
- }
94
-
95
- let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
96
- "hash" => {
97
- let headers = OnceLock::new();
98
- let headers_clone = headers.clone();
99
- let iter = iter
100
- .filter_map(move |row| {
101
- row.ok().map(|row| {
102
- let headers = headers_clone.get_or_init(|| {
103
- let column_count = row.get_column_iter().count();
104
-
105
- let mut header_string = Vec::with_capacity(column_count);
106
- for (k, _) in row.get_column_iter() {
107
- header_string.push(k.to_owned());
108
- }
109
-
110
- let headers = StringCache::intern_many(&header_string).unwrap();
111
-
112
- headers
113
- });
114
-
115
- let mut map =
116
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
117
- row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
118
- map.insert(headers[i], ParquetField(v.clone()));
119
- });
120
- map
121
- })
122
- })
123
- .map(RowRecord::Map);
124
-
125
- Box::new(HeaderCacheCleanupIter {
126
- inner: iter,
127
- headers,
128
- })
129
- }
130
- "array" => Box::new(
131
- iter.filter_map(|row| {
132
- row.ok().map(|row| {
133
- let column_count = row.get_column_iter().count();
134
- let mut vec = Vec::with_capacity(column_count);
135
- row.get_column_iter()
136
- .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
137
- vec
138
- })
139
- })
140
- .map(RowRecord::Vec),
141
- ),
142
- _ => {
143
- return Err(MagnusError::new(
144
- ruby.exception_runtime_error(),
145
- "Invalid result type",
146
- ))
147
- }
148
- };
149
-
150
- Ok(Yield::Iter(iter))
151
- }
152
-
153
21
  #[inline]
154
22
  pub fn parse_parquet_columns<'a>(
155
23
  rb_self: Value,
156
24
  args: &[Value],
157
25
  ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
158
- let original = unsafe { Ruby::get_unchecked() };
159
- let ruby: &'static Ruby = Box::leak(Box::new(original));
26
+ let ruby = unsafe { Ruby::get_unchecked() };
160
27
 
161
28
  let ParquetColumnsArgs {
162
29
  to_read,
@@ -282,8 +149,8 @@ pub fn parse_parquet_columns<'a>(
282
149
  return Ok(Yield::Iter(Box::new(column_record.into_iter())));
283
150
  }
284
151
 
285
- let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
286
- "hash" => {
152
+ let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
153
+ ParserResultType::Hash => {
287
154
  let headers = OnceLock::new();
288
155
  let headers_clone = headers.clone();
289
156
  let iter = batch_reader
@@ -318,7 +185,7 @@ pub fn parse_parquet_columns<'a>(
318
185
  headers,
319
186
  })
320
187
  }
321
- "array" => Box::new(
188
+ ParserResultType::Array => Box::new(
322
189
  batch_reader
323
190
  .filter_map(|batch| {
324
191
  batch.ok().map(|batch| {
@@ -334,35 +201,11 @@ pub fn parse_parquet_columns<'a>(
334
201
  })
335
202
  .map(ColumnRecord::Vec),
336
203
  ),
337
- _ => {
338
- return Err(MagnusError::new(
339
- ruby.exception_runtime_error(),
340
- "Invalid result type",
341
- ))
342
- }
343
204
  };
344
205
 
345
206
  Ok(Yield::Iter(iter))
346
207
  }
347
208
 
348
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
349
- if let SchemaType::GroupType { fields, .. } = schema {
350
- let projected_fields: Vec<TypePtr> = fields
351
- .iter()
352
- .filter(|field| columns.contains(&field.name().to_string()))
353
- .cloned()
354
- .collect();
355
-
356
- SchemaType::GroupType {
357
- basic_info: schema.get_basic_info().clone(),
358
- fields: projected_fields,
359
- }
360
- } else {
361
- // Return original schema if not a group type
362
- schema.clone()
363
- }
364
- }
365
-
366
209
  #[derive(Error, Debug)]
367
210
  pub enum ReaderError {
368
211
  #[error("Failed to get file descriptor: {0}")]