parquet 0.0.4 → 0.2.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b88d6751418f21c4ec032d05b6d0a6e9dbd37304983ed80e1a290508c787d118
4
- data.tar.gz: 948702f38cad3c4d4e76efccbd9d7d8ad4c81366c4dcba2c71058cc4d013c237
3
+ metadata.gz: 90e876ca198a0e1871f692a382f09ceaeec670d162da26f2c102ea4eca4244bf
4
+ data.tar.gz: 96743e260cbd2fb55f6cdeaf256fbb1e915c57651fdc3f20fdd58b6a34596544
5
5
  SHA512:
6
- metadata.gz: 30f90ee2f597aa6e2d5a84b8ab9780af3d71fa41d3a1152f47d7a12b34bc203b8ff06b04c3f929f689c93be9e962186a3e6c305f61724b36ad4e6ad551c11f49
7
- data.tar.gz: 5a83b007e0c4789c6cfde1f8037228b0b00f2f0ef7ea0f932d7eaafefb91669db422450bbfd923f4388e2bfc644cae57f514828a2e4a2868ee6a20b492af428e
6
+ metadata.gz: 1609a37c5a9bd9f1d57bb31dd02b2fdb5b608a7c044686e6ef2513c95e53e830bd7bf7048a36904465a32a5915425c7b6bf581c5b35a4fb19f950cbca20913b2
7
+ data.tar.gz: 96ec18377fc5944556760329c126f440de61d3b378bfa976a66437db03f0a51220c880afd14098a5b1968daa968d2e836c50f83bef21507789ba4df314c48148
data/Cargo.lock CHANGED
@@ -387,6 +387,22 @@ version = "1.13.0"
387
387
  source = "registry+https://github.com/rust-lang/crates.io-index"
388
388
  checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0"
389
389
 
390
+ [[package]]
391
+ name = "errno"
392
+ version = "0.3.10"
393
+ source = "registry+https://github.com/rust-lang/crates.io-index"
394
+ checksum = "33d852cb9b869c2a9b3df2f71a3074817f01e1844f839a144f5fcef059a4eb5d"
395
+ dependencies = [
396
+ "libc",
397
+ "windows-sys",
398
+ ]
399
+
400
+ [[package]]
401
+ name = "fastrand"
402
+ version = "2.3.0"
403
+ source = "registry+https://github.com/rust-lang/crates.io-index"
404
+ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
405
+
390
406
  [[package]]
391
407
  name = "flatbuffers"
392
408
  version = "24.12.23"
@@ -826,16 +842,6 @@ dependencies = [
826
842
  "wasm-bindgen",
827
843
  ]
828
844
 
829
- [[package]]
830
- name = "kanal"
831
- version = "0.1.0-pre8"
832
- source = "registry+https://github.com/rust-lang/crates.io-index"
833
- checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
834
- dependencies = [
835
- "futures-core",
836
- "lock_api",
837
- ]
838
-
839
845
  [[package]]
840
846
  name = "lazy_static"
841
847
  version = "1.5.0"
@@ -944,6 +950,12 @@ dependencies = [
944
950
  "libc",
945
951
  ]
946
952
 
953
+ [[package]]
954
+ name = "linux-raw-sys"
955
+ version = "0.4.15"
956
+ source = "registry+https://github.com/rust-lang/crates.io-index"
957
+ checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab"
958
+
947
959
  [[package]]
948
960
  name = "litemap"
949
961
  version = "0.7.4"
@@ -975,18 +987,6 @@ dependencies = [
975
987
  "twox-hash",
976
988
  ]
977
989
 
978
- [[package]]
979
- name = "magnus"
980
- version = "0.6.4"
981
- source = "registry+https://github.com/rust-lang/crates.io-index"
982
- checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
983
- dependencies = [
984
- "magnus-macros",
985
- "rb-sys",
986
- "rb-sys-env",
987
- "seq-macro",
988
- ]
989
-
990
990
  [[package]]
991
991
  name = "magnus"
992
992
  version = "0.7.1"
@@ -1203,13 +1203,11 @@ dependencies = [
1203
1203
  "itertools 0.14.0",
1204
1204
  "jemallocator",
1205
1205
  "jiff",
1206
- "kanal",
1207
- "magnus 0.7.1",
1206
+ "magnus",
1208
1207
  "mimalloc",
1209
1208
  "parquet 54.0.0",
1210
1209
  "rb-sys",
1211
- "serde",
1212
- "serde_magnus",
1210
+ "tempfile",
1213
1211
  "thiserror",
1214
1212
  ]
1215
1213
 
@@ -1402,6 +1400,19 @@ dependencies = [
1402
1400
  "semver",
1403
1401
  ]
1404
1402
 
1403
+ [[package]]
1404
+ name = "rustix"
1405
+ version = "0.38.43"
1406
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1407
+ checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6"
1408
+ dependencies = [
1409
+ "bitflags 2.6.0",
1410
+ "errno",
1411
+ "libc",
1412
+ "linux-raw-sys",
1413
+ "windows-sys",
1414
+ ]
1415
+
1405
1416
  [[package]]
1406
1417
  name = "ryu"
1407
1418
  version = "1.0.18"
@@ -1467,17 +1478,6 @@ dependencies = [
1467
1478
  "serde",
1468
1479
  ]
1469
1480
 
1470
- [[package]]
1471
- name = "serde_magnus"
1472
- version = "0.8.1"
1473
- source = "registry+https://github.com/rust-lang/crates.io-index"
1474
- checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
1475
- dependencies = [
1476
- "magnus 0.6.4",
1477
- "serde",
1478
- "tap",
1479
- ]
1480
-
1481
1481
  [[package]]
1482
1482
  name = "shell-words"
1483
1483
  version = "1.1.0"
@@ -1567,10 +1567,18 @@ dependencies = [
1567
1567
  ]
1568
1568
 
1569
1569
  [[package]]
1570
- name = "tap"
1571
- version = "1.0.1"
1570
+ name = "tempfile"
1571
+ version = "3.15.0"
1572
1572
  source = "registry+https://github.com/rust-lang/crates.io-index"
1573
- checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
1573
+ checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704"
1574
+ dependencies = [
1575
+ "cfg-if",
1576
+ "fastrand",
1577
+ "getrandom",
1578
+ "once_cell",
1579
+ "rustix",
1580
+ "windows-sys",
1581
+ ]
1574
1582
 
1575
1583
  [[package]]
1576
1584
  name = "thiserror"
data/Gemfile CHANGED
@@ -8,7 +8,7 @@ gemspec
8
8
 
9
9
  group :development do
10
10
  gem "benchmark-ips", "~> 2.12"
11
- # gem "polars-df"
11
+ gem "polars-df"
12
12
  gem "duckdb"
13
13
  end
14
14
 
data/README.md CHANGED
@@ -4,8 +4,6 @@
4
4
 
5
5
  This project is a Ruby library wrapping the [parquet-rs](https://github.com/apache/parquet-rs) Rust crate.
6
6
 
7
- At the moment, it only supports iterating rows as either a hash or an array.
8
-
9
7
  ## Usage
10
8
 
11
9
  This library provides high-level bindings to parquet-rs with two primary APIs for reading Parquet files: row-wise and column-wise iteration. The column-wise API generally offers better performance, especially when working with a subset of columns.
@@ -83,3 +81,95 @@ Additional arguments for `each_column`:
83
81
  - `batch_size`: Number of rows per batch (defaults to an implementation-defined value)
84
82
 
85
83
  When no block is given, both methods return an Enumerator.
84
+
85
+ ### Writing Row-wise Data
86
+
87
+ The `write_rows` method allows you to write data row by row:
88
+
89
+ ```ruby
90
+ require "parquet"
91
+
92
+ # Define the schema for your data
93
+ schema = [
94
+ { "id" => "int64" },
95
+ { "name" => "string" },
96
+ { "score" => "double" }
97
+ ]
98
+
99
+ # Create an enumerator that yields arrays of row values
100
+ rows = [
101
+ [1, "Alice", 95.5],
102
+ [2, "Bob", 82.3],
103
+ [3, "Charlie", 88.7]
104
+ ].each
105
+
106
+ # Write to a file
107
+ Parquet.write_rows(rows, schema: schema, write_to: "data.parquet")
108
+
109
+ # Write to an IO object
110
+ File.open("data.parquet", "wb") do |file|
111
+ Parquet.write_rows(rows, schema: schema, write_to: file)
112
+ end
113
+
114
+ # Optionally specify batch size (default is 1000)
115
+ Parquet.write_rows(rows,
116
+ schema: schema,
117
+ write_to: "data.parquet",
118
+ batch_size: 500
119
+ )
120
+ ```
121
+
122
+ ### Writing Column-wise Data
123
+
124
+ The `write_columns` method provides a more efficient way to write data in column-oriented batches:
125
+
126
+ ```ruby
127
+ require "parquet"
128
+
129
+ # Define the schema
130
+ schema = [
131
+ { "id" => "int64" },
132
+ { "name" => "string" },
133
+ { "score" => "double" }
134
+ ]
135
+
136
+ # Create batches of column data
137
+ batches = [
138
+ # First batch
139
+ [
140
+ [1, 2], # id column
141
+ ["Alice", "Bob"], # name column
142
+ [95.5, 82.3] # score column
143
+ ],
144
+ # Second batch
145
+ [
146
+ [3], # id column
147
+ ["Charlie"], # name column
148
+ [88.7] # score column
149
+ ]
150
+ ]
151
+
152
+ # Create an enumerator from the batches
153
+ columns = batches.each
154
+
155
+ # Write to a parquet file
156
+ Parquet.write_columns(columns, schema: schema, write_to: "data.parquet")
157
+
158
+ # Write to an IO object
159
+ File.open("data.parquet", "wb") do |file|
160
+ Parquet.write_columns(columns, schema: schema, write_to: file)
161
+ end
162
+ ```
163
+
164
+ The following data types are supported in the schema:
165
+
166
+ - `int8`, `int16`, `int32`, `int64`
167
+ - `uint8`, `uint16`, `uint32`, `uint64`
168
+ - `float`, `double`
169
+ - `string`
170
+ - `binary`
171
+ - `boolean`
172
+ - `date32`
173
+ - `timestamp_millis`, `timestamp_micros`
174
+
175
+ Note: List and Map types are currently not supported.
@@ -8,19 +8,16 @@ crate-type = ["cdylib"]
8
8
 
9
9
  [dependencies]
10
10
  ahash = "0.8"
11
- parquet = { version = "^54.0", features = ["json", "object_store"] }
12
- arrow-schema = "54.0.0"
13
11
  arrow-array = "54.0.0"
12
+ arrow-schema = "54.0.0"
14
13
  bytes = "^1.9"
15
- kanal = "0.1.0-pre8"
14
+ itertools = "^0.14"
15
+ jiff = "0.1.19"
16
16
  magnus = { version = "0.7", features = ["rb-sys"] }
17
+ parquet = { version = "^54.0", features = ["json", "object_store"] }
17
18
  rb-sys = "^0.9"
18
- serde = { version = "1.0", features = ["derive"] }
19
- serde_magnus = "0.8.1"
20
19
  thiserror = "2.0"
21
- itertools = "^0.14"
22
- jiff = "0.1.19"
23
-
20
+ tempfile = "^3.15"
24
21
 
25
22
  [target.'cfg(target_os = "linux")'.dependencies]
26
23
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -3,12 +3,12 @@ use magnus::{
3
3
  block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
4
  };
5
5
 
6
- use crate::{ColumnRecord, RowRecord};
6
+ use crate::{ColumnRecord, ParserResultType, RowRecord};
7
7
 
8
8
  pub struct RowEnumeratorArgs {
9
9
  pub rb_self: Value,
10
10
  pub to_read: Value,
11
- pub result_type: String,
11
+ pub result_type: ParserResultType,
12
12
  pub columns: Option<Vec<String>>,
13
13
  }
14
14
 
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
17
17
  args: RowEnumeratorArgs,
18
18
  ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
19
19
  let kwargs = RHash::new();
20
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
20
+ kwargs.aset(
21
+ Symbol::new("result_type"),
22
+ Symbol::new(args.result_type.to_string()),
23
+ )?;
21
24
  if let Some(columns) = args.columns {
22
25
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
23
26
  }
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
30
33
  pub struct ColumnEnumeratorArgs {
31
34
  pub rb_self: Value,
32
35
  pub to_read: Value,
33
- pub result_type: String,
36
+ pub result_type: ParserResultType,
34
37
  pub columns: Option<Vec<String>>,
35
38
  pub batch_size: Option<usize>,
36
39
  }
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
40
43
  args: ColumnEnumeratorArgs,
41
44
  ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
42
45
  let kwargs = RHash::new();
43
- kwargs.aset(Symbol::new("result_type"), Symbol::new(args.result_type))?;
46
+ kwargs.aset(
47
+ Symbol::new("result_type"),
48
+ Symbol::new(args.result_type.to_string()),
49
+ )?;
44
50
  if let Some(columns) = args.columns {
45
51
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
46
52
  }
@@ -6,6 +6,7 @@ mod ruby_integration;
6
6
  mod ruby_reader;
7
7
  mod types;
8
8
  mod utils;
9
+ mod writer;
9
10
 
10
11
  use crate::enumerator::*;
11
12
  use crate::reader::*;
@@ -13,6 +14,8 @@ use crate::ruby_integration::*;
13
14
  use crate::types::*;
14
15
 
15
16
  use magnus::{Error, Ruby};
17
+ use writer::write_columns;
18
+ use writer::write_rows;
16
19
 
17
20
  /// Initializes the Ruby extension and defines methods.
18
21
  #[magnus::init]
@@ -20,5 +23,7 @@ fn init(ruby: &Ruby) -> Result<(), Error> {
20
23
  let module = ruby.define_module("Parquet")?;
21
24
  module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
22
25
  module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
26
+ module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
27
+ module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
23
28
  Ok(())
24
29
  }
@@ -0,0 +1,42 @@
1
+ mod parquet_column_reader;
2
+ mod parquet_row_reader;
3
+
4
+ use std::io;
5
+
6
+ use magnus::{Error as MagnusError, Ruby};
7
+ use thiserror::Error;
8
+
9
+ use crate::header_cache::CacheError;
10
+ pub use parquet_column_reader::parse_parquet_columns;
11
+ pub use parquet_row_reader::parse_parquet_rows;
12
+
13
+ #[derive(Error, Debug)]
14
+ pub enum ReaderError {
15
+ #[error("Failed to get file descriptor: {0}")]
16
+ FileDescriptor(String),
17
+ #[error("Invalid file descriptor")]
18
+ InvalidFileDescriptor,
19
+ #[error("Failed to open file: {0}")]
20
+ FileOpen(#[from] io::Error),
21
+ #[error("Failed to intern headers: {0}")]
22
+ HeaderIntern(#[from] CacheError),
23
+ #[error("Ruby error: {0}")]
24
+ Ruby(String),
25
+ #[error("Parquet error: {0}")]
26
+ Parquet(#[from] parquet::errors::ParquetError),
27
+ }
28
+
29
+ impl From<MagnusError> for ReaderError {
30
+ fn from(err: MagnusError) -> Self {
31
+ Self::Ruby(err.to_string())
32
+ }
33
+ }
34
+
35
+ impl From<ReaderError> for MagnusError {
36
+ fn from(err: ReaderError) -> Self {
37
+ MagnusError::new(
38
+ Ruby::get().unwrap().exception_runtime_error(),
39
+ err.to_string(),
40
+ )
41
+ }
42
+ }
@@ -1,11 +1,7 @@
1
- // =============================================================================
2
- // Imports and Dependencies
3
- // =============================================================================
4
1
  use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
5
2
  use crate::{
6
- create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
7
- ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
8
- SeekableRubyValue,
3
+ create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
4
+ ParquetValueVec, ParserResultType, SeekableRubyValue,
9
5
  };
10
6
  use ahash::RandomState;
11
7
  use magnus::rb_sys::AsRawValue;
@@ -14,149 +10,20 @@ use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
14
10
  use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
15
11
  use parquet::arrow::ProjectionMask;
16
12
  use parquet::errors::ParquetError;
17
- use parquet::file::reader::FileReader;
18
- use parquet::file::reader::SerializedFileReader;
19
- use parquet::record::reader::RowIter as ParquetRowIter;
20
- use parquet::schema::types::{Type as SchemaType, TypePtr};
21
13
  use std::collections::HashMap;
22
14
  use std::fs::File;
23
- use std::io::{self};
15
+ use std::io;
24
16
  use std::mem::ManuallyDrop;
25
17
  use std::os::fd::FromRawFd;
26
18
  use std::sync::OnceLock;
27
19
  use thiserror::Error;
28
20
 
29
- #[inline]
30
- pub fn parse_parquet_rows<'a>(
31
- rb_self: Value,
32
- args: &[Value],
33
- ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
34
- let original = unsafe { Ruby::get_unchecked() };
35
- let ruby: &'static Ruby = Box::leak(Box::new(original));
36
-
37
- let ParquetRowsArgs {
38
- to_read,
39
- result_type,
40
- columns,
41
- } = parse_parquet_rows_args(&ruby, args)?;
42
-
43
- if !ruby.block_given() {
44
- return create_row_enumerator(RowEnumeratorArgs {
45
- rb_self,
46
- to_read,
47
- result_type,
48
- columns,
49
- });
50
- }
51
-
52
- let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
53
- let path_string = to_read.to_r_string()?;
54
- let file_path = unsafe { path_string.as_str()? };
55
- let file = File::open(file_path).unwrap();
56
- let reader = SerializedFileReader::new(file).unwrap();
57
- let schema = reader.metadata().file_metadata().schema().clone();
58
-
59
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
60
- } else if to_read.is_kind_of(ruby.class_io()) {
61
- let raw_value = to_read.as_raw();
62
- let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
63
- .map_err(|_| {
64
- ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
65
- })?;
66
-
67
- if fd < 0 {
68
- return Err(ReaderError::InvalidFileDescriptor.into());
69
- }
70
-
71
- let file = unsafe { File::from_raw_fd(fd) };
72
- let file = ForgottenFileHandle(ManuallyDrop::new(file));
73
- let reader = SerializedFileReader::new(file).unwrap();
74
- let schema = reader.metadata().file_metadata().schema().clone();
75
-
76
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
77
- } else {
78
- let readable = SeekableRubyValue(Opaque::from(to_read));
79
- let reader = SerializedFileReader::new(readable).unwrap();
80
- let schema = reader.metadata().file_metadata().schema().clone();
81
-
82
- (schema, ParquetRowIter::from_file_into(Box::new(reader)))
83
- };
84
-
85
- if let Some(cols) = columns {
86
- let projection = create_projection_schema(&schema, &cols);
87
- iter = iter.project(Some(projection.to_owned())).map_err(|e| {
88
- MagnusError::new(
89
- ruby.exception_runtime_error(),
90
- format!("Failed to create projection: {}", e),
91
- )
92
- })?;
93
- }
94
-
95
- let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
96
- "hash" => {
97
- let headers = OnceLock::new();
98
- let headers_clone = headers.clone();
99
- let iter = iter
100
- .filter_map(move |row| {
101
- row.ok().map(|row| {
102
- let headers = headers_clone.get_or_init(|| {
103
- let column_count = row.get_column_iter().count();
104
-
105
- let mut header_string = Vec::with_capacity(column_count);
106
- for (k, _) in row.get_column_iter() {
107
- header_string.push(k.to_owned());
108
- }
109
-
110
- let headers = StringCache::intern_many(&header_string).unwrap();
111
-
112
- headers
113
- });
114
-
115
- let mut map =
116
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
117
- row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
118
- map.insert(headers[i], ParquetField(v.clone()));
119
- });
120
- map
121
- })
122
- })
123
- .map(RowRecord::Map);
124
-
125
- Box::new(HeaderCacheCleanupIter {
126
- inner: iter,
127
- headers,
128
- })
129
- }
130
- "array" => Box::new(
131
- iter.filter_map(|row| {
132
- row.ok().map(|row| {
133
- let column_count = row.get_column_iter().count();
134
- let mut vec = Vec::with_capacity(column_count);
135
- row.get_column_iter()
136
- .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
137
- vec
138
- })
139
- })
140
- .map(RowRecord::Vec),
141
- ),
142
- _ => {
143
- return Err(MagnusError::new(
144
- ruby.exception_runtime_error(),
145
- "Invalid result type",
146
- ))
147
- }
148
- };
149
-
150
- Ok(Yield::Iter(iter))
151
- }
152
-
153
21
  #[inline]
154
22
  pub fn parse_parquet_columns<'a>(
155
23
  rb_self: Value,
156
24
  args: &[Value],
157
25
  ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
158
- let original = unsafe { Ruby::get_unchecked() };
159
- let ruby: &'static Ruby = Box::leak(Box::new(original));
26
+ let ruby = unsafe { Ruby::get_unchecked() };
160
27
 
161
28
  let ParquetColumnsArgs {
162
29
  to_read,
@@ -282,8 +149,8 @@ pub fn parse_parquet_columns<'a>(
282
149
  return Ok(Yield::Iter(Box::new(column_record.into_iter())));
283
150
  }
284
151
 
285
- let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
286
- "hash" => {
152
+ let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
153
+ ParserResultType::Hash => {
287
154
  let headers = OnceLock::new();
288
155
  let headers_clone = headers.clone();
289
156
  let iter = batch_reader
@@ -318,7 +185,7 @@ pub fn parse_parquet_columns<'a>(
318
185
  headers,
319
186
  })
320
187
  }
321
- "array" => Box::new(
188
+ ParserResultType::Array => Box::new(
322
189
  batch_reader
323
190
  .filter_map(|batch| {
324
191
  batch.ok().map(|batch| {
@@ -334,35 +201,11 @@ pub fn parse_parquet_columns<'a>(
334
201
  })
335
202
  .map(ColumnRecord::Vec),
336
203
  ),
337
- _ => {
338
- return Err(MagnusError::new(
339
- ruby.exception_runtime_error(),
340
- "Invalid result type",
341
- ))
342
- }
343
204
  };
344
205
 
345
206
  Ok(Yield::Iter(iter))
346
207
  }
347
208
 
348
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
349
- if let SchemaType::GroupType { fields, .. } = schema {
350
- let projected_fields: Vec<TypePtr> = fields
351
- .iter()
352
- .filter(|field| columns.contains(&field.name().to_string()))
353
- .cloned()
354
- .collect();
355
-
356
- SchemaType::GroupType {
357
- basic_info: schema.get_basic_info().clone(),
358
- fields: projected_fields,
359
- }
360
- } else {
361
- // Return original schema if not a group type
362
- schema.clone()
363
- }
364
- }
365
-
366
209
  #[derive(Error, Debug)]
367
210
  pub enum ReaderError {
368
211
  #[error("Failed to get file descriptor: {0}")]