parquet 0.2.10 → 0.2.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: d7ad6471a6c32833344fa6196794733c398a8de814652cc85f1121ab85f14f86
4
- data.tar.gz: edeb31c6b6683b45913e58782753678302c980165aab4764bbc5b498c1203798
3
+ metadata.gz: 7baa8799961bd4698da7c59c93cf8c36418553c29e4a56106a9338e1e00796d9
4
+ data.tar.gz: 84e6e87d4ea74a0be77e7cefa9ba21fd8c410b6a873108965294f41ac7443b04
5
5
  SHA512:
6
- metadata.gz: 13929dda3279394f9a8b2c25a3c605fd813393c44030b2c5fc52c815e0582b75c008bcb4146b08d6079ab3d19e8d545ffbe00b5a10d6757556d4f27122039927
7
- data.tar.gz: a542706c99bf184b0833239950d7a269dad091dece68e09dc495528df14f90ae614fc76017a8af64231e081c247a36def095b31deaa454eac9874c60805c47ca
6
+ metadata.gz: ff0aa33661944a72a69a31c287143b45d0c376fcba27ea4b5e416409702bb1acf896edeb5c2fb2bf485dd0083b5de21ca2ba9ee0cf619479b0f01f99b33a7c11
7
+ data.tar.gz: ed88efcc1e55a3c8b685f16e52dcdb9a378d64d2cf161ba27b6a613684bbbf13a60b532de556e21096558a0ca86a65ba201d11e793644214b2e015203531968f
data/Cargo.lock CHANGED
@@ -841,6 +841,7 @@ dependencies = [
841
841
  "parquet 54.0.0",
842
842
  "rand",
843
843
  "rb-sys",
844
+ "simdutf8",
844
845
  "tempfile",
845
846
  "thiserror",
846
847
  ]
@@ -1113,6 +1114,12 @@ version = "1.3.0"
1113
1114
  source = "registry+https://github.com/rust-lang/crates.io-index"
1114
1115
  checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
1115
1116
 
1117
+ [[package]]
1118
+ name = "simdutf8"
1119
+ version = "0.1.5"
1120
+ source = "registry+https://github.com/rust-lang/crates.io-index"
1121
+ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
1122
+
1116
1123
  [[package]]
1117
1124
  name = "snap"
1118
1125
  version = "1.1.1"
@@ -19,6 +19,7 @@ rand = "0.9"
19
19
  rb-sys = "^0.9"
20
20
  thiserror = "2.0"
21
21
  tempfile = "^3.15"
22
+ simdutf8 = "0.1.5"
22
23
 
23
24
  [target.'cfg(target_os = "linux")'.dependencies]
24
25
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -1,9 +1,5 @@
1
- use ahash::RandomState;
2
- use magnus::{
3
- block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
4
- };
5
-
6
- use crate::{ColumnRecord, ParserResultType, RowRecord};
1
+ use crate::ParserResultType;
2
+ use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
7
3
 
8
4
  pub struct RowEnumeratorArgs {
9
5
  pub rb_self: Value,
@@ -12,10 +8,8 @@ pub struct RowEnumeratorArgs {
12
8
  pub columns: Option<Vec<String>>,
13
9
  }
14
10
 
15
- #[inline]
16
- pub fn create_row_enumerator(
17
- args: RowEnumeratorArgs,
18
- ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
11
+ /// Creates an enumerator for lazy Parquet row parsing
12
+ pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
19
13
  let kwargs = RHash::new();
20
14
  kwargs.aset(
21
15
  Symbol::new("result_type"),
@@ -24,10 +18,9 @@ pub fn create_row_enumerator(
24
18
  if let Some(columns) = args.columns {
25
19
  kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
26
20
  }
27
- let enumerator = args
21
+ Ok(args
28
22
  .rb_self
29
- .enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
30
- Ok(Yield::Enumerator(enumerator))
23
+ .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
31
24
  }
32
25
 
33
26
  pub struct ColumnEnumeratorArgs {
@@ -41,7 +34,7 @@ pub struct ColumnEnumeratorArgs {
41
34
  #[inline]
42
35
  pub fn create_column_enumerator(
43
36
  args: ColumnEnumeratorArgs,
44
- ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
37
+ ) -> Result<magnus::Enumerator, MagnusError> {
45
38
  let kwargs = RHash::new();
46
39
  kwargs.aset(
47
40
  Symbol::new("result_type"),
@@ -53,8 +46,7 @@ pub fn create_column_enumerator(
53
46
  if let Some(batch_size) = args.batch_size {
54
47
  kwargs.aset(Symbol::new("batch_size"), batch_size)?;
55
48
  }
56
- let enumerator = args
49
+ Ok(args
57
50
  .rb_self
58
- .enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
59
- Ok(Yield::Enumerator(enumerator))
51
+ .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
60
52
  }
@@ -8,11 +8,11 @@ use std::{
8
8
  collections::HashMap,
9
9
  sync::{
10
10
  atomic::{AtomicU32, Ordering},
11
- LazyLock, Mutex, OnceLock,
11
+ LazyLock, Mutex,
12
12
  },
13
13
  };
14
14
 
15
- use magnus::{r_string::FString, value::Opaque, IntoValue, RString, Ruby, Value};
15
+ use magnus::{IntoValue, RString, Ruby, Value};
16
16
 
17
17
  use thiserror::Error;
18
18
 
@@ -28,19 +28,19 @@ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, Atomi
28
28
  pub struct StringCache;
29
29
 
30
30
  #[derive(Copy, Clone)]
31
- pub struct StringCacheKey(Opaque<FString>, &'static str);
31
+ pub struct StringCacheKey(&'static str);
32
32
 
33
33
  impl StringCacheKey {
34
34
  pub fn new(string: &str) -> Self {
35
35
  let rstr = RString::new(string);
36
36
  let fstr = rstr.to_interned_str();
37
- Self(Opaque::from(fstr), fstr.as_str().unwrap())
37
+ Self(fstr.as_str().unwrap())
38
38
  }
39
39
  }
40
40
 
41
41
  impl AsRef<str> for StringCacheKey {
42
42
  fn as_ref(&self) -> &'static str {
43
- self.1
43
+ self.0
44
44
  }
45
45
  }
46
46
 
@@ -50,15 +50,21 @@ impl IntoValue for StringCacheKey {
50
50
  }
51
51
  }
52
52
 
53
+ impl IntoValue for &StringCacheKey {
54
+ fn into_value_with(self, handle: &Ruby) -> Value {
55
+ handle.into_value(self.0)
56
+ }
57
+ }
58
+
53
59
  impl std::fmt::Debug for StringCacheKey {
54
60
  fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55
- self.1.fmt(f)
61
+ self.0.fmt(f)
56
62
  }
57
63
  }
58
64
 
59
65
  impl PartialEq for StringCacheKey {
60
66
  fn eq(&self, other: &Self) -> bool {
61
- self.1 == other.1
67
+ self.0 == other.0
62
68
  }
63
69
  }
64
70
 
@@ -66,95 +72,29 @@ impl std::cmp::Eq for StringCacheKey {}
66
72
 
67
73
  impl std::hash::Hash for StringCacheKey {
68
74
  fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
69
- self.1.hash(state);
75
+ self.0.hash(state);
70
76
  }
71
77
  }
72
78
 
73
79
  impl StringCache {
74
- #[allow(dead_code)]
75
- pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
76
- let mut cache = STRING_CACHE
77
- .lock()
78
- .map_err(|e| CacheError::LockError(e.to_string()))?;
79
-
80
- if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
81
- counter.fetch_add(1, Ordering::Relaxed);
82
- Ok(*interned_string)
83
- } else {
84
- let interned = StringCacheKey::new(string.as_str());
85
- let leaked = Box::leak(string.into_boxed_str());
86
- cache.insert(leaked, (interned, AtomicU32::new(1)));
87
- Ok(interned)
88
- }
89
- }
90
-
91
- pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
80
+ pub fn intern_many<AsStr: AsRef<str>>(
81
+ strings: &[AsStr],
82
+ ) -> Result<Vec<StringCacheKey>, CacheError> {
92
83
  let mut cache = STRING_CACHE
93
84
  .lock()
94
85
  .map_err(|e| CacheError::LockError(e.to_string()))?;
95
86
 
96
87
  let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
97
88
  for string in strings {
98
- if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
89
+ if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
99
90
  counter.fetch_add(1, Ordering::Relaxed);
100
91
  result.push(*interned_string);
101
92
  } else {
102
- let interned = StringCacheKey::new(&string);
103
- let leaked = Box::leak(string.clone().into_boxed_str());
104
- cache.insert(leaked, (interned, AtomicU32::new(1)));
93
+ let interned = StringCacheKey::new(string.as_ref());
94
+ cache.insert(interned.0, (interned, AtomicU32::new(1)));
105
95
  result.push(interned);
106
96
  }
107
97
  }
108
98
  Ok(result)
109
99
  }
110
-
111
- pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
112
- let mut cache = STRING_CACHE
113
- .lock()
114
- .map_err(|e| CacheError::LockError(e.to_string()))?;
115
-
116
- let to_remove: Vec<_> = headers
117
- .iter()
118
- .filter_map(|header| {
119
- let key = header.as_ref();
120
- if let Some((_, (_, counter))) = cache.get_key_value(key) {
121
- let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
122
- if prev_count == 1 {
123
- Some(key)
124
- } else {
125
- None
126
- }
127
- } else {
128
- None
129
- }
130
- })
131
- .collect();
132
-
133
- for key in to_remove {
134
- cache.remove(key);
135
- }
136
-
137
- Ok(())
138
- }
139
- }
140
-
141
- pub struct HeaderCacheCleanupIter<I> {
142
- pub inner: I,
143
- pub headers: OnceLock<Vec<StringCacheKey>>,
144
- }
145
-
146
- impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
147
- type Item = I::Item;
148
-
149
- fn next(&mut self) -> Option<Self::Item> {
150
- self.inner.next()
151
- }
152
- }
153
-
154
- impl<I> Drop for HeaderCacheCleanupIter<I> {
155
- fn drop(&mut self) {
156
- if let Some(headers) = self.headers.get() {
157
- StringCache::clear(&headers).unwrap();
158
- }
159
- }
160
100
  }
@@ -24,6 +24,8 @@ pub enum ReaderError {
24
24
  Ruby(String),
25
25
  #[error("Parquet error: {0}")]
26
26
  Parquet(#[from] parquet::errors::ParquetError),
27
+ #[error("Arrow error: {0}")]
28
+ Arrow(#[from] arrow_schema::ArrowError),
27
29
  }
28
30
 
29
31
  impl From<MagnusError> for ReaderError {
@@ -1,4 +1,4 @@
1
- use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
1
+ use crate::header_cache::StringCache;
2
2
  use crate::{
3
3
  create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
4
4
  ParquetValueVec, ParserResultType, SeekableRubyValue,
@@ -6,23 +6,20 @@ use crate::{
6
6
  use ahash::RandomState;
7
7
  use magnus::rb_sys::AsRawValue;
8
8
  use magnus::value::{Opaque, ReprValue};
9
- use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
9
+ use magnus::IntoValue;
10
+ use magnus::{Error as MagnusError, Ruby, Value};
10
11
  use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
11
12
  use parquet::arrow::ProjectionMask;
12
- use parquet::errors::ParquetError;
13
13
  use std::collections::HashMap;
14
14
  use std::fs::File;
15
- use std::io;
16
15
  use std::mem::ManuallyDrop;
17
16
  use std::os::fd::FromRawFd;
18
17
  use std::sync::OnceLock;
19
- use thiserror::Error;
18
+
19
+ use super::ReaderError;
20
20
 
21
21
  #[inline]
22
- pub fn parse_parquet_columns<'a>(
23
- rb_self: Value,
24
- args: &[Value],
25
- ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
22
+ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
26
23
  let ruby = unsafe { Ruby::get_unchecked() };
27
24
 
28
25
  let ParquetColumnsArgs {
@@ -39,7 +36,8 @@ pub fn parse_parquet_columns<'a>(
39
36
  result_type,
40
37
  columns,
41
38
  batch_size,
42
- });
39
+ })
40
+ .map(|yield_enum| yield_enum.into_value_with(&ruby));
43
41
  }
44
42
 
45
43
  let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
@@ -68,7 +66,7 @@ pub fn parse_parquet_columns<'a>(
68
66
  builder = builder.with_batch_size(batch_size);
69
67
  }
70
68
 
71
- let reader = builder.build().unwrap();
69
+ let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
72
70
 
73
71
  (reader, schema, num_rows)
74
72
  } else if to_read.is_kind_of(ruby.class_io()) {
@@ -85,14 +83,11 @@ pub fn parse_parquet_columns<'a>(
85
83
  let file = unsafe { File::from_raw_fd(fd) };
86
84
  let file = ForgottenFileHandle(ManuallyDrop::new(file));
87
85
 
88
- let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
86
+ let mut builder =
87
+ ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
89
88
  let schema = builder.schema().clone();
90
89
  let num_rows = builder.metadata().file_metadata().num_rows();
91
90
 
92
- if let Some(batch_size) = batch_size {
93
- builder = builder.with_batch_size(batch_size);
94
- }
95
-
96
91
  // If columns are specified, project only those columns
97
92
  if let Some(cols) = &columns {
98
93
  // Get the parquet schema
@@ -105,20 +100,21 @@ pub fn parse_parquet_columns<'a>(
105
100
  builder = builder.with_projection(projection);
106
101
  }
107
102
 
108
- let reader = builder.build().unwrap();
103
+ if let Some(batch_size) = batch_size {
104
+ builder = builder.with_batch_size(batch_size);
105
+ }
106
+
107
+ let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
109
108
 
110
109
  (reader, schema, num_rows)
111
110
  } else {
112
111
  let readable = SeekableRubyValue(Opaque::from(to_read));
113
112
 
114
- let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
113
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
114
+ .map_err(|e| ReaderError::Parquet(e))?;
115
115
  let schema = builder.schema().clone();
116
116
  let num_rows = builder.metadata().file_metadata().num_rows();
117
117
 
118
- if let Some(batch_size) = batch_size {
119
- builder = builder.with_batch_size(batch_size);
120
- }
121
-
122
118
  // If columns are specified, project only those columns
123
119
  if let Some(cols) = &columns {
124
120
  // Get the parquet schema
@@ -131,7 +127,11 @@ pub fn parse_parquet_columns<'a>(
131
127
  builder = builder.with_projection(projection);
132
128
  }
133
129
 
134
- let reader = builder.build().unwrap();
130
+ if let Some(batch_size) = batch_size {
131
+ builder = builder.with_batch_size(batch_size);
132
+ }
133
+
134
+ let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
135
135
 
136
136
  (reader, schema, num_rows)
137
137
  };
@@ -139,100 +139,76 @@ pub fn parse_parquet_columns<'a>(
139
139
  if num_rows == 0 {
140
140
  let mut map =
141
141
  HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
142
- for field in schema.fields() {
143
- map.insert(
144
- StringCache::intern(field.name().to_string()).unwrap(),
145
- vec![],
146
- );
142
+ let headers: Vec<String> = schema
143
+ .fields()
144
+ .iter()
145
+ .map(|field| field.name().to_string())
146
+ .collect();
147
+ let interned_headers =
148
+ StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
149
+ for field in interned_headers.iter() {
150
+ map.insert(*field, vec![]);
147
151
  }
148
- let column_record = vec![ColumnRecord::Map(map)];
149
- return Ok(Yield::Iter(Box::new(column_record.into_iter())));
152
+ let record = ColumnRecord::Map(map);
153
+ let _: Value = ruby.yield_value(record)?;
154
+ return Ok(ruby.qnil().into_value_with(&ruby));
150
155
  }
151
156
 
152
- let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
157
+ match result_type {
153
158
  ParserResultType::Hash => {
154
159
  let headers = OnceLock::new();
155
160
  let headers_clone = headers.clone();
156
- let iter = batch_reader
157
- .filter_map(move |batch| {
158
- batch.ok().map(|batch| {
159
- let headers = headers_clone.get_or_init(|| {
160
- let schema = batch.schema();
161
- let fields = schema.fields();
162
- let mut header_string = Vec::with_capacity(fields.len());
163
- for field in fields {
164
- header_string.push(field.name().to_owned());
165
- }
166
- StringCache::intern_many(&header_string).unwrap()
167
- });
168
-
169
- let mut map =
170
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
171
-
172
- batch.columns().iter().enumerate().for_each(|(i, column)| {
173
- let header = headers[i];
174
- let values = ParquetValueVec::try_from(column.clone()).unwrap();
175
- map.insert(header, values.into_inner());
176
- });
177
-
178
- map
179
- })
161
+ let iter = batch_reader.map(move |batch| {
162
+ batch.map_err(ReaderError::Arrow).and_then(|batch| {
163
+ let headers = headers_clone.get_or_init(|| {
164
+ let schema = batch.schema();
165
+ let fields = schema.fields();
166
+ let mut header_string = Vec::with_capacity(fields.len());
167
+ for field in fields {
168
+ header_string.push(field.name().to_owned());
169
+ }
170
+ StringCache::intern_many(&header_string).unwrap()
171
+ });
172
+
173
+ let mut map =
174
+ HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
175
+
176
+ batch.columns().iter().enumerate().for_each(|(i, column)| {
177
+ let header = headers[i];
178
+ let values = ParquetValueVec::try_from(column.clone()).unwrap();
179
+ map.insert(header, values.into_inner());
180
+ });
181
+
182
+ Ok(ColumnRecord::Map::<RandomState>(map))
180
183
  })
181
- .map(ColumnRecord::Map);
184
+ });
182
185
 
183
- Box::new(HeaderCacheCleanupIter {
184
- inner: iter,
185
- headers,
186
- })
186
+ for result in iter {
187
+ let record = result?;
188
+ let _: Value = ruby.yield_value(record)?;
189
+ }
187
190
  }
188
- ParserResultType::Array => Box::new(
189
- batch_reader
190
- .filter_map(|batch| {
191
- batch.ok().map(|batch| {
192
- batch
193
- .columns()
194
- .into_iter()
195
- .map(|column| {
196
- let values = ParquetValueVec::try_from(column.clone()).unwrap();
197
- values.into_inner()
198
- })
199
- .collect()
200
- })
191
+ ParserResultType::Array => {
192
+ let iter = batch_reader.map(|batch| {
193
+ batch.map_err(ReaderError::Arrow).and_then(|batch| {
194
+ let vec = batch
195
+ .columns()
196
+ .into_iter()
197
+ .map(|column| {
198
+ let values = ParquetValueVec::try_from(column.clone()).unwrap();
199
+ values.into_inner()
200
+ })
201
+ .collect();
202
+ Ok(ColumnRecord::Vec::<RandomState>(vec))
201
203
  })
202
- .map(ColumnRecord::Vec),
203
- ),
204
- };
204
+ });
205
205
 
206
- Ok(Yield::Iter(iter))
207
- }
208
-
209
- #[derive(Error, Debug)]
210
- pub enum ReaderError {
211
- #[error("Failed to get file descriptor: {0}")]
212
- FileDescriptor(String),
213
- #[error("Invalid file descriptor")]
214
- InvalidFileDescriptor,
215
- #[error("Failed to open file: {0}")]
216
- FileOpen(#[from] io::Error),
217
- #[error("Failed to intern headers: {0}")]
218
- HeaderIntern(#[from] CacheError),
219
- #[error("Ruby error: {0}")]
220
- Ruby(String),
221
- #[error("Parquet error: {0}")]
222
- Parquet(#[from] ParquetError),
223
- }
224
-
225
- impl From<MagnusError> for ReaderError {
226
- fn from(err: MagnusError) -> Self {
227
- Self::Ruby(err.to_string())
206
+ for result in iter {
207
+ let record = result?;
208
+ let _: Value = ruby.yield_value(record)?;
209
+ }
210
+ }
228
211
  }
229
- }
230
212
 
231
- impl From<ReaderError> for MagnusError {
232
- fn from(err: ReaderError) -> Self {
233
- MagnusError::new(
234
- Ruby::get().unwrap().exception_runtime_error(),
235
- err.to_string(),
236
- )
237
- }
213
+ Ok(ruby.qnil().into_value_with(&ruby))
238
214
  }
@@ -1,4 +1,4 @@
1
- use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
1
+ use crate::header_cache::StringCache;
2
2
  use crate::{
3
3
  create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
4
4
  ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
@@ -6,7 +6,8 @@ use crate::{
6
6
  use ahash::RandomState;
7
7
  use magnus::rb_sys::AsRawValue;
8
8
  use magnus::value::{Opaque, ReprValue};
9
- use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
9
+ use magnus::IntoValue;
10
+ use magnus::{Error as MagnusError, Ruby, Value};
10
11
  use parquet::file::reader::{FileReader, SerializedFileReader};
11
12
  use parquet::record::reader::RowIter as ParquetRowIter;
12
13
  use parquet::schema::types::{Type as SchemaType, TypePtr};
@@ -17,10 +18,7 @@ use std::os::fd::FromRawFd;
17
18
  use std::sync::OnceLock;
18
19
 
19
20
  #[inline]
20
- pub fn parse_parquet_rows<'a>(
21
- rb_self: Value,
22
- args: &[Value],
23
- ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
21
+ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
24
22
  let ruby = unsafe { Ruby::get_unchecked() };
25
23
 
26
24
  let ParquetRowsArgs {
@@ -35,7 +33,8 @@ pub fn parse_parquet_rows<'a>(
35
33
  to_read,
36
34
  result_type,
37
35
  columns,
38
- });
36
+ })
37
+ .map(|yield_enum| yield_enum.into_value_with(&ruby));
39
38
  }
40
39
 
41
40
  let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
@@ -81,56 +80,62 @@ pub fn parse_parquet_rows<'a>(
81
80
  })?;
82
81
  }
83
82
 
84
- let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
83
+ match result_type {
85
84
  ParserResultType::Hash => {
86
85
  let headers = OnceLock::new();
87
86
  let headers_clone = headers.clone();
88
- let iter = iter
89
- .filter_map(move |row| {
90
- row.ok().map(|row| {
91
- let headers = headers_clone.get_or_init(|| {
92
- let column_count = row.get_column_iter().count();
93
-
94
- let mut header_string = Vec::with_capacity(column_count);
95
- for (k, _) in row.get_column_iter() {
96
- header_string.push(k.to_owned());
97
- }
98
-
99
- let headers = StringCache::intern_many(&header_string).unwrap();
100
-
101
- headers
102
- });
103
-
104
- let mut map =
105
- HashMap::with_capacity_and_hasher(headers.len(), Default::default());
106
- row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
107
- map.insert(headers[i], ParquetField(v.clone()));
108
- });
109
- map
110
- })
87
+ let iter = iter.map(move |row| {
88
+ row.and_then(|row| {
89
+ let headers = headers_clone.get_or_init(|| {
90
+ let column_count = row.get_column_iter().count();
91
+
92
+ let mut header_string = Vec::with_capacity(column_count);
93
+ for (k, _) in row.get_column_iter() {
94
+ header_string.push(k.to_owned());
95
+ }
96
+
97
+ let headers = StringCache::intern_many(&header_string).unwrap();
98
+
99
+ headers
100
+ });
101
+
102
+ let mut map =
103
+ HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
104
+ row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
105
+ map.insert(headers[i], ParquetField(v.clone()));
106
+ });
107
+ Ok(map)
111
108
  })
112
- .map(RowRecord::Map);
113
-
114
- Box::new(HeaderCacheCleanupIter {
115
- inner: iter,
116
- headers,
117
- })
109
+ .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
110
+ .map_err(|e| ReaderError::Parquet(e))
111
+ });
112
+
113
+ for result in iter {
114
+ let record = result?;
115
+ let _: Value = ruby.yield_value(record)?;
116
+ }
118
117
  }
119
- ParserResultType::Array => Box::new(
120
- iter.filter_map(|row| {
121
- row.ok().map(|row| {
118
+ ParserResultType::Array => {
119
+ let iter = iter.map(|row| {
120
+ row.and_then(|row| {
122
121
  let column_count = row.get_column_iter().count();
123
122
  let mut vec = Vec::with_capacity(column_count);
124
123
  row.get_column_iter()
125
124
  .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
126
- vec
125
+ Ok(vec)
127
126
  })
128
- })
129
- .map(RowRecord::Vec),
130
- ),
131
- };
127
+ .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
128
+ .map_err(|e| ReaderError::Parquet(e))
129
+ });
130
+
131
+ for result in iter {
132
+ let record = result?;
133
+ let _: Value = ruby.yield_value(record)?;
134
+ }
135
+ }
136
+ }
132
137
 
133
- Ok(Yield::Iter(iter))
138
+ Ok(ruby.qnil().into_value_with(&ruby))
134
139
  }
135
140
 
136
141
  fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
@@ -1,3 +1,3 @@
1
1
  module Parquet
2
- VERSION = "0.2.10"
2
+ VERSION = "0.2.12"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: parquet
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.10
4
+ version: 0.2.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Nathan Jaremko
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-01-30 00:00:00.000000000 Z
11
+ date: 2025-02-03 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys