parquet 0.2.10 → 0.2.12
This diff shows the changes between two publicly released versions of this package, as published to its public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +7 -0
- data/ext/parquet/Cargo.toml +1 -0
- data/ext/parquet/src/enumerator.rs +9 -17
- data/ext/parquet/src/header_cache.rs +20 -80
- data/ext/parquet/src/reader/mod.rs +2 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +82 -106
- data/ext/parquet/src/reader/parquet_row_reader.rs +51 -46
- data/lib/parquet/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 7baa8799961bd4698da7c59c93cf8c36418553c29e4a56106a9338e1e00796d9
+  data.tar.gz: 84e6e87d4ea74a0be77e7cefa9ba21fd8c410b6a873108965294f41ac7443b04
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ff0aa33661944a72a69a31c287143b45d0c376fcba27ea4b5e416409702bb1acf896edeb5c2fb2bf485dd0083b5de21ca2ba9ee0cf619479b0f01f99b33a7c11
+  data.tar.gz: ed88efcc1e55a3c8b685f16e52dcdb9a378d64d2cf161ba27b6a613684bbbf13a60b532de556e21096558a0ca86a65ba201d11e793644214b2e015203531968f
data/Cargo.lock
CHANGED
@@ -841,6 +841,7 @@ dependencies = [
  "parquet 54.0.0",
  "rand",
  "rb-sys",
+ "simdutf8",
  "tempfile",
  "thiserror",
 ]
@@ -1113,6 +1114,12 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
+[[package]]
+name = "simdutf8"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
+
 [[package]]
 name = "snap"
 version = "1.1.1"
data/ext/parquet/src/enumerator.rs
CHANGED
@@ -1,9 +1,5 @@
-use ahash::RandomState;
-use magnus::{
-    block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
-};
-
-use crate::{ColumnRecord, ParserResultType, RowRecord};
+use crate::ParserResultType;
+use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
 
 pub struct RowEnumeratorArgs {
     pub rb_self: Value,
@@ -12,10 +8,8 @@ pub struct RowEnumeratorArgs {
     pub columns: Option<Vec<String>>,
 }
 
-
-pub fn create_row_enumerator(
-    args: RowEnumeratorArgs,
-) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+/// Creates an enumerator for lazy Parquet row parsing
+pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
     let kwargs = RHash::new();
     kwargs.aset(
         Symbol::new("result_type"),
@@ -24,10 +18,9 @@ pub fn create_row_enumerator(
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
-    let enumerator = args
+    Ok(args
         .rb_self
-        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs)));
-    Ok(Yield::Enumerator(enumerator))
+        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
 }
 
 pub struct ColumnEnumeratorArgs {
@@ -41,7 +34,7 @@ pub struct ColumnEnumeratorArgs {
 #[inline]
 pub fn create_column_enumerator(
     args: ColumnEnumeratorArgs,
-) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+) -> Result<magnus::Enumerator, MagnusError> {
     let kwargs = RHash::new();
     kwargs.aset(
         Symbol::new("result_type"),
@@ -53,8 +46,7 @@ pub fn create_column_enumerator(
     if let Some(batch_size) = args.batch_size {
         kwargs.aset(Symbol::new("batch_size"), batch_size)?;
     }
-    let enumerator = args
+    Ok(args
         .rb_self
-        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs)));
-    Ok(Yield::Enumerator(enumerator))
+        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
 }
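The net effect of this refactor: instead of wrapping an iterator in magnus's Yield type, the constructors hand back a magnus::Enumerator directly. A minimal sketch of the resulting shape, using only the magnus calls that appear in the hunks above (the function name and the "hash" result_type value are illustrative, not taken from the gem, and this must run inside a live Ruby VM via a magnus extension entry point):

use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RHash, Symbol, Value};

// Sketch: build keyword arguments and return a Ruby Enumerator that will
// re-enter `each_row` lazily when consumed; `rb_self` and `to_read` are
// assumed to come from the extension's entry point.
fn row_enumerator(rb_self: Value, to_read: Value) -> Result<magnus::Enumerator, MagnusError> {
    let kwargs = RHash::new();
    // "hash" is an illustrative result_type, not necessarily the gem's value
    kwargs.aset(Symbol::new("result_type"), Symbol::new("hash"))?;
    Ok(rb_self.enumeratorize("each_row", (to_read, KwArgs(kwargs))))
}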
data/ext/parquet/src/header_cache.rs
CHANGED
@@ -8,11 +8,11 @@ use std::{
     collections::HashMap,
     sync::{
         atomic::{AtomicU32, Ordering},
-        LazyLock, Mutex, OnceLock,
+        LazyLock, Mutex,
     },
 };
 
-use magnus::{
+use magnus::{IntoValue, RString, Ruby, Value};
 
 use thiserror::Error;
 
@@ -28,19 +28,19 @@ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>>
 pub struct StringCache;
 
 #[derive(Copy, Clone)]
-pub struct StringCacheKey(
+pub struct StringCacheKey(&'static str);
 
 impl StringCacheKey {
     pub fn new(string: &str) -> Self {
         let rstr = RString::new(string);
         let fstr = rstr.to_interned_str();
-        Self(
+        Self(fstr.as_str().unwrap())
     }
 }
 
 impl AsRef<str> for StringCacheKey {
     fn as_ref(&self) -> &'static str {
-        self.
+        self.0
     }
 }
 
@@ -50,15 +50,21 @@ impl IntoValue for StringCacheKey {
     }
 }
 
+impl IntoValue for &StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
+
 impl std::fmt::Debug for StringCacheKey {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.
+        self.0.fmt(f)
     }
 }
 
 impl PartialEq for StringCacheKey {
     fn eq(&self, other: &Self) -> bool {
-        self.
+        self.0 == other.0
     }
 }
 
@@ -66,95 +72,29 @@ impl std::cmp::Eq for StringCacheKey {}
 
 impl std::hash::Hash for StringCacheKey {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.
+        self.0.hash(state);
     }
 }
 
 impl StringCache {
-    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
-        let mut cache = STRING_CACHE
-            .lock()
-            .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-        if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
-            counter.fetch_add(1, Ordering::Relaxed);
-            Ok(*interned_string)
-        } else {
-            let interned = StringCacheKey::new(string.as_str());
-            let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, (interned, AtomicU32::new(1)));
-            Ok(interned)
-        }
-    }
-
-    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
+    pub fn intern_many<AsStr: AsRef<str>>(
+        strings: &[AsStr],
+    ) -> Result<Vec<StringCacheKey>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
 
         let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
         for string in strings {
-            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
                 counter.fetch_add(1, Ordering::Relaxed);
                 result.push(*interned_string);
             } else {
-                let interned = StringCacheKey::new(string.as_str());
-                let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                let interned = StringCacheKey::new(string.as_ref());
+                cache.insert(interned.0, (interned, AtomicU32::new(1)));
                 result.push(interned);
             }
        }
         Ok(result)
     }
-
-    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
-        let mut cache = STRING_CACHE
-            .lock()
-            .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-        let to_remove: Vec<_> = headers
-            .iter()
-            .filter_map(|header| {
-                let key = header.as_ref();
-                if let Some((_, (_, counter))) = cache.get_key_value(key) {
-                    let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
-                    if prev_count == 1 {
-                        Some(key)
-                    } else {
-                        None
-                    }
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        for key in to_remove {
-            cache.remove(key);
-        }
-
-        Ok(())
-    }
-}
-
-pub struct HeaderCacheCleanupIter<I> {
-    pub inner: I,
-    pub headers: OnceLock<Vec<StringCacheKey>>,
-}
-
-impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
-    type Item = I::Item;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.inner.next()
-    }
-}
-
-impl<I> Drop for HeaderCacheCleanupIter<I> {
-    fn drop(&mut self) {
-        if let Some(headers) = self.headers.get() {
-            StringCache::clear(&headers).unwrap();
-        }
-    }
 }
data/ext/parquet/src/reader/parquet_column_reader.rs
CHANGED
@@ -1,4 +1,4 @@
-use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
+use crate::header_cache::StringCache;
 use crate::{
     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
     ParquetValueVec, ParserResultType, SeekableRubyValue,
@@ -6,23 +6,20 @@ use crate::{
 use ahash::RandomState;
 use magnus::rb_sys::AsRawValue;
 use magnus::value::{Opaque, ReprValue};
-use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use magnus::IntoValue;
+use magnus::{Error as MagnusError, Ruby, Value};
 use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
 use parquet::arrow::ProjectionMask;
-use parquet::errors::ParquetError;
 use std::collections::HashMap;
 use std::fs::File;
-use std::io;
 use std::mem::ManuallyDrop;
 use std::os::fd::FromRawFd;
 use std::sync::OnceLock;
-
+
+use super::ReaderError;
 
 #[inline]
-pub fn parse_parquet_columns<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
 
     let ParquetColumnsArgs {
@@ -39,7 +36,8 @@ pub fn parse_parquet_columns<'a>(
             result_type,
             columns,
             batch_size,
-        });
+        })
+        .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
 
     let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
@@ -68,7 +66,7 @@ pub fn parse_parquet_columns<'a>(
             builder = builder.with_batch_size(batch_size);
         }
 
-        let reader = builder.build().
+        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
 
         (reader, schema, num_rows)
     } else if to_read.is_kind_of(ruby.class_io()) {
@@ -85,14 +83,11 @@ pub fn parse_parquet_columns<'a>(
         let file = unsafe { File::from_raw_fd(fd) };
         let file = ForgottenFileHandle(ManuallyDrop::new(file));
 
-        let mut builder =
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
         let schema = builder.schema().clone();
         let num_rows = builder.metadata().file_metadata().num_rows();
 
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
         // If columns are specified, project only those columns
         if let Some(cols) = &columns {
             // Get the parquet schema
@@ -105,20 +100,21 @@ pub fn parse_parquet_columns<'a>(
             builder = builder.with_projection(projection);
         }
 
-        let
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
 
         (reader, schema, num_rows)
     } else {
         let readable = SeekableRubyValue(Opaque::from(to_read));
 
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
+            .map_err(|e| ReaderError::Parquet(e))?;
         let schema = builder.schema().clone();
         let num_rows = builder.metadata().file_metadata().num_rows();
 
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
         // If columns are specified, project only those columns
         if let Some(cols) = &columns {
             // Get the parquet schema
@@ -131,7 +127,11 @@ pub fn parse_parquet_columns<'a>(
             builder = builder.with_projection(projection);
         }
 
-        let
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
 
         (reader, schema, num_rows)
     };
@@ -139,100 +139,76 @@ pub fn parse_parquet_columns<'a>(
     if num_rows == 0 {
         let mut map =
             HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        [old code lost in extraction]
-        );
+        let headers: Vec<String> = schema
+            .fields()
+            .iter()
+            .map(|field| field.name().to_string())
+            .collect();
+        let interned_headers =
+            StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
+        for field in interned_headers.iter() {
+            map.insert(*field, vec![]);
         }
-        let
+        let record = ColumnRecord::Map(map);
+        let _: Value = ruby.yield_value(record)?;
+        return Ok(ruby.qnil().into_value_with(&ruby));
     }
 
-    [old code lost in extraction]
+    match result_type {
         ParserResultType::Hash => {
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
-            let iter = batch_reader
-                .
-            [old code lost in extraction]
-                map
-            })
+            let iter = batch_reader.map(move |batch| {
+                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    let headers = headers_clone.get_or_init(|| {
+                        let schema = batch.schema();
+                        let fields = schema.fields();
+                        let mut header_string = Vec::with_capacity(fields.len());
+                        for field in fields {
+                            header_string.push(field.name().to_owned());
+                        }
+                        StringCache::intern_many(&header_string).unwrap()
+                    });
+
+                    let mut map =
+                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
+
+                    batch.columns().iter().enumerate().for_each(|(i, column)| {
+                        let header = headers[i];
+                        let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                        map.insert(header, values.into_inner());
+                    });
+
+                    Ok(ColumnRecord::Map::<RandomState>(map))
                 })
+            });
 
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
         }
-        ParserResultType::Array =>
-            batch_reader
-                .
-                batch
-            [old code lost in extraction]
-                .
-            })
+        ParserResultType::Array => {
+            let iter = batch_reader.map(|batch| {
+                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    let vec = batch
+                        .columns()
+                        .into_iter()
+                        .map(|column| {
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            values.into_inner()
+                        })
+                        .collect();
+                    Ok(ColumnRecord::Vec::<RandomState>(vec))
                 })
-            ),
-    };
+            });
 
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
+        }
     }
 
-    Ok(Yield::Iter(iter))
-}
-
-#[derive(Error, Debug)]
-pub enum ReaderError {
-    #[error("Failed to get file descriptor: {0}")]
-    FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(String),
-    #[error("Parquet error: {0}")]
-    Parquet(#[from] ParquetError),
-}
-
-impl From<MagnusError> for ReaderError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(err.to_string())
-    }
-}
-
-impl From<ReaderError> for MagnusError {
-    fn from(err: ReaderError) -> Self {
-        MagnusError::new(
-            Ruby::get().unwrap().exception_runtime_error(),
-            err.to_string(),
-        )
-    }
-}
+    Ok(ruby.qnil().into_value_with(&ruby))
 }
data/ext/parquet/src/reader/parquet_row_reader.rs
CHANGED
@@ -1,4 +1,4 @@
-use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
+use crate::header_cache::StringCache;
 use crate::{
     create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
     ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
@@ -6,7 +6,8 @@ use crate::{
 use ahash::RandomState;
 use magnus::rb_sys::AsRawValue;
 use magnus::value::{Opaque, ReprValue};
-use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use magnus::IntoValue;
+use magnus::{Error as MagnusError, Ruby, Value};
 use parquet::file::reader::{FileReader, SerializedFileReader};
 use parquet::record::reader::RowIter as ParquetRowIter;
 use parquet::schema::types::{Type as SchemaType, TypePtr};
@@ -17,10 +18,7 @@ use std::os::fd::FromRawFd;
 use std::sync::OnceLock;
 
 #[inline]
-pub fn parse_parquet_rows<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
 
     let ParquetRowsArgs {
@@ -35,7 +33,8 @@ pub fn parse_parquet_rows<'a>(
         to_read,
         result_type,
         columns,
-        });
+        })
+        .map(|yield_enum| yield_enum.into_value_with(&ruby));
     }
 
     let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
@@ -81,56 +80,62 @@ pub fn parse_parquet_rows<'a>(
         })?;
     }
 
-    [old code lost in extraction]
+    match result_type {
         ParserResultType::Hash => {
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
-            let iter = iter
-                .
-            [old code lost in extraction]
-                map
-            })
+            let iter = iter.map(move |row| {
+                row.and_then(|row| {
+                    let headers = headers_clone.get_or_init(|| {
+                        let column_count = row.get_column_iter().count();
+
+                        let mut header_string = Vec::with_capacity(column_count);
+                        for (k, _) in row.get_column_iter() {
+                            header_string.push(k.to_owned());
+                        }
+
+                        let headers = StringCache::intern_many(&header_string).unwrap();
+
+                        headers
+                    });
+
+                    let mut map =
+                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
+                    row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                        map.insert(headers[i], ParquetField(v.clone()));
+                    });
+                    Ok(map)
                 })
-                .
-            [old code lost in extraction]
+                .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
+                .map_err(|e| ReaderError::Parquet(e))
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
         }
-        ParserResultType::Array =>
-            iter.
-                row.
+        ParserResultType::Array => {
+            let iter = iter.map(|row| {
+                row.and_then(|row| {
                     let column_count = row.get_column_iter().count();
                     let mut vec = Vec::with_capacity(column_count);
                     row.get_column_iter()
                         .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
-                    vec
+                    Ok(vec)
                 })
-            [old code lost in extraction]
+                .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
+                .map_err(|e| ReaderError::Parquet(e))
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record)?;
+            }
+        }
     }
 
-    Ok(
+    Ok(ruby.qnil().into_value_with(&ruby))
 }
 
 fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
data/lib/parquet/version.rb
CHANGED
-  VERSION = "0.2.10"
+  VERSION = "0.2.12"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.2.10
+  version: 0.2.12
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-
+date: 2025-02-03 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys