parquet 0.2.10 → 0.2.13
This diff shows the content of publicly released package versions as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those published versions.
- checksums.yaml +4 -4
- data/Cargo.lock +7 -0
- data/ext/parquet/Cargo.toml +1 -0
- data/ext/parquet/src/enumerator.rs +17 -17
- data/ext/parquet/src/header_cache.rs +21 -81
- data/ext/parquet/src/reader/mod.rs +6 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +93 -98
- data/ext/parquet/src/reader/parquet_row_reader.rs +55 -47
- data/ext/parquet/src/types/parquet_value.rs +157 -118
- data/ext/parquet/src/types/record_types.rs +91 -77
- data/ext/parquet/src/types/timestamp.rs +4 -5
- data/ext/parquet/src/types/type_conversion.rs +2 -2
- data/ext/parquet/src/utils.rs +19 -3
- data/lib/parquet/parquet.so +0 -0
- data/lib/parquet/version.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 0cf24938c23cee5bc8ed4049e2b3fee7794cb619755e26cf83d4bb8826ebccd7
+  data.tar.gz: 85f55738e3503729535de7854d7438bca69f0b82e648471c285a3eefdb51a69b
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 17eaa053e7c05605d63c84786958f2980817509a6ba165654bfe50459cc30a37553671cc8b57a70831c255e463b26fc2768afcee3621664b443f9e0e67dc4460
+  data.tar.gz: e1a90f2683fce4a10b489eba3b0d98754ebeed9c418bd274e1af38c8fd9b5ad50f1e4ce72568f9c98001db408367c9610723f5c24796c801a1eaed4c23377d42
data/Cargo.lock
CHANGED
@@ -841,6 +841,7 @@ dependencies = [
  "parquet 54.0.0",
  "rand",
  "rb-sys",
+ "simdutf8",
  "tempfile",
  "thiserror",
 ]
@@ -1113,6 +1114,12 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
+[[package]]
+name = "simdutf8"
+version = "0.1.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
+
 [[package]]
 name = "snap"
 version = "1.1.1"
data/ext/parquet/Cargo.toml
CHANGED

data/ext/parquet/src/enumerator.rs
CHANGED
@@ -1,21 +1,16 @@
-use ahash::RandomState;
-use magnus::{
-    block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
-};
-
-use crate::{ColumnRecord, ParserResultType, RowRecord};
+use crate::ParserResultType;
+use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
 
 pub struct RowEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
+    pub strict: bool,
 }
 
-
-pub fn create_row_enumerator(
-    args: RowEnumeratorArgs,
-) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+/// Creates an enumerator for lazy Parquet row parsing
+pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
     let kwargs = RHash::new();
     kwargs.aset(
         Symbol::new("result_type"),
@@ -24,10 +19,12 @@ pub fn create_row_enumerator(
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
-
+    if args.strict {
+        kwargs.aset(Symbol::new("strict"), true)?;
+    }
     Ok(args
         .rb_self
-        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs)))
-    Ok(Yield::Enumerator(enumerator))
+        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
 }
 
@@ -36,12 +33,13 @@ pub struct ColumnEnumeratorArgs {
     pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
+    pub strict: bool,
 }
 
 #[inline]
 pub fn create_column_enumerator(
     args: ColumnEnumeratorArgs,
-) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+) -> Result<magnus::Enumerator, MagnusError> {
     let kwargs = RHash::new();
     kwargs.aset(
         Symbol::new("result_type"),
@@ -53,8 +51,10 @@ pub fn create_column_enumerator(
     if let Some(batch_size) = args.batch_size {
         kwargs.aset(Symbol::new("batch_size"), batch_size)?;
     }
-
+    if args.strict {
+        kwargs.aset(Symbol::new("strict"), true)?;
+    }
     Ok(args
         .rb_self
-        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs)))
-    Ok(Yield::Enumerator(enumerator))
+        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
 }
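Both constructors now return `magnus::Enumerator` directly instead of a `Yield` wrapper, and the new `strict` flag is forwarded as a kwarg so the enumerator's re-invocation of `each_row`/`each_column` sees the same options. A hedged sketch of that pattern inside an extension method (the method and argument names here are illustrative, not the gem's exact API):

```rust
use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RHash, Ruby, Symbol, Value};

// Sketch: when called without a block, package the receiver and kwargs into
// an Enumerator that will call back into the same method lazily.
fn each_thing(ruby: &Ruby, rb_self: Value, strict: bool) -> Result<Value, MagnusError> {
    if !ruby.block_given() {
        let kwargs = RHash::new();
        if strict {
            // Forward the flag so the lazy path behaves identically.
            kwargs.aset(Symbol::new("strict"), true)?;
        }
        let enumerator = rb_self.enumeratorize("each_thing", (KwArgs(kwargs),));
        return Ok(ruby.into_value(enumerator));
    }
    // Block given: yield values eagerly (elided).
    Ok(ruby.qnil().as_value())
}
```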
data/ext/parquet/src/header_cache.rs
CHANGED
@@ -8,15 +8,15 @@ use std::{
     collections::HashMap,
     sync::{
         atomic::{AtomicU32, Ordering},
-        LazyLock, Mutex, OnceLock,
+        LazyLock, Mutex,
     },
 };
 
-use magnus::{
+use magnus::{IntoValue, RString, Ruby, Value};
 
 use thiserror::Error;
 
-#[derive(Debug, Error)]
+#[derive(Debug, Clone, Error)]
 pub enum CacheError {
     #[error("Failed to acquire lock: {0}")]
     LockError(String),
@@ -28,19 +28,19 @@ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
 pub struct StringCache;
 
 #[derive(Copy, Clone)]
-pub struct StringCacheKey(
+pub struct StringCacheKey(&'static str);
 
 impl StringCacheKey {
     pub fn new(string: &str) -> Self {
         let rstr = RString::new(string);
         let fstr = rstr.to_interned_str();
-        Self(
+        Self(fstr.as_str().unwrap())
     }
 }
 
 impl AsRef<str> for StringCacheKey {
     fn as_ref(&self) -> &'static str {
-        self.
+        self.0
     }
 }
 
@@ -50,15 +50,21 @@ impl IntoValue for StringCacheKey {
     }
 }
 
+impl IntoValue for &StringCacheKey {
+    fn into_value_with(self, handle: &Ruby) -> Value {
+        handle.into_value(self.0)
+    }
+}
+
 impl std::fmt::Debug for StringCacheKey {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.
+        self.0.fmt(f)
     }
 }
 
 impl PartialEq for StringCacheKey {
     fn eq(&self, other: &Self) -> bool {
-        self.
+        self.0 == other.0
     }
 }
 
@@ -66,95 +72,29 @@ impl std::cmp::Eq for StringCacheKey {}
 
 impl std::hash::Hash for StringCacheKey {
     fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.
+        self.0.hash(state);
     }
 }
 
 impl StringCache {
-    pub fn intern(string: String) -> Result<StringCacheKey, CacheError> {
-        let mut cache = STRING_CACHE
-            .lock()
-            .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-        if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
-            counter.fetch_add(1, Ordering::Relaxed);
-            Ok(*interned_string)
-        } else {
-            let interned = StringCacheKey::new(string.as_str());
-            let leaked = Box::leak(string.into_boxed_str());
-            cache.insert(leaked, (interned, AtomicU32::new(1)));
-            Ok(interned)
-        }
-    }
-
-    pub fn intern_many(strings: &[String]) -> Result<Vec<StringCacheKey>, CacheError> {
+    pub fn intern_many<AsStr: AsRef<str>>(
+        strings: &[AsStr],
+    ) -> Result<Vec<StringCacheKey>, CacheError> {
         let mut cache = STRING_CACHE
             .lock()
             .map_err(|e| CacheError::LockError(e.to_string()))?;
 
         let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
         for string in strings {
-            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_str()) {
+            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
                 counter.fetch_add(1, Ordering::Relaxed);
                 result.push(*interned_string);
             } else {
-                let interned = StringCacheKey::new(string.as_str());
-                let leaked = Box::leak(string.clone().into_boxed_str());
-                cache.insert(leaked, (interned, AtomicU32::new(1)));
+                let interned = StringCacheKey::new(string.as_ref());
+                cache.insert(interned.0, (interned, AtomicU32::new(1)));
                 result.push(interned);
             }
         }
        Ok(result)
    }
-
-    pub fn clear(headers: &[StringCacheKey]) -> Result<(), CacheError> {
-        let mut cache = STRING_CACHE
-            .lock()
-            .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-        let to_remove: Vec<_> = headers
-            .iter()
-            .filter_map(|header| {
-                let key = header.as_ref();
-                if let Some((_, (_, counter))) = cache.get_key_value(key) {
-                    let prev_count = counter.fetch_sub(1, Ordering::Relaxed);
-                    if prev_count == 1 {
-                        Some(key)
-                    } else {
-                        None
-                    }
-                } else {
-                    None
-                }
-            })
-            .collect();
-
-        for key in to_remove {
-            cache.remove(key);
-        }
-
-        Ok(())
-    }
-}
-
-pub struct HeaderCacheCleanupIter<I> {
-    pub inner: I,
-    pub headers: OnceLock<Vec<StringCacheKey>>,
-}
-
-impl<I: Iterator> Iterator for HeaderCacheCleanupIter<I> {
-    type Item = I::Item;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        self.inner.next()
-    }
-}
-
-impl<I> Drop for HeaderCacheCleanupIter<I> {
-    fn drop(&mut self) {
-        if let Some(headers) = self.headers.get() {
-            StringCache::clear(&headers).unwrap();
-        }
-    }
 }
@@ -24,6 +24,12 @@ pub enum ReaderError {
|
|
24
24
|
Ruby(String),
|
25
25
|
#[error("Parquet error: {0}")]
|
26
26
|
Parquet(#[from] parquet::errors::ParquetError),
|
27
|
+
#[error("Arrow error: {0}")]
|
28
|
+
Arrow(#[from] arrow_schema::ArrowError),
|
29
|
+
#[error("UTF-8 error: {0}")]
|
30
|
+
Utf8Error(#[from] simdutf8::basic::Utf8Error),
|
31
|
+
#[error("Jiff error: {0}")]
|
32
|
+
Jiff(#[from] jiff::Error),
|
27
33
|
}
|
28
34
|
|
29
35
|
impl From<MagnusError> for ReaderError {
|
@@ -1,4 +1,5 @@
|
|
1
|
-
use crate::header_cache::
|
1
|
+
use crate::header_cache::StringCache;
|
2
|
+
use crate::types::{ArrayWrapper, TryIntoValue};
|
2
3
|
use crate::{
|
3
4
|
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
|
4
5
|
ParquetValueVec, ParserResultType, SeekableRubyValue,
|
@@ -6,23 +7,20 @@ use crate::{
|
|
6
7
|
use ahash::RandomState;
|
7
8
|
use magnus::rb_sys::AsRawValue;
|
8
9
|
use magnus::value::{Opaque, ReprValue};
|
9
|
-
use magnus::
|
10
|
+
use magnus::IntoValue;
|
11
|
+
use magnus::{Error as MagnusError, Ruby, Value};
|
10
12
|
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
11
13
|
use parquet::arrow::ProjectionMask;
|
12
|
-
use parquet::errors::ParquetError;
|
13
14
|
use std::collections::HashMap;
|
14
15
|
use std::fs::File;
|
15
|
-
use std::io;
|
16
16
|
use std::mem::ManuallyDrop;
|
17
17
|
use std::os::fd::FromRawFd;
|
18
18
|
use std::sync::OnceLock;
|
19
|
-
|
19
|
+
|
20
|
+
use super::ReaderError;
|
20
21
|
|
21
22
|
#[inline]
|
22
|
-
pub fn parse_parquet_columns<'a>(
|
23
|
-
rb_self: Value,
|
24
|
-
args: &[Value],
|
25
|
-
) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
|
23
|
+
pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
26
24
|
let ruby = unsafe { Ruby::get_unchecked() };
|
27
25
|
|
28
26
|
let ParquetColumnsArgs {
|
@@ -30,6 +28,7 @@ pub fn parse_parquet_columns<'a>(
|
|
30
28
|
result_type,
|
31
29
|
columns,
|
32
30
|
batch_size,
|
31
|
+
strict,
|
33
32
|
} = parse_parquet_columns_args(&ruby, args)?;
|
34
33
|
|
35
34
|
if !ruby.block_given() {
|
@@ -39,7 +38,9 @@ pub fn parse_parquet_columns<'a>(
|
|
39
38
|
result_type,
|
40
39
|
columns,
|
41
40
|
batch_size,
|
42
|
-
|
41
|
+
strict,
|
42
|
+
})
|
43
|
+
.map(|yield_enum| yield_enum.into_value_with(&ruby));
|
43
44
|
}
|
44
45
|
|
45
46
|
let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
|
@@ -68,7 +69,7 @@ pub fn parse_parquet_columns<'a>(
|
|
68
69
|
builder = builder.with_batch_size(batch_size);
|
69
70
|
}
|
70
71
|
|
71
|
-
let reader = builder.build().
|
72
|
+
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
72
73
|
|
73
74
|
(reader, schema, num_rows)
|
74
75
|
} else if to_read.is_kind_of(ruby.class_io()) {
|
@@ -85,14 +86,11 @@ pub fn parse_parquet_columns<'a>(
|
|
85
86
|
let file = unsafe { File::from_raw_fd(fd) };
|
86
87
|
let file = ForgottenFileHandle(ManuallyDrop::new(file));
|
87
88
|
|
88
|
-
let mut builder =
|
89
|
+
let mut builder =
|
90
|
+
ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
|
89
91
|
let schema = builder.schema().clone();
|
90
92
|
let num_rows = builder.metadata().file_metadata().num_rows();
|
91
93
|
|
92
|
-
if let Some(batch_size) = batch_size {
|
93
|
-
builder = builder.with_batch_size(batch_size);
|
94
|
-
}
|
95
|
-
|
96
94
|
// If columns are specified, project only those columns
|
97
95
|
if let Some(cols) = &columns {
|
98
96
|
// Get the parquet schema
|
@@ -105,20 +103,21 @@ pub fn parse_parquet_columns<'a>(
|
|
105
103
|
builder = builder.with_projection(projection);
|
106
104
|
}
|
107
105
|
|
108
|
-
let
|
106
|
+
if let Some(batch_size) = batch_size {
|
107
|
+
builder = builder.with_batch_size(batch_size);
|
108
|
+
}
|
109
|
+
|
110
|
+
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
109
111
|
|
110
112
|
(reader, schema, num_rows)
|
111
113
|
} else {
|
112
114
|
let readable = SeekableRubyValue(Opaque::from(to_read));
|
113
115
|
|
114
|
-
let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
|
116
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
|
117
|
+
.map_err(|e| ReaderError::Parquet(e))?;
|
115
118
|
let schema = builder.schema().clone();
|
116
119
|
let num_rows = builder.metadata().file_metadata().num_rows();
|
117
120
|
|
118
|
-
if let Some(batch_size) = batch_size {
|
119
|
-
builder = builder.with_batch_size(batch_size);
|
120
|
-
}
|
121
|
-
|
122
121
|
// If columns are specified, project only those columns
|
123
122
|
if let Some(cols) = &columns {
|
124
123
|
// Get the parquet schema
|
@@ -131,7 +130,11 @@ pub fn parse_parquet_columns<'a>(
|
|
131
130
|
builder = builder.with_projection(projection);
|
132
131
|
}
|
133
132
|
|
134
|
-
let
|
133
|
+
if let Some(batch_size) = batch_size {
|
134
|
+
builder = builder.with_batch_size(batch_size);
|
135
|
+
}
|
136
|
+
|
137
|
+
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
135
138
|
|
136
139
|
(reader, schema, num_rows)
|
137
140
|
};
|
@@ -139,100 +142,92 @@ pub fn parse_parquet_columns<'a>(
     if num_rows == 0 {
         let mut map =
             HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        for field in schema.fields() {
-            map.insert(
-                StringCache::intern(field.name().to_string()).unwrap(),
-                vec![],
-            );
+        let headers: Vec<String> = schema
+            .fields()
+            .iter()
+            .map(|field| field.name().to_string())
+            .collect();
+        let interned_headers =
+            StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
+        for field in interned_headers.iter() {
+            map.insert(*field, vec![]);
         }
-        let record = ColumnRecord::Map(map);
-        return Ok(Yield::Iter(Box::new(std::iter::once(record))));
+        let record = ColumnRecord::Map(map);
+        let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+        return Ok(ruby.qnil().into_value_with(&ruby));
     }
 
-    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
+    match result_type {
         ParserResultType::Hash => {
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
-            let iter = batch_reader
-                .map(move |batch| {
-                    let batch = batch.unwrap();
-                    let headers = headers_clone.get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string).unwrap()
-                    });
-
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        headers.len(),
-                        RandomState::default(),
-                    );
-                    batch.columns().iter().enumerate().for_each(|(i, column)| {
-                        let header = headers[i];
-                        let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                        map.insert(header, values.into_inner());
-                    });
-
-                    ColumnRecord::Map(map)
-                });
-
-            Box::new(iter)
-        }
+            let iter = batch_reader.map(move |batch| {
+                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    let local_headers = headers_clone
+                        .get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string)
+                        })
+                        .as_ref()
+                        .map_err(|e| ReaderError::HeaderIntern(e.clone()))?;
+
+                    let mut map = HashMap::with_capacity_and_hasher(
+                        local_headers.len(),
+                        RandomState::default(),
+                    );
+
+                    batch
+                        .columns()
+                        .iter()
+                        .enumerate()
+                        .try_for_each(|(i, column)| {
+                            let header = local_headers[i];
+                            let values = ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*column,
+                                strict: strict,
+                            })?;
+                            map.insert(header, values.into_inner());
+                            Ok::<_, ReaderError>(())
+                        })?;
+
+                    Ok(ColumnRecord::Map::<RandomState>(map))
+                })
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+            }
+        }
-        ParserResultType::Array => Box::new(
-            batch_reader
-                .map(|batch| {
-                    batch
-                        .unwrap()
-                        .columns()
-                        .into_iter()
-                        .map(|column| {
-                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                            values.into_inner()
-                        })
-                        .collect::<Vec<_>>()
-                })
-                .map(ColumnRecord::Vec),
-        ),
-    };
+        ParserResultType::Array => {
+            let iter = batch_reader.map(|batch| {
+                batch.map_err(ReaderError::Arrow).and_then(|batch| {
+                    let vec = batch
+                        .columns()
+                        .into_iter()
+                        .map(|column| {
+                            let values = ParquetValueVec::try_from(ArrayWrapper {
+                                array: &*column,
+                                strict: strict,
+                            })?;
+                            Ok::<_, ReaderError>(values.into_inner())
+                        })
+                        .collect::<Result<Vec<_>, _>>()?;
+                    Ok(ColumnRecord::Vec::<RandomState>(vec))
+                })
+            });
+
+            for result in iter {
+                let record = result?;
+                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+            }
+        }
     }
 
-    Ok(Yield::Iter(iter))
-}
-
-#[derive(Error, Debug)]
-pub enum ReaderError {
-    #[error("Failed to get file descriptor: {0}")]
-    FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(String),
-    #[error("Parquet error: {0}")]
-    Parquet(#[from] ParquetError),
-}
-
-impl From<MagnusError> for ReaderError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(err.to_string())
-    }
-}
-
-impl From<ReaderError> for MagnusError {
-    fn from(err: ReaderError) -> Self {
-        MagnusError::new(
-            Ruby::get().unwrap().exception_runtime_error(),
-            err.to_string(),
-        )
-    }
-}
+    Ok(ruby.qnil().into_value_with(&ruby))
 }