parquet 0.0.4 → 0.0.5
- checksums.yaml +4 -4
- data/Cargo.lock +1 -43
- data/Gemfile +1 -1
- data/ext/parquet/Cargo.toml +4 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +3 -0
- data/ext/parquet/src/parquet_column_reader.rs +238 -0
- data/ext/parquet/src/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/reader.rs +7 -363
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types.rs +41 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/lib/parquet/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 398a8ab4fe6b9c8e82d63ec832aa73163e75874c39080d87291a60397756df42
+  data.tar.gz: cace20e14d0eddc6e3185b2f9294253cb57c1689ec463ff66bc903d3c780af13
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72ae6542b367fe433016f06fa109aaa77fe360bbc1df64e5c997db8fcc0a00aa166aa19a37240a706b3f443612770b80bc387dd41b34ee4a94ab26c3b0e74832
+  data.tar.gz: f69b10c6d4c8d879cdd3fce7c3b44933a99569358d1adfa3106760bd7c66036a2fef86737cf4dc6369be46234c124b9f2ef66e82fab118e36b5b079e9d23e10b
data/Cargo.lock
CHANGED
@@ -826,16 +826,6 @@ dependencies = [
  "wasm-bindgen",
 ]
 
-[[package]]
-name = "kanal"
-version = "0.1.0-pre8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
-dependencies = [
- "futures-core",
- "lock_api",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.5.0"
@@ -975,18 +965,6 @@ dependencies = [
  "twox-hash",
 ]
 
-[[package]]
-name = "magnus"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
-dependencies = [
- "magnus-macros",
- "rb-sys",
- "rb-sys-env",
- "seq-macro",
-]
-
 [[package]]
 name = "magnus"
 version = "0.7.1"
@@ -1203,13 +1181,10 @@ dependencies = [
  "itertools 0.14.0",
  "jemallocator",
  "jiff",
- "kanal",
- "magnus 0.7.1",
+ "magnus",
  "mimalloc",
  "parquet 54.0.0",
  "rb-sys",
- "serde",
- "serde_magnus",
  "thiserror",
 ]
 
@@ -1467,17 +1442,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "serde_magnus"
-version = "0.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
-dependencies = [
- "magnus 0.6.4",
- "serde",
- "tap",
-]
-
 [[package]]
 name = "shell-words"
 version = "1.1.0"
@@ -1566,12 +1530,6 @@ dependencies = [
  "syn",
 ]
 
-[[package]]
-name = "tap"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
-
 [[package]]
 name = "thiserror"
 version = "2.0.9"
data/Gemfile
CHANGED
data/ext/parquet/Cargo.toml
CHANGED
@@ -8,19 +8,15 @@ crate-type = ["cdylib"]
 
 [dependencies]
 ahash = "0.8"
-parquet = { version = "^54.0", features = ["json", "object_store"] }
-arrow-schema = "54.0.0"
 arrow-array = "54.0.0"
+arrow-schema = "54.0.0"
 bytes = "^1.9"
-
+itertools = "^0.14"
+jiff = "0.1.19"
 magnus = { version = "0.7", features = ["rb-sys"] }
+parquet = { version = "^54.0", features = ["json", "object_store"] }
 rb-sys = "^0.9"
-serde = { version = "1.0", features = ["derive"] }
-serde_magnus = "0.8.1"
 thiserror = "2.0"
-itertools = "^0.14"
-jiff = "0.1.19"
-
 
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
data/ext/parquet/src/enumerator.rs
CHANGED
@@ -3,12 +3,12 @@ use magnus::{
     block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
 };
 
-use crate::{ColumnRecord, RowRecord};
+use crate::{ColumnRecord, ParserResultType, RowRecord};
 
 pub struct RowEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
 }
 
@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
     args: RowEnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
     let kwargs = RHash::new();
-    kwargs.aset(
+    kwargs.aset(
+        Symbol::new("result_type"),
+        Symbol::new(args.result_type.to_string()),
+    )?;
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
 pub struct ColumnEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
 }
@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
     args: ColumnEnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
     let kwargs = RHash::new();
-    kwargs.aset(
+    kwargs.aset(
+        Symbol::new("result_type"),
+        Symbol::new(args.result_type.to_string()),
+    )?;
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
data/ext/parquet/src/lib.rs
CHANGED
data/ext/parquet/src/parquet_column_reader.rs
ADDED
@@ -0,0 +1,238 @@
+use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
+    ParquetValueVec, ParserResultType, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
+use parquet::errors::ParquetError;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+use thiserror::Error;
+
+#[inline]
+pub fn parse_parquet_columns<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        });
+    }
+
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
+
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        for field in schema.fields() {
+            map.insert(
+                StringCache::intern(field.name().to_string()).unwrap(),
+                vec![],
+            );
+        }
+        let column_record = vec![ColumnRecord::Map(map)];
+        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
+    }
+
+    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader
+                .filter_map(move |batch| {
+                    batch.ok().map(|batch| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string).unwrap()
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+
+                        batch.columns().iter().enumerate().for_each(|(i, column)| {
+                            let header = headers[i];
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            map.insert(header, values.into_inner());
+                        });
+
+                        map
+                    })
+                })
+                .map(ColumnRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            batch_reader
+                .filter_map(|batch| {
+                    batch.ok().map(|batch| {
+                        batch
+                            .columns()
+                            .into_iter()
+                            .map(|column| {
+                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                                values.into_inner()
+                            })
+                            .collect()
+                    })
+                })
+                .map(ColumnRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+    #[error("Parquet error: {0}")]
+    Parquet(#[from] ParquetError),
+}
+
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
+    }
+}
+
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
+    }
+}
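Note: the new column reader drives parquet's arrow-based API. For orientation, a minimal standalone sketch of the same call sequence (try_new → parquet_schema → ProjectionMask::columns → with_projection → with_batch_size → build), outside of magnus; the file name "data.parquet" and the column names "id"/"name" are hypothetical:

use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

    // Project only the named columns, then cap each batch at 1024 rows,
    // mirroring the `columns` and `batch_size` handling above.
    let projection = ProjectionMask::columns(builder.parquet_schema(), ["id", "name"]);
    builder = builder.with_projection(projection).with_batch_size(1024);

    for batch in builder.build()? {
        let batch = batch?;
        println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}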
data/ext/parquet/src/parquet_row_reader.rs
ADDED
@@ -0,0 +1,152 @@
+use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
+    ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
+use std::collections::HashMap;
+use std::fs::File;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+
+#[inline]
+pub fn parse_parquet_rows<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetRowsArgs {
+        to_read,
+        result_type,
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+        });
+    }
+
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    };
+
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter
+                .filter_map(move |row| {
+                    row.ok().map(|row| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let column_count = row.get_column_iter().count();
+
+                            let mut header_string = Vec::with_capacity(column_count);
+                            for (k, _) in row.get_column_iter() {
+                                header_string.push(k.to_owned());
+                            }
+
+                            let headers = StringCache::intern_many(&header_string).unwrap();
+
+                            headers
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                            map.insert(headers[i], ParquetField(v.clone()));
+                        });
+                        map
+                    })
+                })
+                .map(RowRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            iter.filter_map(|row| {
+                row.ok().map(|row| {
+                    let column_count = row.get_column_iter().count();
+                    let mut vec = Vec::with_capacity(column_count);
+                    row.get_column_iter()
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    vec
+                })
+            })
+            .map(RowRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
+    }
+}
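Note: as with the column path, a minimal standalone sketch of the row path above (SerializedFileReader feeding a ParquetRowIter), printing each column name/value pair the way parse_parquet_rows visits them; "data.parquet" is again a hypothetical input:

use parquet::file::reader::SerializedFileReader;
use parquet::record::reader::RowIter as ParquetRowIter;
use std::fs::File;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let reader = SerializedFileReader::new(file)?;

    // Walk every row; get_column_iter() yields (name, value) pairs, which is
    // what the hash/array result shaping above is built on.
    for row in ParquetRowIter::from_file_into(Box::new(reader)) {
        let row = row?;
        for (name, value) in row.get_column_iter() {
            println!("{name}: {value}");
        }
    }
    Ok(())
}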
data/ext/parquet/src/reader.rs
CHANGED
@@ -1,367 +1,11 @@
-// =============================================================================
-// Imports and Dependencies
-// =============================================================================
-use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
-use crate::{
-    create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
-    ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
-    SeekableRubyValue,
-};
-use ahash::RandomState;
-use magnus::rb_sys::AsRawValue;
-use magnus::value::{Opaque, ReprValue};
-use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
-use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::arrow::ProjectionMask;
-use parquet::errors::ParquetError;
-use parquet::file::reader::FileReader;
-use parquet::file::reader::SerializedFileReader;
-use parquet::record::reader::RowIter as ParquetRowIter;
-use parquet::schema::types::{Type as SchemaType, TypePtr};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{self};
-use std::mem::ManuallyDrop;
-use std::os::fd::FromRawFd;
-use std::sync::OnceLock;
-use thiserror::Error;
-
-#[inline]
-pub fn parse_parquet_rows<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
-    let original = unsafe { Ruby::get_unchecked() };
-    let ruby: &'static Ruby = Box::leak(Box::new(original));
-
-    let ParquetRowsArgs {
-        to_read,
-        result_type,
-        columns,
-    } = parse_parquet_rows_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_row_enumerator(RowEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-        });
-    }
-
-    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).unwrap();
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-        let reader = SerializedFileReader::new(readable).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    };
-
-    if let Some(cols) = columns {
-        let projection = create_projection_schema(&schema, &cols);
-        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-            MagnusError::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to create projection: {}", e),
-            )
-        })?;
-    }
-
-    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
-        "hash" => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = iter
-                .filter_map(move |row| {
-                    row.ok().map(|row| {
-                        let headers = headers_clone.get_or_init(|| {
-                            let column_count = row.get_column_iter().count();
-
-                            let mut header_string = Vec::with_capacity(column_count);
-                            for (k, _) in row.get_column_iter() {
-                                header_string.push(k.to_owned());
-                            }
-
-                            let headers = StringCache::intern_many(&header_string).unwrap();
-
-                            headers
-                        });
-
-                        let mut map =
-                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
-                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
-                            map.insert(headers[i], ParquetField(v.clone()));
-                        });
-                        map
-                    })
-                })
-                .map(RowRecord::Map);
-
-            Box::new(HeaderCacheCleanupIter {
-                inner: iter,
-                headers,
-            })
-        }
-        "array" => Box::new(
-            iter.filter_map(|row| {
-                row.ok().map(|row| {
-                    let column_count = row.get_column_iter().count();
-                    let mut vec = Vec::with_capacity(column_count);
-                    row.get_column_iter()
-                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
-                    vec
-                })
-            })
-            .map(RowRecord::Vec),
-        ),
-        _ => {
-            return Err(MagnusError::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
-        }
-    };
-
-    Ok(Yield::Iter(iter))
-}
-
-#[inline]
-pub fn parse_parquet_columns<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
-    let original = unsafe { Ruby::get_unchecked() };
-    let ruby: &'static Ruby = Box::leak(Box::new(original));
-
-    let ParquetColumnsArgs {
-        to_read,
-        result_type,
-        columns,
-        batch_size,
-    } = parse_parquet_columns_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_column_enumerator(ColumnEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-            batch_size,
-        });
-    }
-
-    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
-
-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
+use std::io;
 
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    };
-
-    if num_rows == 0 {
-        let mut map =
-            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        for field in schema.fields() {
-            map.insert(
-                StringCache::intern(field.name().to_string()).unwrap(),
-                vec![],
-            );
-        }
-        let column_record = vec![ColumnRecord::Map(map)];
-        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
-    }
-
-    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
-        "hash" => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = batch_reader
-                .filter_map(move |batch| {
-                    batch.ok().map(|batch| {
-                        let headers = headers_clone.get_or_init(|| {
-                            let schema = batch.schema();
-                            let fields = schema.fields();
-                            let mut header_string = Vec::with_capacity(fields.len());
-                            for field in fields {
-                                header_string.push(field.name().to_owned());
-                            }
-                            StringCache::intern_many(&header_string).unwrap()
-                        });
-
-                        let mut map =
-                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
-
-                        batch.columns().iter().enumerate().for_each(|(i, column)| {
-                            let header = headers[i];
-                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                            map.insert(header, values.into_inner());
-                        });
-
-                        map
-                    })
-                })
-                .map(ColumnRecord::Map);
-
-            Box::new(HeaderCacheCleanupIter {
-                inner: iter,
-                headers,
-            })
-        }
-        "array" => Box::new(
-            batch_reader
-                .filter_map(|batch| {
-                    batch.ok().map(|batch| {
-                        batch
-                            .columns()
-                            .into_iter()
-                            .map(|column| {
-                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                                values.into_inner()
-                            })
-                            .collect()
-                    })
-                })
-                .map(ColumnRecord::Vec),
-        ),
-        _ => {
-            return Err(MagnusError::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
-        }
-    };
-
-    Ok(Yield::Iter(iter))
-}
-
-fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-    if let SchemaType::GroupType { fields, .. } = schema {
-        let projected_fields: Vec<TypePtr> = fields
-            .iter()
-            .filter(|field| columns.contains(&field.name().to_string()))
-            .cloned()
-            .collect();
+use magnus::{Error as MagnusError, Ruby};
+use thiserror::Error;
 
-        SchemaType::GroupType {
-            basic_info: schema.get_basic_info().clone(),
-            fields: projected_fields,
-        }
-    } else {
-        // Return original schema if not a group type
-        schema.clone()
-    }
-}
+use crate::header_cache::CacheError;
+pub use crate::parquet_column_reader::parse_parquet_columns;
+pub use crate::parquet_row_reader::parse_parquet_rows;
 
 #[derive(Error, Debug)]
 pub enum ReaderError {
@@ -376,7 +20,7 @@ pub enum ReaderError {
     #[error("Ruby error: {0}")]
     Ruby(String),
     #[error("Parquet error: {0}")]
-    Parquet(#[from] ParquetError),
+    Parquet(#[from] parquet::errors::ParquetError),
 }
 
 impl From<MagnusError> for ReaderError {
data/ext/parquet/src/ruby_reader.rs
CHANGED
@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
     offset: usize,
 }
 
-pub trait SeekableRead:
-impl SeekableRead for
-impl SeekableRead for RubyReader<RString> {}
+pub trait SeekableRead: Read + Seek {}
+impl<T: Read + Seek> SeekableRead for T {}
 
 pub fn build_ruby_reader(
     ruby: &Ruby,
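Note: with the blanket impl above, any Read + Seek type satisfies SeekableRead without a per-type impl. A self-contained sketch (trait redeclared locally for illustration):

use std::io::{Cursor, Read, Seek};

pub trait SeekableRead: Read + Seek {}
impl<T: Read + Seek> SeekableRead for T {}

fn takes_seekable<R: SeekableRead>(_reader: R) {}

fn main() {
    // An in-memory cursor qualifies automatically, as does std::fs::File —
    // no per-type impl needed.
    takes_seekable(Cursor::new(vec![0u8; 16]));
}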
data/ext/parquet/src/types.rs
CHANGED
@@ -15,6 +15,47 @@ use parquet::record::Field;
 
 use crate::header_cache::StringCacheKey;
 
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParserResultType {
+    Hash,
+    Array,
+}
+
+impl ParserResultType {
+    pub fn iter() -> impl Iterator<Item = Self> {
+        [Self::Hash, Self::Array].into_iter()
+    }
+}
+
+impl TryFrom<&str> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "hash" => Ok(ParserResultType::Hash),
+            "array" => Ok(ParserResultType::Array),
+            _ => Err(format!("Invalid parser result type: {}", value)),
+        }
+    }
+}
+
+impl TryFrom<String> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: String) -> Result<Self, Self::Error> {
+        Self::try_from(value.as_str())
+    }
+}
+
+impl std::fmt::Display for ParserResultType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ParserResultType::Hash => write!(f, "hash"),
+            ParserResultType::Array => write!(f, "array"),
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum RowRecord<S: BuildHasher + Default> {
     Vec(Vec<ParquetField>),
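Note: a small round-trip check of the ParserResultType added above (assuming the enum is in scope, e.g. inside this crate's tests):

#[test]
fn parser_result_type_round_trip() {
    // TryFrom accepts exactly the two strings Display produces.
    assert_eq!(ParserResultType::try_from("hash").unwrap(), ParserResultType::Hash);
    assert_eq!(ParserResultType::try_from("array").unwrap(), ParserResultType::Array);
    assert!(ParserResultType::try_from("tuple").is_err());
    assert_eq!(ParserResultType::Hash.to_string(), "hash");

    // iter() enumerates both variants; utils.rs uses this to list the
    // accepted values in its error message.
    let all: Vec<String> = ParserResultType::iter().map(|v| v.to_string()).collect();
    assert_eq!(all, ["hash", "array"]);
}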
data/ext/parquet/src/utils.rs
CHANGED
@@ -4,6 +4,8 @@ use magnus::{
     Error, RString, Ruby, Symbol, Value,
 };
 
+use crate::ParserResultType;
+
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
     if value.is_nil() {
         Ok(None)
@@ -28,7 +30,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
 #[derive(Debug)]
 pub struct ParquetRowsArgs {
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
 }
 
@@ -43,28 +45,31 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         &["result_type", "columns"],
     )?;
 
-    let result_type = match kwargs
+    let result_type: ParserResultType = match kwargs
         .optional
         .0
         .map(|value| parse_string_or_symbol(ruby, value))
     {
-        Some(Ok(Some(parsed))) =>
-
-
-
-
-
-
-
-
-
+        Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!(
+                    "Invalid result type: {e}. Must be one of {}",
+                    ParserResultType::iter()
+                        .map(|v| v.to_string())
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ),
+            )
+        })?,
+        Some(Ok(None)) => ParserResultType::Hash,
         Some(Err(_)) => {
             return Err(Error::new(
                 magnus::exception::type_error(),
                 "result_type must be a String or Symbol",
             ))
         }
-        None =>
+        None => ParserResultType::Hash,
     };
 
     Ok(ParquetRowsArgs {
@@ -77,7 +82,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
 #[derive(Debug)]
 pub struct ParquetColumnsArgs {
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
 }
@@ -96,28 +101,31 @@ pub fn parse_parquet_columns_args(
         &["result_type", "columns", "batch_size"],
     )?;
 
-    let result_type = match kwargs
+    let result_type: ParserResultType = match kwargs
         .optional
         .0
         .map(|value| parse_string_or_symbol(ruby, value))
     {
-        Some(Ok(Some(parsed))) =>
-
-
-
-
-
-
-
-
-
+        Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!(
+                    "Invalid result type: {e}. Must be one of {}",
+                    ParserResultType::iter()
+                        .map(|v| v.to_string())
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ),
+            )
+        })?,
+        Some(Ok(None)) => ParserResultType::Hash,
         Some(Err(_)) => {
            return Err(Error::new(
                 magnus::exception::type_error(),
                 "result_type must be a String or Symbol",
             ))
         }
-        None =>
+        None => ParserResultType::Hash,
     };
 
     Ok(ParquetColumnsArgs {
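Note: the exact message the new result_type validation produces can be pinned down with a test (again assuming ParserResultType is in scope; both format strings are taken verbatim from the diffs above):

#[test]
fn invalid_result_type_message() {
    let e = ParserResultType::try_from("tuple").unwrap_err();
    let msg = format!(
        "Invalid result type: {e}. Must be one of {}",
        ParserResultType::iter()
            .map(|v| v.to_string())
            .collect::<Vec<_>>()
            .join(", ")
    );
    assert_eq!(
        msg,
        "Invalid result type: Invalid parser result type: tuple. Must be one of hash, array"
    );
}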
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -60,6 +60,8 @@ files:
 - ext/parquet/src/enumerator.rs
 - ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
+- ext/parquet/src/parquet_column_reader.rs
+- ext/parquet/src/parquet_row_reader.rs
 - ext/parquet/src/reader.rs
 - ext/parquet/src/ruby_integration.rs
 - ext/parquet/src/ruby_reader.rs