parquet 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +1 -43
- data/Gemfile +1 -1
- data/ext/parquet/Cargo.toml +4 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +3 -0
- data/ext/parquet/src/parquet_column_reader.rs +238 -0
- data/ext/parquet/src/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/reader.rs +7 -363
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types.rs +41 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/lib/parquet/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 398a8ab4fe6b9c8e82d63ec832aa73163e75874c39080d87291a60397756df42
+  data.tar.gz: cace20e14d0eddc6e3185b2f9294253cb57c1689ec463ff66bc903d3c780af13
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 72ae6542b367fe433016f06fa109aaa77fe360bbc1df64e5c997db8fcc0a00aa166aa19a37240a706b3f443612770b80bc387dd41b34ee4a94ab26c3b0e74832
+  data.tar.gz: f69b10c6d4c8d879cdd3fce7c3b44933a99569358d1adfa3106760bd7c66036a2fef86737cf4dc6369be46234c124b9f2ef66e82fab118e36b5b079e9d23e10b
data/Cargo.lock
CHANGED

@@ -826,16 +826,6 @@ dependencies = [
  "wasm-bindgen",
 ]

-[[package]]
-name = "kanal"
-version = "0.1.0-pre8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b05d55519627edaf7fd0f29981f6dc03fb52df3f5b257130eb8d0bf2801ea1d7"
-dependencies = [
- "futures-core",
- "lock_api",
-]
-
 [[package]]
 name = "lazy_static"
 version = "1.5.0"

@@ -975,18 +965,6 @@ dependencies = [
  "twox-hash",
 ]

-[[package]]
-name = "magnus"
-version = "0.6.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b1597ef40aa8c36be098249e82c9a20cf7199278ac1c1a1a995eeead6a184479"
-dependencies = [
- "magnus-macros",
- "rb-sys",
- "rb-sys-env",
- "seq-macro",
-]
-
 [[package]]
 name = "magnus"
 version = "0.7.1"

@@ -1203,13 +1181,10 @@ dependencies = [
  "itertools 0.14.0",
  "jemallocator",
  "jiff",
- "kanal",
- "magnus 0.7.1",
+ "magnus",
  "mimalloc",
  "parquet 54.0.0",
  "rb-sys",
- "serde",
- "serde_magnus",
  "thiserror",
 ]

@@ -1467,17 +1442,6 @@ dependencies = [
  "serde",
 ]

-[[package]]
-name = "serde_magnus"
-version = "0.8.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "76c20da583b5e1016e9199ef5f3260f7a8d1b253307d232600f6b12737262dbd"
-dependencies = [
- "magnus 0.6.4",
- "serde",
- "tap",
-]
-
 [[package]]
 name = "shell-words"
 version = "1.1.0"

@@ -1566,12 +1530,6 @@ dependencies = [
  "syn",
 ]

-[[package]]
-name = "tap"
-version = "1.0.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
-
 [[package]]
 name = "thiserror"
 version = "2.0.9"
data/Gemfile
CHANGED
data/ext/parquet/Cargo.toml
CHANGED

@@ -8,19 +8,15 @@ crate-type = ["cdylib"]

 [dependencies]
 ahash = "0.8"
-parquet = { version = "^54.0", features = ["json", "object_store"] }
-arrow-schema = "54.0.0"
 arrow-array = "54.0.0"
+arrow-schema = "54.0.0"
 bytes = "^1.9"
-
+itertools = "^0.14"
+jiff = "0.1.19"
 magnus = { version = "0.7", features = ["rb-sys"] }
+parquet = { version = "^54.0", features = ["json", "object_store"] }
 rb-sys = "^0.9"
-serde = { version = "1.0", features = ["derive"] }
-serde_magnus = "0.8.1"
 thiserror = "2.0"
-itertools = "^0.14"
-jiff = "0.1.19"
-

 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }

data/ext/parquet/src/enumerator.rs
CHANGED

@@ -3,12 +3,12 @@ use magnus::{
     block::Yield, value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value,
 };

-use crate::{ColumnRecord, RowRecord};
+use crate::{ColumnRecord, ParserResultType, RowRecord};

 pub struct RowEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
 }


@@ -17,7 +17,10 @@ pub fn create_row_enumerator(
     args: RowEnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
     let kwargs = RHash::new();
-    kwargs.aset(
+    kwargs.aset(
+        Symbol::new("result_type"),
+        Symbol::new(args.result_type.to_string()),
+    )?;
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }

@@ -30,7 +33,7 @@ pub fn create_row_enumerator(
 pub struct ColumnEnumeratorArgs {
     pub rb_self: Value,
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
 }

@@ -40,7 +43,10 @@ pub fn create_column_enumerator(
     args: ColumnEnumeratorArgs,
 ) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
     let kwargs = RHash::new();
-    kwargs.aset(
+    kwargs.aset(
+        Symbol::new("result_type"),
+        Symbol::new(args.result_type.to_string()),
+    )?;
     if let Some(columns) = args.columns {
         kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
     }
data/ext/parquet/src/lib.rs
CHANGED

data/ext/parquet/src/parquet_column_reader.rs
ADDED

@@ -0,0 +1,238 @@
+use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
+    ParquetValueVec, ParserResultType, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use parquet::arrow::ProjectionMask;
+use parquet::errors::ParquetError;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+use thiserror::Error;
+
+#[inline]
+pub fn parse_parquet_columns<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetColumnsArgs {
+        to_read,
+        result_type,
+        columns,
+        batch_size,
+    } = parse_parquet_columns_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_column_enumerator(ColumnEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+            batch_size,
+        });
+    }
+
+    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
+
+        let mut builder =
+            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+
+        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
+        let schema = builder.schema().clone();
+        let num_rows = builder.metadata().file_metadata().num_rows();
+
+        if let Some(batch_size) = batch_size {
+            builder = builder.with_batch_size(batch_size);
+        }
+
+        // If columns are specified, project only those columns
+        if let Some(cols) = &columns {
+            // Get the parquet schema
+            let parquet_schema = builder.parquet_schema();
+
+            // Create a projection mask from column names
+            let projection =
+                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
+
+            builder = builder.with_projection(projection);
+        }
+
+        let reader = builder.build().unwrap();
+
+        (reader, schema, num_rows)
+    };
+
+    if num_rows == 0 {
+        let mut map =
+            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
+        for field in schema.fields() {
+            map.insert(
+                StringCache::intern(field.name().to_string()).unwrap(),
+                vec![],
+            );
+        }
+        let column_record = vec![ColumnRecord::Map(map)];
+        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
+    }
+
+    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = batch_reader
+                .filter_map(move |batch| {
+                    batch.ok().map(|batch| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let schema = batch.schema();
+                            let fields = schema.fields();
+                            let mut header_string = Vec::with_capacity(fields.len());
+                            for field in fields {
+                                header_string.push(field.name().to_owned());
+                            }
+                            StringCache::intern_many(&header_string).unwrap()
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+
+                        batch.columns().iter().enumerate().for_each(|(i, column)| {
+                            let header = headers[i];
+                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                            map.insert(header, values.into_inner());
+                        });
+
+                        map
+                    })
+                })
+                .map(ColumnRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            batch_reader
+                .filter_map(|batch| {
+                    batch.ok().map(|batch| {
+                        batch
+                            .columns()
+                            .into_iter()
+                            .map(|column| {
+                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
+                                values.into_inner()
+                            })
+                            .collect()
+                    })
+                })
+                .map(ColumnRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+#[derive(Error, Debug)]
+pub enum ReaderError {
+    #[error("Failed to get file descriptor: {0}")]
+    FileDescriptor(String),
+    #[error("Invalid file descriptor")]
+    InvalidFileDescriptor,
+    #[error("Failed to open file: {0}")]
+    FileOpen(#[from] io::Error),
+    #[error("Failed to intern headers: {0}")]
+    HeaderIntern(#[from] CacheError),
+    #[error("Ruby error: {0}")]
+    Ruby(String),
+    #[error("Parquet error: {0}")]
+    Parquet(#[from] ParquetError),
+}
+
+impl From<MagnusError> for ReaderError {
+    fn from(err: MagnusError) -> Self {
+        Self::Ruby(err.to_string())
+    }
+}
+
+impl From<ReaderError> for MagnusError {
+    fn from(err: ReaderError) -> Self {
+        MagnusError::new(
+            Ruby::get().unwrap().exception_runtime_error(),
+            err.to_string(),
+        )
+    }
+}
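
Note: stripped of the Ruby-specific plumbing (raw file descriptors, enumerators, the header cache), the column path added above is the standard arrow-rs batched read. A minimal standalone sketch of that pattern, requiring the parquet crate's `arrow` feature; the file name and projected column are placeholders, not taken from the gem:

// Sketch: the ParquetRecordBatchReaderBuilder flow used by parse_parquet_columns,
// minus the Ruby IO/enumerator handling.
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::arrow::ProjectionMask;
use std::fs::File;

fn read_columns() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    println!("rows in file: {}", builder.metadata().file_metadata().num_rows());

    // Project a subset of columns by name, as the reader above does.
    let projection = ProjectionMask::columns(builder.parquet_schema(), ["id"]);
    builder = builder.with_projection(projection).with_batch_size(1024);

    // Each item is a Result<RecordBatch, ArrowError>; columns arrive batch by batch.
    for batch in builder.build()? {
        let batch = batch?;
        println!("{} rows x {} columns", batch.num_rows(), batch.num_columns());
    }
    Ok(())
}
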
data/ext/parquet/src/parquet_row_reader.rs
ADDED

@@ -0,0 +1,152 @@
+use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
+    ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
+use std::collections::HashMap;
+use std::fs::File;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+
+#[inline]
+pub fn parse_parquet_rows<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetRowsArgs {
+        to_read,
+        result_type,
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+        });
+    }
+
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    };
+
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter
+                .filter_map(move |row| {
+                    row.ok().map(|row| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let column_count = row.get_column_iter().count();
+
+                            let mut header_string = Vec::with_capacity(column_count);
+                            for (k, _) in row.get_column_iter() {
+                                header_string.push(k.to_owned());
+                            }
+
+                            let headers = StringCache::intern_many(&header_string).unwrap();
+
+                            headers
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                            map.insert(headers[i], ParquetField(v.clone()));
+                        });
+                        map
+                    })
+                })
+                .map(RowRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            iter.filter_map(|row| {
+                row.ok().map(|row| {
+                    let column_count = row.get_column_iter().count();
+                    let mut vec = Vec::with_capacity(column_count);
+                    row.get_column_iter()
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    vec
+                })
+            })
+            .map(RowRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
+    }
+}
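
Note: the row path above uses the non-Arrow SerializedFileReader and RowIter APIs rather than record batches. A comparable standalone sketch, again with a placeholder file name and no Ruby glue:

// Sketch: the SerializedFileReader / RowIter flow used by parse_parquet_rows.
use parquet::file::reader::{FileReader, SerializedFileReader};
use parquet::record::reader::RowIter as ParquetRowIter;
use std::fs::File;

fn read_rows() -> Result<(), Box<dyn std::error::Error>> {
    let file = File::open("data.parquet")?;
    let reader = SerializedFileReader::new(file)?;
    println!("rows in file: {}", reader.metadata().file_metadata().num_rows());

    // Each item is a Result<Row, ParquetError>; the reader above keeps only Ok rows.
    for row in ParquetRowIter::from_file_into(Box::new(reader)) {
        let row = row?;
        for (name, value) in row.get_column_iter() {
            println!("{name} = {value:?}");
        }
    }
    Ok(())
}
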
data/ext/parquet/src/reader.rs
CHANGED

@@ -1,367 +1,11 @@
-
-// Imports and Dependencies
-// =============================================================================
-use crate::header_cache::{CacheError, HeaderCacheCleanupIter, StringCache};
-use crate::{
-    create_column_enumerator, create_row_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord,
-    ForgottenFileHandle, ParquetField, ParquetValueVec, RowEnumeratorArgs, RowRecord,
-    SeekableRubyValue,
-};
-use ahash::RandomState;
-use magnus::rb_sys::AsRawValue;
-use magnus::value::{Opaque, ReprValue};
-use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
-use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::arrow::ProjectionMask;
-use parquet::errors::ParquetError;
-use parquet::file::reader::FileReader;
-use parquet::file::reader::SerializedFileReader;
-use parquet::record::reader::RowIter as ParquetRowIter;
-use parquet::schema::types::{Type as SchemaType, TypePtr};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::{self};
-use std::mem::ManuallyDrop;
-use std::os::fd::FromRawFd;
-use std::sync::OnceLock;
-use thiserror::Error;
-
-#[inline]
-pub fn parse_parquet_rows<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
-    let original = unsafe { Ruby::get_unchecked() };
-    let ruby: &'static Ruby = Box::leak(Box::new(original));
-
-    let ParquetRowsArgs {
-        to_read,
-        result_type,
-        columns,
-    } = parse_parquet_rows_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_row_enumerator(RowEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-        });
-    }
-
-    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).unwrap();
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-        let reader = SerializedFileReader::new(readable).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    };
-
-    if let Some(cols) = columns {
-        let projection = create_projection_schema(&schema, &cols);
-        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-            MagnusError::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to create projection: {}", e),
-            )
-        })?;
-    }
-
-    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type.as_str() {
-        "hash" => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = iter
-                .filter_map(move |row| {
-                    row.ok().map(|row| {
-                        let headers = headers_clone.get_or_init(|| {
-                            let column_count = row.get_column_iter().count();
-
-                            let mut header_string = Vec::with_capacity(column_count);
-                            for (k, _) in row.get_column_iter() {
-                                header_string.push(k.to_owned());
-                            }
-
-                            let headers = StringCache::intern_many(&header_string).unwrap();
-
-                            headers
-                        });
-
-                        let mut map =
-                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
-                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
-                            map.insert(headers[i], ParquetField(v.clone()));
-                        });
-                        map
-                    })
-                })
-                .map(RowRecord::Map);
-
-            Box::new(HeaderCacheCleanupIter {
-                inner: iter,
-                headers,
-            })
-        }
-        "array" => Box::new(
-            iter.filter_map(|row| {
-                row.ok().map(|row| {
-                    let column_count = row.get_column_iter().count();
-                    let mut vec = Vec::with_capacity(column_count);
-                    row.get_column_iter()
-                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
-                    vec
-                })
-            })
-            .map(RowRecord::Vec),
-        ),
-        _ => {
-            return Err(MagnusError::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
-        }
-    };
-
-    Ok(Yield::Iter(iter))
-}
-
-#[inline]
-pub fn parse_parquet_columns<'a>(
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Yield<Box<dyn Iterator<Item = ColumnRecord<RandomState>>>>, MagnusError> {
-    let original = unsafe { Ruby::get_unchecked() };
-    let ruby: &'static Ruby = Box::leak(Box::new(original));
-
-    let ParquetColumnsArgs {
-        to_read,
-        result_type,
-        columns,
-        batch_size,
-    } = parse_parquet_columns_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_column_enumerator(ColumnEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-            batch_size,
-        });
-    }
-
-    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
-
-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
+use std::io;

-
-
-        (reader, schema, num_rows)
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable).unwrap();
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        let reader = builder.build().unwrap();
-
-        (reader, schema, num_rows)
-    };
-
-    if num_rows == 0 {
-        let mut map =
-            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        for field in schema.fields() {
-            map.insert(
-                StringCache::intern(field.name().to_string()).unwrap(),
-                vec![],
-            );
-        }
-        let column_record = vec![ColumnRecord::Map(map)];
-        return Ok(Yield::Iter(Box::new(column_record.into_iter())));
-    }
-
-    let iter: Box<dyn Iterator<Item = ColumnRecord<RandomState>>> = match result_type.as_str() {
-        "hash" => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = batch_reader
-                .filter_map(move |batch| {
-                    batch.ok().map(|batch| {
-                        let headers = headers_clone.get_or_init(|| {
-                            let schema = batch.schema();
-                            let fields = schema.fields();
-                            let mut header_string = Vec::with_capacity(fields.len());
-                            for field in fields {
-                                header_string.push(field.name().to_owned());
-                            }
-                            StringCache::intern_many(&header_string).unwrap()
-                        });
-
-                        let mut map =
-                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
-
-                        batch.columns().iter().enumerate().for_each(|(i, column)| {
-                            let header = headers[i];
-                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                            map.insert(header, values.into_inner());
-                        });
-
-                        map
-                    })
-                })
-                .map(ColumnRecord::Map);
-
-            Box::new(HeaderCacheCleanupIter {
-                inner: iter,
-                headers,
-            })
-        }
-        "array" => Box::new(
-            batch_reader
-                .filter_map(|batch| {
-                    batch.ok().map(|batch| {
-                        batch
-                            .columns()
-                            .into_iter()
-                            .map(|column| {
-                                let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                                values.into_inner()
-                            })
-                            .collect()
-                    })
-                })
-                .map(ColumnRecord::Vec),
-        ),
-        _ => {
-            return Err(MagnusError::new(
-                ruby.exception_runtime_error(),
-                "Invalid result type",
-            ))
-        }
-    };
-
-    Ok(Yield::Iter(iter))
-}
-
-fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-    if let SchemaType::GroupType { fields, .. } = schema {
-        let projected_fields: Vec<TypePtr> = fields
-            .iter()
-            .filter(|field| columns.contains(&field.name().to_string()))
-            .cloned()
-            .collect();
+use magnus::{Error as MagnusError, Ruby};
+use thiserror::Error;

-
-
-
-        }
-    } else {
-        // Return original schema if not a group type
-        schema.clone()
-    }
-}
+use crate::header_cache::CacheError;
+pub use crate::parquet_column_reader::parse_parquet_columns;
+pub use crate::parquet_row_reader::parse_parquet_rows;

 #[derive(Error, Debug)]
 pub enum ReaderError {

@@ -376,7 +20,7 @@ pub enum ReaderError {
     #[error("Ruby error: {0}")]
     Ruby(String),
     #[error("Parquet error: {0}")]
-    Parquet(#[from] ParquetError),
+    Parquet(#[from] parquet::errors::ParquetError),
 }

 impl From<MagnusError> for ReaderError {
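
Note: both new reader modules and the slimmed-down reader.rs lean on thiserror's #[from] so that `?` lifts lower-level errors into ReaderError, which is then turned into a Ruby RuntimeError via From<ReaderError> for MagnusError. A trimmed-down sketch of that conversion chain; the variants are copied from the diff, the function itself is only illustrative:

// Sketch: #[from] conversions let `?` propagate io and parquet errors.
use parquet::file::reader::SerializedFileReader;
use std::fs::File;
use thiserror::Error;

#[derive(Error, Debug)]
pub enum ReaderError {
    #[error("Failed to open file: {0}")]
    FileOpen(#[from] std::io::Error),
    #[error("Parquet error: {0}")]
    Parquet(#[from] parquet::errors::ParquetError),
}

fn open_reader(path: &str) -> Result<SerializedFileReader<File>, ReaderError> {
    let file = File::open(path)?; // io::Error -> ReaderError::FileOpen
    Ok(SerializedFileReader::new(file)?) // ParquetError -> ReaderError::Parquet
}
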
data/ext/parquet/src/ruby_reader.rs
CHANGED

@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
     offset: usize,
 }

-pub trait SeekableRead:
-impl SeekableRead for
-impl SeekableRead for RubyReader<RString> {}
+pub trait SeekableRead: Read + Seek {}
+impl<T: Read + Seek> SeekableRead for T {}

 pub fn build_ruby_reader(
     ruby: &Ruby,
data/ext/parquet/src/types.rs
CHANGED

@@ -15,6 +15,47 @@ use parquet::record::Field;

 use crate::header_cache::StringCacheKey;

+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParserResultType {
+    Hash,
+    Array,
+}
+
+impl ParserResultType {
+    pub fn iter() -> impl Iterator<Item = Self> {
+        [Self::Hash, Self::Array].into_iter()
+    }
+}
+
+impl TryFrom<&str> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "hash" => Ok(ParserResultType::Hash),
+            "array" => Ok(ParserResultType::Array),
+            _ => Err(format!("Invalid parser result type: {}", value)),
+        }
+    }
+}
+
+impl TryFrom<String> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: String) -> Result<Self, Self::Error> {
+        Self::try_from(value.as_str())
+    }
+}
+
+impl std::fmt::Display for ParserResultType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ParserResultType::Hash => write!(f, "hash"),
+            ParserResultType::Array => write!(f, "array"),
+        }
+    }
+}
+
 #[derive(Debug)]
 pub enum RowRecord<S: BuildHasher + Default> {
     Vec(Vec<ParquetField>),
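
Note: ParserResultType replaces the raw "hash"/"array" strings the old readers matched on; the TryFrom, Display, and iter() impls above are what the kwargs parsing and the enumerator round-trip rely on. A small sketch of how they behave, assuming it sits in the same module as the enum:

// Sketch: exercising the conversions defined in types.rs above.
#[cfg(test)]
mod parser_result_type_sketch {
    use super::ParserResultType;

    #[test]
    fn round_trips() {
        // TryFrom<&str> accepts only the two supported spellings.
        assert_eq!(ParserResultType::try_from("hash"), Ok(ParserResultType::Hash));
        assert_eq!(ParserResultType::try_from("array"), Ok(ParserResultType::Array));
        assert!(ParserResultType::try_from("rows").is_err());

        // Display yields the lowercase names passed back through the Ruby kwargs.
        assert_eq!(ParserResultType::Hash.to_string(), "hash");

        // iter() feeds the "Must be one of ..." message built in utils.rs.
        let allowed: Vec<String> = ParserResultType::iter().map(|v| v.to_string()).collect();
        assert_eq!(allowed.join(", "), "hash, array");
    }
}
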
data/ext/parquet/src/utils.rs
CHANGED

@@ -4,6 +4,8 @@ use magnus::{
     Error, RString, Ruby, Symbol, Value,
 };

+use crate::ParserResultType;
+
 fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, Error> {
     if value.is_nil() {
         Ok(None)

@@ -28,7 +30,7 @@ fn parse_string_or_symbol(ruby: &Ruby, value: Value) -> Result<Option<String>, E
 #[derive(Debug)]
 pub struct ParquetRowsArgs {
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
 }


@@ -43,28 +45,31 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
         &["result_type", "columns"],
     )?;

-    let result_type = match kwargs
+    let result_type: ParserResultType = match kwargs
         .optional
         .0
         .map(|value| parse_string_or_symbol(ruby, value))
     {
-        Some(Ok(Some(parsed))) =>
-
-
-
-
-
-
-
-
+        Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!(
+                    "Invalid result type: {e}. Must be one of {}",
+                    ParserResultType::iter()
+                        .map(|v| v.to_string())
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ),
+            )
+        })?,
+        Some(Ok(None)) => ParserResultType::Hash,
         Some(Err(_)) => {
             return Err(Error::new(
                 magnus::exception::type_error(),
                 "result_type must be a String or Symbol",
             ))
         }
-        None =>
+        None => ParserResultType::Hash,
     };

     Ok(ParquetRowsArgs {

@@ -77,7 +82,7 @@ pub fn parse_parquet_rows_args(ruby: &Ruby, args: &[Value]) -> Result<ParquetRow
 #[derive(Debug)]
 pub struct ParquetColumnsArgs {
     pub to_read: Value,
-    pub result_type:
+    pub result_type: ParserResultType,
     pub columns: Option<Vec<String>>,
     pub batch_size: Option<usize>,
 }

@@ -96,28 +101,31 @@ pub fn parse_parquet_columns_args(
         &["result_type", "columns", "batch_size"],
     )?;

-    let result_type = match kwargs
+    let result_type: ParserResultType = match kwargs
         .optional
         .0
         .map(|value| parse_string_or_symbol(ruby, value))
     {
-        Some(Ok(Some(parsed))) =>
-
-
-
-
-
-
-
-
+        Some(Ok(Some(parsed))) => parsed.try_into().map_err(|e| {
+            Error::new(
+                magnus::exception::runtime_error(),
+                format!(
+                    "Invalid result type: {e}. Must be one of {}",
+                    ParserResultType::iter()
+                        .map(|v| v.to_string())
+                        .collect::<Vec<_>>()
+                        .join(", ")
+                ),
+            )
+        })?,
+        Some(Ok(None)) => ParserResultType::Hash,
         Some(Err(_)) => {
             return Err(Error::new(
                 magnus::exception::type_error(),
                 "result_type must be a String or Symbol",
             ))
         }
-        None =>
+        None => ParserResultType::Hash,
     };

     Ok(ParquetColumnsArgs {
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.0.4
+  version: 0.0.5
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-01-
+date: 2025-01-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys

@@ -60,6 +60,8 @@ files:
 - ext/parquet/src/enumerator.rs
 - ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
+- ext/parquet/src/parquet_column_reader.rs
+- ext/parquet/src/parquet_row_reader.rs
 - ext/parquet/src/reader.rs
 - ext/parquet/src/ruby_integration.rs
 - ext/parquet/src/ruby_reader.rs