parquet 0.5.1 → 0.5.3
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +9 -1
- data/ext/parquet/Cargo.toml +4 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/reader/common.rs +7 -6
- data/ext/parquet/src/reader/mod.rs +204 -0
- data/ext/parquet/src/reader/parquet_column_reader.rs +19 -20
- data/ext/parquet/src/reader/parquet_row_reader.rs +18 -22
- data/ext/parquet/src/ruby_reader.rs +11 -24
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +8 -5
- data/ext/parquet/src/types/parquet_value.rs +204 -7
- data/ext/parquet/src/types/record_types.rs +31 -8
- data/ext/parquet/src/types/schema_converter.rs +118 -11
- data/ext/parquet/src/types/schema_node.rs +83 -2
- data/ext/parquet/src/types/timestamp.rs +6 -10
- data/ext/parquet/src/types/type_conversion.rs +84 -11
- data/ext/parquet/src/types/writer_types.rs +40 -11
- data/ext/parquet/src/utils.rs +6 -6
- data/ext/parquet/src/writer/mod.rs +25 -18
- data/ext/parquet/src/writer/write_columns.rs +27 -24
- data/ext/parquet/src/writer/write_rows.rs +17 -16
- data/lib/parquet/schema.rb +77 -4
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +11 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: e1ae8e2c64920df8527a16d7348fc37c5ae2cf5c783b648bed93e31cab25bd72
+  data.tar.gz: 2d7b45349d33679f96559683e31d7c9dd5718fb78611aad057bba92d7324c2d3
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 1f56d8e538bdb095e43472940a8c3a57b6b54d74ab87d9c1519878d759962e6d844f9c992927dc22d22ebefee4bd64a858b2ed89ccc3c694d183bcb9fd154497
+  data.tar.gz: 5f5c8914d81ef297bebb021ba40e70725208e61c2bd1565f7d134341ac3c31489b501766266f7390ffde82a44e5821321b55f827467ac95c760cd08588788e9d
data/Cargo.lock
CHANGED
@@ -681,7 +681,7 @@ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
 dependencies = [
  "magnus-macros",
  "rb-sys",
- "rb-sys-env",
+ "rb-sys-env 0.1.2",
  "seq-macro",
 ]

@@ -839,9 +839,11 @@ dependencies = [
  "jiff",
  "magnus",
  "mimalloc",
+ "num",
  "parquet 54.2.0",
  "rand",
  "rb-sys",
+ "rb-sys-env 0.2.2",
  "simdutf8",
  "tempfile",
  "thiserror",

@@ -997,6 +999,12 @@ version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
 
+[[package]]
+name = "rb-sys-env"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "08f8d2924cf136a1315e2b4c7460a39f62ef11ee5d522df9b2750fab55b868b6"
+
 [[package]]
 name = "regex"
 version = "1.11.1"
data/ext/parquet/Cargo.toml
CHANGED
@@ -6,6 +6,9 @@ edition = "2021"
 [lib]
 crate-type = ["cdylib"]
 
+[build-dependencies]
+rb-sys-env = "^0.2"
+
 [dependencies]
 ahash = "0.8"
 arrow-array = "54.0.0"

@@ -21,6 +24,7 @@ rb-sys = "^0.9"
 simdutf8 = "0.1.5"
 tempfile = "^3.15"
 thiserror = "2.0"
+num = "0.4.3"
 
 [target.'cfg(target_os = "linux")'.dependencies]
 jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
data/ext/parquet/src/lib.rs
CHANGED
@@ -20,6 +20,7 @@ use writer::write_rows;
 #[magnus::init]
 fn init(ruby: &Ruby) -> Result<(), Error> {
     let module = ruby.define_module("Parquet")?;
+    module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
     module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
     module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
     module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
data/ext/parquet/src/reader/common.rs
CHANGED
@@ -5,6 +5,7 @@ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchR
 use parquet::arrow::ProjectionMask;
 use std::collections::HashMap;
 use std::fs::File;
+use std::rc::Rc;
 use std::sync::Arc;
 
 use magnus::value::ReprValue;

@@ -21,7 +22,7 @@ use crate::ColumnRecord;
 /// returning either a File or a ThreadSafeRubyReader that can be used with
 /// parquet readers.
 pub fn open_parquet_source(
-    ruby:
+    ruby: Rc<Ruby>,
     to_read: Value,
 ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
     if to_read.is_kind_of(ruby.class_string()) {

@@ -58,8 +59,8 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
     columns: &Option<Vec<String>>,
     batch_size: Option<usize>,
 ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
-    let mut builder =
-        .map_err(
+    let mut builder =
+        ParquetRecordBatchReaderBuilder::try_new(reader).map_err(ParquetGemError::Parquet)?;
 
     let schema = builder.schema().clone();
     let num_rows = builder.metadata().file_metadata().num_rows();

@@ -78,7 +79,7 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
         builder = builder.with_batch_size(batch_size);
     }
 
-    let reader = builder.build().map_err(
+    let reader = builder.build().map_err(ParquetGemError::Parquet)?;
     Ok((reader, schema, num_rows))
 }
 
@@ -98,12 +99,12 @@ pub fn handle_empty_file(
             .map(|field| field.name().to_string())
             .collect();
         let interned_headers =
-            StringCache::intern_many(&headers).map_err(
+            StringCache::intern_many(&headers).map_err(ParquetGemError::HeaderIntern)?;
         for field in interned_headers.iter() {
             map.insert(*field, vec![]);
         }
         let record = ColumnRecord::Map(map);
-        let _: Value = ruby.yield_value(record.try_into_value_with(
+        let _: Value = ruby.yield_value(record.try_into_value_with(ruby)?)?;
         return Ok(true);
     }
     Ok(false)
data/ext/parquet/src/reader/mod.rs
CHANGED
@@ -1,6 +1,210 @@
 mod common;
 mod parquet_column_reader;
 mod parquet_row_reader;
+use std::{fs::File, rc::Rc};
 
+use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
 pub use parquet_column_reader::parse_parquet_columns;
 pub use parquet_row_reader::parse_parquet_rows;
+
+use crate::{
+    ruby_reader::{RubyReader, ThreadSafeRubyReader},
+    types::{ParquetGemError, TryIntoValue},
+};
+
+struct RubyParquetMetaData(ParquetMetaData);
+
+impl TryIntoValue for RubyParquetMetaData {
+    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
+        let metadata = self.0;
+        let file_metadata = metadata.file_metadata();
+        let row_groups = metadata.row_groups();
+
+        // Construct a hash with the metadata
+        let hash = handle.hash_new();
+        hash.aset("num_rows", file_metadata.num_rows())?;
+        hash.aset("created_by", file_metadata.created_by())?;
+        // Convert key_value_metadata to a Ruby array if it exists
+        if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
+            let kv_array = handle.ary_new();
+            for kv in key_value_metadata {
+                let kv_hash = handle.hash_new();
+                kv_hash.aset("key", kv.key.clone())?;
+                kv_hash.aset("value", kv.value.clone())?;
+                kv_array.push(kv_hash)?;
+            }
+            hash.aset("key_value_metadata", kv_array)?;
+        } else {
+            hash.aset("key_value_metadata", None::<Value>)?;
+        }
+
+        // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
+        let schema_hash = handle.hash_new();
+        let schema = file_metadata.schema();
+        schema_hash.aset("name", schema.name())?;
+        // Add schema fields information
+        let fields_array = handle.ary_new();
+        for field in schema.get_fields() {
+            let field_hash = handle.hash_new();
+            field_hash.aset("name", field.name())?;
+
+            // Handle different field types
+            match field.as_ref() {
+                parquet::schema::types::Type::PrimitiveType {
+                    physical_type,
+                    type_length,
+                    scale,
+                    precision,
+                    ..
+                } => {
+                    field_hash.aset("type", "primitive")?;
+                    field_hash.aset("physical_type", format!("{:?}", physical_type))?;
+                    field_hash.aset("type_length", *type_length)?;
+                    field_hash.aset("scale", *scale)?;
+                    field_hash.aset("precision", *precision)?;
+                }
+                parquet::schema::types::Type::GroupType { .. } => {
+                    field_hash.aset("type", "group")?;
+                }
+            }
+
+            // Add basic info
+            let basic_info = field.get_basic_info();
+            field_hash.aset("repetition", format!("{:?}", basic_info.repetition()))?;
+            field_hash.aset(
+                "converted_type",
+                format!("{:?}", basic_info.converted_type()),
+            )?;
+            if let Some(logical_type) = basic_info.logical_type() {
+                field_hash.aset("logical_type", format!("{:?}", logical_type))?;
+            }
+
+            fields_array.push(field_hash)?;
+        }
+        schema_hash.aset("fields", fields_array)?;
+
+        hash.aset("schema", schema_hash)?;
+
+        // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
+        let row_groups_array = handle.ary_new();
+        for row_group in row_groups.iter() {
+            let rg_hash = handle.hash_new();
+            rg_hash.aset("num_columns", row_group.num_columns())?;
+            rg_hash.aset("num_rows", row_group.num_rows())?;
+            rg_hash.aset("total_byte_size", row_group.total_byte_size())?;
+            rg_hash.aset("file_offset", row_group.file_offset())?;
+            rg_hash.aset("ordinal", row_group.ordinal())?;
+            rg_hash.aset("compressed_size", row_group.compressed_size())?;
+
+            // Add column chunks metadata
+            let columns_array = handle.ary_new();
+            for col_idx in 0..row_group.num_columns() {
+                let column = row_group.column(col_idx);
+                let col_hash = handle.hash_new();
+
+                col_hash.aset("column_path", column.column_path().string())?;
+                col_hash.aset("file_path", column.file_path())?;
+                col_hash.aset("file_offset", column.file_offset())?;
+                col_hash.aset("num_values", column.num_values())?;
+                col_hash.aset("compression", format!("{:?}", column.compression()))?;
+                col_hash.aset("total_compressed_size", column.compressed_size())?;
+                col_hash.aset("total_uncompressed_size", column.uncompressed_size())?;
+                col_hash.aset("data_page_offset", column.data_page_offset())?;
+
+                if let Some(offset) = column.dictionary_page_offset() {
+                    col_hash.aset("dictionary_page_offset", offset)?;
+                }
+
+                if let Some(offset) = column.bloom_filter_offset() {
+                    col_hash.aset("bloom_filter_offset", offset)?;
+                }
+
+                if let Some(length) = column.bloom_filter_length() {
+                    col_hash.aset("bloom_filter_length", length)?;
+                }
+
+                if let Some(offset) = column.offset_index_offset() {
+                    col_hash.aset("offset_index_offset", offset)?;
+                }
+
+                if let Some(length) = column.offset_index_length() {
+                    col_hash.aset("offset_index_length", length)?;
+                }
+
+                if let Some(offset) = column.column_index_offset() {
+                    col_hash.aset("column_index_offset", offset)?;
+                }
+
+                if let Some(length) = column.column_index_length() {
+                    col_hash.aset("column_index_length", length)?;
+                }
+
+                // Add encodings
+                let encodings_array = handle.ary_new();
+                for encoding in column.encodings() {
+                    encodings_array.push(format!("{:?}", encoding))?;
+                }
+                col_hash.aset("encodings", encodings_array)?;
+
+                // Add statistics if available
+                if let Some(stats) = column.statistics() {
+                    let stats_hash = handle.hash_new();
+                    stats_hash.aset("min_is_exact", stats.min_is_exact())?;
+                    stats_hash.aset("max_is_exact", stats.max_is_exact())?;
+
+                    col_hash.aset("statistics", stats_hash)?;
+                }
+
+                // Add page encoding stats if available
+                if let Some(page_encoding_stats) = column.page_encoding_stats() {
+                    let page_stats_array = handle.ary_new();
+                    for stat in page_encoding_stats {
+                        let stat_hash = handle.hash_new();
+                        stat_hash.aset("page_type", format!("{:?}", stat.page_type))?;
+                        stat_hash.aset("encoding", format!("{:?}", stat.encoding))?;
+                        stat_hash.aset("count", stat.count)?;
+                        page_stats_array.push(stat_hash)?;
+                    }
+                    col_hash.aset("page_encoding_stats", page_stats_array)?;
+                }
+
+                columns_array.push(col_hash)?;
+            }
+            rg_hash.aset("columns", columns_array)?;
+
+            row_groups_array.push(rg_hash)?;
+        }
+        hash.aset("row_groups", row_groups_array)?;
+
+        Ok(handle.into_value(hash))
+    }
+}
+
+pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    if args.len() != 1 {
+        return Err(MagnusError::new(
+            magnus::exception::arg_error(),
+            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
+        ));
+    }
+
+    let ruby = Rc::new(ruby);
+    let arg = args[0];
+
+    let mut reader = ParquetMetaDataReader::new();
+    if arg.is_kind_of(ruby.class_string()) {
+        let path = arg.to_r_string()?.to_string()?;
+        let file = File::open(path).map_err(ParquetGemError::FileOpen)?;
+        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+    } else {
+        let file = ThreadSafeRubyReader::new(RubyReader::new(ruby.clone(), arg)?);
+        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+    }
+
+    let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
+
+    Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
+}
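The `metadata` function registered in lib.rs above is the Ruby-facing entry point for this new code path. A minimal usage sketch, with a placeholder file name; the hash keys follow the `aset` calls in `RubyParquetMetaData` above:

require "parquet"

# "data.parquet" is a hypothetical path; an IO object opened in binary
# mode should also work, per the branch in parse_metadata above.
meta = Parquet.metadata("data.parquet")

meta["num_rows"]                                 # total row count
meta["created_by"]                               # writer identification string
meta["schema"]["fields"].map { |f| f["name"] }   # top-level field names
meta["row_groups"].first["num_columns"]          # per-row-group details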
data/ext/parquet/src/reader/parquet_column_reader.rs
CHANGED
@@ -10,26 +10,25 @@ use either::Either;
 use magnus::IntoValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 use std::collections::HashMap;
-use std::
+use std::rc::Rc;
+use std::sync::OnceLock;
 
 use super::common::{
     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
 
 #[inline]
-pub fn parse_parquet_columns
+pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-
-
-
-
-    })?,
-    )
+    parse_parquet_columns_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })
 }
 
 #[inline]
-fn parse_parquet_columns_impl
-    ruby:
+fn parse_parquet_columns_impl(
+    ruby: Rc<Ruby>,
     rb_self: Value,
     args: &[Value],
 ) -> Result<Value, ParquetGemError> {

@@ -76,13 +75,13 @@ fn parse_parquet_columns_impl<'a>(
         Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
     };
 
-    // Handle empty file case
-    if handle_empty_file(&ruby, &schema, num_rows)? {
-        return Ok(ruby.qnil().into_value_with(&ruby));
-    }
-
     match result_type {
         ParserResultType::Hash => {
+            // For hash return type, we need to return a hash with column names pointing at empty arrays
+            if handle_empty_file(&ruby, &schema, num_rows)? {
+                return Ok(ruby.qnil().into_value_with(&ruby));
+            }
+
             let headers = OnceLock::new();
             let headers_clone = headers.clone();
             let iter = batch_reader.map(move |batch| {

@@ -112,8 +111,8 @@ fn parse_parquet_columns_impl<'a>(
                     .try_for_each(|(i, column)| {
                         let header = local_headers[i];
                         let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array:
-                            strict
+                            array: column,
+                            strict,
                         })?;
                         map.insert(header, values.into_inner());
                         Ok::<_, ParquetGemError>(())

@@ -133,11 +132,11 @@ fn parse_parquet_columns_impl<'a>(
                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
                     let vec = batch
                         .columns()
-                        .
+                        .iter()
                        .map(|column| {
                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                array:
-                                strict
+                                array: column,
+                                strict,
                             })?;
                             Ok::<_, ParquetGemError>(values.into_inner())
                         })
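For context, `parse_parquet_columns` backs `Parquet.each_column`, which yields batches of column values rather than rows. A sketch under assumptions: the `columns:` and `batch_size:` keywords mirror the `columns` and `batch_size` parameters passed to `create_batch_reader` above, and the file and column names are placeholders; check the gem's README for the exact signature:

require "parquet"

# Each yielded batch maps column name => array of values
# (the ParserResultType::Hash branch above).
Parquet.each_column("data.parquet", result_type: :hash,
                    columns: ["id", "name"], batch_size: 1024) do |batch|
  puts batch["id"].length
end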
data/ext/parquet/src/reader/parquet_row_reader.rs
CHANGED
@@ -13,24 +13,23 @@ use parquet::file::reader::{FileReader, SerializedFileReader};
 use parquet::record::reader::RowIter as ParquetRowIter;
 use parquet::schema::types::{Type as SchemaType, TypePtr};
 use std::collections::HashMap;
-use std::
+use std::rc::Rc;
+use std::sync::OnceLock;
 
 use super::common::{handle_block_or_enum, open_parquet_source};
 
 #[inline]
-pub fn parse_parquet_rows
+pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
     let ruby = unsafe { Ruby::get_unchecked() };
-
-
-
-
-    })?,
-    )
+    parse_parquet_rows_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
+        let z: MagnusError = e.into();
+        z
+    })
 }
 
 #[inline]
-fn parse_parquet_rows_impl
-    ruby:
+fn parse_parquet_rows_impl(
+    ruby: Rc<Ruby>,
     rb_self: Value,
     args: &[Value],
 ) -> Result<Value, ParquetGemError> {

@@ -93,7 +92,7 @@ fn parse_parquet_rows_impl<'a>(
     let headers = OnceLock::new();
     let headers_clone = headers.clone();
     let iter = iter.map(move |row| {
-        row.
+        row.map(|row| {
             let headers = headers_clone.get_or_init(|| {
                 let column_count = row.get_column_iter().count();

@@ -102,10 +101,7 @@ fn parse_parquet_rows_impl<'a>(
                     header_string.push(k.to_owned());
                 }
 
-
-                    .expect("Failed to intern headers");
-
-                headers
+                StringCache::intern_many(&header_string).expect("Failed to intern headers")
             });
 
             let mut map =

@@ -113,10 +109,10 @@ fn parse_parquet_rows_impl<'a>(
             for (i, (_, v)) in row.get_column_iter().enumerate() {
                 map.insert(headers[i], ParquetField(v.clone(), strict));
             }
-
+            map
         })
-        .
-        .map_err(
+        .map(RowRecord::Map::<RandomState>)
+        .map_err(ParquetGemError::from)
     });
 
     for result in iter {

@@ -126,16 +122,16 @@ fn parse_parquet_rows_impl<'a>(
         }
         ParserResultType::Array => {
             let iter = iter.map(|row| {
-                row.
+                row.map(|row| {
                     let column_count = row.get_column_iter().count();
                     let mut vec = Vec::with_capacity(column_count);
                     for (_, v) in row.get_column_iter() {
                         vec.push(ParquetField(v.clone(), strict));
                     }
-
+                    vec
                 })
-                .
-                .map_err(
+                .map(RowRecord::Vec::<RandomState>)
+                .map_err(ParquetGemError::from)
             });
 
             for result in iter {
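`parse_parquet_rows` reaches the data through `open_parquet_source`, so `Parquet.each_row` accepts either a path String or an IO-like object wrapped by `RubyReader` (next section). A sketch with placeholder file names; `result_type:` corresponds to the `ParserResultType::Hash`/`Array` branches above:

require "parquet"

# Path form: each row arrives as a Hash of column name => value.
Parquet.each_row("data.parquet") do |row|
  puts row["id"]
end

# IO form: RubyReader wraps the object; rows come back as Arrays in schema order.
File.open("data.parquet", "rb") do |io|
  Parquet.each_row(io, result_type: :array) { |row| p row }
end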
data/ext/parquet/src/ruby_reader.rs
CHANGED
@@ -7,7 +7,7 @@ use parquet::{
     errors::ParquetError,
     file::reader::{ChunkReader, Length},
 };
-use std::{fs::File, sync::Mutex};
+use std::{fs::File, rc::Rc, sync::Mutex};
 use std::{
     io::{self, BufReader, Read, Seek, SeekFrom, Write},
     sync::Arc,

@@ -19,12 +19,10 @@ use crate::types::ParquetGemError;
 /// and provide a standard Read implementation for them.
 pub enum RubyReader {
     String {
-        ruby: Arc<Ruby>,
         inner: Opaque<RString>,
         offset: usize,
     },
     RubyIoLike {
-        ruby: Arc<Ruby>,
         inner: Opaque<Value>,
     },
     NativeProxyIoLike {

@@ -37,10 +35,9 @@ pub enum RubyReader {
 unsafe impl Send for RubyReader {}
 
 impl RubyReader {
-    pub fn new(ruby:
+    pub fn new(ruby: Rc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
         if RubyReader::is_seekable_io_like(&value) {
             Ok(RubyReader::RubyIoLike {
-                ruby,
                 inner: Opaque::from(value),
             })
         } else if RubyReader::is_io_like(&value) {

@@ -49,7 +46,6 @@ impl RubyReader {
 
         // This is safe, because we won't call seek
         let inner_readable = RubyReader::RubyIoLike {
-            ruby: ruby.clone(),
             inner: Opaque::from(value),
         };
         let mut reader = BufReader::new(inner_readable);

@@ -68,7 +64,6 @@ impl RubyReader {
             .funcall::<_, _, RString>("to_str", ())
             .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
         Ok(RubyReader::String {
-            ruby,
             inner: Opaque::from(string_content),
             offset: 0,
         })

@@ -89,10 +84,10 @@ impl RubyReader {
 
 impl Seek for RubyReader {
     fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
+        let ruby = unsafe { Ruby::get_unchecked() };
         match self {
             RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
             RubyReader::String {
-                ruby,
                 inner,
                 offset: original_offset,
             } => {

@@ -113,7 +108,7 @@ impl Seek for RubyReader {
                 *original_offset = new_offset.min(unwrapped_inner.len());
                 Ok(*original_offset as u64)
             }
-            RubyReader::RubyIoLike {
+            RubyReader::RubyIoLike { inner } => {
                 let unwrapped_inner = ruby.get_inner(*inner);
 
                 let (whence, ruby_offset) = match pos {

@@ -138,13 +133,10 @@ impl Seek for RubyReader {
 
 impl Read for RubyReader {
     fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
+        let ruby = unsafe { Ruby::get_unchecked() };
         match self {
             RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
-            RubyReader::String {
-                ruby,
-                inner,
-                offset,
-            } => {
+            RubyReader::String { inner, offset } => {
                 let unwrapped_inner = ruby.get_inner(*inner);
 
                 let string_buffer = unsafe { unwrapped_inner.as_slice() };

@@ -160,7 +152,7 @@ impl Read for RubyReader {
 
                 Ok(copy_size)
             }
-            RubyReader::RubyIoLike {
+            RubyReader::RubyIoLike { inner } => {
                 let unwrapped_inner = ruby.get_inner(*inner);
 
                 let bytes = unwrapped_inner

@@ -173,9 +165,7 @@ impl Read for RubyReader {
                 buf.write_all(string_buffer)?;
                 Ok(string_buffer.len())
             }
-            None =>
-                return Ok(0);
-            }
+            None => Ok(0),
         }
     }
 }

@@ -184,17 +174,14 @@ impl Read for RubyReader {
 
 impl Length for RubyReader {
     fn len(&self) -> u64 {
+        let ruby = unsafe { Ruby::get_unchecked() };
         match self {
             RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
-            RubyReader::String {
-                ruby,
-                inner,
-                offset: _,
-            } => {
+            RubyReader::String { inner, offset: _ } => {
                 let unwrapped_inner = ruby.get_inner(*inner);
                 unwrapped_inner.len() as u64
             }
-            RubyReader::RubyIoLike {
+            RubyReader::RubyIoLike { inner } => {
                 let unwrapped_inner = ruby.get_inner(*inner);
 
                 // Get current position
|