parquet 0.0.4 → 0.2.5
- checksums.yaml +4 -4
- data/Cargo.lock +48 -40
- data/Gemfile +1 -1
- data/README.md +92 -2
- data/ext/parquet/Cargo.toml +5 -8
- data/ext/parquet/src/enumerator.rs +11 -5
- data/ext/parquet/src/lib.rs +5 -0
- data/ext/parquet/src/reader/mod.rs +42 -0
- data/ext/parquet/src/{reader.rs → reader/parquet_column_reader.rs} +7 -164
- data/ext/parquet/src/reader/parquet_row_reader.rs +152 -0
- data/ext/parquet/src/ruby_reader.rs +2 -3
- data/ext/parquet/src/types/core_types.rs +73 -0
- data/ext/parquet/src/types/mod.rs +30 -0
- data/ext/parquet/src/{types.rs → types/parquet_value.rs} +171 -435
- data/ext/parquet/src/types/record_types.rs +204 -0
- data/ext/parquet/src/types/timestamp.rs +85 -0
- data/ext/parquet/src/types/type_conversion.rs +753 -0
- data/ext/parquet/src/types/writer_types.rs +270 -0
- data/ext/parquet/src/utils.rs +34 -26
- data/ext/parquet/src/writer/mod.rs +403 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rbi +33 -2
- metadata +13 -4
data/ext/parquet/src/reader/parquet_row_reader.rs
@@ -0,0 +1,152 @@
+use crate::header_cache::{HeaderCacheCleanupIter, StringCache};
+use crate::{
+    create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
+    ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
+};
+use ahash::RandomState;
+use magnus::rb_sys::AsRawValue;
+use magnus::value::{Opaque, ReprValue};
+use magnus::{block::Yield, Error as MagnusError, Ruby, Value};
+use parquet::file::reader::{FileReader, SerializedFileReader};
+use parquet::record::reader::RowIter as ParquetRowIter;
+use parquet::schema::types::{Type as SchemaType, TypePtr};
+use std::collections::HashMap;
+use std::fs::File;
+use std::mem::ManuallyDrop;
+use std::os::fd::FromRawFd;
+use std::sync::OnceLock;
+
+#[inline]
+pub fn parse_parquet_rows<'a>(
+    rb_self: Value,
+    args: &[Value],
+) -> Result<Yield<Box<dyn Iterator<Item = RowRecord<RandomState>>>>, MagnusError> {
+    let ruby = unsafe { Ruby::get_unchecked() };
+
+    let ParquetRowsArgs {
+        to_read,
+        result_type,
+        columns,
+    } = parse_parquet_rows_args(&ruby, args)?;
+
+    if !ruby.block_given() {
+        return create_row_enumerator(RowEnumeratorArgs {
+            rb_self,
+            to_read,
+            result_type,
+            columns,
+        });
+    }
+
+    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
+        let path_string = to_read.to_r_string()?;
+        let file_path = unsafe { path_string.as_str()? };
+        let file = File::open(file_path).unwrap();
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else if to_read.is_kind_of(ruby.class_io()) {
+        let raw_value = to_read.as_raw();
+        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
+            .map_err(|_| {
+                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
+            })?;
+
+        if fd < 0 {
+            return Err(ReaderError::InvalidFileDescriptor.into());
+        }
+
+        let file = unsafe { File::from_raw_fd(fd) };
+        let file = ForgottenFileHandle(ManuallyDrop::new(file));
+        let reader = SerializedFileReader::new(file).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    } else {
+        let readable = SeekableRubyValue(Opaque::from(to_read));
+        let reader = SerializedFileReader::new(readable).unwrap();
+        let schema = reader.metadata().file_metadata().schema().clone();
+
+        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
+    };
+
+    if let Some(cols) = columns {
+        let projection = create_projection_schema(&schema, &cols);
+        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+            MagnusError::new(
+                ruby.exception_runtime_error(),
+                format!("Failed to create projection: {}", e),
+            )
+        })?;
+    }
+
+    let iter: Box<dyn Iterator<Item = RowRecord<RandomState>>> = match result_type {
+        ParserResultType::Hash => {
+            let headers = OnceLock::new();
+            let headers_clone = headers.clone();
+            let iter = iter
+                .filter_map(move |row| {
+                    row.ok().map(|row| {
+                        let headers = headers_clone.get_or_init(|| {
+                            let column_count = row.get_column_iter().count();
+
+                            let mut header_string = Vec::with_capacity(column_count);
+                            for (k, _) in row.get_column_iter() {
+                                header_string.push(k.to_owned());
+                            }
+
+                            let headers = StringCache::intern_many(&header_string).unwrap();
+
+                            headers
+                        });
+
+                        let mut map =
+                            HashMap::with_capacity_and_hasher(headers.len(), Default::default());
+                        row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
+                            map.insert(headers[i], ParquetField(v.clone()));
+                        });
+                        map
+                    })
+                })
+                .map(RowRecord::Map);
+
+            Box::new(HeaderCacheCleanupIter {
+                inner: iter,
+                headers,
+            })
+        }
+        ParserResultType::Array => Box::new(
+            iter.filter_map(|row| {
+                row.ok().map(|row| {
+                    let column_count = row.get_column_iter().count();
+                    let mut vec = Vec::with_capacity(column_count);
+                    row.get_column_iter()
+                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
+                    vec
+                })
+            })
+            .map(RowRecord::Vec),
+        ),
+    };
+
+    Ok(Yield::Iter(iter))
+}
+
+fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
+    if let SchemaType::GroupType { fields, .. } = schema {
+        let projected_fields: Vec<TypePtr> = fields
+            .iter()
+            .filter(|field| columns.contains(&field.name().to_string()))
+            .cloned()
+            .collect();
+
+        SchemaType::GroupType {
+            basic_info: schema.get_basic_info().clone(),
+            fields: projected_fields,
+        }
+    } else {
+        // Return original schema if not a group type
+        schema.clone()
+    }
+}
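Aside: the projection step above prunes the schema before row iteration begins, so unselected columns are never decoded. A minimal stand-alone sketch of the same pruning, assuming the parquet crate's schema parser; the message text and field names are illustrative, and the function mirrors the hunk's create_projection_schema:

use parquet::schema::parser::parse_message_type;
use parquet::schema::types::{Type as SchemaType, TypePtr};

fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
    // Keep only the requested top-level fields of the root group.
    if let SchemaType::GroupType { fields, .. } = schema {
        let projected: Vec<TypePtr> = fields
            .iter()
            .filter(|f| columns.contains(&f.name().to_string()))
            .cloned()
            .collect();
        SchemaType::GroupType {
            basic_info: schema.get_basic_info().clone(),
            fields: projected,
        }
    } else {
        schema.clone()
    }
}

fn main() {
    let schema = parse_message_type(
        "message demo { required int64 id; optional binary name (UTF8); optional double score; }",
    )
    .unwrap();
    let projected = create_projection_schema(&schema, &["id".into(), "score".into()]);
    // Only `id` and `score` survive; `name` is pruned before any row is read.
    println!("{:#?}", projected);
}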
data/ext/parquet/src/ruby_reader.rs
@@ -14,9 +14,8 @@ pub struct RubyReader<T> {
     offset: usize,
 }
 
-pub trait SeekableRead:
-impl SeekableRead for
-impl SeekableRead for RubyReader<RString> {}
+pub trait SeekableRead: Read + Seek {}
+impl<T: Read + Seek> SeekableRead for T {}
 
 pub fn build_ruby_reader(
     ruby: &Ruby,
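Aside: the new bound is a blanket impl, so every Read + Seek type (File, std::io::Cursor, the Ruby-backed readers) satisfies SeekableRead without a per-type impl. A self-contained sketch of the pattern; the helper function and its name are illustrative:

use std::io::{Cursor, Read, Seek};

pub trait SeekableRead: Read + Seek {}
impl<T: Read + Seek> SeekableRead for T {}

// Accepts any Read + Seek source through one trait object.
fn first_byte(src: &mut dyn SeekableRead) -> std::io::Result<u8> {
    use std::io::SeekFrom;
    src.seek(SeekFrom::Start(0))?;
    let mut buf = [0u8; 1];
    src.read_exact(&mut buf)?;
    Ok(buf[0])
}

fn main() -> std::io::Result<()> {
    let mut cur = Cursor::new(vec![42u8, 7]);
    assert_eq!(first_byte(&mut cur)?, 42); // Cursor<Vec<u8>> is Read + Seek
    Ok(())
}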
data/ext/parquet/src/types/core_types.rs
@@ -0,0 +1,73 @@
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
+pub enum ParserResultType {
+    Hash,
+    Array,
+}
+
+impl ParserResultType {
+    pub fn iter() -> impl Iterator<Item = Self> {
+        [Self::Hash, Self::Array].into_iter()
+    }
+}
+
+impl TryFrom<&str> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: &str) -> Result<Self, Self::Error> {
+        match value {
+            "hash" => Ok(ParserResultType::Hash),
+            "array" => Ok(ParserResultType::Array),
+            _ => Err(format!("Invalid parser result type: {}", value)),
+        }
+    }
+}
+
+impl TryFrom<String> for ParserResultType {
+    type Error = String;
+
+    fn try_from(value: String) -> Result<Self, Self::Error> {
+        Self::try_from(value.as_str())
+    }
+}
+
+impl std::fmt::Display for ParserResultType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            ParserResultType::Hash => write!(f, "hash"),
+            ParserResultType::Array => write!(f, "array"),
+        }
+    }
+}
+
+#[derive(Debug, Clone)]
+pub struct ListField {
+    pub item_type: ParquetSchemaType,
+}
+
+#[derive(Debug, Clone)]
+pub struct MapField {
+    pub key_type: ParquetSchemaType,
+    pub value_type: ParquetSchemaType,
+}
+
+#[derive(Debug, Clone)]
+pub enum ParquetSchemaType {
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    UInt8,
+    UInt16,
+    UInt32,
+    UInt64,
+    Float,
+    Double,
+    String,
+    Binary,
+    Boolean,
+    Date32,
+    TimestampMillis,
+    TimestampMicros,
+    List(Box<ListField>),
+    Map(Box<MapField>),
+}
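Aside: a short sketch of how these conversions round-trip, with the enum and the relevant impls restated compactly so it runs on its own; the rejected "json" input is just an example value:

use std::convert::TryFrom;

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum ParserResultType { Hash, Array }

impl TryFrom<&str> for ParserResultType {
    type Error = String;
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        match value {
            "hash" => Ok(Self::Hash),
            "array" => Ok(Self::Array),
            _ => Err(format!("Invalid parser result type: {}", value)),
        }
    }
}

impl std::fmt::Display for ParserResultType {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match self { Self::Hash => "hash", Self::Array => "array" })
    }
}

fn main() {
    // Round trip: option string from Ruby -> enum -> canonical name back.
    let t = ParserResultType::try_from("hash").unwrap();
    assert_eq!(t, ParserResultType::Hash);
    assert_eq!(t.to_string(), "hash");
    // Unknown strings are rejected with a descriptive error.
    assert!(ParserResultType::try_from("json").is_err());
}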
data/ext/parquet/src/types/mod.rs
@@ -0,0 +1,30 @@
+// Re-export all public items from submodules
+mod core_types;
+mod parquet_value;
+mod record_types;
+mod timestamp;
+mod type_conversion;
+mod writer_types;
+
+pub use core_types::*;
+pub use parquet_value::*;
+pub use record_types::*;
+pub use timestamp::*;
+pub use type_conversion::*;
+pub use writer_types::*;
+
+// Common imports used across the module
+use arrow_array::cast::downcast_array;
+use arrow_array::{
+    Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
+    Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
+    StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
+    TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
+};
+use arrow_schema::{DataType, TimeUnit};
+use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, TryConvert, Value};
+use parquet::data_type::Decimal;
+use parquet::record::Field;
+use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
+
+use crate::header_cache::StringCacheKey;
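Aside: this mod.rs acts as a facade: the submodules stay private while their items are re-exported flat, so callers import from crate::types without knowing the file layout. A toy, self-contained sketch of the same shape; the module and type names here are illustrative:

mod types {
    mod core_types {
        #[derive(Debug)]
        pub enum ParserResultType { Hash, Array }
    }
    // Flat re-export: callers never name the submodule.
    pub use core_types::*;
}

fn main() {
    // Consumers reach `types::ParserResultType`, not `types::core_types::...`.
    let t = types::ParserResultType::Array;
    println!("{:?}", t);
}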