parquet 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +11 -12
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +110 -0
- data/ext/parquet/src/reader/mod.rs +1 -43
- data/ext/parquet/src/reader/parquet_column_reader.rs +50 -86
- data/ext/parquet/src/reader/parquet_row_reader.rs +53 -23
- data/ext/parquet/src/ruby_reader.rs +37 -25
- data/ext/parquet/src/types/core_types.rs +47 -6
- data/ext/parquet/src/types/mod.rs +64 -1
- data/ext/parquet/src/types/parquet_value.rs +284 -102
- data/ext/parquet/src/types/record_types.rs +24 -23
- data/ext/parquet/src/types/schema_converter.rs +244 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +16 -8
- data/ext/parquet/src/types/type_conversion.rs +1151 -521
- data/ext/parquet/src/types/writer_types.rs +94 -151
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +342 -457
- data/ext/parquet/src/writer/write_columns.rs +226 -0
- data/ext/parquet/src/writer/write_rows.rs +484 -0
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +9 -2
@@ -13,14 +13,18 @@ use std::{
|
|
13
13
|
sync::Arc,
|
14
14
|
};
|
15
15
|
|
16
|
+
use crate::types::ParquetGemError;
|
17
|
+
|
16
18
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
17
19
|
/// and provide a standard Read implementation for them.
|
18
20
|
pub enum RubyReader {
|
19
21
|
String {
|
22
|
+
ruby: Arc<Ruby>,
|
20
23
|
inner: Opaque<RString>,
|
21
24
|
offset: usize,
|
22
25
|
},
|
23
26
|
RubyIoLike {
|
27
|
+
ruby: Arc<Ruby>,
|
24
28
|
inner: Opaque<Value>,
|
25
29
|
},
|
26
30
|
NativeProxyIoLike {
|
@@ -28,26 +32,15 @@ pub enum RubyReader {
|
|
28
32
|
},
|
29
33
|
}
|
30
34
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
}
|
35
|
-
|
36
|
-
// For now, don't use this. Having to use seek in length is scary.
|
37
|
-
fn is_seekable_io_like(value: &Value) -> bool {
|
38
|
-
Self::is_io_like(value)
|
39
|
-
&& value.respond_to("seek", false).unwrap_or(false)
|
40
|
-
&& value.respond_to("pos", false).unwrap_or(false)
|
41
|
-
}
|
42
|
-
}
|
43
|
-
|
44
|
-
impl TryFrom<Value> for RubyReader {
|
45
|
-
type Error = magnus::Error;
|
35
|
+
// Sending is technically not safe, but the only thing that threatens to
|
36
|
+
// do this is the parquet gem, and they don't seem to actually do it.
|
37
|
+
unsafe impl Send for RubyReader {}
|
46
38
|
|
47
|
-
|
48
|
-
|
39
|
+
impl RubyReader {
|
40
|
+
pub fn new(ruby: Arc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
|
49
41
|
if RubyReader::is_seekable_io_like(&value) {
|
50
42
|
Ok(RubyReader::RubyIoLike {
|
43
|
+
ruby,
|
51
44
|
inner: Opaque::from(value),
|
52
45
|
})
|
53
46
|
} else if RubyReader::is_io_like(&value) {
|
@@ -56,6 +49,7 @@ impl TryFrom<Value> for RubyReader {
|
|
56
49
|
|
57
50
|
// This is safe, because we won't call seek
|
58
51
|
let inner_readable = RubyReader::RubyIoLike {
|
52
|
+
ruby: ruby.clone(),
|
59
53
|
inner: Opaque::from(value),
|
60
54
|
};
|
61
55
|
let mut reader = BufReader::new(inner_readable);
|
@@ -74,19 +68,31 @@ impl TryFrom<Value> for RubyReader {
|
|
74
68
|
.funcall::<_, _, RString>("to_str", ())
|
75
69
|
.or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
|
76
70
|
Ok(RubyReader::String {
|
71
|
+
ruby,
|
77
72
|
inner: Opaque::from(string_content),
|
78
73
|
offset: 0,
|
79
74
|
})
|
80
75
|
}
|
81
76
|
}
|
77
|
+
|
78
|
+
fn is_io_like(value: &Value) -> bool {
|
79
|
+
value.respond_to("read", false).unwrap_or(false)
|
80
|
+
}
|
81
|
+
|
82
|
+
// For now, don't use this. Having to use seek in length is scary.
|
83
|
+
fn is_seekable_io_like(value: &Value) -> bool {
|
84
|
+
Self::is_io_like(value)
|
85
|
+
&& value.respond_to("seek", false).unwrap_or(false)
|
86
|
+
&& value.respond_to("pos", false).unwrap_or(false)
|
87
|
+
}
|
82
88
|
}
|
83
89
|
|
84
90
|
impl Seek for RubyReader {
|
85
91
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
86
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
87
92
|
match self {
|
88
93
|
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
|
89
94
|
RubyReader::String {
|
95
|
+
ruby,
|
90
96
|
inner,
|
91
97
|
offset: original_offset,
|
92
98
|
} => {
|
@@ -107,7 +113,7 @@ impl Seek for RubyReader {
|
|
107
113
|
*original_offset = new_offset.min(unwrapped_inner.len());
|
108
114
|
Ok(*original_offset as u64)
|
109
115
|
}
|
110
|
-
RubyReader::RubyIoLike { inner } => {
|
116
|
+
RubyReader::RubyIoLike { ruby, inner } => {
|
111
117
|
let unwrapped_inner = ruby.get_inner(*inner);
|
112
118
|
|
113
119
|
let (whence, ruby_offset) = match pos {
|
@@ -132,10 +138,13 @@ impl Seek for RubyReader {
|
|
132
138
|
|
133
139
|
impl Read for RubyReader {
|
134
140
|
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
135
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
136
141
|
match self {
|
137
142
|
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
|
138
|
-
RubyReader::String {
|
143
|
+
RubyReader::String {
|
144
|
+
ruby,
|
145
|
+
inner,
|
146
|
+
offset,
|
147
|
+
} => {
|
139
148
|
let unwrapped_inner = ruby.get_inner(*inner);
|
140
149
|
|
141
150
|
let string_buffer = unsafe { unwrapped_inner.as_slice() };
|
@@ -151,7 +160,7 @@ impl Read for RubyReader {
|
|
151
160
|
|
152
161
|
Ok(copy_size)
|
153
162
|
}
|
154
|
-
RubyReader::RubyIoLike { inner } => {
|
163
|
+
RubyReader::RubyIoLike { ruby, inner } => {
|
155
164
|
let unwrapped_inner = ruby.get_inner(*inner);
|
156
165
|
|
157
166
|
let bytes = unwrapped_inner
|
@@ -175,14 +184,17 @@ impl Read for RubyReader {
|
|
175
184
|
|
176
185
|
impl Length for RubyReader {
|
177
186
|
fn len(&self) -> u64 {
|
178
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
179
187
|
match self {
|
180
188
|
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
|
181
|
-
RubyReader::String {
|
189
|
+
RubyReader::String {
|
190
|
+
ruby,
|
191
|
+
inner,
|
192
|
+
offset: _,
|
193
|
+
} => {
|
182
194
|
let unwrapped_inner = ruby.get_inner(*inner);
|
183
195
|
unwrapped_inner.len() as u64
|
184
196
|
}
|
185
|
-
RubyReader::RubyIoLike { inner } => {
|
197
|
+
RubyReader::RubyIoLike { ruby, inner } => {
|
186
198
|
let unwrapped_inner = ruby.get_inner(*inner);
|
187
199
|
|
188
200
|
// Get current position
|
@@ -43,17 +43,60 @@ impl std::fmt::Display for ParserResultType {
|
|
43
43
|
pub struct ListField<'a> {
|
44
44
|
pub item_type: ParquetSchemaType<'a>,
|
45
45
|
pub format: Option<&'a str>,
|
46
|
+
pub nullable: bool,
|
46
47
|
}
|
47
48
|
|
48
49
|
#[derive(Debug, Clone)]
|
49
50
|
pub struct MapField<'a> {
|
50
51
|
pub key_type: ParquetSchemaType<'a>,
|
51
52
|
pub value_type: ParquetSchemaType<'a>,
|
52
|
-
pub
|
53
|
+
pub key_format: Option<&'a str>,
|
54
|
+
pub value_format: Option<&'a str>,
|
55
|
+
pub value_nullable: bool,
|
53
56
|
}
|
54
57
|
|
55
58
|
#[derive(Debug, Clone)]
|
59
|
+
pub struct StructField<'a> {
|
60
|
+
pub fields: Vec<super::writer_types::SchemaField<'a>>,
|
61
|
+
}
|
62
|
+
|
63
|
+
#[derive(Clone, Debug)]
|
56
64
|
pub enum ParquetSchemaType<'a> {
|
65
|
+
Primitive(PrimitiveType),
|
66
|
+
List(Box<ListField<'a>>),
|
67
|
+
Map(Box<MapField<'a>>),
|
68
|
+
Struct(Box<StructField<'a>>),
|
69
|
+
}
|
70
|
+
|
71
|
+
// New schema representation for the DSL-based approach
|
72
|
+
#[derive(Debug, Clone)]
|
73
|
+
pub enum SchemaNode {
|
74
|
+
Struct {
|
75
|
+
name: String,
|
76
|
+
nullable: bool,
|
77
|
+
fields: Vec<SchemaNode>,
|
78
|
+
},
|
79
|
+
List {
|
80
|
+
name: String,
|
81
|
+
nullable: bool,
|
82
|
+
item: Box<SchemaNode>,
|
83
|
+
},
|
84
|
+
Map {
|
85
|
+
name: String,
|
86
|
+
nullable: bool,
|
87
|
+
key: Box<SchemaNode>,
|
88
|
+
value: Box<SchemaNode>,
|
89
|
+
},
|
90
|
+
Primitive {
|
91
|
+
name: String,
|
92
|
+
parquet_type: PrimitiveType,
|
93
|
+
nullable: bool,
|
94
|
+
format: Option<String>,
|
95
|
+
},
|
96
|
+
}
|
97
|
+
|
98
|
+
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
99
|
+
pub enum PrimitiveType {
|
57
100
|
Int8,
|
58
101
|
Int16,
|
59
102
|
Int32,
|
@@ -62,14 +105,12 @@ pub enum ParquetSchemaType<'a> {
|
|
62
105
|
UInt16,
|
63
106
|
UInt32,
|
64
107
|
UInt64,
|
65
|
-
|
66
|
-
|
108
|
+
Float32,
|
109
|
+
Float64,
|
110
|
+
Boolean,
|
67
111
|
String,
|
68
112
|
Binary,
|
69
|
-
Boolean,
|
70
113
|
Date32,
|
71
114
|
TimestampMillis,
|
72
115
|
TimestampMicros,
|
73
|
-
List(Box<ListField<'a>>),
|
74
|
-
Map(Box<MapField<'a>>),
|
75
116
|
}
|
@@ -2,13 +2,20 @@
|
|
2
2
|
mod core_types;
|
3
3
|
mod parquet_value;
|
4
4
|
mod record_types;
|
5
|
+
pub mod schema_converter;
|
6
|
+
pub mod schema_node;
|
5
7
|
mod timestamp;
|
6
|
-
mod type_conversion;
|
8
|
+
pub mod type_conversion;
|
7
9
|
mod writer_types;
|
8
10
|
|
9
11
|
pub use core_types::*;
|
10
12
|
pub use parquet_value::*;
|
11
13
|
pub use record_types::*;
|
14
|
+
// Explicitly export schema-related items
|
15
|
+
pub use schema_converter::{
|
16
|
+
infer_schema_from_first_row, legacy_schema_to_dsl, parse_legacy_schema,
|
17
|
+
};
|
18
|
+
pub use schema_node::parse_schema_node;
|
12
19
|
pub use timestamp::*;
|
13
20
|
pub use type_conversion::*;
|
14
21
|
pub use writer_types::*;
|
@@ -28,3 +35,59 @@ use parquet::record::Field;
|
|
28
35
|
use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
|
29
36
|
|
30
37
|
use crate::header_cache::StringCacheKey;
|
38
|
+
|
39
|
+
use crate::header_cache::CacheError;
|
40
|
+
|
41
|
+
use std::io;
|
42
|
+
|
43
|
+
use thiserror::Error;
|
44
|
+
|
45
|
+
#[derive(Error, Debug)]
|
46
|
+
pub enum ParquetGemError {
|
47
|
+
#[error("Failed to open file: {0}")]
|
48
|
+
FileOpen(#[from] io::Error),
|
49
|
+
#[error("Failed to intern headers: {0}")]
|
50
|
+
HeaderIntern(#[from] CacheError),
|
51
|
+
#[error("Ruby error: {0}")]
|
52
|
+
Ruby(#[from] MagnusErrorWrapper),
|
53
|
+
#[error("Parquet error: {0}")]
|
54
|
+
Parquet(#[from] parquet::errors::ParquetError),
|
55
|
+
#[error("Arrow error: {0}")]
|
56
|
+
Arrow(#[from] arrow_schema::ArrowError),
|
57
|
+
#[error("UTF-8 error: {0}")]
|
58
|
+
Utf8Error(#[from] simdutf8::basic::Utf8Error),
|
59
|
+
#[error("Jiff error: {0}")]
|
60
|
+
Jiff(#[from] jiff::Error),
|
61
|
+
}
|
62
|
+
|
63
|
+
#[derive(Debug)]
|
64
|
+
pub struct MagnusErrorWrapper(pub MagnusError);
|
65
|
+
|
66
|
+
impl From<MagnusError> for MagnusErrorWrapper {
|
67
|
+
fn from(err: MagnusError) -> Self {
|
68
|
+
Self(err)
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
impl std::fmt::Display for MagnusErrorWrapper {
|
73
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
74
|
+
write!(f, "{}", self.0)
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
impl std::error::Error for MagnusErrorWrapper {}
|
79
|
+
|
80
|
+
impl From<MagnusError> for ParquetGemError {
|
81
|
+
fn from(err: MagnusError) -> Self {
|
82
|
+
Self::Ruby(MagnusErrorWrapper(err))
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
impl Into<MagnusError> for ParquetGemError {
|
87
|
+
fn into(self) -> MagnusError {
|
88
|
+
match self {
|
89
|
+
Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
|
90
|
+
_ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
|
91
|
+
}
|
92
|
+
}
|
93
|
+
}
|