parquet 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +1 -1
- data/ext/parquet/src/header_cache.rs +4 -9
- data/ext/parquet/src/logger.rs +2 -2
- data/ext/parquet/src/reader/common.rs +12 -15
- data/ext/parquet/src/reader/mod.rs +0 -56
- data/ext/parquet/src/reader/parquet_column_reader.rs +20 -16
- data/ext/parquet/src/reader/parquet_row_reader.rs +21 -14
- data/ext/parquet/src/ruby_reader.rs +37 -25
- data/ext/parquet/src/types/core_types.rs +2 -17
- data/ext/parquet/src/types/mod.rs +56 -0
- data/ext/parquet/src/types/parquet_value.rs +101 -95
- data/ext/parquet/src/types/record_types.rs +12 -14
- data/ext/parquet/src/types/schema_converter.rs +4 -109
- data/ext/parquet/src/types/timestamp.rs +3 -5
- data/ext/parquet/src/types/type_conversion.rs +116 -81
- data/ext/parquet/src/types/writer_types.rs +26 -54
- data/ext/parquet/src/writer/mod.rs +176 -839
- data/ext/parquet/src/writer/write_columns.rs +226 -0
- data/ext/parquet/src/writer/write_rows.rs +484 -0
- data/lib/parquet/version.rb +1 -1
- metadata +3 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0d72c16371c10a011af5118f2915de9bbeb33cde133369bdac2050e3c035572e
|
4
|
+
data.tar.gz: b39c6ec9a8232eca5b5b156bf28992ed59c05e9a36e4c13db2b8933a74485ba0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: c7f338b1d010fa59c2344065b233ff20a08d4a17c6ca987ef72677150dd1cbf55d134855585d68e187b748dc5121f13d5e86cb82aabc1eeb3562a3326aca459c
|
7
|
+
data.tar.gz: 69eaa6b133123944138a826612a7b48d9f87acb202ecbe172e253be02a1a1c7009e3d7182e8bb31ae423098bc34bb5dddc4ce042453f0d1cb41505d56d02c21e
|
data/README.md
CHANGED
@@ -294,7 +294,7 @@ The Schema DSL supports:
|
|
294
294
|
- **Complex types**: Structs, lists, and maps with arbitrary nesting
|
295
295
|
- **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
|
296
296
|
- **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
|
297
|
-
- **Map key/value nullability**: Control whether map keys or values can be null with `
|
297
|
+
- **Map key/value nullability**: Control whether map keys or values can be null with `value_nullable: false/true`
|
298
298
|
|
299
299
|
Note: When using List and Map types, you need to provide at least:
|
300
300
|
- For lists: The `item:` parameter specifying the item type
|
@@ -6,10 +6,7 @@
|
|
6
6
|
/// so this optimization could be removed if any issues arise.
|
7
7
|
use std::{
|
8
8
|
collections::HashMap,
|
9
|
-
sync::{
|
10
|
-
atomic::{AtomicU32, Ordering},
|
11
|
-
LazyLock, Mutex,
|
12
|
-
},
|
9
|
+
sync::{LazyLock, Mutex},
|
13
10
|
};
|
14
11
|
|
15
12
|
use magnus::{IntoValue, RString, Ruby, Value};
|
@@ -24,7 +21,7 @@ pub enum CacheError {
|
|
24
21
|
RStringConversion(String),
|
25
22
|
}
|
26
23
|
|
27
|
-
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str,
|
24
|
+
static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, StringCacheKey>>> =
|
28
25
|
LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
|
29
26
|
|
30
27
|
pub struct StringCache;
|
@@ -84,18 +81,16 @@ impl StringCache {
|
|
84
81
|
pub fn intern_many<AsStr: AsRef<str>>(
|
85
82
|
strings: &[AsStr],
|
86
83
|
) -> Result<Vec<StringCacheKey>, CacheError> {
|
87
|
-
let
|
84
|
+
let cache = STRING_CACHE
|
88
85
|
.lock()
|
89
86
|
.map_err(|e| CacheError::LockError(e.to_string()))?;
|
90
87
|
|
91
88
|
let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
|
92
89
|
for string in strings {
|
93
|
-
if let Some((_,
|
94
|
-
counter.fetch_add(1, Ordering::Relaxed);
|
90
|
+
if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) {
|
95
91
|
result.push(*interned_string);
|
96
92
|
} else {
|
97
93
|
let interned = StringCacheKey::new(string.as_ref())?;
|
98
|
-
cache.insert(interned.0, (interned, AtomicU32::new(1)));
|
99
94
|
result.push(interned);
|
100
95
|
}
|
101
96
|
}
|
data/ext/parquet/src/logger.rs
CHANGED
@@ -5,7 +5,7 @@ use std::str::FromStr;
|
|
5
5
|
|
6
6
|
use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
|
7
7
|
|
8
|
-
use crate::{
|
8
|
+
use crate::{types::ParquetGemError, utils::parse_string_or_symbol};
|
9
9
|
|
10
10
|
/// Severity levels that match Ruby's Logger levels
|
11
11
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
|
@@ -45,7 +45,7 @@ pub struct RubyLogger {
|
|
45
45
|
|
46
46
|
#[allow(dead_code)]
|
47
47
|
impl RubyLogger {
|
48
|
-
pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self,
|
48
|
+
pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ParquetGemError> {
|
49
49
|
let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
|
50
50
|
.unwrap_or_else(|_| "warn".to_string())
|
51
51
|
.parse::<LogLevel>()
|
@@ -8,32 +8,29 @@ use std::fs::File;
|
|
8
8
|
use std::sync::Arc;
|
9
9
|
|
10
10
|
use magnus::value::ReprValue;
|
11
|
-
use magnus::{Error as MagnusError, Value};
|
11
|
+
use magnus::{Error as MagnusError, Ruby, Value};
|
12
12
|
|
13
13
|
use crate::header_cache::StringCache;
|
14
14
|
use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
|
15
|
-
use crate::types::TryIntoValue;
|
15
|
+
use crate::types::{ParquetGemError, TryIntoValue};
|
16
16
|
use crate::ColumnRecord;
|
17
17
|
|
18
|
-
use super::ReaderError;
|
19
|
-
|
20
18
|
/// Opens a parquet file or IO-like object for reading
|
21
19
|
///
|
22
20
|
/// This function handles both file paths (as strings) and IO-like objects,
|
23
21
|
/// returning either a File or a ThreadSafeRubyReader that can be used with
|
24
22
|
/// parquet readers.
|
25
23
|
pub fn open_parquet_source(
|
24
|
+
ruby: Arc<Ruby>,
|
26
25
|
to_read: Value,
|
27
|
-
) -> Result<Either<File, ThreadSafeRubyReader>,
|
28
|
-
let ruby = unsafe { magnus::Ruby::get_unchecked() };
|
29
|
-
|
26
|
+
) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
|
30
27
|
if to_read.is_kind_of(ruby.class_string()) {
|
31
28
|
let path_string = to_read.to_r_string()?;
|
32
29
|
let file_path = unsafe { path_string.as_str()? };
|
33
|
-
let file = File::open(file_path).map_err(
|
30
|
+
let file = File::open(file_path).map_err(ParquetGemError::from)?;
|
34
31
|
Ok(Either::Left(file))
|
35
32
|
} else {
|
36
|
-
let readable = ThreadSafeRubyReader::new(RubyReader::
|
33
|
+
let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
|
37
34
|
Ok(Either::Right(readable))
|
38
35
|
}
|
39
36
|
}
|
@@ -60,9 +57,9 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
|
|
60
57
|
reader: T,
|
61
58
|
columns: &Option<Vec<String>>,
|
62
59
|
batch_size: Option<usize>,
|
63
|
-
) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64),
|
64
|
-
let mut builder =
|
65
|
-
|
60
|
+
) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
|
61
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
|
62
|
+
.map_err(|e| ParquetGemError::Parquet(e))?;
|
66
63
|
|
67
64
|
let schema = builder.schema().clone();
|
68
65
|
let num_rows = builder.metadata().file_metadata().num_rows();
|
@@ -81,7 +78,7 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
|
|
81
78
|
builder = builder.with_batch_size(batch_size);
|
82
79
|
}
|
83
80
|
|
84
|
-
let reader = builder.build().map_err(|e|
|
81
|
+
let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
|
85
82
|
Ok((reader, schema, num_rows))
|
86
83
|
}
|
87
84
|
|
@@ -91,7 +88,7 @@ pub fn handle_empty_file(
|
|
91
88
|
ruby: &magnus::Ruby,
|
92
89
|
schema: &Arc<Schema>,
|
93
90
|
num_rows: i64,
|
94
|
-
) -> Result<bool,
|
91
|
+
) -> Result<bool, ParquetGemError> {
|
95
92
|
if num_rows == 0 {
|
96
93
|
let mut map =
|
97
94
|
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
@@ -101,7 +98,7 @@ pub fn handle_empty_file(
|
|
101
98
|
.map(|field| field.name().to_string())
|
102
99
|
.collect();
|
103
100
|
let interned_headers =
|
104
|
-
StringCache::intern_many(&headers).map_err(|e|
|
101
|
+
StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
|
105
102
|
for field in interned_headers.iter() {
|
106
103
|
map.insert(*field, vec![]);
|
107
104
|
}
|
@@ -2,61 +2,5 @@ mod common;
|
|
2
2
|
mod parquet_column_reader;
|
3
3
|
mod parquet_row_reader;
|
4
4
|
|
5
|
-
use std::io;
|
6
|
-
|
7
|
-
use magnus::Error as MagnusError;
|
8
|
-
use thiserror::Error;
|
9
|
-
|
10
|
-
use crate::header_cache::CacheError;
|
11
5
|
pub use parquet_column_reader::parse_parquet_columns;
|
12
6
|
pub use parquet_row_reader::parse_parquet_rows;
|
13
|
-
|
14
|
-
#[derive(Error, Debug)]
|
15
|
-
pub enum ReaderError {
|
16
|
-
#[error("Failed to open file: {0}")]
|
17
|
-
FileOpen(#[from] io::Error),
|
18
|
-
#[error("Failed to intern headers: {0}")]
|
19
|
-
HeaderIntern(#[from] CacheError),
|
20
|
-
#[error("Ruby error: {0}")]
|
21
|
-
Ruby(#[from] MagnusErrorWrapper),
|
22
|
-
#[error("Parquet error: {0}")]
|
23
|
-
Parquet(#[from] parquet::errors::ParquetError),
|
24
|
-
#[error("Arrow error: {0}")]
|
25
|
-
Arrow(#[from] arrow_schema::ArrowError),
|
26
|
-
#[error("UTF-8 error: {0}")]
|
27
|
-
Utf8Error(#[from] simdutf8::basic::Utf8Error),
|
28
|
-
#[error("Jiff error: {0}")]
|
29
|
-
Jiff(#[from] jiff::Error),
|
30
|
-
}
|
31
|
-
|
32
|
-
#[derive(Debug)]
|
33
|
-
pub struct MagnusErrorWrapper(pub MagnusError);
|
34
|
-
|
35
|
-
impl From<MagnusError> for MagnusErrorWrapper {
|
36
|
-
fn from(err: MagnusError) -> Self {
|
37
|
-
Self(err)
|
38
|
-
}
|
39
|
-
}
|
40
|
-
|
41
|
-
impl std::fmt::Display for MagnusErrorWrapper {
|
42
|
-
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
43
|
-
write!(f, "{}", self.0)
|
44
|
-
}
|
45
|
-
}
|
46
|
-
|
47
|
-
impl std::error::Error for MagnusErrorWrapper {}
|
48
|
-
|
49
|
-
impl From<MagnusError> for ReaderError {
|
50
|
-
fn from(err: MagnusError) -> Self {
|
51
|
-
Self::Ruby(MagnusErrorWrapper(err))
|
52
|
-
}
|
53
|
-
}
|
54
|
-
|
55
|
-
impl Into<MagnusError> for ReaderError {
|
56
|
-
fn into(self) -> MagnusError {
|
57
|
-
match self {
|
58
|
-
Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
|
59
|
-
_ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
|
60
|
-
}
|
61
|
-
}
|
62
|
-
}
|
@@ -1,6 +1,6 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
2
|
use crate::logger::RubyLogger;
|
3
|
-
use crate::types::{ArrayWrapper, TryIntoValue};
|
3
|
+
use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
|
4
4
|
use crate::{
|
5
5
|
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
|
6
6
|
ParserResultType,
|
@@ -10,25 +10,29 @@ use either::Either;
|
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
12
|
use std::collections::HashMap;
|
13
|
-
use std::sync::OnceLock;
|
13
|
+
use std::sync::{Arc, OnceLock};
|
14
14
|
|
15
15
|
use super::common::{
|
16
16
|
create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
|
17
17
|
};
|
18
|
-
use super::ReaderError;
|
19
18
|
|
20
19
|
#[inline]
|
21
20
|
pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
21
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
22
|
+
Ok(
|
23
|
+
parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
|
24
|
+
let z: MagnusError = e.into();
|
25
|
+
z
|
26
|
+
})?,
|
27
|
+
)
|
26
28
|
}
|
27
29
|
|
28
30
|
#[inline]
|
29
|
-
fn parse_parquet_columns_impl<'a>(
|
30
|
-
|
31
|
-
|
31
|
+
fn parse_parquet_columns_impl<'a>(
|
32
|
+
ruby: Arc<Ruby>,
|
33
|
+
rb_self: Value,
|
34
|
+
args: &[Value],
|
35
|
+
) -> Result<Value, ParquetGemError> {
|
32
36
|
let ParquetColumnsArgs {
|
33
37
|
to_read,
|
34
38
|
result_type,
|
@@ -63,7 +67,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
|
|
63
67
|
return Ok(enum_value);
|
64
68
|
}
|
65
69
|
|
66
|
-
let source = open_parquet_source(to_read)?;
|
70
|
+
let source = open_parquet_source(ruby.clone(), to_read)?;
|
67
71
|
|
68
72
|
// Use the common function to create the batch reader
|
69
73
|
|
@@ -82,7 +86,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
|
|
82
86
|
let headers = OnceLock::new();
|
83
87
|
let headers_clone = headers.clone();
|
84
88
|
let iter = batch_reader.map(move |batch| {
|
85
|
-
batch.map_err(
|
89
|
+
batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
|
86
90
|
let local_headers = headers_clone
|
87
91
|
.get_or_init(|| {
|
88
92
|
let schema = batch.schema();
|
@@ -94,7 +98,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
|
|
94
98
|
StringCache::intern_many(&header_string)
|
95
99
|
})
|
96
100
|
.as_ref()
|
97
|
-
.map_err(|e|
|
101
|
+
.map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
|
98
102
|
|
99
103
|
let mut map = HashMap::with_capacity_and_hasher(
|
100
104
|
local_headers.len(),
|
@@ -112,7 +116,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
|
|
112
116
|
strict: strict,
|
113
117
|
})?;
|
114
118
|
map.insert(header, values.into_inner());
|
115
|
-
Ok::<_,
|
119
|
+
Ok::<_, ParquetGemError>(())
|
116
120
|
})?;
|
117
121
|
|
118
122
|
Ok(ColumnRecord::Map::<RandomState>(map))
|
@@ -126,7 +130,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
|
|
126
130
|
}
|
127
131
|
ParserResultType::Array => {
|
128
132
|
let iter = batch_reader.map(|batch| {
|
129
|
-
batch.map_err(
|
133
|
+
batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
|
130
134
|
let vec = batch
|
131
135
|
.columns()
|
132
136
|
.into_iter()
|
@@ -135,7 +139,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
|
|
135
139
|
array: &*column,
|
136
140
|
strict: strict,
|
137
141
|
})?;
|
138
|
-
Ok::<_,
|
142
|
+
Ok::<_, ParquetGemError>(values.into_inner())
|
139
143
|
})
|
140
144
|
.collect::<Result<Vec<_>, _>>()?;
|
141
145
|
Ok(ColumnRecord::Vec::<RandomState>(vec))
|
@@ -2,7 +2,7 @@ use crate::header_cache::StringCache;
|
|
2
2
|
use crate::logger::RubyLogger;
|
3
3
|
use crate::types::TryIntoValue;
|
4
4
|
use crate::{
|
5
|
-
create_row_enumerator, utils::*, ParquetField,
|
5
|
+
create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
|
6
6
|
RowEnumeratorArgs, RowRecord,
|
7
7
|
};
|
8
8
|
use ahash::RandomState;
|
@@ -13,22 +13,27 @@ use parquet::file::reader::{FileReader, SerializedFileReader};
|
|
13
13
|
use parquet::record::reader::RowIter as ParquetRowIter;
|
14
14
|
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
15
15
|
use std::collections::HashMap;
|
16
|
-
use std::sync::OnceLock;
|
16
|
+
use std::sync::{Arc, OnceLock};
|
17
17
|
|
18
18
|
use super::common::{handle_block_or_enum, open_parquet_source};
|
19
19
|
|
20
20
|
#[inline]
|
21
21
|
pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
22
|
+
let ruby = unsafe { Ruby::get_unchecked() };
|
23
|
+
Ok(
|
24
|
+
parse_parquet_rows_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
|
25
|
+
let z: MagnusError = e.into();
|
26
|
+
z
|
27
|
+
})?,
|
28
|
+
)
|
26
29
|
}
|
27
30
|
|
28
31
|
#[inline]
|
29
|
-
fn parse_parquet_rows_impl<'a>(
|
30
|
-
|
31
|
-
|
32
|
+
fn parse_parquet_rows_impl<'a>(
|
33
|
+
ruby: Arc<Ruby>,
|
34
|
+
rb_self: Value,
|
35
|
+
args: &[Value],
|
36
|
+
) -> Result<Value, ParquetGemError> {
|
32
37
|
let ParquetRowsArgs {
|
33
38
|
to_read,
|
34
39
|
result_type,
|
@@ -58,11 +63,13 @@ fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value,
|
|
58
63
|
return Ok(enum_value);
|
59
64
|
}
|
60
65
|
|
61
|
-
let source = open_parquet_source(to_read)?;
|
66
|
+
let source = open_parquet_source(ruby.clone(), to_read)?;
|
62
67
|
let reader: Box<dyn FileReader> = match source {
|
63
|
-
Either::Left(file) =>
|
68
|
+
Either::Left(file) => {
|
69
|
+
Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
|
70
|
+
}
|
64
71
|
Either::Right(readable) => {
|
65
|
-
Box::new(SerializedFileReader::new(readable).map_err(
|
72
|
+
Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
|
66
73
|
}
|
67
74
|
};
|
68
75
|
|
@@ -109,7 +116,7 @@ fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value,
|
|
109
116
|
Ok(map)
|
110
117
|
})
|
111
118
|
.and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
|
112
|
-
.map_err(|e|
|
119
|
+
.map_err(|e| ParquetGemError::from(e))
|
113
120
|
});
|
114
121
|
|
115
122
|
for result in iter {
|
@@ -128,7 +135,7 @@ fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value,
|
|
128
135
|
Ok(vec)
|
129
136
|
})
|
130
137
|
.and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
|
131
|
-
.map_err(|e|
|
138
|
+
.map_err(|e| ParquetGemError::from(e))
|
132
139
|
});
|
133
140
|
|
134
141
|
for result in iter {
|
@@ -13,14 +13,18 @@ use std::{
|
|
13
13
|
sync::Arc,
|
14
14
|
};
|
15
15
|
|
16
|
+
use crate::types::ParquetGemError;
|
17
|
+
|
16
18
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
17
19
|
/// and provide a standard Read implementation for them.
|
18
20
|
pub enum RubyReader {
|
19
21
|
String {
|
22
|
+
ruby: Arc<Ruby>,
|
20
23
|
inner: Opaque<RString>,
|
21
24
|
offset: usize,
|
22
25
|
},
|
23
26
|
RubyIoLike {
|
27
|
+
ruby: Arc<Ruby>,
|
24
28
|
inner: Opaque<Value>,
|
25
29
|
},
|
26
30
|
NativeProxyIoLike {
|
@@ -28,26 +32,15 @@ pub enum RubyReader {
|
|
28
32
|
},
|
29
33
|
}
|
30
34
|
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
}
|
35
|
-
|
36
|
-
// For now, don't use this. Having to use seek in length is scary.
|
37
|
-
fn is_seekable_io_like(value: &Value) -> bool {
|
38
|
-
Self::is_io_like(value)
|
39
|
-
&& value.respond_to("seek", false).unwrap_or(false)
|
40
|
-
&& value.respond_to("pos", false).unwrap_or(false)
|
41
|
-
}
|
42
|
-
}
|
43
|
-
|
44
|
-
impl TryFrom<Value> for RubyReader {
|
45
|
-
type Error = magnus::Error;
|
35
|
+
// Sending is technically not safe, but the only things that threatens to
|
36
|
+
// do this is the parquet gem, and they don't seem to actually do it.
|
37
|
+
unsafe impl Send for RubyReader {}
|
46
38
|
|
47
|
-
|
48
|
-
|
39
|
+
impl RubyReader {
|
40
|
+
pub fn new(ruby: Arc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
|
49
41
|
if RubyReader::is_seekable_io_like(&value) {
|
50
42
|
Ok(RubyReader::RubyIoLike {
|
43
|
+
ruby,
|
51
44
|
inner: Opaque::from(value),
|
52
45
|
})
|
53
46
|
} else if RubyReader::is_io_like(&value) {
|
@@ -56,6 +49,7 @@ impl TryFrom<Value> for RubyReader {
|
|
56
49
|
|
57
50
|
// This is safe, because we won't call seek
|
58
51
|
let inner_readable = RubyReader::RubyIoLike {
|
52
|
+
ruby: ruby.clone(),
|
59
53
|
inner: Opaque::from(value),
|
60
54
|
};
|
61
55
|
let mut reader = BufReader::new(inner_readable);
|
@@ -74,19 +68,31 @@ impl TryFrom<Value> for RubyReader {
|
|
74
68
|
.funcall::<_, _, RString>("to_str", ())
|
75
69
|
.or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
|
76
70
|
Ok(RubyReader::String {
|
71
|
+
ruby,
|
77
72
|
inner: Opaque::from(string_content),
|
78
73
|
offset: 0,
|
79
74
|
})
|
80
75
|
}
|
81
76
|
}
|
77
|
+
|
78
|
+
fn is_io_like(value: &Value) -> bool {
|
79
|
+
value.respond_to("read", false).unwrap_or(false)
|
80
|
+
}
|
81
|
+
|
82
|
+
// For now, don't use this. Having to use seek in length is scary.
|
83
|
+
fn is_seekable_io_like(value: &Value) -> bool {
|
84
|
+
Self::is_io_like(value)
|
85
|
+
&& value.respond_to("seek", false).unwrap_or(false)
|
86
|
+
&& value.respond_to("pos", false).unwrap_or(false)
|
87
|
+
}
|
82
88
|
}
|
83
89
|
|
84
90
|
impl Seek for RubyReader {
|
85
91
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
86
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
87
92
|
match self {
|
88
93
|
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
|
89
94
|
RubyReader::String {
|
95
|
+
ruby,
|
90
96
|
inner,
|
91
97
|
offset: original_offset,
|
92
98
|
} => {
|
@@ -107,7 +113,7 @@ impl Seek for RubyReader {
|
|
107
113
|
*original_offset = new_offset.min(unwrapped_inner.len());
|
108
114
|
Ok(*original_offset as u64)
|
109
115
|
}
|
110
|
-
RubyReader::RubyIoLike { inner } => {
|
116
|
+
RubyReader::RubyIoLike { ruby, inner } => {
|
111
117
|
let unwrapped_inner = ruby.get_inner(*inner);
|
112
118
|
|
113
119
|
let (whence, ruby_offset) = match pos {
|
@@ -132,10 +138,13 @@ impl Seek for RubyReader {
|
|
132
138
|
|
133
139
|
impl Read for RubyReader {
|
134
140
|
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
135
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
136
141
|
match self {
|
137
142
|
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
|
138
|
-
RubyReader::String {
|
143
|
+
RubyReader::String {
|
144
|
+
ruby,
|
145
|
+
inner,
|
146
|
+
offset,
|
147
|
+
} => {
|
139
148
|
let unwrapped_inner = ruby.get_inner(*inner);
|
140
149
|
|
141
150
|
let string_buffer = unsafe { unwrapped_inner.as_slice() };
|
@@ -151,7 +160,7 @@ impl Read for RubyReader {
|
|
151
160
|
|
152
161
|
Ok(copy_size)
|
153
162
|
}
|
154
|
-
RubyReader::RubyIoLike { inner } => {
|
163
|
+
RubyReader::RubyIoLike { ruby, inner } => {
|
155
164
|
let unwrapped_inner = ruby.get_inner(*inner);
|
156
165
|
|
157
166
|
let bytes = unwrapped_inner
|
@@ -175,14 +184,17 @@ impl Read for RubyReader {
|
|
175
184
|
|
176
185
|
impl Length for RubyReader {
|
177
186
|
fn len(&self) -> u64 {
|
178
|
-
let ruby = unsafe { Ruby::get_unchecked() };
|
179
187
|
match self {
|
180
188
|
RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
|
181
|
-
RubyReader::String {
|
189
|
+
RubyReader::String {
|
190
|
+
ruby,
|
191
|
+
inner,
|
192
|
+
offset: _,
|
193
|
+
} => {
|
182
194
|
let unwrapped_inner = ruby.get_inner(*inner);
|
183
195
|
unwrapped_inner.len() as u64
|
184
196
|
}
|
185
|
-
RubyReader::RubyIoLike { inner } => {
|
197
|
+
RubyReader::RubyIoLike { ruby, inner } => {
|
186
198
|
let unwrapped_inner = ruby.get_inner(*inner);
|
187
199
|
|
188
200
|
// Get current position
|
@@ -62,22 +62,7 @@ pub struct StructField<'a> {
|
|
62
62
|
|
63
63
|
#[derive(Clone, Debug)]
|
64
64
|
pub enum ParquetSchemaType<'a> {
|
65
|
-
|
66
|
-
Int16,
|
67
|
-
Int32,
|
68
|
-
Int64,
|
69
|
-
UInt8,
|
70
|
-
UInt16,
|
71
|
-
UInt32,
|
72
|
-
UInt64,
|
73
|
-
Float,
|
74
|
-
Double,
|
75
|
-
String,
|
76
|
-
Binary,
|
77
|
-
Boolean,
|
78
|
-
Date32,
|
79
|
-
TimestampMillis,
|
80
|
-
TimestampMicros,
|
65
|
+
Primitive(PrimitiveType),
|
81
66
|
List(Box<ListField<'a>>),
|
82
67
|
Map(Box<MapField<'a>>),
|
83
68
|
Struct(Box<StructField<'a>>),
|
@@ -110,7 +95,7 @@ pub enum SchemaNode {
|
|
110
95
|
},
|
111
96
|
}
|
112
97
|
|
113
|
-
#[derive(Debug, Clone)]
|
98
|
+
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
114
99
|
pub enum PrimitiveType {
|
115
100
|
Int8,
|
116
101
|
Int16,
|
@@ -35,3 +35,59 @@ use parquet::record::Field;
|
|
35
35
|
use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
|
36
36
|
|
37
37
|
use crate::header_cache::StringCacheKey;
|
38
|
+
|
39
|
+
use crate::header_cache::CacheError;
|
40
|
+
|
41
|
+
use std::io;
|
42
|
+
|
43
|
+
use thiserror::Error;
|
44
|
+
|
45
|
+
#[derive(Error, Debug)]
|
46
|
+
pub enum ParquetGemError {
|
47
|
+
#[error("Failed to open file: {0}")]
|
48
|
+
FileOpen(#[from] io::Error),
|
49
|
+
#[error("Failed to intern headers: {0}")]
|
50
|
+
HeaderIntern(#[from] CacheError),
|
51
|
+
#[error("Ruby error: {0}")]
|
52
|
+
Ruby(#[from] MagnusErrorWrapper),
|
53
|
+
#[error("Parquet error: {0}")]
|
54
|
+
Parquet(#[from] parquet::errors::ParquetError),
|
55
|
+
#[error("Arrow error: {0}")]
|
56
|
+
Arrow(#[from] arrow_schema::ArrowError),
|
57
|
+
#[error("UTF-8 error: {0}")]
|
58
|
+
Utf8Error(#[from] simdutf8::basic::Utf8Error),
|
59
|
+
#[error("Jiff error: {0}")]
|
60
|
+
Jiff(#[from] jiff::Error),
|
61
|
+
}
|
62
|
+
|
63
|
+
#[derive(Debug)]
|
64
|
+
pub struct MagnusErrorWrapper(pub MagnusError);
|
65
|
+
|
66
|
+
impl From<MagnusError> for MagnusErrorWrapper {
|
67
|
+
fn from(err: MagnusError) -> Self {
|
68
|
+
Self(err)
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
impl std::fmt::Display for MagnusErrorWrapper {
|
73
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
74
|
+
write!(f, "{}", self.0)
|
75
|
+
}
|
76
|
+
}
|
77
|
+
|
78
|
+
impl std::error::Error for MagnusErrorWrapper {}
|
79
|
+
|
80
|
+
impl From<MagnusError> for ParquetGemError {
|
81
|
+
fn from(err: MagnusError) -> Self {
|
82
|
+
Self::Ruby(MagnusErrorWrapper(err))
|
83
|
+
}
|
84
|
+
}
|
85
|
+
|
86
|
+
impl Into<MagnusError> for ParquetGemError {
|
87
|
+
fn into(self) -> MagnusError {
|
88
|
+
match self {
|
89
|
+
Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
|
90
|
+
_ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
|
91
|
+
}
|
92
|
+
}
|
93
|
+
}
|