parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
use magnus::{Error as MagnusError, Ruby};
|
|
2
|
+
use parquet_core::ParquetError as CoreParquetError;
|
|
3
|
+
use std::fmt::Display;
|
|
4
|
+
use thiserror::Error;
|
|
5
|
+
|
|
6
|
+
/// Error type for parquet-ruby-adapter
|
|
7
|
+
#[derive(Error, Debug)]
|
|
8
|
+
pub enum RubyAdapterError {
|
|
9
|
+
/// Core parquet errors
|
|
10
|
+
#[error("Parquet error: {0}")]
|
|
11
|
+
Parquet(#[from] CoreParquetError),
|
|
12
|
+
|
|
13
|
+
/// Magnus/Ruby errors
|
|
14
|
+
#[error("Ruby error: {0}")]
|
|
15
|
+
Ruby(String),
|
|
16
|
+
|
|
17
|
+
/// IO errors
|
|
18
|
+
#[error("IO error: {0}")]
|
|
19
|
+
Io(#[from] std::io::Error),
|
|
20
|
+
|
|
21
|
+
/// Type conversion errors
|
|
22
|
+
#[error("Type conversion error: {0}")]
|
|
23
|
+
TypeConversion(String),
|
|
24
|
+
|
|
25
|
+
/// Schema conversion errors
|
|
26
|
+
#[error("Schema conversion error: {0}")]
|
|
27
|
+
SchemaConversion(String),
|
|
28
|
+
|
|
29
|
+
/// Metadata extraction errors
|
|
30
|
+
#[error("Metadata error: {0}")]
|
|
31
|
+
Metadata(String),
|
|
32
|
+
|
|
33
|
+
/// Invalid input errors
|
|
34
|
+
#[error("Invalid input: {0}")]
|
|
35
|
+
InvalidInput(String),
|
|
36
|
+
|
|
37
|
+
/// Runtime errors
|
|
38
|
+
#[error("Runtime error: {0}")]
|
|
39
|
+
Runtime(String),
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/// Shorthand for a `std::result::Result` whose error type is [`RubyAdapterError`].
pub type Result<T> = std::result::Result<T, RubyAdapterError>;
|
|
43
|
+
|
|
44
|
+
impl RubyAdapterError {
|
|
45
|
+
/// Create a new Ruby error
|
|
46
|
+
pub fn ruby<S: Into<String>>(msg: S) -> Self {
|
|
47
|
+
RubyAdapterError::Ruby(msg.into())
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/// Create a new type conversion error
|
|
51
|
+
pub fn type_conversion<S: Into<String>>(msg: S) -> Self {
|
|
52
|
+
RubyAdapterError::TypeConversion(msg.into())
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
/// Create a new schema conversion error
|
|
56
|
+
pub fn schema_conversion<S: Into<String>>(msg: S) -> Self {
|
|
57
|
+
RubyAdapterError::SchemaConversion(msg.into())
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/// Create a new metadata error
|
|
61
|
+
pub fn metadata<S: Into<String>>(msg: S) -> Self {
|
|
62
|
+
RubyAdapterError::Metadata(msg.into())
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/// Create a new invalid input error
|
|
66
|
+
pub fn invalid_input<S: Into<String>>(msg: S) -> Self {
|
|
67
|
+
RubyAdapterError::InvalidInput(msg.into())
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/// Create a new runtime error
|
|
71
|
+
pub fn runtime<S: Into<String>>(msg: S) -> Self {
|
|
72
|
+
RubyAdapterError::Runtime(msg.into())
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// Convert RubyAdapterError to MagnusError
|
|
77
|
+
impl From<RubyAdapterError> for MagnusError {
|
|
78
|
+
fn from(err: RubyAdapterError) -> Self {
|
|
79
|
+
// This conversion only runs at the FFI boundary, where the GVL is held
|
|
80
|
+
// and a Ruby handle is always available. A Ruby exception cannot be
|
|
81
|
+
// constructed without that handle, so an unavailable runtime is an
|
|
82
|
+
// impossible state we fail fast on rather than paper over.
|
|
83
|
+
let ruby = Ruby::get().unwrap_or_else(|unavailable| {
|
|
84
|
+
panic!("cannot build Ruby exception off the Ruby thread ({unavailable}); source error: {err}")
|
|
85
|
+
});
|
|
86
|
+
let class = match &err {
|
|
87
|
+
RubyAdapterError::Io(_) => ruby.exception_io_error(),
|
|
88
|
+
RubyAdapterError::TypeConversion(_) => ruby.exception_type_error(),
|
|
89
|
+
RubyAdapterError::InvalidInput(_) => ruby.exception_arg_error(),
|
|
90
|
+
_ => ruby.exception_runtime_error(),
|
|
91
|
+
};
|
|
92
|
+
MagnusError::new(class, err.to_string())
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/// Extension trait to convert errors to MagnusError at the boundary
|
|
97
|
+
pub trait IntoMagnusError<T> {
|
|
98
|
+
/// Convert to MagnusError
|
|
99
|
+
fn into_magnus_error(self) -> std::result::Result<T, MagnusError>;
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
impl<T> IntoMagnusError<T> for Result<T> {
|
|
103
|
+
fn into_magnus_error(self) -> std::result::Result<T, MagnusError> {
|
|
104
|
+
self.map_err(Into::into)
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Extension trait to add context to errors
|
|
109
|
+
pub trait ErrorContext<T> {
|
|
110
|
+
/// Add context to an error
|
|
111
|
+
fn context<S: Display>(self, ctx: S) -> Result<T>;
|
|
112
|
+
|
|
113
|
+
/// Add context with a closure that's only called on error
|
|
114
|
+
fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T>;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
impl<T, E> ErrorContext<T> for std::result::Result<T, E>
|
|
118
|
+
where
|
|
119
|
+
E: Into<RubyAdapterError>,
|
|
120
|
+
{
|
|
121
|
+
fn context<S: Display>(self, ctx: S) -> Result<T> {
|
|
122
|
+
self.map_err(|e| {
|
|
123
|
+
let base_error = e.into();
|
|
124
|
+
RubyAdapterError::Runtime(format!("{}: {}", ctx, base_error))
|
|
125
|
+
})
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T> {
|
|
129
|
+
self.map_err(|e| {
|
|
130
|
+
let base_error = e.into();
|
|
131
|
+
RubyAdapterError::Runtime(format!("{}: {}", f(), base_error))
|
|
132
|
+
})
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
/// Convert from MagnusError to RubyAdapterError
|
|
137
|
+
impl From<MagnusError> for RubyAdapterError {
|
|
138
|
+
fn from(err: MagnusError) -> Self {
|
|
139
|
+
RubyAdapterError::Ruby(err.to_string())
|
|
140
|
+
}
|
|
141
|
+
}
|
|
@@ -0,0 +1,432 @@
|
|
|
1
|
+
use bytes::Bytes;
|
|
2
|
+
use magnus::value::{Opaque, ReprValue};
|
|
3
|
+
use magnus::{Error as MagnusError, RString, Ruby, Value};
|
|
4
|
+
use std::io::{Error as IoError, ErrorKind, Read, Seek, SeekFrom, Write};
|
|
5
|
+
|
|
6
|
+
use parquet::{
|
|
7
|
+
errors::ParquetError,
|
|
8
|
+
file::reader::{ChunkReader, Length},
|
|
9
|
+
};
|
|
10
|
+
use std::{fs::File, sync::Mutex};
|
|
11
|
+
use std::{
|
|
12
|
+
io::{self, BufReader},
|
|
13
|
+
sync::Arc,
|
|
14
|
+
};
|
|
15
|
+
|
|
16
|
+
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
|
17
|
+
/// and provide a standard Read implementation for them.
|
|
18
|
+
pub enum RubyIOReader {
|
|
19
|
+
String {
|
|
20
|
+
inner: Opaque<RString>,
|
|
21
|
+
offset: usize,
|
|
22
|
+
},
|
|
23
|
+
RubyIoLike {
|
|
24
|
+
inner: Opaque<Value>,
|
|
25
|
+
},
|
|
26
|
+
NativeProxyIoLike {
|
|
27
|
+
proxy_file: File,
|
|
28
|
+
},
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
// Sending is technically not safe, but the only thing that threatens to
|
|
32
|
+
// do this is the parquet gem, and they don't seem to actually do it.
|
|
33
|
+
unsafe impl Send for RubyIOReader {}
|
|
34
|
+
|
|
35
|
+
impl RubyIOReader {
|
|
36
|
+
pub fn new(value: Value) -> std::io::Result<Self> {
|
|
37
|
+
if RubyIOReader::is_seekable_io_like(&value) {
|
|
38
|
+
Ok(RubyIOReader::RubyIoLike {
|
|
39
|
+
inner: Opaque::from(value),
|
|
40
|
+
})
|
|
41
|
+
} else if RubyIOReader::is_io_like(&value) {
|
|
42
|
+
let mut temp_file = tempfile::tempfile()?;
|
|
43
|
+
|
|
44
|
+
// This is safe, because we won't call seek
|
|
45
|
+
let inner_readable = RubyIOReader::RubyIoLike {
|
|
46
|
+
inner: Opaque::from(value),
|
|
47
|
+
};
|
|
48
|
+
let mut reader = BufReader::new(inner_readable);
|
|
49
|
+
io::copy(&mut reader, &mut temp_file)?;
|
|
50
|
+
temp_file.seek(SeekFrom::Start(0))?;
|
|
51
|
+
|
|
52
|
+
Ok(RubyIOReader::NativeProxyIoLike {
|
|
53
|
+
proxy_file: temp_file,
|
|
54
|
+
})
|
|
55
|
+
} else {
|
|
56
|
+
// Try calling `to_str`, and if that fails, try `to_s`
|
|
57
|
+
let string_content = value
|
|
58
|
+
.funcall::<_, _, RString>("to_str", ())
|
|
59
|
+
.or_else(|_| value.funcall::<_, _, RString>("to_s", ()))
|
|
60
|
+
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))?;
|
|
61
|
+
Ok(RubyIOReader::String {
|
|
62
|
+
inner: Opaque::from(string_content),
|
|
63
|
+
offset: 0,
|
|
64
|
+
})
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
fn is_io_like(value: &Value) -> bool {
|
|
69
|
+
value.respond_to("read", false).unwrap_or(false)
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// For now, don't use this. Having to use seek in length is scary.
|
|
73
|
+
fn is_seekable_io_like(value: &Value) -> bool {
|
|
74
|
+
Self::is_io_like(value)
|
|
75
|
+
&& value.respond_to("seek", false).unwrap_or(false)
|
|
76
|
+
&& value.respond_to("pos", false).unwrap_or(false)
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
impl Seek for RubyIOReader {
|
|
81
|
+
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
|
82
|
+
let ruby = Ruby::get()
|
|
83
|
+
.map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
|
|
84
|
+
match self {
|
|
85
|
+
RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
|
|
86
|
+
RubyIOReader::String {
|
|
87
|
+
inner,
|
|
88
|
+
offset: original_offset,
|
|
89
|
+
} => {
|
|
90
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
|
91
|
+
|
|
92
|
+
let new_offset = match pos {
|
|
93
|
+
SeekFrom::Start(off) => off as usize,
|
|
94
|
+
SeekFrom::Current(off) => {
|
|
95
|
+
let signed = *original_offset as i64 + off;
|
|
96
|
+
signed.max(0) as usize
|
|
97
|
+
}
|
|
98
|
+
SeekFrom::End(off) => {
|
|
99
|
+
let signed = unwrapped_inner.len() as i64 + off;
|
|
100
|
+
signed.max(0) as usize
|
|
101
|
+
}
|
|
102
|
+
};
|
|
103
|
+
|
|
104
|
+
*original_offset = new_offset.min(unwrapped_inner.len());
|
|
105
|
+
Ok(*original_offset as u64)
|
|
106
|
+
}
|
|
107
|
+
RubyIOReader::RubyIoLike { inner } => {
|
|
108
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
|
109
|
+
|
|
110
|
+
let (whence, ruby_offset) = match pos {
|
|
111
|
+
SeekFrom::Start(i) => (0, i as i64),
|
|
112
|
+
SeekFrom::Current(i) => (1, i),
|
|
113
|
+
SeekFrom::End(i) => (2, i),
|
|
114
|
+
};
|
|
115
|
+
|
|
116
|
+
unwrapped_inner
|
|
117
|
+
.funcall::<_, _, u64>("seek", (ruby_offset, whence))
|
|
118
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
119
|
+
|
|
120
|
+
let new_position = unwrapped_inner
|
|
121
|
+
.funcall::<_, _, u64>("pos", ())
|
|
122
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
123
|
+
|
|
124
|
+
Ok(new_position)
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
impl Read for RubyIOReader {
|
|
131
|
+
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
|
132
|
+
let ruby = Ruby::get()
|
|
133
|
+
.map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
|
|
134
|
+
match self {
|
|
135
|
+
RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
|
|
136
|
+
RubyIOReader::String { inner, offset } => {
|
|
137
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
|
138
|
+
|
|
139
|
+
let string_buffer = unsafe { unwrapped_inner.as_slice() };
|
|
140
|
+
if *offset >= string_buffer.len() {
|
|
141
|
+
return Ok(0); // EOF
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
let remaining = string_buffer.len() - *offset;
|
|
145
|
+
let copy_size = remaining.min(buf.len());
|
|
146
|
+
buf[..copy_size].copy_from_slice(&string_buffer[*offset..*offset + copy_size]);
|
|
147
|
+
|
|
148
|
+
*offset += copy_size;
|
|
149
|
+
|
|
150
|
+
Ok(copy_size)
|
|
151
|
+
}
|
|
152
|
+
RubyIOReader::RubyIoLike { inner } => {
|
|
153
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
|
154
|
+
|
|
155
|
+
let bytes = unwrapped_inner
|
|
156
|
+
.funcall::<_, _, Option<RString>>("read", (buf.len(),))
|
|
157
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
158
|
+
|
|
159
|
+
match bytes {
|
|
160
|
+
Some(bytes) => {
|
|
161
|
+
let string_buffer = unsafe { bytes.as_slice() };
|
|
162
|
+
buf.write_all(string_buffer)?;
|
|
163
|
+
Ok(string_buffer.len())
|
|
164
|
+
}
|
|
165
|
+
None => Ok(0),
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
impl Length for RubyIOReader {
|
|
173
|
+
fn len(&self) -> u64 {
|
|
174
|
+
let ruby = match Ruby::get() {
|
|
175
|
+
Ok(r) => r,
|
|
176
|
+
Err(_) => {
|
|
177
|
+
eprintln!("Failed to get Ruby runtime in RubyIOReader::len");
|
|
178
|
+
return 0;
|
|
179
|
+
}
|
|
180
|
+
};
|
|
181
|
+
match self {
|
|
182
|
+
RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
|
|
183
|
+
RubyIOReader::String { inner, offset: _ } => {
|
|
184
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
|
185
|
+
unwrapped_inner.len() as u64
|
|
186
|
+
}
|
|
187
|
+
RubyIOReader::RubyIoLike { inner } => {
|
|
188
|
+
let unwrapped_inner = ruby.get_inner(*inner);
|
|
189
|
+
|
|
190
|
+
// Get current position
|
|
191
|
+
let current_pos = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
|
|
192
|
+
Ok(pos) => pos,
|
|
193
|
+
Err(e) => {
|
|
194
|
+
eprintln!("Error seeking: {}", e);
|
|
195
|
+
return 0;
|
|
196
|
+
}
|
|
197
|
+
};
|
|
198
|
+
|
|
199
|
+
// Seek to end
|
|
200
|
+
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 2)) {
|
|
201
|
+
eprintln!("Error seeking: {}", e);
|
|
202
|
+
return 0;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
// Offset at the end of the file is the length of the file
|
|
206
|
+
let size = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
|
|
207
|
+
Ok(pos) => pos,
|
|
208
|
+
Err(e) => {
|
|
209
|
+
eprintln!("Error seeking: {}", e);
|
|
210
|
+
return 0;
|
|
211
|
+
}
|
|
212
|
+
};
|
|
213
|
+
|
|
214
|
+
// Restore original position
|
|
215
|
+
if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (current_pos, 0)) {
|
|
216
|
+
eprintln!("Error seeking: {}", e);
|
|
217
|
+
return 0;
|
|
218
|
+
}
|
|
219
|
+
|
|
220
|
+
let final_pos = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
|
|
221
|
+
Ok(pos) => pos,
|
|
222
|
+
Err(e) => {
|
|
223
|
+
eprintln!("Error seeking: {}", e);
|
|
224
|
+
return 0;
|
|
225
|
+
}
|
|
226
|
+
};
|
|
227
|
+
|
|
228
|
+
if current_pos != final_pos {
|
|
229
|
+
eprintln!(
|
|
230
|
+
"Failed to restore original position in seekable IO object. Started at position {}, but ended at {}",
|
|
231
|
+
current_pos,
|
|
232
|
+
final_pos
|
|
233
|
+
);
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
size
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
/// Capacity, in bytes, of the `BufReader` returned by `ChunkReader::get_read`.
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
|
243
|
+
|
|
244
|
+
#[derive(Clone)]
|
|
245
|
+
pub struct ThreadSafeRubyIOReader(Arc<Mutex<RubyIOReader>>);
|
|
246
|
+
|
|
247
|
+
impl ThreadSafeRubyIOReader {
|
|
248
|
+
pub fn new(reader: RubyIOReader) -> Self {
|
|
249
|
+
Self(Arc::new(Mutex::new(reader)))
|
|
250
|
+
}
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
/// Reports the length of the wrapped reader, or 0 (with a message on stderr)
/// when the mutex is poisoned.
impl Length for ThreadSafeRubyIOReader {
    fn len(&self) -> u64 {
        self.0
            .lock()
            .map(|reader| reader.len())
            .unwrap_or_else(|_| {
                // A poisoned mutex cannot be recovered from here, and the
                // trait offers no way to return an error.
                eprintln!("Failed to lock mutex in ThreadSafeRubyIOReader::len");
                0
            })
    }
}
|
|
265
|
+
|
|
266
|
+
impl Seek for ThreadSafeRubyIOReader {
|
|
267
|
+
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
|
268
|
+
let mut reader = self
|
|
269
|
+
.0
|
|
270
|
+
.lock()
|
|
271
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
272
|
+
reader.seek(pos)
|
|
273
|
+
}
|
|
274
|
+
}
|
|
275
|
+
|
|
276
|
+
impl Read for ThreadSafeRubyIOReader {
|
|
277
|
+
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
|
278
|
+
let mut reader = self
|
|
279
|
+
.0
|
|
280
|
+
.lock()
|
|
281
|
+
.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
|
|
282
|
+
reader.read(buf)
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
impl ChunkReader for ThreadSafeRubyIOReader {
|
|
287
|
+
type T = BufReader<ThreadSafeRubyIOReader>;
|
|
288
|
+
|
|
289
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
|
290
|
+
let mut reader = self.clone();
|
|
291
|
+
reader.seek(SeekFrom::Start(start))?;
|
|
292
|
+
Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
|
296
|
+
let mut buffer = Vec::with_capacity(length);
|
|
297
|
+
let mut reader = self.clone();
|
|
298
|
+
reader.seek(SeekFrom::Start(start))?;
|
|
299
|
+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
|
|
300
|
+
|
|
301
|
+
if read != length {
|
|
302
|
+
return Err(ParquetError::EOF(format!(
|
|
303
|
+
"Expected to read {} bytes, read only {}",
|
|
304
|
+
length, read
|
|
305
|
+
)));
|
|
306
|
+
}
|
|
307
|
+
Ok(buffer.into())
|
|
308
|
+
}
|
|
309
|
+
}
|
|
310
|
+
|
|
311
|
+
/// Adapter for Ruby IO objects that implements std::io::Write
|
|
312
|
+
pub struct RubyIOWriter {
|
|
313
|
+
io: Value,
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
impl RubyIOWriter {
|
|
317
|
+
pub fn new(io: Value) -> Self {
|
|
318
|
+
Self { io }
|
|
319
|
+
}
|
|
320
|
+
}
|
|
321
|
+
|
|
322
|
+
impl Write for RubyIOWriter {
|
|
323
|
+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
|
324
|
+
// Call Ruby IO#write method
|
|
325
|
+
let ruby = Ruby::get().map_err(|e| {
|
|
326
|
+
IoError::new(
|
|
327
|
+
ErrorKind::Other,
|
|
328
|
+
format!("Failed to get Ruby runtime: {}", e),
|
|
329
|
+
)
|
|
330
|
+
})?;
|
|
331
|
+
|
|
332
|
+
if buf.is_empty() {
|
|
333
|
+
return Ok(0);
|
|
334
|
+
}
|
|
335
|
+
|
|
336
|
+
// Convert bytes to Ruby string
|
|
337
|
+
let ruby_string = ruby.str_from_slice(buf);
|
|
338
|
+
|
|
339
|
+
// Call io.write(string)
|
|
340
|
+
let result: Result<usize, MagnusError> = self.io.funcall("write", (ruby_string,));
|
|
341
|
+
|
|
342
|
+
match result {
|
|
343
|
+
Ok(bytes_written) => Ok(bytes_written),
|
|
344
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
fn flush(&mut self) -> std::io::Result<()> {
|
|
349
|
+
// Call Ruby IO#flush method
|
|
350
|
+
let result: Result<Value, MagnusError> = self.io.funcall("flush", ());
|
|
351
|
+
|
|
352
|
+
match result {
|
|
353
|
+
Ok(_) => Ok(()),
|
|
354
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
|
355
|
+
}
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
/// Wrapper that implements both Read and Write for Ruby IO objects
|
|
360
|
+
pub struct RubyIO {
|
|
361
|
+
io: Value,
|
|
362
|
+
}
|
|
363
|
+
|
|
364
|
+
impl RubyIO {
|
|
365
|
+
pub fn new(io: Value) -> Self {
|
|
366
|
+
Self { io }
|
|
367
|
+
}
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
impl Read for RubyIO {
|
|
371
|
+
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
|
372
|
+
RubyIOReader::new(self.io)?.read(buf)
|
|
373
|
+
}
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
impl Write for RubyIO {
|
|
377
|
+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
|
378
|
+
RubyIOWriter::new(self.io).write(buf)
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
fn flush(&mut self) -> std::io::Result<()> {
|
|
382
|
+
RubyIOWriter::new(self.io).flush()
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
|
|
386
|
+
impl Seek for RubyIO {
|
|
387
|
+
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
|
|
388
|
+
RubyIOReader::new(self.io)?.seek(pos)
|
|
389
|
+
}
|
|
390
|
+
}
|
|
391
|
+
|
|
392
|
+
/// Check if a Ruby value responds to IO methods
|
|
393
|
+
pub fn is_io_like(value: Value) -> bool {
|
|
394
|
+
// Check if the object responds to read/write methods
|
|
395
|
+
let responds_to_read: Result<bool, MagnusError> = value.funcall("respond_to?", ("read",));
|
|
396
|
+
let responds_to_write: Result<bool, MagnusError> = value.funcall("respond_to?", ("write",));
|
|
397
|
+
|
|
398
|
+
matches!(
|
|
399
|
+
(responds_to_read, responds_to_write),
|
|
400
|
+
(Ok(true), _) | (_, Ok(true))
|
|
401
|
+
)
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
/// Create a reader from a Ruby IO-like object
|
|
405
|
+
pub fn create_reader(io: Value) -> std::io::Result<RubyIOReader> {
|
|
406
|
+
// Verify it has a read method
|
|
407
|
+
let responds_to_read: Result<bool, MagnusError> = io.funcall("respond_to?", ("read",));
|
|
408
|
+
|
|
409
|
+
match responds_to_read {
|
|
410
|
+
Ok(true) => RubyIOReader::new(io),
|
|
411
|
+
Ok(false) => Err(IoError::new(
|
|
412
|
+
ErrorKind::InvalidInput,
|
|
413
|
+
"Object does not respond to 'read' method",
|
|
414
|
+
)),
|
|
415
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
|
416
|
+
}
|
|
417
|
+
}
|
|
418
|
+
|
|
419
|
+
/// Create a writer from a Ruby IO-like object
|
|
420
|
+
pub fn create_writer(io: Value) -> std::io::Result<RubyIOWriter> {
|
|
421
|
+
// Verify it has a write method
|
|
422
|
+
let responds_to_write: Result<bool, MagnusError> = io.funcall("respond_to?", ("write",));
|
|
423
|
+
|
|
424
|
+
match responds_to_write {
|
|
425
|
+
Ok(true) => Ok(RubyIOWriter::new(io)),
|
|
426
|
+
Ok(false) => Err(IoError::new(
|
|
427
|
+
ErrorKind::InvalidInput,
|
|
428
|
+
"Object does not respond to 'write' method",
|
|
429
|
+
)),
|
|
430
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
|
431
|
+
}
|
|
432
|
+
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
//! Ruby-specific adapter for parquet-core
|
|
2
|
+
//!
|
|
3
|
+
//! This crate provides Ruby-specific implementations of the parquet-core traits,
|
|
4
|
+
//! enabling seamless integration between Ruby and the core Parquet functionality.
|
|
5
|
+
//!
|
|
6
|
+
//! # Overview
|
|
7
|
+
//!
|
|
8
|
+
//! The adapter implements three main components:
|
|
9
|
+
//!
|
|
10
|
+
//! ## Value Conversion
|
|
11
|
+
//!
|
|
12
|
+
//! The [`RubyValueConverter`] implements the `ValueConverter` trait to handle
|
|
13
|
+
//! conversions between Ruby values (via Magnus) and Parquet values:
|
|
14
|
+
//!
|
|
15
|
+
//! - Ruby integers ↔ Parquet int types
|
|
16
|
+
//! - Ruby floats ↔ Parquet float/double
|
|
17
|
+
//! - Ruby strings ↔ Parquet strings/binary
|
|
18
|
+
//! - Ruby BigDecimal ↔ Parquet decimal types
|
|
19
|
+
//! - Ruby Time/DateTime ↔ Parquet temporal types
|
|
20
|
+
//! - Ruby arrays/hashes ↔ Parquet lists/maps/structs
|
|
21
|
+
//!
|
|
22
|
+
//! ## I/O Operations
|
|
23
|
+
//!
|
|
24
|
+
//! The I/O module provides [`RubyIOReader`] and [`RubyIOWriter`] which implement
|
|
25
|
+
//! the standard `Read`/`Seek`/`Write` I/O traits for Ruby IO objects (`ThreadSafeRubyIOReader` adds parquet's `ChunkReader`):
|
|
26
|
+
//!
|
|
27
|
+
//! - File objects
|
|
28
|
+
//! - StringIO for in-memory operations
|
|
29
|
+
//! - Any Ruby object implementing read/write/seek methods
|
|
30
|
+
//!
|
|
31
|
+
//! ## Schema Conversion
|
|
32
|
+
//!
|
|
33
|
+
//! Schema utilities for converting between Ruby schema representations and
|
|
34
|
+
//! parquet-core's schema types:
|
|
35
|
+
//!
|
|
36
|
+
//! - Legacy hash-based schemas
|
|
37
|
+
//! - New DSL-based schemas
|
|
38
|
+
//! - Automatic type inference from data
|
|
39
|
+
|
|
40
|
+
pub mod error;
|
|
41
|
+
pub use error::{ErrorContext, IntoMagnusError, Result, RubyAdapterError};
|
|
42
|
+
|
|
43
|
+
pub mod chunk_reader;
|
|
44
|
+
pub use chunk_reader::CloneableChunkReader;
|
|
45
|
+
|
|
46
|
+
pub mod converter;
|
|
47
|
+
pub use converter::RubyValueConverter;
|
|
48
|
+
|
|
49
|
+
pub mod io;
|
|
50
|
+
pub use io::{create_reader, is_io_like, RubyIO, RubyIOReader, RubyIOWriter};
|
|
51
|
+
|
|
52
|
+
pub mod logger;
|
|
53
|
+
pub use logger::RubyLogger;
|
|
54
|
+
|
|
55
|
+
pub mod schema;
|
|
56
|
+
pub use schema::{
|
|
57
|
+
convert_legacy_schema, extract_field_schemas, is_dsl_schema, parquet_schema_to_ruby,
|
|
58
|
+
process_schema_value, ruby_schema_to_parquet, RubySchemaBuilder,
|
|
59
|
+
};
|
|
60
|
+
|
|
61
|
+
pub mod string_cache;
|
|
62
|
+
pub use string_cache::StringCache;
|
|
63
|
+
|
|
64
|
+
pub mod string_storage;
|
|
65
|
+
pub use string_storage::{
|
|
66
|
+
StringStorage, StringStorageConfig, StringStorageMode, DEFAULT_SHARED_MAX_ENTRIES,
|
|
67
|
+
DEFAULT_SHARED_MAX_VALUE_BYTES,
|
|
68
|
+
};
|
|
69
|
+
|
|
70
|
+
pub mod metadata;
|
|
71
|
+
pub use metadata::{parse_metadata, RubyParquetMetaData};
|
|
72
|
+
|
|
73
|
+
pub mod types;
|
|
74
|
+
pub use types::{
|
|
75
|
+
ColumnEnumeratorArgs, ParquetWriteArgs, ParserResultType, RowEnumeratorArgs, WriterOutput,
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
pub mod utils;
|
|
79
|
+
pub use utils::{
|
|
80
|
+
create_column_enumerator, create_row_enumerator, handle_block_or_enum, parse_compression,
|
|
81
|
+
parse_parquet_write_args,
|
|
82
|
+
};
|
|
83
|
+
|
|
84
|
+
pub mod reader;
|
|
85
|
+
pub use reader::{each_column, each_row};
|
|
86
|
+
|
|
87
|
+
pub mod writer;
|
|
88
|
+
pub use writer::{create_writer, finalize_writer, write_columns, write_rows};
|
|
89
|
+
|
|
90
|
+
pub mod try_into_value;
|
|
91
|
+
pub use try_into_value::TryIntoValue;
|