parquet 0.5.13 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +3 -0
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -605
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,148 @@
|
|
1
|
+
use magnus::{Error as MagnusError, Ruby};
|
2
|
+
use parquet_core::ParquetError as CoreParquetError;
|
3
|
+
use std::fmt::Display;
|
4
|
+
use thiserror::Error;
|
5
|
+
|
6
|
+
/// Error type for parquet-ruby-adapter
|
7
|
+
#[derive(Error, Debug)]
|
8
|
+
pub enum RubyAdapterError {
|
9
|
+
/// Core parquet errors
|
10
|
+
#[error("Parquet error: {0}")]
|
11
|
+
Parquet(#[from] CoreParquetError),
|
12
|
+
|
13
|
+
/// Magnus/Ruby errors
|
14
|
+
#[error("Ruby error: {0}")]
|
15
|
+
Ruby(String),
|
16
|
+
|
17
|
+
/// IO errors
|
18
|
+
#[error("IO error: {0}")]
|
19
|
+
Io(#[from] std::io::Error),
|
20
|
+
|
21
|
+
/// Type conversion errors
|
22
|
+
#[error("Type conversion error: {0}")]
|
23
|
+
TypeConversion(String),
|
24
|
+
|
25
|
+
/// Schema conversion errors
|
26
|
+
#[error("Schema conversion error: {0}")]
|
27
|
+
SchemaConversion(String),
|
28
|
+
|
29
|
+
/// Metadata extraction errors
|
30
|
+
#[error("Metadata error: {0}")]
|
31
|
+
Metadata(String),
|
32
|
+
|
33
|
+
/// Invalid input errors
|
34
|
+
#[error("Invalid input: {0}")]
|
35
|
+
InvalidInput(String),
|
36
|
+
|
37
|
+
/// Runtime errors
|
38
|
+
#[error("Runtime error: {0}")]
|
39
|
+
Runtime(String),
|
40
|
+
}
|
41
|
+
|
42
|
+
/// Result alias whose error type is [`RubyAdapterError`].
pub type Result<T> = std::result::Result<T, RubyAdapterError>;
|
43
|
+
|
44
|
+
impl RubyAdapterError {
|
45
|
+
/// Create a new Ruby error
|
46
|
+
pub fn ruby<S: Into<String>>(msg: S) -> Self {
|
47
|
+
RubyAdapterError::Ruby(msg.into())
|
48
|
+
}
|
49
|
+
|
50
|
+
/// Create a new type conversion error
|
51
|
+
pub fn type_conversion<S: Into<String>>(msg: S) -> Self {
|
52
|
+
RubyAdapterError::TypeConversion(msg.into())
|
53
|
+
}
|
54
|
+
|
55
|
+
/// Create a new schema conversion error
|
56
|
+
pub fn schema_conversion<S: Into<String>>(msg: S) -> Self {
|
57
|
+
RubyAdapterError::SchemaConversion(msg.into())
|
58
|
+
}
|
59
|
+
|
60
|
+
/// Create a new metadata error
|
61
|
+
pub fn metadata<S: Into<String>>(msg: S) -> Self {
|
62
|
+
RubyAdapterError::Metadata(msg.into())
|
63
|
+
}
|
64
|
+
|
65
|
+
/// Create a new invalid input error
|
66
|
+
pub fn invalid_input<S: Into<String>>(msg: S) -> Self {
|
67
|
+
RubyAdapterError::InvalidInput(msg.into())
|
68
|
+
}
|
69
|
+
|
70
|
+
/// Create a new runtime error
|
71
|
+
pub fn runtime<S: Into<String>>(msg: S) -> Self {
|
72
|
+
RubyAdapterError::Runtime(msg.into())
|
73
|
+
}
|
74
|
+
}
|
75
|
+
|
76
|
+
/// Convert RubyAdapterError to MagnusError
|
77
|
+
impl From<RubyAdapterError> for MagnusError {
|
78
|
+
fn from(err: RubyAdapterError) -> Self {
|
79
|
+
match Ruby::get() {
|
80
|
+
Ok(ruby) => match &err {
|
81
|
+
RubyAdapterError::Io(_) => {
|
82
|
+
MagnusError::new(ruby.exception_io_error(), err.to_string())
|
83
|
+
}
|
84
|
+
RubyAdapterError::TypeConversion(_) => {
|
85
|
+
MagnusError::new(ruby.exception_type_error(), err.to_string())
|
86
|
+
}
|
87
|
+
RubyAdapterError::InvalidInput(_) => {
|
88
|
+
MagnusError::new(ruby.exception_arg_error(), err.to_string())
|
89
|
+
}
|
90
|
+
_ => MagnusError::new(ruby.exception_runtime_error(), err.to_string()),
|
91
|
+
},
|
92
|
+
Err(_) => {
|
93
|
+
// Fallback if we can't get Ruby runtime
|
94
|
+
MagnusError::new(
|
95
|
+
magnus::exception::runtime_error(),
|
96
|
+
format!("Failed to get Ruby runtime: {}", err),
|
97
|
+
)
|
98
|
+
}
|
99
|
+
}
|
100
|
+
}
|
101
|
+
}
|
102
|
+
|
103
|
+
/// Extension trait to convert errors to MagnusError at the boundary
|
104
|
+
pub trait IntoMagnusError<T> {
|
105
|
+
/// Convert to MagnusError
|
106
|
+
fn into_magnus_error(self) -> std::result::Result<T, MagnusError>;
|
107
|
+
}
|
108
|
+
|
109
|
+
impl<T> IntoMagnusError<T> for Result<T> {
|
110
|
+
fn into_magnus_error(self) -> std::result::Result<T, MagnusError> {
|
111
|
+
self.map_err(Into::into)
|
112
|
+
}
|
113
|
+
}
|
114
|
+
|
115
|
+
/// Extension trait to add context to errors
|
116
|
+
pub trait ErrorContext<T> {
|
117
|
+
/// Add context to an error
|
118
|
+
fn context<S: Display>(self, ctx: S) -> Result<T>;
|
119
|
+
|
120
|
+
/// Add context with a closure that's only called on error
|
121
|
+
fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T>;
|
122
|
+
}
|
123
|
+
|
124
|
+
impl<T, E> ErrorContext<T> for std::result::Result<T, E>
|
125
|
+
where
|
126
|
+
E: Into<RubyAdapterError>,
|
127
|
+
{
|
128
|
+
fn context<S: Display>(self, ctx: S) -> Result<T> {
|
129
|
+
self.map_err(|e| {
|
130
|
+
let base_error = e.into();
|
131
|
+
RubyAdapterError::Runtime(format!("{}: {}", ctx, base_error))
|
132
|
+
})
|
133
|
+
}
|
134
|
+
|
135
|
+
fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T> {
|
136
|
+
self.map_err(|e| {
|
137
|
+
let base_error = e.into();
|
138
|
+
RubyAdapterError::Runtime(format!("{}: {}", f(), base_error))
|
139
|
+
})
|
140
|
+
}
|
141
|
+
}
|
142
|
+
|
143
|
+
/// Convert from MagnusError to RubyAdapterError
|
144
|
+
impl From<MagnusError> for RubyAdapterError {
|
145
|
+
fn from(err: MagnusError) -> Self {
|
146
|
+
RubyAdapterError::Ruby(err.to_string())
|
147
|
+
}
|
148
|
+
}
|
@@ -1,23 +1,21 @@
|
|
1
1
|
use bytes::Bytes;
|
2
|
-
use magnus::{
|
3
|
-
|
4
|
-
|
5
|
-
|
2
|
+
use magnus::value::{Opaque, ReprValue};
|
3
|
+
use magnus::{Error as MagnusError, RString, Ruby, Value};
|
4
|
+
use std::io::{Error as IoError, ErrorKind, Read, Seek, SeekFrom, Write};
|
5
|
+
|
6
6
|
use parquet::{
|
7
7
|
errors::ParquetError,
|
8
8
|
file::reader::{ChunkReader, Length},
|
9
9
|
};
|
10
|
-
use std::{fs::File,
|
10
|
+
use std::{fs::File, sync::Mutex};
|
11
11
|
use std::{
|
12
|
-
io::{self, BufReader
|
12
|
+
io::{self, BufReader},
|
13
13
|
sync::Arc,
|
14
14
|
};
|
15
15
|
|
16
|
-
use crate::types::ParquetGemError;
|
17
|
-
|
18
16
|
/// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
|
19
17
|
/// and provide a standard Read implementation for them.
|
20
|
-
pub enum
|
18
|
+
pub enum RubyIOReader {
|
21
19
|
String {
|
22
20
|
inner: Opaque<RString>,
|
23
21
|
offset: usize,
|
@@ -32,38 +30,35 @@ pub enum RubyReader {
|
|
32
30
|
|
33
31
|
// Sending is technically not safe, but the only things that threatens to
|
34
32
|
// do this is the parquet gem, and they don't seem to actually do it.
|
35
|
-
unsafe impl Send for
|
33
|
+
unsafe impl Send for RubyIOReader {}
|
36
34
|
|
37
|
-
impl
|
38
|
-
pub fn new(
|
39
|
-
if
|
40
|
-
Ok(
|
35
|
+
impl RubyIOReader {
|
36
|
+
pub fn new(value: Value) -> std::io::Result<Self> {
|
37
|
+
if RubyIOReader::is_seekable_io_like(&value) {
|
38
|
+
Ok(RubyIOReader::RubyIoLike {
|
41
39
|
inner: Opaque::from(value),
|
42
40
|
})
|
43
|
-
} else if
|
44
|
-
let mut temp_file = tempfile::tempfile()
|
45
|
-
.map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
41
|
+
} else if RubyIOReader::is_io_like(&value) {
|
42
|
+
let mut temp_file = tempfile::tempfile()?;
|
46
43
|
|
47
44
|
// This is safe, because we won't call seek
|
48
|
-
let inner_readable =
|
45
|
+
let inner_readable = RubyIOReader::RubyIoLike {
|
49
46
|
inner: Opaque::from(value),
|
50
47
|
};
|
51
48
|
let mut reader = BufReader::new(inner_readable);
|
52
|
-
io::copy(&mut reader, &mut temp_file)
|
53
|
-
|
54
|
-
temp_file
|
55
|
-
.seek(SeekFrom::Start(0))
|
56
|
-
.map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
|
49
|
+
io::copy(&mut reader, &mut temp_file)?;
|
50
|
+
temp_file.seek(SeekFrom::Start(0))?;
|
57
51
|
|
58
|
-
Ok(
|
52
|
+
Ok(RubyIOReader::NativeProxyIoLike {
|
59
53
|
proxy_file: temp_file,
|
60
54
|
})
|
61
55
|
} else {
|
62
56
|
// Try calling `to_str`, and if that fails, try `to_s`
|
63
57
|
let string_content = value
|
64
58
|
.funcall::<_, _, RString>("to_str", ())
|
65
|
-
.or_else(|_| value.funcall::<_, _, RString>("to_s", ()))
|
66
|
-
|
59
|
+
.or_else(|_| value.funcall::<_, _, RString>("to_s", ()))
|
60
|
+
.map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))?;
|
61
|
+
Ok(RubyIOReader::String {
|
67
62
|
inner: Opaque::from(string_content),
|
68
63
|
offset: 0,
|
69
64
|
})
|
@@ -82,12 +77,13 @@ impl RubyReader {
|
|
82
77
|
}
|
83
78
|
}
|
84
79
|
|
85
|
-
impl Seek for
|
80
|
+
impl Seek for RubyIOReader {
|
86
81
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
87
|
-
let ruby =
|
82
|
+
let ruby = Ruby::get()
|
83
|
+
.map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
|
88
84
|
match self {
|
89
|
-
|
90
|
-
|
85
|
+
RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
|
86
|
+
RubyIOReader::String {
|
91
87
|
inner,
|
92
88
|
offset: original_offset,
|
93
89
|
} => {
|
@@ -108,7 +104,7 @@ impl Seek for RubyReader {
|
|
108
104
|
*original_offset = new_offset.min(unwrapped_inner.len());
|
109
105
|
Ok(*original_offset as u64)
|
110
106
|
}
|
111
|
-
|
107
|
+
RubyIOReader::RubyIoLike { inner } => {
|
112
108
|
let unwrapped_inner = ruby.get_inner(*inner);
|
113
109
|
|
114
110
|
let (whence, ruby_offset) = match pos {
|
@@ -131,12 +127,13 @@ impl Seek for RubyReader {
|
|
131
127
|
}
|
132
128
|
}
|
133
129
|
|
134
|
-
impl Read for
|
130
|
+
impl Read for RubyIOReader {
|
135
131
|
fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
|
136
|
-
let ruby =
|
132
|
+
let ruby = Ruby::get()
|
133
|
+
.map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
|
137
134
|
match self {
|
138
|
-
|
139
|
-
|
135
|
+
RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
|
136
|
+
RubyIOReader::String { inner, offset } => {
|
140
137
|
let unwrapped_inner = ruby.get_inner(*inner);
|
141
138
|
|
142
139
|
let string_buffer = unsafe { unwrapped_inner.as_slice() };
|
@@ -152,7 +149,7 @@ impl Read for RubyReader {
|
|
152
149
|
|
153
150
|
Ok(copy_size)
|
154
151
|
}
|
155
|
-
|
152
|
+
RubyIOReader::RubyIoLike { inner } => {
|
156
153
|
let unwrapped_inner = ruby.get_inner(*inner);
|
157
154
|
|
158
155
|
let bytes = unwrapped_inner
|
@@ -172,16 +169,22 @@ impl Read for RubyReader {
|
|
172
169
|
}
|
173
170
|
}
|
174
171
|
|
175
|
-
impl Length for
|
172
|
+
impl Length for RubyIOReader {
|
176
173
|
fn len(&self) -> u64 {
|
177
|
-
let ruby =
|
174
|
+
let ruby = match Ruby::get() {
|
175
|
+
Ok(r) => r,
|
176
|
+
Err(_) => {
|
177
|
+
eprintln!("Failed to get Ruby runtime in RubyIOReader::len");
|
178
|
+
return 0;
|
179
|
+
}
|
180
|
+
};
|
178
181
|
match self {
|
179
|
-
|
180
|
-
|
182
|
+
RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
|
183
|
+
RubyIOReader::String { inner, offset: _ } => {
|
181
184
|
let unwrapped_inner = ruby.get_inner(*inner);
|
182
185
|
unwrapped_inner.len() as u64
|
183
186
|
}
|
184
|
-
|
187
|
+
RubyIOReader::RubyIoLike { inner } => {
|
185
188
|
let unwrapped_inner = ruby.get_inner(*inner);
|
186
189
|
|
187
190
|
// Get current position
|
@@ -222,12 +225,13 @@ impl Length for RubyReader {
|
|
222
225
|
}
|
223
226
|
};
|
224
227
|
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
228
|
+
if current_pos != final_pos {
|
229
|
+
eprintln!(
|
230
|
+
"Failed to restore original position in seekable IO object. Started at position {}, but ended at {}",
|
231
|
+
current_pos,
|
232
|
+
final_pos
|
233
|
+
);
|
234
|
+
}
|
231
235
|
|
232
236
|
size
|
233
237
|
}
|
@@ -238,21 +242,28 @@ impl Length for RubyReader {
|
|
238
242
|
const READ_BUFFER_SIZE: usize = 16 * 1024;
|
239
243
|
|
240
244
|
#[derive(Clone)]
|
241
|
-
pub struct
|
245
|
+
pub struct ThreadSafeRubyIOReader(Arc<Mutex<RubyIOReader>>);
|
242
246
|
|
243
|
-
impl
|
244
|
-
pub fn new(reader:
|
247
|
+
impl ThreadSafeRubyIOReader {
|
248
|
+
pub fn new(reader: RubyIOReader) -> Self {
|
245
249
|
Self(Arc::new(Mutex::new(reader)))
|
246
250
|
}
|
247
251
|
}
|
248
252
|
|
249
|
-
impl Length for
|
253
|
+
impl Length for ThreadSafeRubyIOReader {
|
250
254
|
fn len(&self) -> u64 {
|
251
|
-
self.0.lock()
|
255
|
+
match self.0.lock() {
|
256
|
+
Ok(reader) => reader.len(),
|
257
|
+
Err(_) => {
|
258
|
+
// If the mutex is poisoned, we can't recover, return 0
|
259
|
+
eprintln!("Failed to lock mutex in ThreadSafeRubyIOReader::len");
|
260
|
+
0
|
261
|
+
}
|
262
|
+
}
|
252
263
|
}
|
253
264
|
}
|
254
265
|
|
255
|
-
impl Seek for
|
266
|
+
impl Seek for ThreadSafeRubyIOReader {
|
256
267
|
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
257
268
|
let mut reader = self
|
258
269
|
.0
|
@@ -262,7 +273,7 @@ impl Seek for ThreadSafeRubyReader {
|
|
262
273
|
}
|
263
274
|
}
|
264
275
|
|
265
|
-
impl Read for
|
276
|
+
impl Read for ThreadSafeRubyIOReader {
|
266
277
|
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
|
267
278
|
let mut reader = self
|
268
279
|
.0
|
@@ -272,8 +283,8 @@ impl Read for ThreadSafeRubyReader {
|
|
272
283
|
}
|
273
284
|
}
|
274
285
|
|
275
|
-
impl ChunkReader for
|
276
|
-
type T = BufReader<
|
286
|
+
impl ChunkReader for ThreadSafeRubyIOReader {
|
287
|
+
type T = BufReader<ThreadSafeRubyIOReader>;
|
277
288
|
|
278
289
|
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
279
290
|
let mut reader = self.clone();
|
@@ -296,3 +307,126 @@ impl ChunkReader for ThreadSafeRubyReader {
|
|
296
307
|
Ok(buffer.into())
|
297
308
|
}
|
298
309
|
}
|
310
|
+
|
311
|
+
/// Adapter for Ruby IO objects that implements std::io::Write
|
312
|
+
pub struct RubyIOWriter {
|
313
|
+
io: Value,
|
314
|
+
}
|
315
|
+
|
316
|
+
impl RubyIOWriter {
|
317
|
+
pub fn new(io: Value) -> Self {
|
318
|
+
Self { io }
|
319
|
+
}
|
320
|
+
}
|
321
|
+
|
322
|
+
impl Write for RubyIOWriter {
|
323
|
+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
324
|
+
// Call Ruby IO#write method
|
325
|
+
let ruby = Ruby::get().map_err(|e| {
|
326
|
+
IoError::new(
|
327
|
+
ErrorKind::Other,
|
328
|
+
format!("Failed to get Ruby runtime: {}", e),
|
329
|
+
)
|
330
|
+
})?;
|
331
|
+
|
332
|
+
if buf.is_empty() {
|
333
|
+
return Ok(0);
|
334
|
+
}
|
335
|
+
|
336
|
+
// Convert bytes to Ruby string
|
337
|
+
let ruby_string = ruby.str_from_slice(buf);
|
338
|
+
|
339
|
+
// Call io.write(string)
|
340
|
+
let result: Result<usize, MagnusError> = self.io.funcall("write", (ruby_string,));
|
341
|
+
|
342
|
+
match result {
|
343
|
+
Ok(bytes_written) => Ok(bytes_written),
|
344
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
345
|
+
}
|
346
|
+
}
|
347
|
+
|
348
|
+
fn flush(&mut self) -> std::io::Result<()> {
|
349
|
+
// Call Ruby IO#flush method
|
350
|
+
let result: Result<Value, MagnusError> = self.io.funcall("flush", ());
|
351
|
+
|
352
|
+
match result {
|
353
|
+
Ok(_) => Ok(()),
|
354
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
355
|
+
}
|
356
|
+
}
|
357
|
+
}
|
358
|
+
|
359
|
+
/// Wrapper that implements both Read and Write for Ruby IO objects
|
360
|
+
pub struct RubyIO {
|
361
|
+
io: Value,
|
362
|
+
}
|
363
|
+
|
364
|
+
impl RubyIO {
|
365
|
+
pub fn new(io: Value) -> Self {
|
366
|
+
Self { io }
|
367
|
+
}
|
368
|
+
}
|
369
|
+
|
370
|
+
impl Read for RubyIO {
|
371
|
+
fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
|
372
|
+
RubyIOReader::new(self.io)?.read(buf)
|
373
|
+
}
|
374
|
+
}
|
375
|
+
|
376
|
+
impl Write for RubyIO {
|
377
|
+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
|
378
|
+
RubyIOWriter::new(self.io).write(buf)
|
379
|
+
}
|
380
|
+
|
381
|
+
fn flush(&mut self) -> std::io::Result<()> {
|
382
|
+
RubyIOWriter::new(self.io).flush()
|
383
|
+
}
|
384
|
+
}
|
385
|
+
|
386
|
+
impl Seek for RubyIO {
|
387
|
+
fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
|
388
|
+
RubyIOReader::new(self.io)?.seek(pos)
|
389
|
+
}
|
390
|
+
}
|
391
|
+
|
392
|
+
/// Check if a Ruby value responds to IO methods
|
393
|
+
pub fn is_io_like(value: Value) -> bool {
|
394
|
+
// Check if the object responds to read/write methods
|
395
|
+
let responds_to_read: Result<bool, MagnusError> = value.funcall("respond_to?", ("read",));
|
396
|
+
let responds_to_write: Result<bool, MagnusError> = value.funcall("respond_to?", ("write",));
|
397
|
+
|
398
|
+
matches!(
|
399
|
+
(responds_to_read, responds_to_write),
|
400
|
+
(Ok(true), _) | (_, Ok(true))
|
401
|
+
)
|
402
|
+
}
|
403
|
+
|
404
|
+
/// Create a reader from a Ruby IO-like object
|
405
|
+
pub fn create_reader(io: Value) -> std::io::Result<RubyIOReader> {
|
406
|
+
// Verify it has a read method
|
407
|
+
let responds_to_read: Result<bool, MagnusError> = io.funcall("respond_to?", ("read",));
|
408
|
+
|
409
|
+
match responds_to_read {
|
410
|
+
Ok(true) => RubyIOReader::new(io),
|
411
|
+
Ok(false) => Err(IoError::new(
|
412
|
+
ErrorKind::InvalidInput,
|
413
|
+
"Object does not respond to 'read' method",
|
414
|
+
)),
|
415
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
416
|
+
}
|
417
|
+
}
|
418
|
+
|
419
|
+
/// Create a writer from a Ruby IO-like object
|
420
|
+
pub fn create_writer(io: Value) -> std::io::Result<RubyIOWriter> {
|
421
|
+
// Verify it has a write method
|
422
|
+
let responds_to_write: Result<bool, MagnusError> = io.funcall("respond_to?", ("write",));
|
423
|
+
|
424
|
+
match responds_to_write {
|
425
|
+
Ok(true) => Ok(RubyIOWriter::new(io)),
|
426
|
+
Ok(false) => Err(IoError::new(
|
427
|
+
ErrorKind::InvalidInput,
|
428
|
+
"Object does not respond to 'write' method",
|
429
|
+
)),
|
430
|
+
Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
|
431
|
+
}
|
432
|
+
}
|
@@ -0,0 +1,90 @@
|
|
1
|
+
//! Ruby-specific adapter for parquet-core
|
2
|
+
//!
|
3
|
+
//! This crate provides Ruby-specific implementations of the parquet-core traits,
|
4
|
+
//! enabling seamless integration between Ruby and the core Parquet functionality.
|
5
|
+
//!
|
6
|
+
//! # Overview
|
7
|
+
//!
|
8
|
+
//! The adapter implements three main components:
|
9
|
+
//!
|
10
|
+
//! ## Value Conversion
|
11
|
+
//!
|
12
|
+
//! The [`RubyValueConverter`] implements the `ValueConverter` trait to handle
|
13
|
+
//! conversions between Ruby values (via Magnus) and Parquet values:
|
14
|
+
//!
|
15
|
+
//! - Ruby integers ↔ Parquet int types
|
16
|
+
//! - Ruby floats ↔ Parquet float/double
|
17
|
+
//! - Ruby strings ↔ Parquet strings/binary
|
18
|
+
//! - Ruby BigDecimal ↔ Parquet decimal types
|
19
|
+
//! - Ruby Time/DateTime ↔ Parquet temporal types
|
20
|
+
//! - Ruby arrays/hashes ↔ Parquet lists/maps/structs
|
21
|
+
//!
|
22
|
+
//! ## I/O Operations
|
23
|
+
//!
|
24
|
+
//! The I/O module provides [`RubyIOReader`] and [`RubyIOWriter`], which implement
|
25
|
+
//! `std::io::Read`/`Seek` and `std::io::Write` respectively for Ruby IO objects:
|
26
|
+
//!
|
27
|
+
//! - File objects
|
28
|
+
//! - StringIO for in-memory operations
|
29
|
+
//! - Any Ruby object implementing read/write/seek methods
|
30
|
+
//!
|
31
|
+
//! ## Schema Conversion
|
32
|
+
//!
|
33
|
+
//! Schema utilities for converting between Ruby schema representations and
|
34
|
+
//! parquet-core's schema types:
|
35
|
+
//!
|
36
|
+
//! - Legacy hash-based schemas
|
37
|
+
//! - New DSL-based schemas
|
38
|
+
//! - Automatic type inference from data
|
39
|
+
|
40
|
+
pub mod error;
|
41
|
+
pub use error::{ErrorContext, IntoMagnusError, Result, RubyAdapterError};
|
42
|
+
|
43
|
+
pub mod chunk_reader;
|
44
|
+
pub use chunk_reader::CloneableChunkReader;
|
45
|
+
|
46
|
+
pub mod converter;
|
47
|
+
pub use converter::RubyValueConverter;
|
48
|
+
|
49
|
+
pub mod io;
|
50
|
+
pub use io::{create_reader, is_io_like, RubyIO, RubyIOReader, RubyIOWriter};
|
51
|
+
|
52
|
+
pub mod logger;
|
53
|
+
pub use logger::RubyLogger;
|
54
|
+
|
55
|
+
pub mod schema;
|
56
|
+
pub use schema::{
|
57
|
+
convert_legacy_schema, extract_field_schemas, is_dsl_schema, parquet_schema_to_ruby,
|
58
|
+
process_schema_value, ruby_schema_to_parquet, RubySchemaBuilder,
|
59
|
+
};
|
60
|
+
|
61
|
+
pub mod string_cache;
|
62
|
+
pub use string_cache::StringCache;
|
63
|
+
|
64
|
+
pub mod batch_manager;
|
65
|
+
pub use batch_manager::{
|
66
|
+
BatchSizeManager, DEFAULT_MEMORY_THRESHOLD, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE, SAMPLE_SIZE,
|
67
|
+
};
|
68
|
+
|
69
|
+
pub mod metadata;
|
70
|
+
pub use metadata::{parse_metadata, RubyParquetMetaData};
|
71
|
+
|
72
|
+
pub mod types;
|
73
|
+
pub use types::{
|
74
|
+
ColumnEnumeratorArgs, ParquetWriteArgs, ParserResultType, RowEnumeratorArgs, WriterOutput,
|
75
|
+
};
|
76
|
+
|
77
|
+
pub mod utils;
|
78
|
+
pub use utils::{
|
79
|
+
create_column_enumerator, create_row_enumerator, estimate_parquet_value_size,
|
80
|
+
estimate_row_size, handle_block_or_enum, parse_compression, parse_parquet_write_args,
|
81
|
+
};
|
82
|
+
|
83
|
+
pub mod reader;
|
84
|
+
pub use reader::{each_column, each_row};
|
85
|
+
|
86
|
+
pub mod writer;
|
87
|
+
pub use writer::{create_writer, finalize_writer, write_columns, write_rows};
|
88
|
+
|
89
|
+
pub mod try_into_value;
|
90
|
+
pub use try_into_value::TryIntoValue;
|
@@ -0,0 +1,64 @@
|
|
1
|
+
use magnus::value::ReprValue;
|
2
|
+
use magnus::{Error as MagnusError, Value};
|
3
|
+
|
4
|
+
pub struct RubyLogger {
|
5
|
+
logger: Option<Value>,
|
6
|
+
}
|
7
|
+
|
8
|
+
impl RubyLogger {
|
9
|
+
pub fn new(logger: Option<Value>) -> Result<Self, MagnusError> {
|
10
|
+
// Validate logger has required methods if provided
|
11
|
+
if let Some(ref log) = logger {
|
12
|
+
for method in &["debug", "info", "warn", "error"] {
|
13
|
+
if !log.respond_to(*method, false)? {
|
14
|
+
return Err(MagnusError::new(
|
15
|
+
magnus::exception::arg_error(),
|
16
|
+
format!("Logger must respond to {}", method),
|
17
|
+
));
|
18
|
+
}
|
19
|
+
}
|
20
|
+
}
|
21
|
+
Ok(Self { logger })
|
22
|
+
}
|
23
|
+
|
24
|
+
pub fn debug<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
|
25
|
+
if let Some(ref logger) = self.logger {
|
26
|
+
logger.funcall::<_, _, Value>("debug", (msg_fn(),))?;
|
27
|
+
}
|
28
|
+
Ok(())
|
29
|
+
}
|
30
|
+
|
31
|
+
pub fn info<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
|
32
|
+
if let Some(ref logger) = self.logger {
|
33
|
+
logger.funcall::<_, _, Value>("info", (msg_fn(),))?;
|
34
|
+
}
|
35
|
+
Ok(())
|
36
|
+
}
|
37
|
+
|
38
|
+
pub fn warn<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
|
39
|
+
if let Some(ref logger) = self.logger {
|
40
|
+
logger.funcall::<_, _, Value>("warn", (msg_fn(),))?;
|
41
|
+
}
|
42
|
+
Ok(())
|
43
|
+
}
|
44
|
+
|
45
|
+
pub fn error<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
|
46
|
+
if let Some(ref logger) = self.logger {
|
47
|
+
logger.funcall::<_, _, Value>("error", (msg_fn(),))?;
|
48
|
+
}
|
49
|
+
Ok(())
|
50
|
+
}
|
51
|
+
|
52
|
+
pub fn inner(&self) -> Option<Value> {
|
53
|
+
self.logger
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
// Make RubyLogger cloneable for passing to multiple functions
|
58
|
+
impl Clone for RubyLogger {
|
59
|
+
fn clone(&self) -> Self {
|
60
|
+
Self {
|
61
|
+
logger: self.logger,
|
62
|
+
}
|
63
|
+
}
|
64
|
+
}
|