parquet-tyfoom 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. checksums.yaml +7 -0
  2. data/Cargo.lock +1854 -0
  3. data/Cargo.toml +3 -0
  4. data/Gemfile +21 -0
  5. data/LICENSE +21 -0
  6. data/README.md +428 -0
  7. data/Rakefile +43 -0
  8. data/ext/parquet/Cargo.toml +39 -0
  9. data/ext/parquet/build.rs +5 -0
  10. data/ext/parquet/extconf.rb +4 -0
  11. data/ext/parquet/src/adapter_ffi.rs +297 -0
  12. data/ext/parquet/src/allocator.rs +13 -0
  13. data/ext/parquet/src/lib.rs +24 -0
  14. data/ext/parquet-core/Cargo.toml +24 -0
  15. data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
  16. data/ext/parquet-core/src/error.rs +189 -0
  17. data/ext/parquet-core/src/lib.rs +60 -0
  18. data/ext/parquet-core/src/reader.rs +368 -0
  19. data/ext/parquet-core/src/schema.rs +452 -0
  20. data/ext/parquet-core/src/test_utils.rs +308 -0
  21. data/ext/parquet-core/src/traits/mod.rs +5 -0
  22. data/ext/parquet-core/src/traits/schema.rs +190 -0
  23. data/ext/parquet-core/src/value.rs +220 -0
  24. data/ext/parquet-core/src/writer.rs +1241 -0
  25. data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
  26. data/ext/parquet-core/tests/binary_data.rs +437 -0
  27. data/ext/parquet-core/tests/column_projection.rs +557 -0
  28. data/ext/parquet-core/tests/complex_types.rs +821 -0
  29. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  30. data/ext/parquet-core/tests/concurrent_access.rs +431 -0
  31. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  32. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  33. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
  34. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  35. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  36. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  37. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  38. data/ext/parquet-core/tests/review_regressions.rs +787 -0
  39. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  40. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
  41. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  42. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  43. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  44. data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
  45. data/ext/parquet-ruby-adapter/build.rs +5 -0
  46. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  47. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  48. data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
  49. data/ext/parquet-ruby-adapter/src/error.rs +141 -0
  50. data/ext/parquet-ruby-adapter/src/io.rs +432 -0
  51. data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
  52. data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
  53. data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
  54. data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
  55. data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
  56. data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
  57. data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
  58. data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
  59. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  60. data/ext/parquet-ruby-adapter/src/types.rs +98 -0
  61. data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
  62. data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
  63. data/lib/parquet/schema.rb +262 -0
  64. data/lib/parquet/version.rb +3 -0
  65. data/lib/parquet.rb +11 -0
  66. data/lib/parquet.rbi +181 -0
  67. metadata +165 -0
@@ -0,0 +1,141 @@
1
+ use magnus::{Error as MagnusError, Ruby};
2
+ use parquet_core::ParquetError as CoreParquetError;
3
+ use std::fmt::Display;
4
+ use thiserror::Error;
5
+
6
+ /// Error type for parquet-ruby-adapter
7
+ #[derive(Error, Debug)]
8
+ pub enum RubyAdapterError {
9
+ /// Core parquet errors
10
+ #[error("Parquet error: {0}")]
11
+ Parquet(#[from] CoreParquetError),
12
+
13
+ /// Magnus/Ruby errors
14
+ #[error("Ruby error: {0}")]
15
+ Ruby(String),
16
+
17
+ /// IO errors
18
+ #[error("IO error: {0}")]
19
+ Io(#[from] std::io::Error),
20
+
21
+ /// Type conversion errors
22
+ #[error("Type conversion error: {0}")]
23
+ TypeConversion(String),
24
+
25
+ /// Schema conversion errors
26
+ #[error("Schema conversion error: {0}")]
27
+ SchemaConversion(String),
28
+
29
+ /// Metadata extraction errors
30
+ #[error("Metadata error: {0}")]
31
+ Metadata(String),
32
+
33
+ /// Invalid input errors
34
+ #[error("Invalid input: {0}")]
35
+ InvalidInput(String),
36
+
37
+ /// Runtime errors
38
+ #[error("Runtime error: {0}")]
39
+ Runtime(String),
40
+ }
41
+
42
+ pub type Result<T> = std::result::Result<T, RubyAdapterError>;
43
+
44
+ impl RubyAdapterError {
45
+ /// Create a new Ruby error
46
+ pub fn ruby<S: Into<String>>(msg: S) -> Self {
47
+ RubyAdapterError::Ruby(msg.into())
48
+ }
49
+
50
+ /// Create a new type conversion error
51
+ pub fn type_conversion<S: Into<String>>(msg: S) -> Self {
52
+ RubyAdapterError::TypeConversion(msg.into())
53
+ }
54
+
55
+ /// Create a new schema conversion error
56
+ pub fn schema_conversion<S: Into<String>>(msg: S) -> Self {
57
+ RubyAdapterError::SchemaConversion(msg.into())
58
+ }
59
+
60
+ /// Create a new metadata error
61
+ pub fn metadata<S: Into<String>>(msg: S) -> Self {
62
+ RubyAdapterError::Metadata(msg.into())
63
+ }
64
+
65
+ /// Create a new invalid input error
66
+ pub fn invalid_input<S: Into<String>>(msg: S) -> Self {
67
+ RubyAdapterError::InvalidInput(msg.into())
68
+ }
69
+
70
+ /// Create a new runtime error
71
+ pub fn runtime<S: Into<String>>(msg: S) -> Self {
72
+ RubyAdapterError::Runtime(msg.into())
73
+ }
74
+ }
75
+
76
+ /// Convert RubyAdapterError to MagnusError
77
+ impl From<RubyAdapterError> for MagnusError {
78
+ fn from(err: RubyAdapterError) -> Self {
79
+ // This conversion only runs at the FFI boundary, where the GVL is held
80
+ // and a Ruby handle is always available. A Ruby exception cannot be
81
+ // constructed without that handle, so an unavailable runtime is an
82
+ // impossible state we fail fast on rather than paper over.
83
+ let ruby = Ruby::get().unwrap_or_else(|unavailable| {
84
+ panic!("cannot build Ruby exception off the Ruby thread ({unavailable}); source error: {err}")
85
+ });
86
+ let class = match &err {
87
+ RubyAdapterError::Io(_) => ruby.exception_io_error(),
88
+ RubyAdapterError::TypeConversion(_) => ruby.exception_type_error(),
89
+ RubyAdapterError::InvalidInput(_) => ruby.exception_arg_error(),
90
+ _ => ruby.exception_runtime_error(),
91
+ };
92
+ MagnusError::new(class, err.to_string())
93
+ }
94
+ }
95
+
96
+ /// Extension trait to convert errors to MagnusError at the boundary
97
+ pub trait IntoMagnusError<T> {
98
+ /// Convert to MagnusError
99
+ fn into_magnus_error(self) -> std::result::Result<T, MagnusError>;
100
+ }
101
+
102
+ impl<T> IntoMagnusError<T> for Result<T> {
103
+ fn into_magnus_error(self) -> std::result::Result<T, MagnusError> {
104
+ self.map_err(Into::into)
105
+ }
106
+ }
107
+
108
+ /// Extension trait to add context to errors
109
+ pub trait ErrorContext<T> {
110
+ /// Add context to an error
111
+ fn context<S: Display>(self, ctx: S) -> Result<T>;
112
+
113
+ /// Add context with a closure that's only called on error
114
+ fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T>;
115
+ }
116
+
117
+ impl<T, E> ErrorContext<T> for std::result::Result<T, E>
118
+ where
119
+ E: Into<RubyAdapterError>,
120
+ {
121
+ fn context<S: Display>(self, ctx: S) -> Result<T> {
122
+ self.map_err(|e| {
123
+ let base_error = e.into();
124
+ RubyAdapterError::Runtime(format!("{}: {}", ctx, base_error))
125
+ })
126
+ }
127
+
128
+ fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T> {
129
+ self.map_err(|e| {
130
+ let base_error = e.into();
131
+ RubyAdapterError::Runtime(format!("{}: {}", f(), base_error))
132
+ })
133
+ }
134
+ }
135
+
136
+ /// Convert from MagnusError to RubyAdapterError
137
+ impl From<MagnusError> for RubyAdapterError {
138
+ fn from(err: MagnusError) -> Self {
139
+ RubyAdapterError::Ruby(err.to_string())
140
+ }
141
+ }
@@ -0,0 +1,432 @@
1
+ use bytes::Bytes;
2
+ use magnus::value::{Opaque, ReprValue};
3
+ use magnus::{Error as MagnusError, RString, Ruby, Value};
4
+ use std::io::{Error as IoError, ErrorKind, Read, Seek, SeekFrom, Write};
5
+
6
+ use parquet::{
7
+ errors::ParquetError,
8
+ file::reader::{ChunkReader, Length},
9
+ };
10
+ use std::{fs::File, sync::Mutex};
11
+ use std::{
12
+ io::{self, BufReader},
13
+ sync::Arc,
14
+ };
15
+
16
+ /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
17
+ /// and provide a standard Read implementation for them.
18
+ pub enum RubyIOReader {
19
+ String {
20
+ inner: Opaque<RString>,
21
+ offset: usize,
22
+ },
23
+ RubyIoLike {
24
+ inner: Opaque<Value>,
25
+ },
26
+ NativeProxyIoLike {
27
+ proxy_file: File,
28
+ },
29
+ }
30
+
31
+ // Sending is technically not safe, but the only things that threatens to
32
+ // do this is the parquet gem, and they don't seem to actually do it.
33
+ unsafe impl Send for RubyIOReader {}
34
+
35
+ impl RubyIOReader {
36
+ pub fn new(value: Value) -> std::io::Result<Self> {
37
+ if RubyIOReader::is_seekable_io_like(&value) {
38
+ Ok(RubyIOReader::RubyIoLike {
39
+ inner: Opaque::from(value),
40
+ })
41
+ } else if RubyIOReader::is_io_like(&value) {
42
+ let mut temp_file = tempfile::tempfile()?;
43
+
44
+ // This is safe, because we won't call seek
45
+ let inner_readable = RubyIOReader::RubyIoLike {
46
+ inner: Opaque::from(value),
47
+ };
48
+ let mut reader = BufReader::new(inner_readable);
49
+ io::copy(&mut reader, &mut temp_file)?;
50
+ temp_file.seek(SeekFrom::Start(0))?;
51
+
52
+ Ok(RubyIOReader::NativeProxyIoLike {
53
+ proxy_file: temp_file,
54
+ })
55
+ } else {
56
+ // Try calling `to_str`, and if that fails, try `to_s`
57
+ let string_content = value
58
+ .funcall::<_, _, RString>("to_str", ())
59
+ .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))
60
+ .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))?;
61
+ Ok(RubyIOReader::String {
62
+ inner: Opaque::from(string_content),
63
+ offset: 0,
64
+ })
65
+ }
66
+ }
67
+
68
+ fn is_io_like(value: &Value) -> bool {
69
+ value.respond_to("read", false).unwrap_or(false)
70
+ }
71
+
72
+ // For now, don't use this. Having to use seek in length is scary.
73
+ fn is_seekable_io_like(value: &Value) -> bool {
74
+ Self::is_io_like(value)
75
+ && value.respond_to("seek", false).unwrap_or(false)
76
+ && value.respond_to("pos", false).unwrap_or(false)
77
+ }
78
+ }
79
+
80
+ impl Seek for RubyIOReader {
81
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
82
+ let ruby = Ruby::get()
83
+ .map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
84
+ match self {
85
+ RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
86
+ RubyIOReader::String {
87
+ inner,
88
+ offset: original_offset,
89
+ } => {
90
+ let unwrapped_inner = ruby.get_inner(*inner);
91
+
92
+ let new_offset = match pos {
93
+ SeekFrom::Start(off) => off as usize,
94
+ SeekFrom::Current(off) => {
95
+ let signed = *original_offset as i64 + off;
96
+ signed.max(0) as usize
97
+ }
98
+ SeekFrom::End(off) => {
99
+ let signed = unwrapped_inner.len() as i64 + off;
100
+ signed.max(0) as usize
101
+ }
102
+ };
103
+
104
+ *original_offset = new_offset.min(unwrapped_inner.len());
105
+ Ok(*original_offset as u64)
106
+ }
107
+ RubyIOReader::RubyIoLike { inner } => {
108
+ let unwrapped_inner = ruby.get_inner(*inner);
109
+
110
+ let (whence, ruby_offset) = match pos {
111
+ SeekFrom::Start(i) => (0, i as i64),
112
+ SeekFrom::Current(i) => (1, i),
113
+ SeekFrom::End(i) => (2, i),
114
+ };
115
+
116
+ unwrapped_inner
117
+ .funcall::<_, _, u64>("seek", (ruby_offset, whence))
118
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
119
+
120
+ let new_position = unwrapped_inner
121
+ .funcall::<_, _, u64>("pos", ())
122
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
123
+
124
+ Ok(new_position)
125
+ }
126
+ }
127
+ }
128
+ }
129
+
130
+ impl Read for RubyIOReader {
131
+ fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
132
+ let ruby = Ruby::get()
133
+ .map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
134
+ match self {
135
+ RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
136
+ RubyIOReader::String { inner, offset } => {
137
+ let unwrapped_inner = ruby.get_inner(*inner);
138
+
139
+ let string_buffer = unsafe { unwrapped_inner.as_slice() };
140
+ if *offset >= string_buffer.len() {
141
+ return Ok(0); // EOF
142
+ }
143
+
144
+ let remaining = string_buffer.len() - *offset;
145
+ let copy_size = remaining.min(buf.len());
146
+ buf[..copy_size].copy_from_slice(&string_buffer[*offset..*offset + copy_size]);
147
+
148
+ *offset += copy_size;
149
+
150
+ Ok(copy_size)
151
+ }
152
+ RubyIOReader::RubyIoLike { inner } => {
153
+ let unwrapped_inner = ruby.get_inner(*inner);
154
+
155
+ let bytes = unwrapped_inner
156
+ .funcall::<_, _, Option<RString>>("read", (buf.len(),))
157
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
158
+
159
+ match bytes {
160
+ Some(bytes) => {
161
+ let string_buffer = unsafe { bytes.as_slice() };
162
+ buf.write_all(string_buffer)?;
163
+ Ok(string_buffer.len())
164
+ }
165
+ None => Ok(0),
166
+ }
167
+ }
168
+ }
169
+ }
170
+ }
171
+
172
+ impl Length for RubyIOReader {
173
+ fn len(&self) -> u64 {
174
+ let ruby = match Ruby::get() {
175
+ Ok(r) => r,
176
+ Err(_) => {
177
+ eprintln!("Failed to get Ruby runtime in RubyIOReader::len");
178
+ return 0;
179
+ }
180
+ };
181
+ match self {
182
+ RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
183
+ RubyIOReader::String { inner, offset: _ } => {
184
+ let unwrapped_inner = ruby.get_inner(*inner);
185
+ unwrapped_inner.len() as u64
186
+ }
187
+ RubyIOReader::RubyIoLike { inner } => {
188
+ let unwrapped_inner = ruby.get_inner(*inner);
189
+
190
+ // Get current position
191
+ let current_pos = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
192
+ Ok(pos) => pos,
193
+ Err(e) => {
194
+ eprintln!("Error seeking: {}", e);
195
+ return 0;
196
+ }
197
+ };
198
+
199
+ // Seek to end
200
+ if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (0, 2)) {
201
+ eprintln!("Error seeking: {}", e);
202
+ return 0;
203
+ }
204
+
205
+ // Offset at the end of the file is the length of the file
206
+ let size = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
207
+ Ok(pos) => pos,
208
+ Err(e) => {
209
+ eprintln!("Error seeking: {}", e);
210
+ return 0;
211
+ }
212
+ };
213
+
214
+ // Restore original position
215
+ if let Err(e) = unwrapped_inner.funcall::<_, _, u64>("seek", (current_pos, 0)) {
216
+ eprintln!("Error seeking: {}", e);
217
+ return 0;
218
+ }
219
+
220
+ let final_pos = match unwrapped_inner.funcall::<_, _, u64>("pos", ()) {
221
+ Ok(pos) => pos,
222
+ Err(e) => {
223
+ eprintln!("Error seeking: {}", e);
224
+ return 0;
225
+ }
226
+ };
227
+
228
+ if current_pos != final_pos {
229
+ eprintln!(
230
+ "Failed to restore original position in seekable IO object. Started at position {}, but ended at {}",
231
+ current_pos,
232
+ final_pos
233
+ );
234
+ }
235
+
236
+ size
237
+ }
238
+ }
239
+ }
240
+ }
241
+
242
+ const READ_BUFFER_SIZE: usize = 16 * 1024;
243
+
244
+ #[derive(Clone)]
245
+ pub struct ThreadSafeRubyIOReader(Arc<Mutex<RubyIOReader>>);
246
+
247
+ impl ThreadSafeRubyIOReader {
248
+ pub fn new(reader: RubyIOReader) -> Self {
249
+ Self(Arc::new(Mutex::new(reader)))
250
+ }
251
+ }
252
+
253
+ impl Length for ThreadSafeRubyIOReader {
254
+ fn len(&self) -> u64 {
255
+ match self.0.lock() {
256
+ Ok(reader) => reader.len(),
257
+ Err(_) => {
258
+ // If the mutex is poisoned, we can't recover, return 0
259
+ eprintln!("Failed to lock mutex in ThreadSafeRubyIOReader::len");
260
+ 0
261
+ }
262
+ }
263
+ }
264
+ }
265
+
266
+ impl Seek for ThreadSafeRubyIOReader {
267
+ fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
268
+ let mut reader = self
269
+ .0
270
+ .lock()
271
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
272
+ reader.seek(pos)
273
+ }
274
+ }
275
+
276
+ impl Read for ThreadSafeRubyIOReader {
277
+ fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
278
+ let mut reader = self
279
+ .0
280
+ .lock()
281
+ .map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
282
+ reader.read(buf)
283
+ }
284
+ }
285
+
286
+ impl ChunkReader for ThreadSafeRubyIOReader {
287
+ type T = BufReader<ThreadSafeRubyIOReader>;
288
+
289
+ fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
290
+ let mut reader = self.clone();
291
+ reader.seek(SeekFrom::Start(start))?;
292
+ Ok(BufReader::with_capacity(READ_BUFFER_SIZE, reader))
293
+ }
294
+
295
+ fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
296
+ let mut buffer = Vec::with_capacity(length);
297
+ let mut reader = self.clone();
298
+ reader.seek(SeekFrom::Start(start))?;
299
+ let read = reader.take(length as _).read_to_end(&mut buffer)?;
300
+
301
+ if read != length {
302
+ return Err(ParquetError::EOF(format!(
303
+ "Expected to read {} bytes, read only {}",
304
+ length, read
305
+ )));
306
+ }
307
+ Ok(buffer.into())
308
+ }
309
+ }
310
+
311
+ /// Adapter for Ruby IO objects that implements std::io::Write
312
+ pub struct RubyIOWriter {
313
+ io: Value,
314
+ }
315
+
316
+ impl RubyIOWriter {
317
+ pub fn new(io: Value) -> Self {
318
+ Self { io }
319
+ }
320
+ }
321
+
322
+ impl Write for RubyIOWriter {
323
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
324
+ // Call Ruby IO#write method
325
+ let ruby = Ruby::get().map_err(|e| {
326
+ IoError::new(
327
+ ErrorKind::Other,
328
+ format!("Failed to get Ruby runtime: {}", e),
329
+ )
330
+ })?;
331
+
332
+ if buf.is_empty() {
333
+ return Ok(0);
334
+ }
335
+
336
+ // Convert bytes to Ruby string
337
+ let ruby_string = ruby.str_from_slice(buf);
338
+
339
+ // Call io.write(string)
340
+ let result: Result<usize, MagnusError> = self.io.funcall("write", (ruby_string,));
341
+
342
+ match result {
343
+ Ok(bytes_written) => Ok(bytes_written),
344
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
345
+ }
346
+ }
347
+
348
+ fn flush(&mut self) -> std::io::Result<()> {
349
+ // Call Ruby IO#flush method
350
+ let result: Result<Value, MagnusError> = self.io.funcall("flush", ());
351
+
352
+ match result {
353
+ Ok(_) => Ok(()),
354
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
355
+ }
356
+ }
357
+ }
358
+
359
+ /// Wrapper that implements both Read and Write for Ruby IO objects
360
+ pub struct RubyIO {
361
+ io: Value,
362
+ }
363
+
364
+ impl RubyIO {
365
+ pub fn new(io: Value) -> Self {
366
+ Self { io }
367
+ }
368
+ }
369
+
370
+ impl Read for RubyIO {
371
+ fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
372
+ RubyIOReader::new(self.io)?.read(buf)
373
+ }
374
+ }
375
+
376
+ impl Write for RubyIO {
377
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
378
+ RubyIOWriter::new(self.io).write(buf)
379
+ }
380
+
381
+ fn flush(&mut self) -> std::io::Result<()> {
382
+ RubyIOWriter::new(self.io).flush()
383
+ }
384
+ }
385
+
386
+ impl Seek for RubyIO {
387
+ fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
388
+ RubyIOReader::new(self.io)?.seek(pos)
389
+ }
390
+ }
391
+
392
+ /// Check if a Ruby value responds to IO methods
393
+ pub fn is_io_like(value: Value) -> bool {
394
+ // Check if the object responds to read/write methods
395
+ let responds_to_read: Result<bool, MagnusError> = value.funcall("respond_to?", ("read",));
396
+ let responds_to_write: Result<bool, MagnusError> = value.funcall("respond_to?", ("write",));
397
+
398
+ matches!(
399
+ (responds_to_read, responds_to_write),
400
+ (Ok(true), _) | (_, Ok(true))
401
+ )
402
+ }
403
+
404
+ /// Create a reader from a Ruby IO-like object
405
+ pub fn create_reader(io: Value) -> std::io::Result<RubyIOReader> {
406
+ // Verify it has a read method
407
+ let responds_to_read: Result<bool, MagnusError> = io.funcall("respond_to?", ("read",));
408
+
409
+ match responds_to_read {
410
+ Ok(true) => RubyIOReader::new(io),
411
+ Ok(false) => Err(IoError::new(
412
+ ErrorKind::InvalidInput,
413
+ "Object does not respond to 'read' method",
414
+ )),
415
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
416
+ }
417
+ }
418
+
419
+ /// Create a writer from a Ruby IO-like object
420
+ pub fn create_writer(io: Value) -> std::io::Result<RubyIOWriter> {
421
+ // Verify it has a write method
422
+ let responds_to_write: Result<bool, MagnusError> = io.funcall("respond_to?", ("write",));
423
+
424
+ match responds_to_write {
425
+ Ok(true) => Ok(RubyIOWriter::new(io)),
426
+ Ok(false) => Err(IoError::new(
427
+ ErrorKind::InvalidInput,
428
+ "Object does not respond to 'write' method",
429
+ )),
430
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
431
+ }
432
+ }
@@ -0,0 +1,91 @@
1
+ //! Ruby-specific adapter for parquet-core
2
+ //!
3
+ //! This crate provides Ruby-specific implementations of the parquet-core traits,
4
+ //! enabling seamless integration between Ruby and the core Parquet functionality.
5
+ //!
6
+ //! # Overview
7
+ //!
8
+ //! The adapter implements three main components:
9
+ //!
10
+ //! ## Value Conversion
11
+ //!
12
+ //! The [`RubyValueConverter`] implements the `ValueConverter` trait to handle
13
+ //! conversions between Ruby values (via Magnus) and Parquet values:
14
+ //!
15
+ //! - Ruby integers ↔ Parquet int types
16
+ //! - Ruby floats ↔ Parquet float/double
17
+ //! - Ruby strings ↔ Parquet strings/binary
18
+ //! - Ruby BigDecimal ↔ Parquet decimal types
19
+ //! - Ruby Time/DateTime ↔ Parquet temporal types
20
+ //! - Ruby arrays/hashes ↔ Parquet lists/maps/structs
21
+ //!
22
+ //! ## I/O Operations
23
+ //!
24
+ //! The I/O module provides [`RubyIOReader`] (`Read`/`Seek`) and [`RubyIOWriter`] (`Write`);
25
+ //! its `ThreadSafeRubyIOReader` wrapper implements the `ChunkReader` trait for Ruby IO objects:
26
+ //!
27
+ //! - File objects
28
+ //! - StringIO for in-memory operations
29
+ //! - Any Ruby object implementing read/write/seek methods
30
+ //!
31
+ //! ## Schema Conversion
32
+ //!
33
+ //! Schema utilities for converting between Ruby schema representations and
34
+ //! parquet-core's schema types:
35
+ //!
36
+ //! - Legacy hash-based schemas
37
+ //! - New DSL-based schemas
38
+ //! - Automatic type inference from data
39
+
40
+ pub mod error;
41
+ pub use error::{ErrorContext, IntoMagnusError, Result, RubyAdapterError};
42
+
43
+ pub mod chunk_reader;
44
+ pub use chunk_reader::CloneableChunkReader;
45
+
46
+ pub mod converter;
47
+ pub use converter::RubyValueConverter;
48
+
49
+ pub mod io;
50
+ pub use io::{create_reader, is_io_like, RubyIO, RubyIOReader, RubyIOWriter};
51
+
52
+ pub mod logger;
53
+ pub use logger::RubyLogger;
54
+
55
+ pub mod schema;
56
+ pub use schema::{
57
+ convert_legacy_schema, extract_field_schemas, is_dsl_schema, parquet_schema_to_ruby,
58
+ process_schema_value, ruby_schema_to_parquet, RubySchemaBuilder,
59
+ };
60
+
61
+ pub mod string_cache;
62
+ pub use string_cache::StringCache;
63
+
64
+ pub mod string_storage;
65
+ pub use string_storage::{
66
+ StringStorage, StringStorageConfig, StringStorageMode, DEFAULT_SHARED_MAX_ENTRIES,
67
+ DEFAULT_SHARED_MAX_VALUE_BYTES,
68
+ };
69
+
70
+ pub mod metadata;
71
+ pub use metadata::{parse_metadata, RubyParquetMetaData};
72
+
73
+ pub mod types;
74
+ pub use types::{
75
+ ColumnEnumeratorArgs, ParquetWriteArgs, ParserResultType, RowEnumeratorArgs, WriterOutput,
76
+ };
77
+
78
+ pub mod utils;
79
+ pub use utils::{
80
+ create_column_enumerator, create_row_enumerator, handle_block_or_enum, parse_compression,
81
+ parse_parquet_write_args,
82
+ };
83
+
84
+ pub mod reader;
85
+ pub use reader::{each_column, each_row};
86
+
87
+ pub mod writer;
88
+ pub use writer::{create_writer, finalize_writer, write_columns, write_rows};
89
+
90
+ pub mod try_into_value;
91
+ pub use try_into_value::TryIntoValue;