parquet 0.5.12 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,148 @@
1
+ use magnus::{Error as MagnusError, Ruby};
2
+ use parquet_core::ParquetError as CoreParquetError;
3
+ use std::fmt::Display;
4
+ use thiserror::Error;
5
+
6
+ /// Error type for parquet-ruby-adapter, unifying parquet-core, Ruby, IO, conversion, metadata, input and runtime failures
7
+ #[derive(Error, Debug)]
8
+ pub enum RubyAdapterError {
9
+ /// Core parquet errors; `#[from]` lets `?` convert a `CoreParquetError` into this variant
10
+ #[error("Parquet error: {0}")]
11
+ Parquet(#[from] CoreParquetError),
12
+
13
+ /// Magnus/Ruby errors, carried as a message string
14
+ #[error("Ruby error: {0}")]
15
+ Ruby(String),
16
+
17
+ /// IO errors; `#[from]` lets `?` convert a `std::io::Error` into this variant
18
+ #[error("IO error: {0}")]
19
+ Io(#[from] std::io::Error),
20
+
21
+ /// Type conversion errors, carried as a message string
22
+ #[error("Type conversion error: {0}")]
23
+ TypeConversion(String),
24
+
25
+ /// Schema conversion errors, carried as a message string
26
+ #[error("Schema conversion error: {0}")]
27
+ SchemaConversion(String),
28
+
29
+ /// Metadata extraction errors, carried as a message string
30
+ #[error("Metadata error: {0}")]
31
+ Metadata(String),
32
+
33
+ /// Invalid input errors, carried as a message string
34
+ #[error("Invalid input: {0}")]
35
+ InvalidInput(String),
36
+
37
+ /// Runtime errors, carried as a message string
38
+ #[error("Runtime error: {0}")]
39
+ Runtime(String),
40
+ }
41
+
42
+ pub type Result<T> = std::result::Result<T, RubyAdapterError>; // Result alias whose error type is fixed to RubyAdapterError
43
+
44
+ impl RubyAdapterError {
45
+ /// Create a new `Ruby` error from anything convertible into a `String`
46
+ pub fn ruby<S: Into<String>>(msg: S) -> Self {
47
+ RubyAdapterError::Ruby(msg.into())
48
+ }
49
+
50
+ /// Create a new `TypeConversion` error from anything convertible into a `String`
51
+ pub fn type_conversion<S: Into<String>>(msg: S) -> Self {
52
+ RubyAdapterError::TypeConversion(msg.into())
53
+ }
54
+
55
+ /// Create a new `SchemaConversion` error from anything convertible into a `String`
56
+ pub fn schema_conversion<S: Into<String>>(msg: S) -> Self {
57
+ RubyAdapterError::SchemaConversion(msg.into())
58
+ }
59
+
60
+ /// Create a new `Metadata` error from anything convertible into a `String`
61
+ pub fn metadata<S: Into<String>>(msg: S) -> Self {
62
+ RubyAdapterError::Metadata(msg.into())
63
+ }
64
+
65
+ /// Create a new `InvalidInput` error from anything convertible into a `String`
66
+ pub fn invalid_input<S: Into<String>>(msg: S) -> Self {
67
+ RubyAdapterError::InvalidInput(msg.into())
68
+ }
69
+
70
+ /// Create a new `Runtime` error from anything convertible into a `String`
71
+ pub fn runtime<S: Into<String>>(msg: S) -> Self {
72
+ RubyAdapterError::Runtime(msg.into())
73
+ }
74
+ }
75
+
76
+ /// Convert RubyAdapterError to MagnusError, choosing the Ruby exception class from the variant and using the Display text as message
77
+ impl From<RubyAdapterError> for MagnusError {
78
+ fn from(err: RubyAdapterError) -> Self {
79
+ match Ruby::get() {
80
+ Ok(ruby) => match &err {
81
+ RubyAdapterError::Io(_) => {
82
+ MagnusError::new(ruby.exception_io_error(), err.to_string())
83
+ }
84
+ RubyAdapterError::TypeConversion(_) => {
85
+ MagnusError::new(ruby.exception_type_error(), err.to_string())
86
+ }
87
+ RubyAdapterError::InvalidInput(_) => {
88
+ MagnusError::new(ruby.exception_arg_error(), err.to_string())
89
+ }
90
+ _ => MagnusError::new(ruby.exception_runtime_error(), err.to_string()), // every other variant is raised as a runtime error
91
+ },
92
+ Err(_) => {
93
+ // Fallback if we can't get Ruby runtime: still a runtime error, with the original error text appended
94
+ MagnusError::new(
95
+ magnus::exception::runtime_error(),
96
+ format!("Failed to get Ruby runtime: {}", err),
97
+ )
98
+ }
99
+ }
100
+ }
101
+ }
102
+
103
+ /// Extension trait to convert errors to MagnusError at the boundary; implemented below for this module's `Result<T>`
104
+ pub trait IntoMagnusError<T> {
105
+ /// Convert the error side to MagnusError, leaving `Ok` values untouched
106
+ fn into_magnus_error(self) -> std::result::Result<T, MagnusError>;
107
+ }
108
+
109
+ impl<T> IntoMagnusError<T> for Result<T> {
110
+ fn into_magnus_error(self) -> std::result::Result<T, MagnusError> {
111
+ self.map_err(Into::into) // relies on `From<RubyAdapterError> for MagnusError`
112
+ }
113
+ }
114
+
115
+ /// Extension trait to add context to errors, for any `Result` whose error converts into `RubyAdapterError`
116
+ pub trait ErrorContext<T> {
117
+ /// Add context to an error; in the impl below the result is a `Runtime` error reading "<ctx>: <original error>"
118
+ fn context<S: Display>(self, ctx: S) -> Result<T>;
119
+
120
+ /// Add context with a closure that's only called on error; the impl below likewise yields a `Runtime` error
121
+ fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T>;
122
+ }
123
+
124
+ impl<T, E> ErrorContext<T> for std::result::Result<T, E>
125
+ where
126
+ E: Into<RubyAdapterError>,
127
+ {
128
+ fn context<S: Display>(self, ctx: S) -> Result<T> {
129
+ self.map_err(|e| {
130
+ let base_error = e.into();
131
+ RubyAdapterError::Runtime(format!("{}: {}", ctx, base_error)) // NOTE: the original variant is flattened into Runtime; only its message survives
132
+ })
133
+ }
134
+
135
+ fn with_context<S: Display, F: FnOnce() -> S>(self, f: F) -> Result<T> {
136
+ self.map_err(|e| {
137
+ let base_error = e.into();
138
+ RubyAdapterError::Runtime(format!("{}: {}", f(), base_error)) // f() runs only here, i.e. on the error path
139
+ })
140
+ }
141
+ }
142
+
143
+ /// Convert from MagnusError to RubyAdapterError, keeping only the error's string form in the `Ruby` variant
144
+ impl From<MagnusError> for RubyAdapterError {
145
+ fn from(err: MagnusError) -> Self {
146
+ RubyAdapterError::Ruby(err.to_string())
147
+ }
148
+ }
@@ -1,23 +1,21 @@
1
1
  use bytes::Bytes;
2
- use magnus::{
3
- value::{Opaque, ReprValue},
4
- RString, Ruby, Value,
5
- };
2
+ use magnus::value::{Opaque, ReprValue};
3
+ use magnus::{Error as MagnusError, RString, Ruby, Value};
4
+ use std::io::{Error as IoError, ErrorKind, Read, Seek, SeekFrom, Write};
5
+
6
6
  use parquet::{
7
7
  errors::ParquetError,
8
8
  file::reader::{ChunkReader, Length},
9
9
  };
10
- use std::{fs::File, rc::Rc, sync::Mutex};
10
+ use std::{fs::File, sync::Mutex};
11
11
  use std::{
12
- io::{self, BufReader, Read, Seek, SeekFrom, Write},
12
+ io::{self, BufReader},
13
13
  sync::Arc,
14
14
  };
15
15
 
16
- use crate::types::ParquetGemError;
17
-
18
16
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
19
17
  /// and provide a standard Read implementation for them.
20
- pub enum RubyReader {
18
+ pub enum RubyIOReader {
21
19
  String {
22
20
  inner: Opaque<RString>,
23
21
  offset: usize,
@@ -32,38 +30,35 @@ pub enum RubyReader {
32
30
 
33
31
  // Sending is technically not safe, but the only things that threatens to
34
32
  // do this is the parquet gem, and they don't seem to actually do it.
35
- unsafe impl Send for RubyReader {}
33
+ unsafe impl Send for RubyIOReader {}
36
34
 
37
- impl RubyReader {
38
- pub fn new(ruby: Rc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
39
- if RubyReader::is_seekable_io_like(&value) {
40
- Ok(RubyReader::RubyIoLike {
35
+ impl RubyIOReader {
36
+ pub fn new(value: Value) -> std::io::Result<Self> {
37
+ if RubyIOReader::is_seekable_io_like(&value) {
38
+ Ok(RubyIOReader::RubyIoLike {
41
39
  inner: Opaque::from(value),
42
40
  })
43
- } else if RubyReader::is_io_like(&value) {
44
- let mut temp_file = tempfile::tempfile()
45
- .map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
41
+ } else if RubyIOReader::is_io_like(&value) {
42
+ let mut temp_file = tempfile::tempfile()?;
46
43
 
47
44
  // This is safe, because we won't call seek
48
- let inner_readable = RubyReader::RubyIoLike {
45
+ let inner_readable = RubyIOReader::RubyIoLike {
49
46
  inner: Opaque::from(value),
50
47
  };
51
48
  let mut reader = BufReader::new(inner_readable);
52
- io::copy(&mut reader, &mut temp_file)
53
- .map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
54
- temp_file
55
- .seek(SeekFrom::Start(0))
56
- .map_err(|e| magnus::Error::new(ruby.exception_runtime_error(), e.to_string()))?;
49
+ io::copy(&mut reader, &mut temp_file)?;
50
+ temp_file.seek(SeekFrom::Start(0))?;
57
51
 
58
- Ok(RubyReader::NativeProxyIoLike {
52
+ Ok(RubyIOReader::NativeProxyIoLike {
59
53
  proxy_file: temp_file,
60
54
  })
61
55
  } else {
62
56
  // Try calling `to_str`, and if that fails, try `to_s`
63
57
  let string_content = value
64
58
  .funcall::<_, _, RString>("to_str", ())
65
- .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
66
- Ok(RubyReader::String {
59
+ .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))
60
+ .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e.to_string()))?;
61
+ Ok(RubyIOReader::String {
67
62
  inner: Opaque::from(string_content),
68
63
  offset: 0,
69
64
  })
@@ -82,12 +77,13 @@ impl RubyReader {
82
77
  }
83
78
  }
84
79
 
85
- impl Seek for RubyReader {
80
+ impl Seek for RubyIOReader {
86
81
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
87
- let ruby = unsafe { Ruby::get_unchecked() };
82
+ let ruby = Ruby::get()
83
+ .map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
88
84
  match self {
89
- RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
90
- RubyReader::String {
85
+ RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
86
+ RubyIOReader::String {
91
87
  inner,
92
88
  offset: original_offset,
93
89
  } => {
@@ -108,7 +104,7 @@ impl Seek for RubyReader {
108
104
  *original_offset = new_offset.min(unwrapped_inner.len());
109
105
  Ok(*original_offset as u64)
110
106
  }
111
- RubyReader::RubyIoLike { inner } => {
107
+ RubyIOReader::RubyIoLike { inner } => {
112
108
  let unwrapped_inner = ruby.get_inner(*inner);
113
109
 
114
110
  let (whence, ruby_offset) = match pos {
@@ -131,12 +127,13 @@ impl Seek for RubyReader {
131
127
  }
132
128
  }
133
129
 
134
- impl Read for RubyReader {
130
+ impl Read for RubyIOReader {
135
131
  fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
136
- let ruby = unsafe { Ruby::get_unchecked() };
132
+ let ruby = Ruby::get()
133
+ .map_err(|_| io::Error::new(io::ErrorKind::Other, "Failed to get Ruby runtime"))?;
137
134
  match self {
138
- RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
139
- RubyReader::String { inner, offset } => {
135
+ RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
136
+ RubyIOReader::String { inner, offset } => {
140
137
  let unwrapped_inner = ruby.get_inner(*inner);
141
138
 
142
139
  let string_buffer = unsafe { unwrapped_inner.as_slice() };
@@ -152,7 +149,7 @@ impl Read for RubyReader {
152
149
 
153
150
  Ok(copy_size)
154
151
  }
155
- RubyReader::RubyIoLike { inner } => {
152
+ RubyIOReader::RubyIoLike { inner } => {
156
153
  let unwrapped_inner = ruby.get_inner(*inner);
157
154
 
158
155
  let bytes = unwrapped_inner
@@ -172,16 +169,22 @@ impl Read for RubyReader {
172
169
  }
173
170
  }
174
171
 
175
- impl Length for RubyReader {
172
+ impl Length for RubyIOReader {
176
173
  fn len(&self) -> u64 {
177
- let ruby = unsafe { Ruby::get_unchecked() };
174
+ let ruby = match Ruby::get() {
175
+ Ok(r) => r,
176
+ Err(_) => {
177
+ eprintln!("Failed to get Ruby runtime in RubyIOReader::len");
178
+ return 0;
179
+ }
180
+ };
178
181
  match self {
179
- RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
180
- RubyReader::String { inner, offset: _ } => {
182
+ RubyIOReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
183
+ RubyIOReader::String { inner, offset: _ } => {
181
184
  let unwrapped_inner = ruby.get_inner(*inner);
182
185
  unwrapped_inner.len() as u64
183
186
  }
184
- RubyReader::RubyIoLike { inner } => {
187
+ RubyIOReader::RubyIoLike { inner } => {
185
188
  let unwrapped_inner = ruby.get_inner(*inner);
186
189
 
187
190
  // Get current position
@@ -222,12 +225,13 @@ impl Length for RubyReader {
222
225
  }
223
226
  };
224
227
 
225
- assert_eq!(
226
- current_pos, final_pos,
227
- "Failed to restore original position in seekable IO object. Started at position {}, but ended at {}",
228
- current_pos,
229
- final_pos
230
- );
228
+ if current_pos != final_pos {
229
+ eprintln!(
230
+ "Failed to restore original position in seekable IO object. Started at position {}, but ended at {}",
231
+ current_pos,
232
+ final_pos
233
+ );
234
+ }
231
235
 
232
236
  size
233
237
  }
@@ -238,21 +242,28 @@ impl Length for RubyReader {
238
242
  const READ_BUFFER_SIZE: usize = 16 * 1024;
239
243
 
240
244
  #[derive(Clone)]
241
- pub struct ThreadSafeRubyReader(Arc<Mutex<RubyReader>>);
245
+ pub struct ThreadSafeRubyIOReader(Arc<Mutex<RubyIOReader>>);
242
246
 
243
- impl ThreadSafeRubyReader {
244
- pub fn new(reader: RubyReader) -> Self {
247
+ impl ThreadSafeRubyIOReader {
248
+ pub fn new(reader: RubyIOReader) -> Self {
245
249
  Self(Arc::new(Mutex::new(reader)))
246
250
  }
247
251
  }
248
252
 
249
- impl Length for ThreadSafeRubyReader {
253
+ impl Length for ThreadSafeRubyIOReader {
250
254
  fn len(&self) -> u64 {
251
- self.0.lock().expect("Failed to lock mutex").len()
255
+ match self.0.lock() {
256
+ Ok(reader) => reader.len(),
257
+ Err(_) => {
258
+ // If the mutex is poisoned, we can't recover, return 0
259
+ eprintln!("Failed to lock mutex in ThreadSafeRubyIOReader::len");
260
+ 0
261
+ }
262
+ }
252
263
  }
253
264
  }
254
265
 
255
- impl Seek for ThreadSafeRubyReader {
266
+ impl Seek for ThreadSafeRubyIOReader {
256
267
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
257
268
  let mut reader = self
258
269
  .0
@@ -262,7 +273,7 @@ impl Seek for ThreadSafeRubyReader {
262
273
  }
263
274
  }
264
275
 
265
- impl Read for ThreadSafeRubyReader {
276
+ impl Read for ThreadSafeRubyIOReader {
266
277
  fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
267
278
  let mut reader = self
268
279
  .0
@@ -272,8 +283,8 @@ impl Read for ThreadSafeRubyReader {
272
283
  }
273
284
  }
274
285
 
275
- impl ChunkReader for ThreadSafeRubyReader {
276
- type T = BufReader<ThreadSafeRubyReader>;
286
+ impl ChunkReader for ThreadSafeRubyIOReader {
287
+ type T = BufReader<ThreadSafeRubyIOReader>;
277
288
 
278
289
  fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
279
290
  let mut reader = self.clone();
@@ -296,3 +307,126 @@ impl ChunkReader for ThreadSafeRubyReader {
296
307
  Ok(buffer.into())
297
308
  }
298
309
  }
310
+
311
+ /// Adapter for Ruby IO objects that implements std::io::Write by calling the wrapped object's `write` and `flush` methods
312
+ pub struct RubyIOWriter {
313
+ io: Value,
314
+ }
315
+
316
+ impl RubyIOWriter {
317
+ pub fn new(io: Value) -> Self {
318
+ Self { io }
319
+ }
320
+ }
321
+
322
+ impl Write for RubyIOWriter {
323
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
324
+ // Call Ruby IO#write method; this first needs a handle to the Ruby runtime
325
+ let ruby = Ruby::get().map_err(|e| {
326
+ IoError::new(
327
+ ErrorKind::Other,
328
+ format!("Failed to get Ruby runtime: {}", e),
329
+ )
330
+ })?;
331
+
332
+ if buf.is_empty() { // nothing to send: skip the Ruby call
333
+ return Ok(0);
334
+ }
335
+
336
+ // Convert bytes to Ruby string
337
+ let ruby_string = ruby.str_from_slice(buf);
338
+
339
+ // Call io.write(string); its return value is used as the number of bytes written
340
+ let result: Result<usize, MagnusError> = self.io.funcall("write", (ruby_string,));
341
+
342
+ match result {
343
+ Ok(bytes_written) => Ok(bytes_written),
344
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
345
+ }
346
+ }
347
+
348
+ fn flush(&mut self) -> std::io::Result<()> {
349
+ // Call Ruby IO#flush method; its return value is ignored, a Ruby error becomes an `Other` IO error
350
+ let result: Result<Value, MagnusError> = self.io.funcall("flush", ());
351
+
352
+ match result {
353
+ Ok(_) => Ok(()),
354
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
355
+ }
356
+ }
357
+ }
358
+
359
+ /// Wrapper that implements Read, Write and Seek for Ruby IO objects by delegating to RubyIOReader / RubyIOWriter
360
+ pub struct RubyIO {
361
+ io: Value,
362
+ }
363
+
364
+ impl RubyIO {
365
+ pub fn new(io: Value) -> Self {
366
+ Self { io }
367
+ }
368
+ }
369
+
370
+ impl Read for RubyIO {
371
+ fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
372
+ RubyIOReader::new(self.io)?.read(buf) // NOTE(review): builds a new RubyIOReader per call; RubyIOReader::new starts String inputs at offset 0 and copies non-seekable IO to a temp file — confirm repeated reads behave as intended
373
+ }
374
+ }
375
+
376
+ impl Write for RubyIO {
377
+ fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
378
+ RubyIOWriter::new(self.io).write(buf)
379
+ }
380
+
381
+ fn flush(&mut self) -> std::io::Result<()> {
382
+ RubyIOWriter::new(self.io).flush()
383
+ }
384
+ }
385
+
386
+ impl Seek for RubyIO {
387
+ fn seek(&mut self, pos: SeekFrom) -> std::io::Result<u64> {
388
+ RubyIOReader::new(self.io)?.seek(pos) // NOTE(review): the temporary reader is dropped after the call, so any offset it tracks on the Rust side is not kept — confirm
389
+ }
390
+ }
391
+
392
+ /// Check if a Ruby value responds to IO methods: true when it responds to `read` or to `write` (either one suffices)
393
+ pub fn is_io_like(value: Value) -> bool {
394
+ // Check if the object responds to read/write methods; a failed `respond_to?` call is treated like "does not respond"
395
+ let responds_to_read: Result<bool, MagnusError> = value.funcall("respond_to?", ("read",));
396
+ let responds_to_write: Result<bool, MagnusError> = value.funcall("respond_to?", ("write",));
397
+
398
+ matches!(
399
+ (responds_to_read, responds_to_write),
400
+ (Ok(true), _) | (_, Ok(true))
401
+ )
402
+ }
403
+
404
+ /// Create a reader from a Ruby IO-like object; fails with `InvalidInput` if it lacks `read`, or `Other` if the `respond_to?` call itself errors
405
+ pub fn create_reader(io: Value) -> std::io::Result<RubyIOReader> {
406
+ // Verify it has a read method before handing the value to RubyIOReader::new
407
+ let responds_to_read: Result<bool, MagnusError> = io.funcall("respond_to?", ("read",));
408
+
409
+ match responds_to_read {
410
+ Ok(true) => RubyIOReader::new(io),
411
+ Ok(false) => Err(IoError::new(
412
+ ErrorKind::InvalidInput,
413
+ "Object does not respond to 'read' method",
414
+ )),
415
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
416
+ }
417
+ }
418
+
419
+ /// Create a writer from a Ruby IO-like object; fails with `InvalidInput` if it lacks `write`, or `Other` if the `respond_to?` call itself errors
420
+ pub fn create_writer(io: Value) -> std::io::Result<RubyIOWriter> {
421
+ // Verify it has a write method before wrapping the value in a RubyIOWriter
422
+ let responds_to_write: Result<bool, MagnusError> = io.funcall("respond_to?", ("write",));
423
+
424
+ match responds_to_write {
425
+ Ok(true) => Ok(RubyIOWriter::new(io)),
426
+ Ok(false) => Err(IoError::new(
427
+ ErrorKind::InvalidInput,
428
+ "Object does not respond to 'write' method",
429
+ )),
430
+ Err(e) => Err(IoError::new(ErrorKind::Other, e.to_string())),
431
+ }
432
+ }
@@ -0,0 +1,90 @@
1
+ //! Ruby-specific adapter for parquet-core
2
+ //!
3
+ //! This crate provides Ruby-specific implementations of the parquet-core traits,
4
+ //! enabling seamless integration between Ruby and the core Parquet functionality.
5
+ //!
6
+ //! # Overview
7
+ //!
8
+ //! The adapter implements three main components:
9
+ //!
10
+ //! ## Value Conversion
11
+ //!
12
+ //! The [`RubyValueConverter`] implements the `ValueConverter` trait to handle
13
+ //! conversions between Ruby values (via Magnus) and Parquet values:
14
+ //!
15
+ //! - Ruby integers ↔ Parquet int types
16
+ //! - Ruby floats ↔ Parquet float/double
17
+ //! - Ruby strings ↔ Parquet strings/binary
18
+ //! - Ruby BigDecimal ↔ Parquet decimal types
19
+ //! - Ruby Time/DateTime ↔ Parquet temporal types
20
+ //! - Ruby arrays/hashes ↔ Parquet lists/maps/structs
21
+ //!
22
+ //! ## I/O Operations
23
+ //!
24
+ //! The I/O module provides [`RubyIOReader`] and [`RubyIOWriter`], which adapt Ruby IO
25
+ //! objects to Rust's `std::io` traits (`Read`/`Seek` and `Write`, respectively):
26
+ //!
27
+ //! - File objects
28
+ //! - StringIO for in-memory operations
29
+ //! - Any Ruby object implementing read/write/seek methods
30
+ //!
31
+ //! ## Schema Conversion
32
+ //!
33
+ //! Schema utilities for converting between Ruby schema representations and
34
+ //! parquet-core's schema types:
35
+ //!
36
+ //! - Legacy hash-based schemas
37
+ //! - New DSL-based schemas
38
+ //! - Automatic type inference from data
39
+
40
+ pub mod error;
41
+ pub use error::{ErrorContext, IntoMagnusError, Result, RubyAdapterError};
42
+
43
+ pub mod chunk_reader;
44
+ pub use chunk_reader::CloneableChunkReader;
45
+
46
+ pub mod converter;
47
+ pub use converter::RubyValueConverter;
48
+
49
+ pub mod io;
50
+ pub use io::{create_reader, is_io_like, RubyIO, RubyIOReader, RubyIOWriter};
51
+
52
+ pub mod logger;
53
+ pub use logger::RubyLogger;
54
+
55
+ pub mod schema;
56
+ pub use schema::{
57
+ convert_legacy_schema, extract_field_schemas, is_dsl_schema, parquet_schema_to_ruby,
58
+ process_schema_value, ruby_schema_to_parquet, RubySchemaBuilder,
59
+ };
60
+
61
+ pub mod string_cache;
62
+ pub use string_cache::StringCache;
63
+
64
+ pub mod batch_manager;
65
+ pub use batch_manager::{
66
+ BatchSizeManager, DEFAULT_MEMORY_THRESHOLD, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE, SAMPLE_SIZE,
67
+ };
68
+
69
+ pub mod metadata;
70
+ pub use metadata::{parse_metadata, RubyParquetMetaData};
71
+
72
+ pub mod types;
73
+ pub use types::{
74
+ ColumnEnumeratorArgs, ParquetWriteArgs, ParserResultType, RowEnumeratorArgs, WriterOutput,
75
+ };
76
+
77
+ pub mod utils;
78
+ pub use utils::{
79
+ create_column_enumerator, create_row_enumerator, estimate_parquet_value_size,
80
+ estimate_row_size, handle_block_or_enum, parse_compression, parse_parquet_write_args,
81
+ };
82
+
83
+ pub mod reader;
84
+ pub use reader::{each_column, each_row};
85
+
86
+ pub mod writer;
87
+ pub use writer::{create_writer, finalize_writer, write_columns, write_rows};
88
+
89
+ pub mod try_into_value;
90
+ pub use try_into_value::TryIntoValue;
@@ -0,0 +1,64 @@
1
+ use magnus::value::ReprValue;
2
+ use magnus::{Error as MagnusError, Value};
3
+
4
+ pub struct RubyLogger {
5
+ logger: Option<Value>, // None disables logging: each log method then returns Ok(()) without calling Ruby
6
+ }
7
+
8
+ impl RubyLogger {
9
+ pub fn new(logger: Option<Value>) -> Result<Self, MagnusError> {
10
+ // Validate logger has required methods if provided: it must respond to debug, info, warn and error, else an argument error is returned
11
+ if let Some(ref log) = logger {
12
+ for method in &["debug", "info", "warn", "error"] {
13
+ if !log.respond_to(*method, false)? { // NOTE(review): `false` is presumably magnus's include-private flag — confirm
14
+ return Err(MagnusError::new(
15
+ magnus::exception::arg_error(),
16
+ format!("Logger must respond to {}", method),
17
+ ));
18
+ }
19
+ }
20
+ }
21
+ Ok(Self { logger })
22
+ }
23
+
24
+ pub fn debug<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
25
+ if let Some(ref logger) = self.logger {
26
+ logger.funcall::<_, _, Value>("debug", (msg_fn(),))?; // msg_fn runs only when a logger is present
27
+ }
28
+ Ok(())
29
+ }
30
+
31
+ pub fn info<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
32
+ if let Some(ref logger) = self.logger {
33
+ logger.funcall::<_, _, Value>("info", (msg_fn(),))?; // msg_fn runs only when a logger is present
34
+ }
35
+ Ok(())
36
+ }
37
+
38
+ pub fn warn<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
39
+ if let Some(ref logger) = self.logger {
40
+ logger.funcall::<_, _, Value>("warn", (msg_fn(),))?; // msg_fn runs only when a logger is present
41
+ }
42
+ Ok(())
43
+ }
44
+
45
+ pub fn error<F: FnOnce() -> String>(&self, msg_fn: F) -> Result<(), MagnusError> {
46
+ if let Some(ref logger) = self.logger {
47
+ logger.funcall::<_, _, Value>("error", (msg_fn(),))?; // msg_fn runs only when a logger is present
48
+ }
49
+ Ok(())
50
+ }
51
+
52
+ pub fn inner(&self) -> Option<Value> {
53
+ self.logger
54
+ }
55
+ }
56
+
57
+ // Make RubyLogger cloneable for passing to multiple functions; a clone holds the same optional logger value
58
+ impl Clone for RubyLogger {
59
+ fn clone(&self) -> Self {
60
+ Self {
61
+ logger: self.logger,
62
+ }
63
+ }
64
+ }