parquet 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 252c96a0dc96337e1d64514ee24ec0a470621fd25b5e1497de75f666e060e32e
4
- data.tar.gz: 38749e71de16f448404bedf5f316a7b1cebbf51121bff05b76e83b50e7081968
3
+ metadata.gz: 0d72c16371c10a011af5118f2915de9bbeb33cde133369bdac2050e3c035572e
4
+ data.tar.gz: b39c6ec9a8232eca5b5b156bf28992ed59c05e9a36e4c13db2b8933a74485ba0
5
5
  SHA512:
6
- metadata.gz: 5b4cc0e162f2f823e127167d93963b778c56f0ebaa1deda6bee87f5ff453b36cad1c7a77e6e53f35fde6c24492fcb6d002da0dab8749afcafd4ee7e084e81042
7
- data.tar.gz: 39112b58e3c859b589a0b89d9ff59b18294c336c0d43357c5859a10153e5345b4fdc79cb1e1b09e5649441bb957c8c45506c3818bd1b13aaec8c5fa63a1415a8
6
+ metadata.gz: c7f338b1d010fa59c2344065b233ff20a08d4a17c6ca987ef72677150dd1cbf55d134855585d68e187b748dc5121f13d5e86cb82aabc1eeb3562a3326aca459c
7
+ data.tar.gz: 69eaa6b133123944138a826612a7b48d9f87acb202ecbe172e253be02a1a1c7009e3d7182e8bb31ae423098bc34bb5dddc4ce042453f0d1cb41505d56d02c21e
data/README.md CHANGED
@@ -294,7 +294,7 @@ The Schema DSL supports:
294
294
  - **Complex types**: Structs, lists, and maps with arbitrary nesting
295
295
  - **Nullability control**: Specify which fields can contain null values with `nullable: false/true`
296
296
  - **List item nullability**: Control whether list items can be null with `item_nullable: false/true`
297
- - **Map key/value nullability**: Control whether map keys or values can be null with `key_nullable: false/true` and `value_nullable: false/true`
297
+ - **Map value nullability**: Control whether map values can be null with `value_nullable: false/true`
298
298
 
299
299
  Note: When using List and Map types, you need to provide at least:
300
300
  - For lists: The `item:` parameter specifying the item type
@@ -6,10 +6,7 @@
6
6
  /// so this optimization could be removed if any issues arise.
7
7
  use std::{
8
8
  collections::HashMap,
9
- sync::{
10
- atomic::{AtomicU32, Ordering},
11
- LazyLock, Mutex,
12
- },
9
+ sync::{LazyLock, Mutex},
13
10
  };
14
11
 
15
12
  use magnus::{IntoValue, RString, Ruby, Value};
@@ -24,7 +21,7 @@ pub enum CacheError {
24
21
  RStringConversion(String),
25
22
  }
26
23
 
27
- static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
24
+ static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, StringCacheKey>>> =
28
25
  LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
29
26
 
30
27
  pub struct StringCache;
@@ -84,18 +81,16 @@ impl StringCache {
84
81
  pub fn intern_many<AsStr: AsRef<str>>(
85
82
  strings: &[AsStr],
86
83
  ) -> Result<Vec<StringCacheKey>, CacheError> {
87
- let mut cache = STRING_CACHE
84
+ let cache = STRING_CACHE
88
85
  .lock()
89
86
  .map_err(|e| CacheError::LockError(e.to_string()))?;
90
87
 
91
88
  let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
92
89
  for string in strings {
93
- if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
94
- counter.fetch_add(1, Ordering::Relaxed);
90
+ if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) {
95
91
  result.push(*interned_string);
96
92
  } else {
97
93
  let interned = StringCacheKey::new(string.as_ref())?;
98
- cache.insert(interned.0, (interned, AtomicU32::new(1)));
99
94
  result.push(interned);
100
95
  }
101
96
  }
@@ -5,7 +5,7 @@ use std::str::FromStr;
5
5
 
6
6
  use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
7
7
 
8
- use crate::{reader::ReaderError, utils::parse_string_or_symbol};
8
+ use crate::{types::ParquetGemError, utils::parse_string_or_symbol};
9
9
 
10
10
  /// Severity levels that match Ruby's Logger levels
11
11
  #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -45,7 +45,7 @@ pub struct RubyLogger {
45
45
 
46
46
  #[allow(dead_code)]
47
47
  impl RubyLogger {
48
- pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ReaderError> {
48
+ pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ParquetGemError> {
49
49
  let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
50
50
  .unwrap_or_else(|_| "warn".to_string())
51
51
  .parse::<LogLevel>()
@@ -8,32 +8,29 @@ use std::fs::File;
8
8
  use std::sync::Arc;
9
9
 
10
10
  use magnus::value::ReprValue;
11
- use magnus::{Error as MagnusError, Value};
11
+ use magnus::{Error as MagnusError, Ruby, Value};
12
12
 
13
13
  use crate::header_cache::StringCache;
14
14
  use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
15
- use crate::types::TryIntoValue;
15
+ use crate::types::{ParquetGemError, TryIntoValue};
16
16
  use crate::ColumnRecord;
17
17
 
18
- use super::ReaderError;
19
-
20
18
  /// Opens a parquet file or IO-like object for reading
21
19
  ///
22
20
  /// This function handles both file paths (as strings) and IO-like objects,
23
21
  /// returning either a File or a ThreadSafeRubyReader that can be used with
24
22
  /// parquet readers.
25
23
  pub fn open_parquet_source(
24
+ ruby: Arc<Ruby>,
26
25
  to_read: Value,
27
- ) -> Result<Either<File, ThreadSafeRubyReader>, ReaderError> {
28
- let ruby = unsafe { magnus::Ruby::get_unchecked() };
29
-
26
+ ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
30
27
  if to_read.is_kind_of(ruby.class_string()) {
31
28
  let path_string = to_read.to_r_string()?;
32
29
  let file_path = unsafe { path_string.as_str()? };
33
- let file = File::open(file_path).map_err(ReaderError::from)?;
30
+ let file = File::open(file_path).map_err(ParquetGemError::from)?;
34
31
  Ok(Either::Left(file))
35
32
  } else {
36
- let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
33
+ let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
37
34
  Ok(Either::Right(readable))
38
35
  }
39
36
  }
@@ -60,9 +57,9 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
60
57
  reader: T,
61
58
  columns: &Option<Vec<String>>,
62
59
  batch_size: Option<usize>,
63
- ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ReaderError> {
64
- let mut builder =
65
- ParquetRecordBatchReaderBuilder::try_new(reader).map_err(|e| ReaderError::Parquet(e))?;
60
+ ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
61
+ let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
62
+ .map_err(|e| ParquetGemError::Parquet(e))?;
66
63
 
67
64
  let schema = builder.schema().clone();
68
65
  let num_rows = builder.metadata().file_metadata().num_rows();
@@ -81,7 +78,7 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
81
78
  builder = builder.with_batch_size(batch_size);
82
79
  }
83
80
 
84
- let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
81
+ let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
85
82
  Ok((reader, schema, num_rows))
86
83
  }
87
84
 
@@ -91,7 +88,7 @@ pub fn handle_empty_file(
91
88
  ruby: &magnus::Ruby,
92
89
  schema: &Arc<Schema>,
93
90
  num_rows: i64,
94
- ) -> Result<bool, ReaderError> {
91
+ ) -> Result<bool, ParquetGemError> {
95
92
  if num_rows == 0 {
96
93
  let mut map =
97
94
  HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
@@ -101,7 +98,7 @@ pub fn handle_empty_file(
101
98
  .map(|field| field.name().to_string())
102
99
  .collect();
103
100
  let interned_headers =
104
- StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
101
+ StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
105
102
  for field in interned_headers.iter() {
106
103
  map.insert(*field, vec![]);
107
104
  }
@@ -2,61 +2,5 @@ mod common;
2
2
  mod parquet_column_reader;
3
3
  mod parquet_row_reader;
4
4
 
5
- use std::io;
6
-
7
- use magnus::Error as MagnusError;
8
- use thiserror::Error;
9
-
10
- use crate::header_cache::CacheError;
11
5
  pub use parquet_column_reader::parse_parquet_columns;
12
6
  pub use parquet_row_reader::parse_parquet_rows;
13
-
14
- #[derive(Error, Debug)]
15
- pub enum ReaderError {
16
- #[error("Failed to open file: {0}")]
17
- FileOpen(#[from] io::Error),
18
- #[error("Failed to intern headers: {0}")]
19
- HeaderIntern(#[from] CacheError),
20
- #[error("Ruby error: {0}")]
21
- Ruby(#[from] MagnusErrorWrapper),
22
- #[error("Parquet error: {0}")]
23
- Parquet(#[from] parquet::errors::ParquetError),
24
- #[error("Arrow error: {0}")]
25
- Arrow(#[from] arrow_schema::ArrowError),
26
- #[error("UTF-8 error: {0}")]
27
- Utf8Error(#[from] simdutf8::basic::Utf8Error),
28
- #[error("Jiff error: {0}")]
29
- Jiff(#[from] jiff::Error),
30
- }
31
-
32
- #[derive(Debug)]
33
- pub struct MagnusErrorWrapper(pub MagnusError);
34
-
35
- impl From<MagnusError> for MagnusErrorWrapper {
36
- fn from(err: MagnusError) -> Self {
37
- Self(err)
38
- }
39
- }
40
-
41
- impl std::fmt::Display for MagnusErrorWrapper {
42
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
43
- write!(f, "{}", self.0)
44
- }
45
- }
46
-
47
- impl std::error::Error for MagnusErrorWrapper {}
48
-
49
- impl From<MagnusError> for ReaderError {
50
- fn from(err: MagnusError) -> Self {
51
- Self::Ruby(MagnusErrorWrapper(err))
52
- }
53
- }
54
-
55
- impl Into<MagnusError> for ReaderError {
56
- fn into(self) -> MagnusError {
57
- match self {
58
- Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
59
- _ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
60
- }
61
- }
62
- }
@@ -1,6 +1,6 @@
1
1
  use crate::header_cache::StringCache;
2
2
  use crate::logger::RubyLogger;
3
- use crate::types::{ArrayWrapper, TryIntoValue};
3
+ use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
4
4
  use crate::{
5
5
  create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
6
6
  ParserResultType,
@@ -10,25 +10,29 @@ use either::Either;
10
10
  use magnus::IntoValue;
11
11
  use magnus::{Error as MagnusError, Ruby, Value};
12
12
  use std::collections::HashMap;
13
- use std::sync::OnceLock;
13
+ use std::sync::{Arc, OnceLock};
14
14
 
15
15
  use super::common::{
16
16
  create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
17
17
  };
18
- use super::ReaderError;
19
18
 
20
19
  #[inline]
21
20
  pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
22
- Ok(parse_parquet_columns_impl(rb_self, args).map_err(|e| {
23
- let z: MagnusError = e.into();
24
- z
25
- })?)
21
+ let ruby = unsafe { Ruby::get_unchecked() };
22
+ Ok(
23
+ parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
24
+ let z: MagnusError = e.into();
25
+ z
26
+ })?,
27
+ )
26
28
  }
27
29
 
28
30
  #[inline]
29
- fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value, ReaderError> {
30
- let ruby = unsafe { Ruby::get_unchecked() };
31
-
31
+ fn parse_parquet_columns_impl<'a>(
32
+ ruby: Arc<Ruby>,
33
+ rb_self: Value,
34
+ args: &[Value],
35
+ ) -> Result<Value, ParquetGemError> {
32
36
  let ParquetColumnsArgs {
33
37
  to_read,
34
38
  result_type,
@@ -63,7 +67,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
63
67
  return Ok(enum_value);
64
68
  }
65
69
 
66
- let source = open_parquet_source(to_read)?;
70
+ let source = open_parquet_source(ruby.clone(), to_read)?;
67
71
 
68
72
  // Use the common function to create the batch reader
69
73
 
@@ -82,7 +86,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
82
86
  let headers = OnceLock::new();
83
87
  let headers_clone = headers.clone();
84
88
  let iter = batch_reader.map(move |batch| {
85
- batch.map_err(ReaderError::Arrow).and_then(|batch| {
89
+ batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
86
90
  let local_headers = headers_clone
87
91
  .get_or_init(|| {
88
92
  let schema = batch.schema();
@@ -94,7 +98,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
94
98
  StringCache::intern_many(&header_string)
95
99
  })
96
100
  .as_ref()
97
- .map_err(|e| ReaderError::HeaderIntern(e.clone()))?;
101
+ .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
98
102
 
99
103
  let mut map = HashMap::with_capacity_and_hasher(
100
104
  local_headers.len(),
@@ -112,7 +116,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
112
116
  strict: strict,
113
117
  })?;
114
118
  map.insert(header, values.into_inner());
115
- Ok::<_, ReaderError>(())
119
+ Ok::<_, ParquetGemError>(())
116
120
  })?;
117
121
 
118
122
  Ok(ColumnRecord::Map::<RandomState>(map))
@@ -126,7 +130,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
126
130
  }
127
131
  ParserResultType::Array => {
128
132
  let iter = batch_reader.map(|batch| {
129
- batch.map_err(ReaderError::Arrow).and_then(|batch| {
133
+ batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
130
134
  let vec = batch
131
135
  .columns()
132
136
  .into_iter()
@@ -135,7 +139,7 @@ fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Valu
135
139
  array: &*column,
136
140
  strict: strict,
137
141
  })?;
138
- Ok::<_, ReaderError>(values.into_inner())
142
+ Ok::<_, ParquetGemError>(values.into_inner())
139
143
  })
140
144
  .collect::<Result<Vec<_>, _>>()?;
141
145
  Ok(ColumnRecord::Vec::<RandomState>(vec))
@@ -2,7 +2,7 @@ use crate::header_cache::StringCache;
2
2
  use crate::logger::RubyLogger;
3
3
  use crate::types::TryIntoValue;
4
4
  use crate::{
5
- create_row_enumerator, utils::*, ParquetField, ParserResultType, ReaderError,
5
+ create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
6
6
  RowEnumeratorArgs, RowRecord,
7
7
  };
8
8
  use ahash::RandomState;
@@ -13,22 +13,27 @@ use parquet::file::reader::{FileReader, SerializedFileReader};
13
13
  use parquet::record::reader::RowIter as ParquetRowIter;
14
14
  use parquet::schema::types::{Type as SchemaType, TypePtr};
15
15
  use std::collections::HashMap;
16
- use std::sync::OnceLock;
16
+ use std::sync::{Arc, OnceLock};
17
17
 
18
18
  use super::common::{handle_block_or_enum, open_parquet_source};
19
19
 
20
20
  #[inline]
21
21
  pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
22
- Ok(parse_parquet_rows_impl(rb_self, args).map_err(|e| {
23
- let z: MagnusError = e.into();
24
- z
25
- })?)
22
+ let ruby = unsafe { Ruby::get_unchecked() };
23
+ Ok(
24
+ parse_parquet_rows_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
25
+ let z: MagnusError = e.into();
26
+ z
27
+ })?,
28
+ )
26
29
  }
27
30
 
28
31
  #[inline]
29
- fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value, ReaderError> {
30
- let ruby = unsafe { Ruby::get_unchecked() };
31
-
32
+ fn parse_parquet_rows_impl<'a>(
33
+ ruby: Arc<Ruby>,
34
+ rb_self: Value,
35
+ args: &[Value],
36
+ ) -> Result<Value, ParquetGemError> {
32
37
  let ParquetRowsArgs {
33
38
  to_read,
34
39
  result_type,
@@ -58,11 +63,13 @@ fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value,
58
63
  return Ok(enum_value);
59
64
  }
60
65
 
61
- let source = open_parquet_source(to_read)?;
66
+ let source = open_parquet_source(ruby.clone(), to_read)?;
62
67
  let reader: Box<dyn FileReader> = match source {
63
- Either::Left(file) => Box::new(SerializedFileReader::new(file).map_err(ReaderError::from)?),
68
+ Either::Left(file) => {
69
+ Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
70
+ }
64
71
  Either::Right(readable) => {
65
- Box::new(SerializedFileReader::new(readable).map_err(ReaderError::from)?)
72
+ Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
66
73
  }
67
74
  };
68
75
 
@@ -109,7 +116,7 @@ fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value,
109
116
  Ok(map)
110
117
  })
111
118
  .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
112
- .map_err(|e| ReaderError::from(e))
119
+ .map_err(|e| ParquetGemError::from(e))
113
120
  });
114
121
 
115
122
  for result in iter {
@@ -128,7 +135,7 @@ fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value,
128
135
  Ok(vec)
129
136
  })
130
137
  .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
131
- .map_err(|e| ReaderError::from(e))
138
+ .map_err(|e| ParquetGemError::from(e))
132
139
  });
133
140
 
134
141
  for result in iter {
@@ -13,14 +13,18 @@ use std::{
13
13
  sync::Arc,
14
14
  };
15
15
 
16
+ use crate::types::ParquetGemError;
17
+
16
18
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
17
19
  /// and provide a standard Read implementation for them.
18
20
  pub enum RubyReader {
19
21
  String {
22
+ ruby: Arc<Ruby>,
20
23
  inner: Opaque<RString>,
21
24
  offset: usize,
22
25
  },
23
26
  RubyIoLike {
27
+ ruby: Arc<Ruby>,
24
28
  inner: Opaque<Value>,
25
29
  },
26
30
  NativeProxyIoLike {
@@ -28,26 +32,15 @@ pub enum RubyReader {
28
32
  },
29
33
  }
30
34
 
31
- impl RubyReader {
32
- fn is_io_like(value: &Value) -> bool {
33
- value.respond_to("read", false).unwrap_or(false)
34
- }
35
-
36
- // For now, don't use this. Having to use seek in length is scary.
37
- fn is_seekable_io_like(value: &Value) -> bool {
38
- Self::is_io_like(value)
39
- && value.respond_to("seek", false).unwrap_or(false)
40
- && value.respond_to("pos", false).unwrap_or(false)
41
- }
42
- }
43
-
44
- impl TryFrom<Value> for RubyReader {
45
- type Error = magnus::Error;
35
+ // Sending is technically not safe, but the only thing that threatens to
36
+ // do this is the parquet gem, and it doesn't seem to actually do it.
37
+ unsafe impl Send for RubyReader {}
46
38
 
47
- fn try_from(value: Value) -> Result<Self, Self::Error> {
48
- let ruby = unsafe { Ruby::get_unchecked() };
39
+ impl RubyReader {
40
+ pub fn new(ruby: Arc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
49
41
  if RubyReader::is_seekable_io_like(&value) {
50
42
  Ok(RubyReader::RubyIoLike {
43
+ ruby,
51
44
  inner: Opaque::from(value),
52
45
  })
53
46
  } else if RubyReader::is_io_like(&value) {
@@ -56,6 +49,7 @@ impl TryFrom<Value> for RubyReader {
56
49
 
57
50
  // This is safe, because we won't call seek
58
51
  let inner_readable = RubyReader::RubyIoLike {
52
+ ruby: ruby.clone(),
59
53
  inner: Opaque::from(value),
60
54
  };
61
55
  let mut reader = BufReader::new(inner_readable);
@@ -74,19 +68,31 @@ impl TryFrom<Value> for RubyReader {
74
68
  .funcall::<_, _, RString>("to_str", ())
75
69
  .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
76
70
  Ok(RubyReader::String {
71
+ ruby,
77
72
  inner: Opaque::from(string_content),
78
73
  offset: 0,
79
74
  })
80
75
  }
81
76
  }
77
+
78
+ fn is_io_like(value: &Value) -> bool {
79
+ value.respond_to("read", false).unwrap_or(false)
80
+ }
81
+
82
+ // For now, don't use this. Having to use seek in length is scary.
83
+ fn is_seekable_io_like(value: &Value) -> bool {
84
+ Self::is_io_like(value)
85
+ && value.respond_to("seek", false).unwrap_or(false)
86
+ && value.respond_to("pos", false).unwrap_or(false)
87
+ }
82
88
  }
83
89
 
84
90
  impl Seek for RubyReader {
85
91
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
86
- let ruby = unsafe { Ruby::get_unchecked() };
87
92
  match self {
88
93
  RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
89
94
  RubyReader::String {
95
+ ruby,
90
96
  inner,
91
97
  offset: original_offset,
92
98
  } => {
@@ -107,7 +113,7 @@ impl Seek for RubyReader {
107
113
  *original_offset = new_offset.min(unwrapped_inner.len());
108
114
  Ok(*original_offset as u64)
109
115
  }
110
- RubyReader::RubyIoLike { inner } => {
116
+ RubyReader::RubyIoLike { ruby, inner } => {
111
117
  let unwrapped_inner = ruby.get_inner(*inner);
112
118
 
113
119
  let (whence, ruby_offset) = match pos {
@@ -132,10 +138,13 @@ impl Seek for RubyReader {
132
138
 
133
139
  impl Read for RubyReader {
134
140
  fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
135
- let ruby = unsafe { Ruby::get_unchecked() };
136
141
  match self {
137
142
  RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
138
- RubyReader::String { inner, offset } => {
143
+ RubyReader::String {
144
+ ruby,
145
+ inner,
146
+ offset,
147
+ } => {
139
148
  let unwrapped_inner = ruby.get_inner(*inner);
140
149
 
141
150
  let string_buffer = unsafe { unwrapped_inner.as_slice() };
@@ -151,7 +160,7 @@ impl Read for RubyReader {
151
160
 
152
161
  Ok(copy_size)
153
162
  }
154
- RubyReader::RubyIoLike { inner } => {
163
+ RubyReader::RubyIoLike { ruby, inner } => {
155
164
  let unwrapped_inner = ruby.get_inner(*inner);
156
165
 
157
166
  let bytes = unwrapped_inner
@@ -175,14 +184,17 @@ impl Read for RubyReader {
175
184
 
176
185
  impl Length for RubyReader {
177
186
  fn len(&self) -> u64 {
178
- let ruby = unsafe { Ruby::get_unchecked() };
179
187
  match self {
180
188
  RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
181
- RubyReader::String { inner, offset: _ } => {
189
+ RubyReader::String {
190
+ ruby,
191
+ inner,
192
+ offset: _,
193
+ } => {
182
194
  let unwrapped_inner = ruby.get_inner(*inner);
183
195
  unwrapped_inner.len() as u64
184
196
  }
185
- RubyReader::RubyIoLike { inner } => {
197
+ RubyReader::RubyIoLike { ruby, inner } => {
186
198
  let unwrapped_inner = ruby.get_inner(*inner);
187
199
 
188
200
  // Get current position
@@ -62,22 +62,7 @@ pub struct StructField<'a> {
62
62
 
63
63
  #[derive(Clone, Debug)]
64
64
  pub enum ParquetSchemaType<'a> {
65
- Int8,
66
- Int16,
67
- Int32,
68
- Int64,
69
- UInt8,
70
- UInt16,
71
- UInt32,
72
- UInt64,
73
- Float,
74
- Double,
75
- String,
76
- Binary,
77
- Boolean,
78
- Date32,
79
- TimestampMillis,
80
- TimestampMicros,
65
+ Primitive(PrimitiveType),
81
66
  List(Box<ListField<'a>>),
82
67
  Map(Box<MapField<'a>>),
83
68
  Struct(Box<StructField<'a>>),
@@ -110,7 +95,7 @@ pub enum SchemaNode {
110
95
  },
111
96
  }
112
97
 
113
- #[derive(Debug, Clone)]
98
+ #[derive(Debug, Copy, Clone, PartialEq, Eq)]
114
99
  pub enum PrimitiveType {
115
100
  Int8,
116
101
  Int16,
@@ -35,3 +35,59 @@ use parquet::record::Field;
35
35
  use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
36
36
 
37
37
  use crate::header_cache::StringCacheKey;
38
+
39
+ use crate::header_cache::CacheError;
40
+
41
+ use std::io;
42
+
43
+ use thiserror::Error;
44
+
45
+ #[derive(Error, Debug)]
46
+ pub enum ParquetGemError {
47
+ #[error("Failed to open file: {0}")]
48
+ FileOpen(#[from] io::Error),
49
+ #[error("Failed to intern headers: {0}")]
50
+ HeaderIntern(#[from] CacheError),
51
+ #[error("Ruby error: {0}")]
52
+ Ruby(#[from] MagnusErrorWrapper),
53
+ #[error("Parquet error: {0}")]
54
+ Parquet(#[from] parquet::errors::ParquetError),
55
+ #[error("Arrow error: {0}")]
56
+ Arrow(#[from] arrow_schema::ArrowError),
57
+ #[error("UTF-8 error: {0}")]
58
+ Utf8Error(#[from] simdutf8::basic::Utf8Error),
59
+ #[error("Jiff error: {0}")]
60
+ Jiff(#[from] jiff::Error),
61
+ }
62
+
63
+ #[derive(Debug)]
64
+ pub struct MagnusErrorWrapper(pub MagnusError);
65
+
66
+ impl From<MagnusError> for MagnusErrorWrapper {
67
+ fn from(err: MagnusError) -> Self {
68
+ Self(err)
69
+ }
70
+ }
71
+
72
+ impl std::fmt::Display for MagnusErrorWrapper {
73
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74
+ write!(f, "{}", self.0)
75
+ }
76
+ }
77
+
78
+ impl std::error::Error for MagnusErrorWrapper {}
79
+
80
+ impl From<MagnusError> for ParquetGemError {
81
+ fn from(err: MagnusError) -> Self {
82
+ Self::Ruby(MagnusErrorWrapper(err))
83
+ }
84
+ }
85
+
86
+ impl Into<MagnusError> for ParquetGemError {
87
+ fn into(self) -> MagnusError {
88
+ match self {
89
+ Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
90
+ _ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
91
+ }
92
+ }
93
+ }