parquet 0.4.2 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,14 +13,18 @@ use std::{
13
13
  sync::Arc,
14
14
  };
15
15
 
16
+ use crate::types::ParquetGemError;
17
+
16
18
  /// A reader that can handle various Ruby input types (String, StringIO, IO-like objects)
17
19
  /// and provide a standard Read implementation for them.
18
20
  pub enum RubyReader {
19
21
  String {
22
+ ruby: Arc<Ruby>,
20
23
  inner: Opaque<RString>,
21
24
  offset: usize,
22
25
  },
23
26
  RubyIoLike {
27
+ ruby: Arc<Ruby>,
24
28
  inner: Opaque<Value>,
25
29
  },
26
30
  NativeProxyIoLike {
@@ -28,26 +32,15 @@ pub enum RubyReader {
28
32
  },
29
33
  }
30
34
 
31
- impl RubyReader {
32
- fn is_io_like(value: &Value) -> bool {
33
- value.respond_to("read", false).unwrap_or(false)
34
- }
35
-
36
- // For now, don't use this. Having to use seek in length is scary.
37
- fn is_seekable_io_like(value: &Value) -> bool {
38
- Self::is_io_like(value)
39
- && value.respond_to("seek", false).unwrap_or(false)
40
- && value.respond_to("pos", false).unwrap_or(false)
41
- }
42
- }
43
-
44
- impl TryFrom<Value> for RubyReader {
45
- type Error = magnus::Error;
35
+ // Sending is technically not safe, but the only thing that threatens to
36
+ // do this is the parquet gem, and it doesn't seem to actually do it.
37
+ unsafe impl Send for RubyReader {}
46
38
 
47
- fn try_from(value: Value) -> Result<Self, Self::Error> {
48
- let ruby = unsafe { Ruby::get_unchecked() };
39
+ impl RubyReader {
40
+ pub fn new(ruby: Arc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
49
41
  if RubyReader::is_seekable_io_like(&value) {
50
42
  Ok(RubyReader::RubyIoLike {
43
+ ruby,
51
44
  inner: Opaque::from(value),
52
45
  })
53
46
  } else if RubyReader::is_io_like(&value) {
@@ -56,6 +49,7 @@ impl TryFrom<Value> for RubyReader {
56
49
 
57
50
  // This is safe, because we won't call seek
58
51
  let inner_readable = RubyReader::RubyIoLike {
52
+ ruby: ruby.clone(),
59
53
  inner: Opaque::from(value),
60
54
  };
61
55
  let mut reader = BufReader::new(inner_readable);
@@ -74,19 +68,31 @@ impl TryFrom<Value> for RubyReader {
74
68
  .funcall::<_, _, RString>("to_str", ())
75
69
  .or_else(|_| value.funcall::<_, _, RString>("to_s", ()))?;
76
70
  Ok(RubyReader::String {
71
+ ruby,
77
72
  inner: Opaque::from(string_content),
78
73
  offset: 0,
79
74
  })
80
75
  }
81
76
  }
77
+
78
+ fn is_io_like(value: &Value) -> bool {
79
+ value.respond_to("read", false).unwrap_or(false)
80
+ }
81
+
82
+ // For now, don't use this. Having to use seek in length is scary.
83
+ fn is_seekable_io_like(value: &Value) -> bool {
84
+ Self::is_io_like(value)
85
+ && value.respond_to("seek", false).unwrap_or(false)
86
+ && value.respond_to("pos", false).unwrap_or(false)
87
+ }
82
88
  }
83
89
 
84
90
  impl Seek for RubyReader {
85
91
  fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
86
- let ruby = unsafe { Ruby::get_unchecked() };
87
92
  match self {
88
93
  RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.seek(pos),
89
94
  RubyReader::String {
95
+ ruby,
90
96
  inner,
91
97
  offset: original_offset,
92
98
  } => {
@@ -107,7 +113,7 @@ impl Seek for RubyReader {
107
113
  *original_offset = new_offset.min(unwrapped_inner.len());
108
114
  Ok(*original_offset as u64)
109
115
  }
110
- RubyReader::RubyIoLike { inner } => {
116
+ RubyReader::RubyIoLike { ruby, inner } => {
111
117
  let unwrapped_inner = ruby.get_inner(*inner);
112
118
 
113
119
  let (whence, ruby_offset) = match pos {
@@ -132,10 +138,13 @@ impl Seek for RubyReader {
132
138
 
133
139
  impl Read for RubyReader {
134
140
  fn read(&mut self, mut buf: &mut [u8]) -> io::Result<usize> {
135
- let ruby = unsafe { Ruby::get_unchecked() };
136
141
  match self {
137
142
  RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.read(buf),
138
- RubyReader::String { inner, offset } => {
143
+ RubyReader::String {
144
+ ruby,
145
+ inner,
146
+ offset,
147
+ } => {
139
148
  let unwrapped_inner = ruby.get_inner(*inner);
140
149
 
141
150
  let string_buffer = unsafe { unwrapped_inner.as_slice() };
@@ -151,7 +160,7 @@ impl Read for RubyReader {
151
160
 
152
161
  Ok(copy_size)
153
162
  }
154
- RubyReader::RubyIoLike { inner } => {
163
+ RubyReader::RubyIoLike { ruby, inner } => {
155
164
  let unwrapped_inner = ruby.get_inner(*inner);
156
165
 
157
166
  let bytes = unwrapped_inner
@@ -175,14 +184,17 @@ impl Read for RubyReader {
175
184
 
176
185
  impl Length for RubyReader {
177
186
  fn len(&self) -> u64 {
178
- let ruby = unsafe { Ruby::get_unchecked() };
179
187
  match self {
180
188
  RubyReader::NativeProxyIoLike { proxy_file } => proxy_file.len(),
181
- RubyReader::String { inner, offset: _ } => {
189
+ RubyReader::String {
190
+ ruby,
191
+ inner,
192
+ offset: _,
193
+ } => {
182
194
  let unwrapped_inner = ruby.get_inner(*inner);
183
195
  unwrapped_inner.len() as u64
184
196
  }
185
- RubyReader::RubyIoLike { inner } => {
197
+ RubyReader::RubyIoLike { ruby, inner } => {
186
198
  let unwrapped_inner = ruby.get_inner(*inner);
187
199
 
188
200
  // Get current position
@@ -43,17 +43,60 @@ impl std::fmt::Display for ParserResultType {
43
43
  pub struct ListField<'a> {
44
44
  pub item_type: ParquetSchemaType<'a>,
45
45
  pub format: Option<&'a str>,
46
+ pub nullable: bool,
46
47
  }
47
48
 
48
49
  #[derive(Debug, Clone)]
49
50
  pub struct MapField<'a> {
50
51
  pub key_type: ParquetSchemaType<'a>,
51
52
  pub value_type: ParquetSchemaType<'a>,
52
- pub format: Option<&'a str>,
53
+ pub key_format: Option<&'a str>,
54
+ pub value_format: Option<&'a str>,
55
+ pub value_nullable: bool,
53
56
  }
54
57
 
55
58
  #[derive(Debug, Clone)]
59
+ pub struct StructField<'a> {
60
+ pub fields: Vec<super::writer_types::SchemaField<'a>>,
61
+ }
62
+
63
+ #[derive(Clone, Debug)]
56
64
  pub enum ParquetSchemaType<'a> {
65
+ Primitive(PrimitiveType),
66
+ List(Box<ListField<'a>>),
67
+ Map(Box<MapField<'a>>),
68
+ Struct(Box<StructField<'a>>),
69
+ }
70
+
71
+ // New schema representation for the DSL-based approach
72
+ #[derive(Debug, Clone)]
73
+ pub enum SchemaNode {
74
+ Struct {
75
+ name: String,
76
+ nullable: bool,
77
+ fields: Vec<SchemaNode>,
78
+ },
79
+ List {
80
+ name: String,
81
+ nullable: bool,
82
+ item: Box<SchemaNode>,
83
+ },
84
+ Map {
85
+ name: String,
86
+ nullable: bool,
87
+ key: Box<SchemaNode>,
88
+ value: Box<SchemaNode>,
89
+ },
90
+ Primitive {
91
+ name: String,
92
+ parquet_type: PrimitiveType,
93
+ nullable: bool,
94
+ format: Option<String>,
95
+ },
96
+ }
97
+
98
+ #[derive(Debug, Copy, Clone, PartialEq, Eq)]
99
+ pub enum PrimitiveType {
57
100
  Int8,
58
101
  Int16,
59
102
  Int32,
@@ -62,14 +105,12 @@ pub enum ParquetSchemaType<'a> {
62
105
  UInt16,
63
106
  UInt32,
64
107
  UInt64,
65
- Float,
66
- Double,
108
+ Float32,
109
+ Float64,
110
+ Boolean,
67
111
  String,
68
112
  Binary,
69
- Boolean,
70
113
  Date32,
71
114
  TimestampMillis,
72
115
  TimestampMicros,
73
- List(Box<ListField<'a>>),
74
- Map(Box<MapField<'a>>),
75
116
  }
@@ -2,13 +2,20 @@
2
2
  mod core_types;
3
3
  mod parquet_value;
4
4
  mod record_types;
5
+ pub mod schema_converter;
6
+ pub mod schema_node;
5
7
  mod timestamp;
6
- mod type_conversion;
8
+ pub mod type_conversion;
7
9
  mod writer_types;
8
10
 
9
11
  pub use core_types::*;
10
12
  pub use parquet_value::*;
11
13
  pub use record_types::*;
14
+ // Explicitly export schema-related items
15
+ pub use schema_converter::{
16
+ infer_schema_from_first_row, legacy_schema_to_dsl, parse_legacy_schema,
17
+ };
18
+ pub use schema_node::parse_schema_node;
12
19
  pub use timestamp::*;
13
20
  pub use type_conversion::*;
14
21
  pub use writer_types::*;
@@ -28,3 +35,59 @@ use parquet::record::Field;
28
35
  use std::{collections::HashMap, hash::BuildHasher, sync::Arc};
29
36
 
30
37
  use crate::header_cache::StringCacheKey;
38
+
39
+ use crate::header_cache::CacheError;
40
+
41
+ use std::io;
42
+
43
+ use thiserror::Error;
44
+
45
+ #[derive(Error, Debug)]
46
+ pub enum ParquetGemError {
47
+ #[error("Failed to open file: {0}")]
48
+ FileOpen(#[from] io::Error),
49
+ #[error("Failed to intern headers: {0}")]
50
+ HeaderIntern(#[from] CacheError),
51
+ #[error("Ruby error: {0}")]
52
+ Ruby(#[from] MagnusErrorWrapper),
53
+ #[error("Parquet error: {0}")]
54
+ Parquet(#[from] parquet::errors::ParquetError),
55
+ #[error("Arrow error: {0}")]
56
+ Arrow(#[from] arrow_schema::ArrowError),
57
+ #[error("UTF-8 error: {0}")]
58
+ Utf8Error(#[from] simdutf8::basic::Utf8Error),
59
+ #[error("Jiff error: {0}")]
60
+ Jiff(#[from] jiff::Error),
61
+ }
62
+
63
+ #[derive(Debug)]
64
+ pub struct MagnusErrorWrapper(pub MagnusError);
65
+
66
+ impl From<MagnusError> for MagnusErrorWrapper {
67
+ fn from(err: MagnusError) -> Self {
68
+ Self(err)
69
+ }
70
+ }
71
+
72
+ impl std::fmt::Display for MagnusErrorWrapper {
73
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74
+ write!(f, "{}", self.0)
75
+ }
76
+ }
77
+
78
+ impl std::error::Error for MagnusErrorWrapper {}
79
+
80
+ impl From<MagnusError> for ParquetGemError {
81
+ fn from(err: MagnusError) -> Self {
82
+ Self::Ruby(MagnusErrorWrapper(err))
83
+ }
84
+ }
85
+
86
+ impl Into<MagnusError> for ParquetGemError {
87
+ fn into(self) -> MagnusError {
88
+ match self {
89
+ Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
90
+ _ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
91
+ }
92
+ }
93
+ }