parquet 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 896f2833b6db8e4466af8fc9d43eb5c695e25a207a6f8050d22052458edded36
-   data.tar.gz: 38de2831bf7013e0194b2e61a91b26a1283fef65a04309dfbe125c570d64e9ed
+   metadata.gz: e1ae8e2c64920df8527a16d7348fc37c5ae2cf5c783b648bed93e31cab25bd72
+   data.tar.gz: 2d7b45349d33679f96559683e31d7c9dd5718fb78611aad057bba92d7324c2d3
  SHA512:
-   metadata.gz: 52d83bc198f789856eac4bff7ff985a82c3f03f75e5de79efc5b388ce5afc63cb507b4cacc90625ee321619ade1ddc16f66f6c437ca0f60d144bc593bbec8cc5
-   data.tar.gz: c6dd98694fd2a1d29ceebec6b58d63220f3992fe4dc63dae1c28ea27f8a353764f87a16a13c95e3e2111bca811c57fde45da9f008a93d8868112b4be608d46ee
+   metadata.gz: 1f56d8e538bdb095e43472940a8c3a57b6b54d74ab87d9c1519878d759962e6d844f9c992927dc22d22ebefee4bd64a858b2ed89ccc3c694d183bcb9fd154497
+   data.tar.gz: 5f5c8914d81ef297bebb021ba40e70725208e61c2bd1565f7d134341ac3c31489b501766266f7390ffde82a44e5821321b55f827467ac95c760cd08588788e9d
data/Cargo.lock CHANGED
@@ -681,7 +681,7 @@ checksum = "3d87ae53030f3a22e83879e666cb94e58a7bdf31706878a0ba48752994146dab"
  dependencies = [
   "magnus-macros",
   "rb-sys",
-  "rb-sys-env",
+  "rb-sys-env 0.1.2",
   "seq-macro",
  ]

@@ -839,9 +839,11 @@ dependencies = [
   "jiff",
   "magnus",
   "mimalloc",
+  "num",
   "parquet 54.2.0",
   "rand",
   "rb-sys",
+  "rb-sys-env 0.2.2",
   "simdutf8",
   "tempfile",
   "thiserror",
@@ -997,6 +999,12 @@ version = "0.1.2"
  source = "registry+https://github.com/rust-lang/crates.io-index"
  checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"

+ [[package]]
+ name = "rb-sys-env"
+ version = "0.2.2"
+ source = "registry+https://github.com/rust-lang/crates.io-index"
+ checksum = "08f8d2924cf136a1315e2b4c7460a39f62ef11ee5d522df9b2750fab55b868b6"
+
  [[package]]
  name = "regex"
  version = "1.11.1"
@@ -6,6 +6,9 @@ edition = "2021"
  [lib]
  crate-type = ["cdylib"]

+ [build-dependencies]
+ rb-sys-env = "^0.2"
+
  [dependencies]
  ahash = "0.8"
  arrow-array = "54.0.0"
@@ -21,6 +24,7 @@ rb-sys = "^0.9"
  simdutf8 = "0.1.5"
  tempfile = "^3.15"
  thiserror = "2.0"
+ num = "0.4.3"

  [target.'cfg(target_os = "linux")'.dependencies]
  jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
@@ -0,0 +1,5 @@
+ pub fn main() -> Result<(), Box<dyn std::error::Error>> {
+     let _rb_env = rb_sys_env::activate()?;
+
+     Ok(())
+ }
@@ -20,6 +20,7 @@ use writer::write_rows;
  #[magnus::init]
  fn init(ruby: &Ruby) -> Result<(), Error> {
      let module = ruby.define_module("Parquet")?;
+     module.define_module_function("metadata", magnus::method!(reader::parse_metadata, -1))?;
      module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
      module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
      module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
@@ -5,6 +5,7 @@ use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchR
  use parquet::arrow::ProjectionMask;
  use std::collections::HashMap;
  use std::fs::File;
+ use std::rc::Rc;
  use std::sync::Arc;

  use magnus::value::ReprValue;
@@ -21,7 +22,7 @@ use crate::ColumnRecord;
  /// returning either a File or a ThreadSafeRubyReader that can be used with
  /// parquet readers.
  pub fn open_parquet_source(
-     ruby: Arc<Ruby>,
+     ruby: Rc<Ruby>,
      to_read: Value,
  ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
      if to_read.is_kind_of(ruby.class_string()) {
@@ -58,8 +59,8 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
      columns: &Option<Vec<String>>,
      batch_size: Option<usize>,
  ) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
-     let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
-         .map_err(|e| ParquetGemError::Parquet(e))?;
+     let mut builder =
+         ParquetRecordBatchReaderBuilder::try_new(reader).map_err(ParquetGemError::Parquet)?;

      let schema = builder.schema().clone();
      let num_rows = builder.metadata().file_metadata().num_rows();
@@ -78,7 +79,7 @@ pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
          builder = builder.with_batch_size(batch_size);
      }

-     let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
+     let reader = builder.build().map_err(ParquetGemError::Parquet)?;
      Ok((reader, schema, num_rows))
  }

@@ -98,12 +99,12 @@ pub fn handle_empty_file(
              .map(|field| field.name().to_string())
              .collect();
          let interned_headers =
-             StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
+             StringCache::intern_many(&headers).map_err(ParquetGemError::HeaderIntern)?;
          for field in interned_headers.iter() {
              map.insert(*field, vec![]);
          }
          let record = ColumnRecord::Map(map);
-         let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+         let _: Value = ruby.yield_value(record.try_into_value_with(ruby)?)?;
          return Ok(true);
      }
      Ok(false)
@@ -1,6 +1,210 @@
  mod common;
  mod parquet_column_reader;
  mod parquet_row_reader;
+ use std::{fs::File, rc::Rc};

+ use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
+ use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
  pub use parquet_column_reader::parse_parquet_columns;
  pub use parquet_row_reader::parse_parquet_rows;
+
+ use crate::{
+     ruby_reader::{RubyReader, ThreadSafeRubyReader},
+     types::{ParquetGemError, TryIntoValue},
+ };
+
+ struct RubyParquetMetaData(ParquetMetaData);
+
+ impl TryIntoValue for RubyParquetMetaData {
+     fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
+         let metadata = self.0;
+         let file_metadata = metadata.file_metadata();
+         let row_groups = metadata.row_groups();
+
+         // Construct a hash with the metadata
+         let hash = handle.hash_new();
+         hash.aset("num_rows", file_metadata.num_rows())?;
+         hash.aset("created_by", file_metadata.created_by())?;
+         // Convert key_value_metadata to a Ruby array if it exists
+         if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
+             let kv_array = handle.ary_new();
+             for kv in key_value_metadata {
+                 let kv_hash = handle.hash_new();
+                 kv_hash.aset("key", kv.key.clone())?;
+                 kv_hash.aset("value", kv.value.clone())?;
+                 kv_array.push(kv_hash)?;
+             }
+             hash.aset("key_value_metadata", kv_array)?;
+         } else {
+             hash.aset("key_value_metadata", None::<Value>)?;
+         }
+
+         // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
+         let schema_hash = handle.hash_new();
+         let schema = file_metadata.schema();
+         schema_hash.aset("name", schema.name())?;
+         // Add schema fields information
+         let fields_array = handle.ary_new();
+         for field in schema.get_fields() {
+             let field_hash = handle.hash_new();
+             field_hash.aset("name", field.name())?;
+
+             // Handle different field types
+             match field.as_ref() {
+                 parquet::schema::types::Type::PrimitiveType {
+                     physical_type,
+                     type_length,
+                     scale,
+                     precision,
+                     ..
+                 } => {
+                     field_hash.aset("type", "primitive")?;
+                     field_hash.aset("physical_type", format!("{:?}", physical_type))?;
+                     field_hash.aset("type_length", *type_length)?;
+                     field_hash.aset("scale", *scale)?;
+                     field_hash.aset("precision", *precision)?;
+                 }
+                 parquet::schema::types::Type::GroupType { .. } => {
+                     field_hash.aset("type", "group")?;
+                 }
+             }
+
+             // Add basic info
+             let basic_info = field.get_basic_info();
+             field_hash.aset("repetition", format!("{:?}", basic_info.repetition()))?;
+             field_hash.aset(
+                 "converted_type",
+                 format!("{:?}", basic_info.converted_type()),
+             )?;
+             if let Some(logical_type) = basic_info.logical_type() {
+                 field_hash.aset("logical_type", format!("{:?}", logical_type))?;
+             }
+
+             fields_array.push(field_hash)?;
+         }
+         schema_hash.aset("fields", fields_array)?;
+
+         hash.aset("schema", schema_hash)?;
+
+         // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
+         let row_groups_array = handle.ary_new();
+         for row_group in row_groups.iter() {
+             let rg_hash = handle.hash_new();
+             rg_hash.aset("num_columns", row_group.num_columns())?;
+             rg_hash.aset("num_rows", row_group.num_rows())?;
+             rg_hash.aset("total_byte_size", row_group.total_byte_size())?;
+             rg_hash.aset("file_offset", row_group.file_offset())?;
+             rg_hash.aset("ordinal", row_group.ordinal())?;
+             rg_hash.aset("compressed_size", row_group.compressed_size())?;
+
+             // Add column chunks metadata
+             let columns_array = handle.ary_new();
+             for col_idx in 0..row_group.num_columns() {
+                 let column = row_group.column(col_idx);
+                 let col_hash = handle.hash_new();
+
+                 col_hash.aset("column_path", column.column_path().string())?;
+                 col_hash.aset("file_path", column.file_path())?;
+                 col_hash.aset("file_offset", column.file_offset())?;
+                 col_hash.aset("num_values", column.num_values())?;
+                 col_hash.aset("compression", format!("{:?}", column.compression()))?;
+                 col_hash.aset("total_compressed_size", column.compressed_size())?;
+                 col_hash.aset("total_uncompressed_size", column.uncompressed_size())?;
+                 col_hash.aset("data_page_offset", column.data_page_offset())?;
+
+                 if let Some(offset) = column.dictionary_page_offset() {
+                     col_hash.aset("dictionary_page_offset", offset)?;
+                 }
+
+                 if let Some(offset) = column.bloom_filter_offset() {
+                     col_hash.aset("bloom_filter_offset", offset)?;
+                 }
+
+                 if let Some(length) = column.bloom_filter_length() {
+                     col_hash.aset("bloom_filter_length", length)?;
+                 }
+
+                 if let Some(offset) = column.offset_index_offset() {
+                     col_hash.aset("offset_index_offset", offset)?;
+                 }
+
+                 if let Some(length) = column.offset_index_length() {
+                     col_hash.aset("offset_index_length", length)?;
+                 }
+
+                 if let Some(offset) = column.column_index_offset() {
+                     col_hash.aset("column_index_offset", offset)?;
+                 }
+
+                 if let Some(length) = column.column_index_length() {
+                     col_hash.aset("column_index_length", length)?;
+                 }
+
+                 // Add encodings
+                 let encodings_array = handle.ary_new();
+                 for encoding in column.encodings() {
+                     encodings_array.push(format!("{:?}", encoding))?;
+                 }
+                 col_hash.aset("encodings", encodings_array)?;
+
+                 // Add statistics if available
+                 if let Some(stats) = column.statistics() {
+                     let stats_hash = handle.hash_new();
+                     stats_hash.aset("min_is_exact", stats.min_is_exact())?;
+                     stats_hash.aset("max_is_exact", stats.max_is_exact())?;
+
+                     col_hash.aset("statistics", stats_hash)?;
+                 }
+
+                 // Add page encoding stats if available
+                 if let Some(page_encoding_stats) = column.page_encoding_stats() {
+                     let page_stats_array = handle.ary_new();
+                     for stat in page_encoding_stats {
+                         let stat_hash = handle.hash_new();
+                         stat_hash.aset("page_type", format!("{:?}", stat.page_type))?;
+                         stat_hash.aset("encoding", format!("{:?}", stat.encoding))?;
+                         stat_hash.aset("count", stat.count)?;
+                         page_stats_array.push(stat_hash)?;
+                     }
+                     col_hash.aset("page_encoding_stats", page_stats_array)?;
+                 }
+
+                 columns_array.push(col_hash)?;
+             }
+             rg_hash.aset("columns", columns_array)?;
+
+             row_groups_array.push(rg_hash)?;
+         }
+         hash.aset("row_groups", row_groups_array)?;
+
+         Ok(handle.into_value(hash))
+     }
+ }
+
+ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+     let ruby = unsafe { Ruby::get_unchecked() };
+
+     if args.len() != 1 {
+         return Err(MagnusError::new(
+             magnus::exception::arg_error(),
+             format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
+         ));
+     }
+
+     let ruby = Rc::new(ruby);
+     let arg = args[0];
+
+     let mut reader = ParquetMetaDataReader::new();
+     if arg.is_kind_of(ruby.class_string()) {
+         let path = arg.to_r_string()?.to_string()?;
+         let file = File::open(path).map_err(ParquetGemError::FileOpen)?;
+         reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+     } else {
+         let file = ThreadSafeRubyReader::new(RubyReader::new(ruby.clone(), arg)?);
+         reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
+     }
+
+     let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
+
+     Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
+ }
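
Note: the new parse_metadata entry point above is a thin wrapper over parquet's ParquetMetaDataReader. A minimal standalone sketch of the same flow, assuming an illustrative local file name ("example.parquet") and printing only a couple of the fields the returned hash exposes:

    use parquet::file::metadata::ParquetMetaDataReader;
    use std::fs::File;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Placeholder path; any local Parquet file works here.
        let file = File::open("example.parquet")?;

        // Same sequence parse_metadata uses: parse the footer, then take ownership of the metadata.
        let mut reader = ParquetMetaDataReader::new();
        reader.try_parse(&file)?;
        let metadata = reader.finish()?;

        println!("num_rows: {}", metadata.file_metadata().num_rows());
        println!("row_groups: {}", metadata.row_groups().len());
        Ok(())
    }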
@@ -10,26 +10,25 @@ use either::Either;
  use magnus::IntoValue;
  use magnus::{Error as MagnusError, Ruby, Value};
  use std::collections::HashMap;
- use std::sync::{Arc, OnceLock};
+ use std::rc::Rc;
+ use std::sync::OnceLock;

  use super::common::{
      create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
  };

  #[inline]
- pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+ pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
-     Ok(
-         parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
-             let z: MagnusError = e.into();
-             z
-         })?,
-     )
+     parse_parquet_columns_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
+         let z: MagnusError = e.into();
+         z
+     })
  }

  #[inline]
- fn parse_parquet_columns_impl<'a>(
-     ruby: Arc<Ruby>,
+ fn parse_parquet_columns_impl(
+     ruby: Rc<Ruby>,
      rb_self: Value,
      args: &[Value],
  ) -> Result<Value, ParquetGemError> {
@@ -76,13 +75,13 @@ fn parse_parquet_columns_impl<'a>(
          Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
      };

-     // Handle empty file case
-     if handle_empty_file(&ruby, &schema, num_rows)? {
-         return Ok(ruby.qnil().into_value_with(&ruby));
-     }
-
      match result_type {
          ParserResultType::Hash => {
+             // For hash return type, we need to return a hash with column names pointing at empty arrays
+             if handle_empty_file(&ruby, &schema, num_rows)? {
+                 return Ok(ruby.qnil().into_value_with(&ruby));
+             }
+
              let headers = OnceLock::new();
              let headers_clone = headers.clone();
              let iter = batch_reader.map(move |batch| {
@@ -112,8 +111,8 @@ fn parse_parquet_columns_impl<'a>(
                      .try_for_each(|(i, column)| {
                          let header = local_headers[i];
                          let values = ParquetValueVec::try_from(ArrayWrapper {
-                             array: &*column,
-                             strict: strict,
+                             array: column,
+                             strict,
                          })?;
                          map.insert(header, values.into_inner());
                          Ok::<_, ParquetGemError>(())
@@ -133,11 +132,11 @@ fn parse_parquet_columns_impl<'a>(
                  batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
                      let vec = batch
                          .columns()
-                         .into_iter()
+                         .iter()
                          .map(|column| {
                              let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: &*column,
-                                 strict: strict,
+                                 array: column,
+                                 strict,
                              })?;
                              Ok::<_, ParquetGemError>(values.into_inner())
                          })
@@ -13,24 +13,23 @@ use parquet::file::reader::{FileReader, SerializedFileReader};
  use parquet::record::reader::RowIter as ParquetRowIter;
  use parquet::schema::types::{Type as SchemaType, TypePtr};
  use std::collections::HashMap;
- use std::sync::{Arc, OnceLock};
+ use std::rc::Rc;
+ use std::sync::OnceLock;

  use super::common::{handle_block_or_enum, open_parquet_source};

  #[inline]
- pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
+ pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
      let ruby = unsafe { Ruby::get_unchecked() };
-     Ok(
-         parse_parquet_rows_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
-             let z: MagnusError = e.into();
-             z
-         })?,
-     )
+     parse_parquet_rows_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
+         let z: MagnusError = e.into();
+         z
+     })
  }

  #[inline]
- fn parse_parquet_rows_impl<'a>(
-     ruby: Arc<Ruby>,
+ fn parse_parquet_rows_impl(
+     ruby: Rc<Ruby>,
      rb_self: Value,
      args: &[Value],
  ) -> Result<Value, ParquetGemError> {
@@ -93,7 +92,7 @@ fn parse_parquet_rows_impl<'a>(
              let headers = OnceLock::new();
              let headers_clone = headers.clone();
              let iter = iter.map(move |row| {
-                 row.and_then(|row| {
+                 row.map(|row| {
                      let headers = headers_clone.get_or_init(|| {
                          let column_count = row.get_column_iter().count();

@@ -102,10 +101,7 @@ fn parse_parquet_rows_impl<'a>(
                              header_string.push(k.to_owned());
                          }

-                         let headers = StringCache::intern_many(&header_string)
-                             .expect("Failed to intern headers");
-
-                         headers
+                         StringCache::intern_many(&header_string).expect("Failed to intern headers")
                      });

                      let mut map =
@@ -113,10 +109,10 @@ fn parse_parquet_rows_impl<'a>(
                      for (i, (_, v)) in row.get_column_iter().enumerate() {
                          map.insert(headers[i], ParquetField(v.clone(), strict));
                      }
-                     Ok(map)
+                     map
                  })
-                 .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
-                 .map_err(|e| ParquetGemError::from(e))
+                 .map(RowRecord::Map::<RandomState>)
+                 .map_err(ParquetGemError::from)
              });

              for result in iter {
@@ -126,16 +122,16 @@ fn parse_parquet_rows_impl<'a>(
          }
          ParserResultType::Array => {
              let iter = iter.map(|row| {
-                 row.and_then(|row| {
+                 row.map(|row| {
                      let column_count = row.get_column_iter().count();
                      let mut vec = Vec::with_capacity(column_count);
                      for (_, v) in row.get_column_iter() {
                          vec.push(ParquetField(v.clone(), strict));
                      }
-                     Ok(vec)
+                     vec
                  })
-                 .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
-                 .map_err(|e| ParquetGemError::from(e))
+                 .map(RowRecord::Vec::<RandomState>)
+                 .map_err(ParquetGemError::from)
              });

              for result in iter {
@@ -7,7 +7,7 @@ use parquet::{
      errors::ParquetError,
      file::reader::{ChunkReader, Length},
  };
- use std::{fs::File, sync::Mutex};
+ use std::{fs::File, rc::Rc, sync::Mutex};
  use std::{
      io::{self, BufReader, Read, Seek, SeekFrom, Write},
      sync::Arc,
@@ -35,7 +35,7 @@ pub enum RubyReader {
  unsafe impl Send for RubyReader {}

  impl RubyReader {
-     pub fn new(ruby: Arc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
+     pub fn new(ruby: Rc<Ruby>, value: Value) -> Result<Self, ParquetGemError> {
          if RubyReader::is_seekable_io_like(&value) {
              Ok(RubyReader::RubyIoLike {
                  inner: Opaque::from(value),
@@ -165,9 +165,7 @@ impl Read for RubyReader {
                  buf.write_all(string_buffer)?;
                  Ok(string_buffer.len())
              }
-             None => {
-                 return Ok(0);
-             }
+             None => Ok(0),
          }
      }
  }
@@ -107,6 +107,7 @@ pub enum PrimitiveType {
      UInt64,
      Float32,
      Float64,
+     Decimal128(u8, i8),
      Boolean,
      String,
      Binary,
@@ -32,6 +32,7 @@ use arrow_schema::{DataType, TimeUnit};
  use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
  use parquet::data_type::Decimal;
  use parquet::record::Field;
+ use std::array::TryFromSliceError;
  use std::{collections::HashMap, hash::BuildHasher, sync::Arc};

  use crate::header_cache::StringCacheKey;
@@ -58,6 +59,8 @@ pub enum ParquetGemError {
      Utf8Error(#[from] simdutf8::basic::Utf8Error),
      #[error("Jiff error: {0}")]
      Jiff(#[from] jiff::Error),
+     #[error("Failed to cast slice to array: {0}")]
+     InvalidDecimal(#[from] TryFromSliceError),
  }

  #[derive(Debug)]
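
Note: the new InvalidDecimal variant wraps std::array::TryFromSliceError via #[from], so ? can propagate failed slice-to-fixed-array conversions directly into ParquetGemError. A hedged sketch of the kind of conversion that produces this error (the function name and byte order here are illustrative only, not taken from the gem):

    use std::array::TryFromSliceError;

    // Turning a variable-length byte slice into the fixed-size array an i128 needs
    // is the step that can fail with TryFromSliceError.
    fn i128_from_be_slice(slice: &[u8]) -> Result<i128, TryFromSliceError> {
        let bytes: [u8; 16] = slice.try_into()?;
        Ok(i128::from_be_bytes(bytes))
    }

    fn main() {
        assert!(i128_from_be_slice(&[0u8; 16]).is_ok());
        assert!(i128_from_be_slice(&[0u8; 3]).is_err()); // wrong length -> TryFromSliceError
    }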
@@ -83,11 +86,11 @@ impl From<MagnusError> for ParquetGemError {
      }
  }

- impl Into<MagnusError> for ParquetGemError {
-     fn into(self) -> MagnusError {
-         match self {
-             Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
-             _ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
+ impl From<ParquetGemError> for MagnusError {
+     fn from(val: ParquetGemError) -> Self {
+         match val {
+             ParquetGemError::Ruby(MagnusErrorWrapper(err)) => err,
+             _ => MagnusError::new(magnus::exception::runtime_error(), val.to_string()),
          }
      }
  }