parquet 0.5.4 → 0.5.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 936feb49be7a1bbbb36236551480ae0522d6b52443e76b4ebb7502abdb9d2903
- data.tar.gz: bcc56665ec0cd132e22c262373e7b1294e085be364c93efbd214e434ada7dcb6
+ metadata.gz: e2295ee94fe35758ae8e5137070e2206ec1e104aad6b9a0806aa508ad4799247
+ data.tar.gz: 340f86257082bdba22d6ced530ecd1d201c7b4e6d9116eebac41541ba2aaa257
  SHA512:
- metadata.gz: 7856d7f36820a8384faf564f166d39e0daca1c9d15457b6f6aae8ff56f4176a8b1302bfbc2cc5edcfedfcb0805cbe71029f5712e716a29dc4942a1e6453a3e5e
- data.tar.gz: '08d1f4cfe357b22bad4c4fab4ddd4fa93069b13c65559d668fb704e2f7d8884fc8f081270e4dc43a5db60aab7147be36bfe7d26945f93c9ad6e9badbd0ad957e'
+ metadata.gz: f333ae2914cdd00468c390e8b3d876aec4e522a546d43ab29db5d777792105a38d2a40c49db0f0afe1e800bf32e54bb4c479441f8f9876937ba59917b444d15a
+ data.tar.gz: da2832c3514729cc0e99e16f70a10bbfc4e9093dc734de55715305121649ebc371dff93a7bb462b97fde27c79ad65cec12c5fa90a47f70bc64153a7fd2ce1a5c
ext/parquet/src/reader/mod.rs CHANGED
@@ -1,6 +1,7 @@
  mod common;
  mod parquet_column_reader;
  mod parquet_row_reader;
+ mod unified;
  use std::{fs::File, rc::Rc};

  use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
@@ -207,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
      let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;

      Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
- }
+ }
ext/parquet/src/reader/parquet_column_reader.rs CHANGED
@@ -1,21 +1,9 @@
- use crate::header_cache::StringCache;
- use crate::logger::RubyLogger;
- use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
- use crate::{
-     create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
-     ParserResultType,
- };
- use ahash::RandomState;
- use either::Either;
- use magnus::IntoValue;
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+ use crate::utils::*;
+ use crate::ParquetGemError;
+
  use magnus::{Error as MagnusError, Ruby, Value};
- use std::collections::HashMap;
  use std::rc::Rc;
- use std::sync::OnceLock;
-
- use super::common::{
-     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
- };

  #[inline]
  pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,116 +29,16 @@ fn parse_parquet_columns_impl(
          logger,
      } = parse_parquet_columns_args(&ruby, args)?;

-     // Initialize the logger if provided
-     let ruby_logger = RubyLogger::new(&ruby, logger)?;
-     if let Some(ref bs) = batch_size {
-         ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
-     }
-
-     // Clone values for the closure to avoid move issues
-     let columns_clone = columns.clone();
-
-     // Handle block or create enumerator
-     if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-         create_column_enumerator(ColumnEnumeratorArgs {
-             rb_self,
+     // Use the unified parsing implementation
+     parse_parquet_unified(
+         ruby,
+         rb_self,
+         UnifiedParserArgs {
              to_read,
              result_type,
-             columns: columns_clone,
-             batch_size,
-             strict,
-             logger: logger.as_ref().map(|_| to_read),
-         })
-         .map(|yield_enum| yield_enum.into_value_with(&ruby))
-     })? {
-         return Ok(enum_value);
-     }
-
-     let source = open_parquet_source(ruby.clone(), to_read)?;
-
-     // Use the common function to create the batch reader
-
-     let (batch_reader, schema, num_rows) = match source {
-         Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
-         Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
-     };
-
-     match result_type {
-         ParserResultType::Hash => {
-             // For hash return type, we need to return a hash with column names pointing at empty arrays
-             if handle_empty_file(&ruby, &schema, num_rows)? {
-                 return Ok(ruby.qnil().into_value_with(&ruby));
-             }
-
-             let headers = OnceLock::new();
-             let headers_clone = headers.clone();
-             let iter = batch_reader.map(move |batch| {
-                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                     let local_headers = headers_clone
-                         .get_or_init(|| {
-                             let schema = batch.schema();
-                             let fields = schema.fields();
-                             let mut header_string = Vec::with_capacity(fields.len());
-                             for field in fields {
-                                 header_string.push(field.name().to_owned());
-                             }
-                             StringCache::intern_many(&header_string)
-                         })
-                         .as_ref()
-                         .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                     let mut map = HashMap::with_capacity_and_hasher(
-                         local_headers.len(),
-                         RandomState::default(),
-                     );
-
-                     batch
-                         .columns()
-                         .iter()
-                         .enumerate()
-                         .try_for_each(|(i, column)| {
-                             let header = local_headers[i];
-                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: column,
-                                 strict,
-                             })?;
-                             map.insert(header, values.into_inner());
-                             Ok::<_, ParquetGemError>(())
-                         })?;
-
-                     Ok(ColumnRecord::Map::<RandomState>(map))
-                 })
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-         ParserResultType::Array => {
-             let iter = batch_reader.map(|batch| {
-                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
-                     let vec = batch
-                         .columns()
-                         .iter()
-                         .map(|column| {
-                             let values = ParquetValueVec::try_from(ArrayWrapper {
-                                 array: column,
-                                 strict,
-                             })?;
-                             Ok::<_, ParquetGemError>(values.into_inner())
-                         })
-                         .collect::<Result<Vec<_>, _>>()?;
-                     Ok(ColumnRecord::Vec::<RandomState>(vec))
-                 })
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-     }
-
-     Ok(ruby.qnil().into_value_with(&ruby))
- }
+             columns,
+             parser_type: ParserType::Column { batch_size, strict },
+             logger,
+         },
+     )
+ }
ext/parquet/src/reader/parquet_row_reader.rs CHANGED
@@ -1,22 +1,9 @@
- use crate::header_cache::StringCache;
- use crate::logger::RubyLogger;
- use crate::types::TryIntoValue;
- use crate::{
-     create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
-     RowEnumeratorArgs, RowRecord,
- };
- use ahash::RandomState;
- use either::Either;
- use magnus::IntoValue;
+ use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
+ use crate::utils::*;
+ use crate::ParquetGemError;
+
  use magnus::{Error as MagnusError, Ruby, Value};
- use parquet::file::reader::{FileReader, SerializedFileReader};
- use parquet::record::reader::RowIter as ParquetRowIter;
- use parquet::schema::types::{Type as SchemaType, TypePtr};
- use std::collections::HashMap;
  use std::rc::Rc;
- use std::sync::OnceLock;
-
- use super::common::{handle_block_or_enum, open_parquet_source};

  #[inline]
  pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
@@ -41,123 +28,16 @@ fn parse_parquet_rows_impl(
          logger,
      } = parse_parquet_rows_args(&ruby, args)?;

-     // Initialize the logger if provided
-     let ruby_logger = RubyLogger::new(&ruby, logger)?;
-
-     // Clone values for the closure to avoid move issues
-     let columns_clone = columns.clone();
-
-     // Handle block or create enumerator
-     if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
-         create_row_enumerator(RowEnumeratorArgs {
-             rb_self,
+     // Use the unified parsing implementation
+     parse_parquet_unified(
+         ruby,
+         rb_self,
+         UnifiedParserArgs {
              to_read,
              result_type,
-             columns: columns_clone,
-             strict,
+             columns,
+             parser_type: ParserType::Row { strict },
              logger,
-         })
-         .map(|yield_enum| yield_enum.into_value_with(&ruby))
-     })? {
-         return Ok(enum_value);
-     }
-
-     let source = open_parquet_source(ruby.clone(), to_read)?;
-     let reader: Box<dyn FileReader> = match source {
-         Either::Left(file) => {
-             Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
-         }
-         Either::Right(readable) => {
-             Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
-         }
-     };
-
-     let schema = reader.metadata().file_metadata().schema().clone();
-     ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
-
-     let mut iter = ParquetRowIter::from_file_into(reader);
-     if let Some(cols) = columns {
-         ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
-         let projection = create_projection_schema(&schema, &cols);
-         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-             MagnusError::new(
-                 ruby.exception_runtime_error(),
-                 format!("Failed to create projection: {}", e),
-             )
-         })?;
-     }
-
-     match result_type {
-         ParserResultType::Hash => {
-             let headers = OnceLock::new();
-             let headers_clone = headers.clone();
-             let iter = iter.map(move |row| {
-                 row.map(|row| {
-                     let headers = headers_clone.get_or_init(|| {
-                         let column_count = row.get_column_iter().count();
-
-                         let mut header_string = Vec::with_capacity(column_count);
-                         for (k, _) in row.get_column_iter() {
-                             header_string.push(k.to_owned());
-                         }
-
-                         StringCache::intern_many(&header_string).expect("Failed to intern headers")
-                     });
-
-                     let mut map =
-                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                     for (i, (_, v)) in row.get_column_iter().enumerate() {
-                         map.insert(headers[i], ParquetField(v.clone(), strict));
-                     }
-                     map
-                 })
-                 .map(RowRecord::Map::<RandomState>)
-                 .map_err(ParquetGemError::from)
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-         ParserResultType::Array => {
-             let iter = iter.map(|row| {
-                 row.map(|row| {
-                     let column_count = row.get_column_iter().count();
-                     let mut vec = Vec::with_capacity(column_count);
-                     for (_, v) in row.get_column_iter() {
-                         vec.push(ParquetField(v.clone(), strict));
-                     }
-                     vec
-                 })
-                 .map(RowRecord::Vec::<RandomState>)
-                 .map_err(ParquetGemError::from)
-             });
-
-             for result in iter {
-                 let record = result?;
-                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-             }
-         }
-     }
-
-     Ok(ruby.qnil().into_value_with(&ruby))
- }
-
- fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-     if let SchemaType::GroupType { fields, .. } = schema {
-         let projected_fields: Vec<TypePtr> = fields
-             .iter()
-             .filter(|field| columns.contains(&field.name().to_string()))
-             .cloned()
-             .collect();
-
-         SchemaType::GroupType {
-             basic_info: schema.get_basic_info().clone(),
-             fields: projected_fields,
-         }
-     } else {
-         // Return original schema if not a group type
-         schema.clone()
-     }
- }
+         },
+     )
+ }
ext/parquet/src/reader/unified/mod.rs ADDED
@@ -0,0 +1,328 @@
+ use crate::header_cache::StringCache;
+ use crate::logger::RubyLogger;
+ use crate::types::TryIntoValue;
+ use crate::{
+     create_column_enumerator, create_row_enumerator, ParquetField, ParquetGemError,
+     ParserResultType, ColumnEnumeratorArgs, RowEnumeratorArgs, RowRecord, ColumnRecord, ParquetValueVec,
+ };
+ use ahash::RandomState;
+ use either::Either;
+ use magnus::IntoValue;
+ use magnus::{Error as MagnusError, Ruby, Value};
+ use std::collections::HashMap;
+ use std::rc::Rc;
+ use std::sync::OnceLock;
+
+ use crate::types::ArrayWrapper;
+ use super::common::{
+     create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
+ };
+
+ /// A unified parser configuration that can be used for both row and column parsing
+ pub enum ParserType {
+     Row {
+         strict: bool,
+     },
+     Column {
+         batch_size: Option<usize>,
+         strict: bool,
+     },
+ }
+
+ /// Unified parser arguments structure
+ pub struct UnifiedParserArgs {
+     pub to_read: Value,
+     pub result_type: ParserResultType,
+     pub columns: Option<Vec<String>>,
+     pub parser_type: ParserType,
+     pub logger: Option<Value>,
+ }
+
+ /// Unified implementation for parsing Parquet data (both rows and columns)
+ pub fn parse_parquet_unified(
+     ruby: Rc<Ruby>,
+     rb_self: Value,
+     args: UnifiedParserArgs,
+ ) -> Result<Value, ParquetGemError> {
+     let UnifiedParserArgs {
+         to_read,
+         result_type,
+         columns,
+         parser_type,
+         logger,
+     } = args;
+
+     // Initialize the logger if provided
+     let ruby_logger = RubyLogger::new(&ruby, logger.clone())?;
+
+     // Clone values for the closure to avoid move issues
+     let columns_clone = columns.clone();
+
+     // Determine if we're handling rows or columns for enumerator creation
+     match &parser_type {
+         ParserType::Row { strict } => {
+             // Handle block or create row enumerator
+             if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
+                 create_row_enumerator(RowEnumeratorArgs {
+                     rb_self,
+                     to_read,
+                     result_type,
+                     columns: columns_clone,
+                     strict: *strict,
+                     logger,
+                 })
+                 .map(|yield_enum| yield_enum.into_value_with(&ruby))
+             })? {
+                 return Ok(enum_value);
+             }
+         },
+         ParserType::Column { batch_size, strict } => {
+             // For column-based parsing, log the batch size if present
+             if let Some(ref bs) = batch_size {
+                 ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
+             }
+
+             // Handle block or create column enumerator
+             if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
+                 create_column_enumerator(ColumnEnumeratorArgs {
+                     rb_self,
+                     to_read,
+                     result_type,
+                     columns: columns_clone,
+                     batch_size: *batch_size,
+                     strict: *strict,
+                     logger: logger.as_ref().map(|_| to_read),
+                 })
+                 .map(|yield_enum| yield_enum.into_value_with(&ruby))
+             })? {
+                 return Ok(enum_value);
+             }
+         }
+     }
+
+     // Open the Parquet source
+     let source = open_parquet_source(ruby.clone(), to_read)?;
+
+     // Based on the parser type, handle the data differently
+     match parser_type {
+         ParserType::Row { strict } => {
+             // Handle row-based parsing
+             process_row_data(ruby.clone(), source, &columns, result_type, strict, &ruby_logger)?;
+         },
+         ParserType::Column { batch_size, strict } => {
+             // Handle column-based parsing
+             process_column_data(ruby.clone(), source, &columns, result_type, batch_size, strict, &ruby_logger)?;
+         }
+     }
+
+     Ok(ruby.qnil().into_value_with(&ruby))
+ }
+
+ /// Process row-based Parquet data
+ fn process_row_data(
+     ruby: Rc<Ruby>,
+     source: Either<std::fs::File, crate::ruby_reader::ThreadSafeRubyReader>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     strict: bool,
+     ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     use parquet::file::reader::{FileReader, SerializedFileReader};
+     use parquet::record::reader::RowIter as ParquetRowIter;
+
+     // Create the row-based reader
+     let reader: Box<dyn FileReader> = match source {
+         Either::Left(file) => {
+             Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
+         }
+         Either::Right(readable) => {
+             Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
+         }
+     };
+
+     let schema = reader.metadata().file_metadata().schema().clone();
+     ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
+
+     let mut iter = ParquetRowIter::from_file_into(reader);
+     if let Some(cols) = columns {
+         ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
+         let projection = create_projection_schema(&schema, cols);
+         iter = iter.project(Some(projection.to_owned())).map_err(|e| {
+             MagnusError::new(
+                 ruby.exception_runtime_error(),
+                 format!("Failed to create projection: {}", e),
+             )
+         })?;
+     }
+
+     match result_type {
+         ParserResultType::Hash => {
+             let headers = OnceLock::new();
+             let headers_clone = headers.clone();
+             let iter = iter.map(move |row| {
+                 row.map(|row| {
+                     let headers = headers_clone.get_or_init(|| {
+                         let column_count = row.get_column_iter().count();
+
+                         let mut header_string = Vec::with_capacity(column_count);
+                         for (k, _) in row.get_column_iter() {
+                             header_string.push(k.to_owned());
+                         }
+
+                         StringCache::intern_many(&header_string).expect("Failed to intern headers")
+                     });
+
+                     let mut map =
+                         HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
+                     for (i, (_, v)) in row.get_column_iter().enumerate() {
+                         map.insert(headers[i], ParquetField(v.clone(), strict));
+                     }
+                     map
+                 })
+                 .map(RowRecord::Map::<RandomState>)
+                 .map_err(ParquetGemError::from)
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+         ParserResultType::Array => {
+             let iter = iter.map(|row| {
+                 row.map(|row| {
+                     let column_count = row.get_column_iter().count();
+                     let mut vec = Vec::with_capacity(column_count);
+                     for (_, v) in row.get_column_iter() {
+                         vec.push(ParquetField(v.clone(), strict));
+                     }
+                     vec
+                 })
+                 .map(RowRecord::Vec::<RandomState>)
+                 .map_err(ParquetGemError::from)
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Process column-based Parquet data
+ fn process_column_data(
+     ruby: Rc<Ruby>,
+     source: Either<std::fs::File, crate::ruby_reader::ThreadSafeRubyReader>,
+     columns: &Option<Vec<String>>,
+     result_type: ParserResultType,
+     batch_size: Option<usize>,
+     strict: bool,
+     _ruby_logger: &RubyLogger,
+ ) -> Result<(), ParquetGemError> {
+     // Create the batch reader
+     let (batch_reader, schema, num_rows) = match source {
+         Either::Left(file) => create_batch_reader(file, columns, batch_size)?,
+         Either::Right(readable) => create_batch_reader(readable, columns, batch_size)?,
+     };
+
+     match result_type {
+         ParserResultType::Hash => {
+             // For hash return type, we need to return a hash with column names pointing at empty arrays
+             if handle_empty_file(&ruby, &schema, num_rows)? {
+                 return Ok(());
+             }
+
+             let headers = OnceLock::new();
+             let headers_clone = headers.clone();
+             let iter = batch_reader.map(move |batch| {
+                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
+                     let local_headers = headers_clone
+                         .get_or_init(|| {
+                             let schema = batch.schema();
+                             let fields = schema.fields();
+                             let mut header_string = Vec::with_capacity(fields.len());
+                             for field in fields {
+                                 header_string.push(field.name().to_owned());
+                             }
+                             StringCache::intern_many(&header_string)
+                         })
+                         .as_ref()
+                         .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
+
+                     let mut map = HashMap::with_capacity_and_hasher(
+                         local_headers.len(),
+                         RandomState::default(),
+                     );
+
+                     batch
+                         .columns()
+                         .iter()
+                         .enumerate()
+                         .try_for_each(|(i, column)| {
+                             let header = local_headers[i];
+                             let values = ParquetValueVec::try_from(ArrayWrapper {
+                                 array: column,
+                                 strict,
+                             })?;
+                             map.insert(header, values.into_inner());
+                             Ok::<_, ParquetGemError>(())
+                         })?;
+
+                     Ok(ColumnRecord::Map::<RandomState>(map))
+                 })
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+         ParserResultType::Array => {
+             let iter = batch_reader.map(|batch| {
+                 batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
+                     let vec = batch
+                         .columns()
+                         .iter()
+                         .map(|column| {
+                             let values = ParquetValueVec::try_from(ArrayWrapper {
+                                 array: column,
+                                 strict,
+                             })?;
+                             Ok::<_, ParquetGemError>(values.into_inner())
+                         })
+                         .collect::<Result<Vec<_>, _>>()?;
+                     Ok(ColumnRecord::Vec::<RandomState>(vec))
+                 })
+             });
+
+             for result in iter {
+                 let record = result?;
+                 let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
+             }
+         }
+     }
+
+     Ok(())
+ }
+
+ /// Helper function to create a projection schema
+ fn create_projection_schema(schema: &parquet::schema::types::Type, columns: &[String]) -> parquet::schema::types::Type {
+     if let parquet::schema::types::Type::GroupType { fields, .. } = schema {
+         let projected_fields: Vec<std::sync::Arc<parquet::schema::types::Type>> = fields
+             .iter()
+             .filter(|field| columns.contains(&field.name().to_string()))
+             .cloned()
+             .collect();
+
+         parquet::schema::types::Type::GroupType {
+             basic_info: schema.get_basic_info().clone(),
+             fields: projected_fields,
+         }
+     } else {
+         // Return original schema if not a group type
+         schema.clone()
+     }
+ }
ext/parquet/src/types/parquet_value.rs CHANGED
@@ -1,6 +1,7 @@
  use crate::{impl_date_conversion, impl_timestamp_array_conversion, impl_timestamp_conversion};

  use super::*;
+ use super::record_types::format_decimal_with_i8_scale;
  use arrow_array::MapArray;
  use magnus::{RArray, RString};

@@ -22,7 +23,7 @@ pub enum ParquetValue {
      Bytes(Vec<u8>),
      Date32(i32),
      Date64(i64),
-     Decimal128(i128),
+     Decimal128(i128, i8),
      TimestampSecond(i64, Option<Arc<str>>),
      TimestampMillis(i64, Option<Arc<str>>),
      TimestampMicros(i64, Option<Arc<str>>),
@@ -52,7 +53,47 @@ impl PartialEq for ParquetValue {
          (ParquetValue::Bytes(a), ParquetValue::Bytes(b)) => a == b,
          (ParquetValue::Date32(a), ParquetValue::Date32(b)) => a == b,
          (ParquetValue::Date64(a), ParquetValue::Date64(b)) => a == b,
-         (ParquetValue::Decimal128(a), ParquetValue::Decimal128(b)) => a == b,
+         (ParquetValue::Decimal128(a, scale_a), ParquetValue::Decimal128(b, scale_b)) => {
+             if scale_a == scale_b {
+                 // Same scale, compare directly
+                 a == b
+             } else {
+                 // Different scales, need to adjust for proper comparison
+                 let mut a_val = *a;
+                 let mut b_val = *b;
+
+                 // Adjust to the same scale for proper comparison
+                 if scale_a < scale_b {
+                     // Scale up a to match b's scale
+                     let scale_diff = (*scale_b - *scale_a) as u32;
+                     if scale_diff <= 38 {
+                         // Limit to avoid overflow
+                         a_val *= 10_i128.pow(scale_diff);
+                     } else {
+                         // For large scale differences, use BigInt for the comparison
+                         let a_big = num::BigInt::from(*a)
+                             * num::BigInt::from(10_i128.pow(scale_diff.min(38)));
+                         let b_big = num::BigInt::from(*b);
+                         return a_big == b_big;
+                     }
+                 } else {
+                     // Scale up b to match a's scale
+                     let scale_diff = (*scale_a - *scale_b) as u32;
+                     if scale_diff <= 38 {
+                         // Limit to avoid overflow
+                         b_val *= 10_i128.pow(scale_diff);
+                     } else {
+                         // For large scale differences, use BigInt for the comparison
+                         let a_big = num::BigInt::from(*a);
+                         let b_big = num::BigInt::from(*b)
+                             * num::BigInt::from(10_i128.pow(scale_diff.min(38)));
+                         return a_big == b_big;
+                     }
+                 }
+
+                 a_val == b_val
+             }
+         }
          (ParquetValue::TimestampSecond(a, _), ParquetValue::TimestampSecond(b, _)) => a == b,
          (ParquetValue::TimestampMillis(a, _), ParquetValue::TimestampMillis(b, _)) => a == b,
          (ParquetValue::TimestampMicros(a, _), ParquetValue::TimestampMicros(b, _)) => a == b,
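
The equality above rescales whichever operand has the smaller scale before comparing, falling back to arbitrary precision when the factor would overflow i128. A minimal standalone sketch of the same rule (the decimals_equal helper is hypothetical, not part of the gem):

    fn decimals_equal(a: i128, scale_a: i8, b: i128, scale_b: i8) -> bool {
        if scale_a == scale_b {
            return a == b; // same scale: compare unscaled values directly
        }
        // Rescale the smaller-scale value up to the larger scale; checked
        // arithmetic stands in for the BigInt fallback used above.
        let diff = (i32::from(scale_a) - i32::from(scale_b)).unsigned_abs();
        match 10_i128.checked_pow(diff) {
            Some(factor) if scale_a < scale_b => a.checked_mul(factor) == Some(b),
            Some(factor) => b.checked_mul(factor) == Some(a),
            None => a == 0 && b == 0, // factor exceeds i128: only zeros can match
        }
    }

    fn main() {
        assert!(decimals_equal(123, 2, 1_230, 3)); // 1.23 == 1.230
        assert!(!decimals_equal(123, 2, 123, 3));  // 1.23 != 0.123
    }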
@@ -85,7 +126,10 @@ impl std::hash::Hash for ParquetValue {
          ParquetValue::Bytes(b) => b.hash(state),
          ParquetValue::Date32(d) => d.hash(state),
          ParquetValue::Date64(d) => d.hash(state),
-         ParquetValue::Decimal128(d) => d.hash(state),
+         ParquetValue::Decimal128(d, scale) => {
+             d.hash(state);
+             scale.hash(state);
+         }
          ParquetValue::TimestampSecond(ts, tz) => {
              ts.hash(state);
              tz.hash(state);
@@ -131,7 +175,16 @@ impl TryIntoValue for ParquetValue {
          ParquetValue::Boolean(b) => Ok(b.into_value_with(handle)),
          ParquetValue::String(s) => Ok(s.into_value_with(handle)),
          ParquetValue::Bytes(b) => Ok(handle.str_from_slice(&b).as_value()),
-         ParquetValue::Decimal128(d) => Ok(d.to_string().into_value_with(handle)),
+         ParquetValue::Decimal128(d, scale) => {
+             // Load the bigdecimal gem if it's not already loaded
+             LOADED_BIGDECIMAL.get_or_init(|| handle.require("bigdecimal").unwrap_or_default());
+
+             // Format with proper scaling based on the sign of scale
+             let value = format_decimal_with_i8_scale(d, scale);
+
+             let kernel = handle.module_kernel();
+             Ok(kernel.funcall::<_, _, Value>("BigDecimal", (value,))?)
+         }
          ParquetValue::Date32(d) => impl_date_conversion!(d, handle),
          ParquetValue::Date64(d) => impl_date_conversion!(d, handle),
          timestamp @ ParquetValue::TimestampSecond(_, _) => {
@@ -375,7 +428,7 @@ impl ParquetValue {
  /// Unified helper to parse a decimal string and apply scaling
  fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, MagnusError> {
      let s = input_str.trim();
-
+
      // 1. Handle scientific notation case (e.g., "0.12345e3")
      if let Some(e_pos) = s.to_lowercase().find('e') {
          let base = &s[0..e_pos];
@@ -385,7 +438,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let exp_val = exp.parse::<i32>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse exponent '{}' in decimal string '{}': {}", exp, s, e),
+                 format!(
+                     "Failed to parse exponent '{}' in decimal string '{}': {}",
+                     exp, s, e
+                 ),
              )
          })?;

@@ -407,7 +463,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let base_val = base_without_point.parse::<i128>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse base '{}' in scientific notation '{}': {}", base, s, e),
+                 format!(
+                     "Failed to parse base '{}' in scientific notation '{}': {}",
+                     base, s, e
+                 ),
              )
          })?;

@@ -417,7 +476,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let base_val = base.parse::<i128>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse base '{}' in scientific notation '{}': {}", base, s, e),
+                 format!(
+                     "Failed to parse base '{}' in scientific notation '{}': {}",
+                     base, s, e
+                 ),
              )
          })?;

@@ -466,7 +528,10 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          let v = s_without_point.parse::<i128>().map_err(|e| {
              MagnusError::new(
                  magnus::exception::type_error(),
-                 format!("Failed to parse decimal string '{}' (without decimal point: '{}'): {}", s, s_without_point, e),
+                 format!(
+                     "Failed to parse decimal string '{}' (without decimal point: '{}'): {}",
+                     s, s_without_point, e
+                 ),
              )
          })?;

@@ -497,7 +562,7 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
              }
              std::cmp::Ordering::Equal => Ok(v),
          }
-     }
+     }
      // 3. Plain integer value (e.g., "12345")
      else {
          // No decimal point, parse as i128 and scale appropriately
@@ -512,12 +577,18 @@ fn parse_decimal_string(input_str: &str, input_scale: i8) -> Result<i128, Magnus
          if input_scale > 38 {
              return Err(MagnusError::new(
                  magnus::exception::range_error(),
-                 format!("Scale {} is too large for decimal value '{}'. Must be ≤ 38.", input_scale, s),
+                 format!(
+                     "Scale {} is too large for decimal value '{}'. Must be ≤ 38.",
+                     input_scale, s
+                 ),
              ));
          } else if input_scale < -38 {
              return Err(MagnusError::new(
                  magnus::exception::range_error(),
-                 format!("Scale {} is too small for decimal value '{}'. Must be ≥ -38.", input_scale, s),
+                 format!(
+                     "Scale {} is too small for decimal value '{}'. Must be ≥ -38.",
+                     input_scale, s
+                 ),
              ));
          }

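The branches above all reduce an input string to a single unscaled i128 at the requested scale. A simplified model of that contract, assuming the digits fit in i128 and that excess fractional digits are truncated (the unscaled/frac_digits names are illustrative, not from the gem):

    fn unscaled(digits: i128, frac_digits: i8, input_scale: i8) -> i128 {
        // `digits` is the number with its decimal point removed; the result is
        // the integer v such that v / 10^input_scale equals the input value.
        let shift = i32::from(input_scale) - i32::from(frac_digits);
        if shift >= 0 {
            digits * 10_i128.pow(shift as u32) // pad with zeros
        } else {
            digits / 10_i128.pow((-shift) as u32) // drop excess digits (assumed)
        }
    }

    fn main() {
        assert_eq!(unscaled(12345, 2, 2), 12345); // "123.45" at scale 2
        assert_eq!(unscaled(123, 0, 2), 12300);   // "123" at scale 2 -> 123.00
    }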
@@ -540,14 +611,17 @@ fn convert_to_decimal128(value: Value, scale: i8) -> Result<ParquetValue, Magnus
      } else {
          value.to_r_string()?.to_string()?
      };
-
+
      // Use our unified parser to convert the string to a decimal value with scaling
      match parse_decimal_string(&s, scale) {
-         Ok(decimal_value) => Ok(ParquetValue::Decimal128(decimal_value)),
+         Ok(decimal_value) => Ok(ParquetValue::Decimal128(decimal_value, scale)),
          Err(e) => Err(MagnusError::new(
              magnus::exception::type_error(),
-             format!("Failed to convert '{}' to decimal with scale {}: {}", s, scale, e),
-         ))
+             format!(
+                 "Failed to convert '{}' to decimal with scale {}: {}",
+                 s, scale, e
+             ),
+         )),
      }
  }

ext/parquet/src/types/record_types.rs CHANGED
@@ -5,7 +5,31 @@ use parquet::data_type::AsBytes;

  use super::*;

- static LOADED_BIGDECIMAL: OnceLock<bool> = OnceLock::new();
+ pub static LOADED_BIGDECIMAL: OnceLock<bool> = OnceLock::new();
+
+ /// Format decimal value with appropriate scale for BigDecimal conversion
+ /// Handles positive and negative scales correctly for i8 scale
+ pub fn format_decimal_with_i8_scale<T: std::fmt::Display>(value: T, scale: i8) -> String {
+     if scale >= 0 {
+         // Positive scale means divide (move decimal point left)
+         format!("{}e-{}", value, scale)
+     } else {
+         // Negative scale means multiply (move decimal point right)
+         format!("{}e{}", value, -scale)
+     }
+ }
+
+ /// Format decimal value with appropriate scale for BigDecimal conversion
+ /// Handles positive and negative scales correctly for i32 scale
+ pub fn format_decimal_with_i32_scale<T: std::fmt::Display>(value: T, scale: i32) -> String {
+     if scale >= 0 {
+         // Positive scale means divide (move decimal point left)
+         format!("{}e-{}", value, scale)
+     } else {
+         // Negative scale means multiply (move decimal point right)
+         format!("{}e{}", value, -scale)
+     }
+ }

  #[derive(Debug)]
  pub enum RowRecord<S: BuildHasher + Default> {
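
Both helpers lean on BigDecimal accepting exponent notation, so no decimal-point arithmetic is needed on the Rust side. A self-contained check of the formatting rule (assertions are illustrative, not from the gem's test suite):

    fn format_decimal_with_i8_scale<T: std::fmt::Display>(value: T, scale: i8) -> String {
        if scale >= 0 {
            format!("{}e-{}", value, scale) // point moves left: (12345, 2) -> 123.45
        } else {
            format!("{}e{}", value, -scale) // point moves right: (12345, -2) -> 1234500
        }
    }

    fn main() {
        assert_eq!(format_decimal_with_i8_scale(12345_i128, 2), "12345e-2");
        assert_eq!(format_decimal_with_i8_scale(12345_i128, -2), "12345e2");
        // Ruby side: BigDecimal("12345e-2") == 123.45, BigDecimal("12345e2") == 1234500
    }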
@@ -207,17 +231,17 @@ impl TryIntoValue for ParquetField {
              let value = match d {
                  Decimal::Int32 { value, scale, .. } => {
                      let unscaled = i32::from_be_bytes(value);
-                     format!("{}e-{}", unscaled, scale)
+                     format_decimal_with_i32_scale(unscaled, scale)
                  }
                  Decimal::Int64 { value, scale, .. } => {
                      let unscaled = i64::from_be_bytes(value);
-                     format!("{}e-{}", unscaled, scale)
+                     format_decimal_with_i32_scale(unscaled, scale)
                  }
                  Decimal::Bytes { value, scale, .. } => {
                      // value is a byte array containing the bytes for an i128 value in big endian order
                      let casted = value.as_bytes()[..16].try_into()?;
                      let unscaled = i128::from_be_bytes(casted);
-                     format!("{}e-{}", unscaled, scale)
+                     format_decimal_with_i32_scale(unscaled, scale)
                  }
              };

ext/parquet/src/types/type_conversion.rs CHANGED
@@ -2,8 +2,8 @@ use std::str::FromStr;
  use std::sync::Arc;

  use super::*;
- use arrow_array::builder::*;
  use arrow_array::builder::MapFieldNames;
+ use arrow_array::builder::*;
  use arrow_schema::{DataType, Field, Fields, TimeUnit};
  use jiff::tz::{Offset, TimeZone};
  use magnus::{RArray, RString, TryConvert};
@@ -368,15 +368,17 @@ fn create_arrow_builder_for_type(
      ParquetSchemaType::Primitive(PrimitiveType::Decimal128(precision, scale)) => {
          // Create a Decimal128Builder with specific precision and scale
          let builder = Decimal128Builder::with_capacity(cap);
-
+
          // Set precision and scale for the decimal and return the new builder
-         let builder_with_precision = builder.with_precision_and_scale(*precision, *scale).map_err(|e| {
-             MagnusError::new(
-                 magnus::exception::runtime_error(),
-                 format!("Failed to set precision and scale: {}", e),
-             )
-         })?;
-
+         let builder_with_precision = builder
+             .with_precision_and_scale(*precision, *scale)
+             .map_err(|e| {
+                 MagnusError::new(
+                     magnus::exception::runtime_error(),
+                     format!("Failed to set precision and scale: {}", e),
+                 )
+             })?;
+
          Ok(Box::new(builder_with_precision))
      }
      ParquetSchemaType::Primitive(PrimitiveType::String) => {
@@ -857,7 +859,7 @@ fn fill_builder(

      for val in values {
          match val {
-             ParquetValue::Decimal128(d) => typed_builder.append_value(*d),
+             ParquetValue::Decimal128(d, _scale) => typed_builder.append_value(*d),
              ParquetValue::Float64(f) => {
                  // Scale the float to the desired precision and scale
                  let scaled_value = (*f * 10_f64.powi(*scale as i32)) as i128;
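
A worked instance of that float-to-decimal scaling (values chosen for illustration; note that the plain `as i128` cast truncates toward zero rather than rounding):

    fn main() {
        let f = 1.5_f64;
        let scale = 2_i8;
        // 1.5 * 10^2 = 150.0 -> stored as Decimal128(150, 2), i.e. 1.50
        let scaled_value = (f * 10_f64.powi(scale as i32)) as i128;
        assert_eq!(scaled_value, 150);
    }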
@@ -1161,7 +1163,7 @@ fn fill_builder(
                      )
                  })?
                  .append_value(bytes),
-             ParquetValue::Decimal128(x) => typed_builder
+             ParquetValue::Decimal128(x, _scale) => typed_builder
                  .field_builder::<Decimal128Builder>(i)
                  .ok_or_else(|| {
                      MagnusError::new(
lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Parquet
-   VERSION = "0.5.4"
+   VERSION = "0.5.5"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: parquet
  version: !ruby/object:Gem::Version
-   version: 0.5.4
+   version: 0.5.5
  platform: ruby
  authors:
  - Nathan Jaremko
@@ -66,6 +66,7 @@ files:
  - ext/parquet/src/reader/mod.rs
  - ext/parquet/src/reader/parquet_column_reader.rs
  - ext/parquet/src/reader/parquet_row_reader.rs
+ - ext/parquet/src/reader/unified/mod.rs
  - ext/parquet/src/ruby_reader.rs
  - ext/parquet/src/types/core_types.rs
  - ext/parquet/src/types/mod.rs