parquet 0.5.10 → 0.5.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 114891cfa5fa190e1f00d44803327f1c90cc11f64ba23f7f2a9cc9f9379da787
-  data.tar.gz: 9168b2be960faa93ce9c84d170c6e8f73819535bcedbf3d3b26869ff9829ecc6
+  metadata.gz: 82528b663c4a577262db90b6d17ba473a81d0ea725ceba486b63a3619040fa73
+  data.tar.gz: 2e44daa9b4e36ef1503589daaa0815cbc3acee10c565d9942f6c0b6d35ced5f0
 SHA512:
-  metadata.gz: f07f99a188ac5fa0663616fba00b1990a2cbd6bb14462383915f0e1617c26c5ca481840c16179958f2b3760b334f176e2e4542d95e3cc922379948ac2b0bfa61
-  data.tar.gz: 42c7b0779d6e3fa46addc5fa92420f326418a54962d391e9b063db8378f8a5f8c2916b43f356649fc127e8fc582aa1e98d7afd71f0bc5f9700a0664ed46313f6
+  metadata.gz: 418951253384f5492385fcb30fa5b0113b85d9bc51346b6abad16105c124d8869266943c1a29bc0879cfee4270b94d32fb99004e233c6ebde4a70e1d329435af
+  data.tar.gz: bc0db4ebb36add314253b5b9b946cc2c84f315d51ba7fefbead6c7de3b65a3f7752fa4e4cf0be19704405b390ae0106d8383e30791e7fac4a86a75141c214de1
ext/parquet/src/reader/common.rs CHANGED
@@ -12,81 +12,27 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 
 use crate::header_cache::StringCache;
-use crate::logger::RubyLogger;
 use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
 use crate::types::{ParquetGemError, TryIntoValue};
 use crate::ColumnRecord;
 
-use super::format_detector::{detect_file_format, detect_format_from_extension, FileFormat};
-
-/// Represents the different data sources we can open
-pub enum DataSource {
-    Parquet(Either<File, ThreadSafeRubyReader>),
-    Arrow(Either<File, ThreadSafeRubyReader>),
-}
-
-/// Opens a data file (Parquet or Arrow) for reading, automatically detecting the format
-pub fn open_data_source(
+/// Opens a parquet file or IO-like object for reading
+///
+/// This function handles both file paths (as strings) and IO-like objects,
+/// returning either a File or a ThreadSafeRubyReader that can be used with
+/// parquet readers.
+pub fn open_parquet_source(
     ruby: Rc<Ruby>,
     to_read: Value,
-    ruby_logger: &RubyLogger,
-) -> Result<DataSource, ParquetGemError> {
+) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
     if to_read.is_kind_of(ruby.class_string()) {
         let path_string = to_read.to_r_string()?;
         let file_path = unsafe { path_string.as_str()? };
-
-        // Try to detect format from extension first
-        let format_hint = detect_format_from_extension(file_path);
-
-        let mut file = File::open(file_path).map_err(ParquetGemError::from)?;
-
-        // Detect actual format from file content
-        let format = detect_file_format(&mut file)?;
-
-        // Warn if extension doesn't match content
-        if let Some(hint) = format_hint {
-            if hint != format {
-                ruby_logger.warn(|| {
-                    format!(
-                        "Extension implied format {:?} but actual format is {:?}",
-                        hint, format
-                    )
-                })?;
-            }
-        }
-
-        match format {
-            FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
-            FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
-        }
+        let file = File::open(file_path).map_err(ParquetGemError::from)?;
+        Ok(Either::Left(file))
     } else {
-        // For IO-like objects, we need to use a temporary file
-        use std::io::{Read, Write};
-        use tempfile::NamedTempFile;
-
-        let mut readable = RubyReader::new(ruby.clone(), to_read)?;
-        let mut temp_file = NamedTempFile::new().map_err(ParquetGemError::from)?;
-
-        // Copy the entire content to the temporary file
-        let mut buffer = vec![0u8; 8192];
-        loop {
-            let bytes_read = readable.read(&mut buffer)?;
-            if bytes_read == 0 {
-                break;
-            }
-            temp_file.write_all(&buffer[..bytes_read])?;
-        }
-        temp_file.flush()?;
-
-        // Detect format from the temporary file
-        let mut file = temp_file.reopen()?;
-        let format = detect_file_format(&mut file)?;
-
-        // Use the temporary file as the source
-        match format {
-            FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
-            FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
-        }
+        let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
+        Ok(Either::Right(readable))
     }
 }
 
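Note: the new open_parquet_source returns an either::Either, so downstream code can stay agnostic about whether it got a File or a Ruby-backed reader. A minimal sketch of how a caller might consume that return value (read_all is an illustrative name, not code from the gem):

    use either::Either;
    use std::fs::File;
    use std::io::Read;

    // Consume Either<File, R> uniformly; both branches implement Read,
    // so the match only exists to move the value out of the enum.
    fn read_all<R: Read>(source: Either<File, R>) -> std::io::Result<Vec<u8>> {
        let mut buf = Vec::new();
        match source {
            Either::Left(mut file) => file.read_to_end(&mut buf)?,
            Either::Right(mut reader) => reader.read_to_end(&mut buf)?,
        };
        Ok(buf)
    }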
ext/parquet/src/reader/mod.rs CHANGED
@@ -1,6 +1,4 @@
-mod arrow_reader;
 mod common;
-mod format_detector;
 mod parquet_column_reader;
 mod parquet_row_reader;
 mod unified;
@@ -190,10 +188,7 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     if args.len() != 1 {
         return Err(MagnusError::new(
            magnus::exception::arg_error(),
-            format!(
-                "metadata expects exactly 1 argument (file path or IO-like object), got {}",
-                args.len()
-            ),
+            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
        ));
    }
 
@@ -213,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
 
     Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
-}
+}
ext/parquet/src/reader/unified.rs CHANGED
@@ -13,11 +13,8 @@ use std::collections::HashMap;
 use std::rc::Rc;
 use std::sync::OnceLock;
 
-use super::arrow_reader::{
-    process_arrow_column_data, process_arrow_file_column_data, process_arrow_row_data,
-};
 use super::common::{
-    create_batch_reader, handle_block_or_enum, handle_empty_file, open_data_source, DataSource,
+    create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
 use crate::types::ArrayWrapper;
 
@@ -103,99 +100,34 @@ pub fn parse_parquet_unified(
        }
    }
 
-    // Open the data source and detect format
-    let source = open_data_source(ruby.clone(), to_read, &ruby_logger)?;
+    // Open the Parquet source
+    let source = open_parquet_source(ruby.clone(), to_read)?;
 
-    // Based on the source format and parser type, handle the data differently
-    match (source, &parser_type) {
-        (DataSource::Parquet(reader), ParserType::Row { strict }) => {
-            // Handle Parquet row-based parsing
+    // Based on the parser type, handle the data differently
+    match parser_type {
+        ParserType::Row { strict } => {
+            // Handle row-based parsing
             process_row_data(
                 ruby.clone(),
-                reader,
+                source,
                 &columns,
                 result_type,
-                *strict,
+                strict,
                 &ruby_logger,
             )?;
         }
-        (DataSource::Parquet(reader), ParserType::Column { batch_size, strict }) => {
-            // Handle Parquet column-based parsing
+        ParserType::Column { batch_size, strict } => {
+            // Handle column-based parsing
             process_column_data(
                 ruby.clone(),
-                reader,
+                source,
                 &columns,
                 result_type,
-                *batch_size,
-                *strict,
+                batch_size,
+                strict,
                 &ruby_logger,
             )?;
         }
-        (DataSource::Arrow(reader), ParserType::Row { strict }) => {
-            // Handle Arrow row-based parsing
-            match reader {
-                Either::Left(file) => {
-                    // For seekable files, use FileReader which handles IPC file format
-                    use arrow_ipc::reader::FileReader;
-                    let file_reader = FileReader::try_new(file, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                    use super::arrow_reader::process_arrow_file_row_data;
-                    process_arrow_file_row_data(
-                        ruby.clone(),
-                        file_reader,
-                        &columns,
-                        result_type,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-                Either::Right(readable) => {
-                    use arrow_ipc::reader::StreamReader;
-                    let stream_reader = StreamReader::try_new(readable, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    process_arrow_row_data(
-                        ruby.clone(),
-                        stream_reader,
-                        &columns,
-                        result_type,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-            }
-        }
-        (DataSource::Arrow(reader), ParserType::Column { batch_size, strict }) => {
-            // Handle Arrow column-based parsing
-            match reader {
-                Either::Left(file) => {
-                    // For seekable files, we can use the optimized FileReader
-                    process_arrow_file_column_data(
-                        ruby.clone(),
-                        file,
-                        &columns,
-                        result_type,
-                        *batch_size,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-                Either::Right(readable) => {
-                    use arrow_ipc::reader::StreamReader;
-                    let stream_reader = StreamReader::try_new(readable, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    process_arrow_column_data(
-                        ruby.clone(),
-                        stream_reader,
-                        &columns,
-                        result_type,
-                        *batch_size,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-            }
-        }
     }
 
     Ok(ruby.qnil().into_value_with(&ruby))
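Note: with the format dispatch gone, the match is now over parser_type by value rather than over (source, &parser_type), which is why the *strict and *batch_size derefs disappear above. A standalone sketch of that pattern (ParserType here is a stand-in with the same shape, not the gem's definition):

    enum ParserType {
        Row { strict: bool },
        Column { batch_size: Option<usize>, strict: bool },
    }

    fn dispatch(parser: ParserType) {
        // Matching by value moves the fields out, so `strict` binds a bool
        // directly; matching on a reference would bind `&bool` instead.
        match parser {
            ParserType::Row { strict } => println!("rows, strict={strict}"),
            ParserType::Column { batch_size, strict } => {
                println!("columns, batch_size={batch_size:?}, strict={strict}")
            }
        }
    }

    fn main() {
        dispatch(ParserType::Row { strict: true });
        dispatch(ParserType::Column { batch_size: Some(1024), strict: false });
    }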
ext/parquet/src/types.rs CHANGED
@@ -55,10 +55,6 @@ pub enum ParquetGemError {
     Parquet(#[from] parquet::errors::ParquetError),
     #[error("Arrow error: {0}")]
     Arrow(#[from] arrow_schema::ArrowError),
-    #[error("Arrow IPC error: {0}")]
-    ArrowIpc(String),
-    #[error("Unknown file format")]
-    UnknownFormat,
     #[error("UTF-8 error: {0}")]
     Utf8Error(#[from] simdutf8::basic::Utf8Error),
     #[error("Jiff error: {0}")]
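Note: the two dropped variants follow the error enum's existing pattern, where #[from] derives the From impl that makes `?` conversions work. A minimal sketch of that mechanism (GemError and open_file are illustrative names, assuming the thiserror crate):

    use thiserror::Error;

    #[derive(Debug, Error)]
    enum GemError {
        #[error("IO error: {0}")]
        Io(#[from] std::io::Error),
        #[error("Unknown file format")]
        UnknownFormat,
    }

    fn open_file(path: &str) -> Result<std::fs::File, GemError> {
        // `?` converts std::io::Error into GemError::Io via the derived From.
        Ok(std::fs::File::open(path)?)
    }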
lib/parquet/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Parquet
-  VERSION = "0.5.10"
+  VERSION = "0.5.11"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.5.10
+  version: 0.5.11
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-06-17 00:00:00.000000000 Z
+date: 2025-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -62,9 +62,7 @@ files:
 - ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
 - ext/parquet/src/logger.rs
-- ext/parquet/src/reader/arrow_reader.rs
 - ext/parquet/src/reader/common.rs
-- ext/parquet/src/reader/format_detector.rs
 - ext/parquet/src/reader/mod.rs
 - ext/parquet/src/reader/parquet_column_reader.rs
 - ext/parquet/src/reader/parquet_row_reader.rs
ext/parquet/src/reader/arrow_reader.rs DELETED
@@ -1,579 +0,0 @@
-use crate::header_cache::StringCache;
-use crate::logger::RubyLogger;
-use crate::types::ArrayWrapper;
-use crate::types::{
-    ColumnRecord, ParquetGemError, ParquetValueVec, ParserResultType, RowRecord, TryIntoValue,
-};
-use ahash::RandomState;
-use arrow_array::RecordBatch;
-use arrow_ipc::reader::{FileReader, StreamReader};
-use arrow_schema::Schema;
-use magnus::{Ruby, Value};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::Read;
-use std::rc::Rc;
-use std::sync::{Arc, OnceLock};
-
-/// Process Arrow IPC file data for column-based parsing
-pub fn process_arrow_column_data<R: Read>(
-    ruby: Rc<Ruby>,
-    reader: StreamReader<R>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    _batch_size: Option<usize>,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
-
-    // Filter schema if columns are specified
-    let _filtered_schema = if let Some(cols) = columns {
-        let mut fields = Vec::new();
-        for field in schema.fields() {
-            if cols.contains(&field.name().to_string()) {
-                fields.push(field.clone());
-            }
-        }
-        Arc::new(Schema::new(fields))
-    } else {
-        schema.clone()
-    };
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                let mut map =
-                    HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
-
-                batch
-                    .columns()
-                    .iter()
-                    .enumerate()
-                    .try_for_each(|(i, column)| {
-                        let header = local_headers[i];
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        map.insert(header, values.into_inner());
-                        Ok::<_, ParquetGemError>(())
-                    })?;
-
-                let record = ColumnRecord::Map::<RandomState>(map);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let vec = batch
-                    .columns()
-                    .iter()
-                    .map(|column| {
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        Ok::<_, ParquetGemError>(values.into_inner())
-                    })
-                    .collect::<Result<Vec<_>, _>>()?;
-
-                let record = ColumnRecord::Vec::<RandomState>(vec);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process Arrow IPC file data for row-based parsing
-pub fn process_arrow_row_data<R: Read>(
-    ruby: Rc<Ruby>,
-    reader: StreamReader<R>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        local_headers.len(),
-                        RandomState::default(),
-                    );
-
-                    for (col_idx, column) in batch.columns().iter().enumerate() {
-                        let header = local_headers[col_idx];
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        map.insert(header, value);
-                    }
-
-                    let record = RowRecord::Map::<RandomState>(map);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut row_vec = Vec::with_capacity(batch.num_columns());
-
-                    for column in batch.columns() {
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        row_vec.push(value);
-                    }
-
-                    let record = RowRecord::Vec::<RandomState>(row_vec);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process Arrow IPC file with FileReader for row-based parsing
-pub fn process_arrow_file_row_data(
-    ruby: Rc<Ruby>,
-    reader: FileReader<File>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        local_headers.len(),
-                        RandomState::default(),
-                    );
-
-                    for (col_idx, column) in batch.columns().iter().enumerate() {
-                        let header = local_headers[col_idx];
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        map.insert(header, value);
-                    }
-
-                    let record = RowRecord::Map::<RandomState>(map);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut row_vec = Vec::with_capacity(batch.num_columns());
-
-                    for column in batch.columns() {
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        row_vec.push(value);
-                    }
-
-                    let record = RowRecord::Vec::<RandomState>(row_vec);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process Arrow IPC file with FileReader (for seekable sources)
-pub fn process_arrow_file_column_data(
-    ruby: Rc<Ruby>,
-    file: File,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    _batch_size: Option<usize>,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let reader =
-        FileReader::try_new(file, None).map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
-
-    // FileReader implements Iterator<Item = Result<RecordBatch, ArrowError>>
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                let mut map =
-                    HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
-
-                batch
-                    .columns()
-                    .iter()
-                    .enumerate()
-                    .try_for_each(|(i, column)| {
-                        let header = local_headers[i];
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        map.insert(header, values.into_inner());
-                        Ok::<_, ParquetGemError>(())
-                    })?;
-
-                let record = ColumnRecord::Map::<RandomState>(map);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let vec = batch
-                    .columns()
-                    .iter()
-                    .map(|column| {
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        Ok::<_, ParquetGemError>(values.into_inner())
-                    })
-                    .collect::<Result<Vec<_>, _>>()?;
-
-                let record = ColumnRecord::Vec::<RandomState>(vec);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Extract a single value from an Arrow array at a specific index
-fn extract_value_at_index(
-    array: &Arc<dyn arrow_array::Array>,
-    index: usize,
-    strict: bool,
-) -> Result<crate::types::ParquetField, ParquetGemError> {
-    use crate::types::ParquetField;
-    use arrow_array::*;
-    use arrow_schema::DataType;
-    use parquet::record::Field;
-
-    // Convert Arrow array value at index to Parquet Field
-    let field = match array.data_type() {
-        DataType::Boolean => {
-            let arr = array.as_any().downcast_ref::<BooleanArray>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Bool(arr.value(index))
-            }
-        }
-        DataType::Int8 => {
-            let arr = array.as_any().downcast_ref::<Int8Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Byte(arr.value(index) as i8)
-            }
-        }
-        DataType::Int16 => {
-            let arr = array.as_any().downcast_ref::<Int16Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Short(arr.value(index))
-            }
-        }
-        DataType::Int32 => {
-            let arr = array.as_any().downcast_ref::<Int32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Int(arr.value(index))
-            }
-        }
-        DataType::Int64 => {
-            let arr = array.as_any().downcast_ref::<Int64Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Long(arr.value(index))
-            }
-        }
-        DataType::UInt8 => {
-            let arr = array.as_any().downcast_ref::<UInt8Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::UByte(arr.value(index))
-            }
-        }
-        DataType::UInt16 => {
-            let arr = array.as_any().downcast_ref::<UInt16Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::UShort(arr.value(index))
-            }
-        }
-        DataType::UInt32 => {
-            let arr = array.as_any().downcast_ref::<UInt32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::UInt(arr.value(index))
-            }
-        }
-        DataType::UInt64 => {
-            let arr = array.as_any().downcast_ref::<UInt64Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::ULong(arr.value(index))
-            }
-        }
-        DataType::Float32 => {
-            let arr = array.as_any().downcast_ref::<Float32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Float(arr.value(index))
-            }
-        }
-        DataType::Float64 => {
-            let arr = array.as_any().downcast_ref::<Float64Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Double(arr.value(index))
-            }
-        }
-        DataType::Utf8 => {
-            let arr = array.as_any().downcast_ref::<StringArray>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Str(arr.value(index).to_string())
-            }
-        }
-        DataType::Binary => {
-            let arr = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Bytes(arr.value(index).into())
-            }
-        }
-        DataType::Date32 => {
-            let arr = array.as_any().downcast_ref::<Date32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Date(arr.value(index))
-            }
-        }
-        DataType::Timestamp(unit, _tz) => match unit {
-            arrow_schema::TimeUnit::Millisecond => {
-                let arr = array
-                    .as_any()
-                    .downcast_ref::<TimestampMillisecondArray>()
-                    .unwrap();
-                if arr.is_null(index) {
-                    Field::Null
-                } else {
-                    Field::TimestampMillis(arr.value(index))
-                }
-            }
-            arrow_schema::TimeUnit::Microsecond => {
-                let arr = array
-                    .as_any()
-                    .downcast_ref::<TimestampMicrosecondArray>()
-                    .unwrap();
-                if arr.is_null(index) {
-                    Field::Null
-                } else {
-                    Field::TimestampMicros(arr.value(index))
-                }
-            }
-            _ => Field::Null,
-        },
-        // Add more type handling as needed
-        _ => Field::Null,
-    };
-
-    // For Arrow files, we don't have Parquet logical types, so we use defaults
-    Ok(ParquetField {
-        field,
-        converted_type: parquet::basic::ConvertedType::NONE,
-        logical_type: None,
-        strict,
-    })
-}
-
-/// Filter a RecordBatch to only include specified columns
-fn filter_record_batch(
-    batch: &RecordBatch,
-    columns: &[String],
-) -> Result<RecordBatch, ParquetGemError> {
-    let schema = batch.schema();
-    let mut indices = Vec::new();
-    let mut fields = Vec::new();
-
-    for (i, field) in schema.fields().iter().enumerate() {
-        if columns.contains(&field.name().to_string()) {
-            indices.push(i);
-            fields.push(field.clone());
-        }
-    }
-
-    let new_schema = Arc::new(Schema::new(fields));
-    let new_columns: Vec<_> = indices.iter().map(|&i| batch.column(i).clone()).collect();
-
-    RecordBatch::try_new(new_schema, new_columns)
-        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))
-}
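Note: the deleted extract_value_at_index above is one long downcast-per-DataType ladder. A standalone sketch of that core pattern for a single type (value_at is an illustrative name, assuming the arrow-array crate):

    use arrow_array::{Array, Int32Array};

    // Downcast a type-erased Arrow array and pull out one element,
    // mapping Arrow's validity bitmap to Option.
    fn value_at(array: &dyn Array, index: usize) -> Option<i32> {
        let ints = array.as_any().downcast_ref::<Int32Array>()?;
        if ints.is_null(index) {
            None
        } else {
            Some(ints.value(index))
        }
    }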
ext/parquet/src/reader/format_detector.rs DELETED
@@ -1,69 +0,0 @@
-use crate::types::ParquetGemError;
-use std::io::{Read, Seek, SeekFrom};
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum FileFormat {
-    Parquet,
-    Arrow,
-}
-
-/// Detect the file format by examining magic bytes
-pub fn detect_file_format<R: Read + Seek>(source: &mut R) -> Result<FileFormat, ParquetGemError> {
-    let mut magic = [0u8; 8];
-
-    // Read the first 8 bytes
-    let bytes_read = source.read(&mut magic).map_err(ParquetGemError::from)?;
-
-    // Reset to beginning
-    source
-        .seek(SeekFrom::Start(0))
-        .map_err(ParquetGemError::from)?;
-
-    if bytes_read >= 6 {
-        // Arrow IPC file format magic: "ARROW1\0\0"
-        if &magic[0..6] == b"ARROW1" {
-            return Ok(FileFormat::Arrow);
-        }
-    }
-
-    if bytes_read >= 4 {
-        // Parquet magic: "PAR1" at start
-        if &magic[0..4] == b"PAR1" {
-            return Ok(FileFormat::Parquet);
-        }
-    }
-
-    // If we can't detect from the beginning, check the end for Parquet
-    // Parquet files also have "PAR1" at the end
-    if let Ok(pos) = source.seek(SeekFrom::End(-4)) {
-        if pos >= 4 {
-            let mut end_magic = [0u8; 4];
-            if source.read_exact(&mut end_magic).is_ok() && &end_magic == b"PAR1" {
-                // Important: Reset to beginning before returning
-                source
-                    .seek(SeekFrom::Start(0))
-                    .map_err(ParquetGemError::from)?;
-                return Ok(FileFormat::Parquet);
-            }
-        }
-    }
-
-    // Always reset to beginning, even for unknown format
-    source
-        .seek(SeekFrom::Start(0))
-        .map_err(ParquetGemError::from)?;
-
-    Err(ParquetGemError::UnknownFormat)
-}
-
-/// Detect format from file extension as a fallback
-pub fn detect_format_from_extension(path: &str) -> Option<FileFormat> {
-    let lower = path.to_lowercase();
-    if lower.ends_with(".parquet") || lower.ends_with(".parq") {
-        Some(FileFormat::Parquet)
-    } else if lower.ends_with(".arrow") || lower.ends_with(".feather") || lower.ends_with(".ipc") {
-        Some(FileFormat::Arrow)
-    } else {
-        None
-    }
-}
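Note: with this file removed, nothing in the diff performs magic-byte sniffing anymore. For reference, a standalone sketch of the leading-magic check the detector performed (looks_like_parquet is an illustrative name; error handling is simplified):

    use std::io::{Read, Seek, SeekFrom};

    // Check for Parquet's leading "PAR1" magic, then rewind so the caller
    // can re-read the stream from the start.
    fn looks_like_parquet<R: Read + Seek>(source: &mut R) -> std::io::Result<bool> {
        let mut magic = [0u8; 4];
        let n = source.read(&mut magic)?;
        source.seek(SeekFrom::Start(0))?;
        Ok(n >= 4 && &magic == b"PAR1")
    }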