parquet 0.5.9 → 0.5.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -12,27 +12,81 @@ use magnus::value::ReprValue;
12
12
  use magnus::{Error as MagnusError, Ruby, Value};
13
13
 
14
14
  use crate::header_cache::StringCache;
15
+ use crate::logger::RubyLogger;
15
16
  use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
16
17
  use crate::types::{ParquetGemError, TryIntoValue};
17
18
  use crate::ColumnRecord;
18
19
 
19
- /// Opens a parquet file or IO-like object for reading
20
- ///
21
- /// This function handles both file paths (as strings) and IO-like objects,
22
- /// returning either a File or a ThreadSafeRubyReader that can be used with
23
- /// parquet readers.
24
- pub fn open_parquet_source(
20
+ use super::format_detector::{detect_file_format, detect_format_from_extension, FileFormat};
21
+
22
+ /// Represents the different data sources we can open
23
+ pub enum DataSource {
24
+ Parquet(Either<File, ThreadSafeRubyReader>),
25
+ Arrow(Either<File, ThreadSafeRubyReader>),
26
+ }
27
+
28
+ /// Opens a data file (Parquet or Arrow) for reading, automatically detecting the format
29
+ pub fn open_data_source(
25
30
  ruby: Rc<Ruby>,
26
31
  to_read: Value,
27
- ) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
32
+ ruby_logger: &RubyLogger,
33
+ ) -> Result<DataSource, ParquetGemError> {
28
34
  if to_read.is_kind_of(ruby.class_string()) {
29
35
  let path_string = to_read.to_r_string()?;
30
36
  let file_path = unsafe { path_string.as_str()? };
31
- let file = File::open(file_path).map_err(ParquetGemError::from)?;
32
- Ok(Either::Left(file))
37
+
38
+ // Try to detect format from extension first
39
+ let format_hint = detect_format_from_extension(file_path);
40
+
41
+ let mut file = File::open(file_path).map_err(ParquetGemError::from)?;
42
+
43
+ // Detect actual format from file content
44
+ let format = detect_file_format(&mut file)?;
45
+
46
+ // Warn if extension doesn't match content
47
+ if let Some(hint) = format_hint {
48
+ if hint != format {
49
+ ruby_logger.warn(|| {
50
+ format!(
51
+ "Extension implied format {:?} but actual format is {:?}",
52
+ hint, format
53
+ )
54
+ })?;
55
+ }
56
+ }
57
+
58
+ match format {
59
+ FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
60
+ FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
61
+ }
33
62
  } else {
34
- let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
35
- Ok(Either::Right(readable))
63
+ // For IO-like objects, we need to use a temporary file
64
+ use std::io::{Read, Write};
65
+ use tempfile::NamedTempFile;
66
+
67
+ let mut readable = RubyReader::new(ruby.clone(), to_read)?;
68
+ let mut temp_file = NamedTempFile::new().map_err(ParquetGemError::from)?;
69
+
70
+ // Copy the entire content to the temporary file
71
+ let mut buffer = vec![0u8; 8192];
72
+ loop {
73
+ let bytes_read = readable.read(&mut buffer)?;
74
+ if bytes_read == 0 {
75
+ break;
76
+ }
77
+ temp_file.write_all(&buffer[..bytes_read])?;
78
+ }
79
+ temp_file.flush()?;
80
+
81
+ // Detect format from the temporary file
82
+ let mut file = temp_file.reopen()?;
83
+ let format = detect_file_format(&mut file)?;
84
+
85
+ // Use the temporary file as the source
86
+ match format {
87
+ FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
88
+ FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
89
+ }
36
90
  }
37
91
  }
38
92
 
@@ -0,0 +1,69 @@
1
+ use crate::types::ParquetGemError;
2
+ use std::io::{Read, Seek, SeekFrom};
3
+
4
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
5
+ pub enum FileFormat {
6
+ Parquet,
7
+ Arrow,
8
+ }
9
+
10
+ /// Detect the file format by examining magic bytes
11
+ pub fn detect_file_format<R: Read + Seek>(source: &mut R) -> Result<FileFormat, ParquetGemError> {
12
+ let mut magic = [0u8; 8];
13
+
14
+ // Read the first 8 bytes
15
+ let bytes_read = source.read(&mut magic).map_err(ParquetGemError::from)?;
16
+
17
+ // Reset to beginning
18
+ source
19
+ .seek(SeekFrom::Start(0))
20
+ .map_err(ParquetGemError::from)?;
21
+
22
+ if bytes_read >= 6 {
23
+ // Arrow IPC file format magic: "ARROW1\0\0"
24
+ if &magic[0..6] == b"ARROW1" {
25
+ return Ok(FileFormat::Arrow);
26
+ }
27
+ }
28
+
29
+ if bytes_read >= 4 {
30
+ // Parquet magic: "PAR1" at start
31
+ if &magic[0..4] == b"PAR1" {
32
+ return Ok(FileFormat::Parquet);
33
+ }
34
+ }
35
+
36
+ // If we can't detect from the beginning, check the end for Parquet
37
+ // Parquet files also have "PAR1" at the end
38
+ if let Ok(pos) = source.seek(SeekFrom::End(-4)) {
39
+ if pos >= 4 {
40
+ let mut end_magic = [0u8; 4];
41
+ if source.read_exact(&mut end_magic).is_ok() && &end_magic == b"PAR1" {
42
+ // Important: Reset to beginning before returning
43
+ source
44
+ .seek(SeekFrom::Start(0))
45
+ .map_err(ParquetGemError::from)?;
46
+ return Ok(FileFormat::Parquet);
47
+ }
48
+ }
49
+ }
50
+
51
+ // Always reset to beginning, even for unknown format
52
+ source
53
+ .seek(SeekFrom::Start(0))
54
+ .map_err(ParquetGemError::from)?;
55
+
56
+ Err(ParquetGemError::UnknownFormat)
57
+ }
58
+
59
+ /// Detect format from file extension as a fallback
60
+ pub fn detect_format_from_extension(path: &str) -> Option<FileFormat> {
61
+ let lower = path.to_lowercase();
62
+ if lower.ends_with(".parquet") || lower.ends_with(".parq") {
63
+ Some(FileFormat::Parquet)
64
+ } else if lower.ends_with(".arrow") || lower.ends_with(".feather") || lower.ends_with(".ipc") {
65
+ Some(FileFormat::Arrow)
66
+ } else {
67
+ None
68
+ }
69
+ }
@@ -1,4 +1,6 @@
1
+ mod arrow_reader;
1
2
  mod common;
3
+ mod format_detector;
2
4
  mod parquet_column_reader;
3
5
  mod parquet_row_reader;
4
6
  mod unified;
@@ -188,7 +190,10 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
188
190
  if args.len() != 1 {
189
191
  return Err(MagnusError::new(
190
192
  magnus::exception::arg_error(),
191
- format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
193
+ format!(
194
+ "metadata expects exactly 1 argument (file path or IO-like object), got {}",
195
+ args.len()
196
+ ),
192
197
  ));
193
198
  }
194
199
 
@@ -208,4 +213,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
208
213
  let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
209
214
 
210
215
  Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
211
- }
216
+ }
@@ -13,8 +13,11 @@ use std::collections::HashMap;
13
13
  use std::rc::Rc;
14
14
  use std::sync::OnceLock;
15
15
 
16
+ use super::arrow_reader::{
17
+ process_arrow_column_data, process_arrow_file_column_data, process_arrow_row_data,
18
+ };
16
19
  use super::common::{
17
- create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
20
+ create_batch_reader, handle_block_or_enum, handle_empty_file, open_data_source, DataSource,
18
21
  };
19
22
  use crate::types::ArrayWrapper;
20
23
 
@@ -100,34 +103,99 @@ pub fn parse_parquet_unified(
100
103
  }
101
104
  }
102
105
 
103
- // Open the Parquet source
104
- let source = open_parquet_source(ruby.clone(), to_read)?;
106
+ // Open the data source and detect format
107
+ let source = open_data_source(ruby.clone(), to_read, &ruby_logger)?;
105
108
 
106
- // Based on the parser type, handle the data differently
107
- match parser_type {
108
- ParserType::Row { strict } => {
109
- // Handle row-based parsing
109
+ // Based on the source format and parser type, handle the data differently
110
+ match (source, &parser_type) {
111
+ (DataSource::Parquet(reader), ParserType::Row { strict }) => {
112
+ // Handle Parquet row-based parsing
110
113
  process_row_data(
111
114
  ruby.clone(),
112
- source,
115
+ reader,
113
116
  &columns,
114
117
  result_type,
115
- strict,
118
+ *strict,
116
119
  &ruby_logger,
117
120
  )?;
118
121
  }
119
- ParserType::Column { batch_size, strict } => {
120
- // Handle column-based parsing
122
+ (DataSource::Parquet(reader), ParserType::Column { batch_size, strict }) => {
123
+ // Handle Parquet column-based parsing
121
124
  process_column_data(
122
125
  ruby.clone(),
123
- source,
126
+ reader,
124
127
  &columns,
125
128
  result_type,
126
- batch_size,
127
- strict,
129
+ *batch_size,
130
+ *strict,
128
131
  &ruby_logger,
129
132
  )?;
130
133
  }
134
+ (DataSource::Arrow(reader), ParserType::Row { strict }) => {
135
+ // Handle Arrow row-based parsing
136
+ match reader {
137
+ Either::Left(file) => {
138
+ // For seekable files, use FileReader which handles IPC file format
139
+ use arrow_ipc::reader::FileReader;
140
+ let file_reader = FileReader::try_new(file, None)
141
+ .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
142
+
143
+ use super::arrow_reader::process_arrow_file_row_data;
144
+ process_arrow_file_row_data(
145
+ ruby.clone(),
146
+ file_reader,
147
+ &columns,
148
+ result_type,
149
+ *strict,
150
+ &ruby_logger,
151
+ )?;
152
+ }
153
+ Either::Right(readable) => {
154
+ use arrow_ipc::reader::StreamReader;
155
+ let stream_reader = StreamReader::try_new(readable, None)
156
+ .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
157
+ process_arrow_row_data(
158
+ ruby.clone(),
159
+ stream_reader,
160
+ &columns,
161
+ result_type,
162
+ *strict,
163
+ &ruby_logger,
164
+ )?;
165
+ }
166
+ }
167
+ }
168
+ (DataSource::Arrow(reader), ParserType::Column { batch_size, strict }) => {
169
+ // Handle Arrow column-based parsing
170
+ match reader {
171
+ Either::Left(file) => {
172
+ // For seekable files, we can use the optimized FileReader
173
+ process_arrow_file_column_data(
174
+ ruby.clone(),
175
+ file,
176
+ &columns,
177
+ result_type,
178
+ *batch_size,
179
+ *strict,
180
+ &ruby_logger,
181
+ )?;
182
+ }
183
+ Either::Right(readable) => {
184
+ use arrow_ipc::reader::StreamReader;
185
+ let stream_reader = StreamReader::try_new(readable, None)
186
+ .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
187
+ process_arrow_column_data(
188
+ ruby.clone(),
189
+ stream_reader,
190
+ &columns,
191
+ result_type,
192
+ *batch_size,
193
+ *strict,
194
+ &ruby_logger,
195
+ )?;
196
+ }
197
+ }
198
+ }
131
199
  }
132
200
 
133
201
  Ok(ruby.qnil().into_value_with(&ruby))
@@ -108,6 +108,7 @@ pub enum PrimitiveType {
108
108
  Float32,
109
109
  Float64,
110
110
  Decimal128(u8, i8),
111
+ Decimal256(u8, i8),
111
112
  Boolean,
112
113
  String,
113
114
  Binary,
@@ -23,10 +23,11 @@ pub use writer_types::*;
23
23
  // Common imports used across the module
24
24
  use arrow_array::cast::downcast_array;
25
25
  use arrow_array::{
26
- Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Float16Array, Float32Array,
27
- Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, ListArray, NullArray, StringArray,
28
- StructArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
29
- TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
26
+ Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
27
+ Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
28
+ ListArray, NullArray, StringArray, StructArray, TimestampMicrosecondArray,
29
+ TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
30
+ UInt32Array, UInt64Array, UInt8Array,
30
31
  };
31
32
  use arrow_schema::{DataType, TimeUnit};
32
33
  use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
@@ -54,6 +55,10 @@ pub enum ParquetGemError {
54
55
  Parquet(#[from] parquet::errors::ParquetError),
55
56
  #[error("Arrow error: {0}")]
56
57
  Arrow(#[from] arrow_schema::ArrowError),
58
+ #[error("Arrow IPC error: {0}")]
59
+ ArrowIpc(String),
60
+ #[error("Unknown file format")]
61
+ UnknownFormat,
57
62
  #[error("UTF-8 error: {0}")]
58
63
  Utf8Error(#[from] simdutf8::basic::Utf8Error),
59
64
  #[error("Jiff error: {0}")]
@@ -62,6 +67,8 @@ pub enum ParquetGemError {
62
67
  InvalidDecimal(String),
63
68
  #[error("Failed to parse UUID: {0}")]
64
69
  UuidError(#[from] uuid::Error),
70
+ #[error("Decimals larger than 128 bits are not supported")]
71
+ DecimalWouldBeTruncated,
65
72
  }
66
73
 
67
74
  #[derive(Debug)]