parquet 0.5.9 → 0.5.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +3 -0
- data/ext/parquet/Cargo.toml +2 -0
- data/ext/parquet/build.rs +1 -1
- data/ext/parquet/src/lib.rs +3 -0
- data/ext/parquet/src/reader/arrow_reader.rs +579 -0
- data/ext/parquet/src/reader/common.rs +65 -11
- data/ext/parquet/src/reader/format_detector.rs +69 -0
- data/ext/parquet/src/reader/mod.rs +7 -2
- data/ext/parquet/src/reader/unified/mod.rs +82 -14
- data/ext/parquet/src/types/core_types.rs +1 -0
- data/ext/parquet/src/types/mod.rs +11 -4
- data/ext/parquet/src/types/parquet_value.rs +290 -73
- data/ext/parquet/src/types/record_types.rs +92 -8
- data/ext/parquet/src/types/schema_node.rs +11 -5
- data/ext/parquet/src/types/type_conversion.rs +216 -0
- data/ext/parquet/src/types/writer_types.rs +50 -0
- data/ext/parquet/src/writer/mod.rs +3 -0
- data/ext/parquet/src/writer/write_columns.rs +3 -0
- data/ext/parquet/src/writer/write_rows.rs +1 -0
- data/lib/parquet/version.rb +1 -1
- metadata +4 -2
@@ -12,27 +12,81 @@ use magnus::value::ReprValue;
|
|
12
12
|
use magnus::{Error as MagnusError, Ruby, Value};
|
13
13
|
|
14
14
|
use crate::header_cache::StringCache;
|
15
|
+
use crate::logger::RubyLogger;
|
15
16
|
use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
|
16
17
|
use crate::types::{ParquetGemError, TryIntoValue};
|
17
18
|
use crate::ColumnRecord;
|
18
19
|
|
19
|
-
|
20
|
-
|
21
|
-
///
|
22
|
-
|
23
|
-
|
24
|
-
|
20
|
+
use super::format_detector::{detect_file_format, detect_format_from_extension, FileFormat};
|
21
|
+
|
22
|
+
/// Represents the different data sources we can open
|
23
|
+
pub enum DataSource {
|
24
|
+
Parquet(Either<File, ThreadSafeRubyReader>),
|
25
|
+
Arrow(Either<File, ThreadSafeRubyReader>),
|
26
|
+
}
|
27
|
+
|
28
|
+
/// Opens a data file (Parquet or Arrow) for reading, automatically detecting the format
|
29
|
+
pub fn open_data_source(
|
25
30
|
ruby: Rc<Ruby>,
|
26
31
|
to_read: Value,
|
27
|
-
|
32
|
+
ruby_logger: &RubyLogger,
|
33
|
+
) -> Result<DataSource, ParquetGemError> {
|
28
34
|
if to_read.is_kind_of(ruby.class_string()) {
|
29
35
|
let path_string = to_read.to_r_string()?;
|
30
36
|
let file_path = unsafe { path_string.as_str()? };
|
31
|
-
|
32
|
-
|
37
|
+
|
38
|
+
// Try to detect format from extension first
|
39
|
+
let format_hint = detect_format_from_extension(file_path);
|
40
|
+
|
41
|
+
let mut file = File::open(file_path).map_err(ParquetGemError::from)?;
|
42
|
+
|
43
|
+
// Detect actual format from file content
|
44
|
+
let format = detect_file_format(&mut file)?;
|
45
|
+
|
46
|
+
// Warn if extension doesn't match content
|
47
|
+
if let Some(hint) = format_hint {
|
48
|
+
if hint != format {
|
49
|
+
ruby_logger.warn(|| {
|
50
|
+
format!(
|
51
|
+
"Extension implied format {:?} but actual format is {:?}",
|
52
|
+
hint, format
|
53
|
+
)
|
54
|
+
})?;
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
match format {
|
59
|
+
FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
|
60
|
+
FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
|
61
|
+
}
|
33
62
|
} else {
|
34
|
-
|
35
|
-
|
63
|
+
// For IO-like objects, we need to use a temporary file
|
64
|
+
use std::io::{Read, Write};
|
65
|
+
use tempfile::NamedTempFile;
|
66
|
+
|
67
|
+
let mut readable = RubyReader::new(ruby.clone(), to_read)?;
|
68
|
+
let mut temp_file = NamedTempFile::new().map_err(ParquetGemError::from)?;
|
69
|
+
|
70
|
+
// Copy the entire content to the temporary file
|
71
|
+
let mut buffer = vec![0u8; 8192];
|
72
|
+
loop {
|
73
|
+
let bytes_read = readable.read(&mut buffer)?;
|
74
|
+
if bytes_read == 0 {
|
75
|
+
break;
|
76
|
+
}
|
77
|
+
temp_file.write_all(&buffer[..bytes_read])?;
|
78
|
+
}
|
79
|
+
temp_file.flush()?;
|
80
|
+
|
81
|
+
// Detect format from the temporary file
|
82
|
+
let mut file = temp_file.reopen()?;
|
83
|
+
let format = detect_file_format(&mut file)?;
|
84
|
+
|
85
|
+
// Use the temporary file as the source
|
86
|
+
match format {
|
87
|
+
FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
|
88
|
+
FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
|
89
|
+
}
|
36
90
|
}
|
37
91
|
}
|
38
92
|
|
@@ -0,0 +1,69 @@
|
|
1
|
+
use crate::types::ParquetGemError;
|
2
|
+
use std::io::{Read, Seek, SeekFrom};
|
3
|
+
|
4
|
+
/// On-disk formats that the reader can tell apart.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum FileFormat {
    /// Apache Parquet file.
    Parquet,
    /// Arrow IPC file.
    Arrow,
}
|
9
|
+
|
10
|
+
/// Detect the file format by examining magic bytes
|
11
|
+
pub fn detect_file_format<R: Read + Seek>(source: &mut R) -> Result<FileFormat, ParquetGemError> {
|
12
|
+
let mut magic = [0u8; 8];
|
13
|
+
|
14
|
+
// Read the first 8 bytes
|
15
|
+
let bytes_read = source.read(&mut magic).map_err(ParquetGemError::from)?;
|
16
|
+
|
17
|
+
// Reset to beginning
|
18
|
+
source
|
19
|
+
.seek(SeekFrom::Start(0))
|
20
|
+
.map_err(ParquetGemError::from)?;
|
21
|
+
|
22
|
+
if bytes_read >= 6 {
|
23
|
+
// Arrow IPC file format magic: "ARROW1\0\0"
|
24
|
+
if &magic[0..6] == b"ARROW1" {
|
25
|
+
return Ok(FileFormat::Arrow);
|
26
|
+
}
|
27
|
+
}
|
28
|
+
|
29
|
+
if bytes_read >= 4 {
|
30
|
+
// Parquet magic: "PAR1" at start
|
31
|
+
if &magic[0..4] == b"PAR1" {
|
32
|
+
return Ok(FileFormat::Parquet);
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
// If we can't detect from the beginning, check the end for Parquet
|
37
|
+
// Parquet files also have "PAR1" at the end
|
38
|
+
if let Ok(pos) = source.seek(SeekFrom::End(-4)) {
|
39
|
+
if pos >= 4 {
|
40
|
+
let mut end_magic = [0u8; 4];
|
41
|
+
if source.read_exact(&mut end_magic).is_ok() && &end_magic == b"PAR1" {
|
42
|
+
// Important: Reset to beginning before returning
|
43
|
+
source
|
44
|
+
.seek(SeekFrom::Start(0))
|
45
|
+
.map_err(ParquetGemError::from)?;
|
46
|
+
return Ok(FileFormat::Parquet);
|
47
|
+
}
|
48
|
+
}
|
49
|
+
}
|
50
|
+
|
51
|
+
// Always reset to beginning, even for unknown format
|
52
|
+
source
|
53
|
+
.seek(SeekFrom::Start(0))
|
54
|
+
.map_err(ParquetGemError::from)?;
|
55
|
+
|
56
|
+
Err(ParquetGemError::UnknownFormat)
|
57
|
+
}
|
58
|
+
|
59
|
+
/// Detect format from file extension as a fallback
|
60
|
+
pub fn detect_format_from_extension(path: &str) -> Option<FileFormat> {
|
61
|
+
let lower = path.to_lowercase();
|
62
|
+
if lower.ends_with(".parquet") || lower.ends_with(".parq") {
|
63
|
+
Some(FileFormat::Parquet)
|
64
|
+
} else if lower.ends_with(".arrow") || lower.ends_with(".feather") || lower.ends_with(".ipc") {
|
65
|
+
Some(FileFormat::Arrow)
|
66
|
+
} else {
|
67
|
+
None
|
68
|
+
}
|
69
|
+
}
|
@@ -1,4 +1,6 @@
|
|
1
|
+
mod arrow_reader;
|
1
2
|
mod common;
|
3
|
+
mod format_detector;
|
2
4
|
mod parquet_column_reader;
|
3
5
|
mod parquet_row_reader;
|
4
6
|
mod unified;
|
@@ -188,7 +190,10 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
|
|
188
190
|
if args.len() != 1 {
|
189
191
|
return Err(MagnusError::new(
|
190
192
|
magnus::exception::arg_error(),
|
191
|
-
format!(
|
193
|
+
format!(
|
194
|
+
"metadata expects exactly 1 argument (file path or IO-like object), got {}",
|
195
|
+
args.len()
|
196
|
+
),
|
192
197
|
));
|
193
198
|
}
|
194
199
|
|
@@ -208,4 +213,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
|
|
208
213
|
let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
|
209
214
|
|
210
215
|
Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
|
211
|
-
}
|
216
|
+
}
|
@@ -13,8 +13,11 @@ use std::collections::HashMap;
|
|
13
13
|
use std::rc::Rc;
|
14
14
|
use std::sync::OnceLock;
|
15
15
|
|
16
|
+
use super::arrow_reader::{
|
17
|
+
process_arrow_column_data, process_arrow_file_column_data, process_arrow_row_data,
|
18
|
+
};
|
16
19
|
use super::common::{
|
17
|
-
create_batch_reader, handle_block_or_enum, handle_empty_file,
|
20
|
+
create_batch_reader, handle_block_or_enum, handle_empty_file, open_data_source, DataSource,
|
18
21
|
};
|
19
22
|
use crate::types::ArrayWrapper;
|
20
23
|
|
@@ -100,34 +103,99 @@ pub fn parse_parquet_unified(
|
|
100
103
|
}
|
101
104
|
}
|
102
105
|
|
103
|
-
// Open the
|
104
|
-
let source =
|
106
|
+
// Open the data source and detect format
|
107
|
+
let source = open_data_source(ruby.clone(), to_read, &ruby_logger)?;
|
105
108
|
|
106
|
-
// Based on the parser type, handle the data differently
|
107
|
-
match parser_type {
|
108
|
-
ParserType::Row { strict } => {
|
109
|
-
// Handle row-based parsing
|
109
|
+
// Based on the source format and parser type, handle the data differently
|
110
|
+
match (source, &parser_type) {
|
111
|
+
(DataSource::Parquet(reader), ParserType::Row { strict }) => {
|
112
|
+
// Handle Parquet row-based parsing
|
110
113
|
process_row_data(
|
111
114
|
ruby.clone(),
|
112
|
-
|
115
|
+
reader,
|
113
116
|
&columns,
|
114
117
|
result_type,
|
115
|
-
strict,
|
118
|
+
*strict,
|
116
119
|
&ruby_logger,
|
117
120
|
)?;
|
118
121
|
}
|
119
|
-
ParserType::Column { batch_size, strict } => {
|
120
|
-
// Handle column-based parsing
|
122
|
+
(DataSource::Parquet(reader), ParserType::Column { batch_size, strict }) => {
|
123
|
+
// Handle Parquet column-based parsing
|
121
124
|
process_column_data(
|
122
125
|
ruby.clone(),
|
123
|
-
|
126
|
+
reader,
|
124
127
|
&columns,
|
125
128
|
result_type,
|
126
|
-
batch_size,
|
127
|
-
strict,
|
129
|
+
*batch_size,
|
130
|
+
*strict,
|
128
131
|
&ruby_logger,
|
129
132
|
)?;
|
130
133
|
}
|
134
|
+
(DataSource::Arrow(reader), ParserType::Row { strict }) => {
|
135
|
+
// Handle Arrow row-based parsing
|
136
|
+
match reader {
|
137
|
+
Either::Left(file) => {
|
138
|
+
// For seekable files, use FileReader which handles IPC file format
|
139
|
+
use arrow_ipc::reader::FileReader;
|
140
|
+
let file_reader = FileReader::try_new(file, None)
|
141
|
+
.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
|
142
|
+
|
143
|
+
use super::arrow_reader::process_arrow_file_row_data;
|
144
|
+
process_arrow_file_row_data(
|
145
|
+
ruby.clone(),
|
146
|
+
file_reader,
|
147
|
+
&columns,
|
148
|
+
result_type,
|
149
|
+
*strict,
|
150
|
+
&ruby_logger,
|
151
|
+
)?;
|
152
|
+
}
|
153
|
+
Either::Right(readable) => {
|
154
|
+
use arrow_ipc::reader::StreamReader;
|
155
|
+
let stream_reader = StreamReader::try_new(readable, None)
|
156
|
+
.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
|
157
|
+
process_arrow_row_data(
|
158
|
+
ruby.clone(),
|
159
|
+
stream_reader,
|
160
|
+
&columns,
|
161
|
+
result_type,
|
162
|
+
*strict,
|
163
|
+
&ruby_logger,
|
164
|
+
)?;
|
165
|
+
}
|
166
|
+
}
|
167
|
+
}
|
168
|
+
(DataSource::Arrow(reader), ParserType::Column { batch_size, strict }) => {
|
169
|
+
// Handle Arrow column-based parsing
|
170
|
+
match reader {
|
171
|
+
Either::Left(file) => {
|
172
|
+
// For seekable files, we can use the optimized FileReader
|
173
|
+
process_arrow_file_column_data(
|
174
|
+
ruby.clone(),
|
175
|
+
file,
|
176
|
+
&columns,
|
177
|
+
result_type,
|
178
|
+
*batch_size,
|
179
|
+
*strict,
|
180
|
+
&ruby_logger,
|
181
|
+
)?;
|
182
|
+
}
|
183
|
+
Either::Right(readable) => {
|
184
|
+
use arrow_ipc::reader::StreamReader;
|
185
|
+
let stream_reader = StreamReader::try_new(readable, None)
|
186
|
+
.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
|
187
|
+
process_arrow_column_data(
|
188
|
+
ruby.clone(),
|
189
|
+
stream_reader,
|
190
|
+
&columns,
|
191
|
+
result_type,
|
192
|
+
*batch_size,
|
193
|
+
*strict,
|
194
|
+
&ruby_logger,
|
195
|
+
)?;
|
196
|
+
}
|
197
|
+
}
|
198
|
+
}
|
131
199
|
}
|
132
200
|
|
133
201
|
Ok(ruby.qnil().into_value_with(&ruby))
|
@@ -23,10 +23,11 @@ pub use writer_types::*;
|
|
23
23
|
// Common imports used across the module
|
24
24
|
use arrow_array::cast::downcast_array;
|
25
25
|
use arrow_array::{
|
26
|
-
Array, BinaryArray, BooleanArray, Date32Array, Date64Array,
|
27
|
-
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
|
28
|
-
|
29
|
-
|
26
|
+
Array, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array,
|
27
|
+
Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array,
|
28
|
+
ListArray, NullArray, StringArray, StructArray, TimestampMicrosecondArray,
|
29
|
+
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
|
30
|
+
UInt32Array, UInt64Array, UInt8Array,
|
30
31
|
};
|
31
32
|
use arrow_schema::{DataType, TimeUnit};
|
32
33
|
use magnus::{value::ReprValue, Error as MagnusError, IntoValue, Ruby, Value};
|
@@ -54,6 +55,10 @@ pub enum ParquetGemError {
|
|
54
55
|
Parquet(#[from] parquet::errors::ParquetError),
|
55
56
|
#[error("Arrow error: {0}")]
|
56
57
|
Arrow(#[from] arrow_schema::ArrowError),
|
58
|
+
#[error("Arrow IPC error: {0}")]
|
59
|
+
ArrowIpc(String),
|
60
|
+
#[error("Unknown file format")]
|
61
|
+
UnknownFormat,
|
57
62
|
#[error("UTF-8 error: {0}")]
|
58
63
|
Utf8Error(#[from] simdutf8::basic::Utf8Error),
|
59
64
|
#[error("Jiff error: {0}")]
|
@@ -62,6 +67,8 @@ pub enum ParquetGemError {
|
|
62
67
|
InvalidDecimal(String),
|
63
68
|
#[error("Failed to parse UUID: {0}")]
|
64
69
|
UuidError(#[from] uuid::Error),
|
70
|
+
#[error("Decimals larger than 128 bits are not supported")]
|
71
|
+
DecimalWouldBeTruncated,
|
65
72
|
}
|
66
73
|
|
67
74
|
#[derive(Debug)]
|