parquet 0.5.10 → 0.5.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/ext/parquet/src/reader/common.rs +11 -65
- data/ext/parquet/src/reader/mod.rs +2 -7
- data/ext/parquet/src/reader/unified/mod.rs +14 -82
- data/ext/parquet/src/types/mod.rs +0 -4
- data/lib/parquet/version.rb +1 -1
- metadata +2 -4
- data/ext/parquet/src/reader/arrow_reader.rs +0 -579
- data/ext/parquet/src/reader/format_detector.rs +0 -69
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 82528b663c4a577262db90b6d17ba473a81d0ea725ceba486b63a3619040fa73
+  data.tar.gz: 2e44daa9b4e36ef1503589daaa0815cbc3acee10c565d9942f6c0b6d35ced5f0
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 418951253384f5492385fcb30fa5b0113b85d9bc51346b6abad16105c124d8869266943c1a29bc0879cfee4270b94d32fb99004e233c6ebde4a70e1d329435af
+  data.tar.gz: bc0db4ebb36add314253b5b9b946cc2c84f315d51ba7fefbead6c7de3b65a3f7752fa4e4cf0be19704405b390ae0106d8383e30791e7fac4a86a75141c214de1
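
checksums.yaml records SHA256 and SHA512 digests of the gem's two payload archives, metadata.gz and data.tar.gz, so every release changes all four entries. A minimal sketch for verifying them locally, assuming the .gem has been unpacked (e.g. with tar) so the two archives are on disk, and using the third-party sha2 crate; none of this is part of the gem itself:

use sha2::{Digest, Sha256, Sha512};
use std::fs;

// Hex-encode a digest output.
fn hex(bytes: &[u8]) -> String {
    bytes.iter().map(|b| format!("{:02x}", b)).collect()
}

fn main() -> std::io::Result<()> {
    for name in ["metadata.gz", "data.tar.gz"] {
        let data = fs::read(name)?;
        // Compare against the SHA256/SHA512 entries in checksums.yaml above.
        println!("{name} sha256: {}", hex(Sha256::digest(&data).as_slice()));
        println!("{name} sha512: {}", hex(Sha512::digest(&data).as_slice()));
    }
    Ok(())
}
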
data/ext/parquet/src/reader/common.rs
CHANGED
@@ -12,81 +12,27 @@ use magnus::value::ReprValue;
 use magnus::{Error as MagnusError, Ruby, Value};
 
 use crate::header_cache::StringCache;
-use crate::logger::RubyLogger;
 use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
 use crate::types::{ParquetGemError, TryIntoValue};
 use crate::ColumnRecord;
 
-
-
-///
-
-
-
-}
-
-/// Opens a data file (Parquet or Arrow) for reading, automatically detecting the format
-pub fn open_data_source(
+/// Opens a parquet file or IO-like object for reading
+///
+/// This function handles both file paths (as strings) and IO-like objects,
+/// returning either a File or a ThreadSafeRubyReader that can be used with
+/// parquet readers.
+pub fn open_parquet_source(
     ruby: Rc<Ruby>,
     to_read: Value,
-
-) -> Result<DataSource, ParquetGemError> {
+) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
     if to_read.is_kind_of(ruby.class_string()) {
         let path_string = to_read.to_r_string()?;
         let file_path = unsafe { path_string.as_str()? };
-
-
-        let format_hint = detect_format_from_extension(file_path);
-
-        let mut file = File::open(file_path).map_err(ParquetGemError::from)?;
-
-        // Detect actual format from file content
-        let format = detect_file_format(&mut file)?;
-
-        // Warn if extension doesn't match content
-        if let Some(hint) = format_hint {
-            if hint != format {
-                ruby_logger.warn(|| {
-                    format!(
-                        "Extension implied format {:?} but actual format is {:?}",
-                        hint, format
-                    )
-                })?;
-            }
-        }
-
-        match format {
-            FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
-            FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
-        }
+        let file = File::open(file_path).map_err(ParquetGemError::from)?;
+        Ok(Either::Left(file))
     } else {
-
-
-        use tempfile::NamedTempFile;
-
-        let mut readable = RubyReader::new(ruby.clone(), to_read)?;
-        let mut temp_file = NamedTempFile::new().map_err(ParquetGemError::from)?;
-
-        // Copy the entire content to the temporary file
-        let mut buffer = vec![0u8; 8192];
-        loop {
-            let bytes_read = readable.read(&mut buffer)?;
-            if bytes_read == 0 {
-                break;
-            }
-            temp_file.write_all(&buffer[..bytes_read])?;
-        }
-        temp_file.flush()?;
-
-        // Detect format from the temporary file
-        let mut file = temp_file.reopen()?;
-        let format = detect_file_format(&mut file)?;
-
-        // Use the temporary file as the source
-        match format {
-            FileFormat::Parquet => Ok(DataSource::Parquet(Either::Left(file))),
-            FileFormat::Arrow => Ok(DataSource::Arrow(Either::Left(file))),
-        }
+        let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
+        Ok(Either::Right(readable))
     }
 }
 
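
The rewritten open_parquet_source drops the magic-byte sniffing and the temp-file copy entirely: a String argument opens a File, anything else is wrapped in a ThreadSafeRubyReader, and both come back as one Either value. A minimal sketch of why Either is convenient as a return type here, assuming the either crate's default std io impls (Either<L, R> implements std::io::Read when both arms do); Cursor stands in for the gem's ThreadSafeRubyReader:

use either::Either;
use std::fs::File;
use std::io::{Cursor, Read};

// Reads the first four bytes from whichever source is present; Either
// dispatches the Read calls to the active arm.
fn first_bytes(mut source: Either<File, Cursor<Vec<u8>>>) -> std::io::Result<Vec<u8>> {
    let mut buf = vec![0u8; 4];
    source.read_exact(&mut buf)?;
    Ok(buf)
}

fn main() -> std::io::Result<()> {
    // An in-memory "IO-like" source, analogous to Ok(Either::Right(readable)) above.
    let magic = first_bytes(Either::Right(Cursor::new(b"PAR1....".to_vec())))?;
    assert_eq!(magic, b"PAR1");
    Ok(())
}
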
data/ext/parquet/src/reader/mod.rs
CHANGED
@@ -1,6 +1,4 @@
-mod arrow_reader;
 mod common;
-mod format_detector;
 mod parquet_column_reader;
 mod parquet_row_reader;
 mod unified;
@@ -190,10 +188,7 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     if args.len() != 1 {
         return Err(MagnusError::new(
             magnus::exception::arg_error(),
-            format!(
-                "metadata expects exactly 1 argument (file path or IO-like object), got {}",
-                args.len()
-            ),
+            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
         ));
     }
 
@@ -213,4 +208,4 @@ pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusEr
     let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
 
     Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
-}
+}
data/ext/parquet/src/reader/unified/mod.rs
CHANGED
@@ -13,11 +13,8 @@ use std::collections::HashMap;
 use std::rc::Rc;
 use std::sync::OnceLock;
 
-use super::arrow_reader::{
-    process_arrow_column_data, process_arrow_file_column_data, process_arrow_row_data,
-};
 use super::common::{
-    create_batch_reader, handle_block_or_enum, handle_empty_file,
+    create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
 };
 use crate::types::ArrayWrapper;
 
@@ -103,99 +100,34 @@ pub fn parse_parquet_unified(
         }
     }
 
-    // Open the
-    let source =
+    // Open the Parquet source
+    let source = open_parquet_source(ruby.clone(), to_read)?;
 
-    // Based on the
-    match
-
-            // Handle
+    // Based on the parser type, handle the data differently
+    match parser_type {
+        ParserType::Row { strict } => {
+            // Handle row-based parsing
             process_row_data(
                 ruby.clone(),
-
+                source,
                 &columns,
                 result_type,
-
+                strict,
                 &ruby_logger,
             )?;
         }
-
-            // Handle
+        ParserType::Column { batch_size, strict } => {
+            // Handle column-based parsing
             process_column_data(
                 ruby.clone(),
-
+                source,
                 &columns,
                 result_type,
-
-
+                batch_size,
+                strict,
                 &ruby_logger,
             )?;
         }
-        (DataSource::Arrow(reader), ParserType::Row { strict }) => {
-            // Handle Arrow row-based parsing
-            match reader {
-                Either::Left(file) => {
-                    // For seekable files, use FileReader which handles IPC file format
-                    use arrow_ipc::reader::FileReader;
-                    let file_reader = FileReader::try_new(file, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                    use super::arrow_reader::process_arrow_file_row_data;
-                    process_arrow_file_row_data(
-                        ruby.clone(),
-                        file_reader,
-                        &columns,
-                        result_type,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-                Either::Right(readable) => {
-                    use arrow_ipc::reader::StreamReader;
-                    let stream_reader = StreamReader::try_new(readable, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    process_arrow_row_data(
-                        ruby.clone(),
-                        stream_reader,
-                        &columns,
-                        result_type,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-            }
-        }
-        (DataSource::Arrow(reader), ParserType::Column { batch_size, strict }) => {
-            // Handle Arrow column-based parsing
-            match reader {
-                Either::Left(file) => {
-                    // For seekable files, we can use the optimized FileReader
-                    process_arrow_file_column_data(
-                        ruby.clone(),
-                        file,
-                        &columns,
-                        result_type,
-                        *batch_size,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-                Either::Right(readable) => {
-                    use arrow_ipc::reader::StreamReader;
-                    let stream_reader = StreamReader::try_new(readable, None)
-                        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-                    process_arrow_column_data(
-                        ruby.clone(),
-                        stream_reader,
-                        &columns,
-                        result_type,
-                        *batch_size,
-                        *strict,
-                        &ruby_logger,
-                    )?;
-                }
-            }
-        }
     }
 
     Ok(ruby.qnil().into_value_with(&ruby))
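
One detail worth noting in this hunk: judging by the *strict and *batch_size dereferences, the old code matched the (DataSource, ParserType) pair through references, so the fields bound as references; the new match parser_type consumes the enum by value and binds strict and batch_size directly. A minimal illustration of that difference, with hypothetical names rather than the gem's types:

enum Parser {
    Row { strict: bool },
    Column { batch_size: Option<usize>, strict: bool },
}

// Matching by value moves the fields out: `strict` is a plain bool.
fn by_value(p: Parser) -> bool {
    match p {
        Parser::Row { strict } => strict,
        Parser::Column { strict, .. } => strict,
    }
}

// Matching through a reference binds `strict` as &bool, so it needs a deref.
fn by_ref(p: &Parser) -> bool {
    match p {
        Parser::Row { strict } => *strict,
        Parser::Column { strict, .. } => *strict,
    }
}

fn main() {
    let p = Parser::Column { batch_size: Some(1024), strict: true };
    assert!(by_ref(&p));
    assert!(by_value(p)); // consumes p, as the new match in this diff does
}
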
data/ext/parquet/src/types/mod.rs
CHANGED
@@ -55,10 +55,6 @@ pub enum ParquetGemError {
     Parquet(#[from] parquet::errors::ParquetError),
     #[error("Arrow error: {0}")]
     Arrow(#[from] arrow_schema::ArrowError),
-    #[error("Arrow IPC error: {0}")]
-    ArrowIpc(String),
-    #[error("Unknown file format")]
-    UnknownFormat,
     #[error("UTF-8 error: {0}")]
     Utf8Error(#[from] simdutf8::basic::Utf8Error),
     #[error("Jiff error: {0}")]
data/lib/parquet/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: parquet
 version: !ruby/object:Gem::Version
-  version: 0.5.10
+  version: 0.5.11
 platform: ruby
 authors:
 - Nathan Jaremko
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-06-
+date: 2025-06-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -62,9 +62,7 @@ files:
 - ext/parquet/src/header_cache.rs
 - ext/parquet/src/lib.rs
 - ext/parquet/src/logger.rs
-- ext/parquet/src/reader/arrow_reader.rs
 - ext/parquet/src/reader/common.rs
-- ext/parquet/src/reader/format_detector.rs
 - ext/parquet/src/reader/mod.rs
 - ext/parquet/src/reader/parquet_column_reader.rs
 - ext/parquet/src/reader/parquet_row_reader.rs
data/ext/parquet/src/reader/arrow_reader.rs
DELETED
@@ -1,579 +0,0 @@
-use crate::header_cache::StringCache;
-use crate::logger::RubyLogger;
-use crate::types::ArrayWrapper;
-use crate::types::{
-    ColumnRecord, ParquetGemError, ParquetValueVec, ParserResultType, RowRecord, TryIntoValue,
-};
-use ahash::RandomState;
-use arrow_array::RecordBatch;
-use arrow_ipc::reader::{FileReader, StreamReader};
-use arrow_schema::Schema;
-use magnus::{Ruby, Value};
-use std::collections::HashMap;
-use std::fs::File;
-use std::io::Read;
-use std::rc::Rc;
-use std::sync::{Arc, OnceLock};
-
-/// Process Arrow IPC file data for column-based parsing
-pub fn process_arrow_column_data<R: Read>(
-    ruby: Rc<Ruby>,
-    reader: StreamReader<R>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    _batch_size: Option<usize>,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
-
-    // Filter schema if columns are specified
-    let _filtered_schema = if let Some(cols) = columns {
-        let mut fields = Vec::new();
-        for field in schema.fields() {
-            if cols.contains(&field.name().to_string()) {
-                fields.push(field.clone());
-            }
-        }
-        Arc::new(Schema::new(fields))
-    } else {
-        schema.clone()
-    };
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                let mut map =
-                    HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
-
-                batch
-                    .columns()
-                    .iter()
-                    .enumerate()
-                    .try_for_each(|(i, column)| {
-                        let header = local_headers[i];
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        map.insert(header, values.into_inner());
-                        Ok::<_, ParquetGemError>(())
-                    })?;
-
-                let record = ColumnRecord::Map::<RandomState>(map);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let vec = batch
-                    .columns()
-                    .iter()
-                    .map(|column| {
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        Ok::<_, ParquetGemError>(values.into_inner())
-                    })
-                    .collect::<Result<Vec<_>, _>>()?;
-
-                let record = ColumnRecord::Vec::<RandomState>(vec);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process Arrow IPC file data for row-based parsing
-pub fn process_arrow_row_data<R: Read>(
-    ruby: Rc<Ruby>,
-    reader: StreamReader<R>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow schema loaded: {:?}", schema))?;
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        local_headers.len(),
-                        RandomState::default(),
-                    );
-
-                    for (col_idx, column) in batch.columns().iter().enumerate() {
-                        let header = local_headers[col_idx];
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        map.insert(header, value);
-                    }
-
-                    let record = RowRecord::Map::<RandomState>(map);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut row_vec = Vec::with_capacity(batch.num_columns());
-
-                    for column in batch.columns() {
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        row_vec.push(value);
-                    }
-
-                    let record = RowRecord::Vec::<RandomState>(row_vec);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process Arrow IPC file with FileReader for row-based parsing
-pub fn process_arrow_file_row_data(
-    ruby: Rc<Ruby>,
-    reader: FileReader<File>,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut map = HashMap::with_capacity_and_hasher(
-                        local_headers.len(),
-                        RandomState::default(),
-                    );
-
-                    for (col_idx, column) in batch.columns().iter().enumerate() {
-                        let header = local_headers[col_idx];
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        map.insert(header, value);
-                    }
-
-                    let record = RowRecord::Map::<RandomState>(map);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                // Convert columnar data to rows
-                for row_idx in 0..batch.num_rows() {
-                    let mut row_vec = Vec::with_capacity(batch.num_columns());
-
-                    for column in batch.columns() {
-                        let value = extract_value_at_index(column, row_idx, strict)?;
-                        row_vec.push(value);
-                    }
-
-                    let record = RowRecord::Vec::<RandomState>(row_vec);
-                    let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Process Arrow IPC file with FileReader (for seekable sources)
-pub fn process_arrow_file_column_data(
-    ruby: Rc<Ruby>,
-    file: File,
-    columns: &Option<Vec<String>>,
-    result_type: ParserResultType,
-    _batch_size: Option<usize>,
-    strict: bool,
-    ruby_logger: &RubyLogger,
-) -> Result<(), ParquetGemError> {
-    let reader =
-        FileReader::try_new(file, None).map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-    let schema = reader.schema();
-    ruby_logger.debug(|| format!("Arrow file schema loaded: {:?}", schema))?;
-
-    // FileReader implements Iterator<Item = Result<RecordBatch, ArrowError>>
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let local_headers = headers
-                    .get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string)
-                    })
-                    .as_ref()
-                    .map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
-
-                let mut map =
-                    HashMap::with_capacity_and_hasher(local_headers.len(), RandomState::default());
-
-                batch
-                    .columns()
-                    .iter()
-                    .enumerate()
-                    .try_for_each(|(i, column)| {
-                        let header = local_headers[i];
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        map.insert(header, values.into_inner());
-                        Ok::<_, ParquetGemError>(())
-                    })?;
-
-                let record = ColumnRecord::Map::<RandomState>(map);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-        ParserResultType::Array => {
-            for batch_result in reader {
-                let batch = batch_result.map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))?;
-
-                // Filter columns if needed
-                let batch = if let Some(cols) = columns {
-                    filter_record_batch(&batch, cols)?
-                } else {
-                    batch
-                };
-
-                let vec = batch
-                    .columns()
-                    .iter()
-                    .map(|column| {
-                        let values = ParquetValueVec::try_from(ArrayWrapper {
-                            array: column,
-                            strict,
-                        })?;
-                        Ok::<_, ParquetGemError>(values.into_inner())
-                    })
-                    .collect::<Result<Vec<_>, _>>()?;
-
-                let record = ColumnRecord::Vec::<RandomState>(vec);
-                let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// Extract a single value from an Arrow array at a specific index
-fn extract_value_at_index(
-    array: &Arc<dyn arrow_array::Array>,
-    index: usize,
-    strict: bool,
-) -> Result<crate::types::ParquetField, ParquetGemError> {
-    use crate::types::ParquetField;
-    use arrow_array::*;
-    use arrow_schema::DataType;
-    use parquet::record::Field;
-
-    // Convert Arrow array value at index to Parquet Field
-    let field = match array.data_type() {
-        DataType::Boolean => {
-            let arr = array.as_any().downcast_ref::<BooleanArray>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Bool(arr.value(index))
-            }
-        }
-        DataType::Int8 => {
-            let arr = array.as_any().downcast_ref::<Int8Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Byte(arr.value(index) as i8)
-            }
-        }
-        DataType::Int16 => {
-            let arr = array.as_any().downcast_ref::<Int16Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Short(arr.value(index))
-            }
-        }
-        DataType::Int32 => {
-            let arr = array.as_any().downcast_ref::<Int32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Int(arr.value(index))
-            }
-        }
-        DataType::Int64 => {
-            let arr = array.as_any().downcast_ref::<Int64Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Long(arr.value(index))
-            }
-        }
-        DataType::UInt8 => {
-            let arr = array.as_any().downcast_ref::<UInt8Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::UByte(arr.value(index))
-            }
-        }
-        DataType::UInt16 => {
-            let arr = array.as_any().downcast_ref::<UInt16Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::UShort(arr.value(index))
-            }
-        }
-        DataType::UInt32 => {
-            let arr = array.as_any().downcast_ref::<UInt32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::UInt(arr.value(index))
-            }
-        }
-        DataType::UInt64 => {
-            let arr = array.as_any().downcast_ref::<UInt64Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::ULong(arr.value(index))
-            }
-        }
-        DataType::Float32 => {
-            let arr = array.as_any().downcast_ref::<Float32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Float(arr.value(index))
-            }
-        }
-        DataType::Float64 => {
-            let arr = array.as_any().downcast_ref::<Float64Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Double(arr.value(index))
-            }
-        }
-        DataType::Utf8 => {
-            let arr = array.as_any().downcast_ref::<StringArray>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Str(arr.value(index).to_string())
-            }
-        }
-        DataType::Binary => {
-            let arr = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Bytes(arr.value(index).into())
-            }
-        }
-        DataType::Date32 => {
-            let arr = array.as_any().downcast_ref::<Date32Array>().unwrap();
-            if arr.is_null(index) {
-                Field::Null
-            } else {
-                Field::Date(arr.value(index))
-            }
-        }
-        DataType::Timestamp(unit, _tz) => match unit {
-            arrow_schema::TimeUnit::Millisecond => {
-                let arr = array
-                    .as_any()
-                    .downcast_ref::<TimestampMillisecondArray>()
-                    .unwrap();
-                if arr.is_null(index) {
-                    Field::Null
-                } else {
-                    Field::TimestampMillis(arr.value(index))
-                }
-            }
-            arrow_schema::TimeUnit::Microsecond => {
-                let arr = array
-                    .as_any()
-                    .downcast_ref::<TimestampMicrosecondArray>()
-                    .unwrap();
-                if arr.is_null(index) {
-                    Field::Null
-                } else {
-                    Field::TimestampMicros(arr.value(index))
-                }
-            }
-            _ => Field::Null,
-        },
-        // Add more type handling as needed
-        _ => Field::Null,
-    };
-
-    // For Arrow files, we don't have Parquet logical types, so we use defaults
-    Ok(ParquetField {
-        field,
-        converted_type: parquet::basic::ConvertedType::NONE,
-        logical_type: None,
-        strict,
-    })
-}
-
-/// Filter a RecordBatch to only include specified columns
-fn filter_record_batch(
-    batch: &RecordBatch,
-    columns: &[String],
-) -> Result<RecordBatch, ParquetGemError> {
-    let schema = batch.schema();
-    let mut indices = Vec::new();
-    let mut fields = Vec::new();
-
-    for (i, field) in schema.fields().iter().enumerate() {
-        if columns.contains(&field.name().to_string()) {
-            indices.push(i);
-            fields.push(field.clone());
-        }
-    }
-
-    let new_schema = Arc::new(Schema::new(fields));
-    let new_columns: Vec<_> = indices.iter().map(|&i| batch.column(i).clone()).collect();
-
-    RecordBatch::try_new(new_schema, new_columns)
-        .map_err(|e| ParquetGemError::ArrowIpc(e.to_string()))
-}
data/ext/parquet/src/reader/format_detector.rs
DELETED
@@ -1,69 +0,0 @@
-use crate::types::ParquetGemError;
-use std::io::{Read, Seek, SeekFrom};
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum FileFormat {
-    Parquet,
-    Arrow,
-}
-
-/// Detect the file format by examining magic bytes
-pub fn detect_file_format<R: Read + Seek>(source: &mut R) -> Result<FileFormat, ParquetGemError> {
-    let mut magic = [0u8; 8];
-
-    // Read the first 8 bytes
-    let bytes_read = source.read(&mut magic).map_err(ParquetGemError::from)?;
-
-    // Reset to beginning
-    source
-        .seek(SeekFrom::Start(0))
-        .map_err(ParquetGemError::from)?;
-
-    if bytes_read >= 6 {
-        // Arrow IPC file format magic: "ARROW1\0\0"
-        if &magic[0..6] == b"ARROW1" {
-            return Ok(FileFormat::Arrow);
-        }
-    }
-
-    if bytes_read >= 4 {
-        // Parquet magic: "PAR1" at start
-        if &magic[0..4] == b"PAR1" {
-            return Ok(FileFormat::Parquet);
-        }
-    }
-
-    // If we can't detect from the beginning, check the end for Parquet
-    // Parquet files also have "PAR1" at the end
-    if let Ok(pos) = source.seek(SeekFrom::End(-4)) {
-        if pos >= 4 {
-            let mut end_magic = [0u8; 4];
-            if source.read_exact(&mut end_magic).is_ok() && &end_magic == b"PAR1" {
-                // Important: Reset to beginning before returning
-                source
-                    .seek(SeekFrom::Start(0))
-                    .map_err(ParquetGemError::from)?;
-                return Ok(FileFormat::Parquet);
-            }
-        }
-    }
-
-    // Always reset to beginning, even for unknown format
-    source
-        .seek(SeekFrom::Start(0))
-        .map_err(ParquetGemError::from)?;
-
-    Err(ParquetGemError::UnknownFormat)
-}
-
-/// Detect format from file extension as a fallback
-pub fn detect_format_from_extension(path: &str) -> Option<FileFormat> {
-    let lower = path.to_lowercase();
-    if lower.ends_with(".parquet") || lower.ends_with(".parq") {
-        Some(FileFormat::Parquet)
-    } else if lower.ends_with(".arrow") || lower.ends_with(".feather") || lower.ends_with(".ipc") {
-        Some(FileFormat::Arrow)
-    } else {
-        None
-    }
-}
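
With this file gone the gem no longer sniffs magic bytes at all; input is simply assumed to be Parquet. For callers who still want to reject non-Parquet input before handing it over, a std-only sketch equivalent to the Parquet branch of the deleted detector (not part of the gem after 0.5.11, and assuming a seekable file of at least 8 bytes):

use std::fs::File;
use std::io::{Read, Seek, SeekFrom};

fn looks_like_parquet(file: &mut File) -> std::io::Result<bool> {
    let mut magic = [0u8; 4];
    file.seek(SeekFrom::Start(0))?;
    file.read_exact(&mut magic)?;
    let head_ok = &magic == b"PAR1"; // "PAR1" leads the file...
    file.seek(SeekFrom::End(-4))?;
    file.read_exact(&mut magic)?;
    let tail_ok = &magic == b"PAR1"; // ...and closes the footer
    file.seek(SeekFrom::Start(0))?; // rewind for the real reader
    Ok(head_ok && tail_ok)
}

fn main() -> std::io::Result<()> {
    let mut f = File::open("data.parquet")?; // any local parquet file
    println!("parquet magic present: {}", looks_like_parquet(&mut f)?);
    Ok(())
}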