parquet 0.4.2 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +11 -12
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +110 -0
- data/ext/parquet/src/reader/mod.rs +1 -43
- data/ext/parquet/src/reader/parquet_column_reader.rs +50 -86
- data/ext/parquet/src/reader/parquet_row_reader.rs +53 -23
- data/ext/parquet/src/ruby_reader.rs +37 -25
- data/ext/parquet/src/types/core_types.rs +47 -6
- data/ext/parquet/src/types/mod.rs +64 -1
- data/ext/parquet/src/types/parquet_value.rs +284 -102
- data/ext/parquet/src/types/record_types.rs +24 -23
- data/ext/parquet/src/types/schema_converter.rs +244 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +16 -8
- data/ext/parquet/src/types/type_conversion.rs +1151 -521
- data/ext/parquet/src/types/writer_types.rs +94 -151
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +342 -457
- data/ext/parquet/src/writer/write_columns.rs +226 -0
- data/ext/parquet/src/writer/write_rows.rs +484 -0
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +9 -2
@@ -0,0 +1,171 @@
|
|
1
|
+
// Logger module for Parquet gem
|
2
|
+
// Provides a Rust wrapper for Ruby logger objects
|
3
|
+
|
4
|
+
use std::str::FromStr;
|
5
|
+
|
6
|
+
use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
|
7
|
+
|
8
|
+
use crate::{types::ParquetGemError, utils::parse_string_or_symbol};
|
9
|
+
|
10
|
+
/// Log severities, named after the levels of Ruby's `Logger`.
///
/// Variants are declared from least to most severe. The derived ordering
/// follows declaration order, so `LogLevel::Debug < LogLevel::Fatal`; the
/// threshold checks in `RubyLogger` rely on that ordering.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LogLevel {
    /// Lowest severity.
    Debug,
    /// Informational messages.
    Info,
    /// Warnings.
    Warn,
    /// Errors.
    Error,
    /// Highest severity.
    Fatal,
}
|
19
|
+
|
20
|
+
impl FromStr for LogLevel {
|
21
|
+
type Err = MagnusError;
|
22
|
+
|
23
|
+
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
24
|
+
Ok(match s {
|
25
|
+
"debug" => LogLevel::Debug,
|
26
|
+
"info" => LogLevel::Info,
|
27
|
+
"warn" => LogLevel::Warn,
|
28
|
+
"error" => LogLevel::Error,
|
29
|
+
"fatal" => LogLevel::Fatal,
|
30
|
+
_ => {
|
31
|
+
return Err(MagnusError::new(
|
32
|
+
runtime_error(),
|
33
|
+
format!("Invalid log level: {}", s),
|
34
|
+
))
|
35
|
+
}
|
36
|
+
})
|
37
|
+
}
|
38
|
+
}
|
39
|
+
/// A wrapper around a Ruby logger object
|
40
|
+
#[derive(Debug, Clone)]
|
41
|
+
pub struct RubyLogger {
|
42
|
+
logger: Option<Value>,
|
43
|
+
level: LogLevel,
|
44
|
+
}
|
45
|
+
|
46
|
+
#[allow(dead_code)]
|
47
|
+
impl RubyLogger {
|
48
|
+
pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ParquetGemError> {
|
49
|
+
let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
|
50
|
+
.unwrap_or_else(|_| "warn".to_string())
|
51
|
+
.parse::<LogLevel>()
|
52
|
+
.unwrap_or(LogLevel::Warn);
|
53
|
+
|
54
|
+
match logger_value {
|
55
|
+
Some(logger) => {
|
56
|
+
if logger.is_nil() {
|
57
|
+
return Ok(Self {
|
58
|
+
logger: None,
|
59
|
+
level: environment_level,
|
60
|
+
});
|
61
|
+
}
|
62
|
+
|
63
|
+
let level_value = logger.funcall::<_, _, Value>("level", ())?;
|
64
|
+
let level = parse_string_or_symbol(ruby, level_value)?;
|
65
|
+
let level = level
|
66
|
+
.map(|s| s.parse::<LogLevel>())
|
67
|
+
.transpose()?
|
68
|
+
.unwrap_or(environment_level);
|
69
|
+
|
70
|
+
Ok(Self {
|
71
|
+
logger: Some(logger),
|
72
|
+
level,
|
73
|
+
})
|
74
|
+
}
|
75
|
+
None => Ok(Self {
|
76
|
+
logger: None,
|
77
|
+
level: environment_level,
|
78
|
+
}),
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
/// Log a message at the given level
|
83
|
+
pub fn log(&self, level: LogLevel, message: &str) -> Result<(), MagnusError> {
|
84
|
+
let method = match level {
|
85
|
+
LogLevel::Debug => "debug",
|
86
|
+
LogLevel::Info => "info",
|
87
|
+
LogLevel::Warn => "warn",
|
88
|
+
LogLevel::Error => "error",
|
89
|
+
LogLevel::Fatal => "fatal",
|
90
|
+
};
|
91
|
+
|
92
|
+
match self.logger {
|
93
|
+
Some(logger) => {
|
94
|
+
logger.funcall::<_, _, Value>(method, (message,))?;
|
95
|
+
}
|
96
|
+
None => eprintln!("{}", message),
|
97
|
+
}
|
98
|
+
|
99
|
+
Ok(())
|
100
|
+
}
|
101
|
+
|
102
|
+
/// Log a debug message
|
103
|
+
pub fn debug<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
104
|
+
where
|
105
|
+
F: FnOnce() -> S,
|
106
|
+
S: AsRef<str>,
|
107
|
+
{
|
108
|
+
if self.level <= LogLevel::Debug {
|
109
|
+
let message = message_fn();
|
110
|
+
self.log(LogLevel::Debug, message.as_ref())
|
111
|
+
} else {
|
112
|
+
Ok(())
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
/// Log an info message
|
117
|
+
pub fn info<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
118
|
+
where
|
119
|
+
F: FnOnce() -> S,
|
120
|
+
S: AsRef<str>,
|
121
|
+
{
|
122
|
+
if self.level <= LogLevel::Info {
|
123
|
+
let message = message_fn();
|
124
|
+
self.log(LogLevel::Info, message.as_ref())
|
125
|
+
} else {
|
126
|
+
Ok(())
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
/// Log a warning message
|
131
|
+
pub fn warn<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
132
|
+
where
|
133
|
+
F: FnOnce() -> S,
|
134
|
+
S: AsRef<str>,
|
135
|
+
{
|
136
|
+
if self.level <= LogLevel::Warn {
|
137
|
+
let message = message_fn();
|
138
|
+
self.log(LogLevel::Warn, message.as_ref())
|
139
|
+
} else {
|
140
|
+
Ok(())
|
141
|
+
}
|
142
|
+
}
|
143
|
+
|
144
|
+
/// Log an error message
|
145
|
+
pub fn error<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
146
|
+
where
|
147
|
+
F: FnOnce() -> S,
|
148
|
+
S: AsRef<str>,
|
149
|
+
{
|
150
|
+
if self.level <= LogLevel::Error {
|
151
|
+
let message = message_fn();
|
152
|
+
self.log(LogLevel::Error, message.as_ref())
|
153
|
+
} else {
|
154
|
+
Ok(())
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
/// Log a fatal message
|
159
|
+
pub fn fatal<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
160
|
+
where
|
161
|
+
F: FnOnce() -> S,
|
162
|
+
S: AsRef<str>,
|
163
|
+
{
|
164
|
+
if self.level <= LogLevel::Fatal {
|
165
|
+
let message = message_fn();
|
166
|
+
self.log(LogLevel::Fatal, message.as_ref())
|
167
|
+
} else {
|
168
|
+
Ok(())
|
169
|
+
}
|
170
|
+
}
|
171
|
+
}
|
@@ -0,0 +1,110 @@
|
|
1
|
+
use ahash::RandomState;
|
2
|
+
use arrow_schema::Schema;
|
3
|
+
use either::Either;
|
4
|
+
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
|
5
|
+
use parquet::arrow::ProjectionMask;
|
6
|
+
use std::collections::HashMap;
|
7
|
+
use std::fs::File;
|
8
|
+
use std::sync::Arc;
|
9
|
+
|
10
|
+
use magnus::value::ReprValue;
|
11
|
+
use magnus::{Error as MagnusError, Ruby, Value};
|
12
|
+
|
13
|
+
use crate::header_cache::StringCache;
|
14
|
+
use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
|
15
|
+
use crate::types::{ParquetGemError, TryIntoValue};
|
16
|
+
use crate::ColumnRecord;
|
17
|
+
|
18
|
+
/// Opens a parquet file or IO-like object for reading
|
19
|
+
///
|
20
|
+
/// This function handles both file paths (as strings) and IO-like objects,
|
21
|
+
/// returning either a File or a ThreadSafeRubyReader that can be used with
|
22
|
+
/// parquet readers.
|
23
|
+
pub fn open_parquet_source(
|
24
|
+
ruby: Arc<Ruby>,
|
25
|
+
to_read: Value,
|
26
|
+
) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
|
27
|
+
if to_read.is_kind_of(ruby.class_string()) {
|
28
|
+
let path_string = to_read.to_r_string()?;
|
29
|
+
let file_path = unsafe { path_string.as_str()? };
|
30
|
+
let file = File::open(file_path).map_err(ParquetGemError::from)?;
|
31
|
+
Ok(Either::Left(file))
|
32
|
+
} else {
|
33
|
+
let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
|
34
|
+
Ok(Either::Right(readable))
|
35
|
+
}
|
36
|
+
}
|
37
|
+
|
38
|
+
/// Helper function to check if a block is given and create an appropriate enumerator
|
39
|
+
/// if not
|
40
|
+
pub fn handle_block_or_enum<F, T>(
|
41
|
+
_ruby: &magnus::Ruby,
|
42
|
+
block_given: bool,
|
43
|
+
create_enum: F,
|
44
|
+
) -> Result<Option<T>, MagnusError>
|
45
|
+
where
|
46
|
+
F: FnOnce() -> Result<T, MagnusError>,
|
47
|
+
{
|
48
|
+
if !block_given {
|
49
|
+
let enum_value = create_enum()?;
|
50
|
+
return Ok(Some(enum_value));
|
51
|
+
}
|
52
|
+
Ok(None)
|
53
|
+
}
|
54
|
+
|
55
|
+
/// Creates a ParquetRecordBatchReader with the given columns and batch size configurations
|
56
|
+
pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
|
57
|
+
reader: T,
|
58
|
+
columns: &Option<Vec<String>>,
|
59
|
+
batch_size: Option<usize>,
|
60
|
+
) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
|
61
|
+
let mut builder = ParquetRecordBatchReaderBuilder::try_new(reader)
|
62
|
+
.map_err(|e| ParquetGemError::Parquet(e))?;
|
63
|
+
|
64
|
+
let schema = builder.schema().clone();
|
65
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
66
|
+
|
67
|
+
// If columns are specified, project only those columns
|
68
|
+
if let Some(cols) = columns {
|
69
|
+
// Get the parquet schema
|
70
|
+
let parquet_schema = builder.parquet_schema();
|
71
|
+
|
72
|
+
// Create a projection mask from column names
|
73
|
+
let projection = ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
74
|
+
builder = builder.with_projection(projection);
|
75
|
+
}
|
76
|
+
|
77
|
+
if let Some(batch_size) = batch_size {
|
78
|
+
builder = builder.with_batch_size(batch_size);
|
79
|
+
}
|
80
|
+
|
81
|
+
let reader = builder.build().map_err(|e| ParquetGemError::Parquet(e))?;
|
82
|
+
Ok((reader, schema, num_rows))
|
83
|
+
}
|
84
|
+
|
85
|
+
/// Handles the case of an empty parquet file (no rows) by yielding a record with empty arrays
|
86
|
+
/// Returns true if the file was empty and was handled, false otherwise
|
87
|
+
pub fn handle_empty_file(
|
88
|
+
ruby: &magnus::Ruby,
|
89
|
+
schema: &Arc<Schema>,
|
90
|
+
num_rows: i64,
|
91
|
+
) -> Result<bool, ParquetGemError> {
|
92
|
+
if num_rows == 0 {
|
93
|
+
let mut map =
|
94
|
+
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
95
|
+
let headers: Vec<String> = schema
|
96
|
+
.fields()
|
97
|
+
.iter()
|
98
|
+
.map(|field| field.name().to_string())
|
99
|
+
.collect();
|
100
|
+
let interned_headers =
|
101
|
+
StringCache::intern_many(&headers).map_err(|e| ParquetGemError::HeaderIntern(e))?;
|
102
|
+
for field in interned_headers.iter() {
|
103
|
+
map.insert(*field, vec![]);
|
104
|
+
}
|
105
|
+
let record = ColumnRecord::Map(map);
|
106
|
+
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
107
|
+
return Ok(true);
|
108
|
+
}
|
109
|
+
Ok(false)
|
110
|
+
}
|
@@ -1,48 +1,6 @@
|
|
1
|
+
mod common;
|
1
2
|
mod parquet_column_reader;
|
2
3
|
mod parquet_row_reader;
|
3
4
|
|
4
|
-
use std::io;
|
5
|
-
|
6
|
-
use magnus::{Error as MagnusError, Ruby};
|
7
|
-
use thiserror::Error;
|
8
|
-
|
9
|
-
use crate::header_cache::CacheError;
|
10
5
|
pub use parquet_column_reader::parse_parquet_columns;
|
11
6
|
pub use parquet_row_reader::parse_parquet_rows;
|
12
|
-
|
13
|
-
#[derive(Error, Debug)]
|
14
|
-
pub enum ReaderError {
|
15
|
-
#[error("Failed to get file descriptor: {0}")]
|
16
|
-
FileDescriptor(String),
|
17
|
-
#[error("Invalid file descriptor")]
|
18
|
-
InvalidFileDescriptor,
|
19
|
-
#[error("Failed to open file: {0}")]
|
20
|
-
FileOpen(#[from] io::Error),
|
21
|
-
#[error("Failed to intern headers: {0}")]
|
22
|
-
HeaderIntern(#[from] CacheError),
|
23
|
-
#[error("Ruby error: {0}")]
|
24
|
-
Ruby(String),
|
25
|
-
#[error("Parquet error: {0}")]
|
26
|
-
Parquet(#[from] parquet::errors::ParquetError),
|
27
|
-
#[error("Arrow error: {0}")]
|
28
|
-
Arrow(#[from] arrow_schema::ArrowError),
|
29
|
-
#[error("UTF-8 error: {0}")]
|
30
|
-
Utf8Error(#[from] simdutf8::basic::Utf8Error),
|
31
|
-
#[error("Jiff error: {0}")]
|
32
|
-
Jiff(#[from] jiff::Error),
|
33
|
-
}
|
34
|
-
|
35
|
-
impl From<MagnusError> for ReaderError {
|
36
|
-
fn from(err: MagnusError) -> Self {
|
37
|
-
Self::Ruby(err.to_string())
|
38
|
-
}
|
39
|
-
}
|
40
|
-
|
41
|
-
impl From<ReaderError> for MagnusError {
|
42
|
-
fn from(err: ReaderError) -> Self {
|
43
|
-
MagnusError::new(
|
44
|
-
Ruby::get().unwrap().exception_runtime_error(),
|
45
|
-
err.to_string(),
|
46
|
-
)
|
47
|
-
}
|
48
|
-
}
|
@@ -1,119 +1,83 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
|
-
use crate::
|
3
|
-
use crate::types::{ArrayWrapper, TryIntoValue};
|
2
|
+
use crate::logger::RubyLogger;
|
3
|
+
use crate::types::{ArrayWrapper, ParquetGemError, TryIntoValue};
|
4
4
|
use crate::{
|
5
5
|
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
|
6
6
|
ParserResultType,
|
7
7
|
};
|
8
8
|
use ahash::RandomState;
|
9
|
-
use
|
9
|
+
use either::Either;
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
|
-
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
13
|
-
use parquet::arrow::ProjectionMask;
|
14
12
|
use std::collections::HashMap;
|
15
|
-
use std::
|
16
|
-
use std::sync::OnceLock;
|
13
|
+
use std::sync::{Arc, OnceLock};
|
17
14
|
|
18
|
-
use super::
|
15
|
+
use super::common::{
|
16
|
+
create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
|
17
|
+
};
|
19
18
|
|
20
19
|
#[inline]
|
21
20
|
pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
22
21
|
let ruby = unsafe { Ruby::get_unchecked() };
|
22
|
+
Ok(
|
23
|
+
parse_parquet_columns_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
|
24
|
+
let z: MagnusError = e.into();
|
25
|
+
z
|
26
|
+
})?,
|
27
|
+
)
|
28
|
+
}
|
23
29
|
|
30
|
+
#[inline]
|
31
|
+
fn parse_parquet_columns_impl<'a>(
|
32
|
+
ruby: Arc<Ruby>,
|
33
|
+
rb_self: Value,
|
34
|
+
args: &[Value],
|
35
|
+
) -> Result<Value, ParquetGemError> {
|
24
36
|
let ParquetColumnsArgs {
|
25
37
|
to_read,
|
26
38
|
result_type,
|
27
39
|
columns,
|
28
40
|
batch_size,
|
29
41
|
strict,
|
42
|
+
logger,
|
30
43
|
} = parse_parquet_columns_args(&ruby, args)?;
|
31
44
|
|
32
|
-
if
|
33
|
-
|
45
|
+
// Initialize the logger if provided
|
46
|
+
let ruby_logger = RubyLogger::new(&ruby, logger)?;
|
47
|
+
if let Some(ref bs) = batch_size {
|
48
|
+
ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
|
49
|
+
}
|
50
|
+
|
51
|
+
// Clone values for the closure to avoid move issues
|
52
|
+
let columns_clone = columns.clone();
|
53
|
+
|
54
|
+
// Handle block or create enumerator
|
55
|
+
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
56
|
+
create_column_enumerator(ColumnEnumeratorArgs {
|
34
57
|
rb_self,
|
35
58
|
to_read,
|
36
59
|
result_type,
|
37
|
-
columns,
|
60
|
+
columns: columns_clone,
|
38
61
|
batch_size,
|
39
62
|
strict,
|
63
|
+
logger: logger.as_ref().map(|_| to_read),
|
40
64
|
})
|
41
|
-
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
65
|
+
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
66
|
+
})? {
|
67
|
+
return Ok(enum_value);
|
42
68
|
}
|
43
69
|
|
44
|
-
let
|
45
|
-
let path_string = to_read.to_r_string()?;
|
46
|
-
let file_path = unsafe { path_string.as_str()? };
|
47
|
-
let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
|
70
|
+
let source = open_parquet_source(ruby.clone(), to_read)?;
|
48
71
|
|
49
|
-
|
50
|
-
ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
|
51
|
-
let schema = builder.schema().clone();
|
52
|
-
let num_rows = builder.metadata().file_metadata().num_rows();
|
53
|
-
|
54
|
-
// If columns are specified, project only those columns
|
55
|
-
if let Some(cols) = &columns {
|
56
|
-
// Get the parquet schema
|
57
|
-
let parquet_schema = builder.parquet_schema();
|
58
|
-
|
59
|
-
// Create a projection mask from column names
|
60
|
-
let projection =
|
61
|
-
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
62
|
-
|
63
|
-
builder = builder.with_projection(projection);
|
64
|
-
}
|
65
|
-
|
66
|
-
if let Some(batch_size) = batch_size {
|
67
|
-
builder = builder.with_batch_size(batch_size);
|
68
|
-
}
|
72
|
+
// Use the common function to create the batch reader
|
69
73
|
|
70
|
-
|
71
|
-
|
72
|
-
(
|
73
|
-
} else {
|
74
|
-
let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
|
75
|
-
|
76
|
-
let mut builder =
|
77
|
-
ParquetRecordBatchReaderBuilder::try_new(readable).map_err(ReaderError::from)?;
|
78
|
-
let schema = builder.schema().clone();
|
79
|
-
let num_rows = builder.metadata().file_metadata().num_rows();
|
80
|
-
|
81
|
-
// If columns are specified, project only those columns
|
82
|
-
if let Some(cols) = &columns {
|
83
|
-
// Get the parquet schema
|
84
|
-
let parquet_schema = builder.parquet_schema();
|
85
|
-
|
86
|
-
// Create a projection mask from column names
|
87
|
-
let projection =
|
88
|
-
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
89
|
-
|
90
|
-
builder = builder.with_projection(projection);
|
91
|
-
}
|
92
|
-
|
93
|
-
if let Some(batch_size) = batch_size {
|
94
|
-
builder = builder.with_batch_size(batch_size);
|
95
|
-
}
|
96
|
-
|
97
|
-
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
98
|
-
|
99
|
-
(reader, schema, num_rows)
|
74
|
+
let (batch_reader, schema, num_rows) = match source {
|
75
|
+
Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
|
76
|
+
Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
|
100
77
|
};
|
101
78
|
|
102
|
-
|
103
|
-
|
104
|
-
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
105
|
-
let headers: Vec<String> = schema
|
106
|
-
.fields()
|
107
|
-
.iter()
|
108
|
-
.map(|field| field.name().to_string())
|
109
|
-
.collect();
|
110
|
-
let interned_headers =
|
111
|
-
StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
|
112
|
-
for field in interned_headers.iter() {
|
113
|
-
map.insert(*field, vec![]);
|
114
|
-
}
|
115
|
-
let record = ColumnRecord::Map(map);
|
116
|
-
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
79
|
+
// Handle empty file case
|
80
|
+
if handle_empty_file(&ruby, &schema, num_rows)? {
|
117
81
|
return Ok(ruby.qnil().into_value_with(&ruby));
|
118
82
|
}
|
119
83
|
|
@@ -122,7 +86,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
122
86
|
let headers = OnceLock::new();
|
123
87
|
let headers_clone = headers.clone();
|
124
88
|
let iter = batch_reader.map(move |batch| {
|
125
|
-
batch.map_err(
|
89
|
+
batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
|
126
90
|
let local_headers = headers_clone
|
127
91
|
.get_or_init(|| {
|
128
92
|
let schema = batch.schema();
|
@@ -134,7 +98,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
134
98
|
StringCache::intern_many(&header_string)
|
135
99
|
})
|
136
100
|
.as_ref()
|
137
|
-
.map_err(|e|
|
101
|
+
.map_err(|e| ParquetGemError::HeaderIntern(e.clone()))?;
|
138
102
|
|
139
103
|
let mut map = HashMap::with_capacity_and_hasher(
|
140
104
|
local_headers.len(),
|
@@ -152,7 +116,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
152
116
|
strict: strict,
|
153
117
|
})?;
|
154
118
|
map.insert(header, values.into_inner());
|
155
|
-
Ok::<_,
|
119
|
+
Ok::<_, ParquetGemError>(())
|
156
120
|
})?;
|
157
121
|
|
158
122
|
Ok(ColumnRecord::Map::<RandomState>(map))
|
@@ -166,7 +130,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
166
130
|
}
|
167
131
|
ParserResultType::Array => {
|
168
132
|
let iter = batch_reader.map(|batch| {
|
169
|
-
batch.map_err(
|
133
|
+
batch.map_err(ParquetGemError::Arrow).and_then(|batch| {
|
170
134
|
let vec = batch
|
171
135
|
.columns()
|
172
136
|
.into_iter()
|
@@ -175,7 +139,7 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
175
139
|
array: &*column,
|
176
140
|
strict: strict,
|
177
141
|
})?;
|
178
|
-
Ok::<_,
|
142
|
+
Ok::<_, ParquetGemError>(values.into_inner())
|
179
143
|
})
|
180
144
|
.collect::<Result<Vec<_>, _>>()?;
|
181
145
|
Ok(ColumnRecord::Vec::<RandomState>(vec))
|
@@ -1,55 +1,84 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
|
-
use crate::
|
2
|
+
use crate::logger::RubyLogger;
|
3
3
|
use crate::types::TryIntoValue;
|
4
4
|
use crate::{
|
5
|
-
create_row_enumerator, utils::*, ParquetField,
|
5
|
+
create_row_enumerator, utils::*, ParquetField, ParquetGemError, ParserResultType,
|
6
6
|
RowEnumeratorArgs, RowRecord,
|
7
7
|
};
|
8
8
|
use ahash::RandomState;
|
9
|
-
use
|
9
|
+
use either::Either;
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
12
|
use parquet::file::reader::{FileReader, SerializedFileReader};
|
13
13
|
use parquet::record::reader::RowIter as ParquetRowIter;
|
14
14
|
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
15
15
|
use std::collections::HashMap;
|
16
|
-
use std::
|
17
|
-
|
16
|
+
use std::sync::{Arc, OnceLock};
|
17
|
+
|
18
|
+
use super::common::{handle_block_or_enum, open_parquet_source};
|
18
19
|
|
19
20
|
#[inline]
|
20
21
|
pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
21
22
|
let ruby = unsafe { Ruby::get_unchecked() };
|
23
|
+
Ok(
|
24
|
+
parse_parquet_rows_impl(Arc::new(ruby), rb_self, args).map_err(|e| {
|
25
|
+
let z: MagnusError = e.into();
|
26
|
+
z
|
27
|
+
})?,
|
28
|
+
)
|
29
|
+
}
|
22
30
|
|
31
|
+
#[inline]
|
32
|
+
fn parse_parquet_rows_impl<'a>(
|
33
|
+
ruby: Arc<Ruby>,
|
34
|
+
rb_self: Value,
|
35
|
+
args: &[Value],
|
36
|
+
) -> Result<Value, ParquetGemError> {
|
23
37
|
let ParquetRowsArgs {
|
24
38
|
to_read,
|
25
39
|
result_type,
|
26
40
|
columns,
|
27
41
|
strict,
|
42
|
+
logger,
|
28
43
|
} = parse_parquet_rows_args(&ruby, args)?;
|
29
44
|
|
30
|
-
if
|
31
|
-
|
45
|
+
// Initialize the logger if provided
|
46
|
+
let ruby_logger = RubyLogger::new(&ruby, logger)?;
|
47
|
+
|
48
|
+
// Clone values for the closure to avoid move issues
|
49
|
+
let columns_clone = columns.clone();
|
50
|
+
|
51
|
+
// Handle block or create enumerator
|
52
|
+
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
53
|
+
create_row_enumerator(RowEnumeratorArgs {
|
32
54
|
rb_self,
|
33
55
|
to_read,
|
34
56
|
result_type,
|
35
|
-
columns,
|
57
|
+
columns: columns_clone,
|
36
58
|
strict,
|
59
|
+
logger,
|
37
60
|
})
|
38
|
-
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
61
|
+
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
62
|
+
})? {
|
63
|
+
return Ok(enum_value);
|
39
64
|
}
|
40
65
|
|
41
|
-
let
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
66
|
+
let source = open_parquet_source(ruby.clone(), to_read)?;
|
67
|
+
let reader: Box<dyn FileReader> = match source {
|
68
|
+
Either::Left(file) => {
|
69
|
+
Box::new(SerializedFileReader::new(file).map_err(ParquetGemError::from)?)
|
70
|
+
}
|
71
|
+
Either::Right(readable) => {
|
72
|
+
Box::new(SerializedFileReader::new(readable).map_err(ParquetGemError::from)?)
|
73
|
+
}
|
49
74
|
};
|
75
|
+
|
50
76
|
let schema = reader.metadata().file_metadata().schema().clone();
|
77
|
+
ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
|
78
|
+
|
51
79
|
let mut iter = ParquetRowIter::from_file_into(reader);
|
52
80
|
if let Some(cols) = columns {
|
81
|
+
ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
|
53
82
|
let projection = create_projection_schema(&schema, &cols);
|
54
83
|
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
55
84
|
MagnusError::new(
|
@@ -81,13 +110,13 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
81
110
|
|
82
111
|
let mut map =
|
83
112
|
HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
|
84
|
-
|
113
|
+
for (i, (_, v)) in row.get_column_iter().enumerate() {
|
85
114
|
map.insert(headers[i], ParquetField(v.clone(), strict));
|
86
|
-
}
|
115
|
+
}
|
87
116
|
Ok(map)
|
88
117
|
})
|
89
118
|
.and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
|
90
|
-
.map_err(|e|
|
119
|
+
.map_err(|e| ParquetGemError::from(e))
|
91
120
|
});
|
92
121
|
|
93
122
|
for result in iter {
|
@@ -100,12 +129,13 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
100
129
|
row.and_then(|row| {
|
101
130
|
let column_count = row.get_column_iter().count();
|
102
131
|
let mut vec = Vec::with_capacity(column_count);
|
103
|
-
row.get_column_iter()
|
104
|
-
|
132
|
+
for (_, v) in row.get_column_iter() {
|
133
|
+
vec.push(ParquetField(v.clone(), strict));
|
134
|
+
}
|
105
135
|
Ok(vec)
|
106
136
|
})
|
107
137
|
.and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
|
108
|
-
.map_err(|e|
|
138
|
+
.map_err(|e| ParquetGemError::from(e))
|
109
139
|
});
|
110
140
|
|
111
141
|
for result in iter {
|