parquet 0.4.2 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +66 -59
- data/README.md +105 -1
- data/ext/parquet/Cargo.toml +4 -3
- data/ext/parquet/src/enumerator.rs +8 -0
- data/ext/parquet/src/header_cache.rs +7 -3
- data/ext/parquet/src/lib.rs +1 -0
- data/ext/parquet/src/logger.rs +171 -0
- data/ext/parquet/src/reader/common.rs +113 -0
- data/ext/parquet/src/reader/mod.rs +27 -13
- data/ext/parquet/src/reader/parquet_column_reader.rs +38 -78
- data/ext/parquet/src/reader/parquet_row_reader.rs +42 -19
- data/ext/parquet/src/types/core_types.rs +57 -1
- data/ext/parquet/src/types/mod.rs +8 -1
- data/ext/parquet/src/types/parquet_value.rs +211 -35
- data/ext/parquet/src/types/record_types.rs +18 -15
- data/ext/parquet/src/types/schema_converter.rs +349 -0
- data/ext/parquet/src/types/schema_node.rs +329 -0
- data/ext/parquet/src/types/timestamp.rs +18 -8
- data/ext/parquet/src/types/type_conversion.rs +1106 -511
- data/ext/parquet/src/types/writer_types.rs +78 -107
- data/ext/parquet/src/utils.rs +29 -9
- data/ext/parquet/src/writer/mod.rs +828 -280
- data/lib/parquet/schema.rb +154 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +1 -0
- metadata +7 -2
@@ -0,0 +1,171 @@
|
|
1
|
+
// Logger module for Parquet gem
|
2
|
+
// Provides a Rust wrapper for Ruby logger objects
|
3
|
+
|
4
|
+
use std::str::FromStr;
|
5
|
+
|
6
|
+
use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
|
7
|
+
|
8
|
+
use crate::{reader::ReaderError, utils::parse_string_or_symbol};
|
9
|
+
|
10
|
+
/// Logging severities, mirroring the levels of Ruby's `Logger`.
///
/// Variants are declared from least to most severe. The derived `PartialOrd`
/// and `Ord` follow that order, so `a <= b` reads as "`a` is no more severe
/// than `b`"; the threshold checks in this module rely on it.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum LogLevel {
    /// Most verbose level.
    Debug = 0,
    /// Informational messages.
    Info = 1,
    /// Warnings.
    Warn = 2,
    /// Errors.
    Error = 3,
    /// Most severe level.
    Fatal = 4,
}
|
19
|
+
|
20
|
+
impl FromStr for LogLevel {
|
21
|
+
type Err = MagnusError;
|
22
|
+
|
23
|
+
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
24
|
+
Ok(match s {
|
25
|
+
"debug" => LogLevel::Debug,
|
26
|
+
"info" => LogLevel::Info,
|
27
|
+
"warn" => LogLevel::Warn,
|
28
|
+
"error" => LogLevel::Error,
|
29
|
+
"fatal" => LogLevel::Fatal,
|
30
|
+
_ => {
|
31
|
+
return Err(MagnusError::new(
|
32
|
+
runtime_error(),
|
33
|
+
format!("Invalid log level: {}", s),
|
34
|
+
))
|
35
|
+
}
|
36
|
+
})
|
37
|
+
}
|
38
|
+
}
|
39
|
+
/// A wrapper around a Ruby logger object
|
40
|
+
#[derive(Debug, Clone)]
|
41
|
+
pub struct RubyLogger {
|
42
|
+
logger: Option<Value>,
|
43
|
+
level: LogLevel,
|
44
|
+
}
|
45
|
+
|
46
|
+
#[allow(dead_code)]
|
47
|
+
impl RubyLogger {
|
48
|
+
pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ReaderError> {
|
49
|
+
let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
|
50
|
+
.unwrap_or_else(|_| "warn".to_string())
|
51
|
+
.parse::<LogLevel>()
|
52
|
+
.unwrap_or(LogLevel::Warn);
|
53
|
+
|
54
|
+
match logger_value {
|
55
|
+
Some(logger) => {
|
56
|
+
if logger.is_nil() {
|
57
|
+
return Ok(Self {
|
58
|
+
logger: None,
|
59
|
+
level: environment_level,
|
60
|
+
});
|
61
|
+
}
|
62
|
+
|
63
|
+
let level_value = logger.funcall::<_, _, Value>("level", ())?;
|
64
|
+
let level = parse_string_or_symbol(ruby, level_value)?;
|
65
|
+
let level = level
|
66
|
+
.map(|s| s.parse::<LogLevel>())
|
67
|
+
.transpose()?
|
68
|
+
.unwrap_or(environment_level);
|
69
|
+
|
70
|
+
Ok(Self {
|
71
|
+
logger: Some(logger),
|
72
|
+
level,
|
73
|
+
})
|
74
|
+
}
|
75
|
+
None => Ok(Self {
|
76
|
+
logger: None,
|
77
|
+
level: environment_level,
|
78
|
+
}),
|
79
|
+
}
|
80
|
+
}
|
81
|
+
|
82
|
+
/// Log a message at the given level
|
83
|
+
pub fn log(&self, level: LogLevel, message: &str) -> Result<(), MagnusError> {
|
84
|
+
let method = match level {
|
85
|
+
LogLevel::Debug => "debug",
|
86
|
+
LogLevel::Info => "info",
|
87
|
+
LogLevel::Warn => "warn",
|
88
|
+
LogLevel::Error => "error",
|
89
|
+
LogLevel::Fatal => "fatal",
|
90
|
+
};
|
91
|
+
|
92
|
+
match self.logger {
|
93
|
+
Some(logger) => {
|
94
|
+
logger.funcall::<_, _, Value>(method, (message,))?;
|
95
|
+
}
|
96
|
+
None => eprintln!("{}", message),
|
97
|
+
}
|
98
|
+
|
99
|
+
Ok(())
|
100
|
+
}
|
101
|
+
|
102
|
+
/// Log a debug message
|
103
|
+
pub fn debug<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
104
|
+
where
|
105
|
+
F: FnOnce() -> S,
|
106
|
+
S: AsRef<str>,
|
107
|
+
{
|
108
|
+
if self.level <= LogLevel::Debug {
|
109
|
+
let message = message_fn();
|
110
|
+
self.log(LogLevel::Debug, message.as_ref())
|
111
|
+
} else {
|
112
|
+
Ok(())
|
113
|
+
}
|
114
|
+
}
|
115
|
+
|
116
|
+
/// Log an info message
|
117
|
+
pub fn info<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
118
|
+
where
|
119
|
+
F: FnOnce() -> S,
|
120
|
+
S: AsRef<str>,
|
121
|
+
{
|
122
|
+
if self.level <= LogLevel::Info {
|
123
|
+
let message = message_fn();
|
124
|
+
self.log(LogLevel::Info, message.as_ref())
|
125
|
+
} else {
|
126
|
+
Ok(())
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
/// Log a warning message
|
131
|
+
pub fn warn<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
132
|
+
where
|
133
|
+
F: FnOnce() -> S,
|
134
|
+
S: AsRef<str>,
|
135
|
+
{
|
136
|
+
if self.level <= LogLevel::Warn {
|
137
|
+
let message = message_fn();
|
138
|
+
self.log(LogLevel::Warn, message.as_ref())
|
139
|
+
} else {
|
140
|
+
Ok(())
|
141
|
+
}
|
142
|
+
}
|
143
|
+
|
144
|
+
/// Log an error message
|
145
|
+
pub fn error<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
146
|
+
where
|
147
|
+
F: FnOnce() -> S,
|
148
|
+
S: AsRef<str>,
|
149
|
+
{
|
150
|
+
if self.level <= LogLevel::Error {
|
151
|
+
let message = message_fn();
|
152
|
+
self.log(LogLevel::Error, message.as_ref())
|
153
|
+
} else {
|
154
|
+
Ok(())
|
155
|
+
}
|
156
|
+
}
|
157
|
+
|
158
|
+
/// Log a fatal message
|
159
|
+
pub fn fatal<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
|
160
|
+
where
|
161
|
+
F: FnOnce() -> S,
|
162
|
+
S: AsRef<str>,
|
163
|
+
{
|
164
|
+
if self.level <= LogLevel::Fatal {
|
165
|
+
let message = message_fn();
|
166
|
+
self.log(LogLevel::Fatal, message.as_ref())
|
167
|
+
} else {
|
168
|
+
Ok(())
|
169
|
+
}
|
170
|
+
}
|
171
|
+
}
|
@@ -0,0 +1,113 @@
|
|
1
|
+
use ahash::RandomState;
|
2
|
+
use arrow_schema::Schema;
|
3
|
+
use either::Either;
|
4
|
+
use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
|
5
|
+
use parquet::arrow::ProjectionMask;
|
6
|
+
use std::collections::HashMap;
|
7
|
+
use std::fs::File;
|
8
|
+
use std::sync::Arc;
|
9
|
+
|
10
|
+
use magnus::value::ReprValue;
|
11
|
+
use magnus::{Error as MagnusError, Value};
|
12
|
+
|
13
|
+
use crate::header_cache::StringCache;
|
14
|
+
use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
|
15
|
+
use crate::types::TryIntoValue;
|
16
|
+
use crate::ColumnRecord;
|
17
|
+
|
18
|
+
use super::ReaderError;
|
19
|
+
|
20
|
+
/// Opens a parquet file or IO-like object for reading
|
21
|
+
///
|
22
|
+
/// This function handles both file paths (as strings) and IO-like objects,
|
23
|
+
/// returning either a File or a ThreadSafeRubyReader that can be used with
|
24
|
+
/// parquet readers.
|
25
|
+
pub fn open_parquet_source(
|
26
|
+
to_read: Value,
|
27
|
+
) -> Result<Either<File, ThreadSafeRubyReader>, ReaderError> {
|
28
|
+
let ruby = unsafe { magnus::Ruby::get_unchecked() };
|
29
|
+
|
30
|
+
if to_read.is_kind_of(ruby.class_string()) {
|
31
|
+
let path_string = to_read.to_r_string()?;
|
32
|
+
let file_path = unsafe { path_string.as_str()? };
|
33
|
+
let file = File::open(file_path).map_err(ReaderError::from)?;
|
34
|
+
Ok(Either::Left(file))
|
35
|
+
} else {
|
36
|
+
let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
|
37
|
+
Ok(Either::Right(readable))
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
/// Helper function to check if a block is given and create an appropriate enumerator
|
42
|
+
/// if not
|
43
|
+
pub fn handle_block_or_enum<F, T>(
|
44
|
+
_ruby: &magnus::Ruby,
|
45
|
+
block_given: bool,
|
46
|
+
create_enum: F,
|
47
|
+
) -> Result<Option<T>, MagnusError>
|
48
|
+
where
|
49
|
+
F: FnOnce() -> Result<T, MagnusError>,
|
50
|
+
{
|
51
|
+
if !block_given {
|
52
|
+
let enum_value = create_enum()?;
|
53
|
+
return Ok(Some(enum_value));
|
54
|
+
}
|
55
|
+
Ok(None)
|
56
|
+
}
|
57
|
+
|
58
|
+
/// Creates a ParquetRecordBatchReader with the given columns and batch size configurations
|
59
|
+
pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
|
60
|
+
reader: T,
|
61
|
+
columns: &Option<Vec<String>>,
|
62
|
+
batch_size: Option<usize>,
|
63
|
+
) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ReaderError> {
|
64
|
+
let mut builder =
|
65
|
+
ParquetRecordBatchReaderBuilder::try_new(reader).map_err(|e| ReaderError::Parquet(e))?;
|
66
|
+
|
67
|
+
let schema = builder.schema().clone();
|
68
|
+
let num_rows = builder.metadata().file_metadata().num_rows();
|
69
|
+
|
70
|
+
// If columns are specified, project only those columns
|
71
|
+
if let Some(cols) = columns {
|
72
|
+
// Get the parquet schema
|
73
|
+
let parquet_schema = builder.parquet_schema();
|
74
|
+
|
75
|
+
// Create a projection mask from column names
|
76
|
+
let projection = ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
77
|
+
builder = builder.with_projection(projection);
|
78
|
+
}
|
79
|
+
|
80
|
+
if let Some(batch_size) = batch_size {
|
81
|
+
builder = builder.with_batch_size(batch_size);
|
82
|
+
}
|
83
|
+
|
84
|
+
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
85
|
+
Ok((reader, schema, num_rows))
|
86
|
+
}
|
87
|
+
|
88
|
+
/// Handles the case of an empty parquet file (no rows) by yielding a record with empty arrays
|
89
|
+
/// Returns true if the file was empty and was handled, false otherwise
|
90
|
+
pub fn handle_empty_file(
|
91
|
+
ruby: &magnus::Ruby,
|
92
|
+
schema: &Arc<Schema>,
|
93
|
+
num_rows: i64,
|
94
|
+
) -> Result<bool, ReaderError> {
|
95
|
+
if num_rows == 0 {
|
96
|
+
let mut map =
|
97
|
+
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
98
|
+
let headers: Vec<String> = schema
|
99
|
+
.fields()
|
100
|
+
.iter()
|
101
|
+
.map(|field| field.name().to_string())
|
102
|
+
.collect();
|
103
|
+
let interned_headers =
|
104
|
+
StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
|
105
|
+
for field in interned_headers.iter() {
|
106
|
+
map.insert(*field, vec![]);
|
107
|
+
}
|
108
|
+
let record = ColumnRecord::Map(map);
|
109
|
+
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
110
|
+
return Ok(true);
|
111
|
+
}
|
112
|
+
Ok(false)
|
113
|
+
}
|
@@ -1,9 +1,10 @@
|
|
1
|
+
mod common;
|
1
2
|
mod parquet_column_reader;
|
2
3
|
mod parquet_row_reader;
|
3
4
|
|
4
5
|
use std::io;
|
5
6
|
|
6
|
-
use magnus::
|
7
|
+
use magnus::Error as MagnusError;
|
7
8
|
use thiserror::Error;
|
8
9
|
|
9
10
|
use crate::header_cache::CacheError;
|
@@ -12,16 +13,12 @@ pub use parquet_row_reader::parse_parquet_rows;
|
|
12
13
|
|
13
14
|
#[derive(Error, Debug)]
|
14
15
|
pub enum ReaderError {
|
15
|
-
#[error("Failed to get file descriptor: {0}")]
|
16
|
-
FileDescriptor(String),
|
17
|
-
#[error("Invalid file descriptor")]
|
18
|
-
InvalidFileDescriptor,
|
19
16
|
#[error("Failed to open file: {0}")]
|
20
17
|
FileOpen(#[from] io::Error),
|
21
18
|
#[error("Failed to intern headers: {0}")]
|
22
19
|
HeaderIntern(#[from] CacheError),
|
23
20
|
#[error("Ruby error: {0}")]
|
24
|
-
Ruby(
|
21
|
+
Ruby(#[from] MagnusErrorWrapper),
|
25
22
|
#[error("Parquet error: {0}")]
|
26
23
|
Parquet(#[from] parquet::errors::ParquetError),
|
27
24
|
#[error("Arrow error: {0}")]
|
@@ -32,17 +29,34 @@ pub enum ReaderError {
|
|
32
29
|
Jiff(#[from] jiff::Error),
|
33
30
|
}
|
34
31
|
|
32
|
+
#[derive(Debug)]
|
33
|
+
pub struct MagnusErrorWrapper(pub MagnusError);
|
34
|
+
|
35
|
+
impl From<MagnusError> for MagnusErrorWrapper {
|
36
|
+
fn from(err: MagnusError) -> Self {
|
37
|
+
Self(err)
|
38
|
+
}
|
39
|
+
}
|
40
|
+
|
41
|
+
impl std::fmt::Display for MagnusErrorWrapper {
|
42
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
43
|
+
write!(f, "{}", self.0)
|
44
|
+
}
|
45
|
+
}
|
46
|
+
|
47
|
+
impl std::error::Error for MagnusErrorWrapper {}
|
48
|
+
|
35
49
|
impl From<MagnusError> for ReaderError {
|
36
50
|
fn from(err: MagnusError) -> Self {
|
37
|
-
Self::Ruby(err
|
51
|
+
Self::Ruby(MagnusErrorWrapper(err))
|
38
52
|
}
|
39
53
|
}
|
40
54
|
|
41
|
-
impl
|
42
|
-
fn
|
43
|
-
|
44
|
-
Ruby
|
45
|
-
|
46
|
-
|
55
|
+
impl Into<MagnusError> for ReaderError {
|
56
|
+
fn into(self) -> MagnusError {
|
57
|
+
match self {
|
58
|
+
Self::Ruby(MagnusErrorWrapper(err)) => err.into(),
|
59
|
+
_ => MagnusError::new(magnus::exception::runtime_error(), self.to_string()),
|
60
|
+
}
|
47
61
|
}
|
48
62
|
}
|
@@ -1,24 +1,32 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
|
-
use crate::
|
2
|
+
use crate::logger::RubyLogger;
|
3
3
|
use crate::types::{ArrayWrapper, TryIntoValue};
|
4
4
|
use crate::{
|
5
5
|
create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ParquetValueVec,
|
6
6
|
ParserResultType,
|
7
7
|
};
|
8
8
|
use ahash::RandomState;
|
9
|
-
use
|
9
|
+
use either::Either;
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
|
-
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
|
13
|
-
use parquet::arrow::ProjectionMask;
|
14
12
|
use std::collections::HashMap;
|
15
|
-
use std::fs::File;
|
16
13
|
use std::sync::OnceLock;
|
17
14
|
|
15
|
+
use super::common::{
|
16
|
+
create_batch_reader, handle_block_or_enum, handle_empty_file, open_parquet_source,
|
17
|
+
};
|
18
18
|
use super::ReaderError;
|
19
19
|
|
20
20
|
#[inline]
|
21
21
|
pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
22
|
+
Ok(parse_parquet_columns_impl(rb_self, args).map_err(|e| {
|
23
|
+
let z: MagnusError = e.into();
|
24
|
+
z
|
25
|
+
})?)
|
26
|
+
}
|
27
|
+
|
28
|
+
#[inline]
|
29
|
+
fn parse_parquet_columns_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value, ReaderError> {
|
22
30
|
let ruby = unsafe { Ruby::get_unchecked() };
|
23
31
|
|
24
32
|
let ParquetColumnsArgs {
|
@@ -27,93 +35,45 @@ pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value
|
|
27
35
|
columns,
|
28
36
|
batch_size,
|
29
37
|
strict,
|
38
|
+
logger,
|
30
39
|
} = parse_parquet_columns_args(&ruby, args)?;
|
31
40
|
|
32
|
-
if
|
33
|
-
|
41
|
+
// Initialize the logger if provided
|
42
|
+
let ruby_logger = RubyLogger::new(&ruby, logger)?;
|
43
|
+
if let Some(ref bs) = batch_size {
|
44
|
+
ruby_logger.debug(|| format!("Using batch size: {}", bs))?;
|
45
|
+
}
|
46
|
+
|
47
|
+
// Clone values for the closure to avoid move issues
|
48
|
+
let columns_clone = columns.clone();
|
49
|
+
|
50
|
+
// Handle block or create enumerator
|
51
|
+
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
52
|
+
create_column_enumerator(ColumnEnumeratorArgs {
|
34
53
|
rb_self,
|
35
54
|
to_read,
|
36
55
|
result_type,
|
37
|
-
columns,
|
56
|
+
columns: columns_clone,
|
38
57
|
batch_size,
|
39
58
|
strict,
|
59
|
+
logger: logger.as_ref().map(|_| to_read),
|
40
60
|
})
|
41
|
-
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
61
|
+
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
62
|
+
})? {
|
63
|
+
return Ok(enum_value);
|
42
64
|
}
|
43
65
|
|
44
|
-
let
|
45
|
-
let path_string = to_read.to_r_string()?;
|
46
|
-
let file_path = unsafe { path_string.as_str()? };
|
47
|
-
let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
|
48
|
-
|
49
|
-
let mut builder =
|
50
|
-
ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
|
51
|
-
let schema = builder.schema().clone();
|
52
|
-
let num_rows = builder.metadata().file_metadata().num_rows();
|
53
|
-
|
54
|
-
// If columns are specified, project only those columns
|
55
|
-
if let Some(cols) = &columns {
|
56
|
-
// Get the parquet schema
|
57
|
-
let parquet_schema = builder.parquet_schema();
|
58
|
-
|
59
|
-
// Create a projection mask from column names
|
60
|
-
let projection =
|
61
|
-
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
62
|
-
|
63
|
-
builder = builder.with_projection(projection);
|
64
|
-
}
|
65
|
-
|
66
|
-
if let Some(batch_size) = batch_size {
|
67
|
-
builder = builder.with_batch_size(batch_size);
|
68
|
-
}
|
69
|
-
|
70
|
-
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
71
|
-
|
72
|
-
(reader, schema, num_rows)
|
73
|
-
} else {
|
74
|
-
let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
|
75
|
-
|
76
|
-
let mut builder =
|
77
|
-
ParquetRecordBatchReaderBuilder::try_new(readable).map_err(ReaderError::from)?;
|
78
|
-
let schema = builder.schema().clone();
|
79
|
-
let num_rows = builder.metadata().file_metadata().num_rows();
|
80
|
-
|
81
|
-
// If columns are specified, project only those columns
|
82
|
-
if let Some(cols) = &columns {
|
83
|
-
// Get the parquet schema
|
84
|
-
let parquet_schema = builder.parquet_schema();
|
66
|
+
let source = open_parquet_source(to_read)?;
|
85
67
|
|
86
|
-
|
87
|
-
let projection =
|
88
|
-
ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
|
68
|
+
// Use the common function to create the batch reader
|
89
69
|
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
if let Some(batch_size) = batch_size {
|
94
|
-
builder = builder.with_batch_size(batch_size);
|
95
|
-
}
|
96
|
-
|
97
|
-
let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
|
98
|
-
|
99
|
-
(reader, schema, num_rows)
|
70
|
+
let (batch_reader, schema, num_rows) = match source {
|
71
|
+
Either::Left(file) => create_batch_reader(file, &columns, batch_size)?,
|
72
|
+
Either::Right(readable) => create_batch_reader(readable, &columns, batch_size)?,
|
100
73
|
};
|
101
74
|
|
102
|
-
|
103
|
-
|
104
|
-
HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
|
105
|
-
let headers: Vec<String> = schema
|
106
|
-
.fields()
|
107
|
-
.iter()
|
108
|
-
.map(|field| field.name().to_string())
|
109
|
-
.collect();
|
110
|
-
let interned_headers =
|
111
|
-
StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
|
112
|
-
for field in interned_headers.iter() {
|
113
|
-
map.insert(*field, vec![]);
|
114
|
-
}
|
115
|
-
let record = ColumnRecord::Map(map);
|
116
|
-
let _: Value = ruby.yield_value(record.try_into_value_with(&ruby)?)?;
|
75
|
+
// Handle empty file case
|
76
|
+
if handle_empty_file(&ruby, &schema, num_rows)? {
|
117
77
|
return Ok(ruby.qnil().into_value_with(&ruby));
|
118
78
|
}
|
119
79
|
|
@@ -1,23 +1,32 @@
|
|
1
1
|
use crate::header_cache::StringCache;
|
2
|
-
use crate::
|
2
|
+
use crate::logger::RubyLogger;
|
3
3
|
use crate::types::TryIntoValue;
|
4
4
|
use crate::{
|
5
5
|
create_row_enumerator, utils::*, ParquetField, ParserResultType, ReaderError,
|
6
6
|
RowEnumeratorArgs, RowRecord,
|
7
7
|
};
|
8
8
|
use ahash::RandomState;
|
9
|
-
use
|
9
|
+
use either::Either;
|
10
10
|
use magnus::IntoValue;
|
11
11
|
use magnus::{Error as MagnusError, Ruby, Value};
|
12
12
|
use parquet::file::reader::{FileReader, SerializedFileReader};
|
13
13
|
use parquet::record::reader::RowIter as ParquetRowIter;
|
14
14
|
use parquet::schema::types::{Type as SchemaType, TypePtr};
|
15
15
|
use std::collections::HashMap;
|
16
|
-
use std::fs::File;
|
17
16
|
use std::sync::OnceLock;
|
18
17
|
|
18
|
+
use super::common::{handle_block_or_enum, open_parquet_source};
|
19
|
+
|
19
20
|
#[inline]
|
20
21
|
pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
|
22
|
+
Ok(parse_parquet_rows_impl(rb_self, args).map_err(|e| {
|
23
|
+
let z: MagnusError = e.into();
|
24
|
+
z
|
25
|
+
})?)
|
26
|
+
}
|
27
|
+
|
28
|
+
#[inline]
|
29
|
+
fn parse_parquet_rows_impl<'a>(rb_self: Value, args: &[Value]) -> Result<Value, ReaderError> {
|
21
30
|
let ruby = unsafe { Ruby::get_unchecked() };
|
22
31
|
|
23
32
|
let ParquetRowsArgs {
|
@@ -25,31 +34,44 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
25
34
|
result_type,
|
26
35
|
columns,
|
27
36
|
strict,
|
37
|
+
logger,
|
28
38
|
} = parse_parquet_rows_args(&ruby, args)?;
|
29
39
|
|
30
|
-
if
|
31
|
-
|
40
|
+
// Initialize the logger if provided
|
41
|
+
let ruby_logger = RubyLogger::new(&ruby, logger)?;
|
42
|
+
|
43
|
+
// Clone values for the closure to avoid move issues
|
44
|
+
let columns_clone = columns.clone();
|
45
|
+
|
46
|
+
// Handle block or create enumerator
|
47
|
+
if let Some(enum_value) = handle_block_or_enum(&ruby, ruby.block_given(), || {
|
48
|
+
create_row_enumerator(RowEnumeratorArgs {
|
32
49
|
rb_self,
|
33
50
|
to_read,
|
34
51
|
result_type,
|
35
|
-
columns,
|
52
|
+
columns: columns_clone,
|
36
53
|
strict,
|
54
|
+
logger,
|
37
55
|
})
|
38
|
-
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
56
|
+
.map(|yield_enum| yield_enum.into_value_with(&ruby))
|
57
|
+
})? {
|
58
|
+
return Ok(enum_value);
|
39
59
|
}
|
40
60
|
|
41
|
-
let
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
let readable = ThreadSafeRubyReader::new(RubyReader::try_from(to_read)?);
|
48
|
-
Box::new(SerializedFileReader::new(readable).map_err(ReaderError::from)?)
|
61
|
+
let source = open_parquet_source(to_read)?;
|
62
|
+
let reader: Box<dyn FileReader> = match source {
|
63
|
+
Either::Left(file) => Box::new(SerializedFileReader::new(file).map_err(ReaderError::from)?),
|
64
|
+
Either::Right(readable) => {
|
65
|
+
Box::new(SerializedFileReader::new(readable).map_err(ReaderError::from)?)
|
66
|
+
}
|
49
67
|
};
|
68
|
+
|
50
69
|
let schema = reader.metadata().file_metadata().schema().clone();
|
70
|
+
ruby_logger.debug(|| format!("Schema loaded: {:?}", schema))?;
|
71
|
+
|
51
72
|
let mut iter = ParquetRowIter::from_file_into(reader);
|
52
73
|
if let Some(cols) = columns {
|
74
|
+
ruby_logger.debug(|| format!("Projecting columns: {:?}", cols))?;
|
53
75
|
let projection = create_projection_schema(&schema, &cols);
|
54
76
|
iter = iter.project(Some(projection.to_owned())).map_err(|e| {
|
55
77
|
MagnusError::new(
|
@@ -81,9 +103,9 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
81
103
|
|
82
104
|
let mut map =
|
83
105
|
HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
|
84
|
-
|
106
|
+
for (i, (_, v)) in row.get_column_iter().enumerate() {
|
85
107
|
map.insert(headers[i], ParquetField(v.clone(), strict));
|
86
|
-
}
|
108
|
+
}
|
87
109
|
Ok(map)
|
88
110
|
})
|
89
111
|
.and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
|
@@ -100,8 +122,9 @@ pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, M
|
|
100
122
|
row.and_then(|row| {
|
101
123
|
let column_count = row.get_column_iter().count();
|
102
124
|
let mut vec = Vec::with_capacity(column_count);
|
103
|
-
row.get_column_iter()
|
104
|
-
|
125
|
+
for (_, v) in row.get_column_iter() {
|
126
|
+
vec.push(ParquetField(v.clone(), strict));
|
127
|
+
}
|
105
128
|
Ok(vec)
|
106
129
|
})
|
107
130
|
.and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
|