parquet 0.5.12 → 0.6.0
This diff shows the changes between publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet/src/header_cache.rs
DELETED
@@ -1,99 +0,0 @@
-/// This module exists to avoid cloning header keys in returned HashMaps.
-/// Since the underlying RString creation already involves cloning,
-/// this caching layer aims to reduce redundant allocations.
-///
-/// Note: Performance testing on macOS showed minimal speed improvements,
-/// so this optimization could be removed if any issues arise.
-use std::{
-    collections::HashMap,
-    sync::{LazyLock, Mutex},
-};
-
-use magnus::{IntoValue, RString, Ruby, Value};
-
-use thiserror::Error;
-
-#[derive(Debug, Clone, Error)]
-pub enum CacheError {
-    #[error("Failed to acquire lock: {0}")]
-    LockError(String),
-    #[error("Failed to convert Ruby String to interned string: {0}")]
-    RStringConversion(String),
-}
-
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, StringCacheKey>>> =
-    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
-
-pub struct StringCache;
-
-#[derive(Copy, Clone)]
-pub struct StringCacheKey(&'static str);
-
-impl StringCacheKey {
-    pub fn new(string: &str) -> Result<Self, CacheError> {
-        let rstr = RString::new(string);
-        let fstr = rstr.to_interned_str();
-        Ok(Self(fstr.as_str().map_err(|e| {
-            CacheError::RStringConversion(e.to_string())
-        })?))
-    }
-}
-
-impl AsRef<str> for StringCacheKey {
-    fn as_ref(&self) -> &'static str {
-        self.0
-    }
-}
-
-impl IntoValue for StringCacheKey {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        handle.into_value(self.0)
-    }
-}
-
-impl IntoValue for &StringCacheKey {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        handle.into_value(self.0)
-    }
-}
-
-impl std::fmt::Debug for StringCacheKey {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.0.fmt(f)
-    }
-}
-
-impl PartialEq for StringCacheKey {
-    fn eq(&self, other: &Self) -> bool {
-        self.0 == other.0
-    }
-}
-
-impl std::cmp::Eq for StringCacheKey {}
-
-impl std::hash::Hash for StringCacheKey {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.0.hash(state);
-    }
-}
-
-impl StringCache {
-    pub fn intern_many<AsStr: AsRef<str>>(
-        strings: &[AsStr],
-    ) -> Result<Vec<StringCacheKey>, CacheError> {
-        let cache = STRING_CACHE
-            .lock()
-            .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
-        for string in strings {
-            if let Some((_, interned_string)) = cache.get_key_value(string.as_ref()) {
-                result.push(*interned_string);
-            } else {
-                let interned = StringCacheKey::new(string.as_ref())?;
-                result.push(interned);
-            }
-        }
-        Ok(result)
-    }
-}
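
Aside: the deleted intern_many acquires the cache lock but never inserts newly created keys back into STRING_CACHE (which is private to this module), so the map never warms up; that fits the module's own note about minimal measured gains. Below is a minimal standalone sketch of the interning pattern the module was aiming for, with the write-back step included. It uses Box::leak in place of Ruby's interned strings so it compiles without magnus; every name in it is illustrative, not the gem's API.

    use std::collections::HashMap;
    use std::sync::{LazyLock, Mutex};

    static CACHE: LazyLock<Mutex<HashMap<String, &'static str>>> =
        LazyLock::new(|| Mutex::new(HashMap::new()));

    /// Returns a 'static reference for `s`, leaking each distinct string once.
    fn intern(s: &str) -> &'static str {
        let mut cache = CACHE.lock().expect("cache lock poisoned");
        if let Some(&interned) = cache.get(s) {
            return interned;
        }
        let leaked: &'static str = Box::leak(s.to_owned().into_boxed_str());
        cache.insert(s.to_owned(), leaked);
        leaked
    }

    fn main() {
        let a = intern("user_id");
        let b = intern("user_id");
        // The second call is a cache hit on the same leaked allocation.
        assert!(std::ptr::eq(a, b));
    }
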
data/ext/parquet/src/logger.rs
DELETED
@@ -1,171 +0,0 @@
-// Logger module for Parquet gem
-// Provides a Rust wrapper for Ruby logger objects
-
-use std::str::FromStr;
-
-use magnus::{exception::runtime_error, value::ReprValue, Error as MagnusError, Ruby, Value};
-
-use crate::{types::ParquetGemError, utils::parse_string_or_symbol};
-
-/// Severity levels that match Ruby's Logger levels
-#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
-pub enum LogLevel {
-    Debug,
-    Info,
-    Warn,
-    Error,
-    Fatal,
-}
-
-impl FromStr for LogLevel {
-    type Err = MagnusError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        Ok(match s {
-            "debug" => LogLevel::Debug,
-            "info" => LogLevel::Info,
-            "warn" => LogLevel::Warn,
-            "error" => LogLevel::Error,
-            "fatal" => LogLevel::Fatal,
-            _ => {
-                return Err(MagnusError::new(
-                    runtime_error(),
-                    format!("Invalid log level: {}", s),
-                ))
-            }
-        })
-    }
-}
-/// A wrapper around a Ruby logger object
-#[derive(Debug, Clone)]
-pub struct RubyLogger {
-    logger: Option<Value>,
-    level: LogLevel,
-}
-
-#[allow(dead_code)]
-impl RubyLogger {
-    pub fn new(ruby: &Ruby, logger_value: Option<Value>) -> Result<Self, ParquetGemError> {
-        let environment_level = std::env::var("PARQUET_GEM_LOG_LEVEL")
-            .unwrap_or_else(|_| "warn".to_string())
-            .parse::<LogLevel>()
-            .unwrap_or(LogLevel::Warn);
-
-        match logger_value {
-            Some(logger) => {
-                if logger.is_nil() {
-                    return Ok(Self {
-                        logger: None,
-                        level: environment_level,
-                    });
-                }
-
-                let level_value = logger.funcall::<_, _, Value>("level", ())?;
-                let level = parse_string_or_symbol(ruby, level_value)?;
-                let level = level
-                    .map(|s| s.parse::<LogLevel>())
-                    .transpose()?
-                    .unwrap_or(environment_level);
-
-                Ok(Self {
-                    logger: Some(logger),
-                    level,
-                })
-            }
-            None => Ok(Self {
-                logger: None,
-                level: environment_level,
-            }),
-        }
-    }
-
-    /// Log a message at the given level
-    pub fn log(&self, level: LogLevel, message: &str) -> Result<(), MagnusError> {
-        let method = match level {
-            LogLevel::Debug => "debug",
-            LogLevel::Info => "info",
-            LogLevel::Warn => "warn",
-            LogLevel::Error => "error",
-            LogLevel::Fatal => "fatal",
-        };
-
-        match self.logger {
-            Some(logger) => {
-                logger.funcall::<_, _, Value>(method, (message,))?;
-            }
-            None => eprintln!("{}", message),
-        }
-
-        Ok(())
-    }
-
-    /// Log a debug message
-    pub fn debug<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-    where
-        F: FnOnce() -> S,
-        S: AsRef<str>,
-    {
-        if self.level <= LogLevel::Debug {
-            let message = message_fn();
-            self.log(LogLevel::Debug, message.as_ref())
-        } else {
-            Ok(())
-        }
-    }
-
-    /// Log an info message
-    pub fn info<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-    where
-        F: FnOnce() -> S,
-        S: AsRef<str>,
-    {
-        if self.level <= LogLevel::Info {
-            let message = message_fn();
-            self.log(LogLevel::Info, message.as_ref())
-        } else {
-            Ok(())
-        }
-    }
-
-    /// Log a warning message
-    pub fn warn<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-    where
-        F: FnOnce() -> S,
-        S: AsRef<str>,
-    {
-        if self.level <= LogLevel::Warn {
-            let message = message_fn();
-            self.log(LogLevel::Warn, message.as_ref())
-        } else {
-            Ok(())
-        }
-    }
-
-    /// Log an error message
-    pub fn error<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-    where
-        F: FnOnce() -> S,
-        S: AsRef<str>,
-    {
-        if self.level <= LogLevel::Error {
-            let message = message_fn();
-            self.log(LogLevel::Error, message.as_ref())
-        } else {
-            Ok(())
-        }
-    }
-
-    /// Log a fatal message
-    pub fn fatal<F, S>(&self, message_fn: F) -> Result<(), MagnusError>
-    where
-        F: FnOnce() -> S,
-        S: AsRef<str>,
-    {
-        if self.level <= LogLevel::Fatal {
-            let message = message_fn();
-            self.log(LogLevel::Fatal, message.as_ref())
-        } else {
-            Ok(())
-        }
-    }
-}
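
The five level methods above share one shape: each takes a closure rather than a pre-built string, so message formatting is skipped entirely when the level filters it out. A minimal sketch of that lazy-message pattern follows, with magnus and the Ruby logger stripped out; the Logger and Level names here are illustrative, not the gem's API.

    // Levels compare by declaration order, as in the deleted LogLevel enum.
    #[derive(Clone, Copy, PartialEq, PartialOrd)]
    enum Level { Debug, Info, Warn }

    struct Logger { level: Level }

    impl Logger {
        fn debug<F, S>(&self, message_fn: F)
        where
            F: FnOnce() -> S,
            S: AsRef<str>,
        {
            // The closure only runs when Debug is enabled.
            if self.level <= Level::Debug {
                eprintln!("{}", message_fn().as_ref());
            }
        }
    }

    fn main() {
        let log = Logger { level: Level::Warn };
        // This format! never executes: Warn filters Debug out.
        log.debug(|| format!("expensive dump: {:?}", (0..1000).collect::<Vec<_>>()));
    }
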
data/ext/parquet/src/reader/common.rs
DELETED
@@ -1,111 +0,0 @@
-use ahash::RandomState;
-use arrow_schema::Schema;
-use either::Either;
-use parquet::arrow::arrow_reader::{ParquetRecordBatchReader, ParquetRecordBatchReaderBuilder};
-use parquet::arrow::ProjectionMask;
-use std::collections::HashMap;
-use std::fs::File;
-use std::rc::Rc;
-use std::sync::Arc;
-
-use magnus::value::ReprValue;
-use magnus::{Error as MagnusError, Ruby, Value};
-
-use crate::header_cache::StringCache;
-use crate::ruby_reader::{RubyReader, ThreadSafeRubyReader};
-use crate::types::{ParquetGemError, TryIntoValue};
-use crate::ColumnRecord;
-
-/// Opens a parquet file or IO-like object for reading
-///
-/// This function handles both file paths (as strings) and IO-like objects,
-/// returning either a File or a ThreadSafeRubyReader that can be used with
-/// parquet readers.
-pub fn open_parquet_source(
-    ruby: Rc<Ruby>,
-    to_read: Value,
-) -> Result<Either<File, ThreadSafeRubyReader>, ParquetGemError> {
-    if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(ParquetGemError::from)?;
-        Ok(Either::Left(file))
-    } else {
-        let readable = ThreadSafeRubyReader::new(RubyReader::new(ruby, to_read)?);
-        Ok(Either::Right(readable))
-    }
-}
-
-/// Helper function to check if a block is given and create an appropriate enumerator
-/// if not
-pub fn handle_block_or_enum<F, T>(
-    _ruby: &magnus::Ruby,
-    block_given: bool,
-    create_enum: F,
-) -> Result<Option<T>, MagnusError>
-where
-    F: FnOnce() -> Result<T, MagnusError>,
-{
-    if !block_given {
-        let enum_value = create_enum()?;
-        return Ok(Some(enum_value));
-    }
-    Ok(None)
-}
-
-/// Creates a ParquetRecordBatchReader with the given columns and batch size configurations
-pub fn create_batch_reader<T: parquet::file::reader::ChunkReader + 'static>(
-    reader: T,
-    columns: &Option<Vec<String>>,
-    batch_size: Option<usize>,
-) -> Result<(ParquetRecordBatchReader, std::sync::Arc<Schema>, i64), ParquetGemError> {
-    let mut builder =
-        ParquetRecordBatchReaderBuilder::try_new(reader).map_err(ParquetGemError::Parquet)?;
-
-    let schema = builder.schema().clone();
-    let num_rows = builder.metadata().file_metadata().num_rows();
-
-    // If columns are specified, project only those columns
-    if let Some(cols) = columns {
-        // Get the parquet schema
-        let parquet_schema = builder.parquet_schema();
-
-        // Create a projection mask from column names
-        let projection = ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-        builder = builder.with_projection(projection);
-    }
-
-    if let Some(batch_size) = batch_size {
-        builder = builder.with_batch_size(batch_size);
-    }
-
-    let reader = builder.build().map_err(ParquetGemError::Parquet)?;
-    Ok((reader, schema, num_rows))
-}
-
-/// Handles the case of an empty parquet file (no rows) by yielding a record with empty arrays
-/// Returns true if the file was empty and was handled, false otherwise
-pub fn handle_empty_file(
-    ruby: &magnus::Ruby,
-    schema: &Arc<Schema>,
-    num_rows: i64,
-) -> Result<bool, ParquetGemError> {
-    if num_rows == 0 {
-        let mut map =
-            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        let headers: Vec<String> = schema
-            .fields()
-            .iter()
-            .map(|field| field.name().to_string())
-            .collect();
-        let interned_headers =
-            StringCache::intern_many(&headers).map_err(ParquetGemError::HeaderIntern)?;
-        for field in interned_headers.iter() {
-            map.insert(*field, vec![]);
-        }
-        let record = ColumnRecord::Map(map);
-        let _: Value = ruby.yield_value(record.try_into_value_with(ruby)?)?;
-        return Ok(true);
-    }
-    Ok(false)
-}
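
create_batch_reader above is a thin layer over the parquet crate's own builder. For reference, here is a sketch of the same projection and batch-size flow run directly against a std::fs::File, with the Ruby plumbing removed. It assumes the parquet crate with its arrow feature enabled; the file path and column names are placeholders.

    use std::fs::File;
    use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
    use parquet::arrow::ProjectionMask;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open("data.parquet")?; // placeholder path
        let mut builder = ParquetRecordBatchReaderBuilder::try_new(file)?;

        // Project a subset of columns by name, as the deleted helper did.
        let mask = ProjectionMask::columns(builder.parquet_schema(), ["id", "name"]);
        builder = builder.with_projection(mask).with_batch_size(1024);

        // The built reader iterates Arrow RecordBatches.
        for batch in builder.build()? {
            println!("rows in batch: {}", batch?.num_rows());
        }
        Ok(())
    }
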
data/ext/parquet/src/reader/mod.rs
DELETED
@@ -1,211 +0,0 @@
-mod common;
-mod parquet_column_reader;
-mod parquet_row_reader;
-mod unified;
-use std::{fs::File, rc::Rc};
-
-use magnus::{value::ReprValue, Error as MagnusError, Ruby, Value};
-use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
-pub use parquet_column_reader::parse_parquet_columns;
-pub use parquet_row_reader::parse_parquet_rows;
-
-use crate::{
-    ruby_reader::{RubyReader, ThreadSafeRubyReader},
-    types::{ParquetGemError, TryIntoValue},
-};
-
-struct RubyParquetMetaData(ParquetMetaData);
-
-impl TryIntoValue for RubyParquetMetaData {
-    fn try_into_value_with(self, handle: &Ruby) -> Result<Value, ParquetGemError> {
-        let metadata = self.0;
-        let file_metadata = metadata.file_metadata();
-        let row_groups = metadata.row_groups();
-
-        // Construct a hash with the metadata
-        let hash = handle.hash_new();
-        hash.aset("num_rows", file_metadata.num_rows())?;
-        hash.aset("created_by", file_metadata.created_by())?;
-        // Convert key_value_metadata to a Ruby array if it exists
-        if let Some(key_value_metadata) = file_metadata.key_value_metadata() {
-            let kv_array = handle.ary_new();
-            for kv in key_value_metadata {
-                let kv_hash = handle.hash_new();
-                kv_hash.aset("key", kv.key.clone())?;
-                kv_hash.aset("value", kv.value.clone())?;
-                kv_array.push(kv_hash)?;
-            }
-            hash.aset("key_value_metadata", kv_array)?;
-        } else {
-            hash.aset("key_value_metadata", None::<Value>)?;
-        }
-
-        // Convert schema to a Ruby hash since &Type doesn't implement IntoValue
-        let schema_hash = handle.hash_new();
-        let schema = file_metadata.schema();
-        schema_hash.aset("name", schema.name())?;
-        // Add schema fields information
-        let fields_array = handle.ary_new();
-        for field in schema.get_fields() {
-            let field_hash = handle.hash_new();
-            field_hash.aset("name", field.name())?;
-
-            // Handle different field types
-            match field.as_ref() {
-                parquet::schema::types::Type::PrimitiveType {
-                    physical_type,
-                    type_length,
-                    scale,
-                    precision,
-                    ..
-                } => {
-                    field_hash.aset("type", "primitive")?;
-                    field_hash.aset("physical_type", format!("{:?}", physical_type))?;
-                    field_hash.aset("type_length", *type_length)?;
-                    field_hash.aset("scale", *scale)?;
-                    field_hash.aset("precision", *precision)?;
-                }
-                parquet::schema::types::Type::GroupType { .. } => {
-                    field_hash.aset("type", "group")?;
-                }
-            }
-
-            // Add basic info
-            let basic_info = field.get_basic_info();
-            field_hash.aset("repetition", format!("{:?}", basic_info.repetition()))?;
-            field_hash.aset(
-                "converted_type",
-                format!("{:?}", basic_info.converted_type()),
-            )?;
-            if let Some(logical_type) = basic_info.logical_type() {
-                field_hash.aset("logical_type", format!("{:?}", logical_type))?;
-            }
-
-            fields_array.push(field_hash)?;
-        }
-        schema_hash.aset("fields", fields_array)?;
-
-        hash.aset("schema", schema_hash)?;
-
-        // Convert row_groups to a Ruby array since &[RowGroupMetaData] doesn't implement IntoValue
-        let row_groups_array = handle.ary_new();
-        for row_group in row_groups.iter() {
-            let rg_hash = handle.hash_new();
-            rg_hash.aset("num_columns", row_group.num_columns())?;
-            rg_hash.aset("num_rows", row_group.num_rows())?;
-            rg_hash.aset("total_byte_size", row_group.total_byte_size())?;
-            rg_hash.aset("file_offset", row_group.file_offset())?;
-            rg_hash.aset("ordinal", row_group.ordinal())?;
-            rg_hash.aset("compressed_size", row_group.compressed_size())?;
-
-            // Add column chunks metadata
-            let columns_array = handle.ary_new();
-            for col_idx in 0..row_group.num_columns() {
-                let column = row_group.column(col_idx);
-                let col_hash = handle.hash_new();
-
-                col_hash.aset("column_path", column.column_path().string())?;
-                col_hash.aset("file_path", column.file_path())?;
-                col_hash.aset("file_offset", column.file_offset())?;
-                col_hash.aset("num_values", column.num_values())?;
-                col_hash.aset("compression", format!("{:?}", column.compression()))?;
-                col_hash.aset("total_compressed_size", column.compressed_size())?;
-                col_hash.aset("total_uncompressed_size", column.uncompressed_size())?;
-                col_hash.aset("data_page_offset", column.data_page_offset())?;
-
-                if let Some(offset) = column.dictionary_page_offset() {
-                    col_hash.aset("dictionary_page_offset", offset)?;
-                }
-
-                if let Some(offset) = column.bloom_filter_offset() {
-                    col_hash.aset("bloom_filter_offset", offset)?;
-                }
-
-                if let Some(length) = column.bloom_filter_length() {
-                    col_hash.aset("bloom_filter_length", length)?;
-                }
-
-                if let Some(offset) = column.offset_index_offset() {
-                    col_hash.aset("offset_index_offset", offset)?;
-                }
-
-                if let Some(length) = column.offset_index_length() {
-                    col_hash.aset("offset_index_length", length)?;
-                }
-
-                if let Some(offset) = column.column_index_offset() {
-                    col_hash.aset("column_index_offset", offset)?;
-                }
-
-                if let Some(length) = column.column_index_length() {
-                    col_hash.aset("column_index_length", length)?;
-                }
-
-                // Add encodings
-                let encodings_array = handle.ary_new();
-                for encoding in column.encodings() {
-                    encodings_array.push(format!("{:?}", encoding))?;
-                }
-                col_hash.aset("encodings", encodings_array)?;
-
-                // Add statistics if available
-                if let Some(stats) = column.statistics() {
-                    let stats_hash = handle.hash_new();
-                    stats_hash.aset("min_is_exact", stats.min_is_exact())?;
-                    stats_hash.aset("max_is_exact", stats.max_is_exact())?;
-
-                    col_hash.aset("statistics", stats_hash)?;
-                }
-
-                // Add page encoding stats if available
-                if let Some(page_encoding_stats) = column.page_encoding_stats() {
-                    let page_stats_array = handle.ary_new();
-                    for stat in page_encoding_stats {
-                        let stat_hash = handle.hash_new();
-                        stat_hash.aset("page_type", format!("{:?}", stat.page_type))?;
-                        stat_hash.aset("encoding", format!("{:?}", stat.encoding))?;
-                        stat_hash.aset("count", stat.count)?;
-                        page_stats_array.push(stat_hash)?;
-                    }
-                    col_hash.aset("page_encoding_stats", page_stats_array)?;
-                }
-
-                columns_array.push(col_hash)?;
-            }
-            rg_hash.aset("columns", columns_array)?;
-
-            row_groups_array.push(rg_hash)?;
-        }
-        hash.aset("row_groups", row_groups_array)?;
-
-        Ok(handle.into_value(hash))
-    }
-}
-
-pub fn parse_metadata(_rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-
-    if args.len() != 1 {
-        return Err(MagnusError::new(
-            magnus::exception::arg_error(),
-            format!("metadata expects exactly 1 argument (file path or IO-like object), got {}", args.len()),
-        ));
-    }
-
-    let ruby = Rc::new(ruby);
-    let arg = args[0];
-
-    let mut reader = ParquetMetaDataReader::new();
-    if arg.is_kind_of(ruby.class_string()) {
-        let path = arg.to_r_string()?.to_string()?;
-        let file = File::open(path).map_err(ParquetGemError::FileOpen)?;
-        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
-    } else {
-        let file = ThreadSafeRubyReader::new(RubyReader::new(ruby.clone(), arg)?);
-        reader.try_parse(&file).map_err(ParquetGemError::Parquet)?;
-    }
-
-    let metadata = reader.finish().map_err(ParquetGemError::Parquet)?;
-
-    Ok(RubyParquetMetaData(metadata).try_into_value_with(&ruby)?)
-}
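
The parse_metadata path above boils down to the parquet crate's ParquetMetaDataReader; the Ruby-specific work is converting the resulting ParquetMetaData into nested hashes. A sketch of the metadata read alone, assuming a parquet crate version that exposes parquet::file::metadata::ParquetMetaDataReader (as the deleted code does); the file path is a placeholder.

    use std::fs::File;
    use parquet::file::metadata::ParquetMetaDataReader;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        let file = File::open("data.parquet")?; // placeholder path
        let mut reader = ParquetMetaDataReader::new();
        reader.try_parse(&file)?; // File implements ChunkReader
        let metadata = reader.finish()?;

        let fm = metadata.file_metadata();
        println!("num_rows: {}", fm.num_rows());
        println!("created_by: {:?}", fm.created_by());
        println!("row groups: {}", metadata.row_groups().len());
        Ok(())
    }
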
data/ext/parquet/src/reader/parquet_column_reader.rs
DELETED
@@ -1,44 +0,0 @@
-use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
-use crate::utils::*;
-use crate::ParquetGemError;
-
-use magnus::{Error as MagnusError, Ruby, Value};
-use std::rc::Rc;
-
-#[inline]
-pub fn parse_parquet_columns(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-    parse_parquet_columns_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
-        let z: MagnusError = e.into();
-        z
-    })
-}
-
-#[inline]
-fn parse_parquet_columns_impl(
-    ruby: Rc<Ruby>,
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Value, ParquetGemError> {
-    let ParquetColumnsArgs {
-        to_read,
-        result_type,
-        columns,
-        batch_size,
-        strict,
-        logger,
-    } = parse_parquet_columns_args(&ruby, args)?;
-
-    // Use the unified parsing implementation
-    parse_parquet_unified(
-        ruby,
-        rb_self,
-        UnifiedParserArgs {
-            to_read,
-            result_type,
-            columns,
-            parser_type: ParserType::Column { batch_size, strict },
-            logger,
-        },
-    )
-}
data/ext/parquet/src/reader/parquet_row_reader.rs
DELETED
@@ -1,43 +0,0 @@
-use crate::reader::unified::{parse_parquet_unified, ParserType, UnifiedParserArgs};
-use crate::utils::*;
-use crate::ParquetGemError;
-
-use magnus::{Error as MagnusError, Ruby, Value};
-use std::rc::Rc;
-
-#[inline]
-pub fn parse_parquet_rows(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-    parse_parquet_rows_impl(Rc::new(ruby), rb_self, args).map_err(|e| {
-        let z: MagnusError = e.into();
-        z
-    })
-}
-
-#[inline]
-fn parse_parquet_rows_impl(
-    ruby: Rc<Ruby>,
-    rb_self: Value,
-    args: &[Value],
-) -> Result<Value, ParquetGemError> {
-    let ParquetRowsArgs {
-        to_read,
-        result_type,
-        columns,
-        strict,
-        logger,
-    } = parse_parquet_rows_args(&ruby, args)?;
-
-    // Use the unified parsing implementation
-    parse_parquet_unified(
-        ruby,
-        rb_self,
-        UnifiedParserArgs {
-            to_read,
-            result_type,
-            columns,
-            parser_type: ParserType::Row { strict },
-            logger,
-        },
-    )
-}
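
parquet_column_reader.rs and parquet_row_reader.rs above are near-identical wrappers: each public entry point packs its arguments into UnifiedParserArgs, tags them with a ParserType variant, and delegates to parse_parquet_unified. A standalone sketch of that dispatch shape, with illustrative bodies in place of the real parsing logic:

    // Mirrors the deleted ParserType enum; bodies are illustrative only.
    enum ParserType {
        Row { strict: bool },
        Column { batch_size: Option<usize>, strict: bool },
    }

    // One shared implementation, branching on the parser type.
    fn parse_unified(parser_type: ParserType) {
        match parser_type {
            ParserType::Row { strict } => {
                println!("row-wise parse, strict={}", strict);
            }
            ParserType::Column { batch_size, strict } => {
                println!("column-wise parse, batch_size={:?}, strict={}", batch_size, strict);
            }
        }
    }

    // Thin public entry points, as in the two deleted files.
    fn parse_rows(strict: bool) {
        parse_unified(ParserType::Row { strict });
    }

    fn parse_columns(batch_size: Option<usize>, strict: bool) {
        parse_unified(ParserType::Column { batch_size, strict });
    }

    fn main() {
        parse_rows(true);
        parse_columns(Some(1024), false);
    }
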