parquet 0.2.12-arm64-darwin → 0.3.0-arm64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +5 -3
- data/README.md +1 -1
- data/Rakefile +16 -0
- data/lib/parquet/3.2/parquet.bundle +0 -0
- data/lib/parquet/3.3/parquet.bundle +0 -0
- data/lib/parquet/3.4/parquet.bundle +0 -0
- data/lib/parquet/version.rb +1 -1
- data/lib/parquet.rb +6 -1
- metadata +14 -45
- data/Cargo.lock +0 -1449
- data/Cargo.toml +0 -3
- data/ext/parquet/Cargo.toml +0 -28
- data/ext/parquet/extconf.rb +0 -4
- data/ext/parquet/src/allocator.rs +0 -13
- data/ext/parquet/src/enumerator.rs +0 -52
- data/ext/parquet/src/header_cache.rs +0 -100
- data/ext/parquet/src/lib.rs +0 -29
- data/ext/parquet/src/reader/mod.rs +0 -44
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -214
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -157
- data/ext/parquet/src/ruby_integration.rs +0 -77
- data/ext/parquet/src/ruby_reader.rs +0 -171
- data/ext/parquet/src/types/core_types.rs +0 -75
- data/ext/parquet/src/types/mod.rs +0 -30
- data/ext/parquet/src/types/parquet_value.rs +0 -462
- data/ext/parquet/src/types/record_types.rs +0 -204
- data/ext/parquet/src/types/timestamp.rs +0 -85
- data/ext/parquet/src/types/type_conversion.rs +0 -809
- data/ext/parquet/src/types/writer_types.rs +0 -283
- data/ext/parquet/src/utils.rs +0 -148
- data/ext/parquet/src/writer/mod.rs +0 -575
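Taken together, the deletions show 0.3.0 dropping the vendored Rust sources (the Cargo manifests and everything under data/ext/parquet/src) from this platform gem, shipping only the precompiled parquet.bundle binaries for Ruby 3.2–3.4 selected at require time by data/lib/parquet.rb. As a rough sketch of the common loader pattern for such precompiled gems (not the gem's verbatim code, which this diff does not show):

    # Hypothetical loader for a precompiled native gem; the real
    # data/lib/parquet.rb may differ.
    begin
      # Prefer the binary built for the running Ruby series,
      # e.g. lib/parquet/3.3/parquet.bundle on Ruby 3.3.x.
      require_relative "parquet/#{RUBY_VERSION[/\d+\.\d+/]}/parquet"
    rescue LoadError
      # Fall back to an unversioned binary if none matches.
      require_relative "parquet/parquet"
    end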
data/Cargo.toml
DELETED
data/ext/parquet/Cargo.toml
DELETED
@@ -1,28 +0,0 @@
-[package]
-name = "parquet"
-version = "0.1.0"
-edition = "2021"
-
-[lib]
-crate-type = ["cdylib"]
-
-[dependencies]
-ahash = "0.8"
-arrow-array = "54.0.0"
-arrow-schema = "54.0.0"
-bytes = "^1.9"
-itertools = "^0.14"
-jiff = "0.1.19"
-magnus = { version = "0.7", features = ["rb-sys"] }
-parquet = { version = "^54.0", features = ["json"] }
-rand = "0.9"
-rb-sys = "^0.9"
-thiserror = "2.0"
-tempfile = "^3.15"
-simdutf8 = "0.1.5"
-
-[target.'cfg(target_os = "linux")'.dependencies]
-jemallocator = { version = "0.5", features = ["disable_initial_exec_tls"] }
-
-[target.'cfg(not(any(target_os = "linux", target_os = "windows")))'.dependencies]
-mimalloc = { version = "0.1", default-features = false }
data/ext/parquet/extconf.rb
DELETED
data/ext/parquet/src/allocator.rs
DELETED
@@ -1,13 +0,0 @@
-#[cfg(target_os = "linux")]
-use jemallocator::Jemalloc;
-
-#[cfg(not(any(target_os = "linux", target_os = "windows")))]
-use mimalloc::MiMalloc;
-
-#[global_allocator]
-#[cfg(target_os = "linux")]
-static ALLOC: Jemalloc = Jemalloc;
-
-#[global_allocator]
-#[cfg(not(any(target_os = "linux", target_os = "windows")))]
-static ALLOC: MiMalloc = MiMalloc;
data/ext/parquet/src/enumerator.rs
DELETED
@@ -1,52 +0,0 @@
-use crate::ParserResultType;
-use magnus::{value::ReprValue, Error as MagnusError, KwArgs, RArray, RHash, Symbol, Value};
-
-pub struct RowEnumeratorArgs {
-    pub rb_self: Value,
-    pub to_read: Value,
-    pub result_type: ParserResultType,
-    pub columns: Option<Vec<String>>,
-}
-
-/// Creates an enumerator for lazy Parquet row parsing
-pub fn create_row_enumerator(args: RowEnumeratorArgs) -> Result<magnus::Enumerator, MagnusError> {
-    let kwargs = RHash::new();
-    kwargs.aset(
-        Symbol::new("result_type"),
-        Symbol::new(args.result_type.to_string()),
-    )?;
-    if let Some(columns) = args.columns {
-        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
-    }
-    Ok(args
-        .rb_self
-        .enumeratorize("each_row", (args.to_read, KwArgs(kwargs))))
-}
-
-pub struct ColumnEnumeratorArgs {
-    pub rb_self: Value,
-    pub to_read: Value,
-    pub result_type: ParserResultType,
-    pub columns: Option<Vec<String>>,
-    pub batch_size: Option<usize>,
-}
-
-#[inline]
-pub fn create_column_enumerator(
-    args: ColumnEnumeratorArgs,
-) -> Result<magnus::Enumerator, MagnusError> {
-    let kwargs = RHash::new();
-    kwargs.aset(
-        Symbol::new("result_type"),
-        Symbol::new(args.result_type.to_string()),
-    )?;
-    if let Some(columns) = args.columns {
-        kwargs.aset(Symbol::new("columns"), RArray::from_vec(columns))?;
-    }
-    if let Some(batch_size) = args.batch_size {
-        kwargs.aset(Symbol::new("batch_size"), batch_size)?;
-    }
-    Ok(args
-        .rb_self
-        .enumeratorize("each_column", (args.to_read, KwArgs(kwargs))))
-}
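Both helpers implement the standard lazy-enumeration contract: when the caller supplies no block, the reader re-dispatches itself through enumeratorize with the original options packed into kwargs, deferring all parsing until the Enumerator is consumed. Hypothetical Ruby-side usage (assumes a local data.parquet file):

    # With a block: rows are parsed and yielded immediately.
    Parquet.each_row("data.parquet") { |row| p row }

    # Without a block: an Enumerator that re-enters each_row
    # with the same result_type/columns keyword arguments.
    enum = Parquet.each_row("data.parquet", result_type: :hash)
    enum.take(5)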
data/ext/parquet/src/header_cache.rs
DELETED
@@ -1,100 +0,0 @@
-/// This module exists to avoid cloning header keys in returned HashMaps.
-/// Since the underlying RString creation already involves cloning,
-/// this caching layer aims to reduce redundant allocations.
-///
-/// Note: Performance testing on macOS showed minimal speed improvements,
-/// so this optimization could be removed if any issues arise.
-use std::{
-    collections::HashMap,
-    sync::{
-        atomic::{AtomicU32, Ordering},
-        LazyLock, Mutex,
-    },
-};
-
-use magnus::{IntoValue, RString, Ruby, Value};
-
-use thiserror::Error;
-
-#[derive(Debug, Error)]
-pub enum CacheError {
-    #[error("Failed to acquire lock: {0}")]
-    LockError(String),
-}
-
-static STRING_CACHE: LazyLock<Mutex<HashMap<&'static str, (StringCacheKey, AtomicU32)>>> =
-    LazyLock::new(|| Mutex::new(HashMap::with_capacity(100)));
-
-pub struct StringCache;
-
-#[derive(Copy, Clone)]
-pub struct StringCacheKey(&'static str);
-
-impl StringCacheKey {
-    pub fn new(string: &str) -> Self {
-        let rstr = RString::new(string);
-        let fstr = rstr.to_interned_str();
-        Self(fstr.as_str().unwrap())
-    }
-}
-
-impl AsRef<str> for StringCacheKey {
-    fn as_ref(&self) -> &'static str {
-        self.0
-    }
-}
-
-impl IntoValue for StringCacheKey {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        handle.into_value(self.0)
-    }
-}
-
-impl IntoValue for &StringCacheKey {
-    fn into_value_with(self, handle: &Ruby) -> Value {
-        handle.into_value(self.0)
-    }
-}
-
-impl std::fmt::Debug for StringCacheKey {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        self.0.fmt(f)
-    }
-}
-
-impl PartialEq for StringCacheKey {
-    fn eq(&self, other: &Self) -> bool {
-        self.0 == other.0
-    }
-}
-
-impl std::cmp::Eq for StringCacheKey {}
-
-impl std::hash::Hash for StringCacheKey {
-    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
-        self.0.hash(state);
-    }
-}
-
-impl StringCache {
-    pub fn intern_many<AsStr: AsRef<str>>(
-        strings: &[AsStr],
-    ) -> Result<Vec<StringCacheKey>, CacheError> {
-        let mut cache = STRING_CACHE
-            .lock()
-            .map_err(|e| CacheError::LockError(e.to_string()))?;
-
-        let mut result: Vec<StringCacheKey> = Vec::with_capacity(strings.len());
-        for string in strings {
-            if let Some((_, (interned_string, counter))) = cache.get_key_value(string.as_ref()) {
-                counter.fetch_add(1, Ordering::Relaxed);
-                result.push(*interned_string);
-            } else {
-                let interned = StringCacheKey::new(string.as_ref());
-                cache.insert(interned.0, (interned, AtomicU32::new(1)));
-                result.push(interned);
-            }
-        }
-        Ok(result)
-    }
-}
data/ext/parquet/src/lib.rs
DELETED
@@ -1,29 +0,0 @@
-mod allocator;
-mod enumerator;
-pub mod header_cache;
-mod reader;
-mod ruby_integration;
-mod ruby_reader;
-mod types;
-mod utils;
-mod writer;
-
-use crate::enumerator::*;
-use crate::reader::*;
-use crate::ruby_integration::*;
-use crate::types::*;
-
-use magnus::{Error, Ruby};
-use writer::write_columns;
-use writer::write_rows;
-
-/// Initializes the Ruby extension and defines methods.
-#[magnus::init]
-fn init(ruby: &Ruby) -> Result<(), Error> {
-    let module = ruby.define_module("Parquet")?;
-    module.define_module_function("each_row", magnus::method!(parse_parquet_rows, -1))?;
-    module.define_module_function("each_column", magnus::method!(parse_parquet_columns, -1))?;
-    module.define_module_function("write_rows", magnus::function!(write_rows, -1))?;
-    module.define_module_function("write_columns", magnus::function!(write_columns, -1))?;
-    Ok(())
-}
data/ext/parquet/src/reader/mod.rs
DELETED
@@ -1,44 +0,0 @@
-mod parquet_column_reader;
-mod parquet_row_reader;
-
-use std::io;
-
-use magnus::{Error as MagnusError, Ruby};
-use thiserror::Error;
-
-use crate::header_cache::CacheError;
-pub use parquet_column_reader::parse_parquet_columns;
-pub use parquet_row_reader::parse_parquet_rows;
-
-#[derive(Error, Debug)]
-pub enum ReaderError {
-    #[error("Failed to get file descriptor: {0}")]
-    FileDescriptor(String),
-    #[error("Invalid file descriptor")]
-    InvalidFileDescriptor,
-    #[error("Failed to open file: {0}")]
-    FileOpen(#[from] io::Error),
-    #[error("Failed to intern headers: {0}")]
-    HeaderIntern(#[from] CacheError),
-    #[error("Ruby error: {0}")]
-    Ruby(String),
-    #[error("Parquet error: {0}")]
-    Parquet(#[from] parquet::errors::ParquetError),
-    #[error("Arrow error: {0}")]
-    Arrow(#[from] arrow_schema::ArrowError),
-}
-
-impl From<MagnusError> for ReaderError {
-    fn from(err: MagnusError) -> Self {
-        Self::Ruby(err.to_string())
-    }
-}
-
-impl From<ReaderError> for MagnusError {
-    fn from(err: ReaderError) -> Self {
-        MagnusError::new(
-            Ruby::get().unwrap().exception_runtime_error(),
-            err.to_string(),
-        )
-    }
-}
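ReaderError funnels every failure source (IO, header interning, Ruby callbacks, Parquet decoding, Arrow conversion) into one enum, and the From<ReaderError> for MagnusError impl re-raises all of them as a Ruby RuntimeError carrying the thiserror-formatted message. A hedged sketch of what a caller would observe:

    begin
      Parquet.each_row("missing.parquet") { |row| p row }
    rescue RuntimeError => e
      # Message comes from the enum variant, e.g.
      # "Failed to open file: No such file or directory (os error 2)"
      warn e.message
    end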
data/ext/parquet/src/reader/parquet_column_reader.rs
DELETED
@@ -1,214 +0,0 @@
-use crate::header_cache::StringCache;
-use crate::{
-    create_column_enumerator, utils::*, ColumnEnumeratorArgs, ColumnRecord, ForgottenFileHandle,
-    ParquetValueVec, ParserResultType, SeekableRubyValue,
-};
-use ahash::RandomState;
-use magnus::rb_sys::AsRawValue;
-use magnus::value::{Opaque, ReprValue};
-use magnus::IntoValue;
-use magnus::{Error as MagnusError, Ruby, Value};
-use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
-use parquet::arrow::ProjectionMask;
-use std::collections::HashMap;
-use std::fs::File;
-use std::mem::ManuallyDrop;
-use std::os::fd::FromRawFd;
-use std::sync::OnceLock;
-
-use super::ReaderError;
-
-#[inline]
-pub fn parse_parquet_columns<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-
-    let ParquetColumnsArgs {
-        to_read,
-        result_type,
-        columns,
-        batch_size,
-    } = parse_parquet_columns_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_column_enumerator(ColumnEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-            batch_size,
-        })
-        .map(|yield_enum| yield_enum.into_value_with(&ruby));
-    }
-
-    let (batch_reader, schema, num_rows) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).map_err(|e| ReaderError::FileOpen(e))?;
-
-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
-
-        (reader, schema, num_rows)
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-
-        let mut builder =
-            ParquetRecordBatchReaderBuilder::try_new(file).map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
-
-        (reader, schema, num_rows)
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-
-        let mut builder = ParquetRecordBatchReaderBuilder::try_new(readable)
-            .map_err(|e| ReaderError::Parquet(e))?;
-        let schema = builder.schema().clone();
-        let num_rows = builder.metadata().file_metadata().num_rows();
-
-        // If columns are specified, project only those columns
-        if let Some(cols) = &columns {
-            // Get the parquet schema
-            let parquet_schema = builder.parquet_schema();
-
-            // Create a projection mask from column names
-            let projection =
-                ProjectionMask::columns(parquet_schema, cols.iter().map(|s| s.as_str()));
-
-            builder = builder.with_projection(projection);
-        }
-
-        if let Some(batch_size) = batch_size {
-            builder = builder.with_batch_size(batch_size);
-        }
-
-        let reader = builder.build().map_err(|e| ReaderError::Parquet(e))?;
-
-        (reader, schema, num_rows)
-    };
-
-    if num_rows == 0 {
-        let mut map =
-            HashMap::with_capacity_and_hasher(schema.fields().len(), RandomState::default());
-        let headers: Vec<String> = schema
-            .fields()
-            .iter()
-            .map(|field| field.name().to_string())
-            .collect();
-        let interned_headers =
-            StringCache::intern_many(&headers).map_err(|e| ReaderError::HeaderIntern(e))?;
-        for field in interned_headers.iter() {
-            map.insert(*field, vec![]);
-        }
-        let record = ColumnRecord::Map(map);
-        let _: Value = ruby.yield_value(record)?;
-        return Ok(ruby.qnil().into_value_with(&ruby));
-    }
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = batch_reader.map(move |batch| {
-                batch.map_err(ReaderError::Arrow).and_then(|batch| {
-                    let headers = headers_clone.get_or_init(|| {
-                        let schema = batch.schema();
-                        let fields = schema.fields();
-                        let mut header_string = Vec::with_capacity(fields.len());
-                        for field in fields {
-                            header_string.push(field.name().to_owned());
-                        }
-                        StringCache::intern_many(&header_string).unwrap()
-                    });
-
-                    let mut map =
-                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-
-                    batch.columns().iter().enumerate().for_each(|(i, column)| {
-                        let header = headers[i];
-                        let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                        map.insert(header, values.into_inner());
-                    });
-
-                    Ok(ColumnRecord::Map::<RandomState>(map))
-                })
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
-            }
-        }
-        ParserResultType::Array => {
-            let iter = batch_reader.map(|batch| {
-                batch.map_err(ReaderError::Arrow).and_then(|batch| {
-                    let vec = batch
-                        .columns()
-                        .into_iter()
-                        .map(|column| {
-                            let values = ParquetValueVec::try_from(column.clone()).unwrap();
-                            values.into_inner()
-                        })
-                        .collect();
-                    Ok(ColumnRecord::Vec::<RandomState>(vec))
-                })
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
-            }
-        }
-    }
-
-    Ok(ruby.qnil().into_value_with(&ruby))
-}
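The column reader accepts three input shapes (a path String, an IO with a real file descriptor, or any seekable Ruby object) and configures the Arrow builder identically for each: an optional ProjectionMask limits decoding to the requested columns, and with_batch_size caps the rows per yielded chunk. Hypothetical usage exercising those options:

    # Decode only two columns, 1024 rows per yielded batch.
    Parquet.each_column("data.parquet",
                        columns: ["id", "name"],
                        batch_size: 1024,
                        result_type: :hash) do |batch|
      batch.each { |name, values| puts "#{name}: #{values.length} values" }
    end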
data/ext/parquet/src/reader/parquet_row_reader.rs
DELETED
@@ -1,157 +0,0 @@
-use crate::header_cache::StringCache;
-use crate::{
-    create_row_enumerator, utils::*, ForgottenFileHandle, ParquetField, ParserResultType,
-    ReaderError, RowEnumeratorArgs, RowRecord, SeekableRubyValue,
-};
-use ahash::RandomState;
-use magnus::rb_sys::AsRawValue;
-use magnus::value::{Opaque, ReprValue};
-use magnus::IntoValue;
-use magnus::{Error as MagnusError, Ruby, Value};
-use parquet::file::reader::{FileReader, SerializedFileReader};
-use parquet::record::reader::RowIter as ParquetRowIter;
-use parquet::schema::types::{Type as SchemaType, TypePtr};
-use std::collections::HashMap;
-use std::fs::File;
-use std::mem::ManuallyDrop;
-use std::os::fd::FromRawFd;
-use std::sync::OnceLock;
-
-#[inline]
-pub fn parse_parquet_rows<'a>(rb_self: Value, args: &[Value]) -> Result<Value, MagnusError> {
-    let ruby = unsafe { Ruby::get_unchecked() };
-
-    let ParquetRowsArgs {
-        to_read,
-        result_type,
-        columns,
-    } = parse_parquet_rows_args(&ruby, args)?;
-
-    if !ruby.block_given() {
-        return create_row_enumerator(RowEnumeratorArgs {
-            rb_self,
-            to_read,
-            result_type,
-            columns,
-        })
-        .map(|yield_enum| yield_enum.into_value_with(&ruby));
-    }
-
-    let (schema, mut iter) = if to_read.is_kind_of(ruby.class_string()) {
-        let path_string = to_read.to_r_string()?;
-        let file_path = unsafe { path_string.as_str()? };
-        let file = File::open(file_path).unwrap();
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else if to_read.is_kind_of(ruby.class_io()) {
-        let raw_value = to_read.as_raw();
-        let fd = std::panic::catch_unwind(|| unsafe { rb_sys::rb_io_descriptor(raw_value) })
-            .map_err(|_| {
-                ReaderError::FileDescriptor("Failed to get file descriptor".to_string())
-            })?;
-
-        if fd < 0 {
-            return Err(ReaderError::InvalidFileDescriptor.into());
-        }
-
-        let file = unsafe { File::from_raw_fd(fd) };
-        let file = ForgottenFileHandle(ManuallyDrop::new(file));
-        let reader = SerializedFileReader::new(file).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    } else {
-        let readable = SeekableRubyValue(Opaque::from(to_read));
-        let reader = SerializedFileReader::new(readable).unwrap();
-        let schema = reader.metadata().file_metadata().schema().clone();
-
-        (schema, ParquetRowIter::from_file_into(Box::new(reader)))
-    };
-
-    if let Some(cols) = columns {
-        let projection = create_projection_schema(&schema, &cols);
-        iter = iter.project(Some(projection.to_owned())).map_err(|e| {
-            MagnusError::new(
-                ruby.exception_runtime_error(),
-                format!("Failed to create projection: {}", e),
-            )
-        })?;
-    }
-
-    match result_type {
-        ParserResultType::Hash => {
-            let headers = OnceLock::new();
-            let headers_clone = headers.clone();
-            let iter = iter.map(move |row| {
-                row.and_then(|row| {
-                    let headers = headers_clone.get_or_init(|| {
-                        let column_count = row.get_column_iter().count();
-
-                        let mut header_string = Vec::with_capacity(column_count);
-                        for (k, _) in row.get_column_iter() {
-                            header_string.push(k.to_owned());
-                        }
-
-                        let headers = StringCache::intern_many(&header_string).unwrap();
-
-                        headers
-                    });
-
-                    let mut map =
-                        HashMap::with_capacity_and_hasher(headers.len(), RandomState::default());
-                    row.get_column_iter().enumerate().for_each(|(i, (_, v))| {
-                        map.insert(headers[i], ParquetField(v.clone()));
-                    });
-                    Ok(map)
-                })
-                .and_then(|row| Ok(RowRecord::Map::<RandomState>(row)))
-                .map_err(|e| ReaderError::Parquet(e))
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
-            }
-        }
-        ParserResultType::Array => {
-            let iter = iter.map(|row| {
-                row.and_then(|row| {
-                    let column_count = row.get_column_iter().count();
-                    let mut vec = Vec::with_capacity(column_count);
-                    row.get_column_iter()
-                        .for_each(|(_, v)| vec.push(ParquetField(v.clone())));
-                    Ok(vec)
-                })
-                .and_then(|row| Ok(RowRecord::Vec::<RandomState>(row)))
-                .map_err(|e| ReaderError::Parquet(e))
-            });
-
-            for result in iter {
-                let record = result?;
-                let _: Value = ruby.yield_value(record)?;
-            }
-        }
-    }
-
-    Ok(ruby.qnil().into_value_with(&ruby))
-}
-
-fn create_projection_schema(schema: &SchemaType, columns: &[String]) -> SchemaType {
-    if let SchemaType::GroupType { fields, .. } = schema {
-        let projected_fields: Vec<TypePtr> = fields
-            .iter()
-            .filter(|field| columns.contains(&field.name().to_string()))
-            .cloned()
-            .collect();
-
-        SchemaType::GroupType {
-            basic_info: schema.get_basic_info().clone(),
-            fields: projected_fields,
-        }
-    } else {
-        // Return original schema if not a group type
-        schema.clone()
-    }
-}
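As in the column path, result_type selects the shape of each yielded row: :hash produces a map keyed by the interned header strings, :array produces positional values. A final hedged sketch of the two shapes:

    Parquet.each_row("data.parquet", result_type: :hash) do |row|
      row["id"]  # access by column name
    end

    Parquet.each_row("data.parquet", result_type: :array) do |row|
      row[0]     # access by column position
    end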
|