parquet 0.5.12 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Cargo.lock +295 -98
- data/Cargo.toml +1 -1
- data/Gemfile +1 -0
- data/README.md +94 -3
- data/ext/parquet/Cargo.toml +8 -5
- data/ext/parquet/src/adapter_ffi.rs +156 -0
- data/ext/parquet/src/lib.rs +13 -21
- data/ext/parquet-core/Cargo.toml +23 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
- data/ext/parquet-core/src/error.rs +163 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +263 -0
- data/ext/parquet-core/src/schema.rs +283 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +151 -0
- data/ext/parquet-core/src/value.rs +209 -0
- data/ext/parquet-core/src/writer.rs +839 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +430 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
- data/ext/parquet-ruby-adapter/src/error.rs +148 -0
- data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
- data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +94 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
- data/lib/parquet/schema.rb +19 -0
- data/lib/parquet/version.rb +1 -1
- metadata +50 -24
- data/ext/parquet/src/enumerator.rs +0 -68
- data/ext/parquet/src/header_cache.rs +0 -99
- data/ext/parquet/src/logger.rs +0 -171
- data/ext/parquet/src/reader/common.rs +0 -111
- data/ext/parquet/src/reader/mod.rs +0 -211
- data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
- data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
- data/ext/parquet/src/reader/unified/mod.rs +0 -363
- data/ext/parquet/src/types/core_types.rs +0 -120
- data/ext/parquet/src/types/mod.rs +0 -100
- data/ext/parquet/src/types/parquet_value.rs +0 -1275
- data/ext/parquet/src/types/record_types.rs +0 -603
- data/ext/parquet/src/types/schema_converter.rs +0 -290
- data/ext/parquet/src/types/schema_node.rs +0 -424
- data/ext/parquet/src/types/timestamp.rs +0 -285
- data/ext/parquet/src/types/type_conversion.rs +0 -1949
- data/ext/parquet/src/types/writer_types.rs +0 -329
- data/ext/parquet/src/utils.rs +0 -184
- data/ext/parquet/src/writer/mod.rs +0 -505
- data/ext/parquet/src/writer/write_columns.rs +0 -238
- data/ext/parquet/src/writer/write_rows.rs +0 -488
@@ -0,0 +1,116 @@
|
|
1
|
+
/// Default constants for batch processing.
pub const SAMPLE_SIZE: usize = 100;
pub const MIN_BATCH_SIZE: usize = 10;
pub const INITIAL_BATCH_SIZE: usize = 100;
pub const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;

/// Number of most-recent row sizes kept for short-term memory estimation.
const RECENT_WINDOW: usize = 10;

/// Manages dynamic batch sizing based on memory usage.
///
/// The manager samples row sizes as rows are recorded and derives a batch
/// size that targets ~80% of `memory_threshold`. When the caller supplies a
/// `fixed_batch_size`, dynamic adjustment is disabled entirely.
pub struct BatchSizeManager {
    /// User-requested fixed batch size; disables dynamic sizing when `Some`.
    pub fixed_batch_size: Option<usize>,
    /// Memory budget (bytes) a batch should stay under.
    pub memory_threshold: usize,
    /// Number of rows to sample before the batch size is considered stable.
    pub sample_size: usize,
    /// Row sizes collected during the initial sampling phase.
    pub row_size_samples: Vec<usize>,
    /// Current computed (or fixed) batch size, in rows.
    pub current_batch_size: usize,
    /// Total number of rows recorded so far.
    pub rows_processed: usize,
    /// Sliding window of the last few row sizes (for recent estimates).
    pub recent_row_sizes: Vec<usize>,
}

impl BatchSizeManager {
    /// Create a manager.
    ///
    /// * `fixed_batch_size` — when `Some`, the batch size never changes.
    /// * `memory_threshold` — byte budget per batch; defaults to 64 MiB.
    /// * `sample_size` — rows sampled for the average; defaults to 100.
    pub fn new(
        fixed_batch_size: Option<usize>,
        memory_threshold: Option<usize>,
        sample_size: Option<usize>,
    ) -> Self {
        Self {
            fixed_batch_size,
            memory_threshold: memory_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD),
            sample_size: sample_size.unwrap_or(SAMPLE_SIZE),
            row_size_samples: Vec::new(),
            current_batch_size: fixed_batch_size.unwrap_or(INITIAL_BATCH_SIZE),
            rows_processed: 0,
            recent_row_sizes: Vec::with_capacity(RECENT_WINDOW),
        }
    }

    /// Record the (estimated) serialized size of one row and update sizing.
    pub fn record_row_size(&mut self, size: usize) {
        self.rows_processed += 1;

        // Always track recent row sizes for current memory estimation.
        if self.recent_row_sizes.len() >= RECENT_WINDOW {
            self.recent_row_sizes.remove(0);
        }
        self.recent_row_sizes.push(size);

        // Sample for batch size calculation.
        if self.row_size_samples.len() < self.sample_size {
            self.row_size_samples.push(size);

            // Recalculate batch size after enough samples.
            if self.row_size_samples.len() >= MIN_BATCH_SIZE {
                self.recalculate_batch_size();
            }
        } else if self.rows_processed % 50 == 0 {
            // Periodically adjust based on recent data.
            self.adjust_for_recent_sizes();
        }
    }

    /// Average row size over the initial sample set (1 KiB if no samples yet).
    pub fn average_row_size(&self) -> usize {
        if self.row_size_samples.is_empty() {
            1024 // Default estimate
        } else {
            self.row_size_samples.iter().sum::<usize>() / self.row_size_samples.len()
        }
    }

    /// Average over the sliding window of recent rows, falling back to the
    /// overall sample average when no recent data exists.
    pub fn recent_average_size(&self) -> usize {
        if self.recent_row_sizes.is_empty() {
            self.average_row_size()
        } else {
            self.recent_row_sizes.iter().sum::<usize>() / self.recent_row_sizes.len()
        }
    }

    /// Re-derive the batch size when recent rows diverge strongly (>2x either
    /// way) from the sampled average.
    fn adjust_for_recent_sizes(&mut self) {
        if self.fixed_batch_size.is_some() {
            return; // User specified fixed size
        }

        let recent_avg = self.recent_average_size();
        // BUGFIX: a window of zero-sized rows made `recent_avg == 0`, which
        // caused a divide-by-zero panic in the adjustment below. A zero
        // average carries no sizing information, so keep the current size.
        if recent_avg == 0 {
            return;
        }
        let overall_avg = self.average_row_size();

        // If recent rows are significantly different from the sample average, adjust
        if recent_avg > overall_avg * 2 || recent_avg < overall_avg / 2 {
            let target_memory = (self.memory_threshold as f64 * 0.8) as usize;
            self.current_batch_size = (target_memory / recent_avg).max(MIN_BATCH_SIZE);
        }
    }

    /// Derive the batch size from the sampled average row size.
    fn recalculate_batch_size(&mut self) {
        if self.fixed_batch_size.is_some() {
            return; // User specified fixed size
        }

        let avg_size = self.average_row_size();
        if avg_size > 0 {
            // Target 80% of memory threshold to leave headroom
            let target_memory = (self.memory_threshold as f64 * 0.8) as usize;
            self.current_batch_size = (target_memory / avg_size).max(MIN_BATCH_SIZE);
        }
    }

    /// Whether the current batch should be flushed: either the row count hit
    /// the computed batch size, or the batch's memory footprint (actual when
    /// known, estimated from recent averages otherwise) reached the threshold.
    pub fn should_flush(&self, batch_size: usize, current_batch_memory: usize) -> bool {
        if batch_size >= self.current_batch_size {
            return true;
        }

        // Use actual memory size if available, otherwise estimate
        let memory_usage = if current_batch_memory > 0 {
            current_batch_memory
        } else {
            batch_size * self.recent_average_size()
        };

        memory_usage >= self.memory_threshold
    }
}
|
@@ -0,0 +1,237 @@
|
|
1
|
+
//! Cloneable ChunkReader implementation for streaming Parquet files
|
2
|
+
//!
|
3
|
+
//! This module provides a ChunkReader that implements Clone, enabling
|
4
|
+
//! true streaming of Parquet files without loading them entirely into memory.
|
5
|
+
|
6
|
+
use bytes::Bytes;
|
7
|
+
use parquet::file::reader::{ChunkReader, Length};
|
8
|
+
use parquet_core::Result;
|
9
|
+
use std::fs::File;
|
10
|
+
use std::io::{self, Read, Seek, SeekFrom};
|
11
|
+
use std::path::{Path, PathBuf};
|
12
|
+
|
13
|
+
use crate::io::ThreadSafeRubyIOReader;
|
14
|
+
|
15
|
+
/// A ChunkReader that can be cloned for parallel reading.
///
/// Each variant wraps a different cloning strategy so that independent
/// readers can be produced over the same underlying Parquet data.
#[derive(Clone)]
pub enum CloneableChunkReader {
    /// File-based reader that reopens files on clone
    File(FileChunkReader),
    /// Ruby IO-based reader using thread-safe wrapper
    RubyIO(RubyIOChunkReader),
    /// In-memory bytes (fallback for small files)
    Bytes(bytes::Bytes),
}
|
25
|
+
|
26
|
+
/// File-based chunk reader that reopens files for each clone
|
27
|
+
#[derive(Clone)]
|
28
|
+
pub struct FileChunkReader {
|
29
|
+
path: PathBuf,
|
30
|
+
file_len: u64,
|
31
|
+
}
|
32
|
+
|
33
|
+
impl FileChunkReader {
|
34
|
+
pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
|
35
|
+
let path = path.as_ref().to_path_buf();
|
36
|
+
let file = File::open(&path)?;
|
37
|
+
let metadata = file.metadata()?;
|
38
|
+
let file_len = metadata.len();
|
39
|
+
|
40
|
+
Ok(FileChunkReader { path, file_len })
|
41
|
+
}
|
42
|
+
}
|
43
|
+
|
44
|
+
/// Ruby IO-based chunk reader using thread-safe wrapper
|
45
|
+
#[derive(Clone)]
|
46
|
+
pub struct RubyIOChunkReader {
|
47
|
+
reader: ThreadSafeRubyIOReader,
|
48
|
+
len: u64,
|
49
|
+
}
|
50
|
+
|
51
|
+
impl RubyIOChunkReader {
|
52
|
+
pub fn new(reader: ThreadSafeRubyIOReader, len: u64) -> Self {
|
53
|
+
RubyIOChunkReader { reader, len }
|
54
|
+
}
|
55
|
+
}
|
56
|
+
|
57
|
+
/// A reader that yields at most a fixed number of bytes starting at a given
/// offset of an underlying `Read + Seek` source.
struct RangeReader<R> {
    inner: R,
    _start: u64,
    end: u64,
    pos: u64,
}

impl<R: Read + Seek> RangeReader<R> {
    /// Seek `inner` to `start` and restrict subsequent reads to `length`
    /// bytes from that position.
    fn new(mut inner: R, start: u64, length: u64) -> io::Result<Self> {
        inner.seek(SeekFrom::Start(start))?;
        Ok(Self {
            inner,
            _start: start,
            end: start + length,
            pos: start,
        })
    }
}

impl<R: Read> Read for RangeReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Bytes left inside the permitted window; signal EOF once exhausted.
        let remaining = (self.end - self.pos) as usize;
        if remaining == 0 {
            return Ok(0);
        }

        // Never hand the inner reader a buffer larger than the window.
        let cap = remaining.min(buf.len());
        let read = self.inner.read(&mut buf[..cap])?;
        self.pos += read as u64;
        Ok(read)
    }
}
|
90
|
+
|
91
|
+
// Implement Length trait for our readers
|
92
|
+
impl Length for FileChunkReader {
|
93
|
+
fn len(&self) -> u64 {
|
94
|
+
self.file_len
|
95
|
+
}
|
96
|
+
}
|
97
|
+
|
98
|
+
impl Length for RubyIOChunkReader {
|
99
|
+
fn len(&self) -> u64 {
|
100
|
+
self.len
|
101
|
+
}
|
102
|
+
}
|
103
|
+
|
104
|
+
impl Length for CloneableChunkReader {
|
105
|
+
fn len(&self) -> u64 {
|
106
|
+
match self {
|
107
|
+
CloneableChunkReader::File(f) => f.len(),
|
108
|
+
CloneableChunkReader::RubyIO(r) => r.len(),
|
109
|
+
CloneableChunkReader::Bytes(b) => b.len() as u64,
|
110
|
+
}
|
111
|
+
}
|
112
|
+
}
|
113
|
+
|
114
|
+
// Implement ChunkReader for FileChunkReader
|
115
|
+
impl ChunkReader for FileChunkReader {
|
116
|
+
type T = Box<dyn Read + Send>;
|
117
|
+
|
118
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
119
|
+
let file = File::open(&self.path)
|
120
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
121
|
+
let reader = RangeReader::new(file, start, self.file_len - start)
|
122
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
123
|
+
Ok(Box::new(reader))
|
124
|
+
}
|
125
|
+
|
126
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
127
|
+
let mut file = File::open(&self.path)
|
128
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
129
|
+
file.seek(SeekFrom::Start(start))
|
130
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
131
|
+
|
132
|
+
let mut buf = vec![0; length];
|
133
|
+
file.read_exact(&mut buf)
|
134
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
135
|
+
Ok(Bytes::from(buf))
|
136
|
+
}
|
137
|
+
}
|
138
|
+
|
139
|
+
// Implement ChunkReader for RubyIOChunkReader
|
140
|
+
impl ChunkReader for RubyIOChunkReader {
|
141
|
+
type T = Box<dyn Read + Send>;
|
142
|
+
|
143
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
144
|
+
// Clone the reader for thread-safe access
|
145
|
+
let mut reader = self.reader.clone();
|
146
|
+
|
147
|
+
// Seek to the start position
|
148
|
+
reader
|
149
|
+
.seek(SeekFrom::Start(start))
|
150
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
151
|
+
|
152
|
+
// Create a range reader that limits reading to the available data
|
153
|
+
let reader = RangeReader::new(reader, start, self.len - start)
|
154
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
155
|
+
Ok(Box::new(reader))
|
156
|
+
}
|
157
|
+
|
158
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
159
|
+
let mut reader = self.reader.clone();
|
160
|
+
reader
|
161
|
+
.seek(SeekFrom::Start(start))
|
162
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
163
|
+
|
164
|
+
let mut buf = vec![0; length];
|
165
|
+
reader
|
166
|
+
.read_exact(&mut buf)
|
167
|
+
.map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
|
168
|
+
Ok(Bytes::from(buf))
|
169
|
+
}
|
170
|
+
}
|
171
|
+
|
172
|
+
// Implement ChunkReader for CloneableChunkReader
|
173
|
+
impl ChunkReader for CloneableChunkReader {
|
174
|
+
type T = Box<dyn Read + Send>;
|
175
|
+
|
176
|
+
fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
|
177
|
+
match self {
|
178
|
+
CloneableChunkReader::File(f) => f.get_read(start),
|
179
|
+
CloneableChunkReader::RubyIO(r) => r.get_read(start),
|
180
|
+
CloneableChunkReader::Bytes(b) => {
|
181
|
+
// For bytes, we can use the built-in implementation
|
182
|
+
let bytes = b.clone();
|
183
|
+
let len = bytes.len();
|
184
|
+
if start as usize > len {
|
185
|
+
return Err(parquet::errors::ParquetError::IndexOutOfBound(
|
186
|
+
start as usize,
|
187
|
+
len,
|
188
|
+
));
|
189
|
+
}
|
190
|
+
let reader = std::io::Cursor::new(bytes.slice(start as usize..));
|
191
|
+
Ok(Box::new(reader))
|
192
|
+
}
|
193
|
+
}
|
194
|
+
}
|
195
|
+
|
196
|
+
fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
|
197
|
+
match self {
|
198
|
+
CloneableChunkReader::File(f) => f.get_bytes(start, length),
|
199
|
+
CloneableChunkReader::RubyIO(r) => r.get_bytes(start, length),
|
200
|
+
CloneableChunkReader::Bytes(b) => {
|
201
|
+
// For bytes, use the built-in slice functionality
|
202
|
+
let end = (start as usize).saturating_add(length).min(b.len());
|
203
|
+
Ok(b.slice(start as usize..end))
|
204
|
+
}
|
205
|
+
}
|
206
|
+
}
|
207
|
+
}
|
208
|
+
|
209
|
+
/// Create a CloneableChunkReader from various sources
|
210
|
+
impl CloneableChunkReader {
|
211
|
+
/// Create from a file path
|
212
|
+
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
|
213
|
+
Ok(CloneableChunkReader::File(FileChunkReader::new(path)?))
|
214
|
+
}
|
215
|
+
|
216
|
+
/// Create from a Ruby IO object
|
217
|
+
pub fn from_ruby_io(reader: ThreadSafeRubyIOReader) -> Result<Self> {
|
218
|
+
// Get the length by seeking to the end and back
|
219
|
+
let mut reader_clone = reader.clone();
|
220
|
+
let len = reader_clone.seek(SeekFrom::End(0))?;
|
221
|
+
reader_clone.seek(SeekFrom::Start(0))?;
|
222
|
+
|
223
|
+
Ok(CloneableChunkReader::RubyIO(RubyIOChunkReader::new(
|
224
|
+
reader, len,
|
225
|
+
)))
|
226
|
+
}
|
227
|
+
|
228
|
+
/// Create from bytes (for small files or testing)
|
229
|
+
pub fn from_bytes(bytes: Bytes) -> Self {
|
230
|
+
CloneableChunkReader::Bytes(bytes)
|
231
|
+
}
|
232
|
+
|
233
|
+
/// Check if this reader should use streaming (based on size threshold)
|
234
|
+
pub fn should_stream(&self, threshold_bytes: u64) -> bool {
|
235
|
+
self.len() > threshold_bytes
|
236
|
+
}
|
237
|
+
}
|