parquet 0.5.13 → 0.6.0

Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +3 -0
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -605
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet-ruby-adapter/src/batch_manager.rs
@@ -0,0 +1,116 @@
+ /// Default constants for batch processing
+ pub const SAMPLE_SIZE: usize = 100;
+ pub const MIN_BATCH_SIZE: usize = 10;
+ pub const INITIAL_BATCH_SIZE: usize = 100;
+ pub const DEFAULT_MEMORY_THRESHOLD: usize = 64 * 1024 * 1024;
+
+ /// Manages dynamic batch sizing based on memory usage
+ pub struct BatchSizeManager {
+     pub fixed_batch_size: Option<usize>,
+     pub memory_threshold: usize,
+     pub sample_size: usize,
+     pub row_size_samples: Vec<usize>,
+     pub current_batch_size: usize,
+     pub rows_processed: usize,
+     pub recent_row_sizes: Vec<usize>,
+ }
+
+ impl BatchSizeManager {
+     pub fn new(
+         fixed_batch_size: Option<usize>,
+         memory_threshold: Option<usize>,
+         sample_size: Option<usize>,
+     ) -> Self {
+         Self {
+             fixed_batch_size,
+             memory_threshold: memory_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD),
+             sample_size: sample_size.unwrap_or(SAMPLE_SIZE),
+             row_size_samples: Vec::new(),
+             current_batch_size: fixed_batch_size.unwrap_or(INITIAL_BATCH_SIZE),
+             rows_processed: 0,
+             recent_row_sizes: Vec::with_capacity(10),
+         }
+     }
+
+     pub fn record_row_size(&mut self, size: usize) {
+         self.rows_processed += 1;
+
+         // Always track recent row sizes for current memory estimation
+         if self.recent_row_sizes.len() >= 10 {
+             self.recent_row_sizes.remove(0);
+         }
+         self.recent_row_sizes.push(size);
+
+         // Sample for batch size calculation
+         if self.row_size_samples.len() < self.sample_size {
+             self.row_size_samples.push(size);
+
+             // Recalculate batch size after enough samples
+             if self.row_size_samples.len() >= MIN_BATCH_SIZE {
+                 self.recalculate_batch_size();
+             }
+         } else if self.rows_processed % 50 == 0 {
+             // Periodically adjust based on recent data
+             self.adjust_for_recent_sizes();
+         }
+     }
+
+     pub fn average_row_size(&self) -> usize {
+         if self.row_size_samples.is_empty() {
+             1024 // Default estimate
+         } else {
+             self.row_size_samples.iter().sum::<usize>() / self.row_size_samples.len()
+         }
+     }
+
+     pub fn recent_average_size(&self) -> usize {
+         if self.recent_row_sizes.is_empty() {
+             self.average_row_size()
+         } else {
+             self.recent_row_sizes.iter().sum::<usize>() / self.recent_row_sizes.len()
+         }
+     }
+
+     fn adjust_for_recent_sizes(&mut self) {
+         if self.fixed_batch_size.is_some() {
+             return;
+         }
+
+         let recent_avg = self.recent_average_size();
+         let overall_avg = self.average_row_size();
+
+         // If recent rows are significantly different from the sample average, adjust
+         if recent_avg > overall_avg * 2 || recent_avg < overall_avg / 2 {
+             let target_memory = (self.memory_threshold as f64 * 0.8) as usize;
+             self.current_batch_size = (target_memory / recent_avg).max(MIN_BATCH_SIZE);
+         }
+     }
+
+     fn recalculate_batch_size(&mut self) {
+         if self.fixed_batch_size.is_some() {
+             return; // User specified fixed size
+         }
+
+         let avg_size = self.average_row_size();
+         if avg_size > 0 {
+             // Target 80% of memory threshold to leave headroom
+             let target_memory = (self.memory_threshold as f64 * 0.8) as usize;
+             self.current_batch_size = (target_memory / avg_size).max(MIN_BATCH_SIZE);
+         }
+     }
+
+     pub fn should_flush(&self, batch_size: usize, current_batch_memory: usize) -> bool {
+         if batch_size >= self.current_batch_size {
+             return true;
+         }
+
+         // Use actual memory size if available, otherwise estimate
+         let memory_usage = if current_batch_memory > 0 {
+             current_batch_memory
+         } else {
+             batch_size * self.recent_average_size()
+         };
+
+         memory_usage >= self.memory_threshold
+     }
+ }
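
To make the flushing policy concrete, here is a minimal usage sketch, not code from the gem: a hypothetical write loop feeds each row's estimated size into `BatchSizeManager` and flushes whenever `should_flush` says so. The rows-as-byte-vectors and the `flush_batch` helper are stand-ins for the adapter's real row conversion and Arrow writing.

```rust
// Hypothetical driver loop; assumes BatchSizeManager from
// batch_manager.rs above is in scope.
fn write_all(rows: impl Iterator<Item = Vec<u8>>) {
    // No fixed batch size: the defaults of a 64 MiB threshold and a
    // 100-row sample window apply.
    let mut manager = BatchSizeManager::new(None, None, None);
    let mut batch: Vec<Vec<u8>> = Vec::new();
    let mut batch_memory = 0usize;

    for row in rows {
        let size = row.len(); // stand-in for a real per-row size estimate
        manager.record_row_size(size);
        batch.push(row);
        batch_memory += size;

        // Flush on either the adaptive row-count target or the memory cap.
        if manager.should_flush(batch.len(), batch_memory) {
            flush_batch(&mut batch); // hypothetical: writes out the batch
            batch_memory = 0;
        }
    }
    if !batch.is_empty() {
        flush_batch(&mut batch);
    }
}

// Placeholder for the adapter's actual write path.
fn flush_batch(batch: &mut Vec<Vec<u8>>) {
    batch.clear();
}
```

Note how the two arguments to `should_flush` serve different roles: the row count is checked against the adaptive target, while the running byte total guards against a burst of unusually large rows before the target is recalculated.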
data/ext/parquet-ruby-adapter/src/chunk_reader.rs
@@ -0,0 +1,237 @@
+ //! Cloneable ChunkReader implementation for streaming Parquet files
+ //!
+ //! This module provides a ChunkReader that implements Clone, enabling
+ //! true streaming of Parquet files without loading them entirely into memory.
+
+ use bytes::Bytes;
+ use parquet::file::reader::{ChunkReader, Length};
+ use parquet_core::Result;
+ use std::fs::File;
+ use std::io::{self, Read, Seek, SeekFrom};
+ use std::path::{Path, PathBuf};
+
+ use crate::io::ThreadSafeRubyIOReader;
+
+ /// A ChunkReader that can be cloned for parallel reading
+ #[derive(Clone)]
+ pub enum CloneableChunkReader {
+     /// File-based reader that reopens files on clone
+     File(FileChunkReader),
+     /// Ruby IO-based reader using thread-safe wrapper
+     RubyIO(RubyIOChunkReader),
+     /// In-memory bytes (fallback for small files)
+     Bytes(bytes::Bytes),
+ }
+
+ /// File-based chunk reader that reopens files for each clone
+ #[derive(Clone)]
+ pub struct FileChunkReader {
+     path: PathBuf,
+     file_len: u64,
+ }
+
+ impl FileChunkReader {
+     pub fn new<P: AsRef<Path>>(path: P) -> Result<Self> {
+         let path = path.as_ref().to_path_buf();
+         let file = File::open(&path)?;
+         let metadata = file.metadata()?;
+         let file_len = metadata.len();
+
+         Ok(FileChunkReader { path, file_len })
+     }
+ }
+
+ /// Ruby IO-based chunk reader using thread-safe wrapper
+ #[derive(Clone)]
+ pub struct RubyIOChunkReader {
+     reader: ThreadSafeRubyIOReader,
+     len: u64,
+ }
+
+ impl RubyIOChunkReader {
+     pub fn new(reader: ThreadSafeRubyIOReader, len: u64) -> Self {
+         RubyIOChunkReader { reader, len }
+     }
+ }
+
+ /// A reader that reads a specific range from a ChunkReader
+ struct RangeReader<R> {
+     inner: R,
+     _start: u64,
+     end: u64,
+     pos: u64,
+ }
+
+ impl<R: Read + Seek> RangeReader<R> {
+     fn new(mut inner: R, start: u64, length: u64) -> io::Result<Self> {
+         inner.seek(SeekFrom::Start(start))?;
+         Ok(RangeReader {
+             inner,
+             _start: start,
+             end: start + length,
+             pos: start,
+         })
+     }
+ }
+
+ impl<R: Read> Read for RangeReader<R> {
+     fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+         let remaining = (self.end - self.pos) as usize;
+         if remaining == 0 {
+             return Ok(0);
+         }
+
+         let to_read = buf.len().min(remaining);
+         let n = self.inner.read(&mut buf[..to_read])?;
+         self.pos += n as u64;
+         Ok(n)
+     }
+ }
+
+ // Implement Length trait for our readers
+ impl Length for FileChunkReader {
+     fn len(&self) -> u64 {
+         self.file_len
+     }
+ }
+
+ impl Length for RubyIOChunkReader {
+     fn len(&self) -> u64 {
+         self.len
+     }
+ }
+
+ impl Length for CloneableChunkReader {
+     fn len(&self) -> u64 {
+         match self {
+             CloneableChunkReader::File(f) => f.len(),
+             CloneableChunkReader::RubyIO(r) => r.len(),
+             CloneableChunkReader::Bytes(b) => b.len() as u64,
+         }
+     }
+ }
+
+ // Implement ChunkReader for FileChunkReader
+ impl ChunkReader for FileChunkReader {
+     type T = Box<dyn Read + Send>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         let file = File::open(&self.path)
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+         let reader = RangeReader::new(file, start, self.file_len - start)
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+         Ok(Box::new(reader))
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         let mut file = File::open(&self.path)
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+         file.seek(SeekFrom::Start(start))
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+
+         let mut buf = vec![0; length];
+         file.read_exact(&mut buf)
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+         Ok(Bytes::from(buf))
+     }
+ }
+
+ // Implement ChunkReader for RubyIOChunkReader
+ impl ChunkReader for RubyIOChunkReader {
+     type T = Box<dyn Read + Send>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         // Clone the reader for thread-safe access
+         let mut reader = self.reader.clone();
+
+         // Seek to the start position
+         reader
+             .seek(SeekFrom::Start(start))
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+
+         // Create a range reader that limits reading to the available data
+         let reader = RangeReader::new(reader, start, self.len - start)
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+         Ok(Box::new(reader))
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         let mut reader = self.reader.clone();
+         reader
+             .seek(SeekFrom::Start(start))
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+
+         let mut buf = vec![0; length];
+         reader
+             .read_exact(&mut buf)
+             .map_err(|e| parquet::errors::ParquetError::External(Box::new(e)))?;
+         Ok(Bytes::from(buf))
+     }
+ }
+
+ // Implement ChunkReader for CloneableChunkReader
+ impl ChunkReader for CloneableChunkReader {
+     type T = Box<dyn Read + Send>;
+
+     fn get_read(&self, start: u64) -> parquet::errors::Result<Self::T> {
+         match self {
+             CloneableChunkReader::File(f) => f.get_read(start),
+             CloneableChunkReader::RubyIO(r) => r.get_read(start),
+             CloneableChunkReader::Bytes(b) => {
+                 // For bytes, we can use the built-in implementation
+                 let bytes = b.clone();
+                 let len = bytes.len();
+                 if start as usize > len {
+                     return Err(parquet::errors::ParquetError::IndexOutOfBound(
+                         start as usize,
+                         len,
+                     ));
+                 }
+                 let reader = std::io::Cursor::new(bytes.slice(start as usize..));
+                 Ok(Box::new(reader))
+             }
+         }
+     }
+
+     fn get_bytes(&self, start: u64, length: usize) -> parquet::errors::Result<Bytes> {
+         match self {
+             CloneableChunkReader::File(f) => f.get_bytes(start, length),
+             CloneableChunkReader::RubyIO(r) => r.get_bytes(start, length),
+             CloneableChunkReader::Bytes(b) => {
+                 // For bytes, use the built-in slice functionality
+                 let end = (start as usize).saturating_add(length).min(b.len());
+                 Ok(b.slice(start as usize..end))
+             }
+         }
+     }
+ }
+
+ /// Create a CloneableChunkReader from various sources
+ impl CloneableChunkReader {
+     /// Create from a file path
+     pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self> {
+         Ok(CloneableChunkReader::File(FileChunkReader::new(path)?))
+     }
+
+     /// Create from a Ruby IO object
+     pub fn from_ruby_io(reader: ThreadSafeRubyIOReader) -> Result<Self> {
+         // Get the length by seeking to the end and back
+         let mut reader_clone = reader.clone();
+         let len = reader_clone.seek(SeekFrom::End(0))?;
+         reader_clone.seek(SeekFrom::Start(0))?;
+
+         Ok(CloneableChunkReader::RubyIO(RubyIOChunkReader::new(
+             reader, len,
+         )))
+     }
+
+     /// Create from bytes (for small files or testing)
+     pub fn from_bytes(bytes: Bytes) -> Self {
+         CloneableChunkReader::Bytes(bytes)
+     }
+
+     /// Check if this reader should use streaming (based on size threshold)
+     pub fn should_stream(&self, threshold_bytes: u64) -> bool {
+         self.len() > threshold_bytes
+     }
+ }
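
A minimal sketch of the reader in use, again not code from the gem: it exercises the `Bytes` variant through the same `ChunkReader` trait methods the parquet crate calls, with eleven bytes bracketed by the `PAR1` magic standing in for a real file. Only the `bytes` and `parquet` crates from the module's own imports are assumed.

```rust
use bytes::Bytes;
use parquet::file::reader::{ChunkReader, Length};
use std::io::Read;

// Assumes CloneableChunkReader from chunk_reader.rs above is in scope.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Fake 11-byte "file" bracketed by the Parquet magic.
    let reader = CloneableChunkReader::from_bytes(Bytes::from_static(b"PAR1...PAR1"));

    // Clones are independent handles with no shared seek position,
    // which is what allows parallel row-group reads.
    let clone = reader.clone();
    assert_eq!(clone.len(), reader.len());

    // Random access through the two ChunkReader entry points.
    let magic = reader.get_bytes(0, 4)?;
    assert_eq!(&magic[..], &b"PAR1"[..]);

    let mut tail = Vec::new();
    reader.get_read(reader.len() - 4)?.read_to_end(&mut tail)?;
    assert_eq!(&tail[..], &b"PAR1"[..]);
    Ok(())
}
```

For a buffer this small, `should_stream` would report false at any realistic threshold, which is exactly the case the in-memory `Bytes` fallback exists for; file and Ruby IO sources go through the cloning variants instead.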