parquet-tyfoom 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Cargo.lock +1854 -0
- data/Cargo.toml +3 -0
- data/Gemfile +21 -0
- data/LICENSE +21 -0
- data/README.md +428 -0
- data/Rakefile +43 -0
- data/ext/parquet/Cargo.toml +39 -0
- data/ext/parquet/build.rs +5 -0
- data/ext/parquet/extconf.rb +4 -0
- data/ext/parquet/src/adapter_ffi.rs +297 -0
- data/ext/parquet/src/allocator.rs +13 -0
- data/ext/parquet/src/lib.rs +24 -0
- data/ext/parquet-core/Cargo.toml +24 -0
- data/ext/parquet-core/src/arrow_conversion.rs +1243 -0
- data/ext/parquet-core/src/error.rs +189 -0
- data/ext/parquet-core/src/lib.rs +60 -0
- data/ext/parquet-core/src/reader.rs +368 -0
- data/ext/parquet-core/src/schema.rs +452 -0
- data/ext/parquet-core/src/test_utils.rs +308 -0
- data/ext/parquet-core/src/traits/mod.rs +5 -0
- data/ext/parquet-core/src/traits/schema.rs +190 -0
- data/ext/parquet-core/src/value.rs +220 -0
- data/ext/parquet-core/src/writer.rs +1241 -0
- data/ext/parquet-core/tests/arrow_conversion_tests.rs +484 -0
- data/ext/parquet-core/tests/binary_data.rs +437 -0
- data/ext/parquet-core/tests/column_projection.rs +557 -0
- data/ext/parquet-core/tests/complex_types.rs +821 -0
- data/ext/parquet-core/tests/compression_tests.rs +434 -0
- data/ext/parquet-core/tests/concurrent_access.rs +431 -0
- data/ext/parquet-core/tests/decimal_tests.rs +488 -0
- data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
- data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +540 -0
- data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
- data/ext/parquet-core/tests/performance_memory.rs +181 -0
- data/ext/parquet-core/tests/primitive_types.rs +547 -0
- data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
- data/ext/parquet-core/tests/review_regressions.rs +787 -0
- data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
- data/ext/parquet-core/tests/schema_comprehensive_tests.rs +542 -0
- data/ext/parquet-core/tests/temporal_tests.rs +518 -0
- data/ext/parquet-core/tests/test_helpers.rs +132 -0
- data/ext/parquet-core/tests/writer_tests.rs +545 -0
- data/ext/parquet-ruby-adapter/Cargo.toml +24 -0
- data/ext/parquet-ruby-adapter/build.rs +5 -0
- data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
- data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
- data/ext/parquet-ruby-adapter/src/converter.rs +1734 -0
- data/ext/parquet-ruby-adapter/src/error.rs +141 -0
- data/ext/parquet-ruby-adapter/src/io.rs +432 -0
- data/ext/parquet-ruby-adapter/src/lib.rs +91 -0
- data/ext/parquet-ruby-adapter/src/logger.rs +67 -0
- data/ext/parquet-ruby-adapter/src/metadata.rs +529 -0
- data/ext/parquet-ruby-adapter/src/reader.rs +339 -0
- data/ext/parquet-ruby-adapter/src/schema.rs +884 -0
- data/ext/parquet-ruby-adapter/src/string_cache.rs +115 -0
- data/ext/parquet-ruby-adapter/src/string_cache_test.rs +122 -0
- data/ext/parquet-ruby-adapter/src/string_storage.rs +632 -0
- data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
- data/ext/parquet-ruby-adapter/src/types.rs +98 -0
- data/ext/parquet-ruby-adapter/src/utils.rs +280 -0
- data/ext/parquet-ruby-adapter/src/writer.rs +625 -0
- data/lib/parquet/schema.rb +262 -0
- data/lib/parquet/version.rb +3 -0
- data/lib/parquet.rb +11 -0
- data/lib/parquet.rbi +181 -0
- metadata +165 -0
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
use magnus::value::{BoxValue, ReprValue};
|
|
2
|
+
use magnus::{Error as MagnusError, IntoValue, RArray, RHash, Ruby, TryConvert, Value};
|
|
3
|
+
use parquet_core::reader::Reader;
|
|
4
|
+
|
|
5
|
+
use crate::{
|
|
6
|
+
converter::parquet_to_ruby,
|
|
7
|
+
io::{RubyIOReader, ThreadSafeRubyIOReader},
|
|
8
|
+
logger::RubyLogger,
|
|
9
|
+
string_storage::{StringStorage, StringStorageConfig},
|
|
10
|
+
types::{ColumnEnumeratorArgs, ParserResultType, RowEnumeratorArgs},
|
|
11
|
+
utils::{create_column_enumerator, create_row_enumerator, handle_block_or_enum},
|
|
12
|
+
CloneableChunkReader,
|
|
13
|
+
};
|
|
14
|
+
use std::collections::HashSet;
|
|
15
|
+
|
|
16
|
+
/// Read parquet file row by row
|
|
17
|
+
pub fn each_row(
|
|
18
|
+
ruby: &Ruby,
|
|
19
|
+
rb_self: Value,
|
|
20
|
+
to_read: Value,
|
|
21
|
+
result_type: ParserResultType,
|
|
22
|
+
columns: Option<Vec<String>>,
|
|
23
|
+
strict: bool,
|
|
24
|
+
string_storage: StringStorageConfig,
|
|
25
|
+
logger: RubyLogger,
|
|
26
|
+
) -> Result<Value, MagnusError> {
|
|
27
|
+
if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
|
|
28
|
+
create_row_enumerator(
|
|
29
|
+
ruby,
|
|
30
|
+
RowEnumeratorArgs {
|
|
31
|
+
rb_self,
|
|
32
|
+
to_read,
|
|
33
|
+
result_type,
|
|
34
|
+
columns: columns.clone(),
|
|
35
|
+
strict,
|
|
36
|
+
string_storage,
|
|
37
|
+
logger: logger.inner(),
|
|
38
|
+
},
|
|
39
|
+
)
|
|
40
|
+
.map(|yield_enum| yield_enum.into_value_with(ruby))
|
|
41
|
+
})? {
|
|
42
|
+
return Ok(enum_value);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Log start of processing
|
|
46
|
+
let _ = logger.info(|| "Starting to read parquet file".to_string());
|
|
47
|
+
|
|
48
|
+
// Create a streaming reader based on input type
|
|
49
|
+
let chunk_reader = if to_read.is_kind_of(ruby.class_string()) {
|
|
50
|
+
let path_str: String = TryConvert::try_convert(to_read)?;
|
|
51
|
+
let _ = logger.debug(|| format!("Reading from file: {}", path_str));
|
|
52
|
+
CloneableChunkReader::from_path(&path_str)
|
|
53
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
|
|
54
|
+
} else if to_read.respond_to("read", false)? {
|
|
55
|
+
// Handle IO objects with streaming
|
|
56
|
+
let _ = logger.debug(|| "Reading from IO object".to_string());
|
|
57
|
+
let ruby_reader = RubyIOReader::new(to_read)
|
|
58
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
59
|
+
let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
|
|
60
|
+
|
|
61
|
+
CloneableChunkReader::from_ruby_io(thread_safe_reader)
|
|
62
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
|
|
63
|
+
} else {
|
|
64
|
+
return Err(MagnusError::new(
|
|
65
|
+
ruby.exception_runtime_error(),
|
|
66
|
+
format!(
|
|
67
|
+
"Invalid input type: expected String or IO object with read method, got {}",
|
|
68
|
+
to_read.class()
|
|
69
|
+
),
|
|
70
|
+
));
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
let reader = Reader::new(chunk_reader.clone());
|
|
74
|
+
let mut reader_for_metadata = Reader::new(chunk_reader);
|
|
75
|
+
|
|
76
|
+
// Get metadata to extract column names
|
|
77
|
+
let metadata = reader_for_metadata
|
|
78
|
+
.metadata()
|
|
79
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
80
|
+
let schema = metadata.schema();
|
|
81
|
+
let all_column_names: Vec<String> = schema
|
|
82
|
+
.get_fields()
|
|
83
|
+
.iter()
|
|
84
|
+
.map(|f| f.name().to_string())
|
|
85
|
+
.collect();
|
|
86
|
+
|
|
87
|
+
let _ = logger.info(|| format!("Processing {} columns", all_column_names.len()));
|
|
88
|
+
|
|
89
|
+
// Get the row iterator. Projected rows are yielded in file-schema order, not
|
|
90
|
+
// request order, so the hash keys must follow file order too — derive them by
|
|
91
|
+
// filtering the file columns, never from the request-ordered `cols`.
|
|
92
|
+
let (row_iter, column_names) = if let Some(ref cols) = columns {
|
|
93
|
+
let requested = cols.iter().map(String::as_str).collect::<HashSet<_>>();
|
|
94
|
+
let projected_names = all_column_names
|
|
95
|
+
.iter()
|
|
96
|
+
.filter(|name| requested.contains(name.as_str()))
|
|
97
|
+
.cloned()
|
|
98
|
+
.collect::<Vec<_>>();
|
|
99
|
+
let iter = reader
|
|
100
|
+
.read_rows_with_projection(cols)
|
|
101
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
102
|
+
(iter, projected_names)
|
|
103
|
+
} else {
|
|
104
|
+
let iter = reader
|
|
105
|
+
.read_rows()
|
|
106
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
107
|
+
(iter, all_column_names)
|
|
108
|
+
};
|
|
109
|
+
|
|
110
|
+
let mut row_count = 0u64;
|
|
111
|
+
let mut string_storage = StringStorage::new(string_storage);
|
|
112
|
+
|
|
113
|
+
// BoxValue registers each interned key with the GC so it survives a
|
|
114
|
+
// GC.compact triggered by user code inside the yield loop; a bare RString
|
|
115
|
+
// held in this Vec would be relocated and dangle.
|
|
116
|
+
let interned_column_names = column_names
|
|
117
|
+
.iter()
|
|
118
|
+
.map(|name| BoxValue::new(ruby.str_new(name).to_interned_str()))
|
|
119
|
+
.collect::<Vec<_>>();
|
|
120
|
+
|
|
121
|
+
for row_result in row_iter {
|
|
122
|
+
let row = row_result
|
|
123
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
124
|
+
|
|
125
|
+
// Convert row to Ruby value based on result_type
|
|
126
|
+
let ruby_row = match result_type {
|
|
127
|
+
ParserResultType::Array => {
|
|
128
|
+
let array: RArray = ruby.ary_new_capa(row.len());
|
|
129
|
+
for value in row {
|
|
130
|
+
let ruby_value = parquet_to_ruby(value, &mut string_storage).map_err(|e| {
|
|
131
|
+
MagnusError::new(ruby.exception_runtime_error(), e.to_string())
|
|
132
|
+
})?;
|
|
133
|
+
array.push(ruby_value)?;
|
|
134
|
+
}
|
|
135
|
+
array.as_value()
|
|
136
|
+
}
|
|
137
|
+
ParserResultType::Hash => {
|
|
138
|
+
let hash: RHash = ruby.hash_new_capa(row.len());
|
|
139
|
+
for (idx, value) in row.into_iter().enumerate() {
|
|
140
|
+
if idx < interned_column_names.len() {
|
|
141
|
+
let ruby_value =
|
|
142
|
+
parquet_to_ruby(value, &mut string_storage).map_err(|e| {
|
|
143
|
+
MagnusError::new(ruby.exception_runtime_error(), e.to_string())
|
|
144
|
+
})?;
|
|
145
|
+
hash.aset(interned_column_names[idx].as_value(), ruby_value)?;
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
hash.as_value()
|
|
149
|
+
}
|
|
150
|
+
};
|
|
151
|
+
|
|
152
|
+
let _: Value = ruby.yield_value(ruby_row)?;
|
|
153
|
+
|
|
154
|
+
row_count += 1;
|
|
155
|
+
if row_count % 1000 == 0 {
|
|
156
|
+
let _ = logger.debug(|| format!("Processed {} rows", row_count));
|
|
157
|
+
}
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
let _ = logger.info(|| format!("Finished processing {} rows", row_count));
|
|
161
|
+
|
|
162
|
+
Ok(ruby.qnil().as_value())
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/// Arguments for each_column function
|
|
166
|
+
struct EachColumnArgs {
|
|
167
|
+
rb_self: Value,
|
|
168
|
+
to_read: Value,
|
|
169
|
+
result_type: ParserResultType,
|
|
170
|
+
columns: Option<Vec<String>>,
|
|
171
|
+
batch_size: Option<usize>,
|
|
172
|
+
strict: bool,
|
|
173
|
+
string_storage: StringStorageConfig,
|
|
174
|
+
logger: RubyLogger,
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
/// Read parquet file column by column
|
|
178
|
+
#[allow(clippy::too_many_arguments)]
|
|
179
|
+
pub fn each_column(
|
|
180
|
+
ruby: &Ruby,
|
|
181
|
+
rb_self: Value,
|
|
182
|
+
to_read: Value,
|
|
183
|
+
result_type: ParserResultType,
|
|
184
|
+
columns: Option<Vec<String>>,
|
|
185
|
+
batch_size: Option<usize>,
|
|
186
|
+
strict: bool,
|
|
187
|
+
string_storage: StringStorageConfig,
|
|
188
|
+
logger: RubyLogger,
|
|
189
|
+
) -> Result<Value, MagnusError> {
|
|
190
|
+
let args = EachColumnArgs {
|
|
191
|
+
rb_self,
|
|
192
|
+
to_read,
|
|
193
|
+
result_type,
|
|
194
|
+
columns,
|
|
195
|
+
batch_size,
|
|
196
|
+
strict,
|
|
197
|
+
string_storage,
|
|
198
|
+
logger,
|
|
199
|
+
};
|
|
200
|
+
each_column_impl(ruby, args)
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
fn each_column_impl(ruby: &Ruby, args: EachColumnArgs) -> Result<Value, MagnusError> {
|
|
204
|
+
if let Some(enum_value) = handle_block_or_enum(ruby.block_given(), || {
|
|
205
|
+
create_column_enumerator(
|
|
206
|
+
ruby,
|
|
207
|
+
ColumnEnumeratorArgs {
|
|
208
|
+
rb_self: args.rb_self,
|
|
209
|
+
to_read: args.to_read,
|
|
210
|
+
result_type: args.result_type,
|
|
211
|
+
columns: args.columns.clone(),
|
|
212
|
+
batch_size: args.batch_size,
|
|
213
|
+
strict: args.strict,
|
|
214
|
+
string_storage: args.string_storage,
|
|
215
|
+
logger: args.logger.inner(),
|
|
216
|
+
},
|
|
217
|
+
)
|
|
218
|
+
.map(|yield_enum| yield_enum.into_value_with(ruby))
|
|
219
|
+
})? {
|
|
220
|
+
return Ok(enum_value);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
// Log start of processing
|
|
224
|
+
let _ = args
|
|
225
|
+
.logger
|
|
226
|
+
.info(|| "Starting to read parquet file columns".to_string());
|
|
227
|
+
|
|
228
|
+
// Create a streaming reader based on input type
|
|
229
|
+
let chunk_reader = if args.to_read.is_kind_of(ruby.class_string()) {
|
|
230
|
+
let path_str: String = TryConvert::try_convert(args.to_read)?;
|
|
231
|
+
let _ = args
|
|
232
|
+
.logger
|
|
233
|
+
.debug(|| format!("Reading columns from file: {}", path_str));
|
|
234
|
+
CloneableChunkReader::from_path(&path_str)
|
|
235
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
|
|
236
|
+
} else if args.to_read.respond_to("read", false)? {
|
|
237
|
+
// Handle IO objects with streaming
|
|
238
|
+
let _ = args
|
|
239
|
+
.logger
|
|
240
|
+
.debug(|| "Reading columns from IO object".to_string());
|
|
241
|
+
let ruby_reader = RubyIOReader::new(args.to_read)
|
|
242
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
243
|
+
let thread_safe_reader = ThreadSafeRubyIOReader::new(ruby_reader);
|
|
244
|
+
|
|
245
|
+
CloneableChunkReader::from_ruby_io(thread_safe_reader)
|
|
246
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?
|
|
247
|
+
} else {
|
|
248
|
+
return Err(MagnusError::new(
|
|
249
|
+
ruby.exception_runtime_error(),
|
|
250
|
+
format!(
|
|
251
|
+
"Invalid input type: expected String or IO object with read method, got {}",
|
|
252
|
+
args.to_read.class()
|
|
253
|
+
),
|
|
254
|
+
));
|
|
255
|
+
};
|
|
256
|
+
|
|
257
|
+
let reader = Reader::new(chunk_reader.clone());
|
|
258
|
+
let mut reader_for_metadata = Reader::new(chunk_reader);
|
|
259
|
+
|
|
260
|
+
// Get metadata to extract column names
|
|
261
|
+
let metadata = reader_for_metadata
|
|
262
|
+
.metadata()
|
|
263
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
264
|
+
let schema = metadata.schema();
|
|
265
|
+
let all_column_names: Vec<String> = schema
|
|
266
|
+
.get_fields()
|
|
267
|
+
.iter()
|
|
268
|
+
.map(|f| f.name().to_string())
|
|
269
|
+
.collect();
|
|
270
|
+
|
|
271
|
+
// Get the column iterator
|
|
272
|
+
let (col_iter, _column_names) = if let Some(ref cols) = args.columns {
|
|
273
|
+
let iter = reader
|
|
274
|
+
.read_columns_with_projection(cols, args.batch_size)
|
|
275
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
276
|
+
(iter, cols.clone())
|
|
277
|
+
} else {
|
|
278
|
+
let iter = reader
|
|
279
|
+
.read_columns(args.batch_size)
|
|
280
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
281
|
+
(iter, all_column_names)
|
|
282
|
+
};
|
|
283
|
+
|
|
284
|
+
let mut batch_count = 0u64;
|
|
285
|
+
let mut string_storage = StringStorage::new(args.string_storage);
|
|
286
|
+
|
|
287
|
+
for batch_result in col_iter {
|
|
288
|
+
let batch = batch_result
|
|
289
|
+
.map_err(|e| MagnusError::new(ruby.exception_runtime_error(), e.to_string()))?;
|
|
290
|
+
|
|
291
|
+
// Convert batch to Ruby value based on result_type
|
|
292
|
+
let ruby_batch = match args.result_type {
|
|
293
|
+
ParserResultType::Array => {
|
|
294
|
+
let array: RArray = ruby.ary_new_capa(batch.columns.len());
|
|
295
|
+
for (_name, values) in batch.columns {
|
|
296
|
+
let col_array: RArray = ruby.ary_new_capa(values.len());
|
|
297
|
+
for value in values {
|
|
298
|
+
let ruby_value =
|
|
299
|
+
parquet_to_ruby(value, &mut string_storage).map_err(|e| {
|
|
300
|
+
MagnusError::new(ruby.exception_runtime_error(), e.to_string())
|
|
301
|
+
})?;
|
|
302
|
+
col_array.push(ruby_value)?;
|
|
303
|
+
}
|
|
304
|
+
array.push(col_array)?;
|
|
305
|
+
}
|
|
306
|
+
array.as_value()
|
|
307
|
+
}
|
|
308
|
+
ParserResultType::Hash => {
|
|
309
|
+
let hash: RHash = ruby.hash_new();
|
|
310
|
+
for (name, values) in batch.columns {
|
|
311
|
+
let col_array: RArray = ruby.ary_new_capa(values.len());
|
|
312
|
+
for value in values {
|
|
313
|
+
let ruby_value =
|
|
314
|
+
parquet_to_ruby(value, &mut string_storage).map_err(|e| {
|
|
315
|
+
MagnusError::new(ruby.exception_runtime_error(), e.to_string())
|
|
316
|
+
})?;
|
|
317
|
+
col_array.push(ruby_value)?;
|
|
318
|
+
}
|
|
319
|
+
let ruby_key = string_storage.ruby_key(ruby, &name);
|
|
320
|
+
hash.aset(ruby_key, col_array)?;
|
|
321
|
+
}
|
|
322
|
+
hash.as_value()
|
|
323
|
+
}
|
|
324
|
+
};
|
|
325
|
+
|
|
326
|
+
let _: Value = ruby.yield_value(ruby_batch)?;
|
|
327
|
+
|
|
328
|
+
batch_count += 1;
|
|
329
|
+
let _ = args
|
|
330
|
+
.logger
|
|
331
|
+
.debug(|| format!("Processed batch {}", batch_count));
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
let _ = args
|
|
335
|
+
.logger
|
|
336
|
+
.info(|| format!("Finished processing {} batches", batch_count));
|
|
337
|
+
|
|
338
|
+
Ok(ruby.qnil().as_value())
|
|
339
|
+
}
|