parquet 0.5.12 → 0.6.0

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (79)
  1. checksums.yaml +4 -4
  2. data/Cargo.lock +295 -98
  3. data/Cargo.toml +1 -1
  4. data/Gemfile +1 -0
  5. data/README.md +94 -3
  6. data/ext/parquet/Cargo.toml +8 -5
  7. data/ext/parquet/src/adapter_ffi.rs +156 -0
  8. data/ext/parquet/src/lib.rs +13 -21
  9. data/ext/parquet-core/Cargo.toml +23 -0
  10. data/ext/parquet-core/src/arrow_conversion.rs +1133 -0
  11. data/ext/parquet-core/src/error.rs +163 -0
  12. data/ext/parquet-core/src/lib.rs +60 -0
  13. data/ext/parquet-core/src/reader.rs +263 -0
  14. data/ext/parquet-core/src/schema.rs +283 -0
  15. data/ext/parquet-core/src/test_utils.rs +308 -0
  16. data/ext/parquet-core/src/traits/mod.rs +5 -0
  17. data/ext/parquet-core/src/traits/schema.rs +151 -0
  18. data/ext/parquet-core/src/value.rs +209 -0
  19. data/ext/parquet-core/src/writer.rs +839 -0
  20. data/ext/parquet-core/tests/arrow_conversion_tests.rs +423 -0
  21. data/ext/parquet-core/tests/binary_data.rs +437 -0
  22. data/ext/parquet-core/tests/column_projection.rs +557 -0
  23. data/ext/parquet-core/tests/complex_types.rs +821 -0
  24. data/ext/parquet-core/tests/compression_tests.rs +434 -0
  25. data/ext/parquet-core/tests/concurrent_access.rs +430 -0
  26. data/ext/parquet-core/tests/decimal_tests.rs +488 -0
  27. data/ext/parquet-core/tests/edge_cases_corner_cases.rs +322 -0
  28. data/ext/parquet-core/tests/error_handling_comprehensive_tests.rs +547 -0
  29. data/ext/parquet-core/tests/null_handling_tests.rs +430 -0
  30. data/ext/parquet-core/tests/performance_memory.rs +181 -0
  31. data/ext/parquet-core/tests/primitive_types.rs +547 -0
  32. data/ext/parquet-core/tests/real_world_patterns.rs +777 -0
  33. data/ext/parquet-core/tests/roundtrip_correctness.rs +279 -0
  34. data/ext/parquet-core/tests/schema_comprehensive_tests.rs +534 -0
  35. data/ext/parquet-core/tests/temporal_tests.rs +518 -0
  36. data/ext/parquet-core/tests/test_helpers.rs +132 -0
  37. data/ext/parquet-core/tests/writer_tests.rs +545 -0
  38. data/ext/parquet-ruby-adapter/Cargo.toml +22 -0
  39. data/ext/parquet-ruby-adapter/build.rs +5 -0
  40. data/ext/parquet-ruby-adapter/examples/try_into_value_demo.rs +98 -0
  41. data/ext/parquet-ruby-adapter/src/batch_manager.rs +116 -0
  42. data/ext/parquet-ruby-adapter/src/chunk_reader.rs +237 -0
  43. data/ext/parquet-ruby-adapter/src/converter.rs +1685 -0
  44. data/ext/parquet-ruby-adapter/src/error.rs +148 -0
  45. data/ext/{parquet/src/ruby_reader.rs → parquet-ruby-adapter/src/io.rs} +190 -56
  46. data/ext/parquet-ruby-adapter/src/lib.rs +90 -0
  47. data/ext/parquet-ruby-adapter/src/logger.rs +64 -0
  48. data/ext/parquet-ruby-adapter/src/metadata.rs +427 -0
  49. data/ext/parquet-ruby-adapter/src/reader.rs +317 -0
  50. data/ext/parquet-ruby-adapter/src/schema.rs +810 -0
  51. data/ext/parquet-ruby-adapter/src/string_cache.rs +106 -0
  52. data/ext/parquet-ruby-adapter/src/try_into_value.rs +91 -0
  53. data/ext/parquet-ruby-adapter/src/types.rs +94 -0
  54. data/ext/parquet-ruby-adapter/src/utils.rs +186 -0
  55. data/ext/parquet-ruby-adapter/src/writer.rs +435 -0
  56. data/lib/parquet/schema.rb +19 -0
  57. data/lib/parquet/version.rb +1 -1
  58. metadata +50 -24
  59. data/ext/parquet/src/enumerator.rs +0 -68
  60. data/ext/parquet/src/header_cache.rs +0 -99
  61. data/ext/parquet/src/logger.rs +0 -171
  62. data/ext/parquet/src/reader/common.rs +0 -111
  63. data/ext/parquet/src/reader/mod.rs +0 -211
  64. data/ext/parquet/src/reader/parquet_column_reader.rs +0 -44
  65. data/ext/parquet/src/reader/parquet_row_reader.rs +0 -43
  66. data/ext/parquet/src/reader/unified/mod.rs +0 -363
  67. data/ext/parquet/src/types/core_types.rs +0 -120
  68. data/ext/parquet/src/types/mod.rs +0 -100
  69. data/ext/parquet/src/types/parquet_value.rs +0 -1275
  70. data/ext/parquet/src/types/record_types.rs +0 -603
  71. data/ext/parquet/src/types/schema_converter.rs +0 -290
  72. data/ext/parquet/src/types/schema_node.rs +0 -424
  73. data/ext/parquet/src/types/timestamp.rs +0 -285
  74. data/ext/parquet/src/types/type_conversion.rs +0 -1949
  75. data/ext/parquet/src/types/writer_types.rs +0 -329
  76. data/ext/parquet/src/utils.rs +0 -184
  77. data/ext/parquet/src/writer/mod.rs +0 -505
  78. data/ext/parquet/src/writer/write_columns.rs +0 -238
  79. data/ext/parquet/src/writer/write_rows.rs +0 -488
data/ext/parquet/src/writer/write_rows.rs
@@ -1,488 +0,0 @@
- use super::{
-     build_column_collectors_from_dsl, copy_temp_file_to_io_like, create_writer,
-     parse_parquet_write_args, DEFAULT_MEMORY_THRESHOLD, INITIAL_BATCH_SIZE, MIN_BATCH_SIZE,
-     SAMPLE_SIZE,
- };
- use crate::{
-     logger::RubyLogger,
-     types::{
-         schema_node::build_arrow_schema, ColumnCollector, ParquetGemError, ParquetSchemaType,
-         PrimitiveType, WriterOutput,
-     },
-     IoLikeValue, ParquetWriteArgs,
- };
- use arrow_array::{Array, RecordBatch};
- use magnus::{
-     value::ReprValue, Error as MagnusError, RArray, RHash, Ruby, Symbol, TryConvert, Value,
- };
- use rand::Rng;
- use std::{rc::Rc, sync::Arc};
-
- const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
-
- #[inline]
- pub fn write_rows(args: &[Value]) -> Result<(), MagnusError> {
-     let ruby = unsafe { Ruby::get_unchecked() };
-     write_rows_impl(Rc::new(ruby), args).map_err(|e| {
-         let z: MagnusError = e.into();
-         z
-     })?;
-     Ok(())
- }
-
- #[inline]
- fn write_rows_impl(ruby: Rc<Ruby>, args: &[Value]) -> Result<(), ParquetGemError> {
-     let ParquetWriteArgs {
-         read_from,
-         write_to,
-         schema,
-         batch_size: user_batch_size,
-         compression,
-         flush_threshold,
-         sample_size: user_sample_size,
-         logger,
-     } = parse_parquet_write_args(&ruby, args)?;
-
-     let logger = RubyLogger::new(&ruby, logger)?;
-     let flush_threshold = flush_threshold.unwrap_or(DEFAULT_MEMORY_THRESHOLD);
-
-     // Get the Arrow schema from the SchemaNode (we only have DSL schema now, since legacy is converted)
-     let arrow_schema = build_arrow_schema(&schema, &logger).map_err(|e| {
-         MagnusError::new(
-             magnus::exception::runtime_error(),
-             format!("Failed to build Arrow schema from DSL schema: {}", e),
-         )
-     })?;
-
-     // Create the writer
-     let mut writer = create_writer(&ruby, &write_to, arrow_schema.clone(), compression)?;
-
-     if read_from.is_kind_of(ruby.class_enumerator()) {
-         // Build column collectors - we only have DSL schema now
-         let mut column_collectors =
-             build_column_collectors_from_dsl(&ruby, &arrow_schema, &schema)?;
-
-         let mut rows_in_batch = 0;
-         let mut total_rows = 0;
-         let mut rng = rand::rng();
-         let sample_size = user_sample_size.unwrap_or(SAMPLE_SIZE);
-         let mut size_samples = Vec::with_capacity(sample_size);
-         let mut current_batch_size = user_batch_size.unwrap_or(INITIAL_BATCH_SIZE);
-
-         loop {
-             match read_from.funcall::<_, _, Value>("next", ()) {
-                 Ok(row) => {
-                     // Process the row
-                     process_row(&ruby, row, &mut column_collectors)?;
-
-                     // Update row sampling for dynamic batch sizing
-                     if size_samples.len() < sample_size {
-                         // estimate row size
-                         let row_array = RArray::from_value(row).ok_or_else(|| {
-                             MagnusError::new(ruby.exception_type_error(), "Row must be an array")
-                         })?;
-                         let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
-                         size_samples.push(row_size);
-                     } else if rng.random_range(0..=total_rows) < sample_size {
-                         let idx = rng.random_range(0..sample_size);
-                         let row_array = RArray::from_value(row).ok_or_else(|| {
-                             MagnusError::new(ruby.exception_type_error(), "Row must be an array")
-                         })?;
-                         let row_size = estimate_single_row_size(&row_array, &column_collectors)?;
-                         size_samples[idx] = row_size;
-                     }
-
-                     rows_in_batch += 1;
-                     total_rows += 1;
-
-                     // Calculate batch size progressively once we have minimum samples
-                     if user_batch_size.is_none() && size_samples.len() >= MIN_SAMPLES_FOR_ESTIMATE {
-                         current_batch_size =
-                             update_batch_size(&size_samples, flush_threshold, MIN_BATCH_SIZE);
-                     }
-
-                     // When we reach batch size, write the batch
-                     if rows_in_batch >= current_batch_size {
-                         write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
-                         rows_in_batch = 0;
-                     }
-                 }
-                 Err(e) => {
-                     if e.is_kind_of(ruby.exception_stop_iteration()) {
-                         // Write any remaining rows
-                         if rows_in_batch > 0 {
-                             write_batch(&mut writer, &mut column_collectors, flush_threshold)?;
-                         }
-                         break;
-                     }
-                     Err(e)?;
-                 }
-             }
-         }
-     } else {
-         Err(MagnusError::new(
-             magnus::exception::type_error(),
-             "read_from must be an Enumerator".to_string(),
-         ))?;
-     }
-
-     // Ensure everything is written and get the temp file if it exists
-     if let Some(temp_file) = writer.close()? {
-         // If we got a temp file back, we need to copy its contents to the IO-like object
-         copy_temp_file_to_io_like(temp_file, IoLikeValue(write_to))?;
-     }
-
-     Ok(())
- }
-
- // Processes a single data row and adds values to the corresponding column collectors
- // This function is called for each row of input data when writing in row-wise mode.
- // It performs important validation to ensure the row structure matches the schema:
- // - Verifies that the number of columns in the row matches the schema
- // - Distributes each value to the appropriate ColumnCollector
- //
- // Each ColumnCollector handles type conversion and accumulation for its specific column,
- // allowing this function to focus on row-level validation and distribution.
- fn process_row(
-     ruby: &Ruby,
-     row: Value,
-     column_collectors: &mut [ColumnCollector],
- ) -> Result<(), MagnusError> {
-     let row_array = RArray::from_value(row)
-         .ok_or_else(|| MagnusError::new(ruby.exception_type_error(), "Row must be an array"))?;
-
-     // Validate row length matches schema
-     if row_array.len() != column_collectors.len() {
-         return Err(MagnusError::new(
-             magnus::exception::runtime_error(),
-             format!(
-                 "Row length ({}) does not match schema length ({}). Schema expects columns: {:?}",
-                 row_array.len(),
-                 column_collectors.len(),
-                 column_collectors
-                     .iter()
-                     .map(|c| c.name.as_str())
-                     .collect::<Vec<_>>()
-             ),
-         ));
-     }
-
-     // Process each value in the row
-     for (collector, value) in column_collectors.iter_mut().zip(row_array) {
-         collector.push_value(value)?;
-     }
-
-     Ok(())
- }
-
- // Converts all accumulated data from ColumnCollectors into an Arrow RecordBatch
- // and writes it to the Parquet file/output. This is a crucial function that bridges
- // between our Ruby-oriented data collectors and the Arrow/Parquet ecosystem.
- //
- // The function:
- // 1. Takes all collected values from each ColumnCollector and converts them to Arrow arrays
- // 2. Creates a RecordBatch from these arrays (column-oriented data format)
- // 3. Writes the batch to the ParquetWriter
- // 4. Flushes the writer if the accumulated memory exceeds the threshold
- //
- // This approach enables efficient batch-wise writing while controlling memory usage.
- fn write_batch(
-     writer: &mut WriterOutput,
-     collectors: &mut [ColumnCollector],
-     flush_threshold: usize,
- ) -> Result<(), ParquetGemError> {
-     // Convert columns to Arrow arrays
-     let arrow_arrays: Vec<(String, Arc<dyn Array>)> = collectors
-         .iter_mut()
-         .map(|c| {
-             let arr = c.take_array()?;
-             Ok((c.name.clone(), arr))
-         })
-         .collect::<Result<_, ParquetGemError>>()?;
-
-     let record_batch = RecordBatch::try_from_iter(arrow_arrays.clone()).map_err(|e| {
-         MagnusError::new(
-             magnus::exception::runtime_error(),
-             format!("Failed to create RecordBatch: {}", e),
-         )
-     })?;
-
-     writer.write(&record_batch)?;
-
-     // Check if we need to flush based on memory usage thresholds
-     match writer {
-         WriterOutput::File(w) | WriterOutput::TempFile(w, _) => {
-             if w.in_progress_size() >= flush_threshold || w.memory_size() >= flush_threshold {
-                 w.flush()?;
-             }
-         }
-     }
-     Ok(())
- }
-
- // Estimates the memory size of a single row by examining each value
- // This is used for dynamic batch sizing to optimize memory usage during writes
- // by adapting batch sizes based on the actual data being processed.
- pub fn estimate_single_row_size(
-     row_array: &RArray,
-     collectors: &[ColumnCollector],
- ) -> Result<usize, MagnusError> {
-     let mut size = 0;
-     for (idx, val) in row_array.into_iter().enumerate() {
-         let col_type = &collectors[idx].type_;
-         // Calculate size based on the type-specific estimation
-         size += estimate_value_size(val, col_type)?;
-     }
-     Ok(size)
- }
-
- // Estimates the memory footprint of a single value based on its schema type
- // This provides type-specific size estimates that help with dynamic batch sizing
- // For complex types like lists, maps, and structs, we use reasonable approximations
- pub fn estimate_value_size(
-     value: Value,
-     schema_type: &ParquetSchemaType,
- ) -> Result<usize, MagnusError> {
-     use ParquetSchemaType as PST;
-     if value.is_nil() {
-         return Ok(0); // nil => minimal
-     }
-     match schema_type {
-         PST::Primitive(PrimitiveType::Int8) | PST::Primitive(PrimitiveType::UInt8) => Ok(1),
-         PST::Primitive(PrimitiveType::Int16) | PST::Primitive(PrimitiveType::UInt16) => Ok(2),
-         PST::Primitive(PrimitiveType::Int32)
-         | PST::Primitive(PrimitiveType::UInt32)
-         | PST::Primitive(PrimitiveType::Float32) => Ok(4),
-         PST::Primitive(PrimitiveType::Int64)
-         | PST::Primitive(PrimitiveType::UInt64)
-         | PST::Primitive(PrimitiveType::Float64) => Ok(8),
-         PST::Primitive(PrimitiveType::Boolean) => Ok(1),
-         PST::Primitive(PrimitiveType::Decimal128(_, _)) => Ok(16),
-         PST::Primitive(PrimitiveType::Decimal256(_, _)) => Ok(32),
-         PST::Primitive(PrimitiveType::Date32) => Ok(4), // Date32 is 4 bytes
-         PST::Primitive(PrimitiveType::TimestampMillis)
-         | PST::Primitive(PrimitiveType::TimestampMicros) => Ok(8), // Timestamps are 8 bytes
-         PST::Primitive(PrimitiveType::TimeMillis) => Ok(4), // TimeMillis is 4 bytes
-         PST::Primitive(PrimitiveType::TimeMicros) => Ok(8), // TimeMicros is 8 bytes
-         PST::Primitive(PrimitiveType::String) | PST::Primitive(PrimitiveType::Binary) => {
-             if let Ok(s) = String::try_convert(value) {
-                 // Account for string length plus Rust String's capacity+pointer overhead
-                 Ok(s.len() + std::mem::size_of::<usize>() * 3)
-             } else {
-                 // Try to convert the value to a string using to_s for non-string types
-                 // This handles numeric values that will be converted to strings later
-                 match value.funcall::<_, _, Value>("to_s", ()) {
-                     Ok(str_val) => {
-                         if let Ok(s) = String::try_convert(str_val) {
-                             Ok(s.len() + std::mem::size_of::<usize>() * 3)
-                         } else {
-                             // If to_s conversion fails, just use a reasonable default
-                             Ok(8) // Reasonable size estimate for small values
-                         }
-                     }
-                     Err(_) => {
-                         // If to_s method fails, use a default size
-                         Ok(8) // Reasonable size estimate for small values
-                     }
-                 }
-             }
-         }
-         PST::List(item_type) => {
-             if let Ok(arr) = RArray::try_convert(value) {
-                 let len = arr.len();
-
-                 // Base overhead for the array structure (pointer, length, capacity)
-                 let base_size = std::mem::size_of::<usize>() * 3;
-
-                 // If empty, just return the base size
-                 if len == 0 {
-                     return Ok(base_size);
-                 }
-
-                 // Sample up to 5 elements to get average element size
-                 let sample_count = std::cmp::min(len, 5);
-                 let mut total_sample_size = 0;
-
-                 for i in 0..sample_count {
-                     let element = arr.entry(i as isize)?;
-                     let element_size = estimate_value_size(element, &item_type.item_type)?;
-                     total_sample_size += element_size;
-                 }
-
-                 // If we couldn't sample any elements properly, that's an error
-                 if sample_count > 0 && total_sample_size == 0 {
-                     return Err(MagnusError::new(
-                         magnus::exception::runtime_error(),
-                         "Failed to estimate size of list elements",
-                     ));
-                 }
-
-                 // Calculate average element size from samples
-                 let avg_element_size = if sample_count > 0 {
-                     total_sample_size as f64 / sample_count as f64
-                 } else {
-                     return Err(MagnusError::new(
-                         magnus::exception::runtime_error(),
-                         "Failed to sample list elements for size estimation",
-                     ));
-                 };
-
-                 // Estimate total size based on average element size * length + base overhead
-                 Ok(base_size + (avg_element_size as usize * len))
-             } else {
-                 // Instead of assuming it's a small list, return an error
-                 Err(MagnusError::new(
-                     magnus::exception::runtime_error(),
-                     format!("Expected array for List type but got: {:?}", value),
-                 ))
-             }
-         }
-         PST::Map(map_field) => {
-             if let Ok(hash) = RHash::try_convert(value) {
-                 let size_estimate = hash.funcall::<_, _, usize>("size", ())?;
-
-                 // Base overhead for the hash structure
-                 let base_size = std::mem::size_of::<usize>() * 4;
-
-                 // If empty, just return the base size
-                 if size_estimate == 0 {
-                     return Ok(base_size);
-                 }
-
-                 // Sample up to 5 key-value pairs to estimate average sizes
-                 let mut key_sample_size = 0;
-                 let mut value_sample_size = 0;
-                 let mut sample_count = 0;
-
-                 // Get an enumerator for the hash
-                 let enumerator = hash.funcall::<_, _, Value>("to_enum", ())?;
-
-                 // Sample up to 5 entries
-                 for _ in 0..std::cmp::min(size_estimate, 5) {
-                     match enumerator.funcall::<_, _, Value>("next", ()) {
-                         Ok(pair) => {
-                             if let Ok(pair_array) = RArray::try_convert(pair) {
-                                 if pair_array.len() == 2 {
-                                     let key = pair_array.entry(0)?;
-                                     let val = pair_array.entry(1)?;
-
-                                     key_sample_size +=
-                                         estimate_value_size(key, &map_field.key_type)?;
-                                     value_sample_size +=
-                                         estimate_value_size(val, &map_field.value_type)?;
-                                     sample_count += 1;
-                                 }
-                             }
-                         }
-                         Err(_) => break, // Stop if we reach the end
-                     }
-                 }
-
-                 // If we couldn't sample any pairs, return an error
-                 if size_estimate > 0 && sample_count == 0 {
-                     return Err(MagnusError::new(
-                         magnus::exception::runtime_error(),
-                         "Failed to sample map entries for size estimation",
-                     ));
-                 }
-
-                 // Calculate average key and value sizes
-                 let (avg_key_size, avg_value_size) = if sample_count > 0 {
-                     (
-                         key_sample_size as f64 / sample_count as f64,
-                         value_sample_size as f64 / sample_count as f64,
-                     )
-                 } else {
-                     return Err(MagnusError::new(
-                         magnus::exception::runtime_error(),
-                         "Failed to sample hash key-value pairs for size estimation",
-                     ));
-                 };
-
-                 // Each entry has overhead (node pointers, etc.) in a hash map
-                 let entry_overhead = std::mem::size_of::<usize>() * 2;
-
-                 // Estimate total size:
-                 // base size + (key_size + value_size + entry_overhead) * count
-                 Ok(base_size
-                     + ((avg_key_size + avg_value_size + entry_overhead as f64) as usize
-                         * size_estimate))
-             } else {
-                 // Instead of assuming a small map, return an error
-                 Err(MagnusError::new(
-                     magnus::exception::runtime_error(),
-                     format!("Expected hash for Map type but got: {:?}", value),
-                 ))
-             }
-         }
-         PST::Struct(struct_field) => {
-             if let Ok(hash) = RHash::try_convert(value) {
-                 // Base overhead for the struct
-                 let base_size = std::mem::size_of::<usize>() * 3;
-
-                 // Estimate size for each field
-                 let mut total_fields_size = 0;
-
-                 for field in &struct_field.fields {
-                     // Try to get the field value from the hash
-                     match hash.get(Symbol::new(&field.name)) {
-                         Some(field_value) => {
-                             total_fields_size += estimate_value_size(field_value, &field.type_)?;
-                         }
-                         None => {
-                             if let Some(field_value) = hash.get(&*field.name) {
-                                 total_fields_size +=
-                                     estimate_value_size(field_value, &field.type_)?;
-                             } else if field.nullable {
-                                 total_fields_size += 0;
-                             } else {
-                                 return Err(MagnusError::new(
-                                     magnus::exception::runtime_error(),
-                                     format!("Missing field: {} in hash {:?}", field.name, hash),
-                                 ));
-                             }
-                         }
-                     }
-                 }
-
-                 // We no longer error on missing fields during size estimation
-                 Ok(base_size + total_fields_size)
-             } else {
-                 // Instead of trying instance_variables or assuming a default, return an error
-                 Err(MagnusError::new(
-                     magnus::exception::runtime_error(),
-                     format!("Expected hash for Struct type but got: {:?}", value),
-                 ))
-             }
-         }
-     }
- }
-
- // Dynamically calculates an optimal batch size based on estimated row sizes
- // and memory constraints. This function enables the writer to adapt to different
- // data characteristics for optimal performance.
- //
- // The algorithm:
- // 1. Requires a minimum number of samples to make a reliable estimate
- // 2. Calculates the average row size from the samples
- // 3. Determines a batch size that would consume approximately the target memory threshold
- // 4. Ensures the batch size doesn't go below a minimum value for efficiency
- //
- // This approach balances memory usage with processing efficiency by targeting
- // a specific memory footprint per batch.
- fn update_batch_size(
-     size_samples: &[usize],
-     flush_threshold: usize,
-     min_batch_size: usize,
- ) -> usize {
-     if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
-         return min_batch_size;
-     }
-
-     let total_size = size_samples.iter().sum::<usize>();
-     // Safe because we know we have at least MIN_SAMPLES_FOR_ESTIMATE samples
-     let avg_row_size = total_size as f64 / size_samples.len() as f64;
-     let avg_row_size = avg_row_size.max(1.0); // Ensure we don't divide by zero
-     let suggested_batch_size = (flush_threshold as f64 / avg_row_size).floor() as usize;
-     suggested_batch_size.max(min_batch_size)
- }
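
For context on the sizing heuristic removed above: update_batch_size averages the sampled row sizes and targets roughly flush_threshold bytes per batch, never dropping below the minimum batch size. A minimal standalone sketch of that calculation follows; the function and variable names are illustrative only and are not part of the gem's API.

    // Illustrative sketch of the removed batch-size heuristic (not the gem's code).
    fn suggested_batch_size(size_samples: &[usize], flush_threshold: usize, min_batch_size: usize) -> usize {
        const MIN_SAMPLES_FOR_ESTIMATE: usize = 10;
        if size_samples.len() < MIN_SAMPLES_FOR_ESTIMATE {
            return min_batch_size;
        }
        // Average sampled row size, clamped to avoid division by zero.
        let avg_row_size =
            (size_samples.iter().sum::<usize>() as f64 / size_samples.len() as f64).max(1.0);
        // Batch size that keeps one batch near the flush threshold.
        ((flush_threshold as f64 / avg_row_size).floor() as usize).max(min_batch_size)
    }

    fn main() {
        // Ten sampled rows of ~64 bytes each against a 64 KiB threshold suggest ~1024-row batches.
        let samples = vec![64usize; 10];
        println!("{}", suggested_batch_size(&samples, 64 * 1024, 10));
    }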