kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,364 +0,0 @@
|
|
|
1
|
-
//! Dynamic pool sizing heuristics based on document complexity.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides functions to estimate optimal pool sizes based on file size
|
|
4
|
-
//! and document format (MIME type). By sizing pools to match actual document complexity,
|
|
5
|
-
//! we reduce memory waste from pre-allocated but unused capacity.
|
|
6
|
-
//!
|
|
7
|
-
//! # Sizing Strategy
|
|
8
|
-
//!
|
|
9
|
-
//! Pool size is determined by a combination of:
|
|
10
|
-
//! 1. **Format-specific ratio**: Extraction overhead varies by format
|
|
11
|
-
//! - PDF: 25% (binary, compression overhead)
|
|
12
|
-
//! - DOCX/XLSX/PPTX: 35-45% (compressed, XML-heavy)
|
|
13
|
-
//! - HTML: 65% (markup overhead)
|
|
14
|
-
//! - Text/Markdown: 95% (minimal overhead)
|
|
15
|
-
//! - Default: 50% (conservative)
|
|
16
|
-
//! 2. **File size scaling**: Larger documents benefit from more buffers
|
|
17
|
-
//! - Small (< 100KB): Base allocation
|
|
18
|
-
//! - Medium (100KB-1MB): +2 buffers
|
|
19
|
-
//! - Large (1MB-10MB): +4 buffers
|
|
20
|
-
//! - Huge (>10MB): +6 buffers
|
|
21
|
-
//!
|
|
22
|
-
//! # Example
|
|
23
|
-
//!
|
|
24
|
-
//! ```rust,ignore
|
|
25
|
-
//! use kreuzberg::utils::pool_sizing::estimate_pool_size;
|
|
26
|
-
//!
|
|
27
|
-
//! // 5MB PDF → pool sized at ~1.25MB (5MB * 0.25)
|
|
28
|
-
//! let hint = estimate_pool_size(5_000_000, "application/pdf");
|
|
29
|
-
//! assert_eq!(hint.estimated_total_size, 1_250_000);
|
|
30
|
-
//!
|
|
31
|
-
//! // 2MB HTML → pool sized at ~1.3MB (2MB * 0.65)
|
|
32
|
-
//! let hint = estimate_pool_size(2_000_000, "text/html");
|
|
33
|
-
//! assert_eq!(hint.estimated_total_size, 1_300_000);
|
|
34
|
-
//! ```
|
|
35
|
-
|
|
36
|
-
/// Hint for optimal pool sizing based on document characteristics.
///
/// This struct contains the estimated sizes for string and byte buffers
/// that should be allocated in the pool to handle extraction without
/// excessive reallocation.
#[derive(Debug, Clone, Copy)]
pub struct PoolSizeHint {
    /// Estimated total string buffer pool size in bytes
    pub estimated_total_size: usize,
    /// Recommended number of string buffers
    pub string_buffer_count: usize,
    /// Recommended capacity per string buffer in bytes
    pub string_buffer_capacity: usize,
    /// Recommended number of byte buffers
    pub byte_buffer_count: usize,
    /// Recommended capacity per byte buffer in bytes
    pub byte_buffer_capacity: usize,
}

impl PoolSizeHint {
    /// Calculate the estimated string pool memory in bytes.
    ///
    /// This is `string_buffer_count * string_buffer_capacity`. All fields are
    /// public, so a caller may build a hint with arbitrary values; the product
    /// therefore saturates at `usize::MAX` rather than panicking (debug builds)
    /// or silently wrapping (release builds) on overflow.
    #[inline]
    pub fn estimated_string_pool_memory(&self) -> usize {
        self.string_buffer_count
            .saturating_mul(self.string_buffer_capacity)
    }

    /// Calculate the estimated byte pool memory in bytes.
    ///
    /// This is `byte_buffer_count * byte_buffer_capacity`, saturating at
    /// `usize::MAX` on overflow.
    #[inline]
    pub fn estimated_byte_pool_memory(&self) -> usize {
        self.byte_buffer_count
            .saturating_mul(self.byte_buffer_capacity)
    }

    /// Calculate the total estimated pool memory in bytes.
    ///
    /// This is the sum of the string and byte buffer pool estimates,
    /// saturating at `usize::MAX` on overflow.
    #[inline]
    pub fn total_pool_memory(&self) -> usize {
        self.estimated_string_pool_memory()
            .saturating_add(self.estimated_byte_pool_memory())
    }
}
|
|
80
|
-
|
|
81
|
-
/// Get the format-specific extraction ratio.
///
/// This ratio represents the approximate size of extracted content
/// as a fraction of the original file size. Different formats have
/// different overhead due to compression, binary structures, markup, etc.
///
/// The lookup ignores MIME parameters (anything after the first `;`),
/// surrounding whitespace, and ASCII letter case, so
/// `"Text/HTML; charset=UTF-8"` resolves the same way as `"text/html"`.
///
/// # Arguments
///
/// * `mime_type` - The MIME type of the document (e.g., "application/pdf")
///
/// # Returns
///
/// A ratio between 0.0 and 1.0 representing the expected extraction ratio.
/// Unrecognised types yield the default of 0.50.
#[inline]
fn get_format_ratio(mime_type: &str) -> f64 {
    // Keep only the `type/subtype` part; parameters such as `charset` do not
    // affect which format-specific ratio applies.
    let essence = mime_type.split(';').next().unwrap_or(mime_type).trim();

    // Media type names are case-insensitive. Only allocate a lowercase copy
    // when the input actually contains uppercase letters.
    let lowered: String;
    let key: &str = if essence.bytes().any(|b| b.is_ascii_uppercase()) {
        lowered = essence.to_ascii_lowercase();
        &lowered
    } else {
        essence
    };

    match key {
        // Plain text formats: almost everything in the file is content.
        "text/plain" | "text/markdown" | "text/x-markdown" => 0.95,
        "text/csv" | "text/tab-separated-values" => 0.90,

        // Markup formats.
        "text/html" => 0.65,
        "application/xml" | "text/xml" => 0.60,
        "image/svg+xml" => 0.55,

        // Microsoft Office documents.
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        | "application/vnd.openxmlformats-officedocument.wordprocessingml.macro-enabled.document"
        | "application/msword" => 0.45,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        | "application/vnd.openxmlformats-officedocument.spreadsheetml.macro-enabled.sheet"
        | "application/vnd.ms-excel" => 0.40,
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        | "application/vnd.openxmlformats-officedocument.presentationml.macro-enabled.presentation"
        | "application/vnd.ms-powerpoint" => 0.35,

        // OpenDocument formats mirror their Office counterparts.
        "application/vnd.oasis.opendocument.text" => 0.45,
        "application/vnd.oasis.opendocument.spreadsheet" => 0.40,
        "application/vnd.oasis.opendocument.presentation" => 0.35,

        "application/pdf" => 0.25,

        // Structured data formats.
        "application/json" | "text/json" => 0.80,
        "application/x-yaml" | "text/yaml" | "text/x-yaml" | "application/yaml" => 0.85,

        // Archives.
        "application/zip" | "application/x-zip-compressed" => 0.30,
        "application/gzip" | "application/x-gzip" => 0.25,
        "application/x-rar-compressed" => 0.30,
        "application/x-7z-compressed" => 0.25,

        // Unknown format: fall back to the default ratio.
        _ => 0.50,
    }
}
|
|
131
|
-
|
|
132
|
-
/// Get base pool configuration for a format type.
///
/// The base configuration represents the minimum number of buffers
/// needed for typical documents of that format.
///
/// The lookup ignores MIME parameters (anything after the first `;`),
/// surrounding whitespace, and ASCII letter case, so
/// `"Text/HTML; charset=UTF-8"` resolves the same way as `"text/html"`.
///
/// # Arguments
///
/// * `mime_type` - The MIME type of the document
///
/// # Returns
///
/// A tuple of (base_buffer_count, base_buffer_capacity). Unrecognised types
/// yield the default of `(3, 8192)`.
#[inline]
fn get_format_base_config(mime_type: &str) -> (usize, usize) {
    // Keep only the `type/subtype` part; parameters such as `charset` do not
    // affect which base configuration applies.
    let essence = mime_type.split(';').next().unwrap_or(mime_type).trim();

    // Media type names are case-insensitive. Only allocate a lowercase copy
    // when the input actually contains uppercase letters.
    let lowered: String;
    let key: &str = if essence.bytes().any(|b| b.is_ascii_uppercase()) {
        lowered = essence.to_ascii_lowercase();
        &lowered
    } else {
        essence
    };

    match key {
        // Plain text formats.
        "text/plain" | "text/markdown" | "text/x-markdown" => (2, 4096),
        "text/csv" | "text/tab-separated-values" => (3, 8192),

        "text/html" => (8, 16384),

        // XML-based markup.
        "application/xml" | "text/xml" => (5, 8192),
        "image/svg+xml" => (4, 8192),

        // Microsoft Office documents.
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        | "application/vnd.openxmlformats-officedocument.wordprocessingml.macro-enabled.document"
        | "application/msword" => (5, 8192),
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        | "application/vnd.openxmlformats-officedocument.spreadsheetml.macro-enabled.sheet"
        | "application/vnd.ms-excel" => (4, 8192),
        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
        | "application/vnd.openxmlformats-officedocument.presentationml.macro-enabled.presentation"
        | "application/vnd.ms-powerpoint" => (4, 8192),

        // OpenDocument formats mirror their Office counterparts.
        "application/vnd.oasis.opendocument.text" => (5, 8192),
        "application/vnd.oasis.opendocument.spreadsheet" => (4, 8192),
        "application/vnd.oasis.opendocument.presentation" => (4, 8192),

        "application/pdf" => (6, 16384),

        // Structured data formats.
        "application/json" | "text/json" => (4, 8192),
        "application/x-yaml" | "text/yaml" | "text/x-yaml" | "application/yaml" => (4, 8192),

        // Everything else, including archive types, uses the default.
        _ => (3, 8192),
    }
}
|
|
177
|
-
|
|
178
|
-
/// Scale a format's base buffer count by the size of the document.
///
/// Bigger files receive a fixed number of additional buffers on top of the
/// base count:
///
/// | File size (bytes)        | Extra buffers |
/// |--------------------------|---------------|
/// | up to 100,000            | 0             |
/// | 100,001 to 1,000,000     | 2             |
/// | 1,000,001 to 10,000,000  | 4             |
/// | above 10,000,000         | 6             |
///
/// # Arguments
///
/// * `file_size` - Size of the file in bytes
/// * `base_count` - Base buffer count from format config
///
/// # Returns
///
/// The base count plus the size-dependent bonus, saturating at `usize::MAX`.
#[inline]
fn adjust_for_file_size(file_size: u64, base_count: usize) -> usize {
    let extra_buffers = if file_size <= 100_000 {
        0
    } else if file_size <= 1_000_000 {
        2
    } else if file_size <= 10_000_000 {
        4
    } else {
        6
    };

    base_count.saturating_add(extra_buffers)
}
|
|
200
|
-
|
|
201
|
-
/// Pick a per-buffer capacity appropriate for the size of the document.
///
/// The capacity grows in steps with the file size:
///
/// | File size (bytes)        | Capacity (bytes) |
/// |--------------------------|------------------|
/// | up to 10,000             | 1,024            |
/// | 10,001 to 100,000        | 4,096            |
/// | 100,001 to 1,000,000     | 16,384           |
/// | 1,000,001 to 10,000,000  | 65,536           |
/// | above 10,000,000         | 262,144          |
///
/// # Arguments
///
/// * `file_size` - Size of the file in bytes
///
/// # Returns
///
/// Recommended buffer capacity in bytes
#[inline]
fn estimate_buffer_capacity(file_size: u64) -> usize {
    // (inclusive upper bound on file size, capacity) in ascending order.
    const TIERS: [(u64, usize); 4] = [
        (10_000, 1024),
        (100_000, 4096),
        (1_000_000, 16384),
        (10_000_000, 65536),
    ];
    // Capacity used once the file exceeds every tier above.
    const LARGEST_CAPACITY: usize = 262144;

    TIERS
        .iter()
        .find(|&&(upper_bound, _)| file_size <= upper_bound)
        .map_or(LARGEST_CAPACITY, |&(_, capacity)| capacity)
}
|
|
223
|
-
|
|
224
|
-
/// Estimate optimal pool sizing based on file size and document type.
|
|
225
|
-
///
|
|
226
|
-
/// This function uses the file size and MIME type to estimate how many
|
|
227
|
-
/// buffers and what capacity they should have. The estimates are conservative
|
|
228
|
-
/// to avoid starving large document processing.
|
|
229
|
-
///
|
|
230
|
-
/// # Arguments
|
|
231
|
-
///
|
|
232
|
-
/// * `file_size` - Size of the file in bytes
|
|
233
|
-
/// * `mime_type` - MIME type of the document (e.g., "application/pdf")
|
|
234
|
-
///
|
|
235
|
-
/// # Returns
|
|
236
|
-
///
|
|
237
|
-
/// A `PoolSizeHint` with recommended pool configuration
|
|
238
|
-
///
|
|
239
|
-
/// # Example
|
|
240
|
-
///
|
|
241
|
-
/// ```rust,ignore
|
|
242
|
-
/// use kreuzberg::utils::pool_sizing::estimate_pool_size;
|
|
243
|
-
///
|
|
244
|
-
/// let hint = estimate_pool_size(5_000_000, "application/pdf");
|
|
245
|
-
/// // PDF at 5MB gets 10 string buffers (base 6 + 4 for size)
|
|
246
|
-
/// // of 65KB each (for 1-10MB files)
|
|
247
|
-
/// ```
|
|
248
|
-
#[inline]
|
|
249
|
-
pub fn estimate_pool_size(file_size: u64, mime_type: &str) -> PoolSizeHint {
|
|
250
|
-
let format_ratio = get_format_ratio(mime_type);
|
|
251
|
-
let (base_count, _base_capacity) = get_format_base_config(mime_type);
|
|
252
|
-
|
|
253
|
-
let adjusted_string_buffer_count = adjust_for_file_size(file_size, base_count);
|
|
254
|
-
|
|
255
|
-
let buffer_capacity = estimate_buffer_capacity(file_size);
|
|
256
|
-
|
|
257
|
-
let estimated_total_size = (file_size as f64 * format_ratio).ceil() as usize;
|
|
258
|
-
|
|
259
|
-
let byte_buffer_count = (adjusted_string_buffer_count / 2).max(1);
|
|
260
|
-
let byte_buffer_capacity = buffer_capacity * 8;
|
|
261
|
-
|
|
262
|
-
PoolSizeHint {
|
|
263
|
-
estimated_total_size,
|
|
264
|
-
string_buffer_count: adjusted_string_buffer_count,
|
|
265
|
-
string_buffer_capacity: buffer_capacity,
|
|
266
|
-
byte_buffer_count,
|
|
267
|
-
byte_buffer_capacity,
|
|
268
|
-
}
|
|
269
|
-
}
|
|
270
|
-
|
|
271
|
-
#[cfg(test)]
mod tests {
    use super::*;

    // MIME types shared by several tests below.
    const PDF: &str = "application/pdf";
    const HTML: &str = "text/html";
    const DOCX: &str = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
    const XLSX: &str = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";

    /// PDF uses the 25% extraction ratio.
    #[test]
    fn test_format_ratio_pdf() {
        assert_eq!(get_format_ratio(PDF), 0.25);
    }

    /// HTML uses the 65% extraction ratio.
    #[test]
    fn test_format_ratio_html() {
        assert_eq!(get_format_ratio(HTML), 0.65);
    }

    /// DOCX uses the 45% extraction ratio.
    #[test]
    fn test_format_ratio_docx() {
        assert_eq!(get_format_ratio(DOCX), 0.45);
    }

    /// Unknown MIME types fall back to the 50% default.
    #[test]
    fn test_format_ratio_default() {
        assert_eq!(get_format_ratio("application/unknown-format"), 0.50);
    }

    /// Files up to 100KB keep the base count and get the smallest buffers.
    #[test]
    fn test_small_file_sizing() {
        let PoolSizeHint {
            string_buffer_count,
            string_buffer_capacity,
            ..
        } = estimate_pool_size(5_000, PDF);
        assert_eq!(string_buffer_count, 6);
        assert_eq!(string_buffer_capacity, 1024);
    }

    /// Files between 100KB and 1MB get two extra buffers.
    #[test]
    fn test_medium_file_sizing() {
        let PoolSizeHint {
            string_buffer_count,
            string_buffer_capacity,
            ..
        } = estimate_pool_size(500_000, PDF);
        assert_eq!(string_buffer_count, 8);
        assert_eq!(string_buffer_capacity, 16384);
    }

    /// Files between 1MB and 10MB get four extra buffers.
    #[test]
    fn test_large_file_sizing() {
        let PoolSizeHint {
            string_buffer_count,
            string_buffer_capacity,
            ..
        } = estimate_pool_size(5_000_000, PDF);
        assert_eq!(string_buffer_count, 10);
        assert_eq!(string_buffer_capacity, 65536);
    }

    /// Files above 10MB get six extra buffers and the largest capacity.
    #[test]
    fn test_huge_file_sizing() {
        let PoolSizeHint {
            string_buffer_count,
            string_buffer_capacity,
            ..
        } = estimate_pool_size(50_000_000, PDF);
        assert_eq!(string_buffer_count, 12);
        assert_eq!(string_buffer_capacity, 262144);
    }

    /// A 1MB HTML document: base 8 + 2 buffers, 65% of the size in total.
    #[test]
    fn test_html_sizing() {
        let PoolSizeHint {
            estimated_total_size,
            string_buffer_count,
            string_buffer_capacity,
            ..
        } = estimate_pool_size(1_000_000, HTML);
        assert_eq!(string_buffer_count, 10);
        assert_eq!(string_buffer_capacity, 16384);
        assert_eq!(estimated_total_size, 650_000);
    }

    /// A 1MB plain-text document: base 2 + 2 buffers, 95% of the size in total.
    #[test]
    fn test_text_sizing() {
        let PoolSizeHint {
            estimated_total_size,
            string_buffer_count,
            ..
        } = estimate_pool_size(1_000_000, "text/plain");
        assert_eq!(string_buffer_count, 4);
        assert_eq!(estimated_total_size, 950_000);
    }

    /// The byte pool has fewer buffers, each eight times the string capacity.
    #[test]
    fn test_byte_buffer_sizing() {
        let hint = estimate_pool_size(5_000_000, PDF);
        assert!(hint.byte_buffer_count < hint.string_buffer_count);
        assert_eq!(hint.byte_buffer_capacity, hint.string_buffer_capacity * 8);
    }

    /// The total size estimate is the file size times the format ratio.
    #[test]
    fn test_total_size_estimation() {
        let total = estimate_pool_size(10_000_000, PDF).estimated_total_size;
        assert_eq!(total, 2_500_000);
    }

    /// A 2MB XLSX document: 40% of the size in total, base 4 + 4 buffers.
    #[test]
    fn test_xlsx_sizing() {
        let PoolSizeHint {
            estimated_total_size,
            string_buffer_count,
            ..
        } = estimate_pool_size(2_000_000, XLSX);
        assert_eq!(estimated_total_size, 800_000);
        assert_eq!(string_buffer_count, 8);
    }
}
|