kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
//! Regex patterns for quality detection
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains all regex patterns used for detecting OCR artifacts,
|
|
4
|
+
//! script content, navigation elements, and text structure.
|
|
5
|
+
|
|
6
|
+
use once_cell::sync::Lazy;
|
|
7
|
+
use regex::Regex;
|
|
8
|
+
|
|
9
|
+
// ============================================================================
|
|
10
|
+
// OCR Artifact Patterns
|
|
11
|
+
// ============================================================================
|
|
12
|
+
|
|
13
|
+
/// Detects scattered characters with excessive spacing (e.g., "a b c")
|
|
14
|
+
pub(crate) static SCATTERED_CHARS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
15
|
+
Regex::new(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b")
|
|
16
|
+
.expect("Scattered chars regex pattern is valid and should compile")
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
/// Detects repeated punctuation marks (3 or more dots or underscores)
|
|
20
|
+
pub(crate) static REPEATED_PUNCT_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
21
|
+
Regex::new(r"[.]{3,}|[_]{3,}").expect("Repeated punctuation regex pattern is valid and should compile")
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
/// Detects repeated dashes (3 or more)
|
|
25
|
+
pub(crate) static DASH_PATTERN: Lazy<Regex> =
|
|
26
|
+
Lazy::new(|| Regex::new(r"[-]{3,}").expect("Dash pattern regex is valid and should compile"));
|
|
27
|
+
|
|
28
|
+
/// Detects isolated punctuation surrounded by spaces
|
|
29
|
+
pub(crate) static ISOLATED_PUNCT_PATTERN: Lazy<Regex> =
|
|
30
|
+
Lazy::new(|| Regex::new(r"\s[.,;:!?]\s").expect("Isolated punctuation regex pattern is valid and should compile"));
|
|
31
|
+
|
|
32
|
+
/// Detects malformed words with mixed letters and numbers
|
|
33
|
+
pub(crate) static MALFORMED_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
34
|
+
Regex::new(r"\b[a-zA-Z]+[0-9]+[a-zA-Z]+[a-zA-Z0-9]*\b")
|
|
35
|
+
.expect("Malformed words regex pattern is valid and should compile")
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
/// Detects excessive whitespace (3 or more spaces)
|
|
39
|
+
pub(crate) static EXCESSIVE_WHITESPACE_PATTERN: Lazy<Regex> =
|
|
40
|
+
Lazy::new(|| Regex::new(r"\s{3,}").expect("Excessive whitespace regex pattern is valid and should compile"));
|
|
41
|
+
|
|
42
|
+
// ============================================================================
|
|
43
|
+
// Script and Code Patterns
|
|
44
|
+
// ============================================================================
|
|
45
|
+
|
|
46
|
+
/// Detects JavaScript function declarations
|
|
47
|
+
pub(crate) static JS_FUNCTION_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
48
|
+
Regex::new(r"(?i)function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}")
|
|
49
|
+
.expect("JavaScript function regex pattern is valid and should compile")
|
|
50
|
+
});
|
|
51
|
+
|
|
52
|
+
/// Detects CSS rules
|
|
53
|
+
pub(crate) static CSS_RULES_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
54
|
+
Regex::new(r"(?i)\.[a-zA-Z][\w-]*\s*\{[^}]*\}").expect("CSS rules regex pattern is valid and should compile")
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
/// Detects HTML script tags
|
|
58
|
+
pub(crate) static SCRIPT_TAG_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
59
|
+
Regex::new(r"(?is)<script[^>]*>.*?</script>").expect("Script tag regex pattern is valid and should compile")
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
/// Detects HTML style tags
|
|
63
|
+
pub(crate) static STYLE_TAG_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
64
|
+
Regex::new(r"(?is)<style[^>]*>.*?</style>").expect("Style tag regex pattern is valid and should compile")
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
// ============================================================================
|
|
68
|
+
// Navigation Element Patterns
|
|
69
|
+
// ============================================================================
|
|
70
|
+
|
|
71
|
+
/// Detects common navigation words and phrases
|
|
72
|
+
pub(crate) static NAV_WORDS_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
73
|
+
Regex::new(r"(?i)\b(?:Skip to main content|Back to top|Main navigation|Site navigation)\b")
|
|
74
|
+
.expect("Navigation words regex pattern is valid and should compile")
|
|
75
|
+
});
|
|
76
|
+
|
|
77
|
+
/// Detects breadcrumb navigation patterns
|
|
78
|
+
pub(crate) static BREADCRUMB_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
79
|
+
Regex::new(r"(?:Home\s*[>»]\s*|[>»]\s*){2,}").expect("Breadcrumb regex pattern is valid and should compile")
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
/// Detects pagination text
|
|
83
|
+
pub(crate) static PAGINATION_PATTERN: Lazy<Regex> = Lazy::new(|| {
|
|
84
|
+
Regex::new(r"(?i)\b(?:Page \d+ of \d+|First page|Last page|Previous page|Next page|^\d+ of \d+$)\b")
|
|
85
|
+
.expect("Pagination regex pattern is valid and should compile")
|
|
86
|
+
});
|
|
87
|
+
|
|
88
|
+
// ============================================================================
|
|
89
|
+
// Text Structure Patterns
|
|
90
|
+
// ============================================================================
|
|
91
|
+
|
|
92
|
+
/// Detects sentence boundaries
|
|
93
|
+
pub(crate) static SENTENCE_DETECT: Lazy<Regex> =
|
|
94
|
+
Lazy::new(|| Regex::new(r"[.!?]\s+[A-Z]").expect("Sentence detection regex pattern is valid and should compile"));
|
|
95
|
+
|
|
96
|
+
/// Detects punctuation marks
|
|
97
|
+
pub(crate) static PUNCTUATION_DETECT: Lazy<Regex> =
|
|
98
|
+
Lazy::new(|| Regex::new(r"[.!?]").expect("Punctuation detection regex pattern is valid and should compile"));
|
|
99
|
+
|
|
100
|
+
// ============================================================================
|
|
101
|
+
// Whitespace Normalization Patterns
|
|
102
|
+
// ============================================================================
|
|
103
|
+
|
|
104
|
+
/// Normalizes various types of whitespace characters
|
|
105
|
+
pub(crate) static WHITESPACE_NORMALIZE: Lazy<Regex> = Lazy::new(|| {
|
|
106
|
+
Regex::new(r"[ \t\f\v\r\xa0\u{2000}-\u{200b}\u{2028}\u{2029}\u{3000}]+")
|
|
107
|
+
.expect("Whitespace normalization regex pattern is valid and should compile")
|
|
108
|
+
});
|
|
109
|
+
|
|
110
|
+
/// Normalizes multiple consecutive newlines
|
|
111
|
+
pub(crate) static NEWLINE_NORMALIZE: Lazy<Regex> = Lazy::new(|| {
|
|
112
|
+
Regex::new(r"\n\s*\n\s*\n+").expect("Newline normalization regex pattern is valid and should compile")
|
|
113
|
+
});
|
|
114
|
+
|
|
115
|
+
/// Cleans up newline sequences
|
|
116
|
+
pub(crate) static NEWLINE_CLEANUP: Lazy<Regex> =
|
|
117
|
+
Lazy::new(|| Regex::new(r"\n+").expect("Newline cleanup regex pattern is valid and should compile"));
|
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
//! Quality scoring functions
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functions to calculate quality scores and penalties
|
|
4
|
+
//! based on various text characteristics.
|
|
5
|
+
|
|
6
|
+
use super::patterns::*;
|
|
7
|
+
use ahash::AHashMap;
|
|
8
|
+
use regex::Regex;
|
|
9
|
+
|
|
10
|
+
// ============================================================================
|
|
11
|
+
// Scoring Constants and Weights
|
|
12
|
+
// ============================================================================
|
|
13
|
+
|
|
14
|
+
/// Multiplier applied to the OCR-artifact penalty before it is subtracted from the score.
pub(crate) const OCR_PENALTY_WEIGHT: f64 = 0.3;
/// Multiplier applied to the script/style penalty before it is subtracted from the score.
pub(crate) const SCRIPT_PENALTY_WEIGHT: f64 = 0.2;
/// Multiplier applied to the navigation penalty before it is subtracted from the score.
pub(crate) const NAV_PENALTY_WEIGHT: f64 = 0.1;
/// Multiplier applied to the structure bonus before it is added to the score.
pub(crate) const STRUCTURE_BONUS_WEIGHT: f64 = 0.2;
/// Multiplier applied to the metadata bonus before it is added to the score.
pub(crate) const METADATA_BONUS_WEIGHT: f64 = 0.1;

/// Texts shorter than this many bytes receive a fixed score from `calculate_quality_score`.
pub(crate) const MIN_TEXT_LENGTH: usize = 10;
/// Texts longer than this many bytes are additionally checked for script and navigation content.
pub(crate) const LARGE_TEXT_LENGTH: usize = 1000;
|
|
22
|
+
|
|
23
|
+
// ============================================================================
|
|
24
|
+
// Helper Functions
|
|
25
|
+
// ============================================================================
|
|
26
|
+
|
|
27
|
+
/// Sums the total length of all regex matches in the text
|
|
28
|
+
#[inline]
|
|
29
|
+
pub(crate) fn sum_match_lengths(text: &str, pattern: &Regex) -> usize {
|
|
30
|
+
pattern.find_iter(text).map(|m| m.len()).sum()
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// ============================================================================
|
|
34
|
+
// Penalty Calculation Functions
|
|
35
|
+
// ============================================================================
|
|
36
|
+
|
|
37
|
+
/// Calculate penalty based on OCR artifacts in the text
|
|
38
|
+
#[inline]
|
|
39
|
+
pub(crate) fn calculate_ocr_penalty(text: &str, total_chars: f64) -> f64 {
|
|
40
|
+
if total_chars == 0.0 {
|
|
41
|
+
return 0.0;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
if !text.contains(" ") && !text.contains("...") {
|
|
45
|
+
return 0.0;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
let artifact_chars = sum_match_lengths(text, &SCATTERED_CHARS_PATTERN)
|
|
49
|
+
+ sum_match_lengths(text, &REPEATED_PUNCT_PATTERN)
|
|
50
|
+
+ count_non_table_dash_artifacts(text)
|
|
51
|
+
+ sum_match_lengths(text, &ISOLATED_PUNCT_PATTERN)
|
|
52
|
+
+ sum_match_lengths(text, &MALFORMED_WORDS_PATTERN)
|
|
53
|
+
+ sum_match_lengths(text, &EXCESSIVE_WHITESPACE_PATTERN);
|
|
54
|
+
|
|
55
|
+
(artifact_chars as f64 / total_chars).min(1.0)
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/// Count dash artifacts while preserving table separators
|
|
59
|
+
#[inline]
|
|
60
|
+
pub(crate) fn count_non_table_dash_artifacts(text: &str) -> usize {
|
|
61
|
+
let mut artifact_count = 0;
|
|
62
|
+
|
|
63
|
+
for line in text.lines() {
|
|
64
|
+
let trimmed = line.trim();
|
|
65
|
+
let is_table_separator = trimmed.starts_with('|')
|
|
66
|
+
&& trimmed.ends_with('|')
|
|
67
|
+
&& trimmed
|
|
68
|
+
.chars()
|
|
69
|
+
.all(|c| c == '|' || c == '-' || c.is_whitespace() || c == ':');
|
|
70
|
+
|
|
71
|
+
if !is_table_separator {
|
|
72
|
+
for m in DASH_PATTERN.find_iter(line) {
|
|
73
|
+
artifact_count += m.len();
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
artifact_count
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Calculate penalty based on embedded scripts and code
|
|
82
|
+
#[inline]
|
|
83
|
+
pub(crate) fn calculate_script_penalty(text: &str, total_chars: f64) -> f64 {
|
|
84
|
+
if total_chars == 0.0 {
|
|
85
|
+
return 0.0;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
if !text.contains("function") && !text.contains("<script") && !text.contains("<style") {
|
|
89
|
+
return 0.0;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
let script_chars = sum_match_lengths(text, &JS_FUNCTION_PATTERN)
|
|
93
|
+
+ sum_match_lengths(text, &CSS_RULES_PATTERN)
|
|
94
|
+
+ sum_match_lengths(text, &SCRIPT_TAG_PATTERN)
|
|
95
|
+
+ sum_match_lengths(text, &STYLE_TAG_PATTERN);
|
|
96
|
+
|
|
97
|
+
(script_chars as f64 / total_chars).min(1.0)
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
/// Calculate penalty based on navigation elements
|
|
101
|
+
#[inline]
|
|
102
|
+
pub(crate) fn calculate_navigation_penalty(text: &str, total_chars: f64) -> f64 {
|
|
103
|
+
if total_chars == 0.0 {
|
|
104
|
+
return 0.0;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
let nav_chars = sum_match_lengths(text, &NAV_WORDS_PATTERN)
|
|
108
|
+
+ sum_match_lengths(text, &BREADCRUMB_PATTERN)
|
|
109
|
+
+ sum_match_lengths(text, &PAGINATION_PATTERN);
|
|
110
|
+
|
|
111
|
+
(nav_chars as f64 / total_chars).min(1.0)
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
// ============================================================================
|
|
115
|
+
// Bonus Calculation Functions
|
|
116
|
+
// ============================================================================
|
|
117
|
+
|
|
118
|
+
/// Calculate bonus based on document metadata quality
|
|
119
|
+
#[inline]
|
|
120
|
+
pub(crate) fn calculate_metadata_bonus(metadata: &AHashMap<String, String>) -> f64 {
|
|
121
|
+
const IMPORTANT_FIELDS: &[&str] = &["title", "author", "subject", "description", "keywords"];
|
|
122
|
+
|
|
123
|
+
let present_fields = IMPORTANT_FIELDS
|
|
124
|
+
.iter()
|
|
125
|
+
.filter(|&&field| metadata.contains_key(field))
|
|
126
|
+
.count();
|
|
127
|
+
|
|
128
|
+
present_fields as f64 / IMPORTANT_FIELDS.len() as f64
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/// Compute a heuristic score (0.0–1.0) describing how clean the extracted text is.
|
|
132
|
+
///
|
|
133
|
+
/// The scoring pipeline rewards well-structured prose while penalising OCR artefacts,
|
|
134
|
+
/// embedded scripts, and navigation chrome. Supplying document metadata allows the
|
|
135
|
+
/// function to include contextual bonuses.
|
|
136
|
+
///
|
|
137
|
+
/// ```rust
|
|
138
|
+
/// use ahash::AHashMap;
|
|
139
|
+
/// use kreuzberg::utils::quality::calculate_quality_score;
|
|
140
|
+
///
|
|
141
|
+
/// let text = "Executive Summary\n===================\nKreuzberg extracts documents quickly.";
|
|
142
|
+
/// let score = calculate_quality_score(text, None);
|
|
143
|
+
/// assert!(score > 0.7);
|
|
144
|
+
/// ```
|
|
145
|
+
pub fn calculate_quality_score(text: &str, metadata: Option<&AHashMap<String, String>>) -> f64 {
|
|
146
|
+
if text.is_empty() || text.trim().is_empty() {
|
|
147
|
+
return 0.0;
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
let total_chars = text.len() as f64;
|
|
151
|
+
|
|
152
|
+
if text.len() < MIN_TEXT_LENGTH {
|
|
153
|
+
return 0.1;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
let mut score = 1.0;
|
|
157
|
+
|
|
158
|
+
if text.len() > LARGE_TEXT_LENGTH {
|
|
159
|
+
let ocr_penalty = calculate_ocr_penalty(text, total_chars);
|
|
160
|
+
let script_penalty = calculate_script_penalty(text, total_chars);
|
|
161
|
+
let nav_penalty = calculate_navigation_penalty(text, total_chars);
|
|
162
|
+
let structure_bonus = super::heuristics::calculate_structure_bonus(text);
|
|
163
|
+
|
|
164
|
+
score -= ocr_penalty * OCR_PENALTY_WEIGHT;
|
|
165
|
+
score -= script_penalty * SCRIPT_PENALTY_WEIGHT;
|
|
166
|
+
score -= nav_penalty * NAV_PENALTY_WEIGHT;
|
|
167
|
+
score += structure_bonus * STRUCTURE_BONUS_WEIGHT;
|
|
168
|
+
} else {
|
|
169
|
+
score -= calculate_ocr_penalty(text, total_chars) * OCR_PENALTY_WEIGHT;
|
|
170
|
+
score += super::heuristics::calculate_structure_bonus(text) * STRUCTURE_BONUS_WEIGHT;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
if let Some(metadata) = metadata {
|
|
174
|
+
score += calculate_metadata_bonus(metadata) * METADATA_BONUS_WEIGHT;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
score.clamp(0.0, 1.0)
|
|
178
|
+
}
|
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
//! Thread-safe reusable string buffer pool for reducing allocations.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a pool of reusable String buffers that can be acquired,
|
|
4
|
+
//! used, and automatically returned to the pool when dropped.
|
|
5
|
+
|
|
6
|
+
use once_cell::sync::Lazy;
|
|
7
|
+
use std::collections::VecDeque;
|
|
8
|
+
use std::sync::Arc;
|
|
9
|
+
|
|
10
|
+
#[cfg(feature = "pool-metrics")]
|
|
11
|
+
use std::sync::atomic::AtomicUsize;
|
|
12
|
+
|
|
13
|
+
#[cfg(feature = "pool-metrics")]
|
|
14
|
+
use std::sync::atomic::Ordering;
|
|
15
|
+
|
|
16
|
+
/// Configuration for the string buffer pool.
///
/// Derives `Debug` and `Clone` so a configuration can be logged and reused for
/// more than one pool.
#[derive(Debug, Clone)]
pub struct PoolConfig {
    /// Maximum buffers per size bucket
    pub max_buffers_per_size: usize,
    /// Initial capacity (in bytes) for new buffers
    pub initial_capacity: usize,
    /// Maximum capacity (in bytes) before discarding
    pub max_capacity_before_discard: usize,
}

impl Default for PoolConfig {
    /// Defaults: 4 buffers per bucket, 4 KiB initial capacity, 64 KiB discard threshold.
    fn default() -> Self {
        Self {
            max_buffers_per_size: 4,
            initial_capacity: 4096,
            max_capacity_before_discard: 65536,
        }
    }
}
|
|
35
|
+
|
|
36
|
+
/// Thread-safe reusable string buffer pool.
|
|
37
|
+
///
|
|
38
|
+
/// This pool allows allocation and reuse of String buffers to reduce memory allocations
|
|
39
|
+
/// during document extraction. Buffers are returned to the pool with cleared contents
|
|
40
|
+
/// but preserved capacity, ready for reuse.
|
|
41
|
+
///
|
|
42
|
+
/// # Thread Safety
|
|
43
|
+
///
|
|
44
|
+
/// The pool uses DashMap for lock-free concurrent access. Multiple threads can
|
|
45
|
+
/// acquire and release buffers simultaneously.
|
|
46
|
+
///
|
|
47
|
+
/// # Usage
|
|
48
|
+
///
|
|
49
|
+
/// ```rust,ignore
|
|
50
|
+
/// use kreuzberg::utils::string_pool::STRING_BUFFER_POOL;
|
|
51
|
+
///
|
|
52
|
+
/// // Acquire a buffer from the pool
|
|
53
|
+
/// let mut buffer = STRING_BUFFER_POOL.acquire();
|
|
54
|
+
/// buffer.push_str("some content");
|
|
55
|
+
/// // Automatically returned to pool when dropped
|
|
56
|
+
/// drop(buffer);
|
|
57
|
+
/// ```
|
|
58
|
+
pub struct StringBufferPool {
|
|
59
|
+
pool: dashmap::DashMap<usize, VecDeque<String>>,
|
|
60
|
+
config: PoolConfig,
|
|
61
|
+
#[cfg(feature = "pool-metrics")]
|
|
62
|
+
acquire_count: AtomicUsize,
|
|
63
|
+
#[cfg(feature = "pool-metrics")]
|
|
64
|
+
reuse_count: AtomicUsize,
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
impl StringBufferPool {
|
|
68
|
+
/// Create a new string buffer pool with given configuration.
|
|
69
|
+
pub fn new(config: PoolConfig) -> Self {
|
|
70
|
+
StringBufferPool {
|
|
71
|
+
pool: dashmap::DashMap::new(),
|
|
72
|
+
config,
|
|
73
|
+
#[cfg(feature = "pool-metrics")]
|
|
74
|
+
acquire_count: AtomicUsize::new(0),
|
|
75
|
+
#[cfg(feature = "pool-metrics")]
|
|
76
|
+
reuse_count: AtomicUsize::new(0),
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/// Find the appropriate bucket size for a given capacity.
|
|
81
|
+
fn find_bucket(&self, capacity: usize) -> usize {
|
|
82
|
+
if capacity <= 1024 {
|
|
83
|
+
1024
|
|
84
|
+
} else if capacity <= 4096 {
|
|
85
|
+
4096
|
|
86
|
+
} else if capacity <= 16384 {
|
|
87
|
+
16384
|
|
88
|
+
} else if capacity <= 65536 {
|
|
89
|
+
65536
|
|
90
|
+
} else {
|
|
91
|
+
262144
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/// Try to acquire a buffer from a specific bucket, returning it if found.
|
|
96
|
+
fn try_acquire_from_bucket(&self, bucket: usize) -> Option<String> {
|
|
97
|
+
if let Some(mut entry) = self.pool.get_mut(&bucket) {
|
|
98
|
+
entry.pop_front()
|
|
99
|
+
} else {
|
|
100
|
+
None
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
/// Acquire a string buffer from the pool, or allocate a new one if pool is exhausted.
|
|
105
|
+
///
|
|
106
|
+
/// The returned buffer is automatically returned to the pool when dropped.
|
|
107
|
+
/// Must be called with the pool wrapped in Arc.
|
|
108
|
+
pub fn acquire(self: Arc<Self>) -> PooledString {
|
|
109
|
+
#[cfg(feature = "pool-metrics")]
|
|
110
|
+
self.acquire_count.fetch_add(1, Ordering::Relaxed);
|
|
111
|
+
|
|
112
|
+
let default_bucket = self.config.initial_capacity;
|
|
113
|
+
if let Some(buffer) = self.try_acquire_from_bucket(default_bucket) {
|
|
114
|
+
#[cfg(feature = "pool-metrics")]
|
|
115
|
+
self.reuse_count.fetch_add(1, Ordering::Relaxed);
|
|
116
|
+
return PooledString { buffer, pool: self };
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
for &bucket in &[1024, 16384, 65536] {
|
|
120
|
+
if let Some(buffer) = self.try_acquire_from_bucket(bucket) {
|
|
121
|
+
#[cfg(feature = "pool-metrics")]
|
|
122
|
+
self.reuse_count.fetch_add(1, Ordering::Relaxed);
|
|
123
|
+
return PooledString { buffer, pool: self };
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
PooledString {
|
|
128
|
+
buffer: String::with_capacity(self.config.initial_capacity),
|
|
129
|
+
pool: self,
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/// Return a buffer to the pool for reuse.
|
|
134
|
+
pub fn release(&self, mut buffer: String) {
|
|
135
|
+
if buffer.capacity() > self.config.max_capacity_before_discard {
|
|
136
|
+
return;
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
let bucket = self.find_bucket(buffer.capacity());
|
|
140
|
+
buffer.clear();
|
|
141
|
+
|
|
142
|
+
if let Some(mut queue) = self.pool.get_mut(&bucket) {
|
|
143
|
+
if queue.len() < self.config.max_buffers_per_size {
|
|
144
|
+
queue.push_back(buffer);
|
|
145
|
+
}
|
|
146
|
+
} else {
|
|
147
|
+
let mut queue = VecDeque::with_capacity(self.config.max_buffers_per_size);
|
|
148
|
+
queue.push_back(buffer);
|
|
149
|
+
self.pool.insert(bucket, queue);
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
/// Get the current pool size across all buckets.
|
|
154
|
+
#[allow(dead_code)]
|
|
155
|
+
pub fn size(&self) -> usize {
|
|
156
|
+
self.pool.iter().map(|entry| entry.value().len()).sum()
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
/// Get buffer reuse metrics (only available with `pool-metrics` feature).
|
|
160
|
+
#[cfg(feature = "pool-metrics")]
|
|
161
|
+
pub fn metrics(&self) -> StringBufferPoolMetrics {
|
|
162
|
+
let acquire = self.acquire_count.load(Ordering::Relaxed);
|
|
163
|
+
let reuse = self.reuse_count.load(Ordering::Relaxed);
|
|
164
|
+
let hit_rate = if acquire == 0 {
|
|
165
|
+
0.0
|
|
166
|
+
} else {
|
|
167
|
+
(reuse as f64 / acquire as f64) * 100.0
|
|
168
|
+
};
|
|
169
|
+
|
|
170
|
+
StringBufferPoolMetrics {
|
|
171
|
+
total_acquires: acquire,
|
|
172
|
+
total_reuses: reuse,
|
|
173
|
+
hit_rate,
|
|
174
|
+
}
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/// Snapshot of buffer-reuse counters for a `StringBufferPool`.
///
/// Only compiled when the `pool-metrics` feature is enabled.
#[cfg(feature = "pool-metrics")]
#[derive(Debug, Clone, Copy)]
pub struct StringBufferPoolMetrics {
    /// How many times a buffer was requested from the pool.
    pub total_acquires: usize,
    /// How many of those requests were served by an existing pooled buffer.
    pub total_reuses: usize,
    /// Reuses as a percentage of acquires (0.0-100.0).
    pub hit_rate: f64,
}
|
|
189
|
+
|
|
190
|
+
/// RAII wrapper for a pooled string buffer.
|
|
191
|
+
///
|
|
192
|
+
/// Automatically returns the buffer to the pool when dropped.
|
|
193
|
+
pub struct PooledString {
|
|
194
|
+
buffer: String,
|
|
195
|
+
pool: Arc<StringBufferPool>,
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
impl PooledString {
|
|
199
|
+
/// Get mutable access to the underlying string buffer.
|
|
200
|
+
pub fn buffer_mut(&mut self) -> &mut String {
|
|
201
|
+
&mut self.buffer
|
|
202
|
+
}
|
|
203
|
+
|
|
204
|
+
/// Get immutable access to the underlying string buffer.
|
|
205
|
+
pub fn as_str(&self) -> &str {
|
|
206
|
+
self.buffer.as_str()
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
impl std::ops::Deref for PooledString {
|
|
211
|
+
type Target = String;
|
|
212
|
+
|
|
213
|
+
fn deref(&self) -> &Self::Target {
|
|
214
|
+
&self.buffer
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
impl std::ops::DerefMut for PooledString {
|
|
219
|
+
fn deref_mut(&mut self) -> &mut Self::Target {
|
|
220
|
+
&mut self.buffer
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
impl Drop for PooledString {
|
|
225
|
+
fn drop(&mut self) {
|
|
226
|
+
let buffer = std::mem::take(&mut self.buffer);
|
|
227
|
+
self.pool.release(buffer);
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
impl std::fmt::Display for PooledString {
|
|
232
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
233
|
+
write!(f, "{}", self.buffer)
|
|
234
|
+
}
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
impl std::fmt::Debug for PooledString {
|
|
238
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
239
|
+
f.debug_tuple("PooledString").field(&self.buffer).finish()
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/// Global string buffer pool for temporary allocations during extraction.
|
|
244
|
+
pub static STRING_BUFFER_POOL: Lazy<Arc<StringBufferPool>> =
|
|
245
|
+
Lazy::new(|| Arc::new(StringBufferPool::new(PoolConfig::default())));
|
|
246
|
+
|
|
247
|
+
/// Acquire a string buffer from the global pool.
|
|
248
|
+
///
|
|
249
|
+
/// The returned buffer is automatically returned to the pool when dropped.
|
|
250
|
+
///
|
|
251
|
+
/// # Example
|
|
252
|
+
///
|
|
253
|
+
/// ```rust,ignore
|
|
254
|
+
/// let mut buffer = acquire_string_buffer();
|
|
255
|
+
/// buffer.push_str("content");
|
|
256
|
+
/// // Automatically returned to pool when buffer goes out of scope
|
|
257
|
+
/// ```
|
|
258
|
+
pub fn acquire_string_buffer() -> PooledString {
|
|
259
|
+
Arc::clone(&*STRING_BUFFER_POOL).acquire()
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds an isolated pool with the default configuration.
    fn fresh_pool() -> Arc<StringBufferPool> {
        Arc::new(StringBufferPool::new(PoolConfig::default()))
    }

    /// A released buffer comes back empty with its capacity intact.
    #[test]
    fn test_buffer_pool_acquire_and_release() {
        let pool = fresh_pool();

        let mut first = Arc::clone(&pool).acquire();
        first.push_str("test content");
        let expected_capacity = first.capacity();
        drop(first);

        let second = Arc::clone(&pool).acquire();
        assert_eq!(second.capacity(), expected_capacity);
        assert!(second.is_empty());
    }

    /// Repeated acquire/release cycles keep reusing the same single buffer.
    #[test]
    fn test_buffer_pool_size() {
        let pool = fresh_pool();
        assert_eq!(pool.size(), 0);

        drop(Arc::clone(&pool).acquire());
        assert_eq!(pool.size(), 1);

        drop(Arc::clone(&pool).acquire());
        assert_eq!(pool.size(), 1);
    }

    /// The global pool hands out buffers with at least the default capacity.
    #[test]
    fn test_buffer_pool_global() {
        drop(acquire_string_buffer());

        let reacquired = acquire_string_buffer();
        assert!(reacquired.capacity() >= 4096);
    }

    /// Read access works through `Deref` and `as_str`.
    #[test]
    fn test_pooled_string_deref() {
        let mut pooled = acquire_string_buffer();
        pooled.push_str("hello");

        assert_eq!(&*pooled, "hello");
        assert_eq!(pooled.as_str(), "hello");
        assert!(!pooled.is_empty());
    }

    /// Write access works through `DerefMut` and `buffer_mut`.
    #[test]
    fn test_pooled_string_deref_mut() {
        let mut pooled = acquire_string_buffer();
        pooled.push_str("test");

        pooled.buffer_mut().push_str(" more");
        assert_eq!(pooled.as_str(), "test more");
    }
}
|