kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,762 +0,0 @@
|
|
|
1
|
-
//! String interning/pooling for frequently used strings.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides thread-safe string interning to reduce memory allocations
|
|
4
|
-
//! for strings that appear repeatedly across documents (MIME types, language codes, format field names).
|
|
5
|
-
//!
|
|
6
|
-
//! # Performance
|
|
7
|
-
//!
|
|
8
|
-
//! String interning provides 0.1-0.3% improvement by:
|
|
9
|
-
//! - Deduplicating repeated strings (e.g., "application/pdf" appears 1000s of times)
|
|
10
|
-
//! - Reducing allocation overhead for commonly used strings
|
|
11
|
-
//! - Enabling pointer comparisons instead of string comparisons
|
|
12
|
-
//!
|
|
13
|
-
//! # Thread Safety
|
|
14
|
-
//!
|
|
15
|
-
//! The intern pool uses a `DashMap` for concurrent access (sharded locking, not lock-free). Multiple threads
|
|
16
|
-
//! can insert and look up strings simultaneously with little contention.
|
|
17
|
-
//!
|
|
18
|
-
//! # Example
|
|
19
|
-
//!
|
|
20
|
-
//! ```rust,ignore
|
|
21
|
-
//! use kreuzberg::utils::string_pool::intern_mime_type;
|
|
22
|
-
//!
|
|
23
|
-
//! let mime1 = intern_mime_type("application/pdf");
|
|
24
|
-
//! let mime2 = intern_mime_type("application/pdf");
|
|
25
|
-
//! // Both mime1 and mime2 point to the same interned string
|
|
26
|
-
//! assert_eq!(mime1, mime2);
|
|
27
|
-
//! ```
|
|
28
|
-
|
|
29
|
-
use once_cell::sync::Lazy;
|
|
30
|
-
use std::collections::VecDeque;
|
|
31
|
-
use std::sync::Arc;
|
|
32
|
-
use std::sync::atomic::{AtomicBool, Ordering};
|
|
33
|
-
|
|
34
|
-
#[cfg(feature = "pool-metrics")]
|
|
35
|
-
use std::sync::atomic::AtomicUsize;
|
|
36
|
-
|
|
37
|
-
/// A cheaply clonable handle to an interned string.
///
/// Wraps an `Arc<String>`: cloning the handle copies the `Arc`, not the string data.
/// Equality and hashing are defined on the string content, so two handles with the
/// same text compare equal even when they do not share an allocation.
#[derive(Clone)]
pub struct InternedString(Arc<String>);

impl InternedString {
    /// Returns the interned text as a string slice.
    pub fn as_str(&self) -> &str {
        self.0.as_str()
    }
}

impl AsRef<str> for InternedString {
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}

impl std::fmt::Display for InternedString {
    /// Formats the interned text exactly as the underlying `str` would be formatted.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Delegate to `str` rather than `write!(f, "{}", ..)` so that width, alignment
        // and precision flags supplied by the caller (e.g. `{:>10}`) are honored.
        std::fmt::Display::fmt(self.as_str(), f)
    }
}

impl std::fmt::Debug for InternedString {
    /// Renders as `InternedString("<content>")`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_tuple("InternedString").field(&self.as_str()).finish()
    }
}

impl PartialEq for InternedString {
    /// Compares by content; pointer identity is checked first as a shortcut.
    fn eq(&self, other: &Self) -> bool {
        Arc::ptr_eq(&self.0, &other.0) || self.as_str() == other.as_str()
    }
}

impl Eq for InternedString {}

impl std::hash::Hash for InternedString {
    /// Hashes the string content, which keeps `Hash` consistent with `PartialEq`.
    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
        self.as_str().hash(state);
    }
}

impl std::ops::Deref for InternedString {
    type Target = str;

    /// Dereferences to the interned text so `str` methods can be called directly.
    fn deref(&self) -> &Self::Target {
        self.as_str()
    }
}
|
|
90
|
-
|
|
91
|
-
/// String pool for MIME types.
|
|
92
|
-
///
|
|
93
|
-
/// Lazily initializes with all known MIME types from `kreuzberg::core::mime`.
|
|
94
|
-
/// Pre-interning is deferred until first access to reduce startup memory usage.
|
|
95
|
-
struct MimeStringPool {
|
|
96
|
-
pool: dashmap::DashMap<String, Arc<String>>,
|
|
97
|
-
initialized: AtomicBool,
|
|
98
|
-
}
|
|
99
|
-
|
|
100
|
-
impl MimeStringPool {
|
|
101
|
-
/// Create a new MIME string pool.
|
|
102
|
-
/// Pre-interning is deferred until first `get_or_intern()` call.
|
|
103
|
-
fn new() -> Self {
|
|
104
|
-
MimeStringPool {
|
|
105
|
-
pool: dashmap::DashMap::new(),
|
|
106
|
-
initialized: AtomicBool::new(false),
|
|
107
|
-
}
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
/// Ensure all known MIME types are pre-interned (one-time initialization).
|
|
111
|
-
#[inline]
|
|
112
|
-
fn ensure_initialized(&self) {
|
|
113
|
-
if self.initialized.load(Ordering::Acquire) {
|
|
114
|
-
return;
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
let mime_types = vec![
|
|
118
|
-
"text/html",
|
|
119
|
-
"text/markdown",
|
|
120
|
-
"text/x-markdown",
|
|
121
|
-
"text/plain",
|
|
122
|
-
"application/pdf",
|
|
123
|
-
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
124
|
-
"application/msword",
|
|
125
|
-
"application/vnd.ms-powerpoint",
|
|
126
|
-
"message/rfc822",
|
|
127
|
-
"application/vnd.ms-outlook",
|
|
128
|
-
"application/json",
|
|
129
|
-
"text/json",
|
|
130
|
-
"application/x-yaml",
|
|
131
|
-
"text/yaml",
|
|
132
|
-
"text/x-yaml",
|
|
133
|
-
"application/yaml",
|
|
134
|
-
"application/toml",
|
|
135
|
-
"text/toml",
|
|
136
|
-
"application/xml",
|
|
137
|
-
"text/xml",
|
|
138
|
-
"image/svg+xml",
|
|
139
|
-
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
140
|
-
"application/vnd.ms-excel",
|
|
141
|
-
"application/vnd.ms-excel.sheet.macroEnabled.12",
|
|
142
|
-
"application/vnd.ms-excel.sheet.binary.macroEnabled.12",
|
|
143
|
-
"application/vnd.ms-excel.addin.macroEnabled.12",
|
|
144
|
-
"application/vnd.ms-excel.template.macroEnabled.12",
|
|
145
|
-
"application/vnd.oasis.opendocument.spreadsheet",
|
|
146
|
-
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
147
|
-
"application/vnd.oasis.opendocument.text",
|
|
148
|
-
"image/bmp",
|
|
149
|
-
"image/gif",
|
|
150
|
-
"image/jp2",
|
|
151
|
-
"image/jpeg",
|
|
152
|
-
"image/jpm",
|
|
153
|
-
"image/jpx",
|
|
154
|
-
"image/mj2",
|
|
155
|
-
"image/pjpeg",
|
|
156
|
-
"image/png",
|
|
157
|
-
"image/tiff",
|
|
158
|
-
"image/webp",
|
|
159
|
-
"image/x-bmp",
|
|
160
|
-
"image/x-ms-bmp",
|
|
161
|
-
"image/x-portable-anymap",
|
|
162
|
-
"image/x-portable-bitmap",
|
|
163
|
-
"image/x-portable-graymap",
|
|
164
|
-
"image/x-portable-pixmap",
|
|
165
|
-
"image/x-tiff",
|
|
166
|
-
"application/csl+json",
|
|
167
|
-
"application/docbook+xml",
|
|
168
|
-
"application/epub+zip",
|
|
169
|
-
"application/rtf",
|
|
170
|
-
"application/x-biblatex",
|
|
171
|
-
"application/x-bibtex",
|
|
172
|
-
"application/x-endnote+xml",
|
|
173
|
-
"application/x-fictionbook+xml",
|
|
174
|
-
"application/x-ipynb+json",
|
|
175
|
-
"application/x-jats+xml",
|
|
176
|
-
"application/x-latex",
|
|
177
|
-
"application/xml+opml",
|
|
178
|
-
"application/x-opml+xml",
|
|
179
|
-
"application/x-research-info-systems",
|
|
180
|
-
"application/x-typst",
|
|
181
|
-
"text/csv",
|
|
182
|
-
"text/tab-separated-values",
|
|
183
|
-
"text/troff",
|
|
184
|
-
"text/x-commonmark",
|
|
185
|
-
"text/x-dokuwiki",
|
|
186
|
-
"text/x-gfm",
|
|
187
|
-
"text/x-markdown-extra",
|
|
188
|
-
"text/x-mdoc",
|
|
189
|
-
"text/x-multimarkdown",
|
|
190
|
-
"text/x-opml",
|
|
191
|
-
"text/x-org",
|
|
192
|
-
"text/x-pod",
|
|
193
|
-
"text/x-rst",
|
|
194
|
-
"application/zip",
|
|
195
|
-
"application/x-zip-compressed",
|
|
196
|
-
"application/x-tar",
|
|
197
|
-
"application/tar",
|
|
198
|
-
"application/x-gtar",
|
|
199
|
-
"application/x-ustar",
|
|
200
|
-
"application/gzip",
|
|
201
|
-
"application/x-7z-compressed",
|
|
202
|
-
];
|
|
203
|
-
|
|
204
|
-
for mime_type in mime_types {
|
|
205
|
-
self.pool.insert(mime_type.to_string(), Arc::new(mime_type.to_string()));
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
let _ = self
|
|
209
|
-
.initialized
|
|
210
|
-
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
|
211
|
-
}
|
|
212
|
-
|
|
213
|
-
/// Get or intern a MIME type string.
|
|
214
|
-
/// Ensures pre-interned MIME types are initialized on first call.
|
|
215
|
-
fn get_or_intern(&self, mime_type: &str) -> Arc<String> {
|
|
216
|
-
self.ensure_initialized();
|
|
217
|
-
|
|
218
|
-
if let Some(entry) = self.pool.get(mime_type) {
|
|
219
|
-
Arc::clone(&*entry)
|
|
220
|
-
} else {
|
|
221
|
-
let arc_string = Arc::new(mime_type.to_string());
|
|
222
|
-
self.pool.insert(mime_type.to_string(), Arc::clone(&arc_string));
|
|
223
|
-
arc_string
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
/// String pool for language codes.
|
|
229
|
-
///
|
|
230
|
-
/// Lazily initializes with common ISO 639 language codes.
|
|
231
|
-
/// Pre-interning is deferred until first access to reduce startup memory usage.
|
|
232
|
-
struct LanguageStringPool {
|
|
233
|
-
pool: dashmap::DashMap<String, Arc<String>>,
|
|
234
|
-
initialized: AtomicBool,
|
|
235
|
-
}
|
|
236
|
-
|
|
237
|
-
impl LanguageStringPool {
|
|
238
|
-
/// Create a new language string pool.
|
|
239
|
-
/// Pre-interning is deferred until first `get_or_intern()` call.
|
|
240
|
-
fn new() -> Self {
|
|
241
|
-
LanguageStringPool {
|
|
242
|
-
pool: dashmap::DashMap::new(),
|
|
243
|
-
initialized: AtomicBool::new(false),
|
|
244
|
-
}
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
/// Ensure all known language codes are pre-interned (one-time initialization).
|
|
248
|
-
#[inline]
|
|
249
|
-
fn ensure_initialized(&self) {
|
|
250
|
-
if self.initialized.load(Ordering::Acquire) {
|
|
251
|
-
return;
|
|
252
|
-
}
|
|
253
|
-
|
|
254
|
-
let lang_codes = vec![
|
|
255
|
-
"en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "hi", "th", "tr", "pl", "nl", "sv", "no",
|
|
256
|
-
"da", "fi", "cs", "hu", "ro", "el", "he", "fa", "ur", "vi", "id", "ms", "bn", "pa", "te", "mr", "ta", "gu",
|
|
257
|
-
"kn", "ml", "or", "uk", "bg", "sr", "hr", "sl", "sk", "et", "lv", "lt", "sq", "mk", "ka", "hy", "eo",
|
|
258
|
-
"ast", "ca", "eu", "gl", "cy", "gd", "ga",
|
|
259
|
-
];
|
|
260
|
-
|
|
261
|
-
for code in lang_codes {
|
|
262
|
-
self.pool.insert(code.to_string(), Arc::new(code.to_string()));
|
|
263
|
-
}
|
|
264
|
-
|
|
265
|
-
let _ = self
|
|
266
|
-
.initialized
|
|
267
|
-
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
|
268
|
-
}
|
|
269
|
-
|
|
270
|
-
/// Get or intern a language code string.
|
|
271
|
-
/// Ensures pre-interned language codes are initialized on first call.
|
|
272
|
-
fn get_or_intern(&self, lang_code: &str) -> Arc<String> {
|
|
273
|
-
self.ensure_initialized();
|
|
274
|
-
|
|
275
|
-
if let Some(entry) = self.pool.get(lang_code) {
|
|
276
|
-
Arc::clone(&*entry)
|
|
277
|
-
} else {
|
|
278
|
-
let arc_string = Arc::new(lang_code.to_string());
|
|
279
|
-
self.pool.insert(lang_code.to_string(), Arc::clone(&arc_string));
|
|
280
|
-
arc_string
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
}
|
|
284
|
-
|
|
285
|
-
/// Configuration for the string buffer pool.
///
/// `Default` yields 4 buffers per bucket, a 4096-byte initial capacity and a
/// 65536-byte discard threshold.
#[derive(Debug, Clone)]
pub struct PoolConfig {
    /// Maximum buffers per size bucket
    pub max_buffers_per_size: usize,
    /// Initial capacity for new buffers
    pub initial_capacity: usize,
    /// Maximum capacity before discarding
    pub max_capacity_before_discard: usize,
}

impl Default for PoolConfig {
    /// Returns the default pool configuration.
    fn default() -> Self {
        Self {
            max_buffers_per_size: 4,
            initial_capacity: 4096,
            max_capacity_before_discard: 65536,
        }
    }
}
|
|
304
|
-
|
|
305
|
-
/// Thread-safe reusable string buffer pool.
|
|
306
|
-
///
|
|
307
|
-
/// This pool allows allocation and reuse of String buffers to reduce memory allocations
|
|
308
|
-
/// during document extraction. Buffers are returned to the pool with cleared contents
|
|
309
|
-
/// but preserved capacity, ready for reuse.
|
|
310
|
-
///
|
|
311
|
-
/// # Thread Safety
|
|
312
|
-
///
|
|
313
|
-
/// The pool uses DashMap for lock-free concurrent access. Multiple threads can
|
|
314
|
-
/// acquire and release buffers simultaneously.
|
|
315
|
-
///
|
|
316
|
-
/// # Usage
|
|
317
|
-
///
|
|
318
|
-
/// ```rust,ignore
|
|
319
|
-
/// use kreuzberg::utils::string_pool::STRING_BUFFER_POOL;
|
|
320
|
-
///
|
|
321
|
-
/// // Acquire a buffer from the pool
|
|
322
|
-
/// let mut buffer = STRING_BUFFER_POOL.acquire();
|
|
323
|
-
/// buffer.push_str("some content");
|
|
324
|
-
/// // Automatically returned to pool when dropped
|
|
325
|
-
/// drop(buffer);
|
|
326
|
-
/// ```
|
|
327
|
-
pub struct StringBufferPool {
|
|
328
|
-
pool: dashmap::DashMap<usize, VecDeque<String>>,
|
|
329
|
-
config: PoolConfig,
|
|
330
|
-
#[cfg(feature = "pool-metrics")]
|
|
331
|
-
acquire_count: AtomicUsize,
|
|
332
|
-
#[cfg(feature = "pool-metrics")]
|
|
333
|
-
reuse_count: AtomicUsize,
|
|
334
|
-
}
|
|
335
|
-
|
|
336
|
-
impl StringBufferPool {
|
|
337
|
-
/// Create a new string buffer pool with given configuration.
|
|
338
|
-
pub fn new(config: PoolConfig) -> Self {
|
|
339
|
-
StringBufferPool {
|
|
340
|
-
pool: dashmap::DashMap::new(),
|
|
341
|
-
config,
|
|
342
|
-
#[cfg(feature = "pool-metrics")]
|
|
343
|
-
acquire_count: AtomicUsize::new(0),
|
|
344
|
-
#[cfg(feature = "pool-metrics")]
|
|
345
|
-
reuse_count: AtomicUsize::new(0),
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
|
|
349
|
-
/// Find the appropriate bucket size for a given capacity.
|
|
350
|
-
fn find_bucket(&self, capacity: usize) -> usize {
|
|
351
|
-
if capacity <= 1024 {
|
|
352
|
-
1024
|
|
353
|
-
} else if capacity <= 4096 {
|
|
354
|
-
4096
|
|
355
|
-
} else if capacity <= 16384 {
|
|
356
|
-
16384
|
|
357
|
-
} else if capacity <= 65536 {
|
|
358
|
-
65536
|
|
359
|
-
} else {
|
|
360
|
-
262144
|
|
361
|
-
}
|
|
362
|
-
}
|
|
363
|
-
|
|
364
|
-
/// Try to acquire a buffer from a specific bucket, returning it if found.
|
|
365
|
-
fn try_acquire_from_bucket(&self, bucket: usize) -> Option<String> {
|
|
366
|
-
if let Some(mut entry) = self.pool.get_mut(&bucket) {
|
|
367
|
-
entry.pop_front()
|
|
368
|
-
} else {
|
|
369
|
-
None
|
|
370
|
-
}
|
|
371
|
-
}
|
|
372
|
-
|
|
373
|
-
/// Acquire a string buffer from the pool, or allocate a new one if pool is exhausted.
|
|
374
|
-
///
|
|
375
|
-
/// The returned buffer is automatically returned to the pool when dropped.
|
|
376
|
-
/// Must be called with the pool wrapped in Arc.
|
|
377
|
-
pub fn acquire(self: Arc<Self>) -> PooledString {
|
|
378
|
-
#[cfg(feature = "pool-metrics")]
|
|
379
|
-
self.acquire_count.fetch_add(1, Ordering::Relaxed);
|
|
380
|
-
|
|
381
|
-
let default_bucket = self.config.initial_capacity;
|
|
382
|
-
if let Some(buffer) = self.try_acquire_from_bucket(default_bucket) {
|
|
383
|
-
#[cfg(feature = "pool-metrics")]
|
|
384
|
-
self.reuse_count.fetch_add(1, Ordering::Relaxed);
|
|
385
|
-
return PooledString { buffer, pool: self };
|
|
386
|
-
}
|
|
387
|
-
|
|
388
|
-
for &bucket in &[1024, 16384, 65536] {
|
|
389
|
-
if let Some(buffer) = self.try_acquire_from_bucket(bucket) {
|
|
390
|
-
#[cfg(feature = "pool-metrics")]
|
|
391
|
-
self.reuse_count.fetch_add(1, Ordering::Relaxed);
|
|
392
|
-
return PooledString { buffer, pool: self };
|
|
393
|
-
}
|
|
394
|
-
}
|
|
395
|
-
|
|
396
|
-
PooledString {
|
|
397
|
-
buffer: String::with_capacity(self.config.initial_capacity),
|
|
398
|
-
pool: self,
|
|
399
|
-
}
|
|
400
|
-
}
|
|
401
|
-
|
|
402
|
-
/// Return a buffer to the pool for reuse.
|
|
403
|
-
pub fn release(&self, mut buffer: String) {
|
|
404
|
-
if buffer.capacity() > self.config.max_capacity_before_discard {
|
|
405
|
-
return;
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
let bucket = self.find_bucket(buffer.capacity());
|
|
409
|
-
buffer.clear();
|
|
410
|
-
|
|
411
|
-
if let Some(mut queue) = self.pool.get_mut(&bucket) {
|
|
412
|
-
if queue.len() < self.config.max_buffers_per_size {
|
|
413
|
-
queue.push_back(buffer);
|
|
414
|
-
}
|
|
415
|
-
} else {
|
|
416
|
-
let mut queue = VecDeque::with_capacity(self.config.max_buffers_per_size);
|
|
417
|
-
queue.push_back(buffer);
|
|
418
|
-
self.pool.insert(bucket, queue);
|
|
419
|
-
}
|
|
420
|
-
}
|
|
421
|
-
|
|
422
|
-
/// Get the current pool size across all buckets.
|
|
423
|
-
#[allow(dead_code)]
|
|
424
|
-
pub fn size(&self) -> usize {
|
|
425
|
-
self.pool.iter().map(|entry| entry.value().len()).sum()
|
|
426
|
-
}
|
|
427
|
-
|
|
428
|
-
/// Get buffer reuse metrics (only available with `pool-metrics` feature).
|
|
429
|
-
#[cfg(feature = "pool-metrics")]
|
|
430
|
-
pub fn metrics(&self) -> StringBufferPoolMetrics {
|
|
431
|
-
let acquire = self.acquire_count.load(Ordering::Relaxed);
|
|
432
|
-
let reuse = self.reuse_count.load(Ordering::Relaxed);
|
|
433
|
-
let hit_rate = if acquire == 0 {
|
|
434
|
-
0.0
|
|
435
|
-
} else {
|
|
436
|
-
(reuse as f64 / acquire as f64) * 100.0
|
|
437
|
-
};
|
|
438
|
-
|
|
439
|
-
StringBufferPoolMetrics {
|
|
440
|
-
total_acquires: acquire,
|
|
441
|
-
total_reuses: reuse,
|
|
442
|
-
hit_rate,
|
|
443
|
-
}
|
|
444
|
-
}
|
|
445
|
-
}
|
|
446
|
-
|
|
447
|
-
/// Snapshot of `StringBufferPool` usage counters.
///
/// Only compiled when the `pool-metrics` feature is enabled.
#[cfg(feature = "pool-metrics")]
#[derive(Clone, Copy, Debug)]
pub struct StringBufferPoolMetrics {
    /// How many times a buffer was requested from the pool.
    pub total_acquires: usize,
    /// How many of those requests were served by an already-pooled buffer.
    pub total_reuses: usize,
    /// Share of requests served from the pool, expressed as a percentage (0.0-100.0).
    pub hit_rate: f64,
}
|
|
458
|
-
|
|
459
|
-
/// RAII wrapper for a pooled string buffer.
|
|
460
|
-
///
|
|
461
|
-
/// Automatically returns the buffer to the pool when dropped.
|
|
462
|
-
pub struct PooledString {
|
|
463
|
-
buffer: String,
|
|
464
|
-
pool: Arc<StringBufferPool>,
|
|
465
|
-
}
|
|
466
|
-
|
|
467
|
-
impl PooledString {
|
|
468
|
-
/// Get mutable access to the underlying string buffer.
|
|
469
|
-
pub fn buffer_mut(&mut self) -> &mut String {
|
|
470
|
-
&mut self.buffer
|
|
471
|
-
}
|
|
472
|
-
|
|
473
|
-
/// Get immutable access to the underlying string buffer.
|
|
474
|
-
pub fn as_str(&self) -> &str {
|
|
475
|
-
self.buffer.as_str()
|
|
476
|
-
}
|
|
477
|
-
}
|
|
478
|
-
|
|
479
|
-
impl std::ops::Deref for PooledString {
|
|
480
|
-
type Target = String;
|
|
481
|
-
|
|
482
|
-
fn deref(&self) -> &Self::Target {
|
|
483
|
-
&self.buffer
|
|
484
|
-
}
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
impl std::ops::DerefMut for PooledString {
|
|
488
|
-
fn deref_mut(&mut self) -> &mut Self::Target {
|
|
489
|
-
&mut self.buffer
|
|
490
|
-
}
|
|
491
|
-
}
|
|
492
|
-
|
|
493
|
-
impl Drop for PooledString {
|
|
494
|
-
fn drop(&mut self) {
|
|
495
|
-
let buffer = std::mem::take(&mut self.buffer);
|
|
496
|
-
self.pool.release(buffer);
|
|
497
|
-
}
|
|
498
|
-
}
|
|
499
|
-
|
|
500
|
-
impl std::fmt::Display for PooledString {
|
|
501
|
-
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
502
|
-
write!(f, "{}", self.buffer)
|
|
503
|
-
}
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
impl std::fmt::Debug for PooledString {
|
|
507
|
-
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
508
|
-
f.debug_tuple("PooledString").field(&self.buffer).finish()
|
|
509
|
-
}
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
/// Global MIME type string pool.
|
|
513
|
-
static MIME_POOL: Lazy<MimeStringPool> = Lazy::new(MimeStringPool::new);
|
|
514
|
-
|
|
515
|
-
/// Global language code string pool.
|
|
516
|
-
static LANGUAGE_POOL: Lazy<LanguageStringPool> = Lazy::new(LanguageStringPool::new);
|
|
517
|
-
|
|
518
|
-
/// Global string buffer pool for temporary allocations during extraction.
|
|
519
|
-
pub static STRING_BUFFER_POOL: Lazy<Arc<StringBufferPool>> =
|
|
520
|
-
Lazy::new(|| Arc::new(StringBufferPool::new(PoolConfig::default())));
|
|
521
|
-
|
|
522
|
-
/// Get or intern a MIME type string.
|
|
523
|
-
///
|
|
524
|
-
/// Returns an `InternedString` that is guaranteed to be deduplicated with any other
|
|
525
|
-
/// intern call for the same MIME type. This reduces memory usage and allows
|
|
526
|
-
/// fast pointer-based comparisons.
|
|
527
|
-
///
|
|
528
|
-
/// # Arguments
|
|
529
|
-
///
|
|
530
|
-
/// * `mime_type` - The MIME type string to intern
|
|
531
|
-
///
|
|
532
|
-
/// # Returns
|
|
533
|
-
///
|
|
534
|
-
/// An `InternedString` pointing to the deduplicated string
|
|
535
|
-
///
|
|
536
|
-
/// # Example
|
|
537
|
-
///
|
|
538
|
-
/// ```rust,ignore
|
|
539
|
-
/// let pdf1 = intern_mime_type("application/pdf");
|
|
540
|
-
/// let pdf2 = intern_mime_type("application/pdf");
|
|
541
|
-
/// assert_eq!(pdf1, pdf2); // Same pointer
|
|
542
|
-
/// ```
|
|
543
|
-
pub fn intern_mime_type(mime_type: &str) -> InternedString {
|
|
544
|
-
InternedString(MIME_POOL.get_or_intern(mime_type))
|
|
545
|
-
}
|
|
546
|
-
|
|
547
|
-
/// Get or intern a language code string.
|
|
548
|
-
///
|
|
549
|
-
/// Returns an `InternedString` that is guaranteed to be deduplicated with any other
|
|
550
|
-
/// intern call for the same language code.
|
|
551
|
-
///
|
|
552
|
-
/// # Arguments
|
|
553
|
-
///
|
|
554
|
-
/// * `lang_code` - The language code to intern (e.g., "en", "es", "fr")
|
|
555
|
-
///
|
|
556
|
-
/// # Returns
|
|
557
|
-
///
|
|
558
|
-
/// An `InternedString` pointing to the deduplicated string
|
|
559
|
-
///
|
|
560
|
-
/// # Example
|
|
561
|
-
///
|
|
562
|
-
/// ```rust,ignore
|
|
563
|
-
/// let en1 = intern_language_code("en");
|
|
564
|
-
/// let en2 = intern_language_code("en");
|
|
565
|
-
/// assert_eq!(en1, en2); // Same pointer
|
|
566
|
-
/// ```
|
|
567
|
-
pub fn intern_language_code(lang_code: &str) -> InternedString {
|
|
568
|
-
InternedString(LANGUAGE_POOL.get_or_intern(lang_code))
|
|
569
|
-
}
|
|
570
|
-
|
|
571
|
-
/// Acquire a string buffer from the global pool.
|
|
572
|
-
///
|
|
573
|
-
/// The returned buffer is automatically returned to the pool when dropped.
|
|
574
|
-
///
|
|
575
|
-
/// # Example
|
|
576
|
-
///
|
|
577
|
-
/// ```rust,ignore
|
|
578
|
-
/// let mut buffer = acquire_string_buffer();
|
|
579
|
-
/// buffer.push_str("content");
|
|
580
|
-
/// // Automatically returned to pool when buffer goes out of scope
|
|
581
|
-
/// ```
|
|
582
|
-
pub fn acquire_string_buffer() -> PooledString {
|
|
583
|
-
Arc::clone(&*STRING_BUFFER_POOL).acquire()
|
|
584
|
-
}
|
|
585
|
-
|
|
586
|
-
#[cfg(test)]
mod tests {
    use super::*;

    /// Two interns of one MIME type are equal and share an allocation.
    #[test]
    fn test_mime_type_deduplication() {
        let first = intern_mime_type("application/pdf");
        let second = intern_mime_type("application/pdf");

        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    /// Two interns of one language code are equal and share an allocation.
    #[test]
    fn test_language_code_deduplication() {
        let first = intern_language_code("en");
        let second = intern_language_code("en");

        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    /// `Display` prints the interned text unchanged.
    #[test]
    fn test_interned_string_display() {
        let html = intern_mime_type("text/html");
        assert_eq!(format!("{}", html), "text/html");
    }

    /// Deref, `AsRef` and `as_str` all expose the same text.
    #[test]
    fn test_interned_string_deref() {
        let json = intern_mime_type("application/json");
        assert_eq!(&*json, "application/json");
        assert_eq!(json.as_ref(), "application/json");
        assert_eq!(json.as_str(), "application/json");
    }

    /// Common MIME types round-trip through the pool.
    #[test]
    fn test_preinterned_mime_types() {
        for &expected in &["application/pdf", "text/html", "application/json"] {
            let interned = intern_mime_type(expected);
            assert_eq!(interned.as_str(), expected);
        }
    }

    /// Common language codes round-trip through the pool.
    #[test]
    fn test_preinterned_language_codes() {
        for &expected in &["en", "es", "fr"] {
            let interned = intern_language_code(expected);
            assert_eq!(interned.as_str(), expected);
        }
    }

    /// Ten threads interning the same MIME type should all get the same allocation.
    #[test]
    #[ignore = "Flaky test - concurrent interning may not always share the same Arc"]
    fn test_concurrent_interning() {
        let mime = "application/pdf";
        let collected = Arc::new(std::sync::Mutex::new(Vec::new()));

        let mut workers = Vec::with_capacity(10);
        for _ in 0..10 {
            let sink = Arc::clone(&collected);
            workers.push(std::thread::spawn(move || {
                let interned = intern_mime_type(mime);
                sink.lock().unwrap().push(interned);
            }));
        }

        for worker in workers {
            worker.join().unwrap();
        }

        let interned_strings = collected.lock().unwrap();
        assert_eq!(interned_strings.len(), 10);

        let reference = &interned_strings[0].0;
        for candidate in interned_strings.iter() {
            assert!(
                Arc::ptr_eq(&candidate.0, reference),
                "All interned strings should share the same Arc"
            );
        }
    }

    /// Equal interned strings hash alike, so a set keeps only one of them.
    #[test]
    fn test_interned_string_hash() {
        use std::collections::HashSet;

        let first = intern_mime_type("application/pdf");
        let second = intern_mime_type("application/pdf");

        let mut unique = HashSet::new();
        unique.insert(first);
        unique.insert(second);

        assert_eq!(unique.len(), 1);
    }

    /// Cloning an interned string shares the allocation instead of copying it.
    #[test]
    fn test_interned_string_clone() {
        let original = intern_mime_type("text/html");
        let copy = original.clone();

        assert_eq!(original, copy);
        assert!(Arc::ptr_eq(&original.0, &copy.0));
    }

    /// A dropped buffer comes back cleared, with its capacity intact.
    #[test]
    fn test_buffer_pool_acquire_and_release() {
        let pool = Arc::new(StringBufferPool::new(PoolConfig::default()));

        let mut first = Arc::clone(&pool).acquire();
        first.push_str("test content");
        let expected_capacity = first.capacity();

        drop(first);

        let second = Arc::clone(&pool).acquire();
        assert_eq!(second.capacity(), expected_capacity);
        assert!(second.is_empty());
    }

    /// `size` counts idle buffers; reusing the single buffer keeps the count at one.
    #[test]
    fn test_buffer_pool_size() {
        let pool = Arc::new(StringBufferPool::new(PoolConfig::default()));

        assert_eq!(pool.size(), 0);

        for _ in 0..2 {
            let checked_out = Arc::clone(&pool).acquire();
            drop(checked_out);
            assert_eq!(pool.size(), 1);
        }
    }

    /// The global pool hands out buffers of at least 4096 bytes of capacity.
    #[test]
    fn test_buffer_pool_global() {
        let warm_up = acquire_string_buffer();
        drop(warm_up);

        let reused = acquire_string_buffer();
        assert!(reused.capacity() >= 4096);
    }

    /// Reading through a `PooledString` sees what was written into it.
    #[test]
    fn test_pooled_string_deref() {
        let mut pooled = acquire_string_buffer();
        pooled.push_str("hello");

        assert_eq!(&*pooled, "hello");
        assert_eq!(pooled.as_str(), "hello");
        assert!(!pooled.is_empty());
    }

    /// Writes through `DerefMut` and through `buffer_mut` land in the same buffer.
    #[test]
    fn test_pooled_string_deref_mut() {
        let mut pooled = acquire_string_buffer();
        pooled.push_str("test");

        pooled.buffer_mut().push_str(" more");
        assert_eq!(pooled.as_str(), "test more");
    }
}
|