kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
//! Interned string type and trait implementations.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides the `InternedString` type which wraps an `Arc<String>`
|
|
4
|
+
//! to enable string deduplication and pointer-based comparisons.
|
|
5
|
+
|
|
6
|
+
use std::sync::Arc;
|
|
7
|
+
|
|
8
|
+
/// A reference to an interned string stored in an Arc.
|
|
9
|
+
///
|
|
10
|
+
/// This wraps an Arc<String> and provides convenient access to the string content.
|
|
11
|
+
/// Multiple calls with the same string content will share the same Arc, reducing memory usage.
|
|
12
|
+
#[derive(Clone)]
|
|
13
|
+
pub struct InternedString(pub(super) Arc<String>);
|
|
14
|
+
|
|
15
|
+
impl InternedString {
|
|
16
|
+
/// Get the string content.
|
|
17
|
+
pub fn as_str(&self) -> &str {
|
|
18
|
+
self.0.as_str()
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
impl AsRef<str> for InternedString {
|
|
23
|
+
fn as_ref(&self) -> &str {
|
|
24
|
+
self.as_str()
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
impl std::fmt::Display for InternedString {
|
|
29
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
30
|
+
write!(f, "{}", self.as_str())
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
impl std::fmt::Debug for InternedString {
|
|
35
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
36
|
+
f.debug_tuple("InternedString").field(&self.as_str()).finish()
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
impl PartialEq for InternedString {
|
|
41
|
+
fn eq(&self, other: &Self) -> bool {
|
|
42
|
+
Arc::ptr_eq(&self.0, &other.0) || self.as_str() == other.as_str()
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
impl Eq for InternedString {}
|
|
47
|
+
|
|
48
|
+
impl std::hash::Hash for InternedString {
|
|
49
|
+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
|
|
50
|
+
self.as_str().hash(state);
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
impl std::ops::Deref for InternedString {
|
|
55
|
+
type Target = str;
|
|
56
|
+
|
|
57
|
+
fn deref(&self) -> &Self::Target {
|
|
58
|
+
self.as_str()
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashSet;

    /// Build a handle backed by its own fresh allocation.
    fn fresh(text: &str) -> InternedString {
        InternedString(Arc::new(text.to_string()))
    }

    #[test]
    fn test_interned_string_display() {
        let s = fresh("text/html");
        assert_eq!(format!("{}", s), "text/html");
    }

    #[test]
    fn test_interned_string_deref() {
        let s = fresh("application/json");
        assert_eq!(&*s, "application/json");
        assert_eq!(s.as_ref(), "application/json");
        assert_eq!(s.as_str(), "application/json");
    }

    #[test]
    fn test_interned_string_hash() {
        let s1 = fresh("application/pdf");
        let s2 = InternedString(Arc::clone(&s1.0));
        // A separate allocation with the same text: without it the test would
        // still pass if Hash/Eq compared pointers instead of content.
        let s3 = fresh("application/pdf");
        assert!(!Arc::ptr_eq(&s1.0, &s3.0));

        let mut set = HashSet::new();
        set.insert(s1);
        set.insert(s2);
        set.insert(s3);

        assert_eq!(set.len(), 1);
    }

    #[test]
    fn test_interned_string_clone() {
        let s1 = fresh("text/html");
        let s2 = s1.clone();

        assert_eq!(s1, s2);
        assert!(Arc::ptr_eq(&s1.0, &s2.0));
    }
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
//! String pool for language codes with pre-interning of common ISO 639 codes.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a thread-safe string pool specifically for language codes,
|
|
4
|
+
//! with lazy initialization of common language codes on first access.
|
|
5
|
+
|
|
6
|
+
use super::interned::InternedString;
|
|
7
|
+
use once_cell::sync::Lazy;
|
|
8
|
+
use std::sync::Arc;
|
|
9
|
+
use std::sync::atomic::{AtomicBool, Ordering};
|
|
10
|
+
|
|
11
|
+
/// String pool for language codes.
|
|
12
|
+
///
|
|
13
|
+
/// Lazily initializes with common ISO 639 language codes.
|
|
14
|
+
/// Pre-interning is deferred until first access to reduce startup memory usage.
|
|
15
|
+
pub(super) struct LanguageStringPool {
|
|
16
|
+
pool: dashmap::DashMap<String, Arc<String>>,
|
|
17
|
+
initialized: AtomicBool,
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
impl LanguageStringPool {
|
|
21
|
+
/// Create a new language string pool.
|
|
22
|
+
/// Pre-interning is deferred until first `get_or_intern()` call.
|
|
23
|
+
pub(super) fn new() -> Self {
|
|
24
|
+
LanguageStringPool {
|
|
25
|
+
pool: dashmap::DashMap::new(),
|
|
26
|
+
initialized: AtomicBool::new(false),
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Ensure all known language codes are pre-interned (one-time initialization).
|
|
31
|
+
#[inline]
|
|
32
|
+
fn ensure_initialized(&self) {
|
|
33
|
+
if self.initialized.load(Ordering::Acquire) {
|
|
34
|
+
return;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
let lang_codes = vec![
|
|
38
|
+
"en", "es", "fr", "de", "it", "pt", "ru", "ja", "ko", "zh", "ar", "hi", "th", "tr", "pl", "nl", "sv", "no",
|
|
39
|
+
"da", "fi", "cs", "hu", "ro", "el", "he", "fa", "ur", "vi", "id", "ms", "bn", "pa", "te", "mr", "ta", "gu",
|
|
40
|
+
"kn", "ml", "or", "uk", "bg", "sr", "hr", "sl", "sk", "et", "lv", "lt", "sq", "mk", "ka", "hy", "eo",
|
|
41
|
+
"ast", "ca", "eu", "gl", "cy", "gd", "ga",
|
|
42
|
+
];
|
|
43
|
+
|
|
44
|
+
for code in lang_codes {
|
|
45
|
+
self.pool.insert(code.to_string(), Arc::new(code.to_string()));
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
let _ = self
|
|
49
|
+
.initialized
|
|
50
|
+
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/// Get or intern a language code string.
|
|
54
|
+
/// Ensures pre-interned language codes are initialized on first call.
|
|
55
|
+
pub(super) fn get_or_intern(&self, lang_code: &str) -> Arc<String> {
|
|
56
|
+
self.ensure_initialized();
|
|
57
|
+
|
|
58
|
+
if let Some(entry) = self.pool.get(lang_code) {
|
|
59
|
+
Arc::clone(&*entry)
|
|
60
|
+
} else {
|
|
61
|
+
let arc_string = Arc::new(lang_code.to_string());
|
|
62
|
+
self.pool.insert(lang_code.to_string(), Arc::clone(&arc_string));
|
|
63
|
+
arc_string
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/// Global language code string pool.
|
|
69
|
+
pub(super) static LANGUAGE_POOL: Lazy<LanguageStringPool> = Lazy::new(LanguageStringPool::new);
|
|
70
|
+
|
|
71
|
+
/// Get or intern a language code string.
|
|
72
|
+
///
|
|
73
|
+
/// Returns an `InternedString` that is guaranteed to be deduplicated with any other
|
|
74
|
+
/// intern call for the same language code.
|
|
75
|
+
///
|
|
76
|
+
/// # Arguments
|
|
77
|
+
///
|
|
78
|
+
/// * `lang_code` - The language code to intern (e.g., "en", "es", "fr")
|
|
79
|
+
///
|
|
80
|
+
/// # Returns
|
|
81
|
+
///
|
|
82
|
+
/// An `InternedString` pointing to the deduplicated string
|
|
83
|
+
///
|
|
84
|
+
/// # Example
|
|
85
|
+
///
|
|
86
|
+
/// ```rust,ignore
|
|
87
|
+
/// let en1 = intern_language_code("en");
|
|
88
|
+
/// let en2 = intern_language_code("en");
|
|
89
|
+
/// assert_eq!(en1, en2); // Same pointer
|
|
90
|
+
/// ```
|
|
91
|
+
pub fn intern_language_code(lang_code: &str) -> InternedString {
|
|
92
|
+
InternedString(LANGUAGE_POOL.get_or_intern(lang_code))
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning the same code twice yields equal handles backed by one allocation.
    #[test]
    fn test_language_code_deduplication() {
        let first = intern_language_code("en");
        let second = intern_language_code("en");

        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    /// A few of the common codes round-trip through the pool unchanged.
    #[test]
    fn test_preinterned_language_codes() {
        for code in ["en", "es", "fr"] {
            let interned = intern_language_code(code);
            assert_eq!(interned.as_str(), code);
        }
    }
}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
//! String pool for MIME types with pre-interning of common types.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a thread-safe string pool specifically for MIME types,
|
|
4
|
+
//! with lazy initialization of common MIME types on first access.
|
|
5
|
+
|
|
6
|
+
use super::interned::InternedString;
|
|
7
|
+
use once_cell::sync::Lazy;
|
|
8
|
+
use std::sync::Arc;
|
|
9
|
+
use std::sync::atomic::{AtomicBool, Ordering};
|
|
10
|
+
|
|
11
|
+
/// String pool for MIME types.
|
|
12
|
+
///
|
|
13
|
+
/// Lazily initializes with all known MIME types from `kreuzberg::core::mime`.
|
|
14
|
+
/// Pre-interning is deferred until first access to reduce startup memory usage.
|
|
15
|
+
pub(super) struct MimeStringPool {
|
|
16
|
+
pool: dashmap::DashMap<String, Arc<String>>,
|
|
17
|
+
initialized: AtomicBool,
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
impl MimeStringPool {
|
|
21
|
+
/// Create a new MIME string pool.
|
|
22
|
+
/// Pre-interning is deferred until first `get_or_intern()` call.
|
|
23
|
+
pub(super) fn new() -> Self {
|
|
24
|
+
MimeStringPool {
|
|
25
|
+
pool: dashmap::DashMap::new(),
|
|
26
|
+
initialized: AtomicBool::new(false),
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Ensure all known MIME types are pre-interned (one-time initialization).
|
|
31
|
+
#[inline]
|
|
32
|
+
fn ensure_initialized(&self) {
|
|
33
|
+
if self.initialized.load(Ordering::Acquire) {
|
|
34
|
+
return;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
let mime_types = vec![
|
|
38
|
+
"text/html",
|
|
39
|
+
"text/markdown",
|
|
40
|
+
"text/x-markdown",
|
|
41
|
+
"text/plain",
|
|
42
|
+
"application/pdf",
|
|
43
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
44
|
+
"application/msword",
|
|
45
|
+
"application/vnd.ms-powerpoint",
|
|
46
|
+
"message/rfc822",
|
|
47
|
+
"application/vnd.ms-outlook",
|
|
48
|
+
"application/json",
|
|
49
|
+
"text/json",
|
|
50
|
+
"application/x-yaml",
|
|
51
|
+
"text/yaml",
|
|
52
|
+
"text/x-yaml",
|
|
53
|
+
"application/yaml",
|
|
54
|
+
"application/toml",
|
|
55
|
+
"text/toml",
|
|
56
|
+
"application/xml",
|
|
57
|
+
"text/xml",
|
|
58
|
+
"image/svg+xml",
|
|
59
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
60
|
+
"application/vnd.ms-excel",
|
|
61
|
+
"application/vnd.ms-excel.sheet.macroEnabled.12",
|
|
62
|
+
"application/vnd.ms-excel.sheet.binary.macroEnabled.12",
|
|
63
|
+
"application/vnd.ms-excel.addin.macroEnabled.12",
|
|
64
|
+
"application/vnd.ms-excel.template.macroEnabled.12",
|
|
65
|
+
"application/vnd.oasis.opendocument.spreadsheet",
|
|
66
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
67
|
+
"application/vnd.oasis.opendocument.text",
|
|
68
|
+
"image/bmp",
|
|
69
|
+
"image/gif",
|
|
70
|
+
"image/jp2",
|
|
71
|
+
"image/jpeg",
|
|
72
|
+
"image/jpm",
|
|
73
|
+
"image/jpx",
|
|
74
|
+
"image/mj2",
|
|
75
|
+
"image/pjpeg",
|
|
76
|
+
"image/png",
|
|
77
|
+
"image/tiff",
|
|
78
|
+
"image/webp",
|
|
79
|
+
"image/x-bmp",
|
|
80
|
+
"image/x-ms-bmp",
|
|
81
|
+
"image/x-portable-anymap",
|
|
82
|
+
"image/x-portable-bitmap",
|
|
83
|
+
"image/x-portable-graymap",
|
|
84
|
+
"image/x-portable-pixmap",
|
|
85
|
+
"image/x-tiff",
|
|
86
|
+
"application/csl+json",
|
|
87
|
+
"application/docbook+xml",
|
|
88
|
+
"application/epub+zip",
|
|
89
|
+
"application/rtf",
|
|
90
|
+
"application/x-biblatex",
|
|
91
|
+
"application/x-bibtex",
|
|
92
|
+
"application/x-endnote+xml",
|
|
93
|
+
"application/x-fictionbook+xml",
|
|
94
|
+
"application/x-ipynb+json",
|
|
95
|
+
"application/x-jats+xml",
|
|
96
|
+
"application/x-latex",
|
|
97
|
+
"application/xml+opml",
|
|
98
|
+
"application/x-opml+xml",
|
|
99
|
+
"application/x-research-info-systems",
|
|
100
|
+
"application/x-typst",
|
|
101
|
+
"text/csv",
|
|
102
|
+
"text/tab-separated-values",
|
|
103
|
+
"text/troff",
|
|
104
|
+
"text/x-commonmark",
|
|
105
|
+
"text/x-dokuwiki",
|
|
106
|
+
"text/x-gfm",
|
|
107
|
+
"text/x-markdown-extra",
|
|
108
|
+
"text/x-mdoc",
|
|
109
|
+
"text/x-multimarkdown",
|
|
110
|
+
"text/x-opml",
|
|
111
|
+
"text/x-org",
|
|
112
|
+
"text/x-pod",
|
|
113
|
+
"text/x-rst",
|
|
114
|
+
"application/zip",
|
|
115
|
+
"application/x-zip-compressed",
|
|
116
|
+
"application/x-tar",
|
|
117
|
+
"application/tar",
|
|
118
|
+
"application/x-gtar",
|
|
119
|
+
"application/x-ustar",
|
|
120
|
+
"application/gzip",
|
|
121
|
+
"application/x-7z-compressed",
|
|
122
|
+
];
|
|
123
|
+
|
|
124
|
+
for mime_type in mime_types {
|
|
125
|
+
self.pool.insert(mime_type.to_string(), Arc::new(mime_type.to_string()));
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
let _ = self
|
|
129
|
+
.initialized
|
|
130
|
+
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/// Get or intern a MIME type string.
|
|
134
|
+
/// Ensures pre-interned MIME types are initialized on first call.
|
|
135
|
+
pub(super) fn get_or_intern(&self, mime_type: &str) -> Arc<String> {
|
|
136
|
+
self.ensure_initialized();
|
|
137
|
+
|
|
138
|
+
if let Some(entry) = self.pool.get(mime_type) {
|
|
139
|
+
Arc::clone(&*entry)
|
|
140
|
+
} else {
|
|
141
|
+
let arc_string = Arc::new(mime_type.to_string());
|
|
142
|
+
self.pool.insert(mime_type.to_string(), Arc::clone(&arc_string));
|
|
143
|
+
arc_string
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Global MIME type string pool.
///
/// A single shared `MimeStringPool`, constructed with `MimeStringPool::new` and
/// wrapped in `Lazy` (presumably built on first access — confirm against the
/// `Lazy` type imported by this file). `intern_mime_type` performs all of its
/// lookups through this instance.
pub(super) static MIME_POOL: Lazy<MimeStringPool> = Lazy::new(MimeStringPool::new);
|
|
150
|
+
|
|
151
|
+
/// Get or intern a MIME type string.
|
|
152
|
+
///
|
|
153
|
+
/// Returns an `InternedString` that is guaranteed to be deduplicated with any other
|
|
154
|
+
/// intern call for the same MIME type. This reduces memory usage and allows
|
|
155
|
+
/// fast pointer-based comparisons.
|
|
156
|
+
///
|
|
157
|
+
/// # Arguments
|
|
158
|
+
///
|
|
159
|
+
/// * `mime_type` - The MIME type string to intern
|
|
160
|
+
///
|
|
161
|
+
/// # Returns
|
|
162
|
+
///
|
|
163
|
+
/// An `InternedString` pointing to the deduplicated string
|
|
164
|
+
///
|
|
165
|
+
/// # Example
|
|
166
|
+
///
|
|
167
|
+
/// ```rust,ignore
|
|
168
|
+
/// let pdf1 = intern_mime_type("application/pdf");
|
|
169
|
+
/// let pdf2 = intern_mime_type("application/pdf");
|
|
170
|
+
/// assert_eq!(pdf1, pdf2); // Same pointer
|
|
171
|
+
/// ```
|
|
172
|
+
pub fn intern_mime_type(mime_type: &str) -> InternedString {
|
|
173
|
+
InternedString(MIME_POOL.get_or_intern(mime_type))
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Interning one MIME type twice gives equal handles backed by the same `Arc`.
    #[test]
    fn test_mime_type_deduplication() {
        let first = intern_mime_type("application/pdf");
        let second = intern_mime_type("application/pdf");

        assert_eq!(first, second);
        assert!(Arc::ptr_eq(&first.0, &second.0));
    }

    /// Common MIME types round-trip unchanged through the pool.
    #[test]
    fn test_preinterned_mime_types() {
        for expected in ["application/pdf", "text/html", "application/json"] {
            let interned = intern_mime_type(expected);
            assert_eq!(interned.as_str(), expected);
        }
    }

    /// Ten threads interning the same MIME type should all observe one shared `Arc`.
    #[test]
    #[ignore = "Flaky test - concurrent interning may not always share the same Arc"]
    fn test_concurrent_interning() {
        use std::thread;

        let mime = "application/pdf";

        // Start every worker before joining any of them so the calls can overlap.
        let workers: Vec<_> = (0..10)
            .map(|_| thread::spawn(move || intern_mime_type(mime)))
            .collect();

        let interned_strings: Vec<_> = workers
            .into_iter()
            .map(|worker| worker.join().unwrap())
            .collect();
        assert_eq!(interned_strings.len(), 10);

        let first_arc = &interned_strings[0].0;
        for interned in &interned_strings {
            assert!(
                Arc::ptr_eq(&interned.0, first_arc),
                "All interned strings should share the same Arc"
            );
        }
    }
}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
//! String interning/pooling for frequently used strings.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides thread-safe string interning to reduce memory allocations
|
|
4
|
+
//! for strings that appear repeatedly across documents (MIME types, language codes, format field names).
|
|
5
|
+
//!
|
|
6
|
+
//! # Performance
|
|
7
|
+
//!
|
|
8
|
+
//! String interning provides 0.1-0.3% improvement by:
|
|
9
|
+
//! - Deduplicating repeated strings (e.g., "application/pdf" appears 1000s of times)
|
|
10
|
+
//! - Reducing allocation overhead for commonly used strings
|
|
11
|
+
//! - Enabling pointer comparisons instead of string comparisons
|
|
12
|
+
//!
|
|
13
|
+
//! # Thread Safety
|
|
14
|
+
//!
|
|
15
|
+
//! The intern pool uses a `DashMap` for sharded concurrent access. Multiple threads
|
|
16
|
+
//! can insert and look up strings simultaneously with minimal contention.
|
|
17
|
+
//!
|
|
18
|
+
//! # Example
|
|
19
|
+
//!
|
|
20
|
+
//! ```rust,ignore
|
|
21
|
+
//! use kreuzberg::utils::string_pool::intern_mime_type;
|
|
22
|
+
//!
|
|
23
|
+
//! let mime1 = intern_mime_type("application/pdf");
|
|
24
|
+
//! let mime2 = intern_mime_type("application/pdf");
|
|
25
|
+
//! // Both mime1 and mime2 point to the same interned string
|
|
26
|
+
//! assert_eq!(mime1, mime2);
|
|
27
|
+
//! ```
|
|
28
|
+
|
|
29
|
+
mod buffer_pool;
|
|
30
|
+
mod interned;
|
|
31
|
+
mod language_pool;
|
|
32
|
+
mod mime_pool;
|
|
33
|
+
|
|
34
|
+
// Re-export public types and functions
|
|
35
|
+
pub use buffer_pool::{PoolConfig, PooledString, STRING_BUFFER_POOL, StringBufferPool, acquire_string_buffer};
|
|
36
|
+
pub use interned::InternedString;
|
|
37
|
+
pub use language_pool::intern_language_code;
|
|
38
|
+
pub use mime_pool::intern_mime_type;
|
|
39
|
+
|
|
40
|
+
#[cfg(feature = "pool-metrics")]
|
|
41
|
+
pub use buffer_pool::StringBufferPoolMetrics;
|