kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
//! Cache utilities for key generation and disk space management.
|
|
2
|
+
|
|
3
|
+
use crate::error::Result;
|
|
4
|
+
use ahash::AHasher;
|
|
5
|
+
use std::hash::{Hash, Hasher};
|
|
6
|
+
|
|
7
|
+
#[cfg(unix)]
|
|
8
|
+
use crate::error::KreuzbergError;
|
|
9
|
+
#[cfg(unix)]
|
|
10
|
+
use std::path::Path;
|
|
11
|
+
|
|
12
|
+
/// Width, in characters, of a formatted cache key. A 64-bit hash needs only
/// 16 hex digits; the formatted key is left-padded with zeros to this width.
|
|
13
|
+
const CACHE_KEY_HASH_WIDTH: usize = 32;
|
|
14
|
+
|
|
15
|
+
/// Generate a deterministic cache key from configuration parameters.
|
|
16
|
+
///
|
|
17
|
+
/// # Algorithm
|
|
18
|
+
///
|
|
19
|
+
/// Uses ahash (non-cryptographic 64-bit hash) for performance. Cache keys are
|
|
20
|
+
/// generated by:
|
|
21
|
+
/// 1. Sorting key-value pairs by key (for determinism)
|
|
22
|
+
/// 2. Concatenating as "key1=val1&key2=val2&..."
|
|
23
|
+
/// 3. Hashing with ahash and formatting as 32-character hex
|
|
24
|
+
///
|
|
25
|
+
/// # Collision Probability
|
|
26
|
+
///
|
|
27
|
+
/// AHash produces 64-bit hashes, leading to birthday paradox collisions:
|
|
28
|
+
/// - **~0.01%** probability at 1 million cache entries
|
|
29
|
+
/// - **~1%** probability at 100 million entries
|
|
30
|
+
/// - **~50%** probability at 4.3 billion (2^32) entries
|
|
31
|
+
///
|
|
32
|
+
/// For context: P(collision) ≈ n^2 / (2 * 2^64) where n = number of entries.
|
|
33
|
+
///
|
|
34
|
+
/// # Performance vs Security Trade-off
|
|
35
|
+
///
|
|
36
|
+
/// - **ahash**: ~10x faster than SHA256, sufficient for cache keys
|
|
37
|
+
/// - **SHA256**: Collision-resistant but overkill for caching
|
|
38
|
+
/// - **Practical risk**: Low for typical usage (< 1M entries)
|
|
39
|
+
///
|
|
40
|
+
/// # Impact of Collisions
|
|
41
|
+
///
|
|
42
|
+
/// If two different configurations hash to the same key:
|
|
43
|
+
/// - One configuration reads the other's cached data
|
|
44
|
+
/// - Results in incorrect data served from cache
|
|
45
|
+
/// - Detected via metadata validation (size/mtime checks)
|
|
46
|
+
///
|
|
47
|
+
/// # Recommendations
|
|
48
|
+
///
|
|
49
|
+
/// - **< 1M entries**: ahash is safe and fast
|
|
50
|
+
/// - **> 100M entries**: Monitor cache size, consider periodic clearing
|
|
51
|
+
/// - **Critical data**: If collision risk is unacceptable, add SHA256 option
|
|
52
|
+
///
|
|
53
|
+
/// # Example
|
|
54
|
+
///
|
|
55
|
+
/// ```rust
|
|
56
|
+
/// use kreuzberg::cache::generate_cache_key;
|
|
57
|
+
///
|
|
58
|
+
/// let parts = [("format", "pdf"), ("ocr", "true"), ("lang", "en")];
|
|
59
|
+
/// let key = generate_cache_key(&parts);
|
|
60
|
+
/// assert_eq!(key.len(), 32); // 64-bit hash as hex
|
|
61
|
+
/// ```
|
|
62
|
+
pub fn generate_cache_key(parts: &[(&str, &str)]) -> String {
|
|
63
|
+
if parts.is_empty() {
|
|
64
|
+
return "empty".to_string();
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
let mut sorted_parts: Vec<_> = parts.to_vec();
|
|
68
|
+
sorted_parts.sort_by_key(|(k, _)| *k);
|
|
69
|
+
|
|
70
|
+
let estimated_size = sorted_parts.iter().map(|(k, v)| k.len() + v.len() + 2).sum::<usize>();
|
|
71
|
+
let mut cache_str = String::with_capacity(estimated_size);
|
|
72
|
+
|
|
73
|
+
for (i, (key, val)) in sorted_parts.iter().enumerate() {
|
|
74
|
+
if i > 0 {
|
|
75
|
+
cache_str.push('&');
|
|
76
|
+
}
|
|
77
|
+
cache_str.push_str(&format!("{}={}", key, val));
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
let mut hasher = AHasher::default();
|
|
81
|
+
cache_str.hash(&mut hasher);
|
|
82
|
+
let hash = hasher.finish();
|
|
83
|
+
|
|
84
|
+
format!("{:0width$x}", hash, width = CACHE_KEY_HASH_WIDTH)
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
#[allow(unsafe_code)]
|
|
88
|
+
pub fn get_available_disk_space(path: &str) -> Result<f64> {
|
|
89
|
+
#[cfg(unix)]
|
|
90
|
+
{
|
|
91
|
+
let path = Path::new(path);
|
|
92
|
+
let check_path = if path.exists() {
|
|
93
|
+
path
|
|
94
|
+
} else if let Some(parent) = path.parent() {
|
|
95
|
+
parent
|
|
96
|
+
} else {
|
|
97
|
+
Path::new("/")
|
|
98
|
+
};
|
|
99
|
+
|
|
100
|
+
use libc::{statvfs, statvfs as statvfs_struct};
|
|
101
|
+
use std::ffi::CString;
|
|
102
|
+
|
|
103
|
+
let path_str = check_path
|
|
104
|
+
.to_str()
|
|
105
|
+
.ok_or_else(|| KreuzbergError::validation("Path contains invalid UTF-8".to_string()))?;
|
|
106
|
+
let c_path = CString::new(path_str).map_err(|e| KreuzbergError::validation(format!("Invalid path: {}", e)))?;
|
|
107
|
+
|
|
108
|
+
let mut stat: statvfs_struct = unsafe { std::mem::zeroed() };
|
|
109
|
+
|
|
110
|
+
let result = unsafe { statvfs(c_path.as_ptr(), &mut stat) };
|
|
111
|
+
|
|
112
|
+
if result == 0 {
|
|
113
|
+
#[allow(clippy::unnecessary_cast)]
|
|
114
|
+
let available_bytes = stat.f_bavail as u64 * stat.f_frsize as u64;
|
|
115
|
+
Ok(available_bytes as f64 / (1024.0 * 1024.0))
|
|
116
|
+
} else {
|
|
117
|
+
tracing::debug!("Failed to get disk stats for {}: errno {}", path_str, result);
|
|
118
|
+
Ok(10000.0)
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[cfg(not(unix))]
|
|
123
|
+
{
|
|
124
|
+
let _ = path;
|
|
125
|
+
Ok(10000.0)
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
pub fn fast_hash(data: &[u8]) -> u64 {
|
|
130
|
+
let mut hasher = AHasher::default();
|
|
131
|
+
data.hash(&mut hasher);
|
|
132
|
+
hasher.finish()
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
/// Check whether `key` has the shape of a hashed cache key: exactly 32 bytes
/// long, every one of them an ASCII hex digit (either case).
pub fn validate_cache_key(key: &str) -> bool {
    if key.len() != 32 {
        return false;
    }
    // Any non-ASCII character contributes bytes >= 0x80, none of which is a
    // hex digit, so a byte-wise scan gives the same answer as a char-wise one.
    key.bytes().all(|byte| byte.is_ascii_hexdigit())
}
|
|
138
|
+
|
|
139
|
+
/// Return the indices of the entries in `cache_times` whose age
/// (`current_time - time`) is strictly greater than `max_age_seconds`.
///
/// Indices are returned in ascending order. An entry whose age is exactly
/// `max_age_seconds` is not included.
pub fn filter_old_cache_entries(cache_times: &[f64], current_time: f64, max_age_seconds: f64) -> Vec<usize> {
    let mut stale = Vec::new();
    for (position, timestamp) in cache_times.iter().enumerate() {
        let age = current_time - *timestamp;
        if age > max_age_seconds {
            stale.push(position);
        }
    }
    stale
}
|
|
152
|
+
|
|
153
|
+
/// Return the entry names ordered by ascending access time (the `f64` in each
/// pair), i.e. least recently accessed first.
///
/// Times are compared with `f64::total_cmp`, which is a total order: entries
/// with equal times keep their input order, and a NaN time sorts after every
/// number (or before, if its sign bit is set) instead of comparing "equal" to
/// everything, which would hand the sort an inconsistent comparator.
pub fn sort_cache_by_access_time(mut entries: Vec<(String, f64)>) -> Vec<String> {
    entries.sort_by(|a, b| a.1.total_cmp(&b.1));
    entries.into_iter().map(|(key, _)| key).collect()
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
//! Page boundary handling and page range calculation for chunked text.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functions to track which pages text chunks span,
|
|
4
|
+
//! enabling accurate page-level metadata for document processing.
|
|
5
|
+
|
|
6
|
+
use crate::error::{KreuzbergError, Result};
|
|
7
|
+
use crate::types::PageBoundary;
|
|
8
|
+
|
|
9
|
+
/// Validates the consistency and correctness of page boundaries.
|
|
10
|
+
///
|
|
11
|
+
/// # Validation Rules
|
|
12
|
+
///
|
|
13
|
+
/// 1. Boundaries must be sorted by byte_start (monotonically increasing)
|
|
14
|
+
/// 2. Boundaries must not overlap (byte_end[i] <= byte_start[i+1])
|
|
15
|
+
/// 3. Each boundary must have byte_start < byte_end
|
|
16
|
+
///
|
|
17
|
+
/// # Arguments
|
|
18
|
+
///
|
|
19
|
+
/// * `boundaries` - Page boundary markers to validate
|
|
20
|
+
///
|
|
21
|
+
/// # Returns
|
|
22
|
+
///
|
|
23
|
+
/// Returns `Ok(())` if all boundaries are valid.
|
|
24
|
+
/// Returns `KreuzbergError::Validation` if any boundary is invalid.
|
|
25
|
+
pub fn validate_page_boundaries(boundaries: &[PageBoundary]) -> Result<()> {
|
|
26
|
+
if boundaries.is_empty() {
|
|
27
|
+
return Ok(());
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
for (idx, boundary) in boundaries.iter().enumerate() {
|
|
31
|
+
if boundary.byte_start >= boundary.byte_end {
|
|
32
|
+
return Err(KreuzbergError::validation(format!(
|
|
33
|
+
"Invalid boundary range at index {}: byte_start ({}) must be < byte_end ({})",
|
|
34
|
+
idx, boundary.byte_start, boundary.byte_end
|
|
35
|
+
)));
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
for i in 0..boundaries.len() - 1 {
|
|
40
|
+
let current = &boundaries[i];
|
|
41
|
+
let next = &boundaries[i + 1];
|
|
42
|
+
|
|
43
|
+
if current.byte_start > next.byte_start {
|
|
44
|
+
return Err(KreuzbergError::validation(format!(
|
|
45
|
+
"Page boundaries not sorted: boundary at index {} (byte_start={}) comes after boundary at index {} (byte_start={})",
|
|
46
|
+
i,
|
|
47
|
+
current.byte_start,
|
|
48
|
+
i + 1,
|
|
49
|
+
next.byte_start
|
|
50
|
+
)));
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
if current.byte_end > next.byte_start {
|
|
54
|
+
return Err(KreuzbergError::validation(format!(
|
|
55
|
+
"Overlapping page boundaries: boundary {} ends at {} but boundary {} starts at {}",
|
|
56
|
+
i,
|
|
57
|
+
current.byte_end,
|
|
58
|
+
i + 1,
|
|
59
|
+
next.byte_start
|
|
60
|
+
)));
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
Ok(())
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/// Calculate which pages a byte range spans.
|
|
68
|
+
///
|
|
69
|
+
/// # Arguments
|
|
70
|
+
///
|
|
71
|
+
/// * `byte_start` - Starting byte offset of the chunk
|
|
72
|
+
/// * `byte_end` - Ending byte offset of the chunk
|
|
73
|
+
/// * `boundaries` - Page boundary markers from the document
|
|
74
|
+
///
|
|
75
|
+
/// # Returns
|
|
76
|
+
///
|
|
77
|
+
/// A tuple of (first_page, last_page) where page numbers are 1-indexed.
|
|
78
|
+
/// Returns (None, None) if boundaries are empty or chunk doesn't overlap any page.
|
|
79
|
+
///
|
|
80
|
+
/// # Errors
|
|
81
|
+
///
|
|
82
|
+
/// Returns `KreuzbergError::Validation` if boundaries are invalid.
|
|
83
|
+
///
|
|
84
|
+
/// # Examples
|
|
85
|
+
///
|
|
86
|
+
/// ```rust,ignore
|
|
87
|
+
/// use kreuzberg::chunking::boundaries::calculate_page_range;
|
|
88
|
+
/// use kreuzberg::types::PageBoundary;
|
|
89
|
+
///
|
|
90
|
+
/// let boundaries = vec![
|
|
91
|
+
/// PageBoundary { byte_start: 0, byte_end: 100, page_number: 1 },
|
|
92
|
+
/// PageBoundary { byte_start: 100, byte_end: 200, page_number: 2 },
|
|
93
|
+
/// ];
|
|
94
|
+
///
|
|
95
|
+
/// let (first, last) = calculate_page_range(50, 150, &boundaries)?;
|
|
96
|
+
/// assert_eq!(first, Some(1));
|
|
97
|
+
/// assert_eq!(last, Some(2));
|
|
98
|
+
/// # Ok::<(), kreuzberg::Result<()>>(())
|
|
99
|
+
/// ```
|
|
100
|
+
pub fn calculate_page_range(
|
|
101
|
+
byte_start: usize,
|
|
102
|
+
byte_end: usize,
|
|
103
|
+
boundaries: &[PageBoundary],
|
|
104
|
+
) -> Result<(Option<usize>, Option<usize>)> {
|
|
105
|
+
if boundaries.is_empty() {
|
|
106
|
+
return Ok((None, None));
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
validate_page_boundaries(boundaries)?;
|
|
110
|
+
|
|
111
|
+
let mut first_page = None;
|
|
112
|
+
let mut last_page = None;
|
|
113
|
+
|
|
114
|
+
for boundary in boundaries {
|
|
115
|
+
if byte_start < boundary.byte_end && byte_end > boundary.byte_start {
|
|
116
|
+
if first_page.is_none() {
|
|
117
|
+
first_page = Some(boundary.page_number);
|
|
118
|
+
}
|
|
119
|
+
last_page = Some(boundary.page_number);
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
|
|
123
|
+
Ok((first_page, last_page))
|
|
124
|
+
}
|
|
125
|
+
|
|
126
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `PageBoundary` fixture from its three fields.
    fn page(byte_start: usize, byte_end: usize, page_number: usize) -> PageBoundary {
        PageBoundary {
            byte_start,
            byte_end,
            page_number,
        }
    }

    #[test]
    fn test_validate_page_boundaries_valid() {
        let boundaries = vec![page(0, 20, 1), page(20, 40, 2), page(40, 60, 3)];

        assert!(validate_page_boundaries(&boundaries).is_ok());
    }

    #[test]
    fn test_validate_page_boundaries_empty() {
        let boundaries: Vec<PageBoundary> = Vec::new();

        assert!(validate_page_boundaries(&boundaries).is_ok());
    }

    #[test]
    fn test_calculate_page_range_within_page() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2)];

        let range = calculate_page_range(10, 50, &boundaries).unwrap();
        assert_eq!(range, (Some(1), Some(1)));
    }

    #[test]
    fn test_calculate_page_range_spanning_pages() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2)];

        let range = calculate_page_range(50, 150, &boundaries).unwrap();
        assert_eq!(range, (Some(1), Some(2)));
    }

    #[test]
    fn test_calculate_page_range_empty_boundaries() {
        let boundaries: Vec<PageBoundary> = Vec::new();

        let range = calculate_page_range(0, 50, &boundaries).unwrap();
        assert_eq!(range, (None, None));
    }

    #[test]
    fn test_calculate_page_range_no_overlap() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2)];

        // The chunk starts exactly where the last page ends, so nothing overlaps.
        let range = calculate_page_range(200, 250, &boundaries).unwrap();
        assert_eq!(range, (None, None));
    }

    #[test]
    fn test_calculate_page_range_three_pages() {
        let boundaries = vec![page(0, 100, 1), page(100, 200, 2), page(200, 300, 3)];

        let range = calculate_page_range(50, 250, &boundaries).unwrap();
        assert_eq!(range, (Some(1), Some(3)));
    }

    #[test]
    fn test_calculate_page_range_with_invalid_boundaries() {
        // byte_start > byte_end is rejected by validation.
        let boundaries = vec![page(15, 10, 1)];

        let outcome = calculate_page_range(0, 20, &boundaries);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid boundary range"));
    }

    #[test]
    fn test_page_boundaries_with_gaps() {
        // Bytes 10..15 belong to no page; gaps are allowed.
        let boundaries = vec![page(0, 10, 1), page(15, 25, 2)];

        assert!(validate_page_boundaries(&boundaries).is_ok());
    }

    #[test]
    fn test_chunk_with_same_start_and_end() {
        // A zero-length boundary is rejected.
        let boundaries = vec![page(10, 10, 1)];

        let outcome = validate_page_boundaries(&boundaries);
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid boundary range"));
    }
}
|