kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
//! API server startup functions.
|
|
2
|
+
|
|
3
|
+
use std::net::{IpAddr, SocketAddr};
|
|
4
|
+
|
|
5
|
+
use crate::{ExtractionConfig, Result, core::ServerConfig};
|
|
6
|
+
|
|
7
|
+
use super::{config::load_server_config, router::create_router_with_limits_and_server_config, types::ApiSizeLimits};
|
|
8
|
+
|
|
9
|
+
/// Start the API server with config file discovery.
|
|
10
|
+
///
|
|
11
|
+
/// Searches for kreuzberg.toml/yaml/json in current and parent directories.
|
|
12
|
+
/// If no config file is found, uses default configuration.
|
|
13
|
+
///
|
|
14
|
+
/// # Arguments
|
|
15
|
+
///
|
|
16
|
+
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
17
|
+
/// * `port` - Port number to bind to (e.g., 8000)
|
|
18
|
+
///
|
|
19
|
+
/// # Examples
|
|
20
|
+
///
|
|
21
|
+
/// ```no_run
|
|
22
|
+
/// use kreuzberg::api::serve;
|
|
23
|
+
///
|
|
24
|
+
/// #[tokio::main]
|
|
25
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
26
|
+
/// // Local development
|
|
27
|
+
/// serve("127.0.0.1", 8000).await?;
|
|
28
|
+
/// Ok(())
|
|
29
|
+
/// }
|
|
30
|
+
/// ```
|
|
31
|
+
///
|
|
32
|
+
/// ```no_run
|
|
33
|
+
/// use kreuzberg::api::serve;
|
|
34
|
+
///
|
|
35
|
+
/// #[tokio::main]
|
|
36
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
37
|
+
/// // Docker/production (listen on all interfaces)
|
|
38
|
+
/// serve("0.0.0.0", 8000).await?;
|
|
39
|
+
/// Ok(())
|
|
40
|
+
/// }
|
|
41
|
+
/// ```
|
|
42
|
+
///
|
|
43
|
+
/// # Environment Variables
|
|
44
|
+
///
|
|
45
|
+
/// ```bash
|
|
46
|
+
/// # Python/Docker usage
|
|
47
|
+
/// export KREUZBERG_HOST=0.0.0.0
|
|
48
|
+
/// export KREUZBERG_PORT=8000
|
|
49
|
+
///
|
|
50
|
+
/// # CORS configuration (IMPORTANT for production security)
|
|
51
|
+
/// # Default: allows all origins (permits CSRF attacks)
|
|
52
|
+
/// # Production: set to comma-separated list of allowed origins
|
|
53
|
+
/// export KREUZBERG_CORS_ORIGINS="https://app.example.com,https://api.example.com"
|
|
54
|
+
///
|
|
55
|
+
/// # Upload size limits (default: 100 MB)
|
|
56
|
+
/// # Modern approach (in bytes):
|
|
57
|
+
/// export KREUZBERG_MAX_REQUEST_BODY_BYTES=104857600 # 100 MB
|
|
58
|
+
/// export KREUZBERG_MAX_MULTIPART_FIELD_BYTES=104857600 # 100 MB per file
|
|
59
|
+
///
|
|
60
|
+
/// # Legacy approach (in MB, applies to both limits):
|
|
61
|
+
/// export KREUZBERG_MAX_UPLOAD_SIZE_MB=100 # 100 MB
|
|
62
|
+
///
|
|
63
|
+
/// python -m kreuzberg.api
|
|
64
|
+
/// ```
|
|
65
|
+
pub async fn serve(host: impl AsRef<str>, port: u16) -> Result<()> {
|
|
66
|
+
let extraction_config = match ExtractionConfig::discover()? {
|
|
67
|
+
Some(config) => {
|
|
68
|
+
tracing::info!("Loaded extraction config from discovered file");
|
|
69
|
+
config
|
|
70
|
+
}
|
|
71
|
+
None => {
|
|
72
|
+
tracing::info!("No config file found, using default configuration");
|
|
73
|
+
ExtractionConfig::default()
|
|
74
|
+
}
|
|
75
|
+
};
|
|
76
|
+
|
|
77
|
+
let server_config = load_server_config(None)?;
|
|
78
|
+
let limits = ApiSizeLimits::new(
|
|
79
|
+
server_config.max_request_body_bytes,
|
|
80
|
+
server_config.max_multipart_field_bytes,
|
|
81
|
+
);
|
|
82
|
+
|
|
83
|
+
serve_with_config_and_limits(host, port, extraction_config, limits).await
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/// Start the API server with explicit config.
|
|
87
|
+
///
|
|
88
|
+
/// Uses default size limits (100 MB). For custom limits, use `serve_with_config_and_limits`.
|
|
89
|
+
///
|
|
90
|
+
/// # Arguments
|
|
91
|
+
///
|
|
92
|
+
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
93
|
+
/// * `port` - Port number to bind to (e.g., 8000)
|
|
94
|
+
/// * `config` - Default extraction configuration for all requests
|
|
95
|
+
///
|
|
96
|
+
/// # Examples
|
|
97
|
+
///
|
|
98
|
+
/// ```no_run
|
|
99
|
+
/// use kreuzberg::{ExtractionConfig, api::serve_with_config};
|
|
100
|
+
///
|
|
101
|
+
/// #[tokio::main]
|
|
102
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
103
|
+
/// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
|
|
104
|
+
/// serve_with_config("127.0.0.1", 8000, config).await?;
|
|
105
|
+
/// Ok(())
|
|
106
|
+
/// }
|
|
107
|
+
/// ```
|
|
108
|
+
pub async fn serve_with_config(host: impl AsRef<str>, port: u16, config: ExtractionConfig) -> Result<()> {
|
|
109
|
+
let limits = ApiSizeLimits::default();
|
|
110
|
+
tracing::info!(
|
|
111
|
+
"Upload size limit: 100 MB (default, {} bytes)",
|
|
112
|
+
limits.max_request_body_bytes
|
|
113
|
+
);
|
|
114
|
+
serve_with_config_and_limits(host, port, config, limits).await
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
/// Start the API server with explicit config and size limits.
|
|
118
|
+
///
|
|
119
|
+
/// # Arguments
|
|
120
|
+
///
|
|
121
|
+
/// * `host` - IP address to bind to (e.g., "127.0.0.1" or "0.0.0.0")
|
|
122
|
+
/// * `port` - Port number to bind to (e.g., 8000)
|
|
123
|
+
/// * `config` - Default extraction configuration for all requests
|
|
124
|
+
/// * `limits` - Size limits for request bodies and multipart uploads
|
|
125
|
+
///
|
|
126
|
+
/// # Examples
|
|
127
|
+
///
|
|
128
|
+
/// ```no_run
|
|
129
|
+
/// use kreuzberg::{ExtractionConfig, api::{serve_with_config_and_limits, ApiSizeLimits}};
|
|
130
|
+
///
|
|
131
|
+
/// #[tokio::main]
|
|
132
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
133
|
+
/// let config = ExtractionConfig::from_toml_file("config/kreuzberg.toml")?;
|
|
134
|
+
/// let limits = ApiSizeLimits::from_mb(200, 200);
|
|
135
|
+
/// serve_with_config_and_limits("127.0.0.1", 8000, config, limits).await?;
|
|
136
|
+
/// Ok(())
|
|
137
|
+
/// }
|
|
138
|
+
/// ```
|
|
139
|
+
pub async fn serve_with_config_and_limits(
|
|
140
|
+
host: impl AsRef<str>,
|
|
141
|
+
port: u16,
|
|
142
|
+
config: ExtractionConfig,
|
|
143
|
+
limits: ApiSizeLimits,
|
|
144
|
+
) -> Result<()> {
|
|
145
|
+
let ip: IpAddr = host
|
|
146
|
+
.as_ref()
|
|
147
|
+
.parse()
|
|
148
|
+
.map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
|
|
149
|
+
|
|
150
|
+
let server_config = ServerConfig {
|
|
151
|
+
host: host.as_ref().to_string(),
|
|
152
|
+
port,
|
|
153
|
+
max_request_body_bytes: limits.max_request_body_bytes,
|
|
154
|
+
max_multipart_field_bytes: limits.max_multipart_field_bytes,
|
|
155
|
+
..Default::default()
|
|
156
|
+
};
|
|
157
|
+
|
|
158
|
+
let addr = SocketAddr::new(ip, port);
|
|
159
|
+
let app = create_router_with_limits_and_server_config(config, limits, server_config);
|
|
160
|
+
|
|
161
|
+
tracing::info!("Starting Kreuzberg API server on http://{}:{}", ip, port);
|
|
162
|
+
|
|
163
|
+
let listener = tokio::net::TcpListener::bind(addr)
|
|
164
|
+
.await
|
|
165
|
+
.map_err(crate::error::KreuzbergError::Io)?;
|
|
166
|
+
|
|
167
|
+
axum::serve(listener, app)
|
|
168
|
+
.await
|
|
169
|
+
.map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
|
|
170
|
+
|
|
171
|
+
Ok(())
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
/// Start the API server with explicit extraction config and server config.
|
|
175
|
+
///
|
|
176
|
+
/// This function accepts a fully-configured ServerConfig, including CORS origins,
|
|
177
|
+
/// size limits, host, and port. It respects all ServerConfig fields without
|
|
178
|
+
/// re-parsing environment variables, making it ideal for CLI usage where
|
|
179
|
+
/// configuration precedence has already been applied.
|
|
180
|
+
///
|
|
181
|
+
/// # Arguments
|
|
182
|
+
///
|
|
183
|
+
/// * `extraction_config` - Default extraction configuration for all requests
|
|
184
|
+
/// * `server_config` - Server configuration including host, port, CORS, and size limits
|
|
185
|
+
///
|
|
186
|
+
/// # Examples
|
|
187
|
+
///
|
|
188
|
+
/// ```no_run
|
|
189
|
+
/// use kreuzberg::{ExtractionConfig, api::serve_with_server_config, core::ServerConfig};
|
|
190
|
+
///
|
|
191
|
+
/// #[tokio::main]
|
|
192
|
+
/// async fn main() -> kreuzberg::Result<()> {
|
|
193
|
+
/// let extraction_config = ExtractionConfig::default();
|
|
194
|
+
/// let mut server_config = ServerConfig::default();
|
|
195
|
+
/// server_config.host = "0.0.0.0".to_string();
|
|
196
|
+
/// server_config.port = 3000;
|
|
197
|
+
/// server_config.cors_origins = vec!["https://example.com".to_string()];
|
|
198
|
+
///
|
|
199
|
+
/// serve_with_server_config(extraction_config, server_config).await?;
|
|
200
|
+
/// Ok(())
|
|
201
|
+
/// }
|
|
202
|
+
/// ```
|
|
203
|
+
pub async fn serve_with_server_config(extraction_config: ExtractionConfig, server_config: ServerConfig) -> Result<()> {
|
|
204
|
+
let ip: IpAddr = server_config
|
|
205
|
+
.host
|
|
206
|
+
.parse()
|
|
207
|
+
.map_err(|e| crate::error::KreuzbergError::validation(format!("Invalid host address: {}", e)))?;
|
|
208
|
+
|
|
209
|
+
let limits = ApiSizeLimits::new(
|
|
210
|
+
server_config.max_request_body_bytes,
|
|
211
|
+
server_config.max_multipart_field_bytes,
|
|
212
|
+
);
|
|
213
|
+
|
|
214
|
+
let addr = SocketAddr::new(ip, server_config.port);
|
|
215
|
+
let app = create_router_with_limits_and_server_config(extraction_config, limits, server_config.clone());
|
|
216
|
+
|
|
217
|
+
tracing::info!(
|
|
218
|
+
"Starting Kreuzberg API server on http://{}:{} (request_body_limit={} MB, multipart_field_limit={} MB)",
|
|
219
|
+
ip,
|
|
220
|
+
server_config.port,
|
|
221
|
+
server_config.max_request_body_mb(),
|
|
222
|
+
server_config.max_multipart_field_mb()
|
|
223
|
+
);
|
|
224
|
+
|
|
225
|
+
let listener = tokio::net::TcpListener::bind(addr)
|
|
226
|
+
.await
|
|
227
|
+
.map_err(crate::error::KreuzbergError::Io)?;
|
|
228
|
+
|
|
229
|
+
axum::serve(listener, app)
|
|
230
|
+
.await
|
|
231
|
+
.map_err(|e| crate::error::KreuzbergError::Other(e.to_string()))?;
|
|
232
|
+
|
|
233
|
+
Ok(())
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
/// Start the API server with default host and port.
|
|
237
|
+
///
|
|
238
|
+
/// Defaults: host = "127.0.0.1", port = 8000
|
|
239
|
+
///
|
|
240
|
+
/// Uses config file discovery (searches current/parent directories for kreuzberg.toml/yaml/json).
|
|
241
|
+
pub async fn serve_default() -> Result<()> {
|
|
242
|
+
serve("127.0.0.1", 8000).await
|
|
243
|
+
}
|
|
@@ -204,3 +204,81 @@ pub struct EmbedResponse {
|
|
|
204
204
|
/// Number of embeddings generated
|
|
205
205
|
pub count: usize,
|
|
206
206
|
}
|
|
207
|
+
|
|
208
|
+
/// Chunker type used when a request does not specify one: `"text"`.
fn default_chunker_type() -> String {
    String::from("text")
}
|
|
212
|
+
|
|
213
|
+
/// Chunk request with text and configuration.
|
|
214
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
215
|
+
pub struct ChunkRequest {
|
|
216
|
+
/// Text to chunk
|
|
217
|
+
pub text: String,
|
|
218
|
+
/// Optional chunking configuration
|
|
219
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
220
|
+
pub config: Option<ChunkingConfigRequest>,
|
|
221
|
+
/// Chunker type (text or markdown)
|
|
222
|
+
#[serde(default = "default_chunker_type")]
|
|
223
|
+
pub chunker_type: String,
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
/// Chunking configuration request.
|
|
227
|
+
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
|
228
|
+
pub struct ChunkingConfigRequest {
|
|
229
|
+
/// Maximum characters per chunk
|
|
230
|
+
pub max_characters: Option<usize>,
|
|
231
|
+
/// Overlap between chunks in characters
|
|
232
|
+
pub overlap: Option<usize>,
|
|
233
|
+
/// Whether to trim whitespace
|
|
234
|
+
pub trim: Option<bool>,
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
/// Chunk response with chunks and metadata.
|
|
238
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
239
|
+
pub struct ChunkResponse {
|
|
240
|
+
/// List of chunks
|
|
241
|
+
pub chunks: Vec<ChunkItem>,
|
|
242
|
+
/// Total number of chunks
|
|
243
|
+
pub chunk_count: usize,
|
|
244
|
+
/// Configuration used for chunking
|
|
245
|
+
pub config: ChunkingConfigResponse,
|
|
246
|
+
/// Input text size in bytes
|
|
247
|
+
pub input_size_bytes: usize,
|
|
248
|
+
/// Chunker type used for chunking
|
|
249
|
+
pub chunker_type: String,
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
/// Individual chunk item with metadata.
|
|
253
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
254
|
+
pub struct ChunkItem {
|
|
255
|
+
/// Chunk content
|
|
256
|
+
pub content: String,
|
|
257
|
+
/// Byte offset start position
|
|
258
|
+
pub byte_start: usize,
|
|
259
|
+
/// Byte offset end position
|
|
260
|
+
pub byte_end: usize,
|
|
261
|
+
/// Index of this chunk (0-based)
|
|
262
|
+
pub chunk_index: usize,
|
|
263
|
+
/// Total number of chunks
|
|
264
|
+
pub total_chunks: usize,
|
|
265
|
+
/// First page number (optional, for PDF chunking)
|
|
266
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
267
|
+
pub first_page: Option<usize>,
|
|
268
|
+
/// Last page number (optional, for PDF chunking)
|
|
269
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
270
|
+
pub last_page: Option<usize>,
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
/// Chunking configuration response.
|
|
274
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
275
|
+
pub struct ChunkingConfigResponse {
|
|
276
|
+
/// Maximum characters per chunk
|
|
277
|
+
pub max_characters: usize,
|
|
278
|
+
/// Overlap between chunks in characters
|
|
279
|
+
pub overlap: usize,
|
|
280
|
+
/// Whether whitespace was trimmed
|
|
281
|
+
pub trim: bool,
|
|
282
|
+
/// Type of chunker used
|
|
283
|
+
pub chunker_type: String,
|
|
284
|
+
}
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
//! Cache cleanup operations for managing cache size and age.
|
|
2
|
+
|
|
3
|
+
use crate::error::{KreuzbergError, Result};
|
|
4
|
+
use std::fs;
|
|
5
|
+
use std::path::Path;
|
|
6
|
+
use std::time::{SystemTime, UNIX_EPOCH};
|
|
7
|
+
|
|
8
|
+
use super::core::{CacheEntry, CacheScanResult, CacheStats};
|
|
9
|
+
use super::utilities::get_available_disk_space;
|
|
10
|
+
|
|
11
|
+
pub(super) fn scan_cache_directory(cache_dir: &str) -> Result<CacheScanResult> {
|
|
12
|
+
let dir_path = Path::new(cache_dir);
|
|
13
|
+
|
|
14
|
+
if !dir_path.exists() {
|
|
15
|
+
return Ok(CacheScanResult {
|
|
16
|
+
stats: CacheStats {
|
|
17
|
+
total_files: 0,
|
|
18
|
+
total_size_mb: 0.0,
|
|
19
|
+
available_space_mb: get_available_disk_space(cache_dir)?,
|
|
20
|
+
oldest_file_age_days: 0.0,
|
|
21
|
+
newest_file_age_days: 0.0,
|
|
22
|
+
},
|
|
23
|
+
entries: Vec::new(),
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
let current_time = SystemTime::now()
|
|
28
|
+
.duration_since(UNIX_EPOCH)
|
|
29
|
+
.unwrap_or_default()
|
|
30
|
+
.as_secs() as f64;
|
|
31
|
+
|
|
32
|
+
let read_dir =
|
|
33
|
+
fs::read_dir(dir_path).map_err(|e| KreuzbergError::cache(format!("Failed to read cache directory: {}", e)))?;
|
|
34
|
+
|
|
35
|
+
let mut total_size = 0u64;
|
|
36
|
+
let mut oldest_age = 0.0f64;
|
|
37
|
+
let mut newest_age = f64::INFINITY;
|
|
38
|
+
let mut entries = Vec::new();
|
|
39
|
+
|
|
40
|
+
for entry in read_dir {
|
|
41
|
+
let entry = match entry {
|
|
42
|
+
Ok(e) => e,
|
|
43
|
+
Err(e) => {
|
|
44
|
+
tracing::debug!("Error reading cache entry: {}", e);
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
let metadata = match entry.metadata() {
|
|
50
|
+
Ok(m) if m.is_file() => m,
|
|
51
|
+
_ => continue,
|
|
52
|
+
};
|
|
53
|
+
|
|
54
|
+
let path = entry.path();
|
|
55
|
+
if path.extension().and_then(|s| s.to_str()) != Some("msgpack") {
|
|
56
|
+
continue;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
let modified = match metadata.modified() {
|
|
60
|
+
Ok(m) => m,
|
|
61
|
+
Err(e) => {
|
|
62
|
+
tracing::debug!("Error getting modification time for {:?}: {}", path, e);
|
|
63
|
+
continue;
|
|
64
|
+
}
|
|
65
|
+
};
|
|
66
|
+
|
|
67
|
+
let size = metadata.len();
|
|
68
|
+
total_size += size;
|
|
69
|
+
|
|
70
|
+
if let Ok(duration) = modified.duration_since(UNIX_EPOCH) {
|
|
71
|
+
let age_days = (current_time - duration.as_secs() as f64) / (24.0 * 3600.0);
|
|
72
|
+
oldest_age = oldest_age.max(age_days);
|
|
73
|
+
newest_age = newest_age.min(age_days);
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
entries.push(CacheEntry { path, size, modified });
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
if entries.is_empty() {
|
|
80
|
+
oldest_age = 0.0;
|
|
81
|
+
newest_age = 0.0;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
Ok(CacheScanResult {
|
|
85
|
+
stats: CacheStats {
|
|
86
|
+
total_files: entries.len(),
|
|
87
|
+
total_size_mb: total_size as f64 / (1024.0 * 1024.0),
|
|
88
|
+
available_space_mb: get_available_disk_space(cache_dir)?,
|
|
89
|
+
oldest_file_age_days: oldest_age,
|
|
90
|
+
newest_file_age_days: newest_age,
|
|
91
|
+
},
|
|
92
|
+
entries,
|
|
93
|
+
})
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
pub fn get_cache_metadata(cache_dir: &str) -> Result<CacheStats> {
|
|
97
|
+
let scan_result = scan_cache_directory(cache_dir)?;
|
|
98
|
+
Ok(scan_result.stats)
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
pub fn cleanup_cache(
|
|
102
|
+
cache_dir: &str,
|
|
103
|
+
max_age_days: f64,
|
|
104
|
+
max_size_mb: f64,
|
|
105
|
+
target_size_ratio: f64,
|
|
106
|
+
) -> Result<(usize, f64)> {
|
|
107
|
+
let scan_result = scan_cache_directory(cache_dir)?;
|
|
108
|
+
|
|
109
|
+
if scan_result.entries.is_empty() {
|
|
110
|
+
return Ok((0, 0.0));
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
let current_time = SystemTime::now()
|
|
114
|
+
.duration_since(UNIX_EPOCH)
|
|
115
|
+
.unwrap_or_default()
|
|
116
|
+
.as_secs() as f64;
|
|
117
|
+
let max_age_seconds = max_age_days * 24.0 * 3600.0;
|
|
118
|
+
|
|
119
|
+
let mut removed_count = 0;
|
|
120
|
+
let mut removed_size = 0.0;
|
|
121
|
+
let mut remaining_entries = Vec::new();
|
|
122
|
+
let mut total_remaining_size = 0u64;
|
|
123
|
+
|
|
124
|
+
for entry in scan_result.entries {
|
|
125
|
+
if let Ok(age) = entry.modified.duration_since(UNIX_EPOCH) {
|
|
126
|
+
let age_seconds = current_time - age.as_secs() as f64;
|
|
127
|
+
if age_seconds > max_age_seconds {
|
|
128
|
+
match fs::remove_file(&entry.path) {
|
|
129
|
+
Ok(_) => {
|
|
130
|
+
removed_count += 1;
|
|
131
|
+
removed_size += entry.size as f64 / (1024.0 * 1024.0);
|
|
132
|
+
}
|
|
133
|
+
Err(e) => {
|
|
134
|
+
tracing::debug!("Failed to remove {:?}: {}", entry.path, e);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
} else {
|
|
138
|
+
total_remaining_size += entry.size;
|
|
139
|
+
remaining_entries.push(entry);
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
let mut total_size_mb = total_remaining_size as f64 / (1024.0 * 1024.0);
|
|
145
|
+
|
|
146
|
+
if total_size_mb > max_size_mb {
|
|
147
|
+
remaining_entries.sort_by_key(|e| e.modified);
|
|
148
|
+
|
|
149
|
+
let target_size = max_size_mb * target_size_ratio;
|
|
150
|
+
|
|
151
|
+
for entry in remaining_entries {
|
|
152
|
+
if total_size_mb <= target_size {
|
|
153
|
+
break;
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
match fs::remove_file(&entry.path) {
|
|
157
|
+
Ok(_) => {
|
|
158
|
+
let size_mb = entry.size as f64 / (1024.0 * 1024.0);
|
|
159
|
+
removed_count += 1;
|
|
160
|
+
removed_size += size_mb;
|
|
161
|
+
total_size_mb -= size_mb;
|
|
162
|
+
}
|
|
163
|
+
Err(e) => {
|
|
164
|
+
tracing::debug!("Failed to remove {:?}: {}", entry.path, e);
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
Ok((removed_count, removed_size))
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
pub fn smart_cleanup_cache(
|
|
174
|
+
cache_dir: &str,
|
|
175
|
+
max_age_days: f64,
|
|
176
|
+
max_size_mb: f64,
|
|
177
|
+
min_free_space_mb: f64,
|
|
178
|
+
) -> Result<(usize, f64)> {
|
|
179
|
+
let stats = get_cache_metadata(cache_dir)?;
|
|
180
|
+
|
|
181
|
+
let needs_cleanup = stats.available_space_mb < min_free_space_mb
|
|
182
|
+
|| stats.total_size_mb > max_size_mb
|
|
183
|
+
|| stats.oldest_file_age_days > max_age_days;
|
|
184
|
+
|
|
185
|
+
if !needs_cleanup {
|
|
186
|
+
return Ok((0, 0.0));
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
let target_ratio = if stats.available_space_mb < min_free_space_mb {
|
|
190
|
+
0.5
|
|
191
|
+
} else {
|
|
192
|
+
0.8
|
|
193
|
+
};
|
|
194
|
+
|
|
195
|
+
cleanup_cache(cache_dir, max_age_days, max_size_mb, target_ratio)
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
/// Reports whether the cache file at `cache_path` is still fresh enough to use.
///
/// The file is valid when it exists, its modification time can be read, and its age
/// (in days, computed from whole elapsed seconds) does not exceed `max_age_days`.
///
/// Returns `false` when the file is missing, its metadata or modification time cannot
/// be read, or the modification time lies in the future relative to the system clock.
pub fn is_cache_valid(cache_path: &str, max_age_days: f64) -> bool {
    const SECONDS_PER_DAY: f64 = 24.0 * 3600.0;

    // `fs::metadata` already fails for a missing path, so no separate existence check
    // is needed (and none can race with the metadata lookup).
    fs::metadata(cache_path)
        .and_then(|metadata| metadata.modified())
        .ok()
        .and_then(|modified| SystemTime::now().duration_since(modified).ok())
        .map(|elapsed| elapsed.as_secs() as f64 / SECONDS_PER_DAY <= max_age_days)
        .unwrap_or(false)
}
|
|
219
|
+
|
|
220
|
+
pub fn clear_cache_directory(cache_dir: &str) -> Result<(usize, f64)> {
|
|
221
|
+
let dir_path = Path::new(cache_dir);
|
|
222
|
+
|
|
223
|
+
if !dir_path.exists() {
|
|
224
|
+
return Ok((0, 0.0));
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
let mut removed_count = 0;
|
|
228
|
+
let mut removed_size = 0.0;
|
|
229
|
+
|
|
230
|
+
let read_dir =
|
|
231
|
+
fs::read_dir(dir_path).map_err(|e| KreuzbergError::cache(format!("Failed to read cache directory: {}", e)))?;
|
|
232
|
+
|
|
233
|
+
for entry in read_dir {
|
|
234
|
+
let entry = match entry {
|
|
235
|
+
Ok(e) => e,
|
|
236
|
+
Err(e) => {
|
|
237
|
+
tracing::debug!("Error reading entry: {}", e);
|
|
238
|
+
continue;
|
|
239
|
+
}
|
|
240
|
+
};
|
|
241
|
+
|
|
242
|
+
let metadata = match entry.metadata() {
|
|
243
|
+
Ok(m) if m.is_file() => m,
|
|
244
|
+
_ => continue,
|
|
245
|
+
};
|
|
246
|
+
|
|
247
|
+
let path = entry.path();
|
|
248
|
+
if path.extension().and_then(|s| s.to_str()) != Some("msgpack") {
|
|
249
|
+
continue;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
let size_mb = metadata.len() as f64 / (1024.0 * 1024.0);
|
|
253
|
+
match fs::remove_file(&path) {
|
|
254
|
+
Ok(_) => {
|
|
255
|
+
removed_count += 1;
|
|
256
|
+
removed_size += size_mb;
|
|
257
|
+
}
|
|
258
|
+
Err(e) => {
|
|
259
|
+
tracing::debug!("Failed to remove {:?}: {}", path, e);
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
Ok((removed_count, removed_size))
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
pub fn batch_cleanup_caches(
|
|
268
|
+
cache_dirs: &[&str],
|
|
269
|
+
max_age_days: f64,
|
|
270
|
+
max_size_mb: f64,
|
|
271
|
+
min_free_space_mb: f64,
|
|
272
|
+
) -> Result<Vec<(usize, f64)>> {
|
|
273
|
+
cache_dirs
|
|
274
|
+
.iter()
|
|
275
|
+
.map(|dir| smart_cleanup_cache(dir, max_age_days, max_size_mb, min_free_space_mb))
|
|
276
|
+
.collect()
|
|
277
|
+
}
|