kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,312 @@
|
|
|
1
|
+
//! Post-processing and chunking configuration.
|
|
2
|
+
//!
|
|
3
|
+
//! Defines configuration for post-processing pipelines, text chunking,
|
|
4
|
+
//! and embedding generation.
|
|
5
|
+
|
|
6
|
+
use serde::{Deserialize, Serialize};
|
|
7
|
+
use std::collections::HashSet;
|
|
8
|
+
use std::path::PathBuf;
|
|
9
|
+
|
|
10
|
+
/// Post-processor configuration.
|
|
11
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
12
|
+
pub struct PostProcessorConfig {
|
|
13
|
+
/// Enable post-processors
|
|
14
|
+
#[serde(default = "default_true")]
|
|
15
|
+
pub enabled: bool,
|
|
16
|
+
|
|
17
|
+
/// Whitelist of processor names to run (None = all enabled)
|
|
18
|
+
#[serde(default)]
|
|
19
|
+
pub enabled_processors: Option<Vec<String>>,
|
|
20
|
+
|
|
21
|
+
/// Blacklist of processor names to skip (None = none disabled)
|
|
22
|
+
#[serde(default)]
|
|
23
|
+
pub disabled_processors: Option<Vec<String>>,
|
|
24
|
+
|
|
25
|
+
/// Pre-computed HashSet for O(1) enabled processor lookup
|
|
26
|
+
#[serde(skip)]
|
|
27
|
+
pub enabled_set: Option<HashSet<String>>,
|
|
28
|
+
|
|
29
|
+
/// Pre-computed HashSet for O(1) disabled processor lookup
|
|
30
|
+
#[serde(skip)]
|
|
31
|
+
pub disabled_set: Option<HashSet<String>>,
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
impl PostProcessorConfig {
|
|
35
|
+
/// Pre-compute HashSets for O(1) processor name lookups.
|
|
36
|
+
///
|
|
37
|
+
/// This method converts the enabled/disabled processor Vec to HashSet
|
|
38
|
+
/// for constant-time lookups in the pipeline.
|
|
39
|
+
pub fn build_lookup_sets(&mut self) {
|
|
40
|
+
if let Some(ref enabled) = self.enabled_processors {
|
|
41
|
+
self.enabled_set = Some(enabled.iter().cloned().collect());
|
|
42
|
+
}
|
|
43
|
+
if let Some(ref disabled) = self.disabled_processors {
|
|
44
|
+
self.disabled_set = Some(disabled.iter().cloned().collect());
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
impl Default for PostProcessorConfig {
|
|
50
|
+
fn default() -> Self {
|
|
51
|
+
Self {
|
|
52
|
+
enabled: true,
|
|
53
|
+
enabled_processors: None,
|
|
54
|
+
disabled_processors: None,
|
|
55
|
+
enabled_set: None,
|
|
56
|
+
disabled_set: None,
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Chunking configuration.
|
|
62
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
63
|
+
pub struct ChunkingConfig {
|
|
64
|
+
/// Maximum characters per chunk
|
|
65
|
+
#[serde(default = "default_chunk_size")]
|
|
66
|
+
pub max_chars: usize,
|
|
67
|
+
|
|
68
|
+
/// Overlap between chunks in characters
|
|
69
|
+
#[serde(default = "default_chunk_overlap")]
|
|
70
|
+
pub max_overlap: usize,
|
|
71
|
+
|
|
72
|
+
/// Optional embedding configuration for chunk embeddings
|
|
73
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
+
pub embedding: Option<EmbeddingConfig>,
|
|
75
|
+
|
|
76
|
+
/// Use a preset configuration (overrides individual settings if provided)
|
|
77
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
78
|
+
pub preset: Option<String>,
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Embedding configuration for text chunks.
|
|
82
|
+
///
|
|
83
|
+
/// Configures embedding generation using ONNX models via fastembed-rs.
|
|
84
|
+
/// Requires the `embeddings` feature to be enabled.
|
|
85
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
86
|
+
pub struct EmbeddingConfig {
|
|
87
|
+
/// The embedding model to use
|
|
88
|
+
pub model: EmbeddingModelType,
|
|
89
|
+
|
|
90
|
+
/// Whether to normalize embedding vectors (recommended for cosine similarity)
|
|
91
|
+
#[serde(default = "default_normalize")]
|
|
92
|
+
pub normalize: bool,
|
|
93
|
+
|
|
94
|
+
/// Batch size for embedding generation
|
|
95
|
+
#[serde(default = "default_batch_size")]
|
|
96
|
+
pub batch_size: usize,
|
|
97
|
+
|
|
98
|
+
/// Show model download progress
|
|
99
|
+
#[serde(default)]
|
|
100
|
+
pub show_download_progress: bool,
|
|
101
|
+
|
|
102
|
+
/// Custom cache directory for model files
|
|
103
|
+
///
|
|
104
|
+
/// Defaults to `~/.cache/kreuzberg/embeddings/` if not specified.
|
|
105
|
+
/// Allows full customization of model download location.
|
|
106
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
107
|
+
pub cache_dir: Option<PathBuf>,
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
impl Default for EmbeddingConfig {
|
|
111
|
+
fn default() -> Self {
|
|
112
|
+
Self {
|
|
113
|
+
model: EmbeddingModelType::Preset {
|
|
114
|
+
name: "balanced".to_string(),
|
|
115
|
+
},
|
|
116
|
+
normalize: true,
|
|
117
|
+
batch_size: 32,
|
|
118
|
+
show_download_progress: false,
|
|
119
|
+
cache_dir: None,
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
/// Embedding model types supported by Kreuzberg.
|
|
125
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
126
|
+
#[serde(tag = "type", rename_all = "snake_case")]
|
|
127
|
+
pub enum EmbeddingModelType {
|
|
128
|
+
/// Use a preset model configuration (recommended)
|
|
129
|
+
Preset { name: String },
|
|
130
|
+
|
|
131
|
+
/// Use a specific fastembed model by name
|
|
132
|
+
#[cfg(feature = "embeddings")]
|
|
133
|
+
FastEmbed { model: String, dimensions: usize },
|
|
134
|
+
|
|
135
|
+
/// Use a custom ONNX model from HuggingFace
|
|
136
|
+
Custom { model_id: String, dimensions: usize },
|
|
137
|
+
}
|
|
138
|
+
|
|
139
|
+
/// Serde default helper: always yields `true`.
fn default_true() -> bool {
    true
}
|
|
142
|
+
|
|
143
|
+
/// Serde default helper for `ChunkingConfig::max_chars`: 1000 characters.
fn default_chunk_size() -> usize {
    1_000
}
|
|
146
|
+
|
|
147
|
+
/// Serde default helper for `ChunkingConfig::max_overlap`: 200 characters.
fn default_chunk_overlap() -> usize {
    200
}
|
|
150
|
+
|
|
151
|
+
/// Serde default helper for `EmbeddingConfig::normalize`: normalization is on.
fn default_normalize() -> bool {
    true
}
|
|
154
|
+
|
|
155
|
+
/// Serde default helper for `EmbeddingConfig::batch_size`: 32 items per batch.
fn default_batch_size() -> usize {
    32
}
|
|
158
|
+
|
|
159
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// The default post-processor config is enabled with no filters and no built caches.
    #[test]
    fn test_postprocessor_config_default() {
        let config = PostProcessorConfig::default();
        assert!(config.enabled);
        assert!(config.enabled_processors.is_none());
        assert!(config.disabled_processors.is_none());
        assert!(config.enabled_set.is_none());
        assert!(config.disabled_set.is_none());
    }

    /// `build_lookup_sets` copies both name lists into their hash-set caches.
    #[test]
    fn test_postprocessor_config_build_lookup_sets() {
        let mut config = PostProcessorConfig {
            enabled: true,
            enabled_processors: Some(vec!["a".to_string(), "b".to_string()]),
            disabled_processors: Some(vec!["c".to_string()]),
            enabled_set: None,
            disabled_set: None,
        };

        config.build_lookup_sets();

        assert!(config.enabled_set.is_some());
        assert!(config.disabled_set.is_some());
        assert!(config.enabled_set.unwrap().contains("a"));
        assert!(config.disabled_set.unwrap().contains("c"));
    }

    /// An empty JSON object must pick up the serde defaults for every field.
    ///
    /// Deserializing `{}` (rather than building the struct from literals) is what
    /// actually exercises `default_chunk_size` and `default_chunk_overlap`.
    #[test]
    fn test_chunking_config_defaults() {
        let config: ChunkingConfig = serde_json::from_str("{}").unwrap();
        assert_eq!(config.max_chars, 1000);
        assert_eq!(config.max_overlap, 200);
        assert!(config.embedding.is_none());
        assert!(config.preset.is_none());
    }

    /// The default embedding config normalizes, uses batches of 32, and has no cache dir.
    #[test]
    fn test_embedding_config_default() {
        let config = EmbeddingConfig::default();
        assert!(config.normalize);
        assert_eq!(config.batch_size, 32);
        assert!(config.cache_dir.is_none());
    }

    /// Tests that EmbeddingModelType::Preset serializes with "type" field (internally-tagged).
    /// This validates the API schema matches the documented format:
    /// `{"type": "preset", "name": "fast"}` NOT `{"preset": {"name": "fast"}}`
    #[test]
    fn test_embedding_model_type_preset_serialization() {
        let model = EmbeddingModelType::Preset {
            name: "fast".to_string(),
        };
        let json = serde_json::to_string(&model).unwrap();

        // Should use internally-tagged format with "type" discriminator
        assert!(json.contains(r#""type":"preset""#), "Should contain type:preset field");
        assert!(json.contains(r#""name":"fast""#), "Should contain name:fast field");

        // Should NOT use externally-tagged format (variant name as the outer key)
        assert!(
            !json.contains(r#"{"preset":"#),
            "Should NOT use externally-tagged format"
        );
    }

    /// Tests that EmbeddingModelType::Preset deserializes from the documented API format.
    /// API documentation shows: `{"type": "preset", "name": "fast"}`
    #[test]
    fn test_embedding_model_type_preset_deserialization() {
        // This is the documented API format that users should send
        let json = r#"{"type": "preset", "name": "fast"}"#;
        let model: EmbeddingModelType = serde_json::from_str(json).unwrap();

        match model {
            EmbeddingModelType::Preset { name } => {
                assert_eq!(name, "fast");
            }
            _ => panic!("Expected Preset variant"),
        }
    }

    /// Tests that the wrong format (externally-tagged) is rejected.
    /// This ensures the API doesn't accept the old/wrong documentation format.
    #[test]
    fn test_embedding_model_type_rejects_wrong_format() {
        // This is the WRONG format that was in the old documentation
        let wrong_json = r#"{"preset": {"name": "fast"}}"#;
        let result: Result<EmbeddingModelType, _> = serde_json::from_str(wrong_json);

        // Should fail to parse - the wrong format should be rejected
        assert!(result.is_err(), "Should reject externally-tagged format");
    }

    /// Tests round-trip serialization/deserialization of EmbeddingConfig.
    #[test]
    fn test_embedding_config_roundtrip() {
        let config = EmbeddingConfig {
            model: EmbeddingModelType::Preset {
                name: "balanced".to_string(),
            },
            normalize: true,
            batch_size: 64,
            show_download_progress: false,
            cache_dir: None,
        };

        let json = serde_json::to_string(&config).unwrap();
        let deserialized: EmbeddingConfig = serde_json::from_str(&json).unwrap();

        match deserialized.model {
            EmbeddingModelType::Preset { name } => {
                assert_eq!(name, "balanced");
            }
            _ => panic!("Expected Preset variant"),
        }
        assert!(deserialized.normalize);
        assert_eq!(deserialized.batch_size, 64);
    }

    /// Tests Custom model type serialization format.
    #[test]
    fn test_embedding_model_type_custom_serialization() {
        let model = EmbeddingModelType::Custom {
            model_id: "sentence-transformers/all-MiniLM-L6-v2".to_string(),
            dimensions: 384,
        };
        let json = serde_json::to_string(&model).unwrap();

        assert!(json.contains(r#""type":"custom""#), "Should contain type:custom field");
        assert!(json.contains(r#""model_id":"#), "Should contain model_id field");
        assert!(json.contains(r#""dimensions":384"#), "Should contain dimensions field");
    }

    /// Tests Custom model type deserialization.
    #[test]
    fn test_embedding_model_type_custom_deserialization() {
        let json = r#"{"type": "custom", "model_id": "test/model", "dimensions": 512}"#;
        let model: EmbeddingModelType = serde_json::from_str(json).unwrap();

        match model {
            EmbeddingModelType::Custom { model_id, dimensions } => {
                assert_eq!(model_id, "test/model");
                assert_eq!(dimensions, 512);
            }
            _ => panic!("Expected Custom variant"),
        }
    }
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
//! Cross-section dependency validation.
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains validation functions that check dependencies and relationships
|
|
4
|
+
//! between different configuration sections. These validators ensure that related
|
|
5
|
+
//! configuration values are consistent and compatible with each other.
|
|
6
|
+
|
|
7
|
+
use crate::{KreuzbergError, Result};
|
|
8
|
+
|
|
9
|
+
/// Validate a port number for server configuration.
|
|
10
|
+
///
|
|
11
|
+
/// Port must be in the range 1-65535. While ports 1-1023 are privileged and may require
|
|
12
|
+
/// special permissions on some systems, they are still valid port numbers.
|
|
13
|
+
///
|
|
14
|
+
/// # Arguments
|
|
15
|
+
///
|
|
16
|
+
/// * `port` - The port number to validate
|
|
17
|
+
///
|
|
18
|
+
/// # Returns
|
|
19
|
+
///
|
|
20
|
+
/// `Ok(())` if the port is valid, or a `ValidationError` with details about valid ranges.
|
|
21
|
+
///
|
|
22
|
+
/// # Examples
|
|
23
|
+
///
|
|
24
|
+
/// ```rust
|
|
25
|
+
/// use kreuzberg::core::config_validation::validate_port;
|
|
26
|
+
///
|
|
27
|
+
/// assert!(validate_port(8000).is_ok());
|
|
28
|
+
/// assert!(validate_port(80).is_ok());
|
|
29
|
+
/// assert!(validate_port(1).is_ok());
|
|
30
|
+
/// assert!(validate_port(65535).is_ok());
|
|
31
|
+
/// assert!(validate_port(0).is_err());
|
|
32
|
+
/// ```
|
|
33
|
+
pub fn validate_port(port: u16) -> Result<()> {
|
|
34
|
+
if port > 0 {
|
|
35
|
+
Ok(())
|
|
36
|
+
} else {
|
|
37
|
+
Err(KreuzbergError::Validation {
|
|
38
|
+
message: format!("Port must be 1-65535, got {}", port),
|
|
39
|
+
source: None,
|
|
40
|
+
})
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/// Validate a host/IP address string for server configuration.
|
|
45
|
+
///
|
|
46
|
+
/// Accepts valid IPv4 addresses (e.g., "127.0.0.1", "0.0.0.0"), valid IPv6 addresses
|
|
47
|
+
/// (e.g., "::1", "::"), and hostnames (e.g., "localhost", "example.com").
|
|
48
|
+
///
|
|
49
|
+
/// # Arguments
|
|
50
|
+
///
|
|
51
|
+
/// * `host` - The host/IP address string to validate
|
|
52
|
+
///
|
|
53
|
+
/// # Returns
|
|
54
|
+
///
|
|
55
|
+
/// `Ok(())` if the host is valid, or a `ValidationError` with details about valid formats.
|
|
56
|
+
///
|
|
57
|
+
/// # Examples
|
|
58
|
+
///
|
|
59
|
+
/// ```rust
|
|
60
|
+
/// use kreuzberg::core::config_validation::validate_host;
|
|
61
|
+
///
|
|
62
|
+
/// assert!(validate_host("127.0.0.1").is_ok());
|
|
63
|
+
/// assert!(validate_host("0.0.0.0").is_ok());
|
|
64
|
+
/// assert!(validate_host("::1").is_ok());
|
|
65
|
+
/// assert!(validate_host("::").is_ok());
|
|
66
|
+
/// assert!(validate_host("localhost").is_ok());
|
|
67
|
+
/// assert!(validate_host("example.com").is_ok());
|
|
68
|
+
/// assert!(validate_host("").is_err());
|
|
69
|
+
/// ```
|
|
70
|
+
pub fn validate_host(host: &str) -> Result<()> {
|
|
71
|
+
let host = host.trim();
|
|
72
|
+
|
|
73
|
+
if host.is_empty() {
|
|
74
|
+
return Err(KreuzbergError::Validation {
|
|
75
|
+
message: "Invalid host '': must be a valid IP address or hostname".to_string(),
|
|
76
|
+
source: None,
|
|
77
|
+
});
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Check if it's a valid IPv4 address
|
|
81
|
+
if host.parse::<std::net::Ipv4Addr>().is_ok() {
|
|
82
|
+
return Ok(());
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
// Check if it's a valid IPv6 address
|
|
86
|
+
if host.parse::<std::net::Ipv6Addr>().is_ok() {
|
|
87
|
+
return Ok(());
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
// Check if it's a valid hostname (basic validation)
|
|
91
|
+
// Hostnames must contain only alphanumeric characters, dots, and hyphens
|
|
92
|
+
// Must not look like an invalid IPv4 address (all numeric with dots)
|
|
93
|
+
let looks_like_ipv4 = host
|
|
94
|
+
.split('.')
|
|
95
|
+
.all(|part| !part.is_empty() && part.chars().all(|c| c.is_numeric()));
|
|
96
|
+
if !looks_like_ipv4
|
|
97
|
+
&& host.chars().all(|c| c.is_alphanumeric() || c == '.' || c == '-')
|
|
98
|
+
&& !host.starts_with('-')
|
|
99
|
+
&& !host.ends_with('-')
|
|
100
|
+
{
|
|
101
|
+
return Ok(());
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
Err(KreuzbergError::Validation {
|
|
105
|
+
message: format!("Invalid host '{}': must be a valid IP address or hostname", host),
|
|
106
|
+
source: None,
|
|
107
|
+
})
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/// Validate a CORS (Cross-Origin Resource Sharing) origin URL.
|
|
111
|
+
///
|
|
112
|
+
/// Accepts valid HTTP/HTTPS URLs (e.g., "https://example.com") or the wildcard "*"
|
|
113
|
+
/// to allow all origins. URLs must start with "http://" or "https://", or be exactly "*".
|
|
114
|
+
///
|
|
115
|
+
/// # Arguments
|
|
116
|
+
///
|
|
117
|
+
/// * `origin` - The CORS origin URL to validate
|
|
118
|
+
///
|
|
119
|
+
/// # Returns
|
|
120
|
+
///
|
|
121
|
+
/// `Ok(())` if the origin is valid, or a `ValidationError` with details about valid formats.
|
|
122
|
+
///
|
|
123
|
+
/// # Examples
|
|
124
|
+
///
|
|
125
|
+
/// ```rust
|
|
126
|
+
/// use kreuzberg::core::config_validation::validate_cors_origin;
|
|
127
|
+
///
|
|
128
|
+
/// assert!(validate_cors_origin("https://example.com").is_ok());
|
|
129
|
+
/// assert!(validate_cors_origin("http://localhost:3000").is_ok());
|
|
130
|
+
/// assert!(validate_cors_origin("*").is_ok());
|
|
131
|
+
/// assert!(validate_cors_origin("not-a-url").is_err());
|
|
132
|
+
/// assert!(validate_cors_origin("ftp://example.com").is_err());
|
|
133
|
+
/// ```
|
|
134
|
+
pub fn validate_cors_origin(origin: &str) -> Result<()> {
|
|
135
|
+
let origin = origin.trim();
|
|
136
|
+
|
|
137
|
+
if origin == "*" {
|
|
138
|
+
return Ok(());
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
if origin.starts_with("http://") || origin.starts_with("https://") {
|
|
142
|
+
// Basic validation: ensure there's something after the protocol
|
|
143
|
+
if origin.len() > 8 && (origin.starts_with("http://") && origin.len() > 7 || origin.starts_with("https://")) {
|
|
144
|
+
return Ok(());
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
Err(KreuzbergError::Validation {
|
|
149
|
+
message: format!(
|
|
150
|
+
"Invalid CORS origin '{}': must be a valid HTTP/HTTPS URL or '*'",
|
|
151
|
+
origin
|
|
152
|
+
),
|
|
153
|
+
source: None,
|
|
154
|
+
})
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
/// Validate an upload size limit for server configuration.
|
|
158
|
+
///
|
|
159
|
+
/// Upload size must be greater than 0 (measured in bytes).
|
|
160
|
+
///
|
|
161
|
+
/// # Arguments
|
|
162
|
+
///
|
|
163
|
+
/// * `size` - The maximum upload size in bytes to validate
|
|
164
|
+
///
|
|
165
|
+
/// # Returns
|
|
166
|
+
///
|
|
167
|
+
/// `Ok(())` if the size is valid, or a `ValidationError` with details about constraints.
|
|
168
|
+
///
|
|
169
|
+
/// # Examples
|
|
170
|
+
///
|
|
171
|
+
/// ```rust
|
|
172
|
+
/// use kreuzberg::core::config_validation::validate_upload_size;
|
|
173
|
+
///
|
|
174
|
+
/// assert!(validate_upload_size(1024).is_ok());
|
|
175
|
+
/// assert!(validate_upload_size(1_000_000).is_ok());
|
|
176
|
+
/// assert!(validate_upload_size(0).is_err());
|
|
177
|
+
/// ```
|
|
178
|
+
pub fn validate_upload_size(size: usize) -> Result<()> {
|
|
179
|
+
if size > 0 {
|
|
180
|
+
Ok(())
|
|
181
|
+
} else {
|
|
182
|
+
Err(KreuzbergError::Validation {
|
|
183
|
+
message: format!("Upload size must be greater than 0, got {}", size),
|
|
184
|
+
source: None,
|
|
185
|
+
})
|
|
186
|
+
}
|
|
187
|
+
}
|