kreuzberg 4.0.8 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +99 -2
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/spec/fixtures/config.toml +1 -1
- data/spec/fixtures/config.yaml +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +5 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mime.rs +15 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +201 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,1220 +0,0 @@
|
|
|
1
|
-
//! Server configuration for the Kreuzberg API.
|
|
2
|
-
//!
|
|
3
|
-
//! This module provides the `ServerConfig` struct for managing API server settings
|
|
4
|
-
//! including host, port, CORS, and upload size limits. Configuration can be loaded
|
|
5
|
-
//! from TOML, YAML, or JSON files and can be overridden by environment variables.
|
|
6
|
-
//!
|
|
7
|
-
//! # Features
|
|
8
|
-
//!
|
|
9
|
-
//! - **Multi-format support**: Load configuration from TOML, YAML, or JSON files
|
|
10
|
-
//! - **Environment overrides**: All settings can be overridden via environment variables
|
|
11
|
-
//! - **Backward compatibility**: Supports legacy `max_upload_mb` field for smooth migrations
|
|
12
|
-
//! - **Sensible defaults**: All fields have reasonable defaults matching current behavior
|
|
13
|
-
//! - **Flexible CORS**: Support for all origins (default) or specific origin lists
|
|
14
|
-
//!
|
|
15
|
-
//! # Example
|
|
16
|
-
//!
|
|
17
|
-
//! ```rust,no_run
|
|
18
|
-
//! use kreuzberg::core::ServerConfig;
|
|
19
|
-
//!
|
|
20
|
-
//! # fn example() -> kreuzberg::Result<()> {
|
|
21
|
-
//! // Create with defaults
|
|
22
|
-
//! let mut config = ServerConfig::default();
|
|
23
|
-
//!
|
|
24
|
-
//! // Or load from file
|
|
25
|
-
//! let mut config = ServerConfig::from_file("kreuzberg.toml")?;
|
|
26
|
-
//!
|
|
27
|
-
//! // Apply environment variable overrides
|
|
28
|
-
//! config.apply_env_overrides()?;
|
|
29
|
-
//!
|
|
30
|
-
//! # Ok(())
|
|
31
|
-
//! # }
|
|
32
|
-
//! ```
|
|
33
|
-
|
|
34
|
-
use crate::{KreuzbergError, Result};
|
|
35
|
-
use serde::{Deserialize, Serialize};
|
|
36
|
-
use std::path::Path;
|
|
37
|
-
|
|
38
|
-
/// Default host address for API server
|
|
39
|
-
const DEFAULT_HOST: &str = "127.0.0.1";
|
|
40
|
-
|
|
41
|
-
/// Default port for API server
|
|
42
|
-
const DEFAULT_PORT: u16 = 8000;
|
|
43
|
-
|
|
44
|
-
/// Default maximum request body size: 100 MB
|
|
45
|
-
const DEFAULT_MAX_REQUEST_BODY_BYTES: usize = 104_857_600;
|
|
46
|
-
|
|
47
|
-
/// Default maximum multipart field size: 100 MB
|
|
48
|
-
const DEFAULT_MAX_MULTIPART_FIELD_BYTES: usize = 104_857_600;
|
|
49
|
-
|
|
50
|
-
/// API server configuration.
|
|
51
|
-
///
|
|
52
|
-
/// This struct holds all configuration options for the Kreuzberg API server,
|
|
53
|
-
/// including host/port settings, CORS configuration, and upload limits.
|
|
54
|
-
///
|
|
55
|
-
/// # Defaults
|
|
56
|
-
///
|
|
57
|
-
/// - `host`: "127.0.0.1" (localhost only)
|
|
58
|
-
/// - `port`: 8000
|
|
59
|
-
/// - `cors_origins`: empty vector (allows all origins)
|
|
60
|
-
/// - `max_request_body_bytes`: 104_857_600 (100 MB)
|
|
61
|
-
/// - `max_multipart_field_bytes`: 104_857_600 (100 MB)
|
|
62
|
-
/// - `max_upload_mb`: None (legacy field, not used if other fields set)
|
|
63
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
64
|
-
#[serde(default)]
|
|
65
|
-
pub struct ServerConfig {
|
|
66
|
-
/// Server host address (e.g., "127.0.0.1", "0.0.0.0")
|
|
67
|
-
#[serde(default = "default_host")]
|
|
68
|
-
pub host: String,
|
|
69
|
-
|
|
70
|
-
/// Server port number
|
|
71
|
-
#[serde(default = "default_port")]
|
|
72
|
-
pub port: u16,
|
|
73
|
-
|
|
74
|
-
/// CORS allowed origins. Empty vector means allow all origins.
|
|
75
|
-
///
|
|
76
|
-
/// If this is an empty vector, the server will accept requests from any origin.
|
|
77
|
-
/// If populated with specific origins (e.g., ["https://example.com"]), only
|
|
78
|
-
/// those origins will be allowed.
|
|
79
|
-
#[serde(default)]
|
|
80
|
-
pub cors_origins: Vec<String>,
|
|
81
|
-
|
|
82
|
-
/// Maximum size of request body in bytes (default: 100 MB)
|
|
83
|
-
#[serde(default = "default_max_request_body_bytes")]
|
|
84
|
-
pub max_request_body_bytes: usize,
|
|
85
|
-
|
|
86
|
-
/// Maximum size of multipart fields in bytes (default: 100 MB)
|
|
87
|
-
#[serde(default = "default_max_multipart_field_bytes")]
|
|
88
|
-
pub max_multipart_field_bytes: usize,
|
|
89
|
-
|
|
90
|
-
/// Legacy upload size limit in MB (for backward compatibility).
|
|
91
|
-
///
|
|
92
|
-
/// This field is deprecated and only used for backward compatibility.
|
|
93
|
-
/// If set, it will override `max_multipart_field_bytes` during normalization.
|
|
94
|
-
/// New configurations should use `max_multipart_field_bytes` directly.
|
|
95
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
96
|
-
pub max_upload_mb: Option<usize>,
|
|
97
|
-
}
|
|
98
|
-
|
|
99
|
-
impl Default for ServerConfig {
|
|
100
|
-
fn default() -> Self {
|
|
101
|
-
Self {
|
|
102
|
-
host: default_host(),
|
|
103
|
-
port: default_port(),
|
|
104
|
-
cors_origins: Vec::new(),
|
|
105
|
-
max_request_body_bytes: default_max_request_body_bytes(),
|
|
106
|
-
max_multipart_field_bytes: default_max_multipart_field_bytes(),
|
|
107
|
-
max_upload_mb: None,
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
// Default value functions for serde
|
|
113
|
-
fn default_host() -> String {
|
|
114
|
-
DEFAULT_HOST.to_string()
|
|
115
|
-
}
|
|
116
|
-
|
|
117
|
-
fn default_port() -> u16 {
|
|
118
|
-
DEFAULT_PORT
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
fn default_max_request_body_bytes() -> usize {
|
|
122
|
-
DEFAULT_MAX_REQUEST_BODY_BYTES
|
|
123
|
-
}
|
|
124
|
-
|
|
125
|
-
fn default_max_multipart_field_bytes() -> usize {
|
|
126
|
-
DEFAULT_MAX_MULTIPART_FIELD_BYTES
|
|
127
|
-
}
|
|
128
|
-
|
|
129
|
-
impl ServerConfig {
|
|
130
|
-
/// Create a new `ServerConfig` with default values.
|
|
131
|
-
pub fn new() -> Self {
|
|
132
|
-
Self::default()
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
/// Get the server listen address (host:port).
|
|
136
|
-
///
|
|
137
|
-
/// # Example
|
|
138
|
-
///
|
|
139
|
-
/// ```rust
|
|
140
|
-
/// use kreuzberg::core::ServerConfig;
|
|
141
|
-
///
|
|
142
|
-
/// let config = ServerConfig::default();
|
|
143
|
-
/// assert_eq!(config.listen_addr(), "127.0.0.1:8000");
|
|
144
|
-
/// ```
|
|
145
|
-
pub fn listen_addr(&self) -> String {
|
|
146
|
-
format!("{}:{}", self.host, self.port)
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
/// Check if CORS allows all origins.
|
|
150
|
-
///
|
|
151
|
-
/// Returns `true` if the `cors_origins` vector is empty, meaning all origins
|
|
152
|
-
/// are allowed. Returns `false` if specific origins are configured.
|
|
153
|
-
///
|
|
154
|
-
/// # Example
|
|
155
|
-
///
|
|
156
|
-
/// ```rust
|
|
157
|
-
/// use kreuzberg::core::ServerConfig;
|
|
158
|
-
///
|
|
159
|
-
/// let mut config = ServerConfig::default();
|
|
160
|
-
/// assert!(config.cors_allows_all());
|
|
161
|
-
///
|
|
162
|
-
/// config.cors_origins.push("https://example.com".to_string());
|
|
163
|
-
/// assert!(!config.cors_allows_all());
|
|
164
|
-
/// ```
|
|
165
|
-
pub fn cors_allows_all(&self) -> bool {
|
|
166
|
-
self.cors_origins.is_empty()
|
|
167
|
-
}
|
|
168
|
-
|
|
169
|
-
/// Check if a given origin is allowed by CORS configuration.
|
|
170
|
-
///
|
|
171
|
-
/// Returns `true` if:
|
|
172
|
-
/// - CORS allows all origins (empty origins list), or
|
|
173
|
-
/// - The given origin is in the allowed origins list
|
|
174
|
-
///
|
|
175
|
-
/// # Arguments
|
|
176
|
-
///
|
|
177
|
-
/// * `origin` - The origin to check (e.g., "https://example.com")
|
|
178
|
-
///
|
|
179
|
-
/// # Example
|
|
180
|
-
///
|
|
181
|
-
/// ```rust
|
|
182
|
-
/// use kreuzberg::core::ServerConfig;
|
|
183
|
-
///
|
|
184
|
-
/// let mut config = ServerConfig::default();
|
|
185
|
-
/// assert!(config.is_origin_allowed("https://example.com"));
|
|
186
|
-
///
|
|
187
|
-
/// config.cors_origins.push("https://allowed.com".to_string());
|
|
188
|
-
/// assert!(config.is_origin_allowed("https://allowed.com"));
|
|
189
|
-
/// assert!(!config.is_origin_allowed("https://denied.com"));
|
|
190
|
-
/// ```
|
|
191
|
-
pub fn is_origin_allowed(&self, origin: &str) -> bool {
|
|
192
|
-
self.cors_origins.is_empty() || self.cors_origins.contains(&origin.to_string())
|
|
193
|
-
}
|
|
194
|
-
|
|
195
|
-
/// Get maximum request body size in megabytes (rounded up).
|
|
196
|
-
///
|
|
197
|
-
/// # Example
|
|
198
|
-
///
|
|
199
|
-
/// ```rust
|
|
200
|
-
/// use kreuzberg::core::ServerConfig;
|
|
201
|
-
///
|
|
202
|
-
/// let mut config = ServerConfig::default();
|
|
203
|
-
/// assert_eq!(config.max_request_body_mb(), 100);
|
|
204
|
-
/// ```
|
|
205
|
-
pub fn max_request_body_mb(&self) -> usize {
|
|
206
|
-
self.max_request_body_bytes.div_ceil(1_048_576)
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
/// Get maximum multipart field size in megabytes (rounded up).
|
|
210
|
-
///
|
|
211
|
-
/// # Example
|
|
212
|
-
///
|
|
213
|
-
/// ```rust
|
|
214
|
-
/// use kreuzberg::core::ServerConfig;
|
|
215
|
-
///
|
|
216
|
-
/// let mut config = ServerConfig::default();
|
|
217
|
-
/// assert_eq!(config.max_multipart_field_mb(), 100);
|
|
218
|
-
/// ```
|
|
219
|
-
pub fn max_multipart_field_mb(&self) -> usize {
|
|
220
|
-
self.max_multipart_field_bytes.div_ceil(1_048_576)
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
/// Normalize legacy field values for backward compatibility.
|
|
224
|
-
///
|
|
225
|
-
/// If `max_upload_mb` is set, it will be converted to bytes and used to
|
|
226
|
-
/// override `max_multipart_field_bytes`. This allows old configurations
|
|
227
|
-
/// using the legacy field to continue working.
|
|
228
|
-
///
|
|
229
|
-
/// This method is automatically called by `apply_env_overrides()`.
|
|
230
|
-
pub fn normalize_legacy_fields(&mut self) {
|
|
231
|
-
if let Some(max_upload_mb) = self.max_upload_mb {
|
|
232
|
-
// Convert MB to bytes
|
|
233
|
-
let max_bytes = max_upload_mb.saturating_mul(1_048_576);
|
|
234
|
-
self.max_multipart_field_bytes = max_bytes;
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
/// Apply environment variable overrides to the configuration.
|
|
239
|
-
///
|
|
240
|
-
/// Reads the following environment variables and overrides config values if set:
|
|
241
|
-
///
|
|
242
|
-
/// - `KREUZBERG_HOST` - Server host address
|
|
243
|
-
/// - `KREUZBERG_PORT` - Server port number (parsed as u16)
|
|
244
|
-
/// - `KREUZBERG_CORS_ORIGINS` - Comma-separated list of allowed origins
|
|
245
|
-
/// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` - Max request body size in bytes
|
|
246
|
-
/// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` - Max multipart field size in bytes
|
|
247
|
-
/// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` - Max upload size in MB (legacy)
|
|
248
|
-
///
|
|
249
|
-
/// # Errors
|
|
250
|
-
///
|
|
251
|
-
/// Returns `KreuzbergError::Validation` if:
|
|
252
|
-
/// - `KREUZBERG_PORT` cannot be parsed as u16
|
|
253
|
-
/// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` cannot be parsed as usize
|
|
254
|
-
/// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` cannot be parsed as usize
|
|
255
|
-
/// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` cannot be parsed as usize
|
|
256
|
-
///
|
|
257
|
-
/// # Example
|
|
258
|
-
///
|
|
259
|
-
/// ```rust,no_run
|
|
260
|
-
/// use kreuzberg::core::ServerConfig;
|
|
261
|
-
///
|
|
262
|
-
/// # fn example() -> kreuzberg::Result<()> {
|
|
263
|
-
/// std::env::set_var("KREUZBERG_HOST", "0.0.0.0");
|
|
264
|
-
/// std::env::set_var("KREUZBERG_PORT", "3000");
|
|
265
|
-
///
|
|
266
|
-
/// let mut config = ServerConfig::default();
|
|
267
|
-
/// config.apply_env_overrides()?;
|
|
268
|
-
///
|
|
269
|
-
/// assert_eq!(config.host, "0.0.0.0");
|
|
270
|
-
/// assert_eq!(config.port, 3000);
|
|
271
|
-
/// # Ok(())
|
|
272
|
-
/// # }
|
|
273
|
-
/// ```
|
|
274
|
-
pub fn apply_env_overrides(&mut self) -> Result<()> {
|
|
275
|
-
// Host override
|
|
276
|
-
if let Ok(host) = std::env::var("KREUZBERG_HOST") {
|
|
277
|
-
self.host = host;
|
|
278
|
-
}
|
|
279
|
-
|
|
280
|
-
// Port override
|
|
281
|
-
if let Ok(port_str) = std::env::var("KREUZBERG_PORT") {
|
|
282
|
-
self.port = port_str.parse::<u16>().map_err(|e| {
|
|
283
|
-
KreuzbergError::validation(format!(
|
|
284
|
-
"KREUZBERG_PORT must be a valid u16 number, got '{}': {}",
|
|
285
|
-
port_str, e
|
|
286
|
-
))
|
|
287
|
-
})?;
|
|
288
|
-
}
|
|
289
|
-
|
|
290
|
-
// CORS origins override (comma-separated)
|
|
291
|
-
if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
|
|
292
|
-
self.cors_origins = origins_str
|
|
293
|
-
.split(',')
|
|
294
|
-
.map(|s| s.trim().to_string())
|
|
295
|
-
.filter(|s| !s.is_empty())
|
|
296
|
-
.collect();
|
|
297
|
-
}
|
|
298
|
-
|
|
299
|
-
// Max request body bytes override
|
|
300
|
-
if let Ok(bytes_str) = std::env::var("KREUZBERG_MAX_REQUEST_BODY_BYTES") {
|
|
301
|
-
self.max_request_body_bytes = bytes_str.parse::<usize>().map_err(|e| {
|
|
302
|
-
KreuzbergError::validation(format!(
|
|
303
|
-
"KREUZBERG_MAX_REQUEST_BODY_BYTES must be a valid usize, got '{}': {}",
|
|
304
|
-
bytes_str, e
|
|
305
|
-
))
|
|
306
|
-
})?;
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
// Max multipart field bytes override
|
|
310
|
-
if let Ok(bytes_str) = std::env::var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES") {
|
|
311
|
-
self.max_multipart_field_bytes = bytes_str.parse::<usize>().map_err(|e| {
|
|
312
|
-
KreuzbergError::validation(format!(
|
|
313
|
-
"KREUZBERG_MAX_MULTIPART_FIELD_BYTES must be a valid usize, got '{}': {}",
|
|
314
|
-
bytes_str, e
|
|
315
|
-
))
|
|
316
|
-
})?;
|
|
317
|
-
}
|
|
318
|
-
|
|
319
|
-
// Legacy max upload size override (in MB)
|
|
320
|
-
if let Ok(mb_str) = std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
|
|
321
|
-
let mb = mb_str.parse::<usize>().map_err(|e| {
|
|
322
|
-
KreuzbergError::validation(format!(
|
|
323
|
-
"KREUZBERG_MAX_UPLOAD_SIZE_MB must be a valid usize, got '{}': {}",
|
|
324
|
-
mb_str, e
|
|
325
|
-
))
|
|
326
|
-
})?;
|
|
327
|
-
self.max_upload_mb = Some(mb);
|
|
328
|
-
}
|
|
329
|
-
|
|
330
|
-
// Apply legacy field normalization
|
|
331
|
-
self.normalize_legacy_fields();
|
|
332
|
-
|
|
333
|
-
Ok(())
|
|
334
|
-
}
|
|
335
|
-
|
|
336
|
-
/// Load server configuration from a file.
|
|
337
|
-
///
|
|
338
|
-
/// Automatically detects the file format based on extension:
|
|
339
|
-
/// - `.toml` - TOML format
|
|
340
|
-
/// - `.yaml` or `.yml` - YAML format
|
|
341
|
-
/// - `.json` - JSON format
|
|
342
|
-
///
|
|
343
|
-
/// This function handles two config file formats:
|
|
344
|
-
/// 1. Flat format: Server config at root level
|
|
345
|
-
/// 2. Nested format: Server config under `[server]` section (combined with ExtractionConfig)
|
|
346
|
-
///
|
|
347
|
-
/// # Arguments
|
|
348
|
-
///
|
|
349
|
-
/// * `path` - Path to the configuration file
|
|
350
|
-
///
|
|
351
|
-
/// # Errors
|
|
352
|
-
///
|
|
353
|
-
/// Returns `KreuzbergError::Validation` if:
|
|
354
|
-
/// - File doesn't exist or cannot be read
|
|
355
|
-
/// - File extension is not recognized
|
|
356
|
-
/// - File content is invalid for the detected format
|
|
357
|
-
///
|
|
358
|
-
/// # Example
|
|
359
|
-
///
|
|
360
|
-
/// ```rust,no_run
|
|
361
|
-
/// use kreuzberg::core::ServerConfig;
|
|
362
|
-
///
|
|
363
|
-
/// # fn example() -> kreuzberg::Result<()> {
|
|
364
|
-
/// let config = ServerConfig::from_file("kreuzberg.toml")?;
|
|
365
|
-
/// # Ok(())
|
|
366
|
-
/// # }
|
|
367
|
-
/// ```
|
|
368
|
-
pub fn from_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
369
|
-
let path = path.as_ref();
|
|
370
|
-
|
|
371
|
-
let content = std::fs::read_to_string(path)
|
|
372
|
-
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
373
|
-
|
|
374
|
-
let extension = path.extension().and_then(|ext| ext.to_str()).ok_or_else(|| {
|
|
375
|
-
KreuzbergError::validation(format!(
|
|
376
|
-
"Cannot determine file format: no extension found in {}",
|
|
377
|
-
path.display()
|
|
378
|
-
))
|
|
379
|
-
})?;
|
|
380
|
-
|
|
381
|
-
let mut config = match extension.to_lowercase().as_str() {
|
|
382
|
-
"toml" => {
|
|
383
|
-
// Try nested format first (with [server] section)
|
|
384
|
-
#[derive(Deserialize)]
|
|
385
|
-
struct RootConfig {
|
|
386
|
-
#[serde(default)]
|
|
387
|
-
server: Option<ServerConfig>,
|
|
388
|
-
}
|
|
389
|
-
|
|
390
|
-
if let Ok(root) = toml::from_str::<RootConfig>(&content) {
|
|
391
|
-
if let Some(server) = root.server {
|
|
392
|
-
server
|
|
393
|
-
} else {
|
|
394
|
-
// No [server] section, try flat format
|
|
395
|
-
toml::from_str::<Self>(&content).map_err(|e| {
|
|
396
|
-
KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e))
|
|
397
|
-
})?
|
|
398
|
-
}
|
|
399
|
-
} else {
|
|
400
|
-
// Fall back to flat format
|
|
401
|
-
toml::from_str::<Self>(&content)
|
|
402
|
-
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?
|
|
403
|
-
}
|
|
404
|
-
}
|
|
405
|
-
"yaml" | "yml" => {
|
|
406
|
-
// Try nested format first (with server: section)
|
|
407
|
-
#[derive(Deserialize)]
|
|
408
|
-
struct RootConfig {
|
|
409
|
-
#[serde(default)]
|
|
410
|
-
server: Option<ServerConfig>,
|
|
411
|
-
}
|
|
412
|
-
|
|
413
|
-
if let Ok(root) = serde_yaml_ng::from_str::<RootConfig>(&content) {
|
|
414
|
-
if let Some(server) = root.server {
|
|
415
|
-
server
|
|
416
|
-
} else {
|
|
417
|
-
// No server section, try flat format
|
|
418
|
-
serde_yaml_ng::from_str::<Self>(&content).map_err(|e| {
|
|
419
|
-
KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e))
|
|
420
|
-
})?
|
|
421
|
-
}
|
|
422
|
-
} else {
|
|
423
|
-
// Fall back to flat format
|
|
424
|
-
serde_yaml_ng::from_str::<Self>(&content)
|
|
425
|
-
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
"json" => {
|
|
429
|
-
// Try nested format first (with "server" key)
|
|
430
|
-
#[derive(Deserialize)]
|
|
431
|
-
struct RootConfig {
|
|
432
|
-
#[serde(default)]
|
|
433
|
-
server: Option<ServerConfig>,
|
|
434
|
-
}
|
|
435
|
-
|
|
436
|
-
if let Ok(root) = serde_json::from_str::<RootConfig>(&content) {
|
|
437
|
-
if let Some(server) = root.server {
|
|
438
|
-
server
|
|
439
|
-
} else {
|
|
440
|
-
// No server key, try flat format
|
|
441
|
-
serde_json::from_str::<Self>(&content).map_err(|e| {
|
|
442
|
-
KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e))
|
|
443
|
-
})?
|
|
444
|
-
}
|
|
445
|
-
} else {
|
|
446
|
-
// Fall back to flat format
|
|
447
|
-
serde_json::from_str::<Self>(&content)
|
|
448
|
-
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?
|
|
449
|
-
}
|
|
450
|
-
}
|
|
451
|
-
_ => {
|
|
452
|
-
return Err(KreuzbergError::validation(format!(
|
|
453
|
-
"Unsupported config file format: .{}. Supported formats: .toml, .yaml, .yml, .json",
|
|
454
|
-
extension
|
|
455
|
-
)));
|
|
456
|
-
}
|
|
457
|
-
};
|
|
458
|
-
|
|
459
|
-
// Normalize legacy fields
|
|
460
|
-
config.normalize_legacy_fields();
|
|
461
|
-
|
|
462
|
-
Ok(config)
|
|
463
|
-
}
|
|
464
|
-
|
|
465
|
-
/// Load server configuration from a TOML file.
|
|
466
|
-
///
|
|
467
|
-
/// # Arguments
|
|
468
|
-
///
|
|
469
|
-
/// * `path` - Path to the TOML file
|
|
470
|
-
///
|
|
471
|
-
/// # Errors
|
|
472
|
-
///
|
|
473
|
-
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid TOML.
|
|
474
|
-
///
|
|
475
|
-
/// # Example
|
|
476
|
-
///
|
|
477
|
-
/// ```rust,no_run
|
|
478
|
-
/// use kreuzberg::core::ServerConfig;
|
|
479
|
-
///
|
|
480
|
-
/// # fn example() -> kreuzberg::Result<()> {
|
|
481
|
-
/// let config = ServerConfig::from_toml_file("kreuzberg.toml")?;
|
|
482
|
-
/// # Ok(())
|
|
483
|
-
/// # }
|
|
484
|
-
/// ```
|
|
485
|
-
pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
486
|
-
let path = path.as_ref();
|
|
487
|
-
|
|
488
|
-
let content = std::fs::read_to_string(path)
|
|
489
|
-
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
490
|
-
|
|
491
|
-
let mut config: Self = toml::from_str(&content)
|
|
492
|
-
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
|
|
493
|
-
|
|
494
|
-
config.normalize_legacy_fields();
|
|
495
|
-
|
|
496
|
-
Ok(config)
|
|
497
|
-
}
|
|
498
|
-
|
|
499
|
-
/// Load server configuration from a YAML file.
|
|
500
|
-
///
|
|
501
|
-
/// # Arguments
|
|
502
|
-
///
|
|
503
|
-
/// * `path` - Path to the YAML file
|
|
504
|
-
///
|
|
505
|
-
/// # Errors
|
|
506
|
-
///
|
|
507
|
-
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid YAML.
|
|
508
|
-
pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
509
|
-
let path = path.as_ref();
|
|
510
|
-
|
|
511
|
-
let content = std::fs::read_to_string(path)
|
|
512
|
-
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
513
|
-
|
|
514
|
-
let mut config: Self = serde_yaml_ng::from_str(&content)
|
|
515
|
-
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
|
|
516
|
-
|
|
517
|
-
config.normalize_legacy_fields();
|
|
518
|
-
|
|
519
|
-
Ok(config)
|
|
520
|
-
}
|
|
521
|
-
|
|
522
|
-
/// Load server configuration from a JSON file.
|
|
523
|
-
///
|
|
524
|
-
/// # Arguments
|
|
525
|
-
///
|
|
526
|
-
/// * `path` - Path to the JSON file
|
|
527
|
-
///
|
|
528
|
-
/// # Errors
|
|
529
|
-
///
|
|
530
|
-
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid JSON.
|
|
531
|
-
pub fn from_json_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
532
|
-
let path = path.as_ref();
|
|
533
|
-
|
|
534
|
-
let content = std::fs::read_to_string(path)
|
|
535
|
-
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
536
|
-
|
|
537
|
-
let mut config: Self = serde_json::from_str(&content)
|
|
538
|
-
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
|
|
539
|
-
|
|
540
|
-
config.normalize_legacy_fields();
|
|
541
|
-
|
|
542
|
-
Ok(config)
|
|
543
|
-
}
|
|
544
|
-
}
|
|
545
|
-
|
|
546
|
-
#[cfg(test)]
|
|
547
|
-
#[allow(unsafe_code)]
|
|
548
|
-
mod tests {
|
|
549
|
-
use super::*;
|
|
550
|
-
use std::fs;
|
|
551
|
-
use tempfile::tempdir;
|
|
552
|
-
|
|
553
|
-
#[test]
|
|
554
|
-
fn test_default_config() {
|
|
555
|
-
let config = ServerConfig::default();
|
|
556
|
-
assert_eq!(config.host, "127.0.0.1");
|
|
557
|
-
assert_eq!(config.port, 8000);
|
|
558
|
-
assert!(config.cors_origins.is_empty());
|
|
559
|
-
assert_eq!(config.max_request_body_bytes, 104_857_600);
|
|
560
|
-
assert_eq!(config.max_multipart_field_bytes, 104_857_600);
|
|
561
|
-
assert!(config.max_upload_mb.is_none());
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
#[test]
|
|
565
|
-
fn test_listen_addr() {
|
|
566
|
-
let config = ServerConfig::default();
|
|
567
|
-
assert_eq!(config.listen_addr(), "127.0.0.1:8000");
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
#[test]
|
|
571
|
-
fn test_listen_addr_custom() {
|
|
572
|
-
let config = ServerConfig {
|
|
573
|
-
host: "0.0.0.0".to_string(),
|
|
574
|
-
port: 3000,
|
|
575
|
-
..Default::default()
|
|
576
|
-
};
|
|
577
|
-
assert_eq!(config.listen_addr(), "0.0.0.0:3000");
|
|
578
|
-
}
|
|
579
|
-
|
|
580
|
-
#[test]
|
|
581
|
-
fn test_cors_allows_all() {
|
|
582
|
-
let mut config = ServerConfig::default();
|
|
583
|
-
assert!(config.cors_allows_all());
|
|
584
|
-
|
|
585
|
-
config.cors_origins.push("https://example.com".to_string());
|
|
586
|
-
assert!(!config.cors_allows_all());
|
|
587
|
-
}
|
|
588
|
-
|
|
589
|
-
#[test]
|
|
590
|
-
fn test_is_origin_allowed_all() {
|
|
591
|
-
let config = ServerConfig::default();
|
|
592
|
-
assert!(config.is_origin_allowed("https://example.com"));
|
|
593
|
-
assert!(config.is_origin_allowed("https://other.com"));
|
|
594
|
-
}
|
|
595
|
-
|
|
596
|
-
#[test]
|
|
597
|
-
fn test_is_origin_allowed_specific() {
|
|
598
|
-
let mut config = ServerConfig::default();
|
|
599
|
-
config.cors_origins.push("https://allowed.com".to_string());
|
|
600
|
-
|
|
601
|
-
assert!(config.is_origin_allowed("https://allowed.com"));
|
|
602
|
-
assert!(!config.is_origin_allowed("https://denied.com"));
|
|
603
|
-
}
|
|
604
|
-
|
|
605
|
-
#[test]
|
|
606
|
-
fn test_max_request_body_mb() {
|
|
607
|
-
let config = ServerConfig::default();
|
|
608
|
-
assert_eq!(config.max_request_body_mb(), 100);
|
|
609
|
-
}
|
|
610
|
-
|
|
611
|
-
#[test]
|
|
612
|
-
fn test_max_multipart_field_mb() {
|
|
613
|
-
let config = ServerConfig::default();
|
|
614
|
-
assert_eq!(config.max_multipart_field_mb(), 100);
|
|
615
|
-
}
|
|
616
|
-
|
|
617
|
-
#[test]
|
|
618
|
-
fn test_max_bytes_to_mb_rounding() {
|
|
619
|
-
let mut config = ServerConfig {
|
|
620
|
-
max_request_body_bytes: 1_048_576, // 1 MB
|
|
621
|
-
..Default::default()
|
|
622
|
-
};
|
|
623
|
-
assert_eq!(config.max_request_body_mb(), 1);
|
|
624
|
-
|
|
625
|
-
config.max_request_body_bytes = 1_048_577; // 1 MB + 1 byte
|
|
626
|
-
assert_eq!(config.max_request_body_mb(), 2); // Rounds up
|
|
627
|
-
}
|
|
628
|
-
|
|
629
|
-
#[test]
|
|
630
|
-
fn test_normalize_legacy_max_upload_mb() {
|
|
631
|
-
let mut config = ServerConfig {
|
|
632
|
-
max_upload_mb: Some(50),
|
|
633
|
-
..Default::default()
|
|
634
|
-
};
|
|
635
|
-
|
|
636
|
-
config.normalize_legacy_fields();
|
|
637
|
-
|
|
638
|
-
assert_eq!(config.max_multipart_field_bytes, 50 * 1_048_576);
|
|
639
|
-
}
|
|
640
|
-
|
|
641
|
-
#[test]
|
|
642
|
-
fn test_normalize_legacy_max_upload_mb_zero() {
|
|
643
|
-
let mut config = ServerConfig {
|
|
644
|
-
max_upload_mb: Some(0),
|
|
645
|
-
..Default::default()
|
|
646
|
-
};
|
|
647
|
-
|
|
648
|
-
config.normalize_legacy_fields();
|
|
649
|
-
|
|
650
|
-
assert_eq!(config.max_multipart_field_bytes, 0);
|
|
651
|
-
}
|
|
652
|
-
|
|
653
|
-
#[test]
|
|
654
|
-
fn test_from_toml_file() {
|
|
655
|
-
let dir = tempdir().unwrap();
|
|
656
|
-
let config_path = dir.path().join("server.toml");
|
|
657
|
-
|
|
658
|
-
fs::write(
|
|
659
|
-
&config_path,
|
|
660
|
-
r#"
|
|
661
|
-
host = "0.0.0.0"
|
|
662
|
-
port = 3000
|
|
663
|
-
cors_origins = ["https://example.com", "https://other.com"]
|
|
664
|
-
max_request_body_bytes = 50000000
|
|
665
|
-
max_multipart_field_bytes = 75000000
|
|
666
|
-
"#,
|
|
667
|
-
)
|
|
668
|
-
.unwrap();
|
|
669
|
-
|
|
670
|
-
let config = ServerConfig::from_toml_file(&config_path).unwrap();
|
|
671
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
672
|
-
assert_eq!(config.port, 3000);
|
|
673
|
-
assert_eq!(config.cors_origins.len(), 2);
|
|
674
|
-
assert_eq!(config.max_request_body_bytes, 50_000_000);
|
|
675
|
-
assert_eq!(config.max_multipart_field_bytes, 75_000_000);
|
|
676
|
-
}
|
|
677
|
-
|
|
678
|
-
#[test]
|
|
679
|
-
fn test_from_yaml_file() {
|
|
680
|
-
let dir = tempdir().unwrap();
|
|
681
|
-
let config_path = dir.path().join("server.yaml");
|
|
682
|
-
|
|
683
|
-
fs::write(
|
|
684
|
-
&config_path,
|
|
685
|
-
r#"
|
|
686
|
-
host: 0.0.0.0
|
|
687
|
-
port: 3000
|
|
688
|
-
cors_origins:
|
|
689
|
-
- https://example.com
|
|
690
|
-
- https://other.com
|
|
691
|
-
max_request_body_bytes: 50000000
|
|
692
|
-
max_multipart_field_bytes: 75000000
|
|
693
|
-
"#,
|
|
694
|
-
)
|
|
695
|
-
.unwrap();
|
|
696
|
-
|
|
697
|
-
let config = ServerConfig::from_yaml_file(&config_path).unwrap();
|
|
698
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
699
|
-
assert_eq!(config.port, 3000);
|
|
700
|
-
assert_eq!(config.cors_origins.len(), 2);
|
|
701
|
-
assert_eq!(config.max_request_body_bytes, 50_000_000);
|
|
702
|
-
assert_eq!(config.max_multipart_field_bytes, 75_000_000);
|
|
703
|
-
}
|
|
704
|
-
|
|
705
|
-
#[test]
|
|
706
|
-
fn test_from_json_file() {
|
|
707
|
-
let dir = tempdir().unwrap();
|
|
708
|
-
let config_path = dir.path().join("server.json");
|
|
709
|
-
|
|
710
|
-
fs::write(
|
|
711
|
-
&config_path,
|
|
712
|
-
r#"{
|
|
713
|
-
"host": "0.0.0.0",
|
|
714
|
-
"port": 3000,
|
|
715
|
-
"cors_origins": ["https://example.com", "https://other.com"],
|
|
716
|
-
"max_request_body_bytes": 50000000,
|
|
717
|
-
"max_multipart_field_bytes": 75000000
|
|
718
|
-
}
|
|
719
|
-
"#,
|
|
720
|
-
)
|
|
721
|
-
.unwrap();
|
|
722
|
-
|
|
723
|
-
let config = ServerConfig::from_json_file(&config_path).unwrap();
|
|
724
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
725
|
-
assert_eq!(config.port, 3000);
|
|
726
|
-
assert_eq!(config.cors_origins.len(), 2);
|
|
727
|
-
assert_eq!(config.max_request_body_bytes, 50_000_000);
|
|
728
|
-
assert_eq!(config.max_multipart_field_bytes, 75_000_000);
|
|
729
|
-
}
|
|
730
|
-
|
|
731
|
-
#[test]
|
|
732
|
-
fn test_from_file_auto_detects_toml() {
|
|
733
|
-
let dir = tempdir().unwrap();
|
|
734
|
-
let config_path = dir.path().join("server.toml");
|
|
735
|
-
|
|
736
|
-
fs::write(
|
|
737
|
-
&config_path,
|
|
738
|
-
r#"
|
|
739
|
-
host = "0.0.0.0"
|
|
740
|
-
port = 3000
|
|
741
|
-
"#,
|
|
742
|
-
)
|
|
743
|
-
.unwrap();
|
|
744
|
-
|
|
745
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
746
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
747
|
-
assert_eq!(config.port, 3000);
|
|
748
|
-
}
|
|
749
|
-
|
|
750
|
-
#[test]
|
|
751
|
-
fn test_from_file_auto_detects_yaml() {
|
|
752
|
-
let dir = tempdir().unwrap();
|
|
753
|
-
let config_path = dir.path().join("server.yaml");
|
|
754
|
-
|
|
755
|
-
fs::write(
|
|
756
|
-
&config_path,
|
|
757
|
-
r#"
|
|
758
|
-
host: 0.0.0.0
|
|
759
|
-
port: 3000
|
|
760
|
-
"#,
|
|
761
|
-
)
|
|
762
|
-
.unwrap();
|
|
763
|
-
|
|
764
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
765
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
766
|
-
assert_eq!(config.port, 3000);
|
|
767
|
-
}
|
|
768
|
-
|
|
769
|
-
#[test]
|
|
770
|
-
fn test_from_file_auto_detects_json() {
|
|
771
|
-
let dir = tempdir().unwrap();
|
|
772
|
-
let config_path = dir.path().join("server.json");
|
|
773
|
-
|
|
774
|
-
fs::write(&config_path, r#"{"host": "0.0.0.0", "port": 3000}"#).unwrap();
|
|
775
|
-
|
|
776
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
777
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
778
|
-
assert_eq!(config.port, 3000);
|
|
779
|
-
}
|
|
780
|
-
|
|
781
|
-
#[test]
|
|
782
|
-
fn test_from_file_unsupported_extension() {
|
|
783
|
-
let dir = tempdir().unwrap();
|
|
784
|
-
let config_path = dir.path().join("server.txt");
|
|
785
|
-
|
|
786
|
-
fs::write(&config_path, "host = 0.0.0.0").unwrap();
|
|
787
|
-
|
|
788
|
-
let result = ServerConfig::from_file(&config_path);
|
|
789
|
-
assert!(result.is_err());
|
|
790
|
-
assert!(
|
|
791
|
-
result
|
|
792
|
-
.unwrap_err()
|
|
793
|
-
.to_string()
|
|
794
|
-
.contains("Unsupported config file format")
|
|
795
|
-
);
|
|
796
|
-
}
|
|
797
|
-
|
|
798
|
-
#[test]
|
|
799
|
-
fn test_from_file_no_extension() {
|
|
800
|
-
let dir = tempdir().unwrap();
|
|
801
|
-
let config_path = dir.path().join("server");
|
|
802
|
-
|
|
803
|
-
fs::write(&config_path, "host = 0.0.0.0").unwrap();
|
|
804
|
-
|
|
805
|
-
let result = ServerConfig::from_file(&config_path);
|
|
806
|
-
assert!(result.is_err());
|
|
807
|
-
assert!(result.unwrap_err().to_string().contains("no extension found"));
|
|
808
|
-
}
|
|
809
|
-
|
|
810
|
-
#[test]
|
|
811
|
-
fn test_legacy_max_upload_mb_in_file() {
|
|
812
|
-
let dir = tempdir().unwrap();
|
|
813
|
-
let config_path = dir.path().join("server.toml");
|
|
814
|
-
|
|
815
|
-
fs::write(
|
|
816
|
-
&config_path,
|
|
817
|
-
r#"
|
|
818
|
-
host = "127.0.0.1"
|
|
819
|
-
port = 8000
|
|
820
|
-
max_upload_mb = 50
|
|
821
|
-
"#,
|
|
822
|
-
)
|
|
823
|
-
.unwrap();
|
|
824
|
-
|
|
825
|
-
let config = ServerConfig::from_toml_file(&config_path).unwrap();
|
|
826
|
-
assert_eq!(config.max_upload_mb, Some(50));
|
|
827
|
-
assert_eq!(config.max_multipart_field_bytes, 50 * 1_048_576);
|
|
828
|
-
}
|
|
829
|
-
|
|
830
|
-
#[serial_test::serial]
|
|
831
|
-
#[test]
|
|
832
|
-
fn test_apply_env_host_override() {
|
|
833
|
-
let original = std::env::var("KREUZBERG_HOST").ok();
|
|
834
|
-
unsafe {
|
|
835
|
-
std::env::set_var("KREUZBERG_HOST", "192.168.1.1");
|
|
836
|
-
}
|
|
837
|
-
|
|
838
|
-
let mut config = ServerConfig::default();
|
|
839
|
-
config.apply_env_overrides().unwrap();
|
|
840
|
-
|
|
841
|
-
assert_eq!(config.host, "192.168.1.1");
|
|
842
|
-
|
|
843
|
-
// Cleanup
|
|
844
|
-
unsafe {
|
|
845
|
-
if let Some(orig) = original {
|
|
846
|
-
std::env::set_var("KREUZBERG_HOST", orig);
|
|
847
|
-
} else {
|
|
848
|
-
std::env::remove_var("KREUZBERG_HOST");
|
|
849
|
-
}
|
|
850
|
-
}
|
|
851
|
-
}
|
|
852
|
-
|
|
853
|
-
#[serial_test::serial]
|
|
854
|
-
#[test]
|
|
855
|
-
fn test_apply_env_port_override() {
|
|
856
|
-
let original = std::env::var("KREUZBERG_PORT").ok();
|
|
857
|
-
unsafe {
|
|
858
|
-
std::env::set_var("KREUZBERG_PORT", "5000");
|
|
859
|
-
}
|
|
860
|
-
|
|
861
|
-
let mut config = ServerConfig::default();
|
|
862
|
-
config.apply_env_overrides().unwrap();
|
|
863
|
-
|
|
864
|
-
assert_eq!(config.port, 5000);
|
|
865
|
-
|
|
866
|
-
// Cleanup
|
|
867
|
-
unsafe {
|
|
868
|
-
if let Some(orig) = original {
|
|
869
|
-
std::env::set_var("KREUZBERG_PORT", orig);
|
|
870
|
-
} else {
|
|
871
|
-
std::env::remove_var("KREUZBERG_PORT");
|
|
872
|
-
}
|
|
873
|
-
}
|
|
874
|
-
}
|
|
875
|
-
|
|
876
|
-
#[serial_test::serial]
|
|
877
|
-
#[test]
|
|
878
|
-
fn test_apply_env_port_invalid() {
|
|
879
|
-
let original = std::env::var("KREUZBERG_PORT").ok();
|
|
880
|
-
unsafe {
|
|
881
|
-
std::env::set_var("KREUZBERG_PORT", "not_a_number");
|
|
882
|
-
}
|
|
883
|
-
|
|
884
|
-
let mut config = ServerConfig::default();
|
|
885
|
-
let result = config.apply_env_overrides();
|
|
886
|
-
|
|
887
|
-
assert!(result.is_err());
|
|
888
|
-
assert!(
|
|
889
|
-
result
|
|
890
|
-
.unwrap_err()
|
|
891
|
-
.to_string()
|
|
892
|
-
.contains("KREUZBERG_PORT must be a valid u16")
|
|
893
|
-
);
|
|
894
|
-
|
|
895
|
-
// Cleanup
|
|
896
|
-
unsafe {
|
|
897
|
-
if let Some(orig) = original {
|
|
898
|
-
std::env::set_var("KREUZBERG_PORT", orig);
|
|
899
|
-
} else {
|
|
900
|
-
std::env::remove_var("KREUZBERG_PORT");
|
|
901
|
-
}
|
|
902
|
-
}
|
|
903
|
-
}
|
|
904
|
-
|
|
905
|
-
#[serial_test::serial]
|
|
906
|
-
#[test]
|
|
907
|
-
fn test_apply_env_cors_origins_override() {
|
|
908
|
-
let original = std::env::var("KREUZBERG_CORS_ORIGINS").ok();
|
|
909
|
-
unsafe {
|
|
910
|
-
std::env::set_var("KREUZBERG_CORS_ORIGINS", "https://example.com, https://other.com");
|
|
911
|
-
}
|
|
912
|
-
|
|
913
|
-
let mut config = ServerConfig::default();
|
|
914
|
-
config.apply_env_overrides().unwrap();
|
|
915
|
-
|
|
916
|
-
assert_eq!(config.cors_origins.len(), 2);
|
|
917
|
-
assert!(config.cors_origins.contains(&"https://example.com".to_string()));
|
|
918
|
-
assert!(config.cors_origins.contains(&"https://other.com".to_string()));
|
|
919
|
-
|
|
920
|
-
// Cleanup
|
|
921
|
-
unsafe {
|
|
922
|
-
if let Some(orig) = original {
|
|
923
|
-
std::env::set_var("KREUZBERG_CORS_ORIGINS", orig);
|
|
924
|
-
} else {
|
|
925
|
-
std::env::remove_var("KREUZBERG_CORS_ORIGINS");
|
|
926
|
-
}
|
|
927
|
-
}
|
|
928
|
-
}
|
|
929
|
-
|
|
930
|
-
#[serial_test::serial]
|
|
931
|
-
#[test]
|
|
932
|
-
fn test_apply_env_max_request_body_bytes_override() {
|
|
933
|
-
let original = std::env::var("KREUZBERG_MAX_REQUEST_BODY_BYTES").ok();
|
|
934
|
-
unsafe {
|
|
935
|
-
std::env::set_var("KREUZBERG_MAX_REQUEST_BODY_BYTES", "52428800");
|
|
936
|
-
}
|
|
937
|
-
|
|
938
|
-
let mut config = ServerConfig::default();
|
|
939
|
-
config.apply_env_overrides().unwrap();
|
|
940
|
-
|
|
941
|
-
assert_eq!(config.max_request_body_bytes, 52_428_800);
|
|
942
|
-
|
|
943
|
-
// Cleanup
|
|
944
|
-
unsafe {
|
|
945
|
-
if let Some(orig) = original {
|
|
946
|
-
std::env::set_var("KREUZBERG_MAX_REQUEST_BODY_BYTES", orig);
|
|
947
|
-
} else {
|
|
948
|
-
std::env::remove_var("KREUZBERG_MAX_REQUEST_BODY_BYTES");
|
|
949
|
-
}
|
|
950
|
-
}
|
|
951
|
-
}
|
|
952
|
-
|
|
953
|
-
#[serial_test::serial]
|
|
954
|
-
#[test]
|
|
955
|
-
fn test_apply_env_max_multipart_field_bytes_override() {
|
|
956
|
-
let original = std::env::var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES").ok();
|
|
957
|
-
unsafe {
|
|
958
|
-
std::env::set_var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES", "78643200");
|
|
959
|
-
}
|
|
960
|
-
|
|
961
|
-
let mut config = ServerConfig::default();
|
|
962
|
-
config.apply_env_overrides().unwrap();
|
|
963
|
-
|
|
964
|
-
assert_eq!(config.max_multipart_field_bytes, 78_643_200);
|
|
965
|
-
|
|
966
|
-
// Cleanup
|
|
967
|
-
unsafe {
|
|
968
|
-
if let Some(orig) = original {
|
|
969
|
-
std::env::set_var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES", orig);
|
|
970
|
-
} else {
|
|
971
|
-
std::env::remove_var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES");
|
|
972
|
-
}
|
|
973
|
-
}
|
|
974
|
-
}
|
|
975
|
-
|
|
976
|
-
#[serial_test::serial]
|
|
977
|
-
#[test]
|
|
978
|
-
fn test_apply_env_legacy_max_upload_size_mb_override() {
|
|
979
|
-
let original = std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB").ok();
|
|
980
|
-
unsafe {
|
|
981
|
-
std::env::set_var("KREUZBERG_MAX_UPLOAD_SIZE_MB", "75");
|
|
982
|
-
}
|
|
983
|
-
|
|
984
|
-
let mut config = ServerConfig::default();
|
|
985
|
-
config.apply_env_overrides().unwrap();
|
|
986
|
-
|
|
987
|
-
assert_eq!(config.max_upload_mb, Some(75));
|
|
988
|
-
assert_eq!(config.max_multipart_field_bytes, 75 * 1_048_576);
|
|
989
|
-
|
|
990
|
-
// Cleanup
|
|
991
|
-
unsafe {
|
|
992
|
-
if let Some(orig) = original {
|
|
993
|
-
std::env::set_var("KREUZBERG_MAX_UPLOAD_SIZE_MB", orig);
|
|
994
|
-
} else {
|
|
995
|
-
std::env::remove_var("KREUZBERG_MAX_UPLOAD_SIZE_MB");
|
|
996
|
-
}
|
|
997
|
-
}
|
|
998
|
-
}
|
|
999
|
-
|
|
1000
|
-
#[serial_test::serial]
|
|
1001
|
-
#[test]
|
|
1002
|
-
fn test_apply_env_multiple_overrides() {
|
|
1003
|
-
let host_orig = std::env::var("KREUZBERG_HOST").ok();
|
|
1004
|
-
let port_orig = std::env::var("KREUZBERG_PORT").ok();
|
|
1005
|
-
let cors_orig = std::env::var("KREUZBERG_CORS_ORIGINS").ok();
|
|
1006
|
-
|
|
1007
|
-
unsafe {
|
|
1008
|
-
std::env::set_var("KREUZBERG_HOST", "0.0.0.0");
|
|
1009
|
-
std::env::set_var("KREUZBERG_PORT", "4000");
|
|
1010
|
-
std::env::set_var("KREUZBERG_CORS_ORIGINS", "https://api.example.com");
|
|
1011
|
-
}
|
|
1012
|
-
|
|
1013
|
-
let mut config = ServerConfig::default();
|
|
1014
|
-
config.apply_env_overrides().unwrap();
|
|
1015
|
-
|
|
1016
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
1017
|
-
assert_eq!(config.port, 4000);
|
|
1018
|
-
assert_eq!(config.cors_origins.len(), 1);
|
|
1019
|
-
assert_eq!(config.cors_origins[0], "https://api.example.com");
|
|
1020
|
-
|
|
1021
|
-
// Cleanup
|
|
1022
|
-
unsafe {
|
|
1023
|
-
if let Some(orig) = host_orig {
|
|
1024
|
-
std::env::set_var("KREUZBERG_HOST", orig);
|
|
1025
|
-
} else {
|
|
1026
|
-
std::env::remove_var("KREUZBERG_HOST");
|
|
1027
|
-
}
|
|
1028
|
-
if let Some(orig) = port_orig {
|
|
1029
|
-
std::env::set_var("KREUZBERG_PORT", orig);
|
|
1030
|
-
} else {
|
|
1031
|
-
std::env::remove_var("KREUZBERG_PORT");
|
|
1032
|
-
}
|
|
1033
|
-
if let Some(orig) = cors_orig {
|
|
1034
|
-
std::env::set_var("KREUZBERG_CORS_ORIGINS", orig);
|
|
1035
|
-
} else {
|
|
1036
|
-
std::env::remove_var("KREUZBERG_CORS_ORIGINS");
|
|
1037
|
-
}
|
|
1038
|
-
}
|
|
1039
|
-
}
|
|
1040
|
-
|
|
1041
|
-
#[test]
|
|
1042
|
-
fn test_serde_default_serialization() {
|
|
1043
|
-
let config = ServerConfig::default();
|
|
1044
|
-
let json = serde_json::to_string(&config).unwrap();
|
|
1045
|
-
|
|
1046
|
-
// Should serialize without the max_upload_mb field when None
|
|
1047
|
-
assert!(!json.contains("max_upload_mb"));
|
|
1048
|
-
}
|
|
1049
|
-
|
|
1050
|
-
#[test]
|
|
1051
|
-
fn test_serde_with_max_upload_mb_serialization() {
|
|
1052
|
-
let config = ServerConfig {
|
|
1053
|
-
max_upload_mb: Some(50),
|
|
1054
|
-
..Default::default()
|
|
1055
|
-
};
|
|
1056
|
-
let json = serde_json::to_string(&config).unwrap();
|
|
1057
|
-
|
|
1058
|
-
// Should serialize with max_upload_mb when Some
|
|
1059
|
-
assert!(json.contains("max_upload_mb"));
|
|
1060
|
-
}
|
|
1061
|
-
|
|
1062
|
-
#[test]
|
|
1063
|
-
fn test_cors_origins_empty_in_toml() {
|
|
1064
|
-
let dir = tempdir().unwrap();
|
|
1065
|
-
let config_path = dir.path().join("server.toml");
|
|
1066
|
-
|
|
1067
|
-
fs::write(
|
|
1068
|
-
&config_path,
|
|
1069
|
-
r#"
|
|
1070
|
-
host = "127.0.0.1"
|
|
1071
|
-
port = 8000
|
|
1072
|
-
"#,
|
|
1073
|
-
)
|
|
1074
|
-
.unwrap();
|
|
1075
|
-
|
|
1076
|
-
let config = ServerConfig::from_toml_file(&config_path).unwrap();
|
|
1077
|
-
assert!(config.cors_origins.is_empty());
|
|
1078
|
-
assert!(config.cors_allows_all());
|
|
1079
|
-
}
|
|
1080
|
-
|
|
1081
|
-
#[test]
|
|
1082
|
-
fn test_full_configuration_toml() {
|
|
1083
|
-
let dir = tempdir().unwrap();
|
|
1084
|
-
let config_path = dir.path().join("server.toml");
|
|
1085
|
-
|
|
1086
|
-
fs::write(
|
|
1087
|
-
&config_path,
|
|
1088
|
-
r#"
|
|
1089
|
-
host = "192.168.1.100"
|
|
1090
|
-
port = 9000
|
|
1091
|
-
cors_origins = ["https://app1.com", "https://app2.com", "https://app3.com"]
|
|
1092
|
-
max_request_body_bytes = 200000000
|
|
1093
|
-
max_multipart_field_bytes = 150000000
|
|
1094
|
-
"#,
|
|
1095
|
-
)
|
|
1096
|
-
.unwrap();
|
|
1097
|
-
|
|
1098
|
-
let config = ServerConfig::from_toml_file(&config_path).unwrap();
|
|
1099
|
-
assert_eq!(config.host, "192.168.1.100");
|
|
1100
|
-
assert_eq!(config.port, 9000);
|
|
1101
|
-
assert_eq!(config.listen_addr(), "192.168.1.100:9000");
|
|
1102
|
-
assert_eq!(config.cors_origins.len(), 3);
|
|
1103
|
-
assert!(!config.cors_allows_all());
|
|
1104
|
-
assert!(config.is_origin_allowed("https://app1.com"));
|
|
1105
|
-
assert!(!config.is_origin_allowed("https://app4.com"));
|
|
1106
|
-
assert_eq!(config.max_request_body_bytes, 200_000_000);
|
|
1107
|
-
assert_eq!(config.max_multipart_field_bytes, 150_000_000);
|
|
1108
|
-
assert_eq!(config.max_request_body_mb(), 191);
|
|
1109
|
-
assert_eq!(config.max_multipart_field_mb(), 144);
|
|
1110
|
-
}
|
|
1111
|
-
|
|
1112
|
-
#[test]
|
|
1113
|
-
fn test_from_file_with_nested_server_section_toml() {
|
|
1114
|
-
let dir = tempdir().unwrap();
|
|
1115
|
-
let config_path = dir.path().join("kreuzberg.toml");
|
|
1116
|
-
|
|
1117
|
-
// Config file with [server] section and other sections (like ExtractionConfig)
|
|
1118
|
-
fs::write(
|
|
1119
|
-
&config_path,
|
|
1120
|
-
r#"
|
|
1121
|
-
[server]
|
|
1122
|
-
host = "0.0.0.0"
|
|
1123
|
-
port = 3000
|
|
1124
|
-
cors_origins = ["https://example.com"]
|
|
1125
|
-
|
|
1126
|
-
[ocr]
|
|
1127
|
-
backend = "tesseract"
|
|
1128
|
-
language = "eng"
|
|
1129
|
-
|
|
1130
|
-
[extraction]
|
|
1131
|
-
enabled = true
|
|
1132
|
-
"#,
|
|
1133
|
-
)
|
|
1134
|
-
.unwrap();
|
|
1135
|
-
|
|
1136
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
1137
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
1138
|
-
assert_eq!(config.port, 3000);
|
|
1139
|
-
assert_eq!(config.cors_origins.len(), 1);
|
|
1140
|
-
assert_eq!(config.cors_origins[0], "https://example.com");
|
|
1141
|
-
}
|
|
1142
|
-
|
|
1143
|
-
#[test]
|
|
1144
|
-
fn test_from_file_with_nested_server_section_yaml() {
|
|
1145
|
-
let dir = tempdir().unwrap();
|
|
1146
|
-
let config_path = dir.path().join("kreuzberg.yaml");
|
|
1147
|
-
|
|
1148
|
-
// Config file with server: section and other sections
|
|
1149
|
-
fs::write(
|
|
1150
|
-
&config_path,
|
|
1151
|
-
r#"
|
|
1152
|
-
server:
|
|
1153
|
-
host: 0.0.0.0
|
|
1154
|
-
port: 4000
|
|
1155
|
-
cors_origins:
|
|
1156
|
-
- https://example.com
|
|
1157
|
-
|
|
1158
|
-
ocr:
|
|
1159
|
-
backend: tesseract
|
|
1160
|
-
language: eng
|
|
1161
|
-
"#,
|
|
1162
|
-
)
|
|
1163
|
-
.unwrap();
|
|
1164
|
-
|
|
1165
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
1166
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
1167
|
-
assert_eq!(config.port, 4000);
|
|
1168
|
-
assert_eq!(config.cors_origins.len(), 1);
|
|
1169
|
-
}
|
|
1170
|
-
|
|
1171
|
-
#[test]
|
|
1172
|
-
fn test_from_file_with_nested_server_section_json() {
|
|
1173
|
-
let dir = tempdir().unwrap();
|
|
1174
|
-
let config_path = dir.path().join("kreuzberg.json");
|
|
1175
|
-
|
|
1176
|
-
// Config file with "server" key and other sections
|
|
1177
|
-
fs::write(
|
|
1178
|
-
&config_path,
|
|
1179
|
-
r#"
|
|
1180
|
-
{
|
|
1181
|
-
"server": {
|
|
1182
|
-
"host": "0.0.0.0",
|
|
1183
|
-
"port": 5000,
|
|
1184
|
-
"cors_origins": ["https://example.com"]
|
|
1185
|
-
},
|
|
1186
|
-
"ocr": {
|
|
1187
|
-
"backend": "tesseract",
|
|
1188
|
-
"language": "eng"
|
|
1189
|
-
}
|
|
1190
|
-
}
|
|
1191
|
-
"#,
|
|
1192
|
-
)
|
|
1193
|
-
.unwrap();
|
|
1194
|
-
|
|
1195
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
1196
|
-
assert_eq!(config.host, "0.0.0.0");
|
|
1197
|
-
assert_eq!(config.port, 5000);
|
|
1198
|
-
assert_eq!(config.cors_origins.len(), 1);
|
|
1199
|
-
}
|
|
1200
|
-
|
|
1201
|
-
#[test]
|
|
1202
|
-
fn test_from_file_flat_format_still_works() {
|
|
1203
|
-
let dir = tempdir().unwrap();
|
|
1204
|
-
let config_path = dir.path().join("server.toml");
|
|
1205
|
-
|
|
1206
|
-
// Old flat format without [server] section
|
|
1207
|
-
fs::write(
|
|
1208
|
-
&config_path,
|
|
1209
|
-
r#"
|
|
1210
|
-
host = "192.168.1.1"
|
|
1211
|
-
port = 6000
|
|
1212
|
-
"#,
|
|
1213
|
-
)
|
|
1214
|
-
.unwrap();
|
|
1215
|
-
|
|
1216
|
-
let config = ServerConfig::from_file(&config_path).unwrap();
|
|
1217
|
-
assert_eq!(config.host, "192.168.1.1");
|
|
1218
|
-
assert_eq!(config.port, 6000);
|
|
1219
|
-
}
|
|
1220
|
-
}
|