kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
//! Environment variable overrides for server configuration.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functionality to override server configuration values
|
|
4
|
+
//! using environment variables. All settings can be overridden at runtime.
|
|
5
|
+
|
|
6
|
+
use crate::{KreuzbergError, Result};
|
|
7
|
+
|
|
8
|
+
/// Apply environment variable overrides to a ServerConfig.
|
|
9
|
+
///
|
|
10
|
+
/// Reads the following environment variables and overrides config values if set:
|
|
11
|
+
///
|
|
12
|
+
/// - `KREUZBERG_HOST` - Server host address
|
|
13
|
+
/// - `KREUZBERG_PORT` - Server port number (parsed as u16)
|
|
14
|
+
/// - `KREUZBERG_CORS_ORIGINS` - Comma-separated list of allowed origins
|
|
15
|
+
/// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` - Max request body size in bytes
|
|
16
|
+
/// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` - Max multipart field size in bytes
|
|
17
|
+
/// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` - Max upload size in MB (legacy)
|
|
18
|
+
///
|
|
19
|
+
/// # Errors
|
|
20
|
+
///
|
|
21
|
+
/// Returns `KreuzbergError::Validation` if:
|
|
22
|
+
/// - `KREUZBERG_PORT` cannot be parsed as u16
|
|
23
|
+
/// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` cannot be parsed as usize
|
|
24
|
+
/// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` cannot be parsed as usize
|
|
25
|
+
/// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` cannot be parsed as usize
|
|
26
|
+
pub fn apply_env_overrides(
|
|
27
|
+
host: &mut String,
|
|
28
|
+
port: &mut u16,
|
|
29
|
+
cors_origins: &mut Vec<String>,
|
|
30
|
+
max_request_body_bytes: &mut usize,
|
|
31
|
+
max_multipart_field_bytes: &mut usize,
|
|
32
|
+
max_upload_mb: &mut Option<usize>,
|
|
33
|
+
) -> Result<()> {
|
|
34
|
+
// Host override
|
|
35
|
+
if let Ok(env_host) = std::env::var("KREUZBERG_HOST") {
|
|
36
|
+
*host = env_host;
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// Port override
|
|
40
|
+
if let Ok(port_str) = std::env::var("KREUZBERG_PORT") {
|
|
41
|
+
*port = port_str.parse::<u16>().map_err(|e| {
|
|
42
|
+
KreuzbergError::validation(format!(
|
|
43
|
+
"KREUZBERG_PORT must be a valid u16 number, got '{}': {}",
|
|
44
|
+
port_str, e
|
|
45
|
+
))
|
|
46
|
+
})?;
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
// CORS origins override (comma-separated)
|
|
50
|
+
if let Ok(origins_str) = std::env::var("KREUZBERG_CORS_ORIGINS") {
|
|
51
|
+
*cors_origins = origins_str
|
|
52
|
+
.split(',')
|
|
53
|
+
.map(|s| s.trim().to_string())
|
|
54
|
+
.filter(|s| !s.is_empty())
|
|
55
|
+
.collect();
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
// Max request body bytes override
|
|
59
|
+
if let Ok(bytes_str) = std::env::var("KREUZBERG_MAX_REQUEST_BODY_BYTES") {
|
|
60
|
+
*max_request_body_bytes = bytes_str.parse::<usize>().map_err(|e| {
|
|
61
|
+
KreuzbergError::validation(format!(
|
|
62
|
+
"KREUZBERG_MAX_REQUEST_BODY_BYTES must be a valid usize, got '{}': {}",
|
|
63
|
+
bytes_str, e
|
|
64
|
+
))
|
|
65
|
+
})?;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// Max multipart field bytes override
|
|
69
|
+
if let Ok(bytes_str) = std::env::var("KREUZBERG_MAX_MULTIPART_FIELD_BYTES") {
|
|
70
|
+
*max_multipart_field_bytes = bytes_str.parse::<usize>().map_err(|e| {
|
|
71
|
+
KreuzbergError::validation(format!(
|
|
72
|
+
"KREUZBERG_MAX_MULTIPART_FIELD_BYTES must be a valid usize, got '{}': {}",
|
|
73
|
+
bytes_str, e
|
|
74
|
+
))
|
|
75
|
+
})?;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// Legacy max upload size override (in MB)
|
|
79
|
+
if let Ok(mb_str) = std::env::var("KREUZBERG_MAX_UPLOAD_SIZE_MB") {
|
|
80
|
+
let mb = mb_str.parse::<usize>().map_err(|e| {
|
|
81
|
+
KreuzbergError::validation(format!(
|
|
82
|
+
"KREUZBERG_MAX_UPLOAD_SIZE_MB must be a valid usize, got '{}': {}",
|
|
83
|
+
mb_str, e
|
|
84
|
+
))
|
|
85
|
+
})?;
|
|
86
|
+
*max_upload_mb = Some(mb);
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
Ok(())
|
|
90
|
+
}
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
//! File loading logic for server configuration.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functionality to load server configuration from various
|
|
4
|
+
//! file formats (TOML, YAML, JSON) with support for both flat and nested formats.
|
|
5
|
+
|
|
6
|
+
use crate::{KreuzbergError, Result};
|
|
7
|
+
use serde::Deserialize;
|
|
8
|
+
use std::path::Path;
|
|
9
|
+
|
|
10
|
+
use super::ServerConfig;
|
|
11
|
+
|
|
12
|
+
/// Load server configuration from a file.
|
|
13
|
+
///
|
|
14
|
+
/// Automatically detects the file format based on extension:
|
|
15
|
+
/// - `.toml` - TOML format
|
|
16
|
+
/// - `.yaml` or `.yml` - YAML format
|
|
17
|
+
/// - `.json` - JSON format
|
|
18
|
+
///
|
|
19
|
+
/// This function handles two config file formats:
|
|
20
|
+
/// 1. Flat format: Server config at root level
|
|
21
|
+
/// 2. Nested format: Server config under `[server]` section (combined with ExtractionConfig)
|
|
22
|
+
///
|
|
23
|
+
/// # Arguments
|
|
24
|
+
///
|
|
25
|
+
/// * `path` - Path to the configuration file
|
|
26
|
+
///
|
|
27
|
+
/// # Errors
|
|
28
|
+
///
|
|
29
|
+
/// Returns `KreuzbergError::Validation` if:
|
|
30
|
+
/// - File doesn't exist or cannot be read
|
|
31
|
+
/// - File extension is not recognized
|
|
32
|
+
/// - File content is invalid for the detected format
|
|
33
|
+
pub fn from_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
|
|
34
|
+
let path = path.as_ref();
|
|
35
|
+
|
|
36
|
+
let content = std::fs::read_to_string(path)
|
|
37
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
38
|
+
|
|
39
|
+
let extension = path.extension().and_then(|ext| ext.to_str()).ok_or_else(|| {
|
|
40
|
+
KreuzbergError::validation(format!(
|
|
41
|
+
"Cannot determine file format: no extension found in {}",
|
|
42
|
+
path.display()
|
|
43
|
+
))
|
|
44
|
+
})?;
|
|
45
|
+
|
|
46
|
+
let mut config = match extension.to_lowercase().as_str() {
|
|
47
|
+
"toml" => from_toml_str(&content, path)?,
|
|
48
|
+
"yaml" | "yml" => from_yaml_str(&content, path)?,
|
|
49
|
+
"json" => from_json_str(&content, path)?,
|
|
50
|
+
_ => {
|
|
51
|
+
return Err(KreuzbergError::validation(format!(
|
|
52
|
+
"Unsupported config file format: .{}. Supported formats: .toml, .yaml, .yml, .json",
|
|
53
|
+
extension
|
|
54
|
+
)));
|
|
55
|
+
}
|
|
56
|
+
};
|
|
57
|
+
|
|
58
|
+
// Normalize legacy fields
|
|
59
|
+
config.normalize_legacy_fields();
|
|
60
|
+
|
|
61
|
+
Ok(config)
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/// Load server configuration from a TOML file.
|
|
65
|
+
///
|
|
66
|
+
/// # Arguments
|
|
67
|
+
///
|
|
68
|
+
/// * `path` - Path to the TOML file
|
|
69
|
+
///
|
|
70
|
+
/// # Errors
|
|
71
|
+
///
|
|
72
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid TOML.
|
|
73
|
+
pub fn from_toml_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
|
|
74
|
+
let path = path.as_ref();
|
|
75
|
+
|
|
76
|
+
let content = std::fs::read_to_string(path)
|
|
77
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
78
|
+
|
|
79
|
+
let mut config: ServerConfig = toml::from_str(&content)
|
|
80
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
|
|
81
|
+
|
|
82
|
+
config.normalize_legacy_fields();
|
|
83
|
+
|
|
84
|
+
Ok(config)
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/// Load server configuration from a YAML file.
|
|
88
|
+
///
|
|
89
|
+
/// # Arguments
|
|
90
|
+
///
|
|
91
|
+
/// * `path` - Path to the YAML file
|
|
92
|
+
///
|
|
93
|
+
/// # Errors
|
|
94
|
+
///
|
|
95
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid YAML.
|
|
96
|
+
pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
|
|
97
|
+
let path = path.as_ref();
|
|
98
|
+
|
|
99
|
+
let content = std::fs::read_to_string(path)
|
|
100
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
101
|
+
|
|
102
|
+
let mut config: ServerConfig = serde_yaml_ng::from_str(&content)
|
|
103
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
|
|
104
|
+
|
|
105
|
+
config.normalize_legacy_fields();
|
|
106
|
+
|
|
107
|
+
Ok(config)
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
/// Load server configuration from a JSON file.
|
|
111
|
+
///
|
|
112
|
+
/// # Arguments
|
|
113
|
+
///
|
|
114
|
+
/// * `path` - Path to the JSON file
|
|
115
|
+
///
|
|
116
|
+
/// # Errors
|
|
117
|
+
///
|
|
118
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid JSON.
|
|
119
|
+
pub fn from_json_file(path: impl AsRef<Path>) -> Result<ServerConfig> {
|
|
120
|
+
let path = path.as_ref();
|
|
121
|
+
|
|
122
|
+
let content = std::fs::read_to_string(path)
|
|
123
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
124
|
+
|
|
125
|
+
let mut config: ServerConfig = serde_json::from_str(&content)
|
|
126
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
|
|
127
|
+
|
|
128
|
+
config.normalize_legacy_fields();
|
|
129
|
+
|
|
130
|
+
Ok(config)
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
// Helper functions for parsing different formats
|
|
134
|
+
|
|
135
|
+
fn from_toml_str(content: &str, path: &Path) -> Result<ServerConfig> {
|
|
136
|
+
// Try nested format first (with [server] section)
|
|
137
|
+
#[derive(Deserialize)]
|
|
138
|
+
struct RootConfig {
|
|
139
|
+
#[serde(default)]
|
|
140
|
+
server: Option<ServerConfig>,
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
if let Ok(root) = toml::from_str::<RootConfig>(content) {
|
|
144
|
+
if let Some(server) = root.server {
|
|
145
|
+
return Ok(server);
|
|
146
|
+
} else {
|
|
147
|
+
// No [server] section, try flat format
|
|
148
|
+
return toml::from_str::<ServerConfig>(content)
|
|
149
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)));
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
// Fall back to flat format
|
|
154
|
+
toml::from_str::<ServerConfig>(content)
|
|
155
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
fn from_yaml_str(content: &str, path: &Path) -> Result<ServerConfig> {
|
|
159
|
+
// Try nested format first (with server: section)
|
|
160
|
+
#[derive(Deserialize)]
|
|
161
|
+
struct RootConfig {
|
|
162
|
+
#[serde(default)]
|
|
163
|
+
server: Option<ServerConfig>,
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
if let Ok(root) = serde_yaml_ng::from_str::<RootConfig>(content) {
|
|
167
|
+
if let Some(server) = root.server {
|
|
168
|
+
return Ok(server);
|
|
169
|
+
} else {
|
|
170
|
+
// No server section, try flat format
|
|
171
|
+
return serde_yaml_ng::from_str::<ServerConfig>(content)
|
|
172
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)));
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Fall back to flat format
|
|
177
|
+
serde_yaml_ng::from_str::<ServerConfig>(content)
|
|
178
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
fn from_json_str(content: &str, path: &Path) -> Result<ServerConfig> {
|
|
182
|
+
// Try nested format first (with "server" key)
|
|
183
|
+
#[derive(Deserialize)]
|
|
184
|
+
struct RootConfig {
|
|
185
|
+
#[serde(default)]
|
|
186
|
+
server: Option<ServerConfig>,
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
if let Ok(root) = serde_json::from_str::<RootConfig>(content) {
|
|
190
|
+
if let Some(server) = root.server {
|
|
191
|
+
return Ok(server);
|
|
192
|
+
} else {
|
|
193
|
+
// No server key, try flat format
|
|
194
|
+
return serde_json::from_str::<ServerConfig>(content)
|
|
195
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)));
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
// Fall back to flat format
|
|
200
|
+
serde_json::from_str::<ServerConfig>(content)
|
|
201
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))
|
|
202
|
+
}
|
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
//! Server configuration for the Kreuzberg API.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides the `ServerConfig` struct for managing API server settings
|
|
4
|
+
//! including host, port, CORS, and upload size limits. Configuration can be loaded
|
|
5
|
+
//! from TOML, YAML, or JSON files and can be overridden by environment variables.
|
|
6
|
+
//!
|
|
7
|
+
//! # Features
|
|
8
|
+
//!
|
|
9
|
+
//! - **Multi-format support**: Load configuration from TOML, YAML, or JSON files
|
|
10
|
+
//! - **Environment overrides**: All settings can be overridden via environment variables
|
|
11
|
+
//! - **Backward compatibility**: Supports legacy `max_upload_mb` field for smooth migrations
|
|
12
|
+
//! - **Sensible defaults**: All fields have reasonable defaults matching current behavior
|
|
13
|
+
//! - **Flexible CORS**: Support for all origins (default) or specific origin lists
|
|
14
|
+
//!
|
|
15
|
+
//! # Example
|
|
16
|
+
//!
|
|
17
|
+
//! ```rust,no_run
|
|
18
|
+
//! use kreuzberg::core::ServerConfig;
|
|
19
|
+
//!
|
|
20
|
+
//! # fn example() -> kreuzberg::Result<()> {
|
|
21
|
+
//! // Create with defaults
|
|
22
|
+
//! let mut config = ServerConfig::default();
|
|
23
|
+
//!
|
|
24
|
+
//! // Or load from file
|
|
25
|
+
//! let mut config = ServerConfig::from_file("kreuzberg.toml")?;
|
|
26
|
+
//!
|
|
27
|
+
//! // Apply environment variable overrides
|
|
28
|
+
//! config.apply_env_overrides()?;
|
|
29
|
+
//!
|
|
30
|
+
//! # Ok(())
|
|
31
|
+
//! # }
|
|
32
|
+
//! ```
|
|
33
|
+
|
|
34
|
+
use crate::Result;
|
|
35
|
+
use serde::{Deserialize, Serialize};
|
|
36
|
+
use std::path::Path;
|
|
37
|
+
|
|
38
|
+
mod env;
|
|
39
|
+
mod loader;
|
|
40
|
+
mod validation;
|
|
41
|
+
|
|
42
|
+
#[cfg(test)]
|
|
43
|
+
mod tests;
|
|
44
|
+
|
|
45
|
+
/// Host the API server binds to when none is configured (loopback only).
const DEFAULT_HOST: &str = "127.0.0.1";

/// Port the API server listens on when none is configured.
const DEFAULT_PORT: u16 = 8000;

/// Upper bound for a whole request body when none is configured: 100 MB.
const DEFAULT_MAX_REQUEST_BODY_BYTES: usize = 100 * 1024 * 1024;

/// Upper bound for a single multipart field when none is configured: 100 MB.
const DEFAULT_MAX_MULTIPART_FIELD_BYTES: usize = 100 * 1024 * 1024;
|
|
56
|
+
|
|
57
|
+
/// API server configuration.
|
|
58
|
+
///
|
|
59
|
+
/// This struct holds all configuration options for the Kreuzberg API server,
|
|
60
|
+
/// including host/port settings, CORS configuration, and upload limits.
|
|
61
|
+
///
|
|
62
|
+
/// # Defaults
|
|
63
|
+
///
|
|
64
|
+
/// - `host`: "127.0.0.1" (localhost only)
|
|
65
|
+
/// - `port`: 8000
|
|
66
|
+
/// - `cors_origins`: empty vector (allows all origins)
|
|
67
|
+
/// - `max_request_body_bytes`: 104_857_600 (100 MB)
|
|
68
|
+
/// - `max_multipart_field_bytes`: 104_857_600 (100 MB)
|
|
69
|
+
/// - `max_upload_mb`: None (legacy field, not used if other fields set)
|
|
70
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
71
|
+
#[serde(default)]
|
|
72
|
+
pub struct ServerConfig {
|
|
73
|
+
/// Server host address (e.g., "127.0.0.1", "0.0.0.0")
|
|
74
|
+
#[serde(default = "default_host")]
|
|
75
|
+
pub host: String,
|
|
76
|
+
|
|
77
|
+
/// Server port number
|
|
78
|
+
#[serde(default = "default_port")]
|
|
79
|
+
pub port: u16,
|
|
80
|
+
|
|
81
|
+
/// CORS allowed origins. Empty vector means allow all origins.
|
|
82
|
+
///
|
|
83
|
+
/// If this is an empty vector, the server will accept requests from any origin.
|
|
84
|
+
/// If populated with specific origins (e.g., ["https://example.com"]), only
|
|
85
|
+
/// those origins will be allowed.
|
|
86
|
+
#[serde(default)]
|
|
87
|
+
pub cors_origins: Vec<String>,
|
|
88
|
+
|
|
89
|
+
/// Maximum size of request body in bytes (default: 100 MB)
|
|
90
|
+
#[serde(default = "default_max_request_body_bytes")]
|
|
91
|
+
pub max_request_body_bytes: usize,
|
|
92
|
+
|
|
93
|
+
/// Maximum size of multipart fields in bytes (default: 100 MB)
|
|
94
|
+
#[serde(default = "default_max_multipart_field_bytes")]
|
|
95
|
+
pub max_multipart_field_bytes: usize,
|
|
96
|
+
|
|
97
|
+
/// Legacy upload size limit in MB (for backward compatibility).
|
|
98
|
+
///
|
|
99
|
+
/// This field is deprecated and only used for backward compatibility.
|
|
100
|
+
/// If set, it will override `max_multipart_field_bytes` during normalization.
|
|
101
|
+
/// New configurations should use `max_multipart_field_bytes` directly.
|
|
102
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
103
|
+
pub max_upload_mb: Option<usize>,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
impl Default for ServerConfig {
|
|
107
|
+
fn default() -> Self {
|
|
108
|
+
Self {
|
|
109
|
+
host: default_host(),
|
|
110
|
+
port: default_port(),
|
|
111
|
+
cors_origins: Vec::new(),
|
|
112
|
+
max_request_body_bytes: default_max_request_body_bytes(),
|
|
113
|
+
max_multipart_field_bytes: default_max_multipart_field_bytes(),
|
|
114
|
+
max_upload_mb: None,
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// Default value functions for serde
|
|
120
|
+
fn default_host() -> String {
|
|
121
|
+
DEFAULT_HOST.to_string()
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
fn default_port() -> u16 {
|
|
125
|
+
DEFAULT_PORT
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
fn default_max_request_body_bytes() -> usize {
|
|
129
|
+
DEFAULT_MAX_REQUEST_BODY_BYTES
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
fn default_max_multipart_field_bytes() -> usize {
|
|
133
|
+
DEFAULT_MAX_MULTIPART_FIELD_BYTES
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
impl ServerConfig {
|
|
137
|
+
/// Create a new `ServerConfig` with default values.
|
|
138
|
+
pub fn new() -> Self {
|
|
139
|
+
Self::default()
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Get the server listen address (host:port).
|
|
143
|
+
///
|
|
144
|
+
/// # Example
|
|
145
|
+
///
|
|
146
|
+
/// ```rust
|
|
147
|
+
/// use kreuzberg::core::ServerConfig;
|
|
148
|
+
///
|
|
149
|
+
/// let config = ServerConfig::default();
|
|
150
|
+
/// assert_eq!(config.listen_addr(), "127.0.0.1:8000");
|
|
151
|
+
/// ```
|
|
152
|
+
pub fn listen_addr(&self) -> String {
|
|
153
|
+
format!("{}:{}", self.host, self.port)
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/// Check if CORS allows all origins.
|
|
157
|
+
///
|
|
158
|
+
/// Returns `true` if the `cors_origins` vector is empty, meaning all origins
|
|
159
|
+
/// are allowed. Returns `false` if specific origins are configured.
|
|
160
|
+
///
|
|
161
|
+
/// # Example
|
|
162
|
+
///
|
|
163
|
+
/// ```rust
|
|
164
|
+
/// use kreuzberg::core::ServerConfig;
|
|
165
|
+
///
|
|
166
|
+
/// let mut config = ServerConfig::default();
|
|
167
|
+
/// assert!(config.cors_allows_all());
|
|
168
|
+
///
|
|
169
|
+
/// config.cors_origins.push("https://example.com".to_string());
|
|
170
|
+
/// assert!(!config.cors_allows_all());
|
|
171
|
+
/// ```
|
|
172
|
+
pub fn cors_allows_all(&self) -> bool {
|
|
173
|
+
self.cors_origins.is_empty()
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
/// Check if a given origin is allowed by CORS configuration.
|
|
177
|
+
///
|
|
178
|
+
/// Returns `true` if:
|
|
179
|
+
/// - CORS allows all origins (empty origins list), or
|
|
180
|
+
/// - The given origin is in the allowed origins list
|
|
181
|
+
///
|
|
182
|
+
/// # Arguments
|
|
183
|
+
///
|
|
184
|
+
/// * `origin` - The origin to check (e.g., "https://example.com")
|
|
185
|
+
///
|
|
186
|
+
/// # Example
|
|
187
|
+
///
|
|
188
|
+
/// ```rust
|
|
189
|
+
/// use kreuzberg::core::ServerConfig;
|
|
190
|
+
///
|
|
191
|
+
/// let mut config = ServerConfig::default();
|
|
192
|
+
/// assert!(config.is_origin_allowed("https://example.com"));
|
|
193
|
+
///
|
|
194
|
+
/// config.cors_origins.push("https://allowed.com".to_string());
|
|
195
|
+
/// assert!(config.is_origin_allowed("https://allowed.com"));
|
|
196
|
+
/// assert!(!config.is_origin_allowed("https://denied.com"));
|
|
197
|
+
/// ```
|
|
198
|
+
pub fn is_origin_allowed(&self, origin: &str) -> bool {
|
|
199
|
+
self.cors_origins.is_empty() || self.cors_origins.contains(&origin.to_string())
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
/// Get maximum request body size in megabytes (rounded up).
|
|
203
|
+
///
|
|
204
|
+
/// # Example
|
|
205
|
+
///
|
|
206
|
+
/// ```rust
|
|
207
|
+
/// use kreuzberg::core::ServerConfig;
|
|
208
|
+
///
|
|
209
|
+
/// let mut config = ServerConfig::default();
|
|
210
|
+
/// assert_eq!(config.max_request_body_mb(), 100);
|
|
211
|
+
/// ```
|
|
212
|
+
pub fn max_request_body_mb(&self) -> usize {
|
|
213
|
+
self.max_request_body_bytes.div_ceil(1_048_576)
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
/// Get maximum multipart field size in megabytes (rounded up).
|
|
217
|
+
///
|
|
218
|
+
/// # Example
|
|
219
|
+
///
|
|
220
|
+
/// ```rust
|
|
221
|
+
/// use kreuzberg::core::ServerConfig;
|
|
222
|
+
///
|
|
223
|
+
/// let mut config = ServerConfig::default();
|
|
224
|
+
/// assert_eq!(config.max_multipart_field_mb(), 100);
|
|
225
|
+
/// ```
|
|
226
|
+
pub fn max_multipart_field_mb(&self) -> usize {
|
|
227
|
+
self.max_multipart_field_bytes.div_ceil(1_048_576)
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
/// Normalize legacy field values for backward compatibility.
|
|
231
|
+
///
|
|
232
|
+
/// If `max_upload_mb` is set, it will be converted to bytes and used to
|
|
233
|
+
/// override `max_multipart_field_bytes`. This allows old configurations
|
|
234
|
+
/// using the legacy field to continue working.
|
|
235
|
+
///
|
|
236
|
+
/// This method is automatically called by `apply_env_overrides()`.
|
|
237
|
+
pub fn normalize_legacy_fields(&mut self) {
|
|
238
|
+
validation::normalize_legacy_fields(self.max_upload_mb, &mut self.max_multipart_field_bytes);
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
/// Apply environment variable overrides to the configuration.
|
|
242
|
+
///
|
|
243
|
+
/// Reads the following environment variables and overrides config values if set:
|
|
244
|
+
///
|
|
245
|
+
/// - `KREUZBERG_HOST` - Server host address
|
|
246
|
+
/// - `KREUZBERG_PORT` - Server port number (parsed as u16)
|
|
247
|
+
/// - `KREUZBERG_CORS_ORIGINS` - Comma-separated list of allowed origins
|
|
248
|
+
/// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` - Max request body size in bytes
|
|
249
|
+
/// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` - Max multipart field size in bytes
|
|
250
|
+
/// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` - Max upload size in MB (legacy)
|
|
251
|
+
///
|
|
252
|
+
/// # Errors
|
|
253
|
+
///
|
|
254
|
+
/// Returns `KreuzbergError::Validation` if:
|
|
255
|
+
/// - `KREUZBERG_PORT` cannot be parsed as u16
|
|
256
|
+
/// - `KREUZBERG_MAX_REQUEST_BODY_BYTES` cannot be parsed as usize
|
|
257
|
+
/// - `KREUZBERG_MAX_MULTIPART_FIELD_BYTES` cannot be parsed as usize
|
|
258
|
+
/// - `KREUZBERG_MAX_UPLOAD_SIZE_MB` cannot be parsed as usize
|
|
259
|
+
///
|
|
260
|
+
/// # Example
|
|
261
|
+
///
|
|
262
|
+
/// ```rust,no_run
|
|
263
|
+
/// use kreuzberg::core::ServerConfig;
|
|
264
|
+
///
|
|
265
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
266
|
+
/// unsafe {
|
|
267
|
+
/// std::env::set_var("KREUZBERG_HOST", "0.0.0.0");
|
|
268
|
+
/// std::env::set_var("KREUZBERG_PORT", "3000");
|
|
269
|
+
/// }
|
|
270
|
+
///
|
|
271
|
+
/// let mut config = ServerConfig::default();
|
|
272
|
+
/// config.apply_env_overrides()?;
|
|
273
|
+
///
|
|
274
|
+
/// assert_eq!(config.host, "0.0.0.0");
|
|
275
|
+
/// assert_eq!(config.port, 3000);
|
|
276
|
+
/// # Ok(())
|
|
277
|
+
/// # }
|
|
278
|
+
/// ```
|
|
279
|
+
pub fn apply_env_overrides(&mut self) -> Result<()> {
|
|
280
|
+
env::apply_env_overrides(
|
|
281
|
+
&mut self.host,
|
|
282
|
+
&mut self.port,
|
|
283
|
+
&mut self.cors_origins,
|
|
284
|
+
&mut self.max_request_body_bytes,
|
|
285
|
+
&mut self.max_multipart_field_bytes,
|
|
286
|
+
&mut self.max_upload_mb,
|
|
287
|
+
)?;
|
|
288
|
+
|
|
289
|
+
// Apply legacy field normalization
|
|
290
|
+
self.normalize_legacy_fields();
|
|
291
|
+
|
|
292
|
+
Ok(())
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
/// Load server configuration from a file.
|
|
296
|
+
///
|
|
297
|
+
/// Automatically detects the file format based on extension:
|
|
298
|
+
/// - `.toml` - TOML format
|
|
299
|
+
/// - `.yaml` or `.yml` - YAML format
|
|
300
|
+
/// - `.json` - JSON format
|
|
301
|
+
///
|
|
302
|
+
/// This function handles two config file formats:
|
|
303
|
+
/// 1. Flat format: Server config at root level
|
|
304
|
+
/// 2. Nested format: Server config under `[server]` section (combined with ExtractionConfig)
|
|
305
|
+
///
|
|
306
|
+
/// # Arguments
|
|
307
|
+
///
|
|
308
|
+
/// * `path` - Path to the configuration file
|
|
309
|
+
///
|
|
310
|
+
/// # Errors
|
|
311
|
+
///
|
|
312
|
+
/// Returns `KreuzbergError::Validation` if:
|
|
313
|
+
/// - File doesn't exist or cannot be read
|
|
314
|
+
/// - File extension is not recognized
|
|
315
|
+
/// - File content is invalid for the detected format
|
|
316
|
+
///
|
|
317
|
+
/// # Example
|
|
318
|
+
///
|
|
319
|
+
/// ```rust,no_run
|
|
320
|
+
/// use kreuzberg::core::ServerConfig;
|
|
321
|
+
///
|
|
322
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
323
|
+
/// let config = ServerConfig::from_file("kreuzberg.toml")?;
|
|
324
|
+
/// # Ok(())
|
|
325
|
+
/// # }
|
|
326
|
+
/// ```
|
|
327
|
+
pub fn from_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
328
|
+
loader::from_file(path)
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/// Load server configuration from a TOML file.
|
|
332
|
+
///
|
|
333
|
+
/// # Arguments
|
|
334
|
+
///
|
|
335
|
+
/// * `path` - Path to the TOML file
|
|
336
|
+
///
|
|
337
|
+
/// # Errors
|
|
338
|
+
///
|
|
339
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid TOML.
|
|
340
|
+
///
|
|
341
|
+
/// # Example
|
|
342
|
+
///
|
|
343
|
+
/// ```rust,no_run
|
|
344
|
+
/// use kreuzberg::core::ServerConfig;
|
|
345
|
+
///
|
|
346
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
347
|
+
/// let config = ServerConfig::from_toml_file("kreuzberg.toml")?;
|
|
348
|
+
/// # Ok(())
|
|
349
|
+
/// # }
|
|
350
|
+
/// ```
|
|
351
|
+
pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
352
|
+
loader::from_toml_file(path)
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
/// Load server configuration from a YAML file.
|
|
356
|
+
///
|
|
357
|
+
/// # Arguments
|
|
358
|
+
///
|
|
359
|
+
/// * `path` - Path to the YAML file
|
|
360
|
+
///
|
|
361
|
+
/// # Errors
|
|
362
|
+
///
|
|
363
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid YAML.
|
|
364
|
+
pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
365
|
+
loader::from_yaml_file(path)
|
|
366
|
+
}
|
|
367
|
+
|
|
368
|
+
/// Load server configuration from a JSON file.
|
|
369
|
+
///
|
|
370
|
+
/// # Arguments
|
|
371
|
+
///
|
|
372
|
+
/// * `path` - Path to the JSON file
|
|
373
|
+
///
|
|
374
|
+
/// # Errors
|
|
375
|
+
///
|
|
376
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or is invalid JSON.
|
|
377
|
+
pub fn from_json_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
378
|
+
loader::from_json_file(path)
|
|
379
|
+
}
|
|
380
|
+
}
|