kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,949 +0,0 @@
|
|
|
1
|
-
//! Configuration validation module.
|
|
2
|
-
//!
|
|
3
|
-
//! Provides centralized validation for configuration values across all bindings.
|
|
4
|
-
//! This eliminates duplication of validation logic in Python, TypeScript, Java, Go, and other language bindings.
|
|
5
|
-
//!
|
|
6
|
-
//! All validation functions return `Result<()>` and produce detailed error messages
|
|
7
|
-
//! suitable for user-facing error handling.
|
|
8
|
-
//!
|
|
9
|
-
//! # Examples
|
|
10
|
-
//!
|
|
11
|
-
//! ```rust
|
|
12
|
-
//! use kreuzberg::core::config_validation::{
|
|
13
|
-
//! validate_binarization_method,
|
|
14
|
-
//! validate_token_reduction_level,
|
|
15
|
-
//! validate_language_code,
|
|
16
|
-
//! };
|
|
17
|
-
//!
|
|
18
|
-
//! // Valid values
|
|
19
|
-
//! assert!(validate_binarization_method("otsu").is_ok());
|
|
20
|
-
//! assert!(validate_token_reduction_level("moderate").is_ok());
|
|
21
|
-
//! assert!(validate_language_code("en").is_ok());
|
|
22
|
-
//!
|
|
23
|
-
//! // Invalid values
|
|
24
|
-
//! assert!(validate_binarization_method("invalid").is_err());
|
|
25
|
-
//! assert!(validate_token_reduction_level("extreme").is_err());
|
|
26
|
-
//! ```
|
|
27
|
-
|
|
28
|
-
use crate::{KreuzbergError, Result};
|
|
29
|
-
|
|
30
|
-
/// Valid binarization methods for image preprocessing.
|
|
31
|
-
const VALID_BINARIZATION_METHODS: &[&str] = &["otsu", "adaptive", "sauvola"];
|
|
32
|
-
|
|
33
|
-
/// Valid token reduction levels.
|
|
34
|
-
const VALID_TOKEN_REDUCTION_LEVELS: &[&str] = &["off", "light", "moderate", "aggressive", "maximum"];
|
|
35
|
-
|
|
36
|
-
/// Valid OCR backends.
|
|
37
|
-
const VALID_OCR_BACKENDS: &[&str] = &["tesseract", "easyocr", "paddleocr"];
|
|
38
|
-
|
|
39
|
-
/// Common ISO 639-1 language codes (extended list).
|
|
40
|
-
/// Covers most major languages and variants used in document processing.
|
|
41
|
-
const VALID_LANGUAGE_CODES: &[&str] = &[
|
|
42
|
-
"en", "de", "fr", "es", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko", "bg", "cs", "da", "el", "et", "fi", "hu",
|
|
43
|
-
"lt", "lv", "ro", "sk", "sl", "sv", "uk", "ar", "hi", "th", "tr", "vi", "eng", "deu", "fra", "spa", "ita", "por",
|
|
44
|
-
"nld", "pol", "rus", "zho", "jpn", "kor", "ces", "dan", "ell", "est", "fin", "hun", "lit", "lav", "ron", "slk",
|
|
45
|
-
"slv", "swe", "tur",
|
|
46
|
-
];
|
|
47
|
-
|
|
48
|
-
/// Valid tesseract PSM (Page Segmentation Mode) values.
|
|
49
|
-
const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
|
|
50
|
-
|
|
51
|
-
/// Valid tesseract OEM (OCR Engine Mode) values.
|
|
52
|
-
const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
|
|
53
|
-
|
|
54
|
-
/// Valid output formats for tesseract.
|
|
55
|
-
const VALID_OUTPUT_FORMATS: &[&str] = &["text", "markdown"];
|
|
56
|
-
|
|
57
|
-
/// Validate a binarization method string.
|
|
58
|
-
///
|
|
59
|
-
/// # Arguments
|
|
60
|
-
///
|
|
61
|
-
/// * `method` - The binarization method to validate (e.g., "otsu", "adaptive", "sauvola")
|
|
62
|
-
///
|
|
63
|
-
/// # Returns
|
|
64
|
-
///
|
|
65
|
-
/// `Ok(())` if the method is valid, or a `ValidationError` with details about valid options.
|
|
66
|
-
///
|
|
67
|
-
/// # Examples
|
|
68
|
-
///
|
|
69
|
-
/// ```rust
|
|
70
|
-
/// use kreuzberg::core::config_validation::validate_binarization_method;
|
|
71
|
-
///
|
|
72
|
-
/// assert!(validate_binarization_method("otsu").is_ok());
|
|
73
|
-
/// assert!(validate_binarization_method("adaptive").is_ok());
|
|
74
|
-
/// assert!(validate_binarization_method("invalid").is_err());
|
|
75
|
-
/// ```
|
|
76
|
-
pub fn validate_binarization_method(method: &str) -> Result<()> {
|
|
77
|
-
let method = method.to_lowercase();
|
|
78
|
-
if VALID_BINARIZATION_METHODS.contains(&method.as_str()) {
|
|
79
|
-
Ok(())
|
|
80
|
-
} else {
|
|
81
|
-
Err(KreuzbergError::Validation {
|
|
82
|
-
message: format!(
|
|
83
|
-
"Invalid binarization method '{}'. Valid options are: {}",
|
|
84
|
-
method,
|
|
85
|
-
VALID_BINARIZATION_METHODS.join(", ")
|
|
86
|
-
),
|
|
87
|
-
source: None,
|
|
88
|
-
})
|
|
89
|
-
}
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
/// Validate a token reduction level string.
|
|
93
|
-
///
|
|
94
|
-
/// # Arguments
|
|
95
|
-
///
|
|
96
|
-
/// * `level` - The token reduction level to validate (e.g., "off", "light", "moderate")
|
|
97
|
-
///
|
|
98
|
-
/// # Returns
|
|
99
|
-
///
|
|
100
|
-
/// `Ok(())` if the level is valid, or a `ValidationError` with details about valid options.
|
|
101
|
-
///
|
|
102
|
-
/// # Examples
|
|
103
|
-
///
|
|
104
|
-
/// ```rust
|
|
105
|
-
/// use kreuzberg::core::config_validation::validate_token_reduction_level;
|
|
106
|
-
///
|
|
107
|
-
/// assert!(validate_token_reduction_level("off").is_ok());
|
|
108
|
-
/// assert!(validate_token_reduction_level("moderate").is_ok());
|
|
109
|
-
/// assert!(validate_token_reduction_level("extreme").is_err());
|
|
110
|
-
/// ```
|
|
111
|
-
pub fn validate_token_reduction_level(level: &str) -> Result<()> {
|
|
112
|
-
let level = level.to_lowercase();
|
|
113
|
-
if VALID_TOKEN_REDUCTION_LEVELS.contains(&level.as_str()) {
|
|
114
|
-
Ok(())
|
|
115
|
-
} else {
|
|
116
|
-
Err(KreuzbergError::Validation {
|
|
117
|
-
message: format!(
|
|
118
|
-
"Invalid token reduction level '{}'. Valid options are: {}",
|
|
119
|
-
level,
|
|
120
|
-
VALID_TOKEN_REDUCTION_LEVELS.join(", ")
|
|
121
|
-
),
|
|
122
|
-
source: None,
|
|
123
|
-
})
|
|
124
|
-
}
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
/// Validate an OCR backend string.
|
|
128
|
-
///
|
|
129
|
-
/// # Arguments
|
|
130
|
-
///
|
|
131
|
-
/// * `backend` - The OCR backend to validate (e.g., "tesseract", "easyocr", "paddleocr")
|
|
132
|
-
///
|
|
133
|
-
/// # Returns
|
|
134
|
-
///
|
|
135
|
-
/// `Ok(())` if the backend is valid, or a `ValidationError` with details about valid options.
|
|
136
|
-
///
|
|
137
|
-
/// # Examples
|
|
138
|
-
///
|
|
139
|
-
/// ```rust
|
|
140
|
-
/// use kreuzberg::core::config_validation::validate_ocr_backend;
|
|
141
|
-
///
|
|
142
|
-
/// assert!(validate_ocr_backend("tesseract").is_ok());
|
|
143
|
-
/// assert!(validate_ocr_backend("easyocr").is_ok());
|
|
144
|
-
/// assert!(validate_ocr_backend("invalid").is_err());
|
|
145
|
-
/// ```
|
|
146
|
-
pub fn validate_ocr_backend(backend: &str) -> Result<()> {
|
|
147
|
-
let backend = backend.to_lowercase();
|
|
148
|
-
if VALID_OCR_BACKENDS.contains(&backend.as_str()) {
|
|
149
|
-
Ok(())
|
|
150
|
-
} else {
|
|
151
|
-
Err(KreuzbergError::Validation {
|
|
152
|
-
message: format!(
|
|
153
|
-
"Invalid OCR backend '{}'. Valid options are: {}",
|
|
154
|
-
backend,
|
|
155
|
-
VALID_OCR_BACKENDS.join(", ")
|
|
156
|
-
),
|
|
157
|
-
source: None,
|
|
158
|
-
})
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
|
|
162
|
-
/// Validate a language code (ISO 639-1 or 639-3 format).
|
|
163
|
-
///
|
|
164
|
-
/// Accepts both 2-letter ISO 639-1 codes (e.g., "en", "de") and
|
|
165
|
-
/// 3-letter ISO 639-3 codes (e.g., "eng", "deu") for broader compatibility.
|
|
166
|
-
///
|
|
167
|
-
/// # Arguments
|
|
168
|
-
///
|
|
169
|
-
/// * `code` - The language code to validate
|
|
170
|
-
///
|
|
171
|
-
/// # Returns
|
|
172
|
-
///
|
|
173
|
-
/// `Ok(())` if the code is valid, or a `ValidationError` indicating an invalid language code.
|
|
174
|
-
///
|
|
175
|
-
/// # Examples
|
|
176
|
-
///
|
|
177
|
-
/// ```rust
|
|
178
|
-
/// use kreuzberg::core::config_validation::validate_language_code;
|
|
179
|
-
///
|
|
180
|
-
/// assert!(validate_language_code("en").is_ok());
|
|
181
|
-
/// assert!(validate_language_code("eng").is_ok());
|
|
182
|
-
/// assert!(validate_language_code("de").is_ok());
|
|
183
|
-
/// assert!(validate_language_code("deu").is_ok());
|
|
184
|
-
/// assert!(validate_language_code("invalid").is_err());
|
|
185
|
-
/// ```
|
|
186
|
-
pub fn validate_language_code(code: &str) -> Result<()> {
|
|
187
|
-
let code_lower = code.to_lowercase();
|
|
188
|
-
|
|
189
|
-
if VALID_LANGUAGE_CODES.contains(&code_lower.as_str()) {
|
|
190
|
-
return Ok(());
|
|
191
|
-
}
|
|
192
|
-
|
|
193
|
-
Err(KreuzbergError::Validation {
|
|
194
|
-
message: format!(
|
|
195
|
-
"Invalid language code '{}'. Use ISO 639-1 (2-letter, e.g., 'en', 'de') \
|
|
196
|
-
or ISO 639-3 (3-letter, e.g., 'eng', 'deu') codes. \
|
|
197
|
-
Common codes: en, de, fr, es, it, pt, nl, pl, ru, zh, ja, ko, ar, hi, th.",
|
|
198
|
-
code
|
|
199
|
-
),
|
|
200
|
-
source: None,
|
|
201
|
-
})
|
|
202
|
-
}
|
|
203
|
-
|
|
204
|
-
/// Validate a tesseract Page Segmentation Mode (PSM).
|
|
205
|
-
///
|
|
206
|
-
/// # Arguments
|
|
207
|
-
///
|
|
208
|
-
/// * `psm` - The PSM value to validate (0-13)
|
|
209
|
-
///
|
|
210
|
-
/// # Returns
|
|
211
|
-
///
|
|
212
|
-
/// `Ok(())` if the PSM is valid, or a `ValidationError` with details about valid ranges.
|
|
213
|
-
///
|
|
214
|
-
/// # Examples
|
|
215
|
-
///
|
|
216
|
-
/// ```rust
|
|
217
|
-
/// use kreuzberg::core::config_validation::validate_tesseract_psm;
|
|
218
|
-
///
|
|
219
|
-
/// assert!(validate_tesseract_psm(3).is_ok()); // Fully automatic
|
|
220
|
-
/// assert!(validate_tesseract_psm(6).is_ok()); // Single block of text
|
|
221
|
-
/// assert!(validate_tesseract_psm(14).is_err()); // Out of range
|
|
222
|
-
/// ```
|
|
223
|
-
pub fn validate_tesseract_psm(psm: i32) -> Result<()> {
|
|
224
|
-
if VALID_TESSERACT_PSM.contains(&psm) {
|
|
225
|
-
Ok(())
|
|
226
|
-
} else {
|
|
227
|
-
Err(KreuzbergError::Validation {
|
|
228
|
-
message: format!(
|
|
229
|
-
"Invalid tesseract PSM value '{}'. Valid range is 0-13. \
|
|
230
|
-
Common values: 3 (auto), 6 (single block), 11 (sparse text).",
|
|
231
|
-
psm
|
|
232
|
-
),
|
|
233
|
-
source: None,
|
|
234
|
-
})
|
|
235
|
-
}
|
|
236
|
-
}
|
|
237
|
-
|
|
238
|
-
/// Validate a tesseract OCR Engine Mode (OEM).
|
|
239
|
-
///
|
|
240
|
-
/// # Arguments
|
|
241
|
-
///
|
|
242
|
-
/// * `oem` - The OEM value to validate (0-3)
|
|
243
|
-
///
|
|
244
|
-
/// # Returns
|
|
245
|
-
///
|
|
246
|
-
/// `Ok(())` if the OEM is valid, or a `ValidationError` with details about valid options.
|
|
247
|
-
///
|
|
248
|
-
/// # Examples
|
|
249
|
-
///
|
|
250
|
-
/// ```rust
|
|
251
|
-
/// use kreuzberg::core::config_validation::validate_tesseract_oem;
|
|
252
|
-
///
|
|
253
|
-
/// assert!(validate_tesseract_oem(1).is_ok()); // Neural nets (LSTM)
|
|
254
|
-
/// assert!(validate_tesseract_oem(2).is_ok()); // Legacy + LSTM
|
|
255
|
-
/// assert!(validate_tesseract_oem(4).is_err()); // Out of range
|
|
256
|
-
/// ```
|
|
257
|
-
pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
|
|
258
|
-
if VALID_TESSERACT_OEM.contains(&oem) {
|
|
259
|
-
Ok(())
|
|
260
|
-
} else {
|
|
261
|
-
Err(KreuzbergError::Validation {
|
|
262
|
-
message: format!(
|
|
263
|
-
"Invalid tesseract OEM value '{}'. Valid range is 0-3. \
|
|
264
|
-
0=Legacy, 1=LSTM, 2=Legacy+LSTM, 3=Default",
|
|
265
|
-
oem
|
|
266
|
-
),
|
|
267
|
-
source: None,
|
|
268
|
-
})
|
|
269
|
-
}
|
|
270
|
-
}
|
|
271
|
-
|
|
272
|
-
/// Validate a tesseract output format.
|
|
273
|
-
///
|
|
274
|
-
/// # Arguments
|
|
275
|
-
///
|
|
276
|
-
/// * `format` - The output format to validate (e.g., "text", "markdown")
|
|
277
|
-
///
|
|
278
|
-
/// # Returns
|
|
279
|
-
///
|
|
280
|
-
/// `Ok(())` if the format is valid, or a `ValidationError` with details about valid options.
|
|
281
|
-
///
|
|
282
|
-
/// # Examples
|
|
283
|
-
///
|
|
284
|
-
/// ```rust
|
|
285
|
-
/// use kreuzberg::core::config_validation::validate_output_format;
|
|
286
|
-
///
|
|
287
|
-
/// assert!(validate_output_format("text").is_ok());
|
|
288
|
-
/// assert!(validate_output_format("markdown").is_ok());
|
|
289
|
-
/// assert!(validate_output_format("json").is_err());
|
|
290
|
-
/// ```
|
|
291
|
-
pub fn validate_output_format(format: &str) -> Result<()> {
|
|
292
|
-
let format = format.to_lowercase();
|
|
293
|
-
if VALID_OUTPUT_FORMATS.contains(&format.as_str()) {
|
|
294
|
-
Ok(())
|
|
295
|
-
} else {
|
|
296
|
-
Err(KreuzbergError::Validation {
|
|
297
|
-
message: format!(
|
|
298
|
-
"Invalid output format '{}'. Valid options are: {}",
|
|
299
|
-
format,
|
|
300
|
-
VALID_OUTPUT_FORMATS.join(", ")
|
|
301
|
-
),
|
|
302
|
-
source: None,
|
|
303
|
-
})
|
|
304
|
-
}
|
|
305
|
-
}
|
|
306
|
-
|
|
307
|
-
/// Validate a confidence threshold value.
|
|
308
|
-
///
|
|
309
|
-
/// Confidence thresholds should be between 0.0 and 1.0 inclusive.
|
|
310
|
-
///
|
|
311
|
-
/// # Arguments
|
|
312
|
-
///
|
|
313
|
-
/// * `confidence` - The confidence threshold to validate
|
|
314
|
-
///
|
|
315
|
-
/// # Returns
|
|
316
|
-
///
|
|
317
|
-
/// `Ok(())` if the confidence is valid, or a `ValidationError` with details about valid ranges.
|
|
318
|
-
///
|
|
319
|
-
/// # Examples
|
|
320
|
-
///
|
|
321
|
-
/// ```rust
|
|
322
|
-
/// use kreuzberg::core::config_validation::validate_confidence;
|
|
323
|
-
///
|
|
324
|
-
/// assert!(validate_confidence(0.5).is_ok());
|
|
325
|
-
/// assert!(validate_confidence(0.0).is_ok());
|
|
326
|
-
/// assert!(validate_confidence(1.0).is_ok());
|
|
327
|
-
/// assert!(validate_confidence(1.5).is_err());
|
|
328
|
-
/// assert!(validate_confidence(-0.1).is_err());
|
|
329
|
-
/// ```
|
|
330
|
-
pub fn validate_confidence(confidence: f64) -> Result<()> {
|
|
331
|
-
if (0.0..=1.0).contains(&confidence) {
|
|
332
|
-
Ok(())
|
|
333
|
-
} else {
|
|
334
|
-
Err(KreuzbergError::Validation {
|
|
335
|
-
message: format!(
|
|
336
|
-
"Invalid confidence threshold '{}'. Must be between 0.0 and 1.0.",
|
|
337
|
-
confidence
|
|
338
|
-
),
|
|
339
|
-
source: None,
|
|
340
|
-
})
|
|
341
|
-
}
|
|
342
|
-
}
|
|
343
|
-
|
|
344
|
-
/// Validate a DPI (dots per inch) value.
|
|
345
|
-
///
|
|
346
|
-
/// DPI should be a positive integer, typically 72-600.
|
|
347
|
-
///
|
|
348
|
-
/// # Arguments
|
|
349
|
-
///
|
|
350
|
-
/// * `dpi` - The DPI value to validate
|
|
351
|
-
///
|
|
352
|
-
/// # Returns
|
|
353
|
-
///
|
|
354
|
-
/// `Ok(())` if the DPI is valid, or a `ValidationError` with details about valid ranges.
|
|
355
|
-
///
|
|
356
|
-
/// # Examples
|
|
357
|
-
///
|
|
358
|
-
/// ```rust
|
|
359
|
-
/// use kreuzberg::core::config_validation::validate_dpi;
|
|
360
|
-
///
|
|
361
|
-
/// assert!(validate_dpi(96).is_ok());
|
|
362
|
-
/// assert!(validate_dpi(300).is_ok());
|
|
363
|
-
/// assert!(validate_dpi(0).is_err());
|
|
364
|
-
/// assert!(validate_dpi(-1).is_err());
|
|
365
|
-
/// ```
|
|
366
|
-
pub fn validate_dpi(dpi: i32) -> Result<()> {
|
|
367
|
-
if dpi > 0 && dpi <= 2400 {
|
|
368
|
-
Ok(())
|
|
369
|
-
} else {
|
|
370
|
-
Err(KreuzbergError::Validation {
|
|
371
|
-
message: format!(
|
|
372
|
-
"Invalid DPI value '{}'. Must be a positive integer, typically 72-600.",
|
|
373
|
-
dpi
|
|
374
|
-
),
|
|
375
|
-
source: None,
|
|
376
|
-
})
|
|
377
|
-
}
|
|
378
|
-
}
|
|
379
|
-
|
|
380
|
-
/// Validate chunk size parameters.
|
|
381
|
-
///
|
|
382
|
-
/// Checks that max_chars > 0 and max_overlap < max_chars.
|
|
383
|
-
///
|
|
384
|
-
/// # Arguments
|
|
385
|
-
///
|
|
386
|
-
/// * `max_chars` - The maximum characters per chunk
|
|
387
|
-
/// * `max_overlap` - The maximum overlap between chunks
|
|
388
|
-
///
|
|
389
|
-
/// # Returns
|
|
390
|
-
///
|
|
391
|
-
/// `Ok(())` if the parameters are valid, or a `ValidationError` with details about constraints.
|
|
392
|
-
///
|
|
393
|
-
/// # Examples
|
|
394
|
-
///
|
|
395
|
-
/// ```rust
|
|
396
|
-
/// use kreuzberg::core::config_validation::validate_chunking_params;
|
|
397
|
-
///
|
|
398
|
-
/// assert!(validate_chunking_params(1000, 200).is_ok());
|
|
399
|
-
/// assert!(validate_chunking_params(500, 50).is_ok());
|
|
400
|
-
/// assert!(validate_chunking_params(0, 100).is_err()); // max_chars must be > 0
|
|
401
|
-
/// assert!(validate_chunking_params(100, 150).is_err()); // overlap >= max_chars
|
|
402
|
-
/// ```
|
|
403
|
-
pub fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<()> {
|
|
404
|
-
if max_chars == 0 {
|
|
405
|
-
return Err(KreuzbergError::Validation {
|
|
406
|
-
message: "max_chars must be greater than 0".to_string(),
|
|
407
|
-
source: None,
|
|
408
|
-
});
|
|
409
|
-
}
|
|
410
|
-
|
|
411
|
-
if max_overlap >= max_chars {
|
|
412
|
-
return Err(KreuzbergError::Validation {
|
|
413
|
-
message: format!(
|
|
414
|
-
"max_overlap ({}) must be less than max_chars ({})",
|
|
415
|
-
max_overlap, max_chars
|
|
416
|
-
),
|
|
417
|
-
source: None,
|
|
418
|
-
});
|
|
419
|
-
}
|
|
420
|
-
|
|
421
|
-
Ok(())
|
|
422
|
-
}
|
|
423
|
-
|
|
424
|
-
/// Validate a port number for server configuration.
|
|
425
|
-
///
|
|
426
|
-
/// Port must be in the range 1-65535. While ports 1-1023 are privileged and may require
|
|
427
|
-
/// special permissions on some systems, they are still valid port numbers.
|
|
428
|
-
///
|
|
429
|
-
/// # Arguments
|
|
430
|
-
///
|
|
431
|
-
/// * `port` - The port number to validate
|
|
432
|
-
///
|
|
433
|
-
/// # Returns
|
|
434
|
-
///
|
|
435
|
-
/// `Ok(())` if the port is valid, or a `ValidationError` with details about valid ranges.
|
|
436
|
-
///
|
|
437
|
-
/// # Examples
|
|
438
|
-
///
|
|
439
|
-
/// ```rust
|
|
440
|
-
/// use kreuzberg::core::config_validation::validate_port;
|
|
441
|
-
///
|
|
442
|
-
/// assert!(validate_port(8000).is_ok());
|
|
443
|
-
/// assert!(validate_port(80).is_ok());
|
|
444
|
-
/// assert!(validate_port(1).is_ok());
|
|
445
|
-
/// assert!(validate_port(65535).is_ok());
|
|
446
|
-
/// assert!(validate_port(0).is_err());
|
|
447
|
-
/// assert!(validate_port(65536).is_err());
|
|
448
|
-
/// ```
|
|
449
|
-
pub fn validate_port(port: u16) -> Result<()> {
|
|
450
|
-
if port > 0 {
|
|
451
|
-
Ok(())
|
|
452
|
-
} else {
|
|
453
|
-
Err(KreuzbergError::Validation {
|
|
454
|
-
message: format!("Port must be 1-65535, got {}", port),
|
|
455
|
-
source: None,
|
|
456
|
-
})
|
|
457
|
-
}
|
|
458
|
-
}
|
|
459
|
-
|
|
460
|
-
/// Validate a host/IP address string for server configuration.
|
|
461
|
-
///
|
|
462
|
-
/// Accepts valid IPv4 addresses (e.g., "127.0.0.1", "0.0.0.0"), valid IPv6 addresses
|
|
463
|
-
/// (e.g., "::1", "::"), and hostnames (e.g., "localhost", "example.com").
|
|
464
|
-
///
|
|
465
|
-
/// # Arguments
|
|
466
|
-
///
|
|
467
|
-
/// * `host` - The host/IP address string to validate
|
|
468
|
-
///
|
|
469
|
-
/// # Returns
|
|
470
|
-
///
|
|
471
|
-
/// `Ok(())` if the host is valid, or a `ValidationError` with details about valid formats.
|
|
472
|
-
///
|
|
473
|
-
/// # Examples
|
|
474
|
-
///
|
|
475
|
-
/// ```rust
|
|
476
|
-
/// use kreuzberg::core::config_validation::validate_host;
|
|
477
|
-
///
|
|
478
|
-
/// assert!(validate_host("127.0.0.1").is_ok());
|
|
479
|
-
/// assert!(validate_host("0.0.0.0").is_ok());
|
|
480
|
-
/// assert!(validate_host("::1").is_ok());
|
|
481
|
-
/// assert!(validate_host("::").is_ok());
|
|
482
|
-
/// assert!(validate_host("localhost").is_ok());
|
|
483
|
-
/// assert!(validate_host("example.com").is_ok());
|
|
484
|
-
/// assert!(validate_host("").is_err());
|
|
485
|
-
/// ```
|
|
486
|
-
pub fn validate_host(host: &str) -> Result<()> {
|
|
487
|
-
let host = host.trim();
|
|
488
|
-
|
|
489
|
-
if host.is_empty() {
|
|
490
|
-
return Err(KreuzbergError::Validation {
|
|
491
|
-
message: "Invalid host '': must be a valid IP address or hostname".to_string(),
|
|
492
|
-
source: None,
|
|
493
|
-
});
|
|
494
|
-
}
|
|
495
|
-
|
|
496
|
-
// Check if it's a valid IPv4 address
|
|
497
|
-
if host.parse::<std::net::Ipv4Addr>().is_ok() {
|
|
498
|
-
return Ok(());
|
|
499
|
-
}
|
|
500
|
-
|
|
501
|
-
// Check if it's a valid IPv6 address
|
|
502
|
-
if host.parse::<std::net::Ipv6Addr>().is_ok() {
|
|
503
|
-
return Ok(());
|
|
504
|
-
}
|
|
505
|
-
|
|
506
|
-
// Check if it's a valid hostname (basic validation)
|
|
507
|
-
// Hostnames must contain only alphanumeric characters, dots, and hyphens
|
|
508
|
-
// Must not look like an invalid IPv4 address (all numeric with dots)
|
|
509
|
-
let looks_like_ipv4 = host
|
|
510
|
-
.split('.')
|
|
511
|
-
.all(|part| !part.is_empty() && part.chars().all(|c| c.is_numeric()));
|
|
512
|
-
if !looks_like_ipv4
|
|
513
|
-
&& host.chars().all(|c| c.is_alphanumeric() || c == '.' || c == '-')
|
|
514
|
-
&& !host.starts_with('-')
|
|
515
|
-
&& !host.ends_with('-')
|
|
516
|
-
{
|
|
517
|
-
return Ok(());
|
|
518
|
-
}
|
|
519
|
-
|
|
520
|
-
Err(KreuzbergError::Validation {
|
|
521
|
-
message: format!("Invalid host '{}': must be a valid IP address or hostname", host),
|
|
522
|
-
source: None,
|
|
523
|
-
})
|
|
524
|
-
}
|
|
525
|
-
|
|
526
|
-
/// Validate a CORS (Cross-Origin Resource Sharing) origin URL.
|
|
527
|
-
///
|
|
528
|
-
/// Accepts valid HTTP/HTTPS URLs (e.g., "https://example.com") or the wildcard "*"
|
|
529
|
-
/// to allow all origins. URLs must start with "http://" or "https://", or be exactly "*".
|
|
530
|
-
///
|
|
531
|
-
/// # Arguments
|
|
532
|
-
///
|
|
533
|
-
/// * `origin` - The CORS origin URL to validate
|
|
534
|
-
///
|
|
535
|
-
/// # Returns
|
|
536
|
-
///
|
|
537
|
-
/// `Ok(())` if the origin is valid, or a `ValidationError` with details about valid formats.
|
|
538
|
-
///
|
|
539
|
-
/// # Examples
|
|
540
|
-
///
|
|
541
|
-
/// ```rust
|
|
542
|
-
/// use kreuzberg::core::config_validation::validate_cors_origin;
|
|
543
|
-
///
|
|
544
|
-
/// assert!(validate_cors_origin("https://example.com").is_ok());
|
|
545
|
-
/// assert!(validate_cors_origin("http://localhost:3000").is_ok());
|
|
546
|
-
/// assert!(validate_cors_origin("*").is_ok());
|
|
547
|
-
/// assert!(validate_cors_origin("not-a-url").is_err());
|
|
548
|
-
/// assert!(validate_cors_origin("ftp://example.com").is_err());
|
|
549
|
-
/// ```
|
|
550
|
-
pub fn validate_cors_origin(origin: &str) -> Result<()> {
|
|
551
|
-
let origin = origin.trim();
|
|
552
|
-
|
|
553
|
-
if origin == "*" {
|
|
554
|
-
return Ok(());
|
|
555
|
-
}
|
|
556
|
-
|
|
557
|
-
if origin.starts_with("http://") || origin.starts_with("https://") {
|
|
558
|
-
// Basic validation: ensure there's something after the protocol
|
|
559
|
-
if origin.len() > 8 && (origin.starts_with("http://") && origin.len() > 7 || origin.starts_with("https://")) {
|
|
560
|
-
return Ok(());
|
|
561
|
-
}
|
|
562
|
-
}
|
|
563
|
-
|
|
564
|
-
Err(KreuzbergError::Validation {
|
|
565
|
-
message: format!(
|
|
566
|
-
"Invalid CORS origin '{}': must be a valid HTTP/HTTPS URL or '*'",
|
|
567
|
-
origin
|
|
568
|
-
),
|
|
569
|
-
source: None,
|
|
570
|
-
})
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
/// Validate an upload size limit for server configuration.
|
|
574
|
-
///
|
|
575
|
-
/// Upload size must be greater than 0 (measured in bytes).
|
|
576
|
-
///
|
|
577
|
-
/// # Arguments
|
|
578
|
-
///
|
|
579
|
-
/// * `size` - The maximum upload size in bytes to validate
|
|
580
|
-
///
|
|
581
|
-
/// # Returns
|
|
582
|
-
///
|
|
583
|
-
/// `Ok(())` if the size is valid, or a `ValidationError` with details about constraints.
|
|
584
|
-
///
|
|
585
|
-
/// # Examples
|
|
586
|
-
///
|
|
587
|
-
/// ```rust
|
|
588
|
-
/// use kreuzberg::core::config_validation::validate_upload_size;
|
|
589
|
-
///
|
|
590
|
-
/// assert!(validate_upload_size(1024).is_ok());
|
|
591
|
-
/// assert!(validate_upload_size(1_000_000).is_ok());
|
|
592
|
-
/// assert!(validate_upload_size(0).is_err());
|
|
593
|
-
/// ```
|
|
594
|
-
pub fn validate_upload_size(size: usize) -> Result<()> {
|
|
595
|
-
if size > 0 {
|
|
596
|
-
Ok(())
|
|
597
|
-
} else {
|
|
598
|
-
Err(KreuzbergError::Validation {
|
|
599
|
-
message: format!("Upload size must be greater than 0, got {}", size),
|
|
600
|
-
source: None,
|
|
601
|
-
})
|
|
602
|
-
}
|
|
603
|
-
}
|
|
604
|
-
|
|
605
|
-
#[cfg(test)]
mod tests {
    //! Unit tests for the configuration validators.
    //!
    //! Most tests are table-driven: each one loops over a list of inputs and
    //! reports the offending input in the assertion message.

    use super::*;

    // ----- binarization method -----

    #[test]
    fn test_validate_binarization_method_valid() {
        for method in ["otsu", "adaptive", "sauvola"] {
            assert!(validate_binarization_method(method).is_ok(), "{} should be valid", method);
        }
    }

    #[test]
    fn test_validate_binarization_method_case_insensitive() {
        for method in ["OTSU", "Adaptive", "SAUVOLA"] {
            assert!(validate_binarization_method(method).is_ok(), "{} should be valid", method);
        }
    }

    #[test]
    fn test_validate_binarization_method_invalid() {
        let outcome = validate_binarization_method("invalid");
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        for fragment in ["Invalid binarization method", "otsu"] {
            assert!(text.contains(fragment), "missing {:?} in {:?}", fragment, text);
        }
    }

    // ----- token reduction level -----

    #[test]
    fn test_validate_token_reduction_level_valid() {
        for level in ["off", "light", "moderate", "aggressive", "maximum"] {
            assert!(validate_token_reduction_level(level).is_ok(), "{} should be valid", level);
        }
    }

    #[test]
    fn test_validate_token_reduction_level_case_insensitive() {
        for level in ["OFF", "Moderate", "MAXIMUM"] {
            assert!(validate_token_reduction_level(level).is_ok(), "{} should be valid", level);
        }
    }

    #[test]
    fn test_validate_token_reduction_level_invalid() {
        let outcome = validate_token_reduction_level("extreme");
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("Invalid token reduction level"));
    }

    // ----- OCR backend -----

    #[test]
    fn test_validate_ocr_backend_valid() {
        for backend in ["tesseract", "easyocr", "paddleocr"] {
            assert!(validate_ocr_backend(backend).is_ok(), "{} should be valid", backend);
        }
    }

    #[test]
    fn test_validate_ocr_backend_case_insensitive() {
        for backend in ["TESSERACT", "EasyOCR", "PADDLEOCR"] {
            assert!(validate_ocr_backend(backend).is_ok(), "{} should be valid", backend);
        }
    }

    #[test]
    fn test_validate_ocr_backend_invalid() {
        let outcome = validate_ocr_backend("invalid_backend");
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("Invalid OCR backend"));
    }

    // ----- language code -----

    #[test]
    fn test_validate_language_code_valid_iso639_1() {
        for code in ["en", "de", "fr", "es", "zh", "ja", "ko"] {
            assert!(validate_language_code(code).is_ok(), "{} should be valid", code);
        }
    }

    #[test]
    fn test_validate_language_code_valid_iso639_3() {
        for code in ["eng", "deu", "fra", "spa", "zho", "jpn", "kor"] {
            assert!(validate_language_code(code).is_ok(), "{} should be valid", code);
        }
    }

    #[test]
    fn test_validate_language_code_case_insensitive() {
        for code in ["EN", "ENG", "De", "DEU"] {
            assert!(validate_language_code(code).is_ok(), "{} should be valid", code);
        }
    }

    #[test]
    fn test_validate_language_code_invalid() {
        let outcome = validate_language_code("invalid");
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        for fragment in ["Invalid language code", "ISO 639"] {
            assert!(text.contains(fragment), "missing {:?} in {:?}", fragment, text);
        }
    }

    // ----- Tesseract PSM / OEM -----

    #[test]
    fn test_validate_tesseract_psm_valid() {
        (0..=13).for_each(|psm| {
            assert!(validate_tesseract_psm(psm).is_ok(), "PSM {} should be valid", psm);
        });
    }

    #[test]
    fn test_validate_tesseract_psm_invalid() {
        for psm in [-1, 14, 100] {
            assert!(validate_tesseract_psm(psm).is_err(), "PSM {} should be invalid", psm);
        }
    }

    #[test]
    fn test_validate_tesseract_oem_valid() {
        (0..=3).for_each(|oem| {
            assert!(validate_tesseract_oem(oem).is_ok(), "OEM {} should be valid", oem);
        });
    }

    #[test]
    fn test_validate_tesseract_oem_invalid() {
        for oem in [-1, 4, 10] {
            assert!(validate_tesseract_oem(oem).is_err(), "OEM {} should be invalid", oem);
        }
    }

    // ----- output format -----

    #[test]
    fn test_validate_output_format_valid() {
        for format in ["text", "markdown"] {
            assert!(validate_output_format(format).is_ok(), "{} should be valid", format);
        }
    }

    #[test]
    fn test_validate_output_format_case_insensitive() {
        for format in ["TEXT", "Markdown"] {
            assert!(validate_output_format(format).is_ok(), "{} should be valid", format);
        }
    }

    #[test]
    fn test_validate_output_format_invalid() {
        let outcome = validate_output_format("json");
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("Invalid output format"));
    }

    // ----- confidence -----

    #[test]
    fn test_validate_confidence_valid() {
        for value in [0.0, 0.5, 1.0, 0.75] {
            assert!(validate_confidence(value).is_ok(), "{} should be valid", value);
        }
    }

    #[test]
    fn test_validate_confidence_invalid() {
        for value in [-0.1, 1.1, 2.0] {
            assert!(validate_confidence(value).is_err(), "{} should be invalid", value);
        }
    }

    // ----- DPI -----

    #[test]
    fn test_validate_dpi_valid() {
        for dpi in [72, 96, 300, 600, 1] {
            assert!(validate_dpi(dpi).is_ok(), "{} should be valid", dpi);
        }
    }

    #[test]
    fn test_validate_dpi_invalid() {
        for dpi in [0, -1, 2401] {
            assert!(validate_dpi(dpi).is_err(), "{} should be invalid", dpi);
        }
    }

    // ----- chunking parameters -----

    #[test]
    fn test_validate_chunking_params_valid() {
        for (max_chars, max_overlap) in [(1000, 200), (500, 50), (1, 0)] {
            assert!(
                validate_chunking_params(max_chars, max_overlap).is_ok(),
                "({}, {}) should be valid",
                max_chars,
                max_overlap
            );
        }
    }

    #[test]
    fn test_validate_chunking_params_zero_chars() {
        let outcome = validate_chunking_params(0, 100);
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("max_chars"));
    }

    #[test]
    fn test_validate_chunking_params_overlap_too_large() {
        // Overlap equal to the chunk size is already too large.
        let equal = validate_chunking_params(100, 100);
        assert!(equal.is_err());
        assert!(equal.unwrap_err().to_string().contains("overlap"));

        let larger = validate_chunking_params(100, 150);
        assert!(larger.is_err());
    }

    // ----- error message quality -----

    #[test]
    fn test_error_messages_are_helpful() {
        let binarization = validate_binarization_method("bad").unwrap_err().to_string();
        for option in ["otsu", "adaptive", "sauvola"] {
            assert!(binarization.contains(option), "missing {:?} in {:?}", option, binarization);
        }

        let reduction = validate_token_reduction_level("bad").unwrap_err().to_string();
        for option in ["off", "moderate"] {
            assert!(reduction.contains(option), "missing {:?} in {:?}", option, reduction);
        }

        let language = validate_language_code("bad").unwrap_err().to_string();
        for fragment in ["ISO 639", "en"] {
            assert!(language.contains(fragment), "missing {:?} in {:?}", fragment, language);
        }
    }

    // ----- port -----

    #[test]
    fn test_validate_port_valid() {
        for port in [1, 80, 443, 8000, 65535] {
            assert!(validate_port(port).is_ok(), "{} should be valid", port);
        }
    }

    #[test]
    fn test_validate_port_invalid() {
        let outcome = validate_port(0);
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        for fragment in ["Port must be 1-65535", "0"] {
            assert!(text.contains(fragment), "missing {:?} in {:?}", fragment, text);
        }
    }

    // ----- host -----

    #[test]
    fn test_validate_host_ipv4() {
        for host in ["127.0.0.1", "0.0.0.0", "192.168.1.1", "10.0.0.1", "255.255.255.255"] {
            assert!(validate_host(host).is_ok(), "{} should be valid", host);
        }
    }

    #[test]
    fn test_validate_host_ipv6() {
        for host in ["::1", "::", "2001:db8::1", "fe80::1"] {
            assert!(validate_host(host).is_ok(), "{} should be valid", host);
        }
    }

    #[test]
    fn test_validate_host_hostname() {
        for host in ["localhost", "example.com", "sub.example.com", "api-server", "app123"] {
            assert!(validate_host(host).is_ok(), "{} should be valid", host);
        }
    }

    #[test]
    fn test_validate_host_invalid() {
        let empty = validate_host("");
        assert!(empty.is_err());
        let text = empty.unwrap_err().to_string();
        assert!(text.contains("Invalid host"));

        for host in ["not a valid host", "256.256.256.256"] {
            assert!(validate_host(host).is_err(), "{:?} should be invalid", host);
        }
    }

    // ----- CORS origin -----

    #[test]
    fn test_validate_cors_origin_https() {
        for origin in [
            "https://example.com",
            "https://localhost:3000",
            "https://sub.example.com",
            "https://192.168.1.1",
            "https://example.com/path",
        ] {
            assert!(validate_cors_origin(origin).is_ok(), "{} should be valid", origin);
        }
    }

    #[test]
    fn test_validate_cors_origin_http() {
        for origin in ["http://example.com", "http://localhost:3000", "http://127.0.0.1:8000"] {
            assert!(validate_cors_origin(origin).is_ok(), "{} should be valid", origin);
        }
    }

    #[test]
    fn test_validate_cors_origin_wildcard() {
        assert!(validate_cors_origin("*").is_ok());
    }

    #[test]
    fn test_validate_cors_origin_invalid() {
        let outcome = validate_cors_origin("not-a-url");
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        assert!(text.contains("Invalid CORS origin"));

        for origin in ["ftp://example.com", "example.com", "http://"] {
            assert!(validate_cors_origin(origin).is_err(), "{:?} should be invalid", origin);
        }
    }

    // ----- upload size -----

    #[test]
    fn test_validate_upload_size_valid() {
        for size in [1, 1024, 1_000_000, 1_000_000_000, usize::MAX] {
            assert!(validate_upload_size(size).is_ok(), "{} should be valid", size);
        }
    }

    #[test]
    fn test_validate_upload_size_invalid() {
        let outcome = validate_upload_size(0);
        assert!(outcome.is_err());
        let text = outcome.unwrap_err().to_string();
        for fragment in ["Upload size must be greater than 0", "0"] {
            assert!(text.contains(fragment), "missing {:?} in {:?}", fragment, text);
        }
    }
}
|