kreuzberg 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +8 -0
- data/.rspec +3 -0
- data/.rubocop.yaml +534 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +157 -0
- data/README.md +421 -0
- data/Rakefile +25 -0
- data/Steepfile +47 -0
- data/examples/async_patterns.rb +340 -0
- data/ext/kreuzberg_rb/extconf.rb +35 -0
- data/ext/kreuzberg_rb/native/Cargo.toml +36 -0
- data/ext/kreuzberg_rb/native/README.md +425 -0
- data/ext/kreuzberg_rb/native/build.rs +17 -0
- data/ext/kreuzberg_rb/native/include/ieeefp.h +11 -0
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +14 -0
- data/ext/kreuzberg_rb/native/include/strings.h +20 -0
- data/ext/kreuzberg_rb/native/include/unistd.h +47 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +2939 -0
- data/extconf.rb +28 -0
- data/kreuzberg.gemspec +105 -0
- data/lib/kreuzberg/api_proxy.rb +142 -0
- data/lib/kreuzberg/cache_api.rb +45 -0
- data/lib/kreuzberg/cli.rb +55 -0
- data/lib/kreuzberg/cli_proxy.rb +127 -0
- data/lib/kreuzberg/config.rb +684 -0
- data/lib/kreuzberg/errors.rb +50 -0
- data/lib/kreuzberg/extraction_api.rb +84 -0
- data/lib/kreuzberg/mcp_proxy.rb +186 -0
- data/lib/kreuzberg/ocr_backend_protocol.rb +113 -0
- data/lib/kreuzberg/post_processor_protocol.rb +86 -0
- data/lib/kreuzberg/result.rb +216 -0
- data/lib/kreuzberg/setup_lib_path.rb +79 -0
- data/lib/kreuzberg/validator_protocol.rb +89 -0
- data/lib/kreuzberg/version.rb +5 -0
- data/lib/kreuzberg.rb +82 -0
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg/internal.rbs +184 -0
- data/sig/kreuzberg.rbs +468 -0
- data/spec/binding/cache_spec.rb +227 -0
- data/spec/binding/cli_proxy_spec.rb +87 -0
- data/spec/binding/cli_spec.rb +54 -0
- data/spec/binding/config_spec.rb +345 -0
- data/spec/binding/config_validation_spec.rb +283 -0
- data/spec/binding/error_handling_spec.rb +213 -0
- data/spec/binding/errors_spec.rb +66 -0
- data/spec/binding/plugins/ocr_backend_spec.rb +307 -0
- data/spec/binding/plugins/postprocessor_spec.rb +269 -0
- data/spec/binding/plugins/validator_spec.rb +274 -0
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +39 -0
- data/spec/fixtures/config.yaml +42 -0
- data/spec/fixtures/invalid_config.toml +4 -0
- data/spec/smoke/package_spec.rb +178 -0
- data/spec/spec_helper.rb +42 -0
- data/vendor/kreuzberg/Cargo.toml +134 -0
- data/vendor/kreuzberg/README.md +175 -0
- data/vendor/kreuzberg/build.rs +460 -0
- data/vendor/kreuzberg/src/api/error.rs +81 -0
- data/vendor/kreuzberg/src/api/handlers.rs +199 -0
- data/vendor/kreuzberg/src/api/mod.rs +79 -0
- data/vendor/kreuzberg/src/api/server.rs +353 -0
- data/vendor/kreuzberg/src/api/types.rs +170 -0
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +1143 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +677 -0
- data/vendor/kreuzberg/src/core/batch_mode.rs +35 -0
- data/vendor/kreuzberg/src/core/config.rs +1032 -0
- data/vendor/kreuzberg/src/core/extractor.rs +903 -0
- data/vendor/kreuzberg/src/core/io.rs +327 -0
- data/vendor/kreuzberg/src/core/mime.rs +615 -0
- data/vendor/kreuzberg/src/core/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/pipeline.rs +906 -0
- data/vendor/kreuzberg/src/embeddings.rs +323 -0
- data/vendor/kreuzberg/src/error.rs +431 -0
- data/vendor/kreuzberg/src/extraction/archive.rs +954 -0
- data/vendor/kreuzberg/src/extraction/docx.rs +40 -0
- data/vendor/kreuzberg/src/extraction/email.rs +854 -0
- data/vendor/kreuzberg/src/extraction/excel.rs +688 -0
- data/vendor/kreuzberg/src/extraction/html.rs +553 -0
- data/vendor/kreuzberg/src/extraction/image.rs +368 -0
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +564 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +77 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/app_properties.rs +398 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/core_properties.rs +247 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/custom_properties.rs +240 -0
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +128 -0
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +3000 -0
- data/vendor/kreuzberg/src/extraction/structured.rs +490 -0
- data/vendor/kreuzberg/src/extraction/table.rs +328 -0
- data/vendor/kreuzberg/src/extraction/text.rs +269 -0
- data/vendor/kreuzberg/src/extraction/xml.rs +333 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +425 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +479 -0
- data/vendor/kreuzberg/src/extractors/email.rs +129 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +344 -0
- data/vendor/kreuzberg/src/extractors/html.rs +410 -0
- data/vendor/kreuzberg/src/extractors/image.rs +195 -0
- data/vendor/kreuzberg/src/extractors/mod.rs +268 -0
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +496 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +234 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +126 -0
- data/vendor/kreuzberg/src/extractors/text.rs +242 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +128 -0
- data/vendor/kreuzberg/src/image/dpi.rs +164 -0
- data/vendor/kreuzberg/src/image/mod.rs +6 -0
- data/vendor/kreuzberg/src/image/preprocessing.rs +417 -0
- data/vendor/kreuzberg/src/image/resize.rs +89 -0
- data/vendor/kreuzberg/src/keywords/config.rs +154 -0
- data/vendor/kreuzberg/src/keywords/mod.rs +237 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +267 -0
- data/vendor/kreuzberg/src/keywords/rake.rs +294 -0
- data/vendor/kreuzberg/src/keywords/types.rs +68 -0
- data/vendor/kreuzberg/src/keywords/yake.rs +163 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +942 -0
- data/vendor/kreuzberg/src/lib.rs +102 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +32 -0
- data/vendor/kreuzberg/src/mcp/server.rs +1966 -0
- data/vendor/kreuzberg/src/ocr/cache.rs +469 -0
- data/vendor/kreuzberg/src/ocr/error.rs +37 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +216 -0
- data/vendor/kreuzberg/src/ocr/mod.rs +58 -0
- data/vendor/kreuzberg/src/ocr/processor.rs +847 -0
- data/vendor/kreuzberg/src/ocr/table/mod.rs +4 -0
- data/vendor/kreuzberg/src/ocr/table/tsv_parser.rs +144 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +450 -0
- data/vendor/kreuzberg/src/ocr/types.rs +393 -0
- data/vendor/kreuzberg/src/ocr/utils.rs +47 -0
- data/vendor/kreuzberg/src/ocr/validation.rs +206 -0
- data/vendor/kreuzberg/src/pdf/error.rs +122 -0
- data/vendor/kreuzberg/src/pdf/images.rs +139 -0
- data/vendor/kreuzberg/src/pdf/metadata.rs +346 -0
- data/vendor/kreuzberg/src/pdf/mod.rs +50 -0
- data/vendor/kreuzberg/src/pdf/rendering.rs +369 -0
- data/vendor/kreuzberg/src/pdf/table.rs +420 -0
- data/vendor/kreuzberg/src/pdf/text.rs +161 -0
- data/vendor/kreuzberg/src/plugins/extractor.rs +1010 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +209 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +629 -0
- data/vendor/kreuzberg/src/plugins/processor.rs +641 -0
- data/vendor/kreuzberg/src/plugins/registry.rs +1324 -0
- data/vendor/kreuzberg/src/plugins/traits.rs +258 -0
- data/vendor/kreuzberg/src/plugins/validator.rs +955 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +1470 -0
- data/vendor/kreuzberg/src/text/mod.rs +19 -0
- data/vendor/kreuzberg/src/text/quality.rs +697 -0
- data/vendor/kreuzberg/src/text/string_utils.rs +217 -0
- data/vendor/kreuzberg/src/text/token_reduction/cjk_utils.rs +164 -0
- data/vendor/kreuzberg/src/text/token_reduction/config.rs +100 -0
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +796 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +902 -0
- data/vendor/kreuzberg/src/text/token_reduction/mod.rs +160 -0
- data/vendor/kreuzberg/src/text/token_reduction/semantic.rs +619 -0
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +147 -0
- data/vendor/kreuzberg/src/types.rs +873 -0
- data/vendor/kreuzberg/src/utils/mod.rs +17 -0
- data/vendor/kreuzberg/src/utils/quality.rs +959 -0
- data/vendor/kreuzberg/src/utils/string_utils.rs +381 -0
- data/vendor/kreuzberg/stopwords/af_stopwords.json +53 -0
- data/vendor/kreuzberg/stopwords/ar_stopwords.json +482 -0
- data/vendor/kreuzberg/stopwords/bg_stopwords.json +261 -0
- data/vendor/kreuzberg/stopwords/bn_stopwords.json +400 -0
- data/vendor/kreuzberg/stopwords/br_stopwords.json +1205 -0
- data/vendor/kreuzberg/stopwords/ca_stopwords.json +280 -0
- data/vendor/kreuzberg/stopwords/cs_stopwords.json +425 -0
- data/vendor/kreuzberg/stopwords/da_stopwords.json +172 -0
- data/vendor/kreuzberg/stopwords/de_stopwords.json +622 -0
- data/vendor/kreuzberg/stopwords/el_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/en_stopwords.json +1300 -0
- data/vendor/kreuzberg/stopwords/eo_stopwords.json +175 -0
- data/vendor/kreuzberg/stopwords/es_stopwords.json +734 -0
- data/vendor/kreuzberg/stopwords/et_stopwords.json +37 -0
- data/vendor/kreuzberg/stopwords/eu_stopwords.json +100 -0
- data/vendor/kreuzberg/stopwords/fa_stopwords.json +801 -0
- data/vendor/kreuzberg/stopwords/fi_stopwords.json +849 -0
- data/vendor/kreuzberg/stopwords/fr_stopwords.json +693 -0
- data/vendor/kreuzberg/stopwords/ga_stopwords.json +111 -0
- data/vendor/kreuzberg/stopwords/gl_stopwords.json +162 -0
- data/vendor/kreuzberg/stopwords/gu_stopwords.json +226 -0
- data/vendor/kreuzberg/stopwords/ha_stopwords.json +41 -0
- data/vendor/kreuzberg/stopwords/he_stopwords.json +196 -0
- data/vendor/kreuzberg/stopwords/hi_stopwords.json +227 -0
- data/vendor/kreuzberg/stopwords/hr_stopwords.json +181 -0
- data/vendor/kreuzberg/stopwords/hu_stopwords.json +791 -0
- data/vendor/kreuzberg/stopwords/hy_stopwords.json +47 -0
- data/vendor/kreuzberg/stopwords/id_stopwords.json +760 -0
- data/vendor/kreuzberg/stopwords/it_stopwords.json +634 -0
- data/vendor/kreuzberg/stopwords/ja_stopwords.json +136 -0
- data/vendor/kreuzberg/stopwords/kn_stopwords.json +84 -0
- data/vendor/kreuzberg/stopwords/ko_stopwords.json +681 -0
- data/vendor/kreuzberg/stopwords/ku_stopwords.json +64 -0
- data/vendor/kreuzberg/stopwords/la_stopwords.json +51 -0
- data/vendor/kreuzberg/stopwords/lt_stopwords.json +476 -0
- data/vendor/kreuzberg/stopwords/lv_stopwords.json +163 -0
- data/vendor/kreuzberg/stopwords/ml_stopwords.json +1 -0
- data/vendor/kreuzberg/stopwords/mr_stopwords.json +101 -0
- data/vendor/kreuzberg/stopwords/ms_stopwords.json +477 -0
- data/vendor/kreuzberg/stopwords/ne_stopwords.json +490 -0
- data/vendor/kreuzberg/stopwords/nl_stopwords.json +415 -0
- data/vendor/kreuzberg/stopwords/no_stopwords.json +223 -0
- data/vendor/kreuzberg/stopwords/pl_stopwords.json +331 -0
- data/vendor/kreuzberg/stopwords/pt_stopwords.json +562 -0
- data/vendor/kreuzberg/stopwords/ro_stopwords.json +436 -0
- data/vendor/kreuzberg/stopwords/ru_stopwords.json +561 -0
- data/vendor/kreuzberg/stopwords/si_stopwords.json +193 -0
- data/vendor/kreuzberg/stopwords/sk_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sl_stopwords.json +448 -0
- data/vendor/kreuzberg/stopwords/so_stopwords.json +32 -0
- data/vendor/kreuzberg/stopwords/st_stopwords.json +33 -0
- data/vendor/kreuzberg/stopwords/sv_stopwords.json +420 -0
- data/vendor/kreuzberg/stopwords/sw_stopwords.json +76 -0
- data/vendor/kreuzberg/stopwords/ta_stopwords.json +129 -0
- data/vendor/kreuzberg/stopwords/te_stopwords.json +54 -0
- data/vendor/kreuzberg/stopwords/th_stopwords.json +118 -0
- data/vendor/kreuzberg/stopwords/tl_stopwords.json +149 -0
- data/vendor/kreuzberg/stopwords/tr_stopwords.json +506 -0
- data/vendor/kreuzberg/stopwords/uk_stopwords.json +75 -0
- data/vendor/kreuzberg/stopwords/ur_stopwords.json +519 -0
- data/vendor/kreuzberg/stopwords/vi_stopwords.json +647 -0
- data/vendor/kreuzberg/stopwords/yo_stopwords.json +62 -0
- data/vendor/kreuzberg/stopwords/zh_stopwords.json +796 -0
- data/vendor/kreuzberg/stopwords/zu_stopwords.json +31 -0
- data/vendor/kreuzberg/tests/api_tests.rs +966 -0
- data/vendor/kreuzberg/tests/archive_integration.rs +543 -0
- data/vendor/kreuzberg/tests/batch_orchestration.rs +542 -0
- data/vendor/kreuzberg/tests/batch_processing.rs +304 -0
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +509 -0
- data/vendor/kreuzberg/tests/config_features.rs +580 -0
- data/vendor/kreuzberg/tests/config_loading_tests.rs +439 -0
- data/vendor/kreuzberg/tests/core_integration.rs +493 -0
- data/vendor/kreuzberg/tests/csv_integration.rs +424 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +124 -0
- data/vendor/kreuzberg/tests/email_integration.rs +325 -0
- data/vendor/kreuzberg/tests/error_handling.rs +393 -0
- data/vendor/kreuzberg/tests/format_integration.rs +159 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +142 -0
- data/vendor/kreuzberg/tests/image_integration.rs +253 -0
- data/vendor/kreuzberg/tests/keywords_integration.rs +479 -0
- data/vendor/kreuzberg/tests/keywords_quality.rs +509 -0
- data/vendor/kreuzberg/tests/mime_detection.rs +428 -0
- data/vendor/kreuzberg/tests/ocr_configuration.rs +510 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +676 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +627 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +469 -0
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +43 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +1412 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +771 -0
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +561 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +921 -0
- data/vendor/kreuzberg/tests/plugin_validator_test.rs +783 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +607 -0
- data/vendor/kreuzberg/tests/security_validation.rs +404 -0
- data/vendor/kreuzberg/tests/stopwords_integration_test.rs +888 -0
- data/vendor/kreuzberg/tests/test_fastembed.rs +609 -0
- data/vendor/kreuzberg/tests/xlsx_metadata_extraction_test.rs +87 -0
- metadata +471 -0
|
@@ -0,0 +1,955 @@
|
|
|
1
|
+
//! Validator plugin trait.
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines the trait for implementing custom validation logic.
|
|
4
|
+
|
|
5
|
+
use crate::Result;
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
use crate::plugins::Plugin;
|
|
8
|
+
use crate::types::ExtractionResult;
|
|
9
|
+
use async_trait::async_trait;
|
|
10
|
+
use std::sync::Arc;
|
|
11
|
+
|
|
12
|
+
/// Trait for validator plugins.
|
|
13
|
+
///
|
|
14
|
+
/// Validators check extraction results for quality, completeness, or correctness.
|
|
15
|
+
/// Unlike post-processors, validator errors **fail fast** - if a validator returns
|
|
16
|
+
/// an error, the extraction fails immediately.
|
|
17
|
+
///
|
|
18
|
+
/// # Use Cases
|
|
19
|
+
///
|
|
20
|
+
/// - **Quality Gates**: Ensure extracted content meets minimum quality standards
|
|
21
|
+
/// - **Compliance**: Verify content meets regulatory requirements
|
|
22
|
+
/// - **Content Filtering**: Reject documents containing unwanted content
|
|
23
|
+
/// - **Format Validation**: Verify extracted content structure
|
|
24
|
+
/// - **Security Checks**: Scan for malicious content
|
|
25
|
+
///
|
|
26
|
+
/// # Error Handling
|
|
27
|
+
///
|
|
28
|
+
/// Validator errors are **fatal** - they cause the extraction to fail and bubble up
|
|
29
|
+
/// to the caller. Use validators for hard requirements that must be met.
|
|
30
|
+
///
|
|
31
|
+
/// For non-fatal checks, use post-processors instead.
|
|
32
|
+
///
|
|
33
|
+
/// # Thread Safety
|
|
34
|
+
///
|
|
35
|
+
/// Validators must be thread-safe (`Send + Sync`).
|
|
36
|
+
///
|
|
37
|
+
/// # Example
|
|
38
|
+
///
|
|
39
|
+
/// ```rust
|
|
40
|
+
/// use kreuzberg::plugins::{Plugin, Validator};
|
|
41
|
+
/// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
|
42
|
+
/// use async_trait::async_trait;
|
|
43
|
+
///
|
|
44
|
+
/// /// Validate that extracted content has minimum length
|
|
45
|
+
/// struct MinimumLengthValidator {
|
|
46
|
+
/// min_length: usize,
|
|
47
|
+
/// }
|
|
48
|
+
///
|
|
49
|
+
/// impl Plugin for MinimumLengthValidator {
|
|
50
|
+
/// fn name(&self) -> &str { "min-length-validator" }
|
|
51
|
+
/// fn version(&self) -> String { "1.0.0".to_string() }
|
|
52
|
+
/// fn initialize(&self) -> Result<()> { Ok(()) }
|
|
53
|
+
/// fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
54
|
+
/// }
|
|
55
|
+
///
|
|
56
|
+
/// #[async_trait]
|
|
57
|
+
/// impl Validator for MinimumLengthValidator {
|
|
58
|
+
/// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
|
|
59
|
+
/// -> Result<()> {
|
|
60
|
+
/// if result.content.len() < self.min_length {
|
|
61
|
+
/// return Err(KreuzbergError::validation(format!(
|
|
62
|
+
/// "Content too short: {} < {} characters",
|
|
63
|
+
/// result.content.len(),
|
|
64
|
+
/// self.min_length
|
|
65
|
+
/// )));
|
|
66
|
+
/// }
|
|
67
|
+
/// Ok(())
|
|
68
|
+
/// }
|
|
69
|
+
/// }
|
|
70
|
+
/// ```
|
|
71
|
+
#[async_trait]
|
|
72
|
+
pub trait Validator: Plugin {
|
|
73
|
+
/// Validate an extraction result.
|
|
74
|
+
///
|
|
75
|
+
/// Check the extraction result and return `Ok(())` if valid, or an error
|
|
76
|
+
/// if validation fails.
|
|
77
|
+
///
|
|
78
|
+
/// # Arguments
|
|
79
|
+
///
|
|
80
|
+
/// * `result` - The extraction result to validate
|
|
81
|
+
/// * `config` - Extraction configuration
|
|
82
|
+
///
|
|
83
|
+
/// # Returns
|
|
84
|
+
///
|
|
85
|
+
/// - `Ok(())` if validation passes
|
|
86
|
+
/// - `Err(...)` if validation fails (extraction will fail)
|
|
87
|
+
///
|
|
88
|
+
/// # Errors
|
|
89
|
+
///
|
|
90
|
+
/// - `KreuzbergError::Validation` - Validation failed
|
|
91
|
+
/// - Any other error type appropriate for the failure
|
|
92
|
+
///
|
|
93
|
+
/// # Example - Content Length Validation
|
|
94
|
+
///
|
|
95
|
+
/// ```rust
|
|
96
|
+
/// # use kreuzberg::plugins::{Plugin, Validator};
|
|
97
|
+
/// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
|
98
|
+
/// # use async_trait::async_trait;
|
|
99
|
+
/// # struct ContentLengthValidator { min: usize, max: usize }
|
|
100
|
+
/// # impl Plugin for ContentLengthValidator {
|
|
101
|
+
/// # fn name(&self) -> &str { "length-validator" }
|
|
102
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
103
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
104
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
105
|
+
/// # }
|
|
106
|
+
/// # #[async_trait]
|
|
107
|
+
/// # impl Validator for ContentLengthValidator {
|
|
108
|
+
/// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
|
|
109
|
+
/// -> Result<()> {
|
|
110
|
+
/// let length = result.content.len();
|
|
111
|
+
///
|
|
112
|
+
/// if length < self.min {
|
|
113
|
+
/// return Err(KreuzbergError::validation(format!(
|
|
114
|
+
/// "Content too short: {} < {} characters",
|
|
115
|
+
/// length, self.min
|
|
116
|
+
/// )));
|
|
117
|
+
/// }
|
|
118
|
+
///
|
|
119
|
+
/// if length > self.max {
|
|
120
|
+
/// return Err(KreuzbergError::validation(format!(
|
|
121
|
+
/// "Content too long: {} > {} characters",
|
|
122
|
+
/// length, self.max
|
|
123
|
+
/// )));
|
|
124
|
+
/// }
|
|
125
|
+
///
|
|
126
|
+
/// Ok(())
|
|
127
|
+
/// }
|
|
128
|
+
/// # }
|
|
129
|
+
/// ```
|
|
130
|
+
///
|
|
131
|
+
/// # Example - Quality Score Validation
|
|
132
|
+
///
|
|
133
|
+
/// ```rust
|
|
134
|
+
/// # use kreuzberg::plugins::{Plugin, Validator};
|
|
135
|
+
/// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
|
136
|
+
/// # use async_trait::async_trait;
|
|
137
|
+
/// # struct QualityValidator { min_score: f64 }
|
|
138
|
+
/// # impl Plugin for QualityValidator {
|
|
139
|
+
/// # fn name(&self) -> &str { "quality-validator" }
|
|
140
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
141
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
142
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
143
|
+
/// # }
|
|
144
|
+
/// # #[async_trait]
|
|
145
|
+
/// # impl Validator for QualityValidator {
|
|
146
|
+
/// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
|
|
147
|
+
/// -> Result<()> {
|
|
148
|
+
/// // Check if quality_score exists in metadata
|
|
149
|
+
/// let score = result.metadata
|
|
150
|
+
/// .additional
|
|
151
|
+
/// .get("quality_score")
|
|
152
|
+
/// .and_then(|v| v.as_f64())
|
|
153
|
+
/// .unwrap_or(0.0);
|
|
154
|
+
///
|
|
155
|
+
/// if score < self.min_score {
|
|
156
|
+
/// return Err(KreuzbergError::validation(format!(
|
|
157
|
+
/// "Quality score too low: {} < {}",
|
|
158
|
+
/// score, self.min_score
|
|
159
|
+
/// )));
|
|
160
|
+
/// }
|
|
161
|
+
///
|
|
162
|
+
/// Ok(())
|
|
163
|
+
/// }
|
|
164
|
+
/// # }
|
|
165
|
+
/// ```
|
|
166
|
+
///
|
|
167
|
+
/// # Example - Security Validation
|
|
168
|
+
///
|
|
169
|
+
/// ```rust
|
|
170
|
+
/// # use kreuzberg::plugins::{Plugin, Validator};
|
|
171
|
+
/// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
|
172
|
+
/// # use async_trait::async_trait;
|
|
173
|
+
/// # struct SecurityValidator { blocked_patterns: Vec<String> }
|
|
174
|
+
/// # impl Plugin for SecurityValidator {
|
|
175
|
+
/// # fn name(&self) -> &str { "security-validator" }
|
|
176
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
177
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
178
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
179
|
+
/// # }
|
|
180
|
+
/// # #[async_trait]
|
|
181
|
+
/// # impl Validator for SecurityValidator {
|
|
182
|
+
/// async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig)
|
|
183
|
+
/// -> Result<()> {
|
|
184
|
+
/// // Check for blocked patterns
|
|
185
|
+
/// for pattern in &self.blocked_patterns {
|
|
186
|
+
/// if result.content.contains(pattern) {
|
|
187
|
+
/// return Err(KreuzbergError::validation(format!(
|
|
188
|
+
/// "Content contains blocked pattern: {}",
|
|
189
|
+
/// pattern
|
|
190
|
+
/// )));
|
|
191
|
+
/// }
|
|
192
|
+
/// }
|
|
193
|
+
///
|
|
194
|
+
/// Ok(())
|
|
195
|
+
/// }
|
|
196
|
+
/// # }
|
|
197
|
+
/// ```
|
|
198
|
+
async fn validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> Result<()>;
|
|
199
|
+
|
|
200
|
+
/// Optional: Check if this validator should run for a given result.
|
|
201
|
+
///
|
|
202
|
+
/// Allows conditional validation based on MIME type, metadata, or content.
|
|
203
|
+
/// Defaults to `true` (always run).
|
|
204
|
+
///
|
|
205
|
+
/// # Arguments
|
|
206
|
+
///
|
|
207
|
+
/// * `result` - The extraction result to check
|
|
208
|
+
/// * `config` - Extraction configuration
|
|
209
|
+
///
|
|
210
|
+
/// # Returns
|
|
211
|
+
///
|
|
212
|
+
/// `true` if the validator should run, `false` to skip.
|
|
213
|
+
///
|
|
214
|
+
/// # Example
|
|
215
|
+
///
|
|
216
|
+
/// ```rust
|
|
217
|
+
/// # use kreuzberg::plugins::{Plugin, Validator};
|
|
218
|
+
/// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
|
219
|
+
/// # use async_trait::async_trait;
|
|
220
|
+
/// # struct PdfValidator;
|
|
221
|
+
/// # impl Plugin for PdfValidator {
|
|
222
|
+
/// # fn name(&self) -> &str { "pdf-validator" }
|
|
223
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
224
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
225
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
226
|
+
/// # }
|
|
227
|
+
/// # #[async_trait]
|
|
228
|
+
/// # impl Validator for PdfValidator {
|
|
229
|
+
/// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
|
|
230
|
+
/// /// Only validate PDF documents
|
|
231
|
+
/// fn should_validate(&self, result: &ExtractionResult, config: &ExtractionConfig) -> bool {
|
|
232
|
+
/// result.mime_type == "application/pdf"
|
|
233
|
+
/// }
|
|
234
|
+
/// # }
|
|
235
|
+
/// ```
|
|
236
|
+
fn should_validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
|
|
237
|
+
true
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/// Optional: Get the validation priority.
|
|
241
|
+
///
|
|
242
|
+
/// Higher priority validators run first. Useful for ordering validation checks
|
|
243
|
+
/// (e.g., run cheap validations before expensive ones).
|
|
244
|
+
///
|
|
245
|
+
/// Default priority is 50.
|
|
246
|
+
///
|
|
247
|
+
/// # Returns
|
|
248
|
+
///
|
|
249
|
+
/// Priority value (higher = runs earlier).
|
|
250
|
+
///
|
|
251
|
+
/// # Example
|
|
252
|
+
///
|
|
253
|
+
/// ```rust
|
|
254
|
+
/// # use kreuzberg::plugins::{Plugin, Validator};
|
|
255
|
+
/// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
|
256
|
+
/// # use async_trait::async_trait;
|
|
257
|
+
/// # struct FastValidator;
|
|
258
|
+
/// # impl Plugin for FastValidator {
|
|
259
|
+
/// # fn name(&self) -> &str { "fast-validator" }
|
|
260
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
261
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
262
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
263
|
+
/// # }
|
|
264
|
+
/// # #[async_trait]
|
|
265
|
+
/// # impl Validator for FastValidator {
|
|
266
|
+
/// # async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> { Ok(()) }
|
|
267
|
+
/// /// Run this validator first (it's fast)
|
|
268
|
+
/// fn priority(&self) -> i32 {
|
|
269
|
+
/// 100
|
|
270
|
+
/// }
|
|
271
|
+
/// # }
|
|
272
|
+
/// ```
|
|
273
|
+
fn priority(&self) -> i32 {
|
|
274
|
+
50
|
|
275
|
+
}
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
// Public registration APIs
|
|
279
|
+
|
|
280
|
+
/// Register a validator with the global registry.
|
|
281
|
+
///
|
|
282
|
+
/// The validator will be registered with its default priority and will be called
|
|
283
|
+
/// during extraction validation. The validator's `name()` method is used as the
|
|
284
|
+
/// registration name.
|
|
285
|
+
///
|
|
286
|
+
/// # Arguments
|
|
287
|
+
///
|
|
288
|
+
/// * `validator` - The validator implementation wrapped in Arc
|
|
289
|
+
///
|
|
290
|
+
/// # Returns
|
|
291
|
+
///
|
|
292
|
+
/// - `Ok(())` if registration succeeded
|
|
293
|
+
/// - `Err(...)` if validation failed or initialization failed
|
|
294
|
+
///
|
|
295
|
+
/// # Errors
|
|
296
|
+
///
|
|
297
|
+
/// - `KreuzbergError::Validation` - Invalid validator name (empty or contains whitespace)
|
|
298
|
+
/// - Any error from the validator's `initialize()` method
|
|
299
|
+
///
|
|
300
|
+
/// # Example
|
|
301
|
+
///
|
|
302
|
+
/// ```rust
|
|
303
|
+
/// use kreuzberg::plugins::{Plugin, Validator, register_validator};
|
|
304
|
+
/// use kreuzberg::{Result, ExtractionResult, ExtractionConfig, KreuzbergError};
|
|
305
|
+
/// use async_trait::async_trait;
|
|
306
|
+
/// use std::sync::Arc;
|
|
307
|
+
///
|
|
308
|
+
/// struct MinLengthValidator { min_length: usize }
|
|
309
|
+
///
|
|
310
|
+
/// impl Plugin for MinLengthValidator {
|
|
311
|
+
/// fn name(&self) -> &str { "min-length" }
|
|
312
|
+
/// fn version(&self) -> String { "1.0.0".to_string() }
|
|
313
|
+
/// fn initialize(&self) -> Result<()> { Ok(()) }
|
|
314
|
+
/// fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
315
|
+
/// }
|
|
316
|
+
///
|
|
317
|
+
/// #[async_trait]
|
|
318
|
+
/// impl Validator for MinLengthValidator {
|
|
319
|
+
/// async fn validate(&self, result: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
|
|
320
|
+
/// if result.content.len() < self.min_length {
|
|
321
|
+
/// return Err(KreuzbergError::validation(
|
|
322
|
+
/// format!("Content too short: {} < {}", result.content.len(), self.min_length)
|
|
323
|
+
/// ));
|
|
324
|
+
/// }
|
|
325
|
+
/// Ok(())
|
|
326
|
+
/// }
|
|
327
|
+
/// }
|
|
328
|
+
///
|
|
329
|
+
/// # tokio_test::block_on(async {
|
|
330
|
+
/// let validator = Arc::new(MinLengthValidator { min_length: 10 });
|
|
331
|
+
/// register_validator(validator)?;
|
|
332
|
+
/// # Ok::<(), KreuzbergError>(())
|
|
333
|
+
/// # });
|
|
334
|
+
/// ```
|
|
335
|
+
pub fn register_validator(validator: Arc<dyn Validator>) -> crate::Result<()> {
|
|
336
|
+
use crate::plugins::registry::get_validator_registry;
|
|
337
|
+
|
|
338
|
+
let registry = get_validator_registry();
|
|
339
|
+
let mut registry = registry
|
|
340
|
+
.write()
|
|
341
|
+
.expect("~keep Failed to acquire write lock on validator registry"); // ~keep
|
|
342
|
+
|
|
343
|
+
registry.register(validator)
|
|
344
|
+
}
|
|
345
|
+
|
|
346
|
+
/// Unregister a validator by name.
|
|
347
|
+
///
|
|
348
|
+
/// Removes the validator from the global registry and calls its `shutdown()` method.
|
|
349
|
+
///
|
|
350
|
+
/// # Arguments
|
|
351
|
+
///
|
|
352
|
+
/// * `name` - Name of the validator to unregister
|
|
353
|
+
///
|
|
354
|
+
/// # Returns
|
|
355
|
+
///
|
|
356
|
+
/// - `Ok(())` if the validator was unregistered or didn't exist
|
|
357
|
+
/// - `Err(...)` if the shutdown method failed
|
|
358
|
+
///
|
|
359
|
+
/// # Example
|
|
360
|
+
///
|
|
361
|
+
/// ```rust
|
|
362
|
+
/// use kreuzberg::plugins::unregister_validator;
|
|
363
|
+
///
|
|
364
|
+
/// # tokio_test::block_on(async {
|
|
365
|
+
/// unregister_validator("min-length")?;
|
|
366
|
+
/// # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
367
|
+
/// # });
|
|
368
|
+
/// ```
|
|
369
|
+
pub fn unregister_validator(name: &str) -> crate::Result<()> {
|
|
370
|
+
use crate::plugins::registry::get_validator_registry;
|
|
371
|
+
|
|
372
|
+
let registry = get_validator_registry();
|
|
373
|
+
let mut registry = registry
|
|
374
|
+
.write()
|
|
375
|
+
.expect("~keep Failed to acquire write lock on validator registry"); // ~keep
|
|
376
|
+
|
|
377
|
+
registry.remove(name)
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
/// List all registered validators.
|
|
381
|
+
///
|
|
382
|
+
/// Returns the names of all validators currently registered in the global registry.
|
|
383
|
+
///
|
|
384
|
+
/// # Returns
|
|
385
|
+
///
|
|
386
|
+
/// A vector of validator names.
|
|
387
|
+
///
|
|
388
|
+
/// # Example
|
|
389
|
+
///
|
|
390
|
+
/// ```rust
|
|
391
|
+
/// use kreuzberg::plugins::list_validators;
|
|
392
|
+
///
|
|
393
|
+
/// # tokio_test::block_on(async {
|
|
394
|
+
/// let validators = list_validators()?;
|
|
395
|
+
/// for name in validators {
|
|
396
|
+
/// println!("Registered validator: {}", name);
|
|
397
|
+
/// }
|
|
398
|
+
/// # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
399
|
+
/// # });
|
|
400
|
+
/// ```
|
|
401
|
+
pub fn list_validators() -> crate::Result<Vec<String>> {
|
|
402
|
+
use crate::plugins::registry::get_validator_registry;
|
|
403
|
+
|
|
404
|
+
let registry = get_validator_registry();
|
|
405
|
+
let registry = registry
|
|
406
|
+
.read()
|
|
407
|
+
.expect("~keep Failed to acquire read lock on validator registry"); // ~keep
|
|
408
|
+
|
|
409
|
+
Ok(registry.list())
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
/// Clear all validators from the global registry.
|
|
413
|
+
///
|
|
414
|
+
/// Removes all validators and calls their `shutdown()` methods.
|
|
415
|
+
///
|
|
416
|
+
/// # Returns
|
|
417
|
+
///
|
|
418
|
+
/// - `Ok(())` if all validators were cleared successfully
|
|
419
|
+
/// - `Err(...)` if any shutdown method failed
|
|
420
|
+
///
|
|
421
|
+
/// # Example
|
|
422
|
+
///
|
|
423
|
+
/// ```rust
|
|
424
|
+
/// use kreuzberg::plugins::clear_validators;
|
|
425
|
+
///
|
|
426
|
+
/// # tokio_test::block_on(async {
|
|
427
|
+
/// clear_validators()?;
|
|
428
|
+
/// # Ok::<(), kreuzberg::KreuzbergError>(())
|
|
429
|
+
/// # });
|
|
430
|
+
/// ```
|
|
431
|
+
pub fn clear_validators() -> crate::Result<()> {
|
|
432
|
+
use crate::plugins::registry::get_validator_registry;
|
|
433
|
+
|
|
434
|
+
let registry = get_validator_registry();
|
|
435
|
+
let mut registry = registry
|
|
436
|
+
.write()
|
|
437
|
+
.expect("~keep Failed to acquire write lock on validator registry"); // ~keep
|
|
438
|
+
|
|
439
|
+
registry.shutdown_all()
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::KreuzbergError;
    use std::collections::HashMap;
    use std::sync::Arc;

    /// Serializes the tests in this module that touch the process-wide
    /// validator registry.
    ///
    /// The default test harness runs tests on several threads at once, and
    /// the registry is shared global state. Without this lock a test that
    /// asserts an exact registry size can observe a validator registered by
    /// a sibling test and fail intermittently.
    ///
    /// NOTE(review): this only coordinates tests inside this module. Tests in
    /// other modules that use the same global registry are not covered.
    static REGISTRY_TEST_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Acquire the registry test lock.
    ///
    /// A poisoned lock (an earlier registry test panicked while holding it)
    /// is recovered rather than propagated, so one failing test does not
    /// cascade into failures of every other registry test.
    fn lock_registry() -> std::sync::MutexGuard<'static, ()> {
        REGISTRY_TEST_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Build a minimal `ExtractionResult` with the given content and MIME
    /// type; every other field is empty / `None` / default.
    fn sample_result(content: &str, mime_type: &str) -> ExtractionResult {
        ExtractionResult {
            content: content.to_string(),
            mime_type: mime_type.to_string(),
            metadata: crate::types::Metadata::default(),
            tables: vec![],
            detected_languages: None,
            chunks: None,
            images: None,
        }
    }

    /// Validator whose outcome is controlled by `should_fail`.
    struct MockValidator {
        should_fail: bool,
    }

    impl Plugin for MockValidator {
        fn name(&self) -> &str {
            "mock-validator"
        }

        fn version(&self) -> String {
            "1.0.0".to_string()
        }

        fn initialize(&self) -> Result<()> {
            Ok(())
        }

        fn shutdown(&self) -> Result<()> {
            Ok(())
        }
    }

    #[async_trait]
    impl Validator for MockValidator {
        async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
            if self.should_fail {
                Err(KreuzbergError::validation("Validation failed".to_string()))
            } else {
                Ok(())
            }
        }
    }

    #[tokio::test]
    async fn test_validator_success() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result("test content", "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_failure() {
        let validator = MockValidator { should_fail: true };
        let result = sample_result("test content", "text/plain");
        let config = ExtractionConfig::default();

        let validation_result = validator.validate(&result, &config).await;

        assert!(matches!(validation_result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_validator_should_validate_default() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result("test", "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.should_validate(&result, &config));
    }

    #[test]
    fn test_validator_priority_default() {
        let validator = MockValidator { should_fail: false };
        assert_eq!(validator.priority(), 50);
    }

    #[tokio::test]
    async fn test_validator_plugin_interface() {
        let validator = MockValidator { should_fail: false };

        assert_eq!(validator.name(), "mock-validator");
        assert_eq!(validator.version(), "1.0.0");
        assert!(validator.initialize().is_ok());
        assert!(validator.shutdown().is_ok());
    }

    #[tokio::test]
    async fn test_validator_empty_content() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result("", "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[test]
    fn test_validator_should_validate_conditional() {
        struct PdfOnlyValidator;

        impl Plugin for PdfOnlyValidator {
            fn name(&self) -> &str {
                "pdf-only"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for PdfOnlyValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn should_validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> bool {
                result.mime_type == "application/pdf"
            }
        }

        let validator = PdfOnlyValidator;
        let config = ExtractionConfig::default();

        let pdf_result = sample_result("test", "application/pdf");
        let txt_result = sample_result("test", "text/plain");

        assert!(validator.should_validate(&pdf_result, &config));
        assert!(!validator.should_validate(&txt_result, &config));
    }

    #[test]
    fn test_validator_priority_ranges() {
        struct HighPriorityValidator;
        struct LowPriorityValidator;

        impl Plugin for HighPriorityValidator {
            fn name(&self) -> &str {
                "high-priority"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        impl Plugin for LowPriorityValidator {
            fn name(&self) -> &str {
                "low-priority"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for HighPriorityValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn priority(&self) -> i32 {
                100
            }
        }

        #[async_trait]
        impl Validator for LowPriorityValidator {
            async fn validate(&self, _result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
                Ok(())
            }

            fn priority(&self) -> i32 {
                10
            }
        }

        let high = HighPriorityValidator;
        let low = LowPriorityValidator;

        assert_eq!(high.priority(), 100);
        assert_eq!(low.priority(), 10);
        assert!(high.priority() > low.priority());
    }

    #[tokio::test]
    async fn test_validator_error_message() {
        let validator = MockValidator { should_fail: true };
        let result = sample_result("test", "text/plain");
        let config = ExtractionConfig::default();

        let err = validator.validate(&result, &config).await.unwrap_err();

        match err {
            KreuzbergError::Validation { message: msg, .. } => {
                assert_eq!(msg, "Validation failed");
            }
            _ => panic!("Expected Validation error"),
        }
    }

    #[tokio::test]
    async fn test_validator_with_metadata() {
        let validator = MockValidator { should_fail: false };

        let mut additional = HashMap::new();
        additional.insert("quality_score".to_string(), serde_json::json!(0.95));

        let result = ExtractionResult {
            metadata: crate::types::Metadata {
                additional,
                ..Default::default()
            },
            ..sample_result("test", "text/plain")
        };

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_with_tables() {
        use crate::types::Table;

        let validator = MockValidator { should_fail: false };

        let table = Table {
            cells: vec![vec!["A".to_string(), "B".to_string()]],
            markdown: "| A | B |".to_string(),
            page_number: 0,
        };

        let result = ExtractionResult {
            tables: vec![table],
            ..sample_result("test", "text/plain")
        };

        let config = ExtractionConfig::default();
        assert!(validator.validate(&result, &config).await.is_ok());
    }

    #[tokio::test]
    async fn test_validator_different_mime_types() {
        let validator = MockValidator { should_fail: false };
        let config = ExtractionConfig::default();

        // A fixed-size array is enough here; no heap allocation is needed
        // just to iterate over literals.
        let mime_types = [
            "text/plain",
            "application/pdf",
            "application/json",
            "text/html",
            "image/png",
        ];

        for mime_type in mime_types {
            let result = sample_result("test", mime_type);
            assert!(validator.validate(&result, &config).await.is_ok());
        }
    }

    #[tokio::test]
    async fn test_validator_long_content() {
        let validator = MockValidator { should_fail: false };
        let result = sample_result(&"test content ".repeat(10000), "text/plain");
        let config = ExtractionConfig::default();

        assert!(validator.validate(&result, &config).await.is_ok());
    }

    // Tests for public registration APIs.
    //
    // Each of these takes `lock_registry()` first because they all operate on
    // the same global registry.

    #[test]
    fn test_register_validator() {
        let _registry_guard = lock_registry();

        let validator = Arc::new(MockValidator { should_fail: false });
        let result = super::register_validator(validator);
        assert!(result.is_ok());

        // Best-effort cleanup so later tests start from a known state.
        let _ = super::unregister_validator("mock-validator");
    }

    #[test]
    fn test_unregister_validator() {
        let _registry_guard = lock_registry();

        let validator = Arc::new(MockValidator { should_fail: false });
        super::register_validator(validator).unwrap();

        let result = super::unregister_validator("mock-validator");
        assert!(result.is_ok());
    }

    #[test]
    fn test_unregister_nonexistent_validator() {
        let _registry_guard = lock_registry();

        let result = super::unregister_validator("nonexistent-validator-xyz");
        assert!(result.is_ok());
    }

    #[test]
    fn test_list_validators() {
        let _registry_guard = lock_registry();

        super::clear_validators().unwrap();

        let validator1 = Arc::new(MockValidator { should_fail: false });
        // Both validators have the same name, so only one will be registered
        let validator2 = Arc::new(MockValidator { should_fail: false });

        let list_before = super::list_validators().unwrap();
        assert_eq!(list_before.len(), 0);

        super::register_validator(validator1).unwrap();
        super::register_validator(validator2).unwrap();

        let list = super::list_validators().unwrap();
        // Only 1 validator registered since they have the same name
        assert_eq!(list.len(), 1);
        assert!(list.contains(&"mock-validator".to_string()));

        super::unregister_validator("mock-validator").unwrap();
    }

    #[test]
    fn test_clear_validators() {
        let _registry_guard = lock_registry();

        super::clear_validators().unwrap();

        let validator1 = Arc::new(MockValidator { should_fail: false });
        let validator2 = Arc::new(MockValidator { should_fail: false });

        super::register_validator(validator1).unwrap();
        super::register_validator(validator2).unwrap();

        // Verify at least one validator is registered
        let list_before = super::list_validators().unwrap();
        assert!(!list_before.is_empty());

        let result = super::clear_validators();
        assert!(result.is_ok());

        let list = super::list_validators().unwrap();
        assert_eq!(list.len(), 0);
    }

    #[test]
    fn test_register_validator_with_invalid_name() {
        struct InvalidNameValidator;

        impl Plugin for InvalidNameValidator {
            fn name(&self) -> &str {
                "invalid name with spaces"
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for InvalidNameValidator {
            async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
                Ok(())
            }
        }

        let _registry_guard = lock_registry();

        let validator = Arc::new(InvalidNameValidator);
        let result = super::register_validator(validator);
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }

    #[test]
    fn test_register_validator_with_empty_name() {
        struct EmptyNameValidator;

        impl Plugin for EmptyNameValidator {
            fn name(&self) -> &str {
                ""
            }
            fn version(&self) -> String {
                "1.0.0".to_string()
            }
            fn initialize(&self) -> Result<()> {
                Ok(())
            }
            fn shutdown(&self) -> Result<()> {
                Ok(())
            }
        }

        #[async_trait]
        impl Validator for EmptyNameValidator {
            async fn validate(&self, _: &ExtractionResult, _: &ExtractionConfig) -> Result<()> {
                Ok(())
            }
        }

        let _registry_guard = lock_registry();

        let validator = Arc::new(EmptyNameValidator);
        let result = super::register_validator(validator);
        assert!(matches!(result, Err(KreuzbergError::Validation { .. })));
    }
}
|