kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -4,62 +4,9 @@
|
|
|
4
4
|
//! quality processing, chunking, and custom hooks in the correct order.
|
|
5
5
|
|
|
6
6
|
use crate::core::config::ExtractionConfig;
|
|
7
|
-
use crate::plugins::{PostProcessor, ProcessingStage};
|
|
7
|
+
use crate::plugins::ProcessingStage;
|
|
8
8
|
use crate::types::ExtractionResult;
|
|
9
9
|
use crate::{KreuzbergError, Result};
|
|
10
|
-
use once_cell::sync::Lazy;
|
|
11
|
-
use std::sync::Arc;
|
|
12
|
-
use std::sync::RwLock as StdRwLock;
|
|
13
|
-
|
|
14
|
-
/// Cached post-processors for each stage to reduce lock contention.
|
|
15
|
-
///
|
|
16
|
-
/// This cache is populated once during the first pipeline run and reused
|
|
17
|
-
/// for all subsequent extractions, eliminating 3 of 4 registry lock acquisitions
|
|
18
|
-
/// per extraction.
|
|
19
|
-
struct ProcessorCache {
|
|
20
|
-
early: Arc<Vec<Arc<dyn PostProcessor>>>,
|
|
21
|
-
middle: Arc<Vec<Arc<dyn PostProcessor>>>,
|
|
22
|
-
late: Arc<Vec<Arc<dyn PostProcessor>>>,
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
impl ProcessorCache {
|
|
26
|
-
/// Create a new processor cache by fetching from the registry.
|
|
27
|
-
fn new() -> Result<Self> {
|
|
28
|
-
let processor_registry = crate::plugins::registry::get_post_processor_registry();
|
|
29
|
-
let registry = processor_registry
|
|
30
|
-
.read()
|
|
31
|
-
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
32
|
-
|
|
33
|
-
Ok(Self {
|
|
34
|
-
early: Arc::new(registry.get_for_stage(ProcessingStage::Early)),
|
|
35
|
-
middle: Arc::new(registry.get_for_stage(ProcessingStage::Middle)),
|
|
36
|
-
late: Arc::new(registry.get_for_stage(ProcessingStage::Late)),
|
|
37
|
-
})
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
/// Get processors for a specific stage from cache.
|
|
41
|
-
#[allow(dead_code)]
|
|
42
|
-
fn get_for_stage(&self, stage: ProcessingStage) -> Arc<Vec<Arc<dyn PostProcessor>>> {
|
|
43
|
-
match stage {
|
|
44
|
-
ProcessingStage::Early => Arc::clone(&self.early),
|
|
45
|
-
ProcessingStage::Middle => Arc::clone(&self.middle),
|
|
46
|
-
ProcessingStage::Late => Arc::clone(&self.late),
|
|
47
|
-
}
|
|
48
|
-
}
|
|
49
|
-
}
|
|
50
|
-
|
|
51
|
-
/// Lazy processor cache - initialized on first use, then cached.
|
|
52
|
-
static PROCESSOR_CACHE: Lazy<StdRwLock<Option<ProcessorCache>>> = Lazy::new(|| StdRwLock::new(None));
|
|
53
|
-
|
|
54
|
-
/// Clear the processor cache (primarily for testing when registry changes).
|
|
55
|
-
#[allow(dead_code)]
|
|
56
|
-
pub fn clear_processor_cache() -> Result<()> {
|
|
57
|
-
let mut cache = PROCESSOR_CACHE
|
|
58
|
-
.write()
|
|
59
|
-
.map_err(|e| crate::KreuzbergError::Other(format!("Processor cache lock poisoned: {}", e)))?;
|
|
60
|
-
*cache = None;
|
|
61
|
-
Ok(())
|
|
62
|
-
}
|
|
63
10
|
|
|
64
11
|
/// Run the post-processing pipeline on an extraction result.
|
|
65
12
|
///
|
|
@@ -83,13 +30,6 @@ pub fn clear_processor_cache() -> Result<()> {
|
|
|
83
30
|
/// - Validator errors bubble up immediately
|
|
84
31
|
/// - Post-processor errors are caught and recorded in metadata
|
|
85
32
|
/// - System errors (IO, RuntimeError equivalents) always bubble up
|
|
86
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
87
|
-
skip(result, config),
|
|
88
|
-
fields(
|
|
89
|
-
pipeline.stage = "post_processing",
|
|
90
|
-
content.length = result.content.len(),
|
|
91
|
-
)
|
|
92
|
-
))]
|
|
93
33
|
pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
94
34
|
let pp_config = config.postprocessor.as_ref();
|
|
95
35
|
let postprocessing_enabled = pp_config.is_none_or(|c| c.enabled);
|
|
@@ -100,61 +40,21 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
|
|
|
100
40
|
let _ = crate::keywords::ensure_initialized();
|
|
101
41
|
}
|
|
102
42
|
|
|
103
|
-
|
|
104
|
-
{
|
|
105
|
-
let _ = crate::language_detection::ensure_initialized();
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
#[cfg(feature = "chunking")]
|
|
109
|
-
{
|
|
110
|
-
let _ = crate::chunking::ensure_initialized();
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
#[cfg(feature = "quality")]
|
|
114
|
-
{
|
|
115
|
-
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
116
|
-
if let Ok(mut reg) = registry.write() {
|
|
117
|
-
let _ = reg.register(std::sync::Arc::new(crate::text::QualityProcessor), 30);
|
|
118
|
-
}
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
{
|
|
122
|
-
let mut cache_lock = PROCESSOR_CACHE
|
|
123
|
-
.write()
|
|
124
|
-
.map_err(|e| crate::KreuzbergError::Other(format!("Processor cache lock poisoned: {}", e)))?;
|
|
125
|
-
if cache_lock.is_none() {
|
|
126
|
-
*cache_lock = Some(ProcessorCache::new()?);
|
|
127
|
-
}
|
|
128
|
-
}
|
|
43
|
+
let processor_registry = crate::plugins::registry::get_post_processor_registry();
|
|
129
44
|
|
|
130
|
-
|
|
131
|
-
let
|
|
132
|
-
.read()
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
.
|
|
136
|
-
|
|
137
|
-
(
|
|
138
|
-
Arc::clone(&cache.early),
|
|
139
|
-
Arc::clone(&cache.middle),
|
|
140
|
-
Arc::clone(&cache.late),
|
|
141
|
-
)
|
|
142
|
-
};
|
|
45
|
+
for stage in [ProcessingStage::Early, ProcessingStage::Middle, ProcessingStage::Late] {
|
|
46
|
+
let processors = {
|
|
47
|
+
let registry = processor_registry.read().map_err(|e| {
|
|
48
|
+
crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e))
|
|
49
|
+
})?;
|
|
50
|
+
registry.get_for_stage(stage)
|
|
51
|
+
};
|
|
143
52
|
|
|
144
|
-
|
|
145
|
-
(ProcessingStage::Early, early_processors),
|
|
146
|
-
(ProcessingStage::Middle, middle_processors),
|
|
147
|
-
(ProcessingStage::Late, late_processors),
|
|
148
|
-
] {
|
|
149
|
-
for processor in processors_arc.iter() {
|
|
53
|
+
for processor in processors {
|
|
150
54
|
let processor_name = processor.name();
|
|
151
55
|
|
|
152
56
|
let should_run = if let Some(config) = pp_config {
|
|
153
|
-
if let Some(ref enabled_set) = config.enabled_set {
|
|
154
|
-
enabled_set.contains(processor_name)
|
|
155
|
-
} else if let Some(ref disabled_set) = config.disabled_set {
|
|
156
|
-
!disabled_set.contains(processor_name)
|
|
157
|
-
} else if let Some(ref enabled) = config.enabled_processors {
|
|
57
|
+
if let Some(ref enabled) = config.enabled_processors {
|
|
158
58
|
enabled.iter().any(|name| name == processor_name)
|
|
159
59
|
} else if let Some(ref disabled) = config.disabled_processors {
|
|
160
60
|
!disabled.iter().any(|name| name == processor_name)
|
|
@@ -185,6 +85,35 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
|
|
|
185
85
|
}
|
|
186
86
|
}
|
|
187
87
|
|
|
88
|
+
#[cfg(feature = "quality")]
|
|
89
|
+
if config.enable_quality_processing {
|
|
90
|
+
let quality_score = crate::text::quality::calculate_quality_score(
|
|
91
|
+
&result.content,
|
|
92
|
+
Some(
|
|
93
|
+
&result
|
|
94
|
+
.metadata
|
|
95
|
+
.additional
|
|
96
|
+
.iter()
|
|
97
|
+
.map(|(k, v)| (k.clone(), v.to_string()))
|
|
98
|
+
.collect(),
|
|
99
|
+
),
|
|
100
|
+
);
|
|
101
|
+
result.metadata.additional.insert(
|
|
102
|
+
"quality_score".to_string(),
|
|
103
|
+
serde_json::Value::Number(
|
|
104
|
+
serde_json::Number::from_f64(quality_score).unwrap_or(serde_json::Number::from(0)),
|
|
105
|
+
),
|
|
106
|
+
);
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
#[cfg(not(feature = "quality"))]
|
|
110
|
+
if config.enable_quality_processing {
|
|
111
|
+
result.metadata.additional.insert(
|
|
112
|
+
"quality_processing_error".to_string(),
|
|
113
|
+
serde_json::Value::String("Quality processing feature not enabled".to_string()),
|
|
114
|
+
);
|
|
115
|
+
}
|
|
116
|
+
|
|
188
117
|
#[cfg(feature = "chunking")]
|
|
189
118
|
if let Some(ref chunking_config) = config.chunking {
|
|
190
119
|
let chunk_config = crate::chunking::ChunkingConfig {
|
|
@@ -194,9 +123,7 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
|
|
|
194
123
|
chunker_type: crate::chunking::ChunkerType::Text,
|
|
195
124
|
};
|
|
196
125
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
match crate::chunking::chunk_text(&result.content, &chunk_config, page_boundaries) {
|
|
126
|
+
match crate::chunking::chunk_text(&result.content, &chunk_config) {
|
|
200
127
|
Ok(chunking_result) => {
|
|
201
128
|
result.chunks = Some(chunking_result.chunks);
|
|
202
129
|
|
|
@@ -284,11 +211,9 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
|
|
|
284
211
|
registry.get_all()
|
|
285
212
|
};
|
|
286
213
|
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
validator.validate(&result, config).await?;
|
|
291
|
-
}
|
|
214
|
+
for validator in validators {
|
|
215
|
+
if validator.should_validate(&result, config) {
|
|
216
|
+
validator.validate(&result, config).await?;
|
|
292
217
|
}
|
|
293
218
|
}
|
|
294
219
|
}
|
|
@@ -296,144 +221,19 @@ pub async fn run_pipeline(mut result: ExtractionResult, config: &ExtractionConfi
|
|
|
296
221
|
Ok(result)
|
|
297
222
|
}
|
|
298
223
|
|
|
299
|
-
/// Run the post-processing pipeline synchronously (WASM-compatible version).
|
|
300
|
-
///
|
|
301
|
-
/// This is a synchronous implementation for WASM and non-async contexts.
|
|
302
|
-
/// It performs a subset of the full async pipeline, excluding async post-processors
|
|
303
|
-
/// and validators.
|
|
304
|
-
///
|
|
305
|
-
/// # Arguments
|
|
306
|
-
///
|
|
307
|
-
/// * `result` - The extraction result to process
|
|
308
|
-
/// * `config` - Extraction configuration
|
|
309
|
-
///
|
|
310
|
-
/// # Returns
|
|
311
|
-
///
|
|
312
|
-
/// The processed extraction result.
|
|
313
|
-
///
|
|
314
|
-
/// # Notes
|
|
315
|
-
///
|
|
316
|
-
/// This function is only available when the `tokio-runtime` feature is disabled.
|
|
317
|
-
/// It handles:
|
|
318
|
-
/// - Quality processing (if enabled)
|
|
319
|
-
/// - Chunking (if enabled)
|
|
320
|
-
/// - Language detection (if enabled)
|
|
321
|
-
///
|
|
322
|
-
/// It does NOT handle:
|
|
323
|
-
/// - Async post-processors
|
|
324
|
-
/// - Async validators
|
|
325
|
-
#[cfg(not(feature = "tokio-runtime"))]
|
|
326
|
-
pub fn run_pipeline_sync(mut result: ExtractionResult, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
327
|
-
#[cfg(feature = "chunking")]
|
|
328
|
-
if let Some(ref chunking_config) = config.chunking {
|
|
329
|
-
let chunk_config = crate::chunking::ChunkingConfig {
|
|
330
|
-
max_characters: chunking_config.max_chars,
|
|
331
|
-
overlap: chunking_config.max_overlap,
|
|
332
|
-
trim: true,
|
|
333
|
-
chunker_type: crate::chunking::ChunkerType::Text,
|
|
334
|
-
};
|
|
335
|
-
|
|
336
|
-
match crate::chunking::chunk_text(&result.content, &chunk_config, None) {
|
|
337
|
-
Ok(chunking_result) => {
|
|
338
|
-
result.chunks = Some(chunking_result.chunks);
|
|
339
|
-
|
|
340
|
-
if let Some(ref chunks) = result.chunks {
|
|
341
|
-
result.metadata.additional.insert(
|
|
342
|
-
"chunk_count".to_string(),
|
|
343
|
-
serde_json::Value::Number(serde_json::Number::from(chunks.len())),
|
|
344
|
-
);
|
|
345
|
-
}
|
|
346
|
-
|
|
347
|
-
#[cfg(feature = "embeddings")]
|
|
348
|
-
if let Some(ref embedding_config) = chunking_config.embedding
|
|
349
|
-
&& let Some(ref mut chunks) = result.chunks
|
|
350
|
-
{
|
|
351
|
-
match crate::embeddings::generate_embeddings_for_chunks(chunks, embedding_config) {
|
|
352
|
-
Ok(()) => {
|
|
353
|
-
result
|
|
354
|
-
.metadata
|
|
355
|
-
.additional
|
|
356
|
-
.insert("embeddings_generated".to_string(), serde_json::Value::Bool(true));
|
|
357
|
-
}
|
|
358
|
-
Err(e) => {
|
|
359
|
-
result
|
|
360
|
-
.metadata
|
|
361
|
-
.additional
|
|
362
|
-
.insert("embedding_error".to_string(), serde_json::Value::String(e.to_string()));
|
|
363
|
-
}
|
|
364
|
-
}
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
#[cfg(not(feature = "embeddings"))]
|
|
368
|
-
if chunking_config.embedding.is_some() {
|
|
369
|
-
result.metadata.additional.insert(
|
|
370
|
-
"embedding_error".to_string(),
|
|
371
|
-
serde_json::Value::String("Embeddings feature not enabled".to_string()),
|
|
372
|
-
);
|
|
373
|
-
}
|
|
374
|
-
}
|
|
375
|
-
Err(e) => {
|
|
376
|
-
result
|
|
377
|
-
.metadata
|
|
378
|
-
.additional
|
|
379
|
-
.insert("chunking_error".to_string(), serde_json::Value::String(e.to_string()));
|
|
380
|
-
}
|
|
381
|
-
}
|
|
382
|
-
}
|
|
383
|
-
|
|
384
|
-
#[cfg(not(feature = "chunking"))]
|
|
385
|
-
if config.chunking.is_some() {
|
|
386
|
-
result.metadata.additional.insert(
|
|
387
|
-
"chunking_error".to_string(),
|
|
388
|
-
serde_json::Value::String("Chunking feature not enabled".to_string()),
|
|
389
|
-
);
|
|
390
|
-
}
|
|
391
|
-
|
|
392
|
-
#[cfg(feature = "language-detection")]
|
|
393
|
-
if let Some(ref lang_config) = config.language_detection {
|
|
394
|
-
match crate::language_detection::detect_languages(&result.content, lang_config) {
|
|
395
|
-
Ok(detected) => {
|
|
396
|
-
result.detected_languages = detected;
|
|
397
|
-
}
|
|
398
|
-
Err(e) => {
|
|
399
|
-
result.metadata.additional.insert(
|
|
400
|
-
"language_detection_error".to_string(),
|
|
401
|
-
serde_json::Value::String(e.to_string()),
|
|
402
|
-
);
|
|
403
|
-
}
|
|
404
|
-
}
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
#[cfg(not(feature = "language-detection"))]
|
|
408
|
-
if config.language_detection.is_some() {
|
|
409
|
-
result.metadata.additional.insert(
|
|
410
|
-
"language_detection_error".to_string(),
|
|
411
|
-
serde_json::Value::String("Language detection feature not enabled".to_string()),
|
|
412
|
-
);
|
|
413
|
-
}
|
|
414
|
-
|
|
415
|
-
Ok(result)
|
|
416
|
-
}
|
|
417
|
-
|
|
418
224
|
#[cfg(test)]
|
|
419
225
|
mod tests {
|
|
420
226
|
use super::*;
|
|
421
227
|
use crate::types::Metadata;
|
|
422
228
|
use lazy_static::lazy_static;
|
|
423
229
|
|
|
424
|
-
const VALIDATION_MARKER_KEY: &str = "registry_validation_marker";
|
|
425
|
-
#[cfg(feature = "quality")]
|
|
426
|
-
const QUALITY_VALIDATION_MARKER: &str = "quality_validation_test";
|
|
427
|
-
const POSTPROCESSOR_VALIDATION_MARKER: &str = "postprocessor_validation_test";
|
|
428
|
-
const ORDER_VALIDATION_MARKER: &str = "order_validation_test";
|
|
429
|
-
|
|
430
230
|
lazy_static! {
|
|
431
231
|
static ref REGISTRY_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
|
|
432
232
|
}
|
|
433
233
|
|
|
434
234
|
#[tokio::test]
|
|
435
235
|
async fn test_run_pipeline_basic() {
|
|
436
|
-
let mut result = ExtractionResult {
|
|
236
|
+
let result = ExtractionResult {
|
|
437
237
|
content: "test".to_string(),
|
|
438
238
|
mime_type: "text/plain".to_string(),
|
|
439
239
|
metadata: Metadata::default(),
|
|
@@ -441,12 +241,7 @@ mod tests {
|
|
|
441
241
|
detected_languages: None,
|
|
442
242
|
chunks: None,
|
|
443
243
|
images: None,
|
|
444
|
-
pages: None,
|
|
445
244
|
};
|
|
446
|
-
result.metadata.additional.insert(
|
|
447
|
-
VALIDATION_MARKER_KEY.to_string(),
|
|
448
|
-
serde_json::json!(ORDER_VALIDATION_MARKER),
|
|
449
|
-
);
|
|
450
245
|
let config = ExtractionConfig::default();
|
|
451
246
|
|
|
452
247
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
@@ -464,7 +259,6 @@ mod tests {
|
|
|
464
259
|
detected_languages: None,
|
|
465
260
|
chunks: None,
|
|
466
261
|
images: None,
|
|
467
|
-
pages: None,
|
|
468
262
|
};
|
|
469
263
|
let config = ExtractionConfig {
|
|
470
264
|
enable_quality_processing: true,
|
|
@@ -485,7 +279,6 @@ mod tests {
|
|
|
485
279
|
detected_languages: None,
|
|
486
280
|
chunks: None,
|
|
487
281
|
images: None,
|
|
488
|
-
pages: None,
|
|
489
282
|
};
|
|
490
283
|
let config = ExtractionConfig {
|
|
491
284
|
enable_quality_processing: false,
|
|
@@ -507,7 +300,6 @@ mod tests {
|
|
|
507
300
|
detected_languages: None,
|
|
508
301
|
chunks: None,
|
|
509
302
|
images: None,
|
|
510
|
-
pages: None,
|
|
511
303
|
};
|
|
512
304
|
let config = ExtractionConfig {
|
|
513
305
|
chunking: Some(crate::ChunkingConfig {
|
|
@@ -535,7 +327,6 @@ mod tests {
|
|
|
535
327
|
detected_languages: None,
|
|
536
328
|
chunks: None,
|
|
537
329
|
images: None,
|
|
538
|
-
pages: None,
|
|
539
330
|
};
|
|
540
331
|
let config = ExtractionConfig {
|
|
541
332
|
chunking: None,
|
|
@@ -560,7 +351,6 @@ mod tests {
|
|
|
560
351
|
additional,
|
|
561
352
|
..Default::default()
|
|
562
353
|
},
|
|
563
|
-
pages: None,
|
|
564
354
|
tables: vec![],
|
|
565
355
|
detected_languages: None,
|
|
566
356
|
chunks: None,
|
|
@@ -597,7 +387,6 @@ mod tests {
|
|
|
597
387
|
detected_languages: None,
|
|
598
388
|
chunks: None,
|
|
599
389
|
images: None,
|
|
600
|
-
pages: None,
|
|
601
390
|
};
|
|
602
391
|
let config = ExtractionConfig::default();
|
|
603
392
|
|
|
@@ -608,17 +397,9 @@ mod tests {
|
|
|
608
397
|
|
|
609
398
|
#[tokio::test]
|
|
610
399
|
async fn test_pipeline_empty_content() {
|
|
611
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
612
|
-
|
|
613
400
|
{
|
|
614
|
-
let
|
|
615
|
-
|
|
616
|
-
}
|
|
617
|
-
{
|
|
618
|
-
let registry = crate::plugins::registry::get_validator_registry();
|
|
619
|
-
registry.write().unwrap().shutdown_all().unwrap();
|
|
620
|
-
}
|
|
621
|
-
|
|
401
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
402
|
+
} // Drop guard before async operations
|
|
622
403
|
let result = ExtractionResult {
|
|
623
404
|
content: String::new(),
|
|
624
405
|
mime_type: "text/plain".to_string(),
|
|
@@ -627,12 +408,9 @@ mod tests {
|
|
|
627
408
|
detected_languages: None,
|
|
628
409
|
chunks: None,
|
|
629
410
|
images: None,
|
|
630
|
-
pages: None,
|
|
631
411
|
};
|
|
632
412
|
let config = ExtractionConfig::default();
|
|
633
413
|
|
|
634
|
-
drop(_guard);
|
|
635
|
-
|
|
636
414
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
637
415
|
assert_eq!(processed.content, "");
|
|
638
416
|
}
|
|
@@ -648,7 +426,6 @@ mod tests {
|
|
|
648
426
|
detected_languages: None,
|
|
649
427
|
chunks: None,
|
|
650
428
|
images: None,
|
|
651
|
-
pages: None,
|
|
652
429
|
};
|
|
653
430
|
let config = ExtractionConfig {
|
|
654
431
|
enable_quality_processing: true,
|
|
@@ -669,22 +446,6 @@ mod tests {
|
|
|
669
446
|
#[tokio::test]
|
|
670
447
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
671
448
|
async fn test_pipeline_with_keyword_extraction() {
|
|
672
|
-
{
|
|
673
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
674
|
-
crate::plugins::registry::get_validator_registry()
|
|
675
|
-
.write()
|
|
676
|
-
.unwrap()
|
|
677
|
-
.shutdown_all()
|
|
678
|
-
.unwrap();
|
|
679
|
-
crate::plugins::registry::get_post_processor_registry()
|
|
680
|
-
.write()
|
|
681
|
-
.unwrap()
|
|
682
|
-
.shutdown_all()
|
|
683
|
-
.unwrap();
|
|
684
|
-
|
|
685
|
-
let _ = crate::keywords::register_keyword_processor();
|
|
686
|
-
}
|
|
687
|
-
|
|
688
449
|
let result = ExtractionResult {
|
|
689
450
|
content: r#"
|
|
690
451
|
Machine learning is a branch of artificial intelligence that focuses on
|
|
@@ -699,7 +460,6 @@ Natural language processing enables computers to understand human language.
|
|
|
699
460
|
detected_languages: None,
|
|
700
461
|
chunks: None,
|
|
701
462
|
images: None,
|
|
702
|
-
pages: None,
|
|
703
463
|
};
|
|
704
464
|
|
|
705
465
|
#[cfg(feature = "keywords-yake")]
|
|
@@ -733,9 +493,6 @@ Natural language processing enables computers to understand human language.
|
|
|
733
493
|
#[tokio::test]
|
|
734
494
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
735
495
|
async fn test_pipeline_without_keyword_config() {
|
|
736
|
-
{
|
|
737
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
738
|
-
}
|
|
739
496
|
let result = ExtractionResult {
|
|
740
497
|
content: "Machine learning and artificial intelligence.".to_string(),
|
|
741
498
|
mime_type: "text/plain".to_string(),
|
|
@@ -744,7 +501,6 @@ Natural language processing enables computers to understand human language.
|
|
|
744
501
|
detected_languages: None,
|
|
745
502
|
chunks: None,
|
|
746
503
|
images: None,
|
|
747
|
-
pages: None,
|
|
748
504
|
};
|
|
749
505
|
|
|
750
506
|
let config = ExtractionConfig {
|
|
@@ -760,18 +516,6 @@ Natural language processing enables computers to understand human language.
|
|
|
760
516
|
#[tokio::test]
|
|
761
517
|
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
762
518
|
async fn test_pipeline_keyword_extraction_short_content() {
|
|
763
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
764
|
-
crate::plugins::registry::get_validator_registry()
|
|
765
|
-
.write()
|
|
766
|
-
.unwrap()
|
|
767
|
-
.shutdown_all()
|
|
768
|
-
.unwrap();
|
|
769
|
-
crate::plugins::registry::get_post_processor_registry()
|
|
770
|
-
.write()
|
|
771
|
-
.unwrap()
|
|
772
|
-
.shutdown_all()
|
|
773
|
-
.unwrap();
|
|
774
|
-
|
|
775
519
|
let result = ExtractionResult {
|
|
776
520
|
content: "Short text".to_string(),
|
|
777
521
|
mime_type: "text/plain".to_string(),
|
|
@@ -780,7 +524,6 @@ Natural language processing enables computers to understand human language.
|
|
|
780
524
|
detected_languages: None,
|
|
781
525
|
chunks: None,
|
|
782
526
|
images: None,
|
|
783
|
-
pages: None,
|
|
784
527
|
};
|
|
785
528
|
|
|
786
529
|
#[cfg(feature = "keywords-yake")]
|
|
@@ -794,8 +537,6 @@ Natural language processing enables computers to understand human language.
|
|
|
794
537
|
..Default::default()
|
|
795
538
|
};
|
|
796
539
|
|
|
797
|
-
drop(_guard);
|
|
798
|
-
|
|
799
540
|
let processed = run_pipeline(result, &config).await.unwrap();
|
|
800
541
|
|
|
801
542
|
assert!(!processed.metadata.additional.contains_key("keywords"));
|
|
@@ -803,6 +544,9 @@ Natural language processing enables computers to understand human language.
|
|
|
803
544
|
|
|
804
545
|
#[tokio::test]
|
|
805
546
|
async fn test_postprocessor_runs_before_validator() {
|
|
547
|
+
{
|
|
548
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
549
|
+
} // Drop guard before async operations
|
|
806
550
|
use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
|
|
807
551
|
use async_trait::async_trait;
|
|
808
552
|
use std::sync::Arc;
|
|
@@ -857,17 +601,6 @@ Natural language processing enables computers to understand human language.
|
|
|
857
601
|
#[async_trait]
|
|
858
602
|
impl Validator for TestValidator {
|
|
859
603
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
860
|
-
let should_validate = result
|
|
861
|
-
.metadata
|
|
862
|
-
.additional
|
|
863
|
-
.get(VALIDATION_MARKER_KEY)
|
|
864
|
-
.and_then(|v| v.as_str())
|
|
865
|
-
== Some(POSTPROCESSOR_VALIDATION_MARKER);
|
|
866
|
-
|
|
867
|
-
if !should_validate {
|
|
868
|
-
return Ok(());
|
|
869
|
-
}
|
|
870
|
-
|
|
871
604
|
let processed = result
|
|
872
605
|
.metadata
|
|
873
606
|
.additional
|
|
@@ -886,28 +619,18 @@ Natural language processing enables computers to understand human language.
|
|
|
886
619
|
}
|
|
887
620
|
|
|
888
621
|
let pp_registry = crate::plugins::registry::get_post_processor_registry();
|
|
889
|
-
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
890
|
-
|
|
891
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
892
|
-
clear_processor_cache().unwrap();
|
|
893
|
-
pp_registry.write().unwrap().shutdown_all().unwrap();
|
|
894
|
-
val_registry.write().unwrap().shutdown_all().unwrap();
|
|
895
|
-
clear_processor_cache().unwrap();
|
|
896
|
-
|
|
897
622
|
{
|
|
898
623
|
let mut registry = pp_registry.write().unwrap();
|
|
899
624
|
registry.register(Arc::new(TestPostProcessor), 0).unwrap();
|
|
900
625
|
}
|
|
901
626
|
|
|
627
|
+
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
902
628
|
{
|
|
903
629
|
let mut registry = val_registry.write().unwrap();
|
|
904
630
|
registry.register(Arc::new(TestValidator)).unwrap();
|
|
905
631
|
}
|
|
906
632
|
|
|
907
|
-
|
|
908
|
-
clear_processor_cache().unwrap();
|
|
909
|
-
|
|
910
|
-
let mut result = ExtractionResult {
|
|
633
|
+
let result = ExtractionResult {
|
|
911
634
|
content: "test".to_string(),
|
|
912
635
|
mime_type: "text/plain".to_string(),
|
|
913
636
|
metadata: Metadata::default(),
|
|
@@ -915,29 +638,19 @@ Natural language processing enables computers to understand human language.
|
|
|
915
638
|
detected_languages: None,
|
|
916
639
|
chunks: None,
|
|
917
640
|
images: None,
|
|
918
|
-
pages: None,
|
|
919
|
-
};
|
|
920
|
-
result.metadata.additional.insert(
|
|
921
|
-
VALIDATION_MARKER_KEY.to_string(),
|
|
922
|
-
serde_json::json!(POSTPROCESSOR_VALIDATION_MARKER),
|
|
923
|
-
);
|
|
924
|
-
|
|
925
|
-
let config = ExtractionConfig {
|
|
926
|
-
postprocessor: Some(crate::core::config::PostProcessorConfig {
|
|
927
|
-
enabled: true,
|
|
928
|
-
enabled_set: None,
|
|
929
|
-
disabled_set: None,
|
|
930
|
-
enabled_processors: None,
|
|
931
|
-
disabled_processors: None,
|
|
932
|
-
}),
|
|
933
|
-
..Default::default()
|
|
934
641
|
};
|
|
935
|
-
drop(_guard);
|
|
936
642
|
|
|
643
|
+
let config = ExtractionConfig::default();
|
|
937
644
|
let processed = run_pipeline(result, &config).await;
|
|
938
645
|
|
|
939
|
-
|
|
940
|
-
|
|
646
|
+
{
|
|
647
|
+
let mut registry = pp_registry.write().unwrap();
|
|
648
|
+
registry.remove("test-processor").unwrap();
|
|
649
|
+
}
|
|
650
|
+
{
|
|
651
|
+
let mut registry = val_registry.write().unwrap();
|
|
652
|
+
registry.remove("test-validator").unwrap();
|
|
653
|
+
}
|
|
941
654
|
|
|
942
655
|
assert!(processed.is_ok(), "Validator should have seen post-processor metadata");
|
|
943
656
|
let processed = processed.unwrap();
|
|
@@ -951,7 +664,9 @@ Natural language processing enables computers to understand human language.
|
|
|
951
664
|
#[tokio::test]
|
|
952
665
|
#[cfg(feature = "quality")]
|
|
953
666
|
async fn test_quality_processing_runs_before_validator() {
|
|
954
|
-
|
|
667
|
+
{
|
|
668
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
669
|
+
} // Drop guard before async operations
|
|
955
670
|
use crate::plugins::{Plugin, Validator};
|
|
956
671
|
use async_trait::async_trait;
|
|
957
672
|
use std::sync::Arc;
|
|
@@ -975,17 +690,6 @@ Natural language processing enables computers to understand human language.
|
|
|
975
690
|
#[async_trait]
|
|
976
691
|
impl Validator for QualityValidator {
|
|
977
692
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
978
|
-
let should_validate = result
|
|
979
|
-
.metadata
|
|
980
|
-
.additional
|
|
981
|
-
.get(VALIDATION_MARKER_KEY)
|
|
982
|
-
.and_then(|v| v.as_str())
|
|
983
|
-
== Some(QUALITY_VALIDATION_MARKER);
|
|
984
|
-
|
|
985
|
-
if !should_validate {
|
|
986
|
-
return Ok(());
|
|
987
|
-
}
|
|
988
|
-
|
|
989
693
|
if !result.metadata.additional.contains_key("quality_score") {
|
|
990
694
|
return Err(crate::KreuzbergError::Validation {
|
|
991
695
|
message: "Quality processing did not run before validator".to_string(),
|
|
@@ -1002,7 +706,7 @@ Natural language processing enables computers to understand human language.
|
|
|
1002
706
|
registry.register(Arc::new(QualityValidator)).unwrap();
|
|
1003
707
|
}
|
|
1004
708
|
|
|
1005
|
-
let
|
|
709
|
+
let result = ExtractionResult {
|
|
1006
710
|
content: "This is meaningful test content for quality scoring.".to_string(),
|
|
1007
711
|
mime_type: "text/plain".to_string(),
|
|
1008
712
|
metadata: Metadata::default(),
|
|
@@ -1010,20 +714,13 @@ Natural language processing enables computers to understand human language.
|
|
|
1010
714
|
detected_languages: None,
|
|
1011
715
|
chunks: None,
|
|
1012
716
|
images: None,
|
|
1013
|
-
pages: None,
|
|
1014
717
|
};
|
|
1015
|
-
result.metadata.additional.insert(
|
|
1016
|
-
VALIDATION_MARKER_KEY.to_string(),
|
|
1017
|
-
serde_json::json!(QUALITY_VALIDATION_MARKER),
|
|
1018
|
-
);
|
|
1019
718
|
|
|
1020
719
|
let config = ExtractionConfig {
|
|
1021
720
|
enable_quality_processing: true,
|
|
1022
721
|
..Default::default()
|
|
1023
722
|
};
|
|
1024
723
|
|
|
1025
|
-
drop(_guard);
|
|
1026
|
-
|
|
1027
724
|
let processed = run_pipeline(result, &config).await;
|
|
1028
725
|
|
|
1029
726
|
{
|
|
@@ -1036,6 +733,9 @@ Natural language processing enables computers to understand human language.
|
|
|
1036
733
|
|
|
1037
734
|
#[tokio::test]
|
|
1038
735
|
async fn test_multiple_postprocessors_run_before_validator() {
|
|
736
|
+
{
|
|
737
|
+
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
738
|
+
} // Drop guard before async operations
|
|
1039
739
|
use crate::plugins::{Plugin, PostProcessor, ProcessingStage, Validator};
|
|
1040
740
|
use async_trait::async_trait;
|
|
1041
741
|
use std::sync::Arc;
|
|
@@ -1137,17 +837,6 @@ Natural language processing enables computers to understand human language.
|
|
|
1137
837
|
#[async_trait]
|
|
1138
838
|
impl Validator for OrderValidator {
|
|
1139
839
|
async fn validate(&self, result: &ExtractionResult, _config: &ExtractionConfig) -> Result<()> {
|
|
1140
|
-
let should_validate = result
|
|
1141
|
-
.metadata
|
|
1142
|
-
.additional
|
|
1143
|
-
.get(VALIDATION_MARKER_KEY)
|
|
1144
|
-
.and_then(|v| v.as_str())
|
|
1145
|
-
== Some(ORDER_VALIDATION_MARKER);
|
|
1146
|
-
|
|
1147
|
-
if !should_validate {
|
|
1148
|
-
return Ok(());
|
|
1149
|
-
}
|
|
1150
|
-
|
|
1151
840
|
let order = result
|
|
1152
841
|
.metadata
|
|
1153
842
|
.additional
|
|
@@ -1177,27 +866,18 @@ Natural language processing enables computers to understand human language.
|
|
|
1177
866
|
}
|
|
1178
867
|
|
|
1179
868
|
let pp_registry = crate::plugins::registry::get_post_processor_registry();
|
|
1180
|
-
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
1181
|
-
let _guard = REGISTRY_TEST_GUARD.lock().unwrap();
|
|
1182
|
-
|
|
1183
|
-
pp_registry.write().unwrap().shutdown_all().unwrap();
|
|
1184
|
-
val_registry.write().unwrap().shutdown_all().unwrap();
|
|
1185
|
-
clear_processor_cache().unwrap();
|
|
1186
|
-
|
|
1187
869
|
{
|
|
1188
870
|
let mut registry = pp_registry.write().unwrap();
|
|
1189
871
|
registry.register(Arc::new(EarlyProcessor), 0).unwrap();
|
|
1190
872
|
registry.register(Arc::new(LateProcessor), 0).unwrap();
|
|
1191
873
|
}
|
|
1192
874
|
|
|
875
|
+
let val_registry = crate::plugins::registry::get_validator_registry();
|
|
1193
876
|
{
|
|
1194
877
|
let mut registry = val_registry.write().unwrap();
|
|
1195
878
|
registry.register(Arc::new(OrderValidator)).unwrap();
|
|
1196
879
|
}
|
|
1197
880
|
|
|
1198
|
-
// Clear the cache after registering new processors so it rebuilds with the test processors
|
|
1199
|
-
clear_processor_cache().unwrap();
|
|
1200
|
-
|
|
1201
881
|
let result = ExtractionResult {
|
|
1202
882
|
content: "test".to_string(),
|
|
1203
883
|
mime_type: "text/plain".to_string(),
|
|
@@ -1206,17 +886,20 @@ Natural language processing enables computers to understand human language.
|
|
|
1206
886
|
detected_languages: None,
|
|
1207
887
|
chunks: None,
|
|
1208
888
|
images: None,
|
|
1209
|
-
pages: None,
|
|
1210
889
|
};
|
|
1211
890
|
|
|
1212
891
|
let config = ExtractionConfig::default();
|
|
1213
|
-
drop(_guard);
|
|
1214
|
-
|
|
1215
892
|
let processed = run_pipeline(result, &config).await;
|
|
1216
893
|
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
894
|
+
{
|
|
895
|
+
let mut registry = pp_registry.write().unwrap();
|
|
896
|
+
registry.remove("early-proc").unwrap();
|
|
897
|
+
registry.remove("late-proc").unwrap();
|
|
898
|
+
}
|
|
899
|
+
{
|
|
900
|
+
let mut registry = val_registry.write().unwrap();
|
|
901
|
+
registry.remove("order-validator").unwrap();
|
|
902
|
+
}
|
|
1220
903
|
|
|
1221
904
|
assert!(processed.is_ok(), "All processors should run before validator");
|
|
1222
905
|
}
|