kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
use async_trait::async_trait;
|
|
9
9
|
use kreuzberg::core::config::{ExtractionConfig, PostProcessorConfig};
|
|
10
|
-
use kreuzberg::core::pipeline::
|
|
10
|
+
use kreuzberg::core::pipeline::run_pipeline;
|
|
11
11
|
use kreuzberg::plugins::registry::get_post_processor_registry;
|
|
12
12
|
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
13
13
|
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
@@ -123,8 +123,6 @@ fn clear_processor_registry() {
|
|
|
123
123
|
.write()
|
|
124
124
|
.expect("Failed to acquire write lock on registry in test");
|
|
125
125
|
let _ = reg.shutdown_all();
|
|
126
|
-
drop(reg);
|
|
127
|
-
let _ = clear_processor_cache();
|
|
128
126
|
}
|
|
129
127
|
|
|
130
128
|
#[tokio::test]
|
|
@@ -140,7 +138,6 @@ async fn test_pipeline_empty_no_processors() {
|
|
|
140
138
|
detected_languages: None,
|
|
141
139
|
chunks: None,
|
|
142
140
|
images: None,
|
|
143
|
-
pages: None,
|
|
144
141
|
};
|
|
145
142
|
let config = ExtractionConfig::default();
|
|
146
143
|
|
|
@@ -185,7 +182,6 @@ async fn test_pipeline_single_processor_per_stage() {
|
|
|
185
182
|
detected_languages: None,
|
|
186
183
|
chunks: None,
|
|
187
184
|
images: None,
|
|
188
|
-
pages: None,
|
|
189
185
|
};
|
|
190
186
|
let config = ExtractionConfig::default();
|
|
191
187
|
|
|
@@ -230,7 +226,6 @@ async fn test_pipeline_multiple_processors_per_stage() {
|
|
|
230
226
|
detected_languages: None,
|
|
231
227
|
chunks: None,
|
|
232
228
|
images: None,
|
|
233
|
-
pages: None,
|
|
234
229
|
};
|
|
235
230
|
let config = ExtractionConfig::default();
|
|
236
231
|
|
|
@@ -266,7 +261,6 @@ async fn test_pipeline_all_stages_enabled() {
|
|
|
266
261
|
detected_languages: None,
|
|
267
262
|
chunks: None,
|
|
268
263
|
images: None,
|
|
269
|
-
pages: None,
|
|
270
264
|
};
|
|
271
265
|
let config = ExtractionConfig::default();
|
|
272
266
|
|
|
@@ -300,15 +294,12 @@ async fn test_pipeline_postprocessing_disabled() {
|
|
|
300
294
|
detected_languages: None,
|
|
301
295
|
chunks: None,
|
|
302
296
|
images: None,
|
|
303
|
-
pages: None,
|
|
304
297
|
};
|
|
305
298
|
let config = ExtractionConfig {
|
|
306
299
|
postprocessor: Some(PostProcessorConfig {
|
|
307
300
|
enabled: false,
|
|
308
301
|
enabled_processors: None,
|
|
309
302
|
disabled_processors: None,
|
|
310
|
-
enabled_set: None,
|
|
311
|
-
disabled_set: None,
|
|
312
303
|
}),
|
|
313
304
|
..Default::default()
|
|
314
305
|
};
|
|
@@ -349,7 +340,6 @@ async fn test_pipeline_early_stage_runs_first() {
|
|
|
349
340
|
detected_languages: None,
|
|
350
341
|
chunks: None,
|
|
351
342
|
images: None,
|
|
352
|
-
pages: None,
|
|
353
343
|
};
|
|
354
344
|
let config = ExtractionConfig::default();
|
|
355
345
|
|
|
@@ -389,7 +379,6 @@ async fn test_pipeline_middle_stage_runs_second() {
|
|
|
389
379
|
detected_languages: None,
|
|
390
380
|
chunks: None,
|
|
391
381
|
images: None,
|
|
392
|
-
pages: None,
|
|
393
382
|
};
|
|
394
383
|
let config = ExtractionConfig::default();
|
|
395
384
|
|
|
@@ -425,7 +414,6 @@ async fn test_pipeline_late_stage_runs_last() {
|
|
|
425
414
|
detected_languages: None,
|
|
426
415
|
chunks: None,
|
|
427
416
|
images: None,
|
|
428
|
-
pages: None,
|
|
429
417
|
};
|
|
430
418
|
let config = ExtractionConfig::default();
|
|
431
419
|
|
|
@@ -461,7 +449,6 @@ async fn test_pipeline_within_stage_priority_order() {
|
|
|
461
449
|
detected_languages: None,
|
|
462
450
|
chunks: None,
|
|
463
451
|
images: None,
|
|
464
|
-
pages: None,
|
|
465
452
|
};
|
|
466
453
|
let config = ExtractionConfig::default();
|
|
467
454
|
|
|
@@ -526,7 +513,6 @@ async fn test_pipeline_cross_stage_data_flow() {
|
|
|
526
513
|
detected_languages: None,
|
|
527
514
|
chunks: None,
|
|
528
515
|
images: None,
|
|
529
|
-
pages: None,
|
|
530
516
|
};
|
|
531
517
|
let config = ExtractionConfig::default();
|
|
532
518
|
|
|
@@ -583,7 +569,6 @@ async fn test_pipeline_early_stage_error_recorded() {
|
|
|
583
569
|
detected_languages: None,
|
|
584
570
|
chunks: None,
|
|
585
571
|
images: None,
|
|
586
|
-
pages: None,
|
|
587
572
|
};
|
|
588
573
|
let config = ExtractionConfig::default();
|
|
589
574
|
|
|
@@ -625,7 +610,6 @@ async fn test_pipeline_middle_stage_error_propagation() {
|
|
|
625
610
|
detected_languages: None,
|
|
626
611
|
chunks: None,
|
|
627
612
|
images: None,
|
|
628
|
-
pages: None,
|
|
629
613
|
};
|
|
630
614
|
let config = ExtractionConfig::default();
|
|
631
615
|
|
|
@@ -697,7 +681,6 @@ async fn test_pipeline_late_stage_error_doesnt_affect_earlier_stages() {
|
|
|
697
681
|
detected_languages: None,
|
|
698
682
|
chunks: None,
|
|
699
683
|
images: None,
|
|
700
|
-
pages: None,
|
|
701
684
|
};
|
|
702
685
|
let config = ExtractionConfig::default();
|
|
703
686
|
|
|
@@ -785,7 +768,6 @@ async fn test_pipeline_processor_error_doesnt_stop_other_processors() {
|
|
|
785
768
|
detected_languages: None,
|
|
786
769
|
chunks: None,
|
|
787
770
|
images: None,
|
|
788
|
-
pages: None,
|
|
789
771
|
};
|
|
790
772
|
let config = ExtractionConfig::default();
|
|
791
773
|
|
|
@@ -863,12 +845,12 @@ async fn test_pipeline_multiple_processor_errors() {
|
|
|
863
845
|
detected_languages: None,
|
|
864
846
|
chunks: None,
|
|
865
847
|
images: None,
|
|
866
|
-
pages: None,
|
|
867
848
|
};
|
|
868
849
|
let config = ExtractionConfig::default();
|
|
869
850
|
|
|
870
851
|
let result = run_pipeline(result, &config).await;
|
|
871
852
|
assert!(result.is_err(), "Expected pipeline to return error");
|
|
853
|
+
// First failing processor (fail1 in Early stage) will cause pipeline to fail
|
|
872
854
|
match result {
|
|
873
855
|
Err(KreuzbergError::Plugin { message, plugin_name }) => {
|
|
874
856
|
assert_eq!(message, "fail1 error");
|
|
@@ -905,7 +887,6 @@ async fn test_pipeline_error_context_preservation() {
|
|
|
905
887
|
detected_languages: None,
|
|
906
888
|
chunks: None,
|
|
907
889
|
images: None,
|
|
908
|
-
pages: None,
|
|
909
890
|
};
|
|
910
891
|
let config = ExtractionConfig::default();
|
|
911
892
|
|
|
@@ -977,7 +958,6 @@ async fn test_pipeline_metadata_added_in_early_visible_in_middle() {
|
|
|
977
958
|
detected_languages: None,
|
|
978
959
|
chunks: None,
|
|
979
960
|
images: None,
|
|
980
|
-
pages: None,
|
|
981
961
|
};
|
|
982
962
|
let config = ExtractionConfig::default();
|
|
983
963
|
|
|
@@ -1048,7 +1028,6 @@ async fn test_pipeline_content_modified_in_middle_visible_in_late() {
|
|
|
1048
1028
|
detected_languages: None,
|
|
1049
1029
|
chunks: None,
|
|
1050
1030
|
images: None,
|
|
1051
|
-
pages: None,
|
|
1052
1031
|
};
|
|
1053
1032
|
let config = ExtractionConfig::default();
|
|
1054
1033
|
|
|
@@ -1117,7 +1096,6 @@ async fn test_pipeline_multiple_processors_modifying_same_metadata() {
|
|
|
1117
1096
|
detected_languages: None,
|
|
1118
1097
|
chunks: None,
|
|
1119
1098
|
images: None,
|
|
1120
|
-
pages: None,
|
|
1121
1099
|
};
|
|
1122
1100
|
let config = ExtractionConfig::default();
|
|
1123
1101
|
|
|
@@ -1205,7 +1183,6 @@ async fn test_pipeline_processors_reading_previous_output() {
|
|
|
1205
1183
|
detected_languages: None,
|
|
1206
1184
|
chunks: None,
|
|
1207
1185
|
images: None,
|
|
1208
|
-
pages: None,
|
|
1209
1186
|
};
|
|
1210
1187
|
let config = ExtractionConfig::default();
|
|
1211
1188
|
|
|
@@ -1260,7 +1237,6 @@ async fn test_pipeline_large_content_modification() {
|
|
|
1260
1237
|
detected_languages: None,
|
|
1261
1238
|
chunks: None,
|
|
1262
1239
|
images: None,
|
|
1263
|
-
pages: None,
|
|
1264
1240
|
};
|
|
1265
1241
|
let config = ExtractionConfig::default();
|
|
1266
1242
|
|
|
@@ -1296,15 +1272,12 @@ async fn test_pipeline_enabled_processors_whitelist() {
|
|
|
1296
1272
|
detected_languages: None,
|
|
1297
1273
|
chunks: None,
|
|
1298
1274
|
images: None,
|
|
1299
|
-
pages: None,
|
|
1300
1275
|
};
|
|
1301
1276
|
let config = ExtractionConfig {
|
|
1302
1277
|
postprocessor: Some(PostProcessorConfig {
|
|
1303
1278
|
enabled: true,
|
|
1304
1279
|
enabled_processors: Some(vec!["proc1".to_string(), "proc3".to_string()]),
|
|
1305
1280
|
disabled_processors: None,
|
|
1306
|
-
enabled_set: None,
|
|
1307
|
-
disabled_set: None,
|
|
1308
1281
|
}),
|
|
1309
1282
|
..Default::default()
|
|
1310
1283
|
};
|
|
@@ -1343,15 +1316,12 @@ async fn test_pipeline_disabled_processors_blacklist() {
|
|
|
1343
1316
|
detected_languages: None,
|
|
1344
1317
|
chunks: None,
|
|
1345
1318
|
images: None,
|
|
1346
|
-
pages: None,
|
|
1347
1319
|
};
|
|
1348
1320
|
let config = ExtractionConfig {
|
|
1349
1321
|
postprocessor: Some(PostProcessorConfig {
|
|
1350
1322
|
enabled: true,
|
|
1351
1323
|
enabled_processors: None,
|
|
1352
1324
|
disabled_processors: Some(vec!["proc2".to_string()]),
|
|
1353
|
-
enabled_set: None,
|
|
1354
|
-
disabled_set: None,
|
|
1355
1325
|
}),
|
|
1356
1326
|
..Default::default()
|
|
1357
1327
|
};
|
|
@@ -1390,7 +1360,6 @@ async fn test_pipeline_no_filtering_runs_all() {
|
|
|
1390
1360
|
detected_languages: None,
|
|
1391
1361
|
chunks: None,
|
|
1392
1362
|
images: None,
|
|
1393
|
-
pages: None,
|
|
1394
1363
|
};
|
|
1395
1364
|
let config = ExtractionConfig::default();
|
|
1396
1365
|
|
|
@@ -1428,15 +1397,12 @@ async fn test_pipeline_empty_whitelist_runs_none() {
|
|
|
1428
1397
|
detected_languages: None,
|
|
1429
1398
|
chunks: None,
|
|
1430
1399
|
images: None,
|
|
1431
|
-
pages: None,
|
|
1432
1400
|
};
|
|
1433
1401
|
let config = ExtractionConfig {
|
|
1434
1402
|
postprocessor: Some(PostProcessorConfig {
|
|
1435
1403
|
enabled: true,
|
|
1436
1404
|
enabled_processors: Some(vec![]),
|
|
1437
1405
|
disabled_processors: None,
|
|
1438
|
-
enabled_set: None,
|
|
1439
|
-
disabled_set: None,
|
|
1440
1406
|
}),
|
|
1441
1407
|
..Default::default()
|
|
1442
1408
|
};
|
|
@@ -3,8 +3,6 @@
|
|
|
3
3
|
//! Tests custom OCR backend registration, execution, parameter passing,
|
|
4
4
|
//! error handling, and backend switching with real image extraction.
|
|
5
5
|
|
|
6
|
-
#![cfg(feature = "ocr")]
|
|
7
|
-
|
|
8
6
|
use async_trait::async_trait;
|
|
9
7
|
use kreuzberg::core::config::{ExtractionConfig, OcrConfig};
|
|
10
8
|
use kreuzberg::plugins::registry::get_ocr_backend_registry;
|
|
@@ -62,7 +60,6 @@ impl OcrBackend for MockOcrBackend {
|
|
|
62
60
|
detected_languages: None,
|
|
63
61
|
chunks: None,
|
|
64
62
|
images: None,
|
|
65
|
-
pages: None,
|
|
66
63
|
})
|
|
67
64
|
}
|
|
68
65
|
|
|
@@ -158,7 +155,6 @@ impl OcrBackend for ValidatingOcrBackend {
|
|
|
158
155
|
detected_languages: None,
|
|
159
156
|
chunks: None,
|
|
160
157
|
images: None,
|
|
161
|
-
pages: None,
|
|
162
158
|
})
|
|
163
159
|
}
|
|
164
160
|
|
|
@@ -215,7 +211,6 @@ impl OcrBackend for MetadataOcrBackend {
|
|
|
215
211
|
detected_languages: None,
|
|
216
212
|
chunks: None,
|
|
217
213
|
images: None,
|
|
218
|
-
pages: None,
|
|
219
214
|
})
|
|
220
215
|
}
|
|
221
216
|
|
|
@@ -5,7 +5,6 @@
|
|
|
5
5
|
|
|
6
6
|
use async_trait::async_trait;
|
|
7
7
|
use kreuzberg::core::config::ExtractionConfig;
|
|
8
|
-
use kreuzberg::core::pipeline::clear_processor_cache;
|
|
9
8
|
use kreuzberg::plugins::registry::get_post_processor_registry;
|
|
10
9
|
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
11
10
|
use kreuzberg::types::ExtractionResult;
|
|
@@ -165,16 +164,6 @@ impl PostProcessor for FailingProcessor {
|
|
|
165
164
|
}
|
|
166
165
|
}
|
|
167
166
|
|
|
168
|
-
fn clear_processor_registry_and_cache() {
|
|
169
|
-
let registry = get_post_processor_registry();
|
|
170
|
-
let mut reg = registry
|
|
171
|
-
.write()
|
|
172
|
-
.expect("Failed to acquire write lock on registry in test");
|
|
173
|
-
let _ = reg.shutdown_all();
|
|
174
|
-
drop(reg);
|
|
175
|
-
let _ = clear_processor_cache();
|
|
176
|
-
}
|
|
177
|
-
|
|
178
167
|
#[serial]
|
|
179
168
|
#[test]
|
|
180
169
|
fn test_register_custom_postprocessor() {
|
|
@@ -213,7 +202,6 @@ fn test_register_custom_postprocessor() {
|
|
|
213
202
|
#[serial]
|
|
214
203
|
#[test]
|
|
215
204
|
fn test_postprocessor_called_during_extraction() {
|
|
216
|
-
clear_processor_registry_and_cache();
|
|
217
205
|
let test_file = "../../test_documents/text/fake_text.txt";
|
|
218
206
|
let registry = get_post_processor_registry();
|
|
219
207
|
|
|
@@ -256,7 +244,6 @@ fn test_postprocessor_called_during_extraction() {
|
|
|
256
244
|
#[serial]
|
|
257
245
|
#[test]
|
|
258
246
|
fn test_postprocessor_modifies_content() {
|
|
259
|
-
clear_processor_registry_and_cache();
|
|
260
247
|
let test_file = "../../test_documents/text/fake_text.txt";
|
|
261
248
|
let registry = get_post_processor_registry();
|
|
262
249
|
|
|
@@ -288,7 +275,6 @@ fn test_postprocessor_modifies_content() {
|
|
|
288
275
|
#[serial]
|
|
289
276
|
#[test]
|
|
290
277
|
fn test_postprocessor_adds_metadata() {
|
|
291
|
-
clear_processor_registry_and_cache();
|
|
292
278
|
let test_file = "../../test_documents/text/fake_text.txt";
|
|
293
279
|
let registry = get_post_processor_registry();
|
|
294
280
|
|
|
@@ -431,7 +417,6 @@ fn test_clear_all_postprocessors() {
|
|
|
431
417
|
#[serial]
|
|
432
418
|
#[test]
|
|
433
419
|
fn test_postprocessor_error_handling() {
|
|
434
|
-
clear_processor_registry_and_cache();
|
|
435
420
|
let test_file = "../../test_documents/text/fake_text.txt";
|
|
436
421
|
let registry = get_post_processor_registry();
|
|
437
422
|
|
|
@@ -448,6 +433,7 @@ fn test_postprocessor_error_handling() {
|
|
|
448
433
|
let result = extract_file_sync(test_file, None, &config);
|
|
449
434
|
|
|
450
435
|
// NOTE: Plugin errors now bubble up and fail the extraction (design change)
|
|
436
|
+
// Other error types (non-IO, non-Plugin) are caught and recorded in metadata
|
|
451
437
|
assert!(
|
|
452
438
|
result.is_err(),
|
|
453
439
|
"Extraction should fail when postprocessor returns Plugin error"
|
|
@@ -500,7 +486,6 @@ fn test_postprocessor_invalid_name() {
|
|
|
500
486
|
#[serial]
|
|
501
487
|
#[test]
|
|
502
488
|
fn test_multiple_postprocessors_execution_order() {
|
|
503
|
-
clear_processor_registry_and_cache();
|
|
504
489
|
let test_file = "../../test_documents/text/fake_text.txt";
|
|
505
490
|
let registry = get_post_processor_registry();
|
|
506
491
|
|
|
@@ -546,7 +531,6 @@ fn test_multiple_postprocessors_execution_order() {
|
|
|
546
531
|
#[serial]
|
|
547
532
|
#[test]
|
|
548
533
|
fn test_postprocessor_preserves_mime_type() {
|
|
549
|
-
clear_processor_registry_and_cache();
|
|
550
534
|
let test_file = "../../test_documents/text/fake_text.txt";
|
|
551
535
|
let registry = get_post_processor_registry();
|
|
552
536
|
|
|
@@ -58,7 +58,6 @@ impl DocumentExtractor for FailingExtractor {
|
|
|
58
58
|
detected_languages: None,
|
|
59
59
|
chunks: None,
|
|
60
60
|
images: None,
|
|
61
|
-
pages: None,
|
|
62
61
|
})
|
|
63
62
|
}
|
|
64
63
|
}
|
|
@@ -303,7 +302,6 @@ fn test_extractor_priority_ordering_complex() {
|
|
|
303
302
|
detected_languages: None,
|
|
304
303
|
chunks: None,
|
|
305
304
|
images: None,
|
|
306
|
-
pages: None,
|
|
307
305
|
})
|
|
308
306
|
}
|
|
309
307
|
fn supported_mime_types(&self) -> &[&str] {
|
|
@@ -463,7 +461,6 @@ async fn test_processor_execution_order_within_stage() {
|
|
|
463
461
|
detected_languages: None,
|
|
464
462
|
chunks: None,
|
|
465
463
|
images: None,
|
|
466
|
-
pages: None,
|
|
467
464
|
};
|
|
468
465
|
|
|
469
466
|
let config = ExtractionConfig::default();
|
|
@@ -495,7 +492,6 @@ async fn test_processor_error_propagation() {
|
|
|
495
492
|
detected_languages: None,
|
|
496
493
|
chunks: None,
|
|
497
494
|
images: None,
|
|
498
|
-
pages: None,
|
|
499
495
|
};
|
|
500
496
|
|
|
501
497
|
let config = ExtractionConfig::default();
|
|
@@ -667,7 +663,6 @@ async fn test_validator_content_validation() {
|
|
|
667
663
|
detected_languages: None,
|
|
668
664
|
chunks: None,
|
|
669
665
|
images: None,
|
|
670
|
-
pages: None,
|
|
671
666
|
};
|
|
672
667
|
|
|
673
668
|
let validation = validators[0].validate(&short_result, &config).await;
|
|
@@ -681,7 +676,6 @@ async fn test_validator_content_validation() {
|
|
|
681
676
|
detected_languages: None,
|
|
682
677
|
chunks: None,
|
|
683
678
|
images: None,
|
|
684
|
-
pages: None,
|
|
685
679
|
};
|
|
686
680
|
|
|
687
681
|
let validation = validators[0].validate(&long_result, &config).await;
|
|
@@ -15,6 +15,8 @@ use kreuzberg::{KreuzbergError, Result};
|
|
|
15
15
|
use std::path::Path;
|
|
16
16
|
use std::sync::Arc;
|
|
17
17
|
|
|
18
|
+
// ===== Mock Validators =====
|
|
19
|
+
|
|
18
20
|
struct MockValidator {
|
|
19
21
|
name: String,
|
|
20
22
|
should_fail: bool,
|
|
@@ -85,6 +87,8 @@ impl Validator for FailingInitValidator {
|
|
|
85
87
|
}
|
|
86
88
|
}
|
|
87
89
|
|
|
90
|
+
// ===== Mock Extractors =====
|
|
91
|
+
|
|
88
92
|
struct MockExtractor {
|
|
89
93
|
name: String,
|
|
90
94
|
mime_types: Vec<&'static str>,
|
|
@@ -125,7 +129,6 @@ impl DocumentExtractor for MockExtractor {
|
|
|
125
129
|
detected_languages: None,
|
|
126
130
|
chunks: None,
|
|
127
131
|
images: None,
|
|
128
|
-
pages: None,
|
|
129
132
|
})
|
|
130
133
|
}
|
|
131
134
|
|
|
@@ -143,6 +146,8 @@ impl DocumentExtractor for MockExtractor {
|
|
|
143
146
|
}
|
|
144
147
|
}
|
|
145
148
|
|
|
149
|
+
// ===== Validator Registry Tests =====
|
|
150
|
+
|
|
146
151
|
/// Test validator registration and listing.
|
|
147
152
|
#[test]
|
|
148
153
|
fn test_validator_registration_succeeds() {
|
|
@@ -275,10 +280,13 @@ fn test_validator_registration_with_failed_init_fails() {
|
|
|
275
280
|
assert!(result.is_err(), "Registration with failed init should fail");
|
|
276
281
|
|
|
277
282
|
match result {
|
|
278
|
-
Err(KreuzbergError::Plugin { .. }) => {
|
|
283
|
+
Err(KreuzbergError::Plugin { .. }) => {
|
|
284
|
+
// Expected error type
|
|
285
|
+
}
|
|
279
286
|
_ => panic!("Expected Plugin error"),
|
|
280
287
|
}
|
|
281
288
|
|
|
289
|
+
// Validator should not be in the list
|
|
282
290
|
assert_eq!(registry.list().len(), 0, "Failed validator should not be registered");
|
|
283
291
|
}
|
|
284
292
|
|
|
@@ -287,6 +295,7 @@ fn test_validator_registration_with_failed_init_fails() {
|
|
|
287
295
|
fn test_clear_validators_succeeds() {
|
|
288
296
|
let mut registry = ValidatorRegistry::new();
|
|
289
297
|
|
|
298
|
+
// Register multiple validators
|
|
290
299
|
let v1 = Arc::new(MockValidator {
|
|
291
300
|
name: "validator-1".to_string(),
|
|
292
301
|
should_fail: false,
|
|
@@ -300,6 +309,7 @@ fn test_clear_validators_succeeds() {
|
|
|
300
309
|
registry.register(v2).unwrap();
|
|
301
310
|
assert_eq!(registry.list().len(), 2);
|
|
302
311
|
|
|
312
|
+
// Clear all
|
|
303
313
|
let result = registry.shutdown_all();
|
|
304
314
|
assert!(result.is_ok(), "Clear should succeed");
|
|
305
315
|
assert_eq!(registry.list().len(), 0, "Registry should be empty after clear");
|
|
@@ -360,11 +370,14 @@ fn test_get_all_validators_respects_priority() {
|
|
|
360
370
|
let all = registry.get_all();
|
|
361
371
|
assert_eq!(all.len(), 3, "Should have three validators");
|
|
362
372
|
|
|
373
|
+
// Should be in descending priority order
|
|
363
374
|
assert_eq!(all[0].name(), "high-priority");
|
|
364
375
|
assert_eq!(all[1].name(), "medium-priority");
|
|
365
376
|
assert_eq!(all[2].name(), "low-priority");
|
|
366
377
|
}
|
|
367
378
|
|
|
379
|
+
// ===== Extractor Registry Tests =====
|
|
380
|
+
|
|
368
381
|
/// Test extractor registration and retrieval.
|
|
369
382
|
#[test]
|
|
370
383
|
fn test_extractor_registration_succeeds() {
|
|
@@ -438,6 +451,7 @@ fn test_extractor_priority_selection() {
|
|
|
438
451
|
registry.register(low_priority).unwrap();
|
|
439
452
|
registry.register(high_priority).unwrap();
|
|
440
453
|
|
|
454
|
+
// Should get the high priority extractor
|
|
441
455
|
let result = registry.get("text/plain").unwrap();
|
|
442
456
|
assert_eq!(
|
|
443
457
|
result.name(),
|
|
@@ -459,14 +473,17 @@ fn test_extractor_wildcard_mime_matching() {
|
|
|
459
473
|
|
|
460
474
|
registry.register(extractor).unwrap();
|
|
461
475
|
|
|
476
|
+
// Should match text/plain
|
|
462
477
|
let result = registry.get("text/plain");
|
|
463
478
|
assert!(result.is_ok(), "Should match text/plain with text/*");
|
|
464
479
|
assert_eq!(result.unwrap().name(), "text-extractor");
|
|
465
480
|
|
|
481
|
+
// Should match text/html
|
|
466
482
|
let result = registry.get("text/html");
|
|
467
483
|
assert!(result.is_ok(), "Should match text/html with text/*");
|
|
468
484
|
assert_eq!(result.unwrap().name(), "text-extractor");
|
|
469
485
|
|
|
486
|
+
// Should not match application/pdf
|
|
470
487
|
let result = registry.get("application/pdf");
|
|
471
488
|
assert!(result.is_err(), "Should not match application/pdf with text/*");
|
|
472
489
|
}
|
|
@@ -489,6 +506,7 @@ fn test_extractor_unregistration_succeeds() {
|
|
|
489
506
|
assert!(result.is_ok(), "Unregistration should succeed");
|
|
490
507
|
assert_eq!(registry.list().len(), 0, "Registry should be empty after removal");
|
|
491
508
|
|
|
509
|
+
// Should no longer find extractor for MIME type
|
|
492
510
|
let lookup_result = registry.get("text/plain");
|
|
493
511
|
assert!(lookup_result.is_err(), "Should not find extractor after removal");
|
|
494
512
|
}
|
|
@@ -506,10 +524,12 @@ fn test_extractor_multiple_mime_types() {
|
|
|
506
524
|
|
|
507
525
|
registry.register(extractor).unwrap();
|
|
508
526
|
|
|
527
|
+
// Should find for all MIME types
|
|
509
528
|
assert!(registry.get("application/pdf").is_ok());
|
|
510
529
|
assert!(registry.get("application/vnd.ms-excel").is_ok());
|
|
511
530
|
assert!(registry.get("text/csv").is_ok());
|
|
512
531
|
|
|
532
|
+
// All should return the same extractor
|
|
513
533
|
assert_eq!(
|
|
514
534
|
registry.get("application/pdf").unwrap().name(),
|
|
515
535
|
"multi-format-extractor"
|
|
@@ -12,17 +12,6 @@ use kreuzberg::core::extractor::{extract_bytes_sync, extract_file_sync};
|
|
|
12
12
|
use std::io::Write;
|
|
13
13
|
use tempfile::NamedTempFile;
|
|
14
14
|
|
|
15
|
-
fn trim_trailing_newlines(value: &str) -> &str {
|
|
16
|
-
value.trim_end_matches(['\n', '\r'])
|
|
17
|
-
}
|
|
18
|
-
|
|
19
|
-
fn assert_text_content(actual: &str, expected: &str) {
|
|
20
|
-
assert_eq!(
|
|
21
|
-
trim_trailing_newlines(actual),
|
|
22
|
-
expected,
|
|
23
|
-
"Content mismatch after trimming trailing newlines"
|
|
24
|
-
);
|
|
25
|
-
}
|
|
26
15
|
#[test]
|
|
27
16
|
fn test_archive_zip_bomb_detection() {
|
|
28
17
|
let mut cursor = std::io::Cursor::new(Vec::new());
|
|
@@ -140,7 +129,6 @@ fn test_archive_deeply_nested_directories() {
|
|
|
140
129
|
}
|
|
141
130
|
|
|
142
131
|
#[test]
|
|
143
|
-
#[cfg(feature = "archives")]
|
|
144
132
|
fn test_archive_many_small_files() {
|
|
145
133
|
let mut cursor = std::io::Cursor::new(Vec::new());
|
|
146
134
|
{
|
|
@@ -278,7 +266,7 @@ fn test_resource_single_byte_file() {
|
|
|
278
266
|
|
|
279
267
|
assert!(result.is_ok());
|
|
280
268
|
if let Ok(extracted) = result {
|
|
281
|
-
|
|
269
|
+
assert_eq!(extracted.content, "a");
|
|
282
270
|
}
|
|
283
271
|
}
|
|
284
272
|
|