kreuzberg 4.0.0.pre.rc.29 → 4.0.0.rc1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +0 -6
- data/.rubocop.yaml +534 -1
- data/Gemfile +2 -1
- data/Gemfile.lock +28 -116
- data/README.md +269 -629
- data/Rakefile +0 -9
- data/Steepfile +4 -8
- data/examples/async_patterns.rb +58 -1
- data/ext/kreuzberg_rb/extconf.rb +5 -35
- data/ext/kreuzberg_rb/native/Cargo.toml +16 -55
- data/ext/kreuzberg_rb/native/build.rs +14 -12
- data/ext/kreuzberg_rb/native/include/ieeefp.h +1 -1
- data/ext/kreuzberg_rb/native/include/msvc_compat/strings.h +1 -1
- data/ext/kreuzberg_rb/native/include/strings.h +2 -2
- data/ext/kreuzberg_rb/native/include/unistd.h +1 -1
- data/ext/kreuzberg_rb/native/src/lib.rs +34 -897
- data/extconf.rb +6 -38
- data/kreuzberg.gemspec +20 -114
- data/lib/kreuzberg/api_proxy.rb +18 -2
- data/lib/kreuzberg/cache_api.rb +0 -22
- data/lib/kreuzberg/cli.rb +10 -2
- data/lib/kreuzberg/cli_proxy.rb +10 -0
- data/lib/kreuzberg/config.rb +22 -274
- data/lib/kreuzberg/errors.rb +7 -73
- data/lib/kreuzberg/extraction_api.rb +8 -237
- data/lib/kreuzberg/mcp_proxy.rb +11 -2
- data/lib/kreuzberg/ocr_backend_protocol.rb +73 -0
- data/lib/kreuzberg/post_processor_protocol.rb +71 -0
- data/lib/kreuzberg/result.rb +33 -151
- data/lib/kreuzberg/setup_lib_path.rb +2 -22
- data/lib/kreuzberg/validator_protocol.rb +73 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +13 -27
- data/pkg/kreuzberg-4.0.0.rc1.gem +0 -0
- data/sig/kreuzberg.rbs +12 -105
- data/spec/binding/cache_spec.rb +22 -22
- data/spec/binding/cli_proxy_spec.rb +4 -2
- data/spec/binding/cli_spec.rb +11 -12
- data/spec/binding/config_spec.rb +0 -74
- data/spec/binding/config_validation_spec.rb +6 -100
- data/spec/binding/error_handling_spec.rb +97 -283
- data/spec/binding/plugins/ocr_backend_spec.rb +8 -8
- data/spec/binding/plugins/postprocessor_spec.rb +11 -11
- data/spec/binding/plugins/validator_spec.rb +13 -12
- data/spec/examples.txt +104 -0
- data/spec/fixtures/config.toml +1 -0
- data/spec/fixtures/config.yaml +1 -0
- data/spec/fixtures/invalid_config.toml +1 -0
- data/spec/smoke/package_spec.rb +3 -2
- data/spec/spec_helper.rb +3 -1
- data/vendor/kreuzberg/Cargo.toml +67 -192
- data/vendor/kreuzberg/README.md +9 -97
- data/vendor/kreuzberg/build.rs +194 -516
- data/vendor/kreuzberg/src/api/handlers.rs +9 -130
- data/vendor/kreuzberg/src/api/mod.rs +3 -18
- data/vendor/kreuzberg/src/api/server.rs +71 -236
- data/vendor/kreuzberg/src/api/types.rs +7 -43
- data/vendor/kreuzberg/src/bin/profile_extract.rs +455 -0
- data/vendor/kreuzberg/src/cache/mod.rs +3 -27
- data/vendor/kreuzberg/src/chunking/mod.rs +79 -1705
- data/vendor/kreuzberg/src/core/batch_mode.rs +0 -60
- data/vendor/kreuzberg/src/core/config.rs +23 -905
- data/vendor/kreuzberg/src/core/extractor.rs +106 -403
- data/vendor/kreuzberg/src/core/io.rs +2 -4
- data/vendor/kreuzberg/src/core/mime.rs +12 -2
- data/vendor/kreuzberg/src/core/mod.rs +3 -22
- data/vendor/kreuzberg/src/core/pipeline.rs +78 -395
- data/vendor/kreuzberg/src/embeddings.rs +21 -169
- data/vendor/kreuzberg/src/error.rs +2 -2
- data/vendor/kreuzberg/src/extraction/archive.rs +31 -36
- data/vendor/kreuzberg/src/extraction/docx.rs +1 -365
- data/vendor/kreuzberg/src/extraction/email.rs +11 -12
- data/vendor/kreuzberg/src/extraction/excel.rs +129 -138
- data/vendor/kreuzberg/src/extraction/html.rs +170 -1447
- data/vendor/kreuzberg/src/extraction/image.rs +14 -138
- data/vendor/kreuzberg/src/extraction/libreoffice.rs +3 -13
- data/vendor/kreuzberg/src/extraction/mod.rs +5 -21
- data/vendor/kreuzberg/src/extraction/office_metadata/mod.rs +0 -2
- data/vendor/kreuzberg/src/extraction/pandoc/batch.rs +275 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mime_types.rs +178 -0
- data/vendor/kreuzberg/src/extraction/pandoc/mod.rs +491 -0
- data/vendor/kreuzberg/src/extraction/pandoc/server.rs +496 -0
- data/vendor/kreuzberg/src/extraction/pandoc/subprocess.rs +1188 -0
- data/vendor/kreuzberg/src/extraction/pandoc/version.rs +162 -0
- data/vendor/kreuzberg/src/extraction/pptx.rs +94 -196
- data/vendor/kreuzberg/src/extraction/structured.rs +4 -5
- data/vendor/kreuzberg/src/extraction/table.rs +1 -2
- data/vendor/kreuzberg/src/extraction/text.rs +10 -18
- data/vendor/kreuzberg/src/extractors/archive.rs +0 -22
- data/vendor/kreuzberg/src/extractors/docx.rs +148 -69
- data/vendor/kreuzberg/src/extractors/email.rs +9 -37
- data/vendor/kreuzberg/src/extractors/excel.rs +40 -81
- data/vendor/kreuzberg/src/extractors/html.rs +173 -182
- data/vendor/kreuzberg/src/extractors/image.rs +8 -32
- data/vendor/kreuzberg/src/extractors/mod.rs +10 -171
- data/vendor/kreuzberg/src/extractors/pandoc.rs +201 -0
- data/vendor/kreuzberg/src/extractors/pdf.rs +64 -329
- data/vendor/kreuzberg/src/extractors/pptx.rs +34 -79
- data/vendor/kreuzberg/src/extractors/structured.rs +0 -16
- data/vendor/kreuzberg/src/extractors/text.rs +7 -30
- data/vendor/kreuzberg/src/extractors/xml.rs +8 -27
- data/vendor/kreuzberg/src/keywords/processor.rs +1 -9
- data/vendor/kreuzberg/src/keywords/rake.rs +1 -0
- data/vendor/kreuzberg/src/language_detection/mod.rs +51 -94
- data/vendor/kreuzberg/src/lib.rs +5 -17
- data/vendor/kreuzberg/src/mcp/mod.rs +1 -4
- data/vendor/kreuzberg/src/mcp/server.rs +21 -145
- data/vendor/kreuzberg/src/ocr/mod.rs +0 -2
- data/vendor/kreuzberg/src/ocr/processor.rs +8 -19
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +0 -2
- data/vendor/kreuzberg/src/pdf/error.rs +1 -93
- data/vendor/kreuzberg/src/pdf/metadata.rs +100 -263
- data/vendor/kreuzberg/src/pdf/mod.rs +2 -33
- data/vendor/kreuzberg/src/pdf/rendering.rs +12 -12
- data/vendor/kreuzberg/src/pdf/table.rs +64 -61
- data/vendor/kreuzberg/src/pdf/text.rs +24 -416
- data/vendor/kreuzberg/src/plugins/extractor.rs +8 -40
- data/vendor/kreuzberg/src/plugins/mod.rs +0 -3
- data/vendor/kreuzberg/src/plugins/ocr.rs +14 -22
- data/vendor/kreuzberg/src/plugins/processor.rs +1 -10
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -15
- data/vendor/kreuzberg/src/plugins/validator.rs +8 -20
- data/vendor/kreuzberg/src/stopwords/mod.rs +2 -2
- data/vendor/kreuzberg/src/text/mod.rs +0 -8
- data/vendor/kreuzberg/src/text/quality.rs +15 -28
- data/vendor/kreuzberg/src/text/string_utils.rs +10 -22
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +50 -86
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +16 -37
- data/vendor/kreuzberg/src/text/token_reduction/simd_text.rs +1 -2
- data/vendor/kreuzberg/src/types.rs +67 -907
- data/vendor/kreuzberg/src/utils/mod.rs +0 -14
- data/vendor/kreuzberg/src/utils/quality.rs +3 -12
- data/vendor/kreuzberg/tests/api_tests.rs +0 -506
- data/vendor/kreuzberg/tests/archive_integration.rs +0 -2
- data/vendor/kreuzberg/tests/batch_orchestration.rs +12 -57
- data/vendor/kreuzberg/tests/batch_processing.rs +8 -32
- data/vendor/kreuzberg/tests/chunking_offset_demo.rs +92 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +8 -40
- data/vendor/kreuzberg/tests/config_features.rs +1 -33
- data/vendor/kreuzberg/tests/config_loading_tests.rs +39 -16
- data/vendor/kreuzberg/tests/core_integration.rs +9 -35
- data/vendor/kreuzberg/tests/csv_integration.rs +81 -71
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +25 -23
- data/vendor/kreuzberg/tests/email_integration.rs +1 -3
- data/vendor/kreuzberg/tests/error_handling.rs +34 -43
- data/vendor/kreuzberg/tests/format_integration.rs +1 -7
- data/vendor/kreuzberg/tests/helpers/mod.rs +0 -60
- data/vendor/kreuzberg/tests/image_integration.rs +0 -2
- data/vendor/kreuzberg/tests/mime_detection.rs +16 -17
- data/vendor/kreuzberg/tests/ocr_configuration.rs +0 -4
- data/vendor/kreuzberg/tests/ocr_errors.rs +0 -22
- data/vendor/kreuzberg/tests/ocr_quality.rs +0 -2
- data/vendor/kreuzberg/tests/pandoc_integration.rs +503 -0
- data/vendor/kreuzberg/tests/pdf_integration.rs +0 -2
- data/vendor/kreuzberg/tests/pipeline_integration.rs +2 -36
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +0 -5
- data/vendor/kreuzberg/tests/plugin_postprocessor_test.rs +1 -17
- data/vendor/kreuzberg/tests/plugin_system.rs +0 -6
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +22 -2
- data/vendor/kreuzberg/tests/security_validation.rs +1 -13
- data/vendor/kreuzberg/tests/test_fastembed.rs +23 -45
- metadata +25 -171
- data/.rubocop.yml +0 -543
- data/ext/kreuzberg_rb/native/.cargo/config.toml +0 -23
- data/ext/kreuzberg_rb/native/Cargo.lock +0 -7619
- data/lib/kreuzberg/error_context.rb +0 -136
- data/lib/kreuzberg/types.rb +0 -170
- data/lib/libpdfium.so +0 -0
- data/spec/binding/async_operations_spec.rb +0 -473
- data/spec/binding/batch_operations_spec.rb +0 -595
- data/spec/binding/batch_spec.rb +0 -359
- data/spec/binding/config_result_spec.rb +0 -377
- data/spec/binding/embeddings_spec.rb +0 -816
- data/spec/binding/error_recovery_spec.rb +0 -488
- data/spec/binding/font_config_spec.rb +0 -220
- data/spec/binding/images_spec.rb +0 -738
- data/spec/binding/keywords_extraction_spec.rb +0 -600
- data/spec/binding/metadata_types_spec.rb +0 -1228
- data/spec/binding/pages_extraction_spec.rb +0 -471
- data/spec/binding/tables_spec.rb +0 -641
- data/spec/unit/config/chunking_config_spec.rb +0 -213
- data/spec/unit/config/embedding_config_spec.rb +0 -343
- data/spec/unit/config/extraction_config_spec.rb +0 -438
- data/spec/unit/config/font_config_spec.rb +0 -285
- data/spec/unit/config/hierarchy_config_spec.rb +0 -314
- data/spec/unit/config/image_extraction_config_spec.rb +0 -209
- data/spec/unit/config/image_preprocessing_config_spec.rb +0 -249
- data/spec/unit/config/keyword_config_spec.rb +0 -229
- data/spec/unit/config/language_detection_config_spec.rb +0 -258
- data/spec/unit/config/ocr_config_spec.rb +0 -171
- data/spec/unit/config/page_config_spec.rb +0 -221
- data/spec/unit/config/pdf_config_spec.rb +0 -267
- data/spec/unit/config/postprocessor_config_spec.rb +0 -290
- data/spec/unit/config/tesseract_config_spec.rb +0 -181
- data/spec/unit/config/token_reduction_config_spec.rb +0 -251
- data/test/metadata_types_test.rb +0 -959
- data/vendor/Cargo.toml +0 -61
- data/vendor/kreuzberg/examples/bench_fixes.rs +0 -71
- data/vendor/kreuzberg/examples/test_pdfium_fork.rs +0 -62
- data/vendor/kreuzberg/src/chunking/processor.rs +0 -219
- data/vendor/kreuzberg/src/core/batch_optimizations.rs +0 -385
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/formats.rs +0 -235
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/capacity.rs +0 -263
- data/vendor/kreuzberg/src/extraction/markdown.rs +0 -216
- data/vendor/kreuzberg/src/extraction/office_metadata/odt_properties.rs +0 -284
- data/vendor/kreuzberg/src/extractors/bibtex.rs +0 -470
- data/vendor/kreuzberg/src/extractors/docbook.rs +0 -504
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +0 -492
- data/vendor/kreuzberg/src/extractors/jats.rs +0 -1054
- data/vendor/kreuzberg/src/extractors/jupyter.rs +0 -368
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/markdown.rs +0 -701
- data/vendor/kreuzberg/src/extractors/odt.rs +0 -628
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/orgmode.rs +0 -529
- data/vendor/kreuzberg/src/extractors/rst.rs +0 -577
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/extractors/security.rs +0 -484
- data/vendor/kreuzberg/src/extractors/security_tests.rs +0 -367
- data/vendor/kreuzberg/src/extractors/typst.rs +0 -651
- data/vendor/kreuzberg/src/language_detection/processor.rs +0 -218
- data/vendor/kreuzberg/src/ocr/language_registry.rs +0 -520
- data/vendor/kreuzberg/src/panic_context.rs +0 -154
- data/vendor/kreuzberg/src/pdf/bindings.rs +0 -306
- data/vendor/kreuzberg/src/pdf/bundled.rs +0 -408
- data/vendor/kreuzberg/src/pdf/fonts.rs +0 -358
- data/vendor/kreuzberg/src/pdf/hierarchy.rs +0 -903
- data/vendor/kreuzberg/src/text/quality_processor.rs +0 -231
- data/vendor/kreuzberg/src/text/utf8_validation.rs +0 -193
- data/vendor/kreuzberg/src/utils/pool.rs +0 -503
- data/vendor/kreuzberg/src/utils/pool_sizing.rs +0 -364
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -761
- data/vendor/kreuzberg/tests/api_embed.rs +0 -360
- data/vendor/kreuzberg/tests/api_extract_multipart.rs +0 -52
- data/vendor/kreuzberg/tests/api_large_pdf_extraction.rs +0 -471
- data/vendor/kreuzberg/tests/api_large_pdf_extraction_diagnostics.rs +0 -289
- data/vendor/kreuzberg/tests/batch_pooling_benchmark.rs +0 -154
- data/vendor/kreuzberg/tests/bibtex_parity_test.rs +0 -421
- data/vendor/kreuzberg/tests/config_integration_test.rs +0 -753
- data/vendor/kreuzberg/tests/data/hierarchy_ground_truth.json +0 -294
- data/vendor/kreuzberg/tests/docbook_extractor_tests.rs +0 -500
- data/vendor/kreuzberg/tests/docx_vs_pandoc_comparison.rs +0 -370
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +0 -275
- data/vendor/kreuzberg/tests/fictionbook_extractor_tests.rs +0 -228
- data/vendor/kreuzberg/tests/html_table_test.rs +0 -551
- data/vendor/kreuzberg/tests/instrumentation_test.rs +0 -139
- data/vendor/kreuzberg/tests/jats_extractor_tests.rs +0 -639
- data/vendor/kreuzberg/tests/jupyter_extractor_tests.rs +0 -704
- data/vendor/kreuzberg/tests/latex_extractor_tests.rs +0 -496
- data/vendor/kreuzberg/tests/markdown_extractor_tests.rs +0 -490
- data/vendor/kreuzberg/tests/ocr_language_registry.rs +0 -191
- data/vendor/kreuzberg/tests/odt_extractor_tests.rs +0 -674
- data/vendor/kreuzberg/tests/opml_extractor_tests.rs +0 -616
- data/vendor/kreuzberg/tests/orgmode_extractor_tests.rs +0 -822
- data/vendor/kreuzberg/tests/page_markers.rs +0 -297
- data/vendor/kreuzberg/tests/pdf_hierarchy_detection.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_hierarchy_quality.rs +0 -589
- data/vendor/kreuzberg/tests/pdf_ocr_triggering.rs +0 -301
- data/vendor/kreuzberg/tests/pdf_text_merging.rs +0 -475
- data/vendor/kreuzberg/tests/pdfium_linking.rs +0 -340
- data/vendor/kreuzberg/tests/rst_extractor_tests.rs +0 -694
- data/vendor/kreuzberg/tests/rtf_extractor_tests.rs +0 -775
- data/vendor/kreuzberg/tests/typst_behavioral_tests.rs +0 -1260
- data/vendor/kreuzberg/tests/typst_extractor_tests.rs +0 -648
- data/vendor/kreuzberg-ffi/Cargo.toml +0 -67
- data/vendor/kreuzberg-ffi/README.md +0 -851
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +0 -227
- data/vendor/kreuzberg-ffi/build.rs +0 -168
- data/vendor/kreuzberg-ffi/cbindgen.toml +0 -37
- data/vendor/kreuzberg-ffi/kreuzberg-ffi.pc.in +0 -12
- data/vendor/kreuzberg-ffi/kreuzberg.h +0 -3012
- data/vendor/kreuzberg-ffi/src/batch_streaming.rs +0 -588
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
- data/vendor/kreuzberg-ffi/src/error.rs +0 -901
- data/vendor/kreuzberg-ffi/src/extraction.rs +0 -555
- data/vendor/kreuzberg-ffi/src/helpers.rs +0 -879
- data/vendor/kreuzberg-ffi/src/lib.rs +0 -977
- data/vendor/kreuzberg-ffi/src/memory.rs +0 -493
- data/vendor/kreuzberg-ffi/src/mime.rs +0 -329
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +0 -265
- data/vendor/kreuzberg-ffi/src/plugins/document_extractor.rs +0 -442
- data/vendor/kreuzberg-ffi/src/plugins/mod.rs +0 -14
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +0 -628
- data/vendor/kreuzberg-ffi/src/plugins/post_processor.rs +0 -438
- data/vendor/kreuzberg-ffi/src/plugins/validator.rs +0 -329
- data/vendor/kreuzberg-ffi/src/result.rs +0 -510
- data/vendor/kreuzberg-ffi/src/result_pool.rs +0 -639
- data/vendor/kreuzberg-ffi/src/result_view.rs +0 -773
- data/vendor/kreuzberg-ffi/src/string_intern.rs +0 -568
- data/vendor/kreuzberg-ffi/src/types.rs +0 -363
- data/vendor/kreuzberg-ffi/src/util.rs +0 -210
- data/vendor/kreuzberg-ffi/src/validation.rs +0 -848
- data/vendor/kreuzberg-ffi/tests.disabled/README.md +0 -48
- data/vendor/kreuzberg-ffi/tests.disabled/config_loading_tests.rs +0 -299
- data/vendor/kreuzberg-ffi/tests.disabled/config_tests.rs +0 -346
- data/vendor/kreuzberg-ffi/tests.disabled/extractor_tests.rs +0 -232
- data/vendor/kreuzberg-ffi/tests.disabled/plugin_registration_tests.rs +0 -470
- data/vendor/kreuzberg-tesseract/.commitlintrc.json +0 -13
- data/vendor/kreuzberg-tesseract/.crate-ignore +0 -2
- data/vendor/kreuzberg-tesseract/Cargo.lock +0 -2933
- data/vendor/kreuzberg-tesseract/Cargo.toml +0 -57
- data/vendor/kreuzberg-tesseract/LICENSE +0 -22
- data/vendor/kreuzberg-tesseract/README.md +0 -399
- data/vendor/kreuzberg-tesseract/build.rs +0 -1127
- data/vendor/kreuzberg-tesseract/patches/README.md +0 -71
- data/vendor/kreuzberg-tesseract/patches/tesseract.diff +0 -199
- data/vendor/kreuzberg-tesseract/src/api.rs +0 -1371
- data/vendor/kreuzberg-tesseract/src/choice_iterator.rs +0 -77
- data/vendor/kreuzberg-tesseract/src/enums.rs +0 -297
- data/vendor/kreuzberg-tesseract/src/error.rs +0 -81
- data/vendor/kreuzberg-tesseract/src/lib.rs +0 -145
- data/vendor/kreuzberg-tesseract/src/monitor.rs +0 -57
- data/vendor/kreuzberg-tesseract/src/mutable_iterator.rs +0 -197
- data/vendor/kreuzberg-tesseract/src/page_iterator.rs +0 -253
- data/vendor/kreuzberg-tesseract/src/result_iterator.rs +0 -286
- data/vendor/kreuzberg-tesseract/src/result_renderer.rs +0 -183
- data/vendor/kreuzberg-tesseract/tests/integration_test.rs +0 -211
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
#![cfg(all(feature = "tokio-runtime", feature = "office"))]
|
|
2
|
-
|
|
3
1
|
//! PowerPoint presentation extractor.
|
|
4
2
|
|
|
5
3
|
use crate::Result;
|
|
@@ -45,10 +43,8 @@ impl PptxExtractor {
|
|
|
45
43
|
for image in &mut images {
|
|
46
44
|
let image_data = image.data.clone();
|
|
47
45
|
let tess_config_clone = tess_config.clone();
|
|
48
|
-
let span = tracing::Span::current();
|
|
49
46
|
|
|
50
47
|
let ocr_result = tokio::task::spawn_blocking(move || {
|
|
51
|
-
let _guard = span.entered();
|
|
52
48
|
let cache_dir = std::env::var("KREUZBERG_CACHE_DIR").ok().map(std::path::PathBuf::from);
|
|
53
49
|
|
|
54
50
|
let proc = OcrProcessor::new(cache_dir)?;
|
|
@@ -71,7 +67,6 @@ impl PptxExtractor {
|
|
|
71
67
|
detected_languages: None,
|
|
72
68
|
chunks: None,
|
|
73
69
|
images: None,
|
|
74
|
-
pages: None,
|
|
75
70
|
};
|
|
76
71
|
image.ocr_result = Some(Box::new(extraction_result));
|
|
77
72
|
}
|
|
@@ -105,13 +100,6 @@ impl Plugin for PptxExtractor {
|
|
|
105
100
|
|
|
106
101
|
#[async_trait]
|
|
107
102
|
impl DocumentExtractor for PptxExtractor {
|
|
108
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
109
|
-
skip(self, content, config),
|
|
110
|
-
fields(
|
|
111
|
-
extractor.name = self.name(),
|
|
112
|
-
content.size_bytes = content.len(),
|
|
113
|
-
)
|
|
114
|
-
))]
|
|
115
103
|
async fn extract_bytes(
|
|
116
104
|
&self,
|
|
117
105
|
content: &[u8],
|
|
@@ -120,18 +108,18 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
120
108
|
) -> Result<ExtractionResult> {
|
|
121
109
|
let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
|
|
122
110
|
|
|
123
|
-
|
|
111
|
+
// Extract PPTX content
|
|
124
112
|
let pptx_result = if crate::core::batch_mode::is_batch_mode() {
|
|
113
|
+
// Batch mode: Use spawn_blocking for parallelism
|
|
125
114
|
let content_owned = content.to_vec();
|
|
126
|
-
let span = tracing::Span::current();
|
|
127
115
|
tokio::task::spawn_blocking(move || {
|
|
128
|
-
|
|
129
|
-
crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images, pages_config.as_ref())
|
|
116
|
+
crate::extraction::pptx::extract_pptx_from_bytes(&content_owned, extract_images)
|
|
130
117
|
})
|
|
131
118
|
.await
|
|
132
119
|
.map_err(|e| crate::error::KreuzbergError::parsing(format!("PPTX extraction task failed: {}", e)))??
|
|
133
120
|
} else {
|
|
134
|
-
|
|
121
|
+
// Single-file mode: Direct extraction (no spawn overhead)
|
|
122
|
+
crate::extraction::pptx::extract_pptx_from_bytes(content, extract_images)?
|
|
135
123
|
};
|
|
136
124
|
|
|
137
125
|
let mut additional = std::collections::HashMap::new();
|
|
@@ -139,41 +127,28 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
139
127
|
additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
|
|
140
128
|
additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
|
|
141
129
|
|
|
142
|
-
let images = if
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
{
|
|
152
|
-
Some(pptx_result.images)
|
|
153
|
-
}
|
|
154
|
-
} else {
|
|
155
|
-
Some(vec![])
|
|
130
|
+
let images = if !pptx_result.images.is_empty() {
|
|
131
|
+
#[cfg(feature = "ocr")]
|
|
132
|
+
{
|
|
133
|
+
let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
|
|
134
|
+
Some(processed_images)
|
|
135
|
+
}
|
|
136
|
+
#[cfg(not(feature = "ocr"))]
|
|
137
|
+
{
|
|
138
|
+
Some(pptx_result.images)
|
|
156
139
|
}
|
|
157
140
|
} else {
|
|
158
|
-
// Image extraction is disabled
|
|
159
141
|
None
|
|
160
142
|
};
|
|
161
143
|
|
|
162
|
-
let mut metadata = Metadata {
|
|
163
|
-
format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
|
|
164
|
-
additional,
|
|
165
|
-
..Default::default()
|
|
166
|
-
};
|
|
167
|
-
|
|
168
|
-
if let Some(page_structure) = pptx_result.page_structure {
|
|
169
|
-
metadata.pages = Some(page_structure);
|
|
170
|
-
}
|
|
171
|
-
|
|
172
144
|
Ok(ExtractionResult {
|
|
173
145
|
content: pptx_result.content,
|
|
174
146
|
mime_type: mime_type.to_string(),
|
|
175
|
-
metadata
|
|
176
|
-
|
|
147
|
+
metadata: Metadata {
|
|
148
|
+
format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
|
|
149
|
+
additional,
|
|
150
|
+
..Default::default()
|
|
151
|
+
},
|
|
177
152
|
tables: vec![],
|
|
178
153
|
detected_languages: None,
|
|
179
154
|
chunks: None,
|
|
@@ -181,12 +156,6 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
181
156
|
})
|
|
182
157
|
}
|
|
183
158
|
|
|
184
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
185
|
-
skip(self, path, config),
|
|
186
|
-
fields(
|
|
187
|
-
extractor.name = self.name(),
|
|
188
|
-
)
|
|
189
|
-
))]
|
|
190
159
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
191
160
|
let path_str = path
|
|
192
161
|
.to_str()
|
|
@@ -194,49 +163,35 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
194
163
|
|
|
195
164
|
let extract_images = config.images.as_ref().is_some_and(|img| img.extract_images);
|
|
196
165
|
|
|
197
|
-
let pptx_result =
|
|
198
|
-
crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images, config.pages.as_ref())?;
|
|
166
|
+
let pptx_result = crate::extraction::pptx::extract_pptx_from_path(path_str, extract_images)?;
|
|
199
167
|
|
|
200
168
|
let mut additional = std::collections::HashMap::new();
|
|
201
169
|
additional.insert("slide_count".to_string(), serde_json::json!(pptx_result.slide_count));
|
|
202
170
|
additional.insert("image_count".to_string(), serde_json::json!(pptx_result.image_count));
|
|
203
171
|
additional.insert("table_count".to_string(), serde_json::json!(pptx_result.table_count));
|
|
204
172
|
|
|
205
|
-
let images = if
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
{
|
|
215
|
-
Some(pptx_result.images)
|
|
216
|
-
}
|
|
217
|
-
} else {
|
|
218
|
-
Some(vec![])
|
|
173
|
+
let images = if !pptx_result.images.is_empty() {
|
|
174
|
+
#[cfg(feature = "ocr")]
|
|
175
|
+
{
|
|
176
|
+
let processed_images = self.process_images_with_ocr(pptx_result.images, config).await?;
|
|
177
|
+
Some(processed_images)
|
|
178
|
+
}
|
|
179
|
+
#[cfg(not(feature = "ocr"))]
|
|
180
|
+
{
|
|
181
|
+
Some(pptx_result.images)
|
|
219
182
|
}
|
|
220
183
|
} else {
|
|
221
|
-
// Image extraction is disabled
|
|
222
184
|
None
|
|
223
185
|
};
|
|
224
186
|
|
|
225
|
-
let mut metadata = Metadata {
|
|
226
|
-
format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
|
|
227
|
-
additional,
|
|
228
|
-
..Default::default()
|
|
229
|
-
};
|
|
230
|
-
|
|
231
|
-
if let Some(page_structure) = pptx_result.page_structure {
|
|
232
|
-
metadata.pages = Some(page_structure);
|
|
233
|
-
}
|
|
234
|
-
|
|
235
187
|
Ok(ExtractionResult {
|
|
236
188
|
content: pptx_result.content,
|
|
237
189
|
mime_type: mime_type.to_string(),
|
|
238
|
-
metadata
|
|
239
|
-
|
|
190
|
+
metadata: Metadata {
|
|
191
|
+
format: Some(crate::types::FormatMetadata::Pptx(pptx_result.metadata)),
|
|
192
|
+
additional,
|
|
193
|
+
..Default::default()
|
|
194
|
+
},
|
|
240
195
|
tables: vec![],
|
|
241
196
|
detected_languages: None,
|
|
242
197
|
chunks: None,
|
|
@@ -5,7 +5,6 @@ use crate::core::config::ExtractionConfig;
|
|
|
5
5
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
6
|
use crate::types::{ExtractionResult, Metadata};
|
|
7
7
|
use async_trait::async_trait;
|
|
8
|
-
#[cfg(feature = "tokio-runtime")]
|
|
9
8
|
use std::path::Path;
|
|
10
9
|
|
|
11
10
|
/// Structured data extractor supporting JSON, YAML, and TOML.
|
|
@@ -43,13 +42,6 @@ impl Plugin for StructuredExtractor {
|
|
|
43
42
|
|
|
44
43
|
#[async_trait]
|
|
45
44
|
impl DocumentExtractor for StructuredExtractor {
|
|
46
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
47
|
-
skip(self, content, _config),
|
|
48
|
-
fields(
|
|
49
|
-
extractor.name = self.name(),
|
|
50
|
-
content.size_bytes = content.len(),
|
|
51
|
-
)
|
|
52
|
-
))]
|
|
53
45
|
async fn extract_bytes(
|
|
54
46
|
&self,
|
|
55
47
|
content: &[u8],
|
|
@@ -81,7 +73,6 @@ impl DocumentExtractor for StructuredExtractor {
|
|
|
81
73
|
additional,
|
|
82
74
|
..Default::default()
|
|
83
75
|
},
|
|
84
|
-
pages: None,
|
|
85
76
|
tables: vec![],
|
|
86
77
|
detected_languages: None,
|
|
87
78
|
chunks: None,
|
|
@@ -89,13 +80,6 @@ impl DocumentExtractor for StructuredExtractor {
|
|
|
89
80
|
})
|
|
90
81
|
}
|
|
91
82
|
|
|
92
|
-
#[cfg(feature = "tokio-runtime")]
|
|
93
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
94
|
-
skip(self, path, config),
|
|
95
|
-
fields(
|
|
96
|
-
extractor.name = self.name(),
|
|
97
|
-
)
|
|
98
|
-
))]
|
|
99
83
|
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
100
84
|
let bytes = tokio::fs::read(path).await?;
|
|
101
85
|
self.extract_bytes(&bytes, mime_type, config).await
|
|
@@ -53,40 +53,28 @@ impl Plugin for PlainTextExtractor {
|
|
|
53
53
|
|
|
54
54
|
#[async_trait]
|
|
55
55
|
impl DocumentExtractor for PlainTextExtractor {
|
|
56
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
57
|
-
skip(self, content, _config),
|
|
58
|
-
fields(
|
|
59
|
-
extractor.name = self.name(),
|
|
60
|
-
content.size_bytes = content.len(),
|
|
61
|
-
)
|
|
62
|
-
))]
|
|
63
56
|
async fn extract_bytes(
|
|
64
57
|
&self,
|
|
65
58
|
content: &[u8],
|
|
66
59
|
mime_type: &str,
|
|
67
60
|
_config: &ExtractionConfig,
|
|
68
61
|
) -> Result<ExtractionResult> {
|
|
69
|
-
let
|
|
70
|
-
let text = text.trim_end_matches('\n').trim_end_matches('\r').to_string();
|
|
71
|
-
let line_count = text.lines().count();
|
|
72
|
-
let word_count = text.split_whitespace().count();
|
|
73
|
-
let character_count = text.len();
|
|
62
|
+
let text_result = parse_text(content, false)?;
|
|
74
63
|
|
|
75
64
|
Ok(ExtractionResult {
|
|
76
|
-
content:
|
|
65
|
+
content: text_result.content,
|
|
77
66
|
mime_type: mime_type.to_string(),
|
|
78
67
|
metadata: crate::types::Metadata {
|
|
79
68
|
format: Some(crate::types::FormatMetadata::Text(crate::types::TextMetadata {
|
|
80
|
-
line_count,
|
|
81
|
-
word_count,
|
|
82
|
-
character_count,
|
|
69
|
+
line_count: text_result.line_count,
|
|
70
|
+
word_count: text_result.word_count,
|
|
71
|
+
character_count: text_result.character_count,
|
|
83
72
|
headers: None,
|
|
84
73
|
links: None,
|
|
85
74
|
code_blocks: None,
|
|
86
75
|
})),
|
|
87
76
|
..Default::default()
|
|
88
77
|
},
|
|
89
|
-
pages: None,
|
|
90
78
|
tables: vec![],
|
|
91
79
|
detected_languages: None,
|
|
92
80
|
chunks: None,
|
|
@@ -95,7 +83,7 @@ impl DocumentExtractor for PlainTextExtractor {
|
|
|
95
83
|
}
|
|
96
84
|
|
|
97
85
|
fn supported_mime_types(&self) -> &[&str] {
|
|
98
|
-
&["text/plain"
|
|
86
|
+
&["text/plain"]
|
|
99
87
|
}
|
|
100
88
|
|
|
101
89
|
fn priority(&self) -> i32 {
|
|
@@ -150,13 +138,6 @@ impl Plugin for MarkdownExtractor {
|
|
|
150
138
|
|
|
151
139
|
#[async_trait]
|
|
152
140
|
impl DocumentExtractor for MarkdownExtractor {
|
|
153
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
154
|
-
skip(self, content, _config),
|
|
155
|
-
fields(
|
|
156
|
-
extractor.name = self.name(),
|
|
157
|
-
content.size_bytes = content.len(),
|
|
158
|
-
)
|
|
159
|
-
))]
|
|
160
141
|
async fn extract_bytes(
|
|
161
142
|
&self,
|
|
162
143
|
content: &[u8],
|
|
@@ -179,7 +160,6 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
179
160
|
})),
|
|
180
161
|
..Default::default()
|
|
181
162
|
},
|
|
182
|
-
pages: None,
|
|
183
163
|
tables: vec![],
|
|
184
164
|
detected_languages: None,
|
|
185
165
|
chunks: None,
|
|
@@ -247,10 +227,7 @@ mod tests {
|
|
|
247
227
|
let extractor = PlainTextExtractor::new();
|
|
248
228
|
assert_eq!(extractor.name(), "plain-text-extractor");
|
|
249
229
|
assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
|
|
250
|
-
assert_eq!(
|
|
251
|
-
extractor.supported_mime_types(),
|
|
252
|
-
&["text/plain", "text/csv", "text/tab-separated-values"]
|
|
253
|
-
);
|
|
230
|
+
assert_eq!(extractor.supported_mime_types(), &["text/plain"]);
|
|
254
231
|
assert_eq!(extractor.priority(), 50);
|
|
255
232
|
}
|
|
256
233
|
|
|
@@ -3,7 +3,6 @@
|
|
|
3
3
|
use crate::Result;
|
|
4
4
|
use crate::core::config::ExtractionConfig;
|
|
5
5
|
use crate::extraction::xml::parse_xml;
|
|
6
|
-
use crate::extractors::SyncExtractor;
|
|
7
6
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
8
7
|
use crate::types::ExtractionResult;
|
|
9
8
|
use async_trait::async_trait;
|
|
@@ -52,8 +51,14 @@ impl Plugin for XmlExtractor {
|
|
|
52
51
|
}
|
|
53
52
|
}
|
|
54
53
|
|
|
55
|
-
|
|
56
|
-
|
|
54
|
+
#[async_trait]
|
|
55
|
+
impl DocumentExtractor for XmlExtractor {
|
|
56
|
+
async fn extract_bytes(
|
|
57
|
+
&self,
|
|
58
|
+
content: &[u8],
|
|
59
|
+
mime_type: &str,
|
|
60
|
+
_config: &ExtractionConfig,
|
|
61
|
+
) -> Result<ExtractionResult> {
|
|
57
62
|
let xml_result = parse_xml(content, false)?;
|
|
58
63
|
|
|
59
64
|
Ok(ExtractionResult {
|
|
@@ -70,28 +75,8 @@ impl SyncExtractor for XmlExtractor {
|
|
|
70
75
|
detected_languages: None,
|
|
71
76
|
chunks: None,
|
|
72
77
|
images: None,
|
|
73
|
-
pages: None,
|
|
74
78
|
})
|
|
75
79
|
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
#[async_trait]
|
|
79
|
-
impl DocumentExtractor for XmlExtractor {
|
|
80
|
-
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
81
|
-
skip(self, content, config),
|
|
82
|
-
fields(
|
|
83
|
-
extractor.name = self.name(),
|
|
84
|
-
content.size_bytes = content.len(),
|
|
85
|
-
)
|
|
86
|
-
))]
|
|
87
|
-
async fn extract_bytes(
|
|
88
|
-
&self,
|
|
89
|
-
content: &[u8],
|
|
90
|
-
mime_type: &str,
|
|
91
|
-
config: &ExtractionConfig,
|
|
92
|
-
) -> Result<ExtractionResult> {
|
|
93
|
-
self.extract_sync(content, mime_type, config)
|
|
94
|
-
}
|
|
95
80
|
|
|
96
81
|
fn supported_mime_types(&self) -> &[&str] {
|
|
97
82
|
&["application/xml", "text/xml", "image/svg+xml"]
|
|
@@ -100,10 +85,6 @@ impl DocumentExtractor for XmlExtractor {
|
|
|
100
85
|
fn priority(&self) -> i32 {
|
|
101
86
|
50
|
|
102
87
|
}
|
|
103
|
-
|
|
104
|
-
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
105
|
-
Some(self)
|
|
106
|
-
}
|
|
107
88
|
}
|
|
108
89
|
|
|
109
90
|
#[cfg(test)]
|
|
@@ -45,8 +45,7 @@ impl Plugin for KeywordExtractor {
|
|
|
45
45
|
}
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
#[
|
|
49
|
-
#[cfg_attr(target_arch = "wasm32", async_trait(?Send))]
|
|
48
|
+
#[async_trait]
|
|
50
49
|
impl PostProcessor for KeywordExtractor {
|
|
51
50
|
async fn process(&self, result: &mut ExtractionResult, config: &ExtractionConfig) -> Result<()> {
|
|
52
51
|
let keyword_config = match &config.keywords {
|
|
@@ -113,7 +112,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
113
112
|
detected_languages: None,
|
|
114
113
|
chunks: None,
|
|
115
114
|
images: None,
|
|
116
|
-
pages: None,
|
|
117
115
|
};
|
|
118
116
|
|
|
119
117
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -142,7 +140,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
142
140
|
detected_languages: None,
|
|
143
141
|
chunks: None,
|
|
144
142
|
images: None,
|
|
145
|
-
pages: None,
|
|
146
143
|
};
|
|
147
144
|
|
|
148
145
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -167,7 +164,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
167
164
|
detected_languages: None,
|
|
168
165
|
chunks: None,
|
|
169
166
|
images: None,
|
|
170
|
-
pages: None,
|
|
171
167
|
};
|
|
172
168
|
|
|
173
169
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -192,7 +188,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
192
188
|
detected_languages: None,
|
|
193
189
|
chunks: None,
|
|
194
190
|
images: None,
|
|
195
|
-
pages: None,
|
|
196
191
|
};
|
|
197
192
|
|
|
198
193
|
processor.process(&mut result, &config).await.unwrap();
|
|
@@ -228,7 +223,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
228
223
|
detected_languages: None,
|
|
229
224
|
chunks: None,
|
|
230
225
|
images: None,
|
|
231
|
-
pages: None,
|
|
232
226
|
};
|
|
233
227
|
|
|
234
228
|
let config_with_keywords = ExtractionConfig {
|
|
@@ -253,7 +247,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
253
247
|
detected_languages: None,
|
|
254
248
|
chunks: None,
|
|
255
249
|
images: None,
|
|
256
|
-
pages: None,
|
|
257
250
|
};
|
|
258
251
|
|
|
259
252
|
let long_result = ExtractionResult {
|
|
@@ -264,7 +257,6 @@ machine learning that uses neural networks with multiple layers.
|
|
|
264
257
|
detected_languages: None,
|
|
265
258
|
chunks: None,
|
|
266
259
|
images: None,
|
|
267
|
-
pages: None,
|
|
268
260
|
};
|
|
269
261
|
|
|
270
262
|
let short_duration = processor.estimated_duration_ms(&short_result);
|
|
@@ -248,6 +248,7 @@ mod tests {
|
|
|
248
248
|
let english_text = "Natural language processing is a subfield of artificial intelligence.";
|
|
249
249
|
let config = KeywordConfig::rake().with_language("fr");
|
|
250
250
|
let keywords = extract_keywords_rake(english_text, &config).unwrap();
|
|
251
|
+
dbg!(&keywords);
|
|
251
252
|
assert!(
|
|
252
253
|
!keywords.is_empty(),
|
|
253
254
|
"Should fall back to English stopwords and extract keywords"
|
|
@@ -4,13 +4,8 @@
|
|
|
4
4
|
|
|
5
5
|
use crate::Result;
|
|
6
6
|
use crate::core::config::LanguageDetectionConfig;
|
|
7
|
-
use once_cell::sync::Lazy;
|
|
8
|
-
use std::sync::Arc;
|
|
9
7
|
use whatlang::{Lang, detect};
|
|
10
8
|
|
|
11
|
-
pub mod processor;
|
|
12
|
-
pub use processor::LanguageDetector;
|
|
13
|
-
|
|
14
9
|
/// Detect languages in text using whatlang.
|
|
15
10
|
///
|
|
16
11
|
/// Returns a list of detected language codes (ISO 639-3 format).
|
|
@@ -185,44 +180,6 @@ fn lang_to_iso639_3(lang: Lang) -> String {
|
|
|
185
180
|
.to_string()
|
|
186
181
|
}
|
|
187
182
|
|
|
188
|
-
/// Register the language detection processor with the global registry.
|
|
189
|
-
///
|
|
190
|
-
/// This function should be called once at application startup to register
|
|
191
|
-
/// the language detection post-processor.
|
|
192
|
-
///
|
|
193
|
-
/// **Note:** This is called automatically on first use.
|
|
194
|
-
/// Explicit calling is optional.
|
|
195
|
-
pub fn register_language_detection_processor() -> Result<()> {
|
|
196
|
-
let registry = crate::plugins::registry::get_post_processor_registry();
|
|
197
|
-
let mut registry = registry
|
|
198
|
-
.write()
|
|
199
|
-
.map_err(|e| crate::KreuzbergError::Other(format!("Post-processor registry lock poisoned: {}", e)))?;
|
|
200
|
-
|
|
201
|
-
registry.register(Arc::new(LanguageDetector), 40)?;
|
|
202
|
-
|
|
203
|
-
Ok(())
|
|
204
|
-
}
|
|
205
|
-
|
|
206
|
-
/// Lazy-initialized flag that ensures language detection processor is registered exactly once.
|
|
207
|
-
///
|
|
208
|
-
/// This static is accessed on first use to automatically register the
|
|
209
|
-
/// language detection processor with the plugin registry.
|
|
210
|
-
static PROCESSOR_INITIALIZED: Lazy<Result<()>> = Lazy::new(register_language_detection_processor);
|
|
211
|
-
|
|
212
|
-
/// Ensure the language detection processor is registered.
|
|
213
|
-
///
|
|
214
|
-
/// This function is called automatically when needed.
|
|
215
|
-
/// It's safe to call multiple times - registration only happens once.
|
|
216
|
-
pub fn ensure_initialized() -> Result<()> {
|
|
217
|
-
PROCESSOR_INITIALIZED
|
|
218
|
-
.as_ref()
|
|
219
|
-
.map(|_| ())
|
|
220
|
-
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
221
|
-
message: format!("Failed to register language detection processor: {}", e),
|
|
222
|
-
plugin_name: "language-detection".to_string(),
|
|
223
|
-
})
|
|
224
|
-
}
|
|
225
|
-
|
|
226
183
|
#[cfg(test)]
|
|
227
184
|
mod tests {
|
|
228
185
|
use super::*;
|
|
@@ -719,57 +676,6 @@ mod tests {
|
|
|
719
676
|
assert_eq!(langs[0], "eng");
|
|
720
677
|
}
|
|
721
678
|
|
|
722
|
-
#[test]
|
|
723
|
-
fn test_medical_terminology() {
|
|
724
|
-
let text = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
|
|
725
|
-
The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
|
|
726
|
-
Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";
|
|
727
|
-
let config = LanguageDetectionConfig {
|
|
728
|
-
enabled: true,
|
|
729
|
-
min_confidence: 0.5,
|
|
730
|
-
detect_multiple: false,
|
|
731
|
-
};
|
|
732
|
-
|
|
733
|
-
let result = detect_languages(text, &config).unwrap();
|
|
734
|
-
assert!(result.is_some());
|
|
735
|
-
let langs = result.unwrap();
|
|
736
|
-
assert_eq!(langs[0], "eng");
|
|
737
|
-
}
|
|
738
|
-
|
|
739
|
-
#[test]
|
|
740
|
-
fn test_legal_terminology() {
|
|
741
|
-
let text = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
|
|
742
|
-
Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
|
|
743
|
-
The court finds that the preponderance of evidence supports the plaintiff's claims.";
|
|
744
|
-
let config = LanguageDetectionConfig {
|
|
745
|
-
enabled: true,
|
|
746
|
-
min_confidence: 0.5,
|
|
747
|
-
detect_multiple: false,
|
|
748
|
-
};
|
|
749
|
-
|
|
750
|
-
let result = detect_languages(text, &config).unwrap();
|
|
751
|
-
assert!(result.is_some());
|
|
752
|
-
let langs = result.unwrap();
|
|
753
|
-
assert_eq!(langs[0], "eng");
|
|
754
|
-
}
|
|
755
|
-
|
|
756
|
-
#[test]
|
|
757
|
-
fn test_scientific_terminology() {
|
|
758
|
-
let text = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
|
|
759
|
-
Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
|
|
760
|
-
The results demonstrated significant correlation between molecular structure and optical properties.";
|
|
761
|
-
let config = LanguageDetectionConfig {
|
|
762
|
-
enabled: true,
|
|
763
|
-
min_confidence: 0.5,
|
|
764
|
-
detect_multiple: false,
|
|
765
|
-
};
|
|
766
|
-
|
|
767
|
-
let result = detect_languages(text, &config).unwrap();
|
|
768
|
-
assert!(result.is_some());
|
|
769
|
-
let langs = result.unwrap();
|
|
770
|
-
assert_eq!(langs[0], "eng");
|
|
771
|
-
}
|
|
772
|
-
|
|
773
679
|
#[test]
|
|
774
680
|
fn test_code_with_comments() {
|
|
775
681
|
let text = r#"
|
|
@@ -845,6 +751,57 @@ mod tests {
|
|
|
845
751
|
assert_eq!(langs[0], "eng");
|
|
846
752
|
}
|
|
847
753
|
|
|
754
|
+
#[test]
|
|
755
|
+
fn test_medical_terminology() {
|
|
756
|
+
let text = "The patient presented with acute myocardial infarction and was administered thrombolytic therapy. \
|
|
757
|
+
The electrocardiogram showed significant ST-segment elevation in the anterior leads. \
|
|
758
|
+
Cardiac biomarkers including troponin and creatine kinase were significantly elevated.";
|
|
759
|
+
let config = LanguageDetectionConfig {
|
|
760
|
+
enabled: true,
|
|
761
|
+
min_confidence: 0.5,
|
|
762
|
+
detect_multiple: false,
|
|
763
|
+
};
|
|
764
|
+
|
|
765
|
+
let result = detect_languages(text, &config).unwrap();
|
|
766
|
+
assert!(result.is_some());
|
|
767
|
+
let langs = result.unwrap();
|
|
768
|
+
assert_eq!(langs[0], "eng");
|
|
769
|
+
}
|
|
770
|
+
|
|
771
|
+
#[test]
|
|
772
|
+
fn test_legal_terminology() {
|
|
773
|
+
let text = "The plaintiff hereby alleges that the defendant breached the contractual obligations as stipulated in the aforementioned agreement. \
|
|
774
|
+
Pursuant to clause 5.2, the defendant was required to provide adequate consideration within thirty days of execution. \
|
|
775
|
+
The court finds that the preponderance of evidence supports the plaintiff's claims.";
|
|
776
|
+
let config = LanguageDetectionConfig {
|
|
777
|
+
enabled: true,
|
|
778
|
+
min_confidence: 0.5,
|
|
779
|
+
detect_multiple: false,
|
|
780
|
+
};
|
|
781
|
+
|
|
782
|
+
let result = detect_languages(text, &config).unwrap();
|
|
783
|
+
assert!(result.is_some());
|
|
784
|
+
let langs = result.unwrap();
|
|
785
|
+
assert_eq!(langs[0], "eng");
|
|
786
|
+
}
|
|
787
|
+
|
|
788
|
+
#[test]
|
|
789
|
+
fn test_scientific_terminology() {
|
|
790
|
+
let text = "The experimental protocol involved spectrophotometric analysis using ultraviolet-visible spectroscopy. \
|
|
791
|
+
Quantum mechanical calculations were performed using density functional theory at the B3LYP level. \
|
|
792
|
+
The results demonstrated significant correlation between molecular structure and optical properties.";
|
|
793
|
+
let config = LanguageDetectionConfig {
|
|
794
|
+
enabled: true,
|
|
795
|
+
min_confidence: 0.5,
|
|
796
|
+
detect_multiple: false,
|
|
797
|
+
};
|
|
798
|
+
|
|
799
|
+
let result = detect_languages(text, &config).unwrap();
|
|
800
|
+
assert!(result.is_some());
|
|
801
|
+
let langs = result.unwrap();
|
|
802
|
+
assert_eq!(langs[0], "eng");
|
|
803
|
+
}
|
|
804
|
+
|
|
848
805
|
#[test]
|
|
849
806
|
fn test_latin_cyrillic_mix() {
|
|
850
807
|
let text = format!(
|
data/vendor/kreuzberg/src/lib.rs
CHANGED
|
@@ -39,10 +39,11 @@ pub mod core;
|
|
|
39
39
|
pub mod error;
|
|
40
40
|
pub mod extraction;
|
|
41
41
|
pub mod extractors;
|
|
42
|
-
pub mod panic_context;
|
|
43
42
|
pub mod plugins;
|
|
44
43
|
pub mod text;
|
|
45
44
|
pub mod types;
|
|
45
|
+
|
|
46
|
+
#[cfg(feature = "quality")]
|
|
46
47
|
pub mod utils;
|
|
47
48
|
|
|
48
49
|
#[cfg(feature = "api")]
|
|
@@ -78,34 +79,21 @@ pub mod pdf;
|
|
|
78
79
|
pub use error::{KreuzbergError, Result};
|
|
79
80
|
pub use types::*;
|
|
80
81
|
|
|
81
|
-
|
|
82
|
-
pub use core::extractor::{batch_extract_bytes, batch_extract_file};
|
|
83
|
-
pub use core::extractor::{extract_bytes, extract_file};
|
|
84
|
-
|
|
85
|
-
pub use core::extractor::{batch_extract_bytes_sync, extract_bytes_sync};
|
|
82
|
+
pub use core::extractor::{batch_extract_bytes, batch_extract_file, extract_bytes, extract_file};
|
|
86
83
|
|
|
87
|
-
|
|
88
|
-
pub use core::extractor::{batch_extract_file_sync, extract_file_sync};
|
|
84
|
+
pub use core::extractor::{batch_extract_bytes_sync, batch_extract_file_sync, extract_bytes_sync, extract_file_sync};
|
|
89
85
|
|
|
90
86
|
pub use core::config::{
|
|
91
87
|
ChunkingConfig, EmbeddingConfig, EmbeddingModelType, ExtractionConfig, ImageExtractionConfig,
|
|
92
|
-
LanguageDetectionConfig, OcrConfig, PostProcessorConfig, TokenReductionConfig,
|
|
88
|
+
LanguageDetectionConfig, OcrConfig, PdfConfig, PostProcessorConfig, TokenReductionConfig,
|
|
93
89
|
};
|
|
94
90
|
|
|
95
|
-
#[cfg(feature = "api")]
|
|
96
|
-
pub use core::server_config::ServerConfig;
|
|
97
|
-
|
|
98
|
-
#[cfg(feature = "pdf")]
|
|
99
|
-
pub use core::config::PdfConfig;
|
|
100
|
-
|
|
101
91
|
pub use core::mime::{
|
|
102
92
|
DOCX_MIME_TYPE, EXCEL_MIME_TYPE, HTML_MIME_TYPE, JSON_MIME_TYPE, MARKDOWN_MIME_TYPE, PDF_MIME_TYPE,
|
|
103
93
|
PLAIN_TEXT_MIME_TYPE, POWER_POINT_MIME_TYPE, XML_MIME_TYPE, detect_mime_type, detect_mime_type_from_bytes,
|
|
104
94
|
detect_or_validate, get_extensions_for_mime, validate_mime_type,
|
|
105
95
|
};
|
|
106
96
|
|
|
107
|
-
pub use core::formats::{KNOWN_FORMATS, is_valid_format_field};
|
|
108
|
-
|
|
109
97
|
pub use plugins::registry::{
|
|
110
98
|
get_document_extractor_registry, get_ocr_backend_registry, get_post_processor_registry, get_validator_registry,
|
|
111
99
|
};
|