kreuzberg 4.0.8 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +99 -2
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/spec/fixtures/config.toml +1 -1
- data/spec/fixtures/config.yaml +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +5 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mime.rs +15 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +201 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
//! OCR processor implementation using Tesseract.
|
|
2
|
+
//!
|
|
3
|
+
//! This module has been split into focused submodules for better organization:
|
|
4
|
+
//! - `validation` - Image and configuration validation
|
|
5
|
+
//! - `config` - Configuration hashing and Tesseract variables
|
|
6
|
+
//! - `execution` - Core OCR execution logic
|
|
7
|
+
|
|
8
|
+
mod config;
|
|
9
|
+
mod execution;
|
|
10
|
+
mod validation;
|
|
11
|
+
|
|
12
|
+
use crate::ocr::cache::OcrCache;
|
|
13
|
+
use crate::ocr::error::OcrError;
|
|
14
|
+
use crate::ocr::types::{BatchItemResult, TesseractConfig};
|
|
15
|
+
use crate::types::OcrExtractionResult;
|
|
16
|
+
|
|
17
|
+
pub struct OcrProcessor {
|
|
18
|
+
cache: OcrCache,
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
impl OcrProcessor {
|
|
22
|
+
pub fn new(cache_dir: Option<std::path::PathBuf>) -> Result<Self, OcrError> {
|
|
23
|
+
let cache = OcrCache::new(cache_dir)?;
|
|
24
|
+
Ok(Self { cache })
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
28
|
+
skip(self, image_bytes),
|
|
29
|
+
fields(
|
|
30
|
+
ocr.backend = "tesseract",
|
|
31
|
+
ocr.language = %config.language,
|
|
32
|
+
image.size_bytes = image_bytes.len(),
|
|
33
|
+
)
|
|
34
|
+
))]
|
|
35
|
+
pub fn process_image(&self, image_bytes: &[u8], config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
|
|
36
|
+
execution::process_image_with_cache(image_bytes, config, &self.cache, None)
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/// Process an image with OCR and respect the output format from ExtractionConfig.
|
|
40
|
+
///
|
|
41
|
+
/// This variant allows specifying an output format (Plain, Markdown, Djot) which
|
|
42
|
+
/// affects how the OCR result's mime_type is set when markdown output is requested.
|
|
43
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
44
|
+
skip(self, image_bytes),
|
|
45
|
+
fields(
|
|
46
|
+
ocr.backend = "tesseract",
|
|
47
|
+
ocr.language = %config.language,
|
|
48
|
+
image.size_bytes = image_bytes.len(),
|
|
49
|
+
)
|
|
50
|
+
))]
|
|
51
|
+
pub fn process_image_with_format(
|
|
52
|
+
&self,
|
|
53
|
+
image_bytes: &[u8],
|
|
54
|
+
config: &TesseractConfig,
|
|
55
|
+
output_format: crate::core::config::OutputFormat,
|
|
56
|
+
) -> Result<OcrExtractionResult, OcrError> {
|
|
57
|
+
execution::process_image_with_cache(image_bytes, config, &self.cache, Some(output_format))
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
pub fn clear_cache(&self) -> Result<(), OcrError> {
|
|
61
|
+
self.cache.clear()
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
pub fn get_cache_stats(&self) -> Result<super::cache::OcrCacheStats, OcrError> {
|
|
65
|
+
self.cache.get_stats()
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
pub fn process_file(&self, file_path: &str, config: &TesseractConfig) -> Result<OcrExtractionResult, OcrError> {
|
|
69
|
+
execution::process_file_with_cache(file_path, config, &self.cache, None)
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/// Process a file with OCR and respect the output format from ExtractionConfig.
|
|
73
|
+
///
|
|
74
|
+
/// This variant allows specifying an output format (Plain, Markdown, Djot) which
|
|
75
|
+
/// affects how the OCR result's mime_type is set when markdown output is requested.
|
|
76
|
+
pub fn process_file_with_format(
|
|
77
|
+
&self,
|
|
78
|
+
file_path: &str,
|
|
79
|
+
config: &TesseractConfig,
|
|
80
|
+
output_format: crate::core::config::OutputFormat,
|
|
81
|
+
) -> Result<OcrExtractionResult, OcrError> {
|
|
82
|
+
execution::process_file_with_cache(file_path, config, &self.cache, Some(output_format))
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/// Process multiple image files in parallel using Rayon.
|
|
86
|
+
///
|
|
87
|
+
/// This method processes OCR operations in parallel across CPU cores for improved throughput.
|
|
88
|
+
/// Results are returned in the same order as the input file paths.
|
|
89
|
+
pub fn process_files_batch(&self, file_paths: Vec<String>, config: &TesseractConfig) -> Vec<BatchItemResult> {
|
|
90
|
+
execution::process_files_batch(file_paths, config, &self.cache)
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    /// Tesseract settings shared by the tests: text output, no table
    /// detection, cache disabled, everything else left at its default.
    fn create_test_config() -> TesseractConfig {
        TesseractConfig {
            output_format: String::from("text"),
            enable_table_detection: false,
            use_cache: false,
            ..TesseractConfig::default()
        }
    }

    /// Build a processor whose cache lives in a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned as well so the directory stays alive
    /// for as long as the caller holds it.
    fn processor_in_temp_dir() -> (tempfile::TempDir, OcrProcessor) {
        let dir = tempdir().unwrap();
        let processor = OcrProcessor::new(Some(dir.path().to_path_buf())).unwrap();
        (dir, processor)
    }

    /// Assert that a batch entry is marked failed: an error and no result.
    fn assert_failed_item(item: &BatchItemResult) {
        assert!(!item.success);
        assert!(item.error.is_some());
        assert!(item.result.is_none());
    }

    #[test]
    fn test_processor_creation() {
        let temp_dir = tempdir().unwrap();
        let cache_dir = temp_dir.path().to_path_buf();

        assert!(OcrProcessor::new(Some(cache_dir)).is_ok());
    }

    #[test]
    fn test_processor_creation_default_cache_dir() {
        assert!(OcrProcessor::new(None).is_ok());
    }

    #[test]
    fn test_cache_operations() {
        let (_temp_dir, processor) = processor_in_temp_dir();

        assert!(processor.clear_cache().is_ok());
        assert!(processor.get_cache_stats().is_ok());
    }

    #[test]
    fn test_process_file_nonexistent() {
        let (_temp_dir, processor) = processor_in_temp_dir();
        let config = create_test_config();

        let result = processor.process_file("/nonexistent/file.png", &config);
        assert!(result.is_err());

        let message = result.unwrap_err().to_string();
        assert!(message.contains("Failed to read file"));
    }

    #[test]
    fn test_process_files_batch_empty() {
        let (_temp_dir, processor) = processor_in_temp_dir();
        let config = create_test_config();

        let results = processor.process_files_batch(Vec::new(), &config);
        assert!(results.is_empty());
    }

    #[test]
    fn test_process_image_invalid_image_data() {
        let (_temp_dir, processor) = processor_in_temp_dir();
        let config = create_test_config();

        // Five arbitrary bytes that are not a valid image.
        let invalid_data: [u8; 5] = [0, 1, 2, 3, 4];

        assert!(processor.process_image(&invalid_data, &config).is_err());
    }

    #[test]
    fn test_process_files_batch_single_file() {
        let (_temp_dir, processor) = processor_in_temp_dir();
        let config = create_test_config();

        let results = processor.process_files_batch(vec![String::from("/nonexistent.png")], &config);

        assert_eq!(results.len(), 1);
        assert_failed_item(&results[0]);
    }

    #[test]
    fn test_process_files_batch_multiple_files() {
        let (_temp_dir, processor) = processor_in_temp_dir();
        let config = create_test_config();

        // "/nonexistent1.png" ..= "/nonexistent3.png"
        let file_paths: Vec<String> = (1..=3).map(|i| format!("/nonexistent{}.png", i)).collect();

        let results = processor.process_files_batch(file_paths, &config);

        assert_eq!(results.len(), 3);
        results.iter().for_each(assert_failed_item);
    }

    #[test]
    fn test_batch_item_result_structure() {
        use std::collections::HashMap;

        let extraction = OcrExtractionResult {
            content: String::from("test"),
            mime_type: String::from("text/plain"),
            metadata: HashMap::new(),
            tables: Vec::new(),
        };
        let success_result = BatchItemResult {
            file_path: String::from("test.png"),
            success: true,
            result: Some(extraction),
            error: None,
        };

        assert!(success_result.success);
        assert!(success_result.result.is_some());
        assert!(success_result.error.is_none());

        let error_result = BatchItemResult {
            file_path: String::from("error.png"),
            success: false,
            result: None,
            error: Some(String::from("Test error")),
        };

        assert_failed_item(&error_result);
    }

    #[test]
    fn test_process_files_batch_preserves_order() {
        let (_temp_dir, processor) = processor_in_temp_dir();
        let config = create_test_config();

        let file_paths: Vec<String> = ["file1.png", "file2.png", "file3.png"]
            .iter()
            .map(|name| name.to_string())
            .collect();

        let results = processor.process_files_batch(file_paths.clone(), &config);

        // Each output entry must carry the path found at the same input position.
        assert_eq!(results.len(), 3);
        for (item, expected) in results.iter().zip(file_paths.iter()) {
            assert_eq!(&item.file_path, expected);
        }
    }

    #[test]
    fn test_process_image_with_cache_disabled() {
        let (_temp_dir, processor) = processor_in_temp_dir();

        let mut config = create_test_config();
        config.use_cache = false;

        let invalid_data: [u8; 4] = [0, 1, 2, 3];

        assert!(processor.process_image(&invalid_data, &config).is_err());
    }
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
//! Image and configuration validation logic.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles validation of images, language files, and Tesseract configuration
|
|
4
|
+
//! before OCR processing begins.
|
|
5
|
+
|
|
6
|
+
use crate::ocr::error::OcrError;
|
|
7
|
+
use std::env;
|
|
8
|
+
use std::path::Path;
|
|
9
|
+
|
|
10
|
+
/// Validate language configuration and check for traineddata files.
|
|
11
|
+
///
|
|
12
|
+
/// This function validates that:
|
|
13
|
+
/// 1. Language string is not empty
|
|
14
|
+
/// 2. Traineddata files exist for all specified languages
|
|
15
|
+
///
|
|
16
|
+
/// # Arguments
|
|
17
|
+
///
|
|
18
|
+
/// * `language` - Language code(s) to validate (can be "eng" or "eng+fra" etc.)
|
|
19
|
+
/// * `tessdata_path` - Path to tessdata directory
|
|
20
|
+
///
|
|
21
|
+
/// # Returns
|
|
22
|
+
///
|
|
23
|
+
/// `Ok(())` if validation passes, otherwise returns an error
|
|
24
|
+
pub(super) fn validate_language_and_traineddata(language: &str, tessdata_path: &str) -> Result<(), OcrError> {
|
|
25
|
+
// Validate language before initializing to prevent segfault ~keep
|
|
26
|
+
if language.trim().is_empty() {
|
|
27
|
+
return Err(OcrError::TesseractInitializationFailed(
|
|
28
|
+
"Language cannot be empty. Please specify a valid language code (e.g., 'eng')".to_string(),
|
|
29
|
+
));
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
// Validate language file exists before initializing to prevent segfault ~keep
|
|
33
|
+
if !tessdata_path.is_empty() {
|
|
34
|
+
let languages: Vec<&str> = language.split('+').collect();
|
|
35
|
+
for lang in languages {
|
|
36
|
+
let lang = lang.trim();
|
|
37
|
+
if lang.is_empty() {
|
|
38
|
+
continue;
|
|
39
|
+
}
|
|
40
|
+
let traineddata_path = Path::new(tessdata_path).join(format!("{}.traineddata", lang));
|
|
41
|
+
if !traineddata_path.exists() {
|
|
42
|
+
return Err(OcrError::TesseractInitializationFailed(format!(
|
|
43
|
+
"Language '{}' not found. Traineddata file does not exist: {}",
|
|
44
|
+
lang,
|
|
45
|
+
traineddata_path.display()
|
|
46
|
+
)));
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
Ok(())
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/// Resolve tessdata path from environment or fallback locations.
|
|
55
|
+
///
|
|
56
|
+
/// Checks TESSDATA_PREFIX environment variable first, then tries common
|
|
57
|
+
/// installation paths for macOS, Linux, and Windows.
|
|
58
|
+
///
|
|
59
|
+
/// # Returns
|
|
60
|
+
///
|
|
61
|
+
/// Path to tessdata directory if found, otherwise empty string
|
|
62
|
+
pub(super) fn resolve_tessdata_path() -> String {
|
|
63
|
+
let tessdata_env = env::var("TESSDATA_PREFIX").ok();
|
|
64
|
+
let fallback_paths = [
|
|
65
|
+
"/opt/homebrew/share/tessdata",
|
|
66
|
+
"/opt/homebrew/opt/tesseract/share/tessdata",
|
|
67
|
+
"/usr/local/opt/tesseract/share/tessdata",
|
|
68
|
+
"/usr/share/tesseract-ocr/5/tessdata",
|
|
69
|
+
"/usr/share/tesseract-ocr/4/tessdata",
|
|
70
|
+
"/usr/share/tessdata",
|
|
71
|
+
"/usr/local/share/tessdata",
|
|
72
|
+
r#"C:\Program Files\Tesseract-OCR\tessdata"#,
|
|
73
|
+
r#"C:\ProgramData\Tesseract-OCR\tessdata"#,
|
|
74
|
+
];
|
|
75
|
+
|
|
76
|
+
tessdata_env
|
|
77
|
+
.or_else(|| {
|
|
78
|
+
fallback_paths
|
|
79
|
+
.iter()
|
|
80
|
+
.find(|p| Path::new(p).exists())
|
|
81
|
+
.map(|p| (*p).to_string())
|
|
82
|
+
})
|
|
83
|
+
.unwrap_or_default()
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/// Strip control characters from text, preserving whitespace.
|
|
87
|
+
///
|
|
88
|
+
/// Removes control characters (0x00-0x1F, 0x7F) except for newlines, carriage returns, and tabs.
|
|
89
|
+
///
|
|
90
|
+
/// # Arguments
|
|
91
|
+
///
|
|
92
|
+
/// * `text` - Text to clean
|
|
93
|
+
///
|
|
94
|
+
/// # Returns
|
|
95
|
+
///
|
|
96
|
+
/// Cleaned text with control characters removed
|
|
97
|
+
pub(super) fn strip_control_characters(text: &str) -> String {
|
|
98
|
+
if text
|
|
99
|
+
.chars()
|
|
100
|
+
.any(|c| matches!(c, '\u{0000}'..='\u{001F}' | '\u{007F}') && c != '\n' && c != '\r' && c != '\t')
|
|
101
|
+
{
|
|
102
|
+
text.chars()
|
|
103
|
+
.filter(|c| !matches!(c, '\u{0000}'..='\u{001F}' | '\u{007F}') || matches!(c, '\n' | '\r' | '\t'))
|
|
104
|
+
.collect()
|
|
105
|
+
} else {
|
|
106
|
+
text.to_string()
|
|
107
|
+
}
|
|
108
|
+
}
|
|
109
|
+
|
|
110
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// NUL and SOH are removed; newline, carriage return and tab are kept.
    #[test]
    fn test_strip_control_characters() {
        let cleaned = strip_control_characters("Hello\x00World\x01Test");
        assert_eq!(cleaned, "HelloWorldTest");

        let cleaned = strip_control_characters("Hello\nWorld\rTest\t!");
        assert_eq!(cleaned, "Hello\nWorld\rTest\t!");
    }

    /// Input made up only of control characters collapses to an empty string.
    #[test]
    fn test_strip_control_characters_all_control() {
        let cleaned = strip_control_characters("\x00\x01\x02\x03");
        assert_eq!(cleaned, "");
    }

    /// Text without control characters comes back unchanged.
    #[test]
    fn test_strip_control_characters_no_control() {
        let cleaned = strip_control_characters("Hello World Test");
        assert_eq!(cleaned, "Hello World Test");
    }

    /// DEL (0x7F) is removed as well.
    #[test]
    fn test_strip_control_characters_delete_char() {
        let cleaned = strip_control_characters("Hello\x7FWorld");
        assert_eq!(cleaned, "HelloWorld");
    }
}
|
|
@@ -177,20 +177,24 @@ impl OcrBackend for TesseractBackend {
|
|
|
177
177
|
async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> Result<ExtractionResult> {
|
|
178
178
|
let tess_config = self.config_to_tesseract(config);
|
|
179
179
|
let tess_config_clone = tess_config.clone();
|
|
180
|
+
let output_format = config.output_format;
|
|
180
181
|
|
|
181
182
|
let processor = Arc::clone(&self.processor);
|
|
182
183
|
let image_bytes = image_bytes.to_vec();
|
|
183
184
|
|
|
184
|
-
let ocr_result = tokio::task::spawn_blocking(move ||
|
|
185
|
-
.
|
|
186
|
-
.
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
185
|
+
let ocr_result = tokio::task::spawn_blocking(move || match output_format {
|
|
186
|
+
Some(fmt) => processor.process_image_with_format(&image_bytes, &tess_config_clone, fmt),
|
|
187
|
+
None => processor.process_image(&image_bytes, &tess_config_clone),
|
|
188
|
+
})
|
|
189
|
+
.await
|
|
190
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
191
|
+
message: format!("Tesseract task panicked: {}", e),
|
|
192
|
+
plugin_name: "tesseract".to_string(),
|
|
193
|
+
})?
|
|
194
|
+
.map_err(|e| crate::KreuzbergError::Ocr {
|
|
195
|
+
message: format!("Tesseract OCR failed: {}", e),
|
|
196
|
+
source: Some(Box::new(e)),
|
|
197
|
+
})?;
|
|
194
198
|
|
|
195
199
|
let metadata = crate::types::Metadata {
|
|
196
200
|
format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
|
|
@@ -225,26 +229,32 @@ impl OcrBackend for TesseractBackend {
|
|
|
225
229
|
detected_languages: None,
|
|
226
230
|
chunks: None,
|
|
227
231
|
images: None,
|
|
232
|
+
elements: None,
|
|
233
|
+
djot_content: None,
|
|
228
234
|
})
|
|
229
235
|
}
|
|
230
236
|
|
|
231
237
|
async fn process_file(&self, path: &Path, config: &OcrConfig) -> Result<ExtractionResult> {
|
|
232
238
|
let tess_config = self.config_to_tesseract(config);
|
|
233
239
|
let tess_config_clone = tess_config.clone();
|
|
240
|
+
let output_format = config.output_format;
|
|
234
241
|
|
|
235
242
|
let processor = Arc::clone(&self.processor);
|
|
236
243
|
let path_str = path.to_string_lossy().to_string();
|
|
237
244
|
|
|
238
|
-
let ocr_result = tokio::task::spawn_blocking(move ||
|
|
239
|
-
.
|
|
240
|
-
.
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
245
|
+
let ocr_result = tokio::task::spawn_blocking(move || match output_format {
|
|
246
|
+
Some(fmt) => processor.process_file_with_format(&path_str, &tess_config_clone, fmt),
|
|
247
|
+
None => processor.process_file(&path_str, &tess_config_clone),
|
|
248
|
+
})
|
|
249
|
+
.await
|
|
250
|
+
.map_err(|e| crate::KreuzbergError::Plugin {
|
|
251
|
+
message: format!("Tesseract task panicked: {}", e),
|
|
252
|
+
plugin_name: "tesseract".to_string(),
|
|
253
|
+
})?
|
|
254
|
+
.map_err(|e| crate::KreuzbergError::Ocr {
|
|
255
|
+
message: format!("Tesseract OCR failed: {}", e),
|
|
256
|
+
source: Some(Box::new(e)),
|
|
257
|
+
})?;
|
|
248
258
|
|
|
249
259
|
let metadata = crate::types::Metadata {
|
|
250
260
|
format: Some(crate::types::FormatMetadata::Ocr(crate::types::OcrMetadata {
|
|
@@ -279,6 +289,8 @@ impl OcrBackend for TesseractBackend {
|
|
|
279
289
|
detected_languages: None,
|
|
280
290
|
chunks: None,
|
|
281
291
|
images: None,
|
|
292
|
+
elements: None,
|
|
293
|
+
djot_content: None,
|
|
282
294
|
})
|
|
283
295
|
}
|
|
284
296
|
|
|
@@ -326,10 +338,11 @@ mod tests {
|
|
|
326
338
|
#[test]
|
|
327
339
|
fn test_tesseract_backend_supports_language() {
|
|
328
340
|
let backend = TesseractBackend::new().unwrap();
|
|
341
|
+
// English should always be available
|
|
329
342
|
assert!(backend.supports_language("eng"));
|
|
330
|
-
|
|
331
|
-
assert!(backend.supports_language("fra"));
|
|
343
|
+
// Invalid language codes should return false
|
|
332
344
|
assert!(!backend.supports_language("xyz"));
|
|
345
|
+
assert!(!backend.supports_language("invalid"));
|
|
333
346
|
}
|
|
334
347
|
|
|
335
348
|
#[test]
|
|
@@ -342,9 +355,10 @@ mod tests {
|
|
|
342
355
|
fn test_tesseract_backend_supported_languages() {
|
|
343
356
|
let backend = TesseractBackend::new().unwrap();
|
|
344
357
|
let languages = backend.supported_languages();
|
|
358
|
+
// English should always be available
|
|
345
359
|
assert!(languages.contains(&"eng".to_string()));
|
|
346
|
-
|
|
347
|
-
assert!(languages.
|
|
360
|
+
// Should have at least English
|
|
361
|
+
assert!(!languages.is_empty());
|
|
348
362
|
}
|
|
349
363
|
|
|
350
364
|
#[test]
|
|
@@ -354,6 +368,7 @@ mod tests {
|
|
|
354
368
|
backend: "tesseract".to_string(),
|
|
355
369
|
language: "deu".to_string(),
|
|
356
370
|
tesseract_config: None,
|
|
371
|
+
output_format: None,
|
|
357
372
|
};
|
|
358
373
|
|
|
359
374
|
let tess_config = backend.config_to_tesseract(&ocr_config);
|
|
@@ -375,6 +390,7 @@ mod tests {
|
|
|
375
390
|
backend: "tesseract".to_string(),
|
|
376
391
|
language: "eng".to_string(),
|
|
377
392
|
tesseract_config: Some(custom_tess_config),
|
|
393
|
+
output_format: None,
|
|
378
394
|
};
|
|
379
395
|
|
|
380
396
|
let tess_config = backend.config_to_tesseract(&ocr_config);
|
|
@@ -418,6 +434,7 @@ mod tests {
|
|
|
418
434
|
backend: "tesseract".to_string(),
|
|
419
435
|
language: "eng".to_string(),
|
|
420
436
|
tesseract_config: Some(custom_tess_config),
|
|
437
|
+
output_format: None,
|
|
421
438
|
};
|
|
422
439
|
|
|
423
440
|
let tess_config = backend.config_to_tesseract(&ocr_config);
|
|
@@ -263,21 +263,34 @@ mod tests {
|
|
|
263
263
|
|
|
264
264
|
#[test]
|
|
265
265
|
fn test_bind_pdfium_multiple_calls() {
|
|
266
|
-
|
|
267
|
-
|
|
266
|
+
// First call - acquire lock, test success, then drop handle to release lock
|
|
267
|
+
{
|
|
268
|
+
let result1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1");
|
|
269
|
+
assert!(result1.is_ok(), "First call should succeed");
|
|
270
|
+
} // result1 dropped here, releasing the lock
|
|
268
271
|
|
|
269
|
-
|
|
270
|
-
|
|
272
|
+
// Second call - can now acquire lock since first handle was dropped
|
|
273
|
+
{
|
|
274
|
+
let result2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2");
|
|
275
|
+
assert!(result2.is_ok(), "Second call should also succeed");
|
|
276
|
+
}
|
|
271
277
|
}
|
|
272
278
|
|
|
273
279
|
#[test]
|
|
274
280
|
fn test_bind_pdfium_returns_same_instance() {
|
|
275
|
-
|
|
276
|
-
let
|
|
281
|
+
// Get pointer from first handle, then drop it to release lock
|
|
282
|
+
let ptr1 = {
|
|
283
|
+
let handle1 = bind_pdfium(PdfError::TextExtractionFailed, "test 1").unwrap();
|
|
284
|
+
&*handle1 as *const Pdfium
|
|
285
|
+
}; // handle1 dropped here, releasing the lock
|
|
286
|
+
|
|
287
|
+
// Get pointer from second handle
|
|
288
|
+
let ptr2 = {
|
|
289
|
+
let handle2 = bind_pdfium(PdfError::TextExtractionFailed, "test 2").unwrap();
|
|
290
|
+
&*handle2 as *const Pdfium
|
|
291
|
+
};
|
|
277
292
|
|
|
278
293
|
// Both handles should dereference to the same Pdfium instance
|
|
279
|
-
let ptr1 = &*handle1 as *const Pdfium;
|
|
280
|
-
let ptr2 = &*handle2 as *const Pdfium;
|
|
281
294
|
assert_eq!(ptr1, ptr2, "Both handles should reference the same Pdfium instance");
|
|
282
295
|
}
|
|
283
296
|
|