kreuzberg 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -16
- data/README.md +4 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,391 @@
|
|
|
1
|
+
//! Document extractor plugin trait.
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines the trait for implementing custom document extractors.
|
|
4
|
+
|
|
5
|
+
use crate::Result;
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
use crate::plugins::Plugin;
|
|
8
|
+
use crate::types::ExtractionResult;
|
|
9
|
+
use async_trait::async_trait;
|
|
10
|
+
use std::path::Path;
|
|
11
|
+
|
|
12
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
13
|
+
use crate::KreuzbergError;
|
|
14
|
+
|
|
15
|
+
/// Trait for document extractor plugins.
|
|
16
|
+
///
|
|
17
|
+
/// Implement this trait to add support for new document formats or to override
|
|
18
|
+
/// built-in extraction behavior with custom logic.
|
|
19
|
+
///
|
|
20
|
+
/// # Priority System
|
|
21
|
+
///
|
|
22
|
+
/// When multiple extractors support the same MIME type, the registry selects
|
|
23
|
+
/// the extractor with the highest priority value. Use this to:
|
|
24
|
+
/// - Override built-in extractors (priority > 50)
|
|
25
|
+
/// - Provide fallback extractors (priority < 50)
|
|
26
|
+
/// - Implement specialized extractors for specific use cases
|
|
27
|
+
///
|
|
28
|
+
/// Default priority is 50.
|
|
29
|
+
///
|
|
30
|
+
/// # Thread Safety
|
|
31
|
+
///
|
|
32
|
+
/// Extractors must be thread-safe (`Send + Sync`) to support concurrent extraction.
|
|
33
|
+
///
|
|
34
|
+
/// # Example
|
|
35
|
+
///
|
|
36
|
+
/// ```rust
|
|
37
|
+
/// use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
38
|
+
/// use kreuzberg::{Result, ExtractionConfig};
|
|
39
|
+
/// use kreuzberg::types::{ExtractionResult, Metadata};
|
|
40
|
+
/// use async_trait::async_trait;
|
|
41
|
+
/// use std::path::Path;
|
|
42
|
+
///
|
|
43
|
+
/// /// Custom PDF extractor with premium features
|
|
44
|
+
/// struct PremiumPdfExtractor;
|
|
45
|
+
///
|
|
46
|
+
/// impl Plugin for PremiumPdfExtractor {
|
|
47
|
+
/// fn name(&self) -> &str { "premium-pdf" }
|
|
48
|
+
/// fn version(&self) -> String { "2.0.0".to_string() }
|
|
49
|
+
/// fn initialize(&self) -> Result<()> { Ok(()) }
|
|
50
|
+
/// fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
51
|
+
/// }
|
|
52
|
+
///
|
|
53
|
+
/// #[async_trait]
|
|
54
|
+
/// impl DocumentExtractor for PremiumPdfExtractor {
|
|
55
|
+
/// async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
|
56
|
+
/// -> Result<ExtractionResult> {
|
|
57
|
+
/// // Premium extraction logic with better accuracy
|
|
58
|
+
/// Ok(ExtractionResult {
|
|
59
|
+
/// content: "Premium extracted content".to_string(),
|
|
60
|
+
/// mime_type: mime_type.to_string(),
|
|
61
|
+
/// metadata: Metadata::default(),
|
|
62
|
+
/// tables: vec![],
|
|
63
|
+
/// detected_languages: None,
|
|
64
|
+
/// chunks: None,
|
|
65
|
+
/// images: None,
|
|
66
|
+
/// djot_content: None,
|
|
67
|
+
/// pages: None,
|
|
68
|
+
/// elements: None,
|
|
69
|
+
/// })
|
|
70
|
+
/// }
|
|
71
|
+
///
|
|
72
|
+
/// async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
|
|
73
|
+
/// -> Result<ExtractionResult> {
|
|
74
|
+
/// let bytes = std::fs::read(path)?;
|
|
75
|
+
/// self.extract_bytes(&bytes, mime_type, config).await
|
|
76
|
+
/// }
|
|
77
|
+
///
|
|
78
|
+
/// fn supported_mime_types(&self) -> &[&str] {
|
|
79
|
+
/// &["application/pdf"]
|
|
80
|
+
/// }
|
|
81
|
+
///
|
|
82
|
+
/// fn priority(&self) -> i32 {
|
|
83
|
+
/// 100 // Higher than default (50) - will be preferred
|
|
84
|
+
/// }
|
|
85
|
+
/// }
|
|
86
|
+
/// ```
|
|
87
|
+
#[async_trait]
|
|
88
|
+
pub trait DocumentExtractor: Plugin {
|
|
89
|
+
/// Extract content from a byte array.
|
|
90
|
+
///
|
|
91
|
+
/// This is the core extraction method that processes in-memory document data.
|
|
92
|
+
///
|
|
93
|
+
/// # Arguments
|
|
94
|
+
///
|
|
95
|
+
/// * `content` - Raw document bytes
|
|
96
|
+
/// * `mime_type` - MIME type of the document (already validated)
|
|
97
|
+
/// * `config` - Extraction configuration
|
|
98
|
+
///
|
|
99
|
+
/// # Returns
|
|
100
|
+
///
|
|
101
|
+
/// An `ExtractionResult` containing the extracted content, metadata, and tables.
|
|
102
|
+
///
|
|
103
|
+
/// # Errors
|
|
104
|
+
///
|
|
105
|
+
/// - `KreuzbergError::Parsing` - Document parsing failed
|
|
106
|
+
/// - `KreuzbergError::Validation` - Invalid document structure
|
|
107
|
+
/// - `KreuzbergError::Io` - I/O errors (these always bubble up)
|
|
108
|
+
/// - `KreuzbergError::MissingDependency` - Required dependency not available
|
|
109
|
+
///
|
|
110
|
+
/// # Example
|
|
111
|
+
///
|
|
112
|
+
/// ```rust,no_run
|
|
113
|
+
/// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
114
|
+
/// # use kreuzberg::{Result, ExtractionConfig};
|
|
115
|
+
/// # use kreuzberg::types::{ExtractionResult, Metadata};
|
|
116
|
+
/// # use async_trait::async_trait;
|
|
117
|
+
/// # use std::path::Path;
|
|
118
|
+
/// # struct MyExtractor;
|
|
119
|
+
/// # impl Plugin for MyExtractor {
|
|
120
|
+
/// # fn name(&self) -> &str { "my-extractor" }
|
|
121
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
122
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
123
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
124
|
+
/// # }
|
|
125
|
+
/// # #[async_trait]
|
|
126
|
+
/// # impl DocumentExtractor for MyExtractor {
|
|
127
|
+
/// # fn supported_mime_types(&self) -> &[&str] { &["text/plain"] }
|
|
128
|
+
/// # fn priority(&self) -> i32 { 50 }
|
|
129
|
+
/// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
130
|
+
/// async fn extract_bytes(&self, content: &[u8], mime_type: &str, config: &ExtractionConfig)
|
|
131
|
+
/// -> Result<ExtractionResult> {
|
|
132
|
+
/// // Parse document
|
|
133
|
+
/// let text = String::from_utf8_lossy(content).to_string();
|
|
134
|
+
///
|
|
135
|
+
/// // Extract metadata
|
|
136
|
+
/// let mut metadata = Metadata::default();
|
|
137
|
+
/// metadata.additional.insert("byte_count".to_string(), serde_json::json!(content.len()));
|
|
138
|
+
///
|
|
139
|
+
/// Ok(ExtractionResult {
|
|
140
|
+
/// content: text,
|
|
141
|
+
/// mime_type: mime_type.to_string(),
|
|
142
|
+
/// metadata,
|
|
143
|
+
/// tables: vec![],
|
|
144
|
+
/// detected_languages: None,
|
|
145
|
+
/// chunks: None,
|
|
146
|
+
/// images: None,
|
|
147
|
+
/// djot_content: None,
|
|
148
|
+
/// pages: None,
|
|
149
|
+
/// elements: None,
|
|
150
|
+
/// })
|
|
151
|
+
/// }
|
|
152
|
+
/// # }
|
|
153
|
+
/// ```
|
|
154
|
+
// Required method: declared without a default body, so each implementor must provide it.
async fn extract_bytes(
|
|
155
|
+
&self,
|
|
156
|
+
content: &[u8],
|
|
157
|
+
mime_type: &str,
|
|
158
|
+
config: &ExtractionConfig,
|
|
159
|
+
) -> Result<ExtractionResult>;
|
|
160
|
+
|
|
161
|
+
/// Extract content from a file.
|
|
162
|
+
///
|
|
163
|
+
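/// Default implementation reads the file and calls `extract_bytes`; this needs the `tokio-runtime` feature, and without it the default returns `KreuzbergError::Other`.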
|
|
164
|
+
/// Override for custom file handling, streaming, or memory optimizations.
|
|
165
|
+
///
|
|
166
|
+
/// # Arguments
|
|
167
|
+
///
|
|
168
|
+
/// * `path` - Path to the document file
|
|
169
|
+
/// * `mime_type` - MIME type of the document (already validated)
|
|
170
|
+
/// * `config` - Extraction configuration
|
|
171
|
+
///
|
|
172
|
+
/// # Errors
|
|
173
|
+
///
|
|
174
|
+
/// Same as `extract_bytes`, plus file I/O errors.
|
|
175
|
+
///
|
|
176
|
+
/// # Example - Custom File Handling
|
|
177
|
+
///
|
|
178
|
+
/// ```rust,no_run
|
|
179
|
+
/// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
180
|
+
/// # use kreuzberg::{Result, ExtractionResult, ExtractionConfig};
|
|
181
|
+
/// # use kreuzberg::types::Metadata;
|
|
182
|
+
/// # use async_trait::async_trait;
|
|
183
|
+
/// # use std::path::Path;
|
|
184
|
+
/// # struct StreamingExtractor;
|
|
185
|
+
/// # impl Plugin for StreamingExtractor {
|
|
186
|
+
/// # fn name(&self) -> &str { "streaming" }
|
|
187
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
188
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
189
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
190
|
+
/// # }
|
|
191
|
+
/// # #[async_trait]
|
|
192
|
+
/// # impl DocumentExtractor for StreamingExtractor {
|
|
193
|
+
/// # fn supported_mime_types(&self) -> &[&str] { &["text/plain"] }
|
|
194
|
+
/// # fn priority(&self) -> i32 { 50 }
|
|
195
|
+
/// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
196
|
+
/// /// Override for memory-efficient streaming extraction
|
|
197
|
+
/// async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig)
|
|
198
|
+
/// -> Result<ExtractionResult> {
|
|
199
|
+
/// // Stream large files instead of loading entirely into memory
|
|
200
|
+
/// let mut content = String::new();
|
|
201
|
+
///
|
|
202
|
+
/// // Use buffered reader for streaming
|
|
203
|
+
/// use std::io::{BufRead, BufReader};
|
|
204
|
+
/// let file = std::fs::File::open(path)?;
|
|
205
|
+
/// let reader = BufReader::new(file);
|
|
206
|
+
///
|
|
207
|
+
/// for line in reader.lines() {
|
|
208
|
+
/// content.push_str(&line?);
|
|
209
|
+
/// content.push('\n');
|
|
210
|
+
/// }
|
|
211
|
+
///
|
|
212
|
+
/// Ok(ExtractionResult {
|
|
213
|
+
/// content,
|
|
214
|
+
/// mime_type: mime_type.to_string(),
|
|
215
|
+
/// metadata: Metadata::default(),
|
|
216
|
+
/// tables: vec![],
|
|
217
|
+
/// detected_languages: None,
|
|
218
|
+
/// chunks: None,
|
|
219
|
+
/// images: None,
|
|
220
|
+
/// djot_content: None,
|
|
221
|
+
/// pages: None,
|
|
222
|
+
/// elements: None,
|
|
223
|
+
/// })
|
|
224
|
+
/// }
|
|
225
|
+
/// # }
|
|
226
|
+
/// ```
|
|
227
|
+
async fn extract_file(&self, path: &Path, mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
228
|
+
#[cfg(feature = "tokio-runtime")]
|
|
229
|
+
{
|
|
230
|
+
use crate::core::io;
|
|
231
|
+
let bytes = io::read_file_async(path).await?;
|
|
232
|
+
self.extract_bytes(&bytes, mime_type, config).await
|
|
233
|
+
}
|
|
234
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
235
|
+
{
|
|
236
|
+
let _ = (path, mime_type, config);
|
|
237
|
+
Err(KreuzbergError::Other(
|
|
238
|
+
"File-based extraction requires the tokio-runtime feature".to_string(),
|
|
239
|
+
))
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
/// Get the list of MIME types supported by this extractor.
|
|
244
|
+
///
|
|
245
|
+
/// Can include exact MIME types and prefix patterns:
|
|
246
|
+
/// - Exact: `"application/pdf"`, `"text/plain"`
|
|
247
|
+
/// - Prefix: `"image/*"` (matches any image type)
|
|
248
|
+
///
|
|
249
|
+
/// # Returns
|
|
250
|
+
///
|
|
251
|
+
/// A slice of MIME type strings.
|
|
252
|
+
///
|
|
253
|
+
/// # Example
|
|
254
|
+
///
|
|
255
|
+
/// ```rust
|
|
256
|
+
/// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
257
|
+
/// # use kreuzberg::Result;
|
|
258
|
+
/// # use async_trait::async_trait;
|
|
259
|
+
/// # use std::path::Path;
|
|
260
|
+
/// # struct MultiFormatExtractor;
|
|
261
|
+
/// # impl Plugin for MultiFormatExtractor {
|
|
262
|
+
/// # fn name(&self) -> &str { "multi-format" }
|
|
263
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
264
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
265
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
266
|
+
/// # }
|
|
267
|
+
/// # use kreuzberg::{ExtractionResult, ExtractionConfig};
|
|
268
|
+
/// # #[async_trait]
|
|
269
|
+
/// # impl DocumentExtractor for MultiFormatExtractor {
|
|
270
|
+
/// # fn priority(&self) -> i32 { 50 }
|
|
271
|
+
/// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
272
|
+
/// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
273
|
+
/// fn supported_mime_types(&self) -> &[&str] {
|
|
274
|
+
/// &[
|
|
275
|
+
/// "text/plain",
|
|
276
|
+
/// "text/markdown",
|
|
277
|
+
/// "application/json",
|
|
278
|
+
/// "application/xml",
|
|
279
|
+
/// "text/html",
|
|
280
|
+
/// ]
|
|
281
|
+
/// }
|
|
282
|
+
/// # }
|
|
283
|
+
/// ```
|
|
284
|
+
// Required method: the trait supplies no default body, so every implementor must define it.
fn supported_mime_types(&self) -> &[&str];
|
|
285
|
+
|
|
286
|
+
/// Get the priority of this extractor.
|
|
287
|
+
///
|
|
288
|
+
/// Higher priority extractors are preferred when multiple extractors
|
|
289
|
+
/// support the same MIME type.
|
|
290
|
+
///
|
|
291
|
+
/// # Priority Guidelines
|
|
292
|
+
///
|
|
293
|
+
/// - **0-25**: Fallback/low-quality extractors
|
|
294
|
+
/// - **26-49**: Alternative extractors
|
|
295
|
+
/// - **50**: Default priority (built-in extractors)
|
|
296
|
+
/// - **51-75**: Premium/enhanced extractors
|
|
297
|
+
/// - **76-100**: Specialized/high-priority extractors
|
|
298
|
+
///
|
|
299
|
+
/// # Returns
|
|
300
|
+
///
|
|
301
|
+
/// Priority value (default: 50)
|
|
302
|
+
///
|
|
303
|
+
/// # Example
|
|
304
|
+
///
|
|
305
|
+
/// ```rust
|
|
306
|
+
/// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
307
|
+
/// # use kreuzberg::Result;
|
|
308
|
+
/// # use async_trait::async_trait;
|
|
309
|
+
/// # use std::path::Path;
|
|
310
|
+
/// # struct FallbackExtractor;
|
|
311
|
+
/// # impl Plugin for FallbackExtractor {
|
|
312
|
+
/// # fn name(&self) -> &str { "fallback" }
|
|
313
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
314
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
315
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
316
|
+
/// # }
|
|
317
|
+
/// # use kreuzberg::{ExtractionResult, ExtractionConfig};
|
|
318
|
+
/// # #[async_trait]
|
|
319
|
+
/// # impl DocumentExtractor for FallbackExtractor {
|
|
320
|
+
/// # fn supported_mime_types(&self) -> &[&str] { &["text/plain"] }
|
|
321
|
+
/// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
322
|
+
/// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
323
|
+
/// fn priority(&self) -> i32 {
|
|
324
|
+
/// 10 // Low priority - only used as fallback
|
|
325
|
+
/// }
|
|
326
|
+
/// # }
|
|
327
|
+
/// ```
|
|
328
|
+
fn priority(&self) -> i32 {
|
|
329
|
+
50
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
/// Optional: Check if this extractor can handle a specific file.
|
|
333
|
+
///
|
|
334
|
+
/// Allows for more sophisticated detection beyond MIME types.
|
|
335
|
+
/// Defaults to `true` (rely on MIME type matching).
|
|
336
|
+
///
|
|
337
|
+
/// # Arguments
|
|
338
|
+
///
|
|
339
|
+
/// * `path` - Path to the file to check
|
|
340
|
+
/// * `mime_type` - Detected MIME type
|
|
341
|
+
///
|
|
342
|
+
/// # Returns
|
|
343
|
+
///
|
|
344
|
+
/// `true` if the extractor can handle this file, `false` otherwise.
|
|
345
|
+
///
|
|
346
|
+
/// # Example
|
|
347
|
+
///
|
|
348
|
+
/// ```rust,no_run
|
|
349
|
+
/// # use kreuzberg::plugins::{Plugin, DocumentExtractor};
|
|
350
|
+
/// # use kreuzberg::Result;
|
|
351
|
+
/// # use async_trait::async_trait;
|
|
352
|
+
/// # use std::path::Path;
|
|
353
|
+
/// # struct SmartExtractor;
|
|
354
|
+
/// # impl Plugin for SmartExtractor {
|
|
355
|
+
/// # fn name(&self) -> &str { "smart" }
|
|
356
|
+
/// # fn version(&self) -> String { "1.0.0".to_string() }
|
|
357
|
+
/// # fn initialize(&self) -> Result<()> { Ok(()) }
|
|
358
|
+
/// # fn shutdown(&self) -> Result<()> { Ok(()) }
|
|
359
|
+
/// # }
|
|
360
|
+
/// # use kreuzberg::{ExtractionResult, ExtractionConfig};
|
|
361
|
+
/// # #[async_trait]
|
|
362
|
+
/// # impl DocumentExtractor for SmartExtractor {
|
|
363
|
+
/// # fn supported_mime_types(&self) -> &[&str] { &["application/pdf"] }
|
|
364
|
+
/// # fn priority(&self) -> i32 { 50 }
|
|
365
|
+
/// # async fn extract_bytes(&self, _: &[u8], _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
366
|
+
/// # async fn extract_file(&self, _: &Path, _: &str, _: &ExtractionConfig) -> Result<ExtractionResult> { todo!() }
|
|
367
|
+
/// /// Only handle PDFs that are searchable (have text layer)
|
|
368
|
+
/// fn can_handle(&self, path: &Path, mime_type: &str) -> bool {
|
|
369
|
+
/// if mime_type != "application/pdf" {
|
|
370
|
+
/// return false;
|
|
371
|
+
/// }
|
|
372
|
+
///
|
|
373
|
+
/// // Check if PDF has text layer (simplified example)
|
|
374
|
+
/// // In real implementation, analyze PDF structure here
|
|
375
|
+
/// let _ = path; // Use path for PDF analysis
|
|
376
|
+
/// true // Simplified - always accept
|
|
377
|
+
/// }
|
|
378
|
+
/// # }
|
|
379
|
+
/// ```
|
|
380
|
+
fn can_handle(&self, _path: &Path, _mime_type: &str) -> bool {
|
|
381
|
+
true
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
/// Attempt to get a reference to this extractor as a SyncExtractor.
|
|
385
|
+
///
|
|
386
|
+
/// Returns None if the extractor doesn't support synchronous extraction.
|
|
387
|
+
/// This is used for WASM and other sync-only environments.
|
|
388
|
+
fn as_sync_extractor(&self) -> Option<&dyn crate::extractors::SyncExtractor> {
|
|
389
|
+
None
|
|
390
|
+
}
|
|
391
|
+
}
|
|
@@ -48,6 +48,8 @@
|
|
|
48
48
|
//! # chunks: None,
|
|
49
49
|
//! # images: None,
|
|
50
50
|
//! # pages: None,
|
|
51
|
+
//! # djot_content: None,
|
|
52
|
+
//! # elements: None,
|
|
51
53
|
//! # })
|
|
52
54
|
//! # }
|
|
53
55
|
//! # async fn extract_file(&self, _: &std::path::Path, _: &str, _: &kreuzberg::ExtractionConfig)
|
|
@@ -61,6 +63,8 @@
|
|
|
61
63
|
//! # chunks: None,
|
|
62
64
|
//! # images: None,
|
|
63
65
|
//! # pages: None,
|
|
66
|
+
//! # djot_content: None,
|
|
67
|
+
//! # elements: None,
|
|
64
68
|
//! # })
|
|
65
69
|
//! # }
|
|
66
70
|
//! # fn supported_mime_types(&self) -> &[&str] { &[] }
|
|
@@ -122,7 +126,9 @@
|
|
|
122
126
|
//! detected_languages: None,
|
|
123
127
|
//! chunks: None,
|
|
124
128
|
//! images: None,
|
|
129
|
+
//! djot_content: None,
|
|
125
130
|
//! pages: None,
|
|
131
|
+
//! elements: None,
|
|
126
132
|
//! })
|
|
127
133
|
//! }
|
|
128
134
|
//!
|
|
@@ -210,3 +216,10 @@ pub use ocr::{
|
|
|
210
216
|
pub use processor::{PostProcessor, ProcessingStage, list_post_processors};
|
|
211
217
|
pub use traits::Plugin;
|
|
212
218
|
pub use validator::{Validator, clear_validators, list_validators, register_validator, unregister_validator};
|
|
219
|
+
|
|
220
|
+
// Re-export registry items for backward compatibility
|
|
221
|
+
pub use registry::{
|
|
222
|
+
DOCUMENT_EXTRACTOR_REGISTRY, DocumentExtractorRegistry, OCR_BACKEND_REGISTRY, OcrBackendRegistry,
|
|
223
|
+
POST_PROCESSOR_REGISTRY, PostProcessorRegistry, VALIDATOR_REGISTRY, ValidatorRegistry,
|
|
224
|
+
get_document_extractor_registry, get_ocr_backend_registry, get_post_processor_registry, get_validator_registry,
|
|
225
|
+
};
|
|
@@ -67,7 +67,9 @@ pub enum OcrBackendType {
|
|
|
67
67
|
/// detected_languages: None,
|
|
68
68
|
/// chunks: None,
|
|
69
69
|
/// images: None,
|
|
70
|
+
/// djot_content: None,
|
|
70
71
|
/// pages: None,
|
|
72
|
+
/// elements: None,
|
|
71
73
|
/// })
|
|
72
74
|
/// }
|
|
73
75
|
///
|
|
@@ -146,7 +148,9 @@ pub trait OcrBackend: Plugin {
|
|
|
146
148
|
/// detected_languages: None,
|
|
147
149
|
/// chunks: None,
|
|
148
150
|
/// images: None,
|
|
151
|
+
/// djot_content: None,
|
|
149
152
|
/// pages: None,
|
|
153
|
+
/// elements: None,
|
|
150
154
|
/// })
|
|
151
155
|
/// }
|
|
152
156
|
/// # }
|
|
@@ -317,7 +321,9 @@ pub trait OcrBackend: Plugin {
|
|
|
317
321
|
/// detected_languages: None,
|
|
318
322
|
/// chunks: None,
|
|
319
323
|
/// images: None,
|
|
324
|
+
/// djot_content: None,
|
|
320
325
|
/// pages: None,
|
|
326
|
+
/// elements: None,
|
|
321
327
|
/// })
|
|
322
328
|
/// }
|
|
323
329
|
/// fn supports_language(&self, _: &str) -> bool { true }
|
|
@@ -478,7 +484,9 @@ mod tests {
|
|
|
478
484
|
detected_languages: None,
|
|
479
485
|
chunks: None,
|
|
480
486
|
images: None,
|
|
487
|
+
djot_content: None,
|
|
481
488
|
pages: None,
|
|
489
|
+
elements: None,
|
|
482
490
|
})
|
|
483
491
|
}
|
|
484
492
|
|
|
@@ -505,6 +513,7 @@ mod tests {
|
|
|
505
513
|
backend: "mock".to_string(),
|
|
506
514
|
language: "eng".to_string(),
|
|
507
515
|
tesseract_config: None,
|
|
516
|
+
output_format: None,
|
|
508
517
|
};
|
|
509
518
|
|
|
510
519
|
let result = backend.process_image(b"fake image data", &config).await.unwrap();
|
|
@@ -592,6 +601,7 @@ mod tests {
|
|
|
592
601
|
backend: "mock".to_string(),
|
|
593
602
|
language: "eng".to_string(),
|
|
594
603
|
tesseract_config: None,
|
|
604
|
+
output_format: None,
|
|
595
605
|
};
|
|
596
606
|
|
|
597
607
|
let result = backend.process_file(path, &config).await.unwrap();
|
|
@@ -629,6 +639,7 @@ mod tests {
|
|
|
629
639
|
backend: "mock".to_string(),
|
|
630
640
|
language: "eng".to_string(),
|
|
631
641
|
tesseract_config: None,
|
|
642
|
+
output_format: None,
|
|
632
643
|
};
|
|
633
644
|
|
|
634
645
|
let result = backend.process_image(b"", &config).await;
|