kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,233 +1,31 @@
|
|
|
1
1
|
//! PDF document extractor.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides extraction of text, metadata, tables, and images from PDF documents
|
|
4
|
+
//! using pypdfium2 and playa-pdf. Supports both native text extraction and OCR fallback.
|
|
5
|
+
|
|
6
|
+
mod extraction;
|
|
7
|
+
mod ocr;
|
|
8
|
+
mod pages;
|
|
2
9
|
|
|
3
10
|
use crate::Result;
|
|
4
11
|
use crate::core::config::ExtractionConfig;
|
|
5
12
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
6
|
-
use crate::types::{ExtractionResult, Metadata, PageContent};
|
|
13
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
7
14
|
use async_trait::async_trait;
|
|
8
15
|
#[cfg(feature = "tokio-runtime")]
|
|
9
16
|
use std::path::Path;
|
|
10
17
|
|
|
11
18
|
#[cfg(feature = "pdf")]
|
|
12
19
|
use crate::pdf::error::PdfError;
|
|
13
|
-
#[cfg(feature = "ocr")]
|
|
14
|
-
use crate::pdf::rendering::{PageRenderOptions, PdfRenderer};
|
|
15
|
-
#[cfg(feature = "pdf")]
|
|
16
|
-
use crate::types::Table;
|
|
17
|
-
#[cfg(feature = "pdf")]
|
|
18
|
-
use pdfium_render::prelude::*;
|
|
19
|
-
|
|
20
|
-
#[cfg(feature = "pdf")]
|
|
21
|
-
type PdfExtractionPhaseResult = (
|
|
22
|
-
crate::pdf::metadata::PdfExtractionMetadata,
|
|
23
|
-
String,
|
|
24
|
-
Vec<Table>,
|
|
25
|
-
Option<Vec<PageContent>>,
|
|
26
|
-
);
|
|
27
20
|
|
|
21
|
+
// Re-export for backward compatibility
|
|
28
22
|
#[cfg(feature = "ocr")]
|
|
29
|
-
|
|
30
|
-
#[cfg(feature = "ocr")]
|
|
31
|
-
const MIN_NON_WHITESPACE_PER_PAGE: f64 = 32.0;
|
|
32
|
-
#[cfg(feature = "ocr")]
|
|
33
|
-
const MIN_MEANINGFUL_WORD_LEN: usize = 4;
|
|
34
|
-
#[cfg(feature = "ocr")]
|
|
35
|
-
const MIN_MEANINGFUL_WORDS: usize = 3;
|
|
36
|
-
#[cfg(feature = "ocr")]
|
|
37
|
-
const MIN_ALNUM_RATIO: f64 = 0.3;
|
|
38
|
-
|
|
39
|
-
#[cfg(feature = "ocr")]
|
|
40
|
-
struct NativeTextStats {
|
|
41
|
-
non_whitespace: usize,
|
|
42
|
-
alnum: usize,
|
|
43
|
-
meaningful_words: usize,
|
|
44
|
-
alnum_ratio: f64,
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
#[cfg(feature = "ocr")]
|
|
48
|
-
struct OcrFallbackDecision {
|
|
49
|
-
stats: NativeTextStats,
|
|
50
|
-
avg_non_whitespace: f64,
|
|
51
|
-
avg_alnum: f64,
|
|
52
|
-
fallback: bool,
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
#[cfg(feature = "ocr")]
|
|
56
|
-
impl NativeTextStats {
|
|
57
|
-
fn from(text: &str) -> Self {
|
|
58
|
-
let mut non_whitespace = 0usize;
|
|
59
|
-
let mut alnum = 0usize;
|
|
60
|
-
|
|
61
|
-
for ch in text.chars() {
|
|
62
|
-
if !ch.is_whitespace() {
|
|
63
|
-
non_whitespace += 1;
|
|
64
|
-
if ch.is_alphanumeric() {
|
|
65
|
-
alnum += 1;
|
|
66
|
-
}
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
let meaningful_words = text
|
|
71
|
-
.split_whitespace()
|
|
72
|
-
.filter(|word| {
|
|
73
|
-
word.chars()
|
|
74
|
-
.filter(|c| c.is_alphanumeric())
|
|
75
|
-
.take(MIN_MEANINGFUL_WORD_LEN)
|
|
76
|
-
.count()
|
|
77
|
-
>= MIN_MEANINGFUL_WORD_LEN
|
|
78
|
-
})
|
|
79
|
-
.take(MIN_MEANINGFUL_WORDS)
|
|
80
|
-
.count();
|
|
81
|
-
|
|
82
|
-
let alnum_ratio = if non_whitespace == 0 {
|
|
83
|
-
0.0
|
|
84
|
-
} else {
|
|
85
|
-
alnum as f64 / non_whitespace as f64
|
|
86
|
-
};
|
|
87
|
-
|
|
88
|
-
Self {
|
|
89
|
-
non_whitespace,
|
|
90
|
-
alnum,
|
|
91
|
-
meaningful_words,
|
|
92
|
-
alnum_ratio,
|
|
93
|
-
}
|
|
94
|
-
}
|
|
95
|
-
}
|
|
23
|
+
pub use ocr::{NativeTextStats, OcrFallbackDecision, evaluate_native_text_for_ocr};
|
|
96
24
|
|
|
25
|
+
use extraction::extract_all_from_document;
|
|
97
26
|
#[cfg(feature = "ocr")]
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
if trimmed.is_empty() {
|
|
102
|
-
let empty_stats = NativeTextStats {
|
|
103
|
-
non_whitespace: 0,
|
|
104
|
-
alnum: 0,
|
|
105
|
-
meaningful_words: 0,
|
|
106
|
-
alnum_ratio: 0.0,
|
|
107
|
-
};
|
|
108
|
-
return OcrFallbackDecision {
|
|
109
|
-
stats: empty_stats,
|
|
110
|
-
avg_non_whitespace: 0.0,
|
|
111
|
-
avg_alnum: 0.0,
|
|
112
|
-
fallback: true,
|
|
113
|
-
};
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
let stats = NativeTextStats::from(trimmed);
|
|
117
|
-
let pages = page_count.unwrap_or(1).max(1) as f64;
|
|
118
|
-
let avg_non_whitespace = stats.non_whitespace as f64 / pages;
|
|
119
|
-
let avg_alnum = stats.alnum as f64 / pages;
|
|
120
|
-
|
|
121
|
-
let has_substantial_text = stats.non_whitespace >= MIN_TOTAL_NON_WHITESPACE
|
|
122
|
-
&& avg_non_whitespace >= MIN_NON_WHITESPACE_PER_PAGE
|
|
123
|
-
&& stats.meaningful_words >= MIN_MEANINGFUL_WORDS;
|
|
124
|
-
|
|
125
|
-
let fallback = if stats.non_whitespace == 0 || stats.alnum == 0 {
|
|
126
|
-
true
|
|
127
|
-
} else if has_substantial_text {
|
|
128
|
-
false
|
|
129
|
-
} else if (stats.alnum_ratio < MIN_ALNUM_RATIO && avg_alnum < MIN_NON_WHITESPACE_PER_PAGE)
|
|
130
|
-
|| (stats.non_whitespace < MIN_TOTAL_NON_WHITESPACE && avg_non_whitespace < MIN_NON_WHITESPACE_PER_PAGE)
|
|
131
|
-
{
|
|
132
|
-
true
|
|
133
|
-
} else {
|
|
134
|
-
stats.meaningful_words == 0 && avg_non_whitespace < MIN_NON_WHITESPACE_PER_PAGE
|
|
135
|
-
};
|
|
136
|
-
|
|
137
|
-
OcrFallbackDecision {
|
|
138
|
-
stats,
|
|
139
|
-
avg_non_whitespace,
|
|
140
|
-
avg_alnum,
|
|
141
|
-
fallback,
|
|
142
|
-
}
|
|
143
|
-
}
|
|
144
|
-
|
|
145
|
-
/// Extract tables from PDF document using native text positions.
|
|
146
|
-
///
|
|
147
|
-
/// This function converts PDF character positions to HocrWord format,
|
|
148
|
-
/// then uses the existing table reconstruction logic to detect tables.
|
|
149
|
-
///
|
|
150
|
-
/// Uses the shared PdfDocument reference (wrapped in Arc<RwLock<>> for thread-safety).
|
|
151
|
-
#[cfg(all(feature = "pdf", feature = "ocr"))]
|
|
152
|
-
fn extract_tables_from_document(
|
|
153
|
-
document: &PdfDocument,
|
|
154
|
-
_metadata: &crate::pdf::metadata::PdfExtractionMetadata,
|
|
155
|
-
) -> Result<Vec<Table>> {
|
|
156
|
-
use crate::ocr::table::{reconstruct_table, table_to_markdown};
|
|
157
|
-
use crate::pdf::table::extract_words_from_page;
|
|
158
|
-
|
|
159
|
-
let mut all_tables = Vec::new();
|
|
160
|
-
|
|
161
|
-
for (page_index, page) in document.pages().iter().enumerate() {
|
|
162
|
-
let words = extract_words_from_page(&page, 0.0)?;
|
|
163
|
-
|
|
164
|
-
if words.is_empty() {
|
|
165
|
-
continue;
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
let column_threshold = 50;
|
|
169
|
-
let row_threshold_ratio = 0.5;
|
|
170
|
-
|
|
171
|
-
let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio);
|
|
172
|
-
|
|
173
|
-
if !table_cells.is_empty() {
|
|
174
|
-
let markdown = table_to_markdown(&table_cells);
|
|
175
|
-
|
|
176
|
-
all_tables.push(Table {
|
|
177
|
-
cells: table_cells,
|
|
178
|
-
markdown,
|
|
179
|
-
page_number: page_index + 1,
|
|
180
|
-
});
|
|
181
|
-
}
|
|
182
|
-
}
|
|
183
|
-
|
|
184
|
-
Ok(all_tables)
|
|
185
|
-
}
|
|
186
|
-
|
|
187
|
-
/// Fallback for when OCR feature is not enabled - returns empty tables.
|
|
188
|
-
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
|
|
189
|
-
fn extract_tables_from_document(
|
|
190
|
-
_document: &PdfDocument,
|
|
191
|
-
_metadata: &crate::pdf::metadata::PdfExtractionMetadata,
|
|
192
|
-
) -> Result<Vec<crate::types::Table>> {
|
|
193
|
-
Ok(vec![])
|
|
194
|
-
}
|
|
195
|
-
|
|
196
|
-
/// Helper function to assign tables and images to pages.
|
|
197
|
-
///
|
|
198
|
-
/// If page_contents is None, returns None (no per-page tracking enabled).
|
|
199
|
-
/// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
|
|
200
|
-
///
|
|
201
|
-
/// # Performance
|
|
202
|
-
///
|
|
203
|
-
/// Uses Arc::new to wrap tables and images, avoiding expensive copies.
|
|
204
|
-
/// This reduces memory overhead by enabling zero-copy sharing of table/image data
|
|
205
|
-
/// across multiple references (e.g., when the same table appears on multiple pages).
|
|
206
|
-
fn assign_tables_and_images_to_pages(
|
|
207
|
-
mut page_contents: Option<Vec<PageContent>>,
|
|
208
|
-
tables: &[crate::types::Table],
|
|
209
|
-
images: &[crate::types::ExtractedImage],
|
|
210
|
-
) -> Option<Vec<PageContent>> {
|
|
211
|
-
let pages = page_contents.take()?;
|
|
212
|
-
|
|
213
|
-
let mut updated_pages = pages;
|
|
214
|
-
|
|
215
|
-
for table in tables {
|
|
216
|
-
if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
|
|
217
|
-
page.tables.push(std::sync::Arc::new(table.clone()));
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
|
|
221
|
-
for image in images {
|
|
222
|
-
if let Some(page_num) = image.page_number
|
|
223
|
-
&& let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
|
|
224
|
-
{
|
|
225
|
-
page.images.push(std::sync::Arc::new(image.clone()));
|
|
226
|
-
}
|
|
227
|
-
}
|
|
228
|
-
|
|
229
|
-
Some(updated_pages)
|
|
230
|
-
}
|
|
27
|
+
use ocr::extract_with_ocr;
|
|
28
|
+
use pages::assign_tables_and_images_to_pages;
|
|
231
29
|
|
|
232
30
|
/// PDF document extractor using pypdfium2 and playa-pdf.
|
|
233
31
|
pub struct PdfExtractor;
|
|
@@ -242,105 +40,6 @@ impl PdfExtractor {
|
|
|
242
40
|
pub fn new() -> Self {
|
|
243
41
|
Self
|
|
244
42
|
}
|
|
245
|
-
|
|
246
|
-
/// Extract text, metadata, and tables from a PDF document using a single shared instance.
|
|
247
|
-
///
|
|
248
|
-
/// This method consolidates all PDF extraction phases (text, metadata, tables) into a single
|
|
249
|
-
/// operation using a single PdfDocument instance. This avoids redundant document parsing
|
|
250
|
-
/// and pdfium initialization overhead.
|
|
251
|
-
///
|
|
252
|
-
/// # Performance
|
|
253
|
-
///
|
|
254
|
-
/// By reusing a single document instance across all extraction phases, we eliminate:
|
|
255
|
-
/// - Duplicate document parsing overhead (25-40ms saved)
|
|
256
|
-
/// - Redundant pdfium bindings initialization
|
|
257
|
-
/// - Multiple page tree traversals
|
|
258
|
-
///
|
|
259
|
-
/// Expected improvement: 20-30% faster PDF processing.
|
|
260
|
-
///
|
|
261
|
-
/// # Returns
|
|
262
|
-
///
|
|
263
|
-
/// A tuple containing:
|
|
264
|
-
/// - PDF metadata (title, authors, dates, page structure, etc.)
|
|
265
|
-
/// - Native extracted text (or empty if using OCR)
|
|
266
|
-
/// - Extracted tables (if OCR feature enabled)
|
|
267
|
-
/// - Per-page content (if page extraction configured)
|
|
268
|
-
#[cfg(feature = "pdf")]
|
|
269
|
-
fn extract_all_from_document(
|
|
270
|
-
document: &PdfDocument,
|
|
271
|
-
config: &ExtractionConfig,
|
|
272
|
-
) -> Result<PdfExtractionPhaseResult> {
|
|
273
|
-
let (native_text, _boundaries, page_contents, pdf_metadata) =
|
|
274
|
-
crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;
|
|
275
|
-
|
|
276
|
-
let tables = extract_tables_from_document(document, &pdf_metadata)?;
|
|
277
|
-
|
|
278
|
-
Ok((pdf_metadata, native_text, tables, page_contents))
|
|
279
|
-
}
|
|
280
|
-
|
|
281
|
-
/// Extract text from PDF using OCR.
|
|
282
|
-
///
|
|
283
|
-
/// Renders all pages to images and processes them with OCR.
|
|
284
|
-
#[cfg(feature = "ocr")]
|
|
285
|
-
async fn extract_with_ocr(&self, content: &[u8], config: &ExtractionConfig) -> Result<String> {
|
|
286
|
-
use crate::plugins::registry::get_ocr_backend_registry;
|
|
287
|
-
use image::ImageEncoder;
|
|
288
|
-
use image::codecs::png::PngEncoder;
|
|
289
|
-
use std::io::Cursor;
|
|
290
|
-
|
|
291
|
-
let ocr_config = config.ocr.as_ref().ok_or_else(|| crate::KreuzbergError::Parsing {
|
|
292
|
-
message: "OCR config required for force_ocr".to_string(),
|
|
293
|
-
source: None,
|
|
294
|
-
})?;
|
|
295
|
-
|
|
296
|
-
let backend = {
|
|
297
|
-
let registry = get_ocr_backend_registry();
|
|
298
|
-
let registry = registry.read().map_err(|e| crate::KreuzbergError::Plugin {
|
|
299
|
-
message: format!("Failed to acquire read lock on OCR backend registry: {}", e),
|
|
300
|
-
plugin_name: "ocr-registry".to_string(),
|
|
301
|
-
})?;
|
|
302
|
-
registry.get(&ocr_config.backend)?
|
|
303
|
-
};
|
|
304
|
-
|
|
305
|
-
let images = {
|
|
306
|
-
let render_options = PageRenderOptions::default();
|
|
307
|
-
let renderer = PdfRenderer::new().map_err(|e| crate::KreuzbergError::Parsing {
|
|
308
|
-
message: format!("Failed to initialize PDF renderer: {}", e),
|
|
309
|
-
source: None,
|
|
310
|
-
})?;
|
|
311
|
-
|
|
312
|
-
renderer
|
|
313
|
-
.render_all_pages(content, &render_options)
|
|
314
|
-
.map_err(|e| crate::KreuzbergError::Parsing {
|
|
315
|
-
message: format!("Failed to render PDF pages: {}", e),
|
|
316
|
-
source: None,
|
|
317
|
-
})?
|
|
318
|
-
};
|
|
319
|
-
|
|
320
|
-
let mut page_texts = Vec::with_capacity(images.len());
|
|
321
|
-
|
|
322
|
-
for image in images {
|
|
323
|
-
let rgb_image = image.to_rgb8();
|
|
324
|
-
let (width, height) = rgb_image.dimensions();
|
|
325
|
-
|
|
326
|
-
let mut image_bytes = Cursor::new(Vec::new());
|
|
327
|
-
let encoder = PngEncoder::new(&mut image_bytes);
|
|
328
|
-
encoder
|
|
329
|
-
.write_image(&rgb_image, width, height, image::ColorType::Rgb8.into())
|
|
330
|
-
.map_err(|e| crate::KreuzbergError::Parsing {
|
|
331
|
-
message: format!("Failed to encode image: {}", e),
|
|
332
|
-
source: None,
|
|
333
|
-
})?;
|
|
334
|
-
|
|
335
|
-
let image_data = image_bytes.into_inner();
|
|
336
|
-
|
|
337
|
-
let ocr_result = backend.process_image(&image_data, ocr_config).await?;
|
|
338
|
-
|
|
339
|
-
page_texts.push(ocr_result.content);
|
|
340
|
-
}
|
|
341
|
-
|
|
342
|
-
Ok(page_texts.join("\n\n"))
|
|
343
|
-
}
|
|
344
43
|
}
|
|
345
44
|
|
|
346
45
|
impl Plugin for PdfExtractor {
|
|
@@ -404,7 +103,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
404
103
|
}
|
|
405
104
|
})?;
|
|
406
105
|
|
|
407
|
-
|
|
106
|
+
extract_all_from_document(&document, config)?
|
|
408
107
|
}
|
|
409
108
|
#[cfg(all(not(target_arch = "wasm32"), feature = "tokio-runtime"))]
|
|
410
109
|
{
|
|
@@ -428,7 +127,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
428
127
|
})?;
|
|
429
128
|
|
|
430
129
|
let (pdf_metadata, native_text, tables, page_contents) =
|
|
431
|
-
|
|
130
|
+
extract_all_from_document(&document, &config_owned)?;
|
|
432
131
|
|
|
433
132
|
if let Some(page_cfg) = config_owned.pages.as_ref()
|
|
434
133
|
&& page_cfg.extract_pages
|
|
@@ -458,7 +157,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
458
157
|
}
|
|
459
158
|
})?;
|
|
460
159
|
|
|
461
|
-
|
|
160
|
+
extract_all_from_document(&document, config)?
|
|
462
161
|
}
|
|
463
162
|
}
|
|
464
163
|
#[cfg(all(not(target_arch = "wasm32"), not(feature = "tokio-runtime")))]
|
|
@@ -475,19 +174,19 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
475
174
|
}
|
|
476
175
|
})?;
|
|
477
176
|
|
|
478
|
-
|
|
177
|
+
extract_all_from_document(&document, config)?
|
|
479
178
|
}
|
|
480
179
|
};
|
|
481
180
|
|
|
482
181
|
#[cfg(feature = "ocr")]
|
|
483
182
|
let text = if config.force_ocr {
|
|
484
183
|
if config.ocr.is_some() {
|
|
485
|
-
|
|
184
|
+
extract_with_ocr(content, config).await?
|
|
486
185
|
} else {
|
|
487
186
|
native_text
|
|
488
187
|
}
|
|
489
188
|
} else if config.ocr.is_some() {
|
|
490
|
-
let decision = evaluate_native_text_for_ocr(&native_text, None);
|
|
189
|
+
let decision = ocr::evaluate_native_text_for_ocr(&native_text, None);
|
|
491
190
|
|
|
492
191
|
if std::env::var("KREUZBERG_DEBUG_OCR").is_ok() {
|
|
493
192
|
eprintln!(
|
|
@@ -504,7 +203,7 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
504
203
|
}
|
|
505
204
|
|
|
506
205
|
if decision.fallback {
|
|
507
|
-
|
|
206
|
+
extract_with_ocr(content, config).await?
|
|
508
207
|
} else {
|
|
509
208
|
native_text
|
|
510
209
|
}
|
|
@@ -593,6 +292,8 @@ impl DocumentExtractor for PdfExtractor {
|
|
|
593
292
|
detected_languages: None,
|
|
594
293
|
chunks: None,
|
|
595
294
|
images,
|
|
295
|
+
djot_content: None,
|
|
296
|
+
elements: None,
|
|
596
297
|
})
|
|
597
298
|
}
|
|
598
299
|
|
|
@@ -640,21 +341,21 @@ mod tests {
|
|
|
640
341
|
#[cfg(feature = "ocr")]
|
|
641
342
|
#[test]
|
|
642
343
|
fn test_should_fallback_to_ocr_for_empty_text() {
|
|
643
|
-
assert!(evaluate_native_text_for_ocr("", Some(1)).fallback);
|
|
344
|
+
assert!(ocr::evaluate_native_text_for_ocr("", Some(1)).fallback);
|
|
644
345
|
}
|
|
645
346
|
|
|
646
347
|
#[cfg(feature = "ocr")]
|
|
647
348
|
#[test]
|
|
648
349
|
fn test_should_not_fallback_for_meaningful_text() {
|
|
649
350
|
let sample = "This page has searchable vector text and should avoid OCR.";
|
|
650
|
-
assert!(!evaluate_native_text_for_ocr(sample, Some(1)).fallback);
|
|
351
|
+
assert!(!ocr::evaluate_native_text_for_ocr(sample, Some(1)).fallback);
|
|
651
352
|
}
|
|
652
353
|
|
|
653
354
|
#[cfg(feature = "ocr")]
|
|
654
355
|
#[test]
|
|
655
356
|
fn test_should_fallback_for_punctuation_only_text() {
|
|
656
357
|
let sample = " . , ; : -- -- ";
|
|
657
|
-
assert!(evaluate_native_text_for_ocr(sample, Some(2)).fallback);
|
|
358
|
+
assert!(ocr::evaluate_native_text_for_ocr(sample, Some(2)).fallback);
|
|
658
359
|
}
|
|
659
360
|
|
|
660
361
|
#[tokio::test]
|
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
//! OCR functionality for PDF extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Handles text quality evaluation, OCR fallback decision logic, and OCR processing.
|
|
4
|
+
|
|
5
|
+
#[cfg(feature = "ocr")]
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
|
|
8
|
+
/// Minimum total number of non-whitespace characters for native text to be
/// considered substantial by `evaluate_native_text_for_ocr`.
#[cfg(feature = "ocr")]
pub(crate) const MIN_TOTAL_NON_WHITESPACE: usize = 64;
/// Per-page average threshold. `evaluate_native_text_for_ocr` compares both the
/// average non-whitespace count and the average alphanumeric count against it.
#[cfg(feature = "ocr")]
pub(crate) const MIN_NON_WHITESPACE_PER_PAGE: f64 = 32.0;
/// Number of alphanumeric characters a whitespace-separated word must contain
/// to be counted as "meaningful" by `NativeTextStats::from`.
#[cfg(feature = "ocr")]
pub(crate) const MIN_MEANINGFUL_WORD_LEN: usize = 4;
/// Number of meaningful words required for text to be considered substantial.
/// `NativeTextStats::from` stops counting once this many have been found.
#[cfg(feature = "ocr")]
pub(crate) const MIN_MEANINGFUL_WORDS: usize = 3;
/// Alphanumeric-to-non-whitespace ratio below which text is treated as mostly
/// non-alphanumeric when deciding on OCR fallback.
#[cfg(feature = "ocr")]
pub(crate) const MIN_ALNUM_RATIO: f64 = 0.3;
|
|
18
|
+
|
|
19
|
+
/// Character and word statistics computed over natively extracted PDF text.
#[cfg(feature = "ocr")]
pub struct NativeTextStats {
    /// Number of characters that are not whitespace.
    pub non_whitespace: usize,
    /// Number of non-whitespace characters that are alphanumeric.
    pub alnum: usize,
    /// Number of whitespace-separated words with at least
    /// `MIN_MEANINGFUL_WORD_LEN` alphanumeric characters. When produced by
    /// `NativeTextStats::from` this value is capped at `MIN_MEANINGFUL_WORDS`.
    pub meaningful_words: usize,
    /// `alnum / non_whitespace`, or `0.0` when there are no non-whitespace characters.
    pub alnum_ratio: f64,
}
|
|
26
|
+
|
|
27
|
+
/// Outcome of evaluating native PDF text for OCR fallback.
#[cfg(feature = "ocr")]
pub struct OcrFallbackDecision {
    /// Statistics of the evaluated (trimmed) text.
    pub stats: NativeTextStats,
    /// Non-whitespace character count divided by the page count used for the evaluation.
    pub avg_non_whitespace: f64,
    /// Alphanumeric character count divided by the page count used for the evaluation.
    pub avg_alnum: f64,
    /// `true` when the native text was judged insufficient and OCR should be used.
    pub fallback: bool,
}
|
|
34
|
+
|
|
35
|
+
#[cfg(feature = "ocr")]
impl NativeTextStats {
    /// Computes character and word statistics for `text`.
    ///
    /// Whitespace characters are ignored for the character counts. A word is a
    /// whitespace-separated token; it is "meaningful" when it contains at least
    /// `MIN_MEANINGFUL_WORD_LEN` alphanumeric characters. Word counting stops as
    /// soon as `MIN_MEANINGFUL_WORDS` meaningful words have been seen, so
    /// `meaningful_words` never exceeds that constant.
    pub fn from(text: &str) -> Self {
        // Single pass over the visible characters: (visible count, alphanumeric count).
        let (non_whitespace, alnum) = text
            .chars()
            .filter(|c| !c.is_whitespace())
            .fold((0usize, 0usize), |(visible, letters_digits), c| {
                (visible + 1, letters_digits + usize::from(c.is_alphanumeric()))
            });

        // `take` bounds the scan, so the count can reach the limit but never pass it.
        let is_meaningful = |word: &&str| {
            word.chars()
                .filter(|c| c.is_alphanumeric())
                .take(MIN_MEANINGFUL_WORD_LEN)
                .count()
                == MIN_MEANINGFUL_WORD_LEN
        };
        let meaningful_words = text
            .split_whitespace()
            .filter(is_meaningful)
            .take(MIN_MEANINGFUL_WORDS)
            .count();

        let alnum_ratio = match non_whitespace {
            0 => 0.0,
            visible => alnum as f64 / visible as f64,
        };

        Self {
            non_whitespace,
            alnum,
            meaningful_words,
            alnum_ratio,
        }
    }
}
|
|
76
|
+
|
|
77
|
+
/// Decides whether natively extracted PDF text is good enough or OCR should be used.
///
/// The text is trimmed first; empty text always requests OCR. Otherwise the
/// decision is based on the total and per-page counts of non-whitespace and
/// alphanumeric characters, the alphanumeric ratio, and the number of
/// meaningful words found in the text.
///
/// # Arguments
///
/// * `native_text` - Text obtained from the PDF without OCR
/// * `page_count` - Number of pages used for per-page averages; `None` and `0`
///   are both treated as a single page
///
/// # Returns
///
/// An `OcrFallbackDecision` holding the computed statistics, the per-page
/// averages, and the `fallback` flag (`true` means OCR should be used).
#[cfg(feature = "ocr")]
pub fn evaluate_native_text_for_ocr(native_text: &str, page_count: Option<usize>) -> OcrFallbackDecision {
    let text = native_text.trim();

    if text.is_empty() {
        return OcrFallbackDecision {
            stats: NativeTextStats {
                non_whitespace: 0,
                alnum: 0,
                meaningful_words: 0,
                alnum_ratio: 0.0,
            },
            avg_non_whitespace: 0.0,
            avg_alnum: 0.0,
            fallback: true,
        };
    }

    let stats = NativeTextStats::from(text);

    // Never divide by zero: a missing or zero page count counts as one page.
    let page_divisor = page_count.map_or(1, |count| count.max(1)) as f64;
    let avg_non_whitespace = stats.non_whitespace as f64 / page_divisor;
    let avg_alnum = stats.alnum as f64 / page_divisor;

    let nothing_readable = stats.non_whitespace == 0 || stats.alnum == 0;
    let has_substantial_text = stats.non_whitespace >= MIN_TOTAL_NON_WHITESPACE
        && avg_non_whitespace >= MIN_NON_WHITESPACE_PER_PAGE
        && stats.meaningful_words >= MIN_MEANINGFUL_WORDS;

    let sparse_per_page = avg_non_whitespace < MIN_NON_WHITESPACE_PER_PAGE;
    let mostly_non_alnum = stats.alnum_ratio < MIN_ALNUM_RATIO && avg_alnum < MIN_NON_WHITESPACE_PER_PAGE;
    let too_little_text = stats.non_whitespace < MIN_TOTAL_NON_WHITESPACE && sparse_per_page;
    let wordless_and_sparse = stats.meaningful_words == 0 && sparse_per_page;

    // Precedence: unreadable text always falls back, substantial text never does,
    // and everything in between falls back when any weakness signal fires.
    let fallback =
        nothing_readable || (!has_substantial_text && (mostly_non_alnum || too_little_text || wordless_and_sparse));

    OcrFallbackDecision {
        stats,
        avg_non_whitespace,
        avg_alnum,
        fallback,
    }
}
|
|
141
|
+
|
|
142
|
+
/// Runs OCR over every page of a PDF and returns the recognized text.
///
/// The backend named by the OCR configuration is looked up in the OCR backend
/// registry, all pages are rendered with default render options, and each page
/// is converted to RGB8, PNG-encoded and handed to the backend in page order.
///
/// # Arguments
///
/// * `content` - Raw PDF bytes
/// * `config` - Extraction configuration; `config.ocr` must be set
///
/// # Returns
///
/// The per-page OCR text joined with a blank line (`"\n\n"`) between pages.
///
/// # Errors
///
/// Returns a `Parsing` error when `config.ocr` is `None`, when the renderer
/// cannot be created, when rendering fails, or when a page cannot be
/// PNG-encoded; a `Plugin` error when the registry read lock cannot be
/// acquired; and propagates errors from the registry lookup and the backend.
#[cfg(feature = "ocr")]
pub(crate) async fn extract_with_ocr(content: &[u8], config: &ExtractionConfig) -> crate::Result<String> {
    use crate::pdf::rendering::{PageRenderOptions, PdfRenderer};
    use crate::plugins::registry::get_ocr_backend_registry;
    use image::ImageEncoder;
    use image::codecs::png::PngEncoder;
    use std::io::Cursor;

    // Shared constructor for the source-less parsing errors raised below.
    let parsing_error = |message: String| crate::KreuzbergError::Parsing { message, source: None };

    let ocr_settings = config
        .ocr
        .as_ref()
        .ok_or_else(|| parsing_error("OCR config required for force_ocr".to_string()))?;

    // The read guard lives only inside this block, so it is gone before any `.await`.
    let ocr_backend = {
        let registry_handle = get_ocr_backend_registry();
        let registry_guard = registry_handle.read().map_err(|e| crate::KreuzbergError::Plugin {
            message: format!("Failed to acquire read lock on OCR backend registry: {}", e),
            plugin_name: "ocr-registry".to_string(),
        })?;
        registry_guard.get(&ocr_settings.backend)?
    };

    let rendered_pages = {
        let options = PageRenderOptions::default();
        let pdf_renderer = PdfRenderer::new()
            .map_err(|e| parsing_error(format!("Failed to initialize PDF renderer: {}", e)))?;

        pdf_renderer
            .render_all_pages(content, &options)
            .map_err(|e| parsing_error(format!("Failed to render PDF pages: {}", e)))?
    };

    let mut recognized = Vec::with_capacity(rendered_pages.len());

    for page_image in rendered_pages {
        let rgb = page_image.to_rgb8();
        let (width, height) = rgb.dimensions();

        // Encode the page as PNG in memory; the backend receives the encoded bytes.
        let mut png_buffer = Cursor::new(Vec::new());
        PngEncoder::new(&mut png_buffer)
            .write_image(&rgb, width, height, image::ColorType::Rgb8.into())
            .map_err(|e| parsing_error(format!("Failed to encode image: {}", e)))?;
        let png_bytes = png_buffer.into_inner();

        let page_result = ocr_backend.process_image(&png_bytes, ocr_settings).await?;
        recognized.push(page_result.content);
    }

    Ok(recognized.join("\n\n"))
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
//! Page content management for PDF extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! Handles assignment of tables and images to specific pages.
|
|
4
|
+
|
|
5
|
+
use crate::types::PageContent;
|
|
6
|
+
|
|
7
|
+
/// Helper function to assign tables and images to pages.
|
|
8
|
+
///
|
|
9
|
+
/// If page_contents is None, returns None (no per-page tracking enabled).
|
|
10
|
+
/// Otherwise, iterates through tables and images, assigning them to pages based on page_number.
|
|
11
|
+
///
|
|
12
|
+
/// # Performance
|
|
13
|
+
///
|
|
14
|
+
/// Uses Arc::new to wrap tables and images, avoiding expensive copies.
|
|
15
|
+
/// This reduces memory overhead by enabling zero-copy sharing of table/image data
|
|
16
|
+
/// across multiple references (e.g., when the same table appears on multiple pages).
|
|
17
|
+
///
|
|
18
|
+
/// # Arguments
|
|
19
|
+
///
|
|
20
|
+
/// * `page_contents` - Optional vector of page contents to populate
|
|
21
|
+
/// * `tables` - Slice of tables to assign to pages
|
|
22
|
+
/// * `images` - Slice of images to assign to pages
|
|
23
|
+
///
|
|
24
|
+
/// # Returns
|
|
25
|
+
///
|
|
26
|
+
/// Updated page contents with tables and images assigned, or None if page tracking disabled
|
|
27
|
+
pub(crate) fn assign_tables_and_images_to_pages(
|
|
28
|
+
mut page_contents: Option<Vec<PageContent>>,
|
|
29
|
+
tables: &[crate::types::Table],
|
|
30
|
+
images: &[crate::types::ExtractedImage],
|
|
31
|
+
) -> Option<Vec<PageContent>> {
|
|
32
|
+
let pages = page_contents.take()?;
|
|
33
|
+
|
|
34
|
+
let mut updated_pages = pages;
|
|
35
|
+
|
|
36
|
+
for table in tables {
|
|
37
|
+
if let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == table.page_number) {
|
|
38
|
+
page.tables.push(std::sync::Arc::new(table.clone()));
|
|
39
|
+
}
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
for image in images {
|
|
43
|
+
if let Some(page_num) = image.page_number
|
|
44
|
+
&& let Some(page) = updated_pages.iter_mut().find(|p| p.page_number == page_num)
|
|
45
|
+
{
|
|
46
|
+
page.images.push(std::sync::Arc::new(image.clone()));
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
Some(updated_pages)
|
|
51
|
+
}
|