kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,1713 +0,0 @@
|
|
|
1
|
-
use serde::{Deserialize, Serialize};
|
|
2
|
-
use std::collections::{BTreeMap, HashMap};
|
|
3
|
-
use std::sync::Arc;
|
|
4
|
-
|
|
5
|
-
#[cfg(feature = "pdf")]
|
|
6
|
-
use crate::pdf::metadata::PdfMetadata;
|
|
7
|
-
|
|
8
|
-
// ============================================================================
|
|
9
|
-
// ============================================================================
|
|
10
|
-
|
|
11
|
-
/// Module providing transparent serde support for Arc<T>.
|
|
12
|
-
///
|
|
13
|
-
/// Allows Arc-wrapped types to serialize/deserialize as if unwrapped,
|
|
14
|
-
/// maintaining exact JSON format while preserving memory efficiency benefits.
|
|
15
|
-
///
|
|
16
|
-
/// # Arc Sharing Semantics
|
|
17
|
-
///
|
|
18
|
-
/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
|
|
19
|
-
/// When deserializing, each Arc is independently created with `Arc::new()`.
|
|
20
|
-
/// This means that if two Arcs referenced the same data before serialization,
|
|
21
|
-
/// they will be separate Arcs after deserialization.
|
|
22
|
-
///
|
|
23
|
-
/// Example:
|
|
24
|
-
/// ```ignore
|
|
25
|
-
/// let shared = Arc::new(Table { /* ... */ });
|
|
26
|
-
/// let tables = vec![Arc::clone(&shared), Arc::clone(&shared)];
|
|
27
|
-
/// // Both in-memory Arcs point to the same Table
|
|
28
|
-
///
|
|
29
|
-
/// let json = serde_json::to_string(&tables)?;
|
|
30
|
-
/// let deserialized: Vec<Arc<Table>> = serde_json::from_str(&json)?;
|
|
31
|
-
/// // deserialized[0] and deserialized[1] are now independent Arcs,
|
|
32
|
-
/// // even though they contain identical data
|
|
33
|
-
/// ```
|
|
34
|
-
///
|
|
35
|
-
/// This design choice maintains:
|
|
36
|
-
/// - Exact JSON format compatibility (no sharing metadata in JSON)
|
|
37
|
-
/// - Predictable deserialization behavior
|
|
38
|
-
/// - Zero additional serialization overhead
|
|
39
|
-
///
|
|
40
|
-
/// If in-memory sharing is required, callers must implement custom sharing logic
|
|
41
|
-
/// or use a different data structure (like a HashMap of deduplicated values).
|
|
42
|
-
#[allow(dead_code)]
|
|
43
|
-
mod serde_arc {
|
|
44
|
-
use serde::{Deserialize, Deserializer, Serializer};
|
|
45
|
-
use std::sync::Arc;
|
|
46
|
-
|
|
47
|
-
/// Serialize an Arc<T> by serializing the inner value directly.
|
|
48
|
-
///
|
|
49
|
-
/// This makes Arc<T> serialize identically to T, maintaining API compatibility.
|
|
50
|
-
/// The outer Arc wrapper is transparent during serialization.
|
|
51
|
-
pub fn serialize<S, T>(arc_value: &Arc<T>, serializer: S) -> Result<S::Ok, S::Error>
|
|
52
|
-
where
|
|
53
|
-
S: Serializer,
|
|
54
|
-
T: serde::Serialize,
|
|
55
|
-
{
|
|
56
|
-
(**arc_value).serialize(serializer)
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
/// Deserialize a T and wrap it in Arc.
|
|
60
|
-
///
|
|
61
|
-
/// This makes Arc<T> deserialize from the same format as T.
|
|
62
|
-
/// Each Arc is independently created during deserialization;
|
|
63
|
-
/// Arc sharing from before serialization is NOT preserved.
|
|
64
|
-
pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Arc<T>, D::Error>
|
|
65
|
-
where
|
|
66
|
-
D: Deserializer<'de>,
|
|
67
|
-
T: Deserialize<'de>,
|
|
68
|
-
{
|
|
69
|
-
T::deserialize(deserializer).map(Arc::new)
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
/// Module for serializing Vec<Arc<T>> with transparent Arc handling.
|
|
74
|
-
///
|
|
75
|
-
/// Serializes a Vec<Arc<T>> as Vec<T> for compatibility, while preserving
|
|
76
|
-
/// Arc semantics for memory efficiency.
|
|
77
|
-
///
|
|
78
|
-
/// # Arc Sharing Semantics
|
|
79
|
-
///
|
|
80
|
-
/// **Important**: Arc sharing semantics are **NOT** preserved across serialization.
|
|
81
|
-
/// When deserializing, each element's Arc is independently created with `Arc::new()`.
|
|
82
|
-
/// This is important for `PageContent` where tables/images may be shared across pages.
|
|
83
|
-
///
|
|
84
|
-
/// Example with shared tables:
|
|
85
|
-
/// ```ignore
|
|
86
|
-
/// let shared_table = Arc::new(Table { /* ... */ });
|
|
87
|
-
/// let page_contents = vec![
|
|
88
|
-
/// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
|
|
89
|
-
/// PageContent { tables: vec![Arc::clone(&shared_table)], ... },
|
|
90
|
-
/// ];
|
|
91
|
-
/// // In-memory: both pages' tables point to the same Arc
|
|
92
|
-
///
|
|
93
|
-
/// let json = serde_json::to_string(&page_contents)?;
|
|
94
|
-
/// let deserialized = serde_json::from_str::<Vec<PageContent>>(&json)?;
|
|
95
|
-
/// // After deserialization: each page has independent Arc instances,
|
|
96
|
-
/// // even though the table data is identical
|
|
97
|
-
/// ```
|
|
98
|
-
///
|
|
99
|
-
/// Design rationale:
|
|
100
|
-
/// - JSON has no mechanism to represent shared references
|
|
101
|
-
/// - Preserving sharing would require complex metadata and deduplication
|
|
102
|
-
/// - Current approach is simple, predictable, and maintains compatibility
|
|
103
|
-
/// - In-memory sharing (via Arc) is an implementation detail for the Rust side
|
|
104
|
-
///
|
|
105
|
-
/// If in-memory sharing is required after deserialization, implement custom
|
|
106
|
-
/// deduplication logic using hashing or content comparison.
|
|
107
|
-
mod serde_vec_arc {
|
|
108
|
-
use serde::{Deserialize, Deserializer, Serializer};
|
|
109
|
-
use std::sync::Arc;
|
|
110
|
-
|
|
111
|
-
/// Serialize Vec<Arc<T>> by serializing each T directly.
|
|
112
|
-
///
|
|
113
|
-
/// Each element is unwrapped from its Arc and serialized independently.
|
|
114
|
-
/// No sharing metadata is included in the serialized output.
|
|
115
|
-
pub fn serialize<S, T>(vec: &[Arc<T>], serializer: S) -> Result<S::Ok, S::Error>
|
|
116
|
-
where
|
|
117
|
-
S: Serializer,
|
|
118
|
-
T: serde::Serialize,
|
|
119
|
-
{
|
|
120
|
-
use serde::ser::SerializeSeq;
|
|
121
|
-
let mut seq = serializer.serialize_seq(Some(vec.len()))?;
|
|
122
|
-
for arc_item in vec {
|
|
123
|
-
seq.serialize_element(&**arc_item)?;
|
|
124
|
-
}
|
|
125
|
-
seq.end()
|
|
126
|
-
}
|
|
127
|
-
|
|
128
|
-
/// Deserialize Vec<T> and wrap each element in Arc.
|
|
129
|
-
///
|
|
130
|
-
/// Each element is independently wrapped in a new Arc.
|
|
131
|
-
/// Sharing relationships from before serialization are lost.
|
|
132
|
-
pub fn deserialize<'de, D, T>(deserializer: D) -> Result<Vec<Arc<T>>, D::Error>
|
|
133
|
-
where
|
|
134
|
-
D: Deserializer<'de>,
|
|
135
|
-
T: Deserialize<'de>,
|
|
136
|
-
{
|
|
137
|
-
let vec: Vec<T> = Deserialize::deserialize(deserializer)?;
|
|
138
|
-
Ok(vec.into_iter().map(Arc::new).collect())
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
/// General extraction result used by the core extraction API.
|
|
143
|
-
///
|
|
144
|
-
/// This is the main result type returned by all extraction functions.
|
|
145
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
146
|
-
pub struct ExtractionResult {
|
|
147
|
-
pub content: String,
|
|
148
|
-
pub mime_type: String,
|
|
149
|
-
pub metadata: Metadata,
|
|
150
|
-
pub tables: Vec<Table>,
|
|
151
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
152
|
-
pub detected_languages: Option<Vec<String>>,
|
|
153
|
-
|
|
154
|
-
/// Text chunks when chunking is enabled.
|
|
155
|
-
///
|
|
156
|
-
/// When chunking configuration is provided, the content is split into
|
|
157
|
-
/// overlapping chunks for efficient processing. Each chunk contains the text,
|
|
158
|
-
/// optional embeddings (if enabled), and metadata about its position.
|
|
159
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
160
|
-
pub chunks: Option<Vec<Chunk>>,
|
|
161
|
-
|
|
162
|
-
/// Extracted images from the document.
|
|
163
|
-
///
|
|
164
|
-
/// When image extraction is enabled via `ImageExtractionConfig`, this field
|
|
165
|
-
/// contains all images found in the document with their raw data and metadata.
|
|
166
|
-
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|
167
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
168
|
-
pub images: Option<Vec<ExtractedImage>>,
|
|
169
|
-
|
|
170
|
-
/// Per-page content when page extraction is enabled.
|
|
171
|
-
///
|
|
172
|
-
/// When page extraction is configured, the document is split into per-page content
|
|
173
|
-
/// with tables and images mapped to their respective pages.
|
|
174
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
175
|
-
pub pages: Option<Vec<PageContent>>,
|
|
176
|
-
}
|
|
177
|
-
|
|
178
|
-
/// Format-specific metadata (discriminated union).
|
|
179
|
-
///
|
|
180
|
-
/// Only one format type can exist per extraction result. This provides
|
|
181
|
-
/// type-safe, clean metadata without nested optionals.
|
|
182
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
183
|
-
#[serde(tag = "format_type", rename_all = "snake_case")]
|
|
184
|
-
pub enum FormatMetadata {
|
|
185
|
-
#[cfg(feature = "pdf")]
|
|
186
|
-
Pdf(PdfMetadata),
|
|
187
|
-
Excel(ExcelMetadata),
|
|
188
|
-
Email(EmailMetadata),
|
|
189
|
-
Pptx(PptxMetadata),
|
|
190
|
-
Archive(ArchiveMetadata),
|
|
191
|
-
Image(ImageMetadata),
|
|
192
|
-
Xml(XmlMetadata),
|
|
193
|
-
Text(TextMetadata),
|
|
194
|
-
Html(Box<HtmlMetadata>),
|
|
195
|
-
Ocr(OcrMetadata),
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
/// Extraction result metadata.
|
|
199
|
-
///
|
|
200
|
-
/// Contains common fields applicable to all formats, format-specific metadata
|
|
201
|
-
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
202
|
-
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
203
|
-
pub struct Metadata {
|
|
204
|
-
/// Document title
|
|
205
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
206
|
-
pub title: Option<String>,
|
|
207
|
-
|
|
208
|
-
/// Document subject or description
|
|
209
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
210
|
-
pub subject: Option<String>,
|
|
211
|
-
|
|
212
|
-
/// Primary author(s) - always Vec for consistency
|
|
213
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
214
|
-
pub authors: Option<Vec<String>>,
|
|
215
|
-
|
|
216
|
-
/// Keywords/tags - always Vec for consistency
|
|
217
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
218
|
-
pub keywords: Option<Vec<String>>,
|
|
219
|
-
|
|
220
|
-
/// Primary language (ISO 639 code)
|
|
221
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
222
|
-
pub language: Option<String>,
|
|
223
|
-
|
|
224
|
-
/// Creation timestamp (ISO 8601 format)
|
|
225
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
226
|
-
pub created_at: Option<String>,
|
|
227
|
-
|
|
228
|
-
/// Last modification timestamp (ISO 8601 format)
|
|
229
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
230
|
-
pub modified_at: Option<String>,
|
|
231
|
-
|
|
232
|
-
/// User who created the document
|
|
233
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
234
|
-
pub created_by: Option<String>,
|
|
235
|
-
|
|
236
|
-
/// User who last modified the document
|
|
237
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
238
|
-
pub modified_by: Option<String>,
|
|
239
|
-
|
|
240
|
-
/// Page/slide/sheet structure with boundaries
|
|
241
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
242
|
-
pub pages: Option<PageStructure>,
|
|
243
|
-
|
|
244
|
-
/// Format-specific metadata (discriminated union)
|
|
245
|
-
///
|
|
246
|
-
/// Contains detailed metadata specific to the document format.
|
|
247
|
-
/// Serializes with a `format_type` discriminator field.
|
|
248
|
-
#[serde(flatten, skip_serializing_if = "Option::is_none")]
|
|
249
|
-
pub format: Option<FormatMetadata>,
|
|
250
|
-
|
|
251
|
-
/// Image preprocessing metadata (when OCR preprocessing was applied)
|
|
252
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
253
|
-
pub image_preprocessing: Option<ImagePreprocessingMetadata>,
|
|
254
|
-
|
|
255
|
-
/// JSON schema (for structured data extraction)
|
|
256
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
257
|
-
pub json_schema: Option<serde_json::Value>,
|
|
258
|
-
|
|
259
|
-
/// Error metadata (for batch operations)
|
|
260
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
261
|
-
pub error: Option<ErrorMetadata>,
|
|
262
|
-
|
|
263
|
-
/// Additional custom fields from postprocessors.
|
|
264
|
-
///
|
|
265
|
-
/// This flattened HashMap allows Python/TypeScript postprocessors to add
|
|
266
|
-
/// arbitrary fields (entity extraction, keyword extraction, etc.).
|
|
267
|
-
/// Fields are merged at the root level during serialization.
|
|
268
|
-
#[serde(flatten)]
|
|
269
|
-
pub additional: HashMap<String, serde_json::Value>,
|
|
270
|
-
}
|
|
271
|
-
|
|
272
|
-
/// Unified page structure for documents.
|
|
273
|
-
///
|
|
274
|
-
/// Supports different page types (PDF pages, PPTX slides, Excel sheets)
|
|
275
|
-
/// with byte offset boundaries for chunk-to-page mapping.
|
|
276
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
277
|
-
pub struct PageStructure {
|
|
278
|
-
/// Total number of pages/slides/sheets
|
|
279
|
-
pub total_count: usize,
|
|
280
|
-
|
|
281
|
-
/// Type of paginated unit
|
|
282
|
-
pub unit_type: PageUnitType,
|
|
283
|
-
|
|
284
|
-
/// Byte offset boundaries for each page
|
|
285
|
-
///
|
|
286
|
-
/// Maps byte ranges in the extracted content to page numbers.
|
|
287
|
-
/// Used for chunk page range calculation.
|
|
288
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
289
|
-
pub boundaries: Option<Vec<PageBoundary>>,
|
|
290
|
-
|
|
291
|
-
/// Detailed per-page metadata (optional, only when needed)
|
|
292
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
293
|
-
pub pages: Option<Vec<PageInfo>>,
|
|
294
|
-
}
|
|
295
|
-
|
|
296
|
-
/// Type of paginated unit in a document.
|
|
297
|
-
///
|
|
298
|
-
/// Distinguishes between different types of "pages" (PDF pages, presentation slides, spreadsheet sheets).
|
|
299
|
-
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
300
|
-
#[serde(rename_all = "snake_case")]
|
|
301
|
-
pub enum PageUnitType {
|
|
302
|
-
/// Standard document pages (PDF, DOCX, images)
|
|
303
|
-
Page,
|
|
304
|
-
/// Presentation slides (PPTX, ODP)
|
|
305
|
-
Slide,
|
|
306
|
-
/// Spreadsheet sheets (XLSX, ODS)
|
|
307
|
-
Sheet,
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
/// Byte offset boundary for a page.
|
|
311
|
-
///
|
|
312
|
-
/// Tracks where a specific page's content starts and ends in the main content string,
|
|
313
|
-
/// enabling mapping from byte positions to page numbers. Offsets are guaranteed to be
|
|
314
|
-
/// at valid UTF-8 character boundaries when using standard String methods (push_str, push, etc.).
|
|
315
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
316
|
-
pub struct PageBoundary {
|
|
317
|
-
/// Byte offset where this page starts in the content string (UTF-8 valid boundary, inclusive)
|
|
318
|
-
pub byte_start: usize,
|
|
319
|
-
/// Byte offset where this page ends in the content string (UTF-8 valid boundary, exclusive)
|
|
320
|
-
pub byte_end: usize,
|
|
321
|
-
/// Page number (1-indexed)
|
|
322
|
-
pub page_number: usize,
|
|
323
|
-
}
|
|
324
|
-
|
|
325
|
-
/// Metadata for individual page/slide/sheet.
|
|
326
|
-
///
|
|
327
|
-
/// Captures per-page information including dimensions, content counts,
|
|
328
|
-
/// and visibility state (for presentations).
|
|
329
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
330
|
-
pub struct PageInfo {
|
|
331
|
-
/// Page number (1-indexed)
|
|
332
|
-
pub number: usize,
|
|
333
|
-
|
|
334
|
-
/// Page title (usually for presentations)
|
|
335
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
336
|
-
pub title: Option<String>,
|
|
337
|
-
|
|
338
|
-
/// Dimensions in points (PDF) or pixels (images): (width, height)
|
|
339
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
340
|
-
pub dimensions: Option<(f64, f64)>,
|
|
341
|
-
|
|
342
|
-
/// Number of images on this page
|
|
343
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
344
|
-
pub image_count: Option<usize>,
|
|
345
|
-
|
|
346
|
-
/// Number of tables on this page
|
|
347
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
348
|
-
pub table_count: Option<usize>,
|
|
349
|
-
|
|
350
|
-
/// Whether this page is hidden (e.g., in presentations)
|
|
351
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
352
|
-
pub hidden: Option<bool>,
|
|
353
|
-
}
|
|
354
|
-
|
|
355
|
-
/// Content for a single page/slide.
|
|
356
|
-
///
|
|
357
|
-
/// When page extraction is enabled, documents are split into per-page content
|
|
358
|
-
/// with associated tables and images mapped to each page.
|
|
359
|
-
///
|
|
360
|
-
/// # Performance
|
|
361
|
-
///
|
|
362
|
-
/// Uses Arc-wrapped tables and images for memory efficiency:
|
|
363
|
-
/// - `Vec<Arc<Table>>` enables zero-copy sharing of table data
|
|
364
|
-
/// - `Vec<Arc<ExtractedImage>>` enables zero-copy sharing of image data
|
|
365
|
-
/// - Maintains exact JSON compatibility via custom Serialize/Deserialize
|
|
366
|
-
///
|
|
367
|
-
/// This reduces memory overhead for documents with shared tables/images
|
|
368
|
-
/// by avoiding redundant copies during serialization.
|
|
369
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
370
|
-
pub struct PageContent {
|
|
371
|
-
/// Page number (1-indexed)
|
|
372
|
-
pub page_number: usize,
|
|
373
|
-
|
|
374
|
-
/// Text content for this page
|
|
375
|
-
pub content: String,
|
|
376
|
-
|
|
377
|
-
/// Tables found on this page (uses Arc for memory efficiency)
|
|
378
|
-
///
|
|
379
|
-
/// Serializes as Vec<Table> for JSON compatibility while maintaining
|
|
380
|
-
/// Arc semantics in-memory for zero-copy sharing.
|
|
381
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
382
|
-
pub tables: Vec<Arc<Table>>,
|
|
383
|
-
|
|
384
|
-
/// Images found on this page (uses Arc for memory efficiency)
|
|
385
|
-
///
|
|
386
|
-
/// Serializes as Vec<ExtractedImage> for JSON compatibility while maintaining
|
|
387
|
-
/// Arc semantics in-memory for zero-copy sharing.
|
|
388
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default, with = "serde_vec_arc")]
|
|
389
|
-
pub images: Vec<Arc<ExtractedImage>>,
|
|
390
|
-
|
|
391
|
-
/// Hierarchy information for the page (when hierarchy extraction is enabled)
|
|
392
|
-
///
|
|
393
|
-
/// Contains text hierarchy levels (H1-H6) extracted from the page content.
|
|
394
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
395
|
-
pub hierarchy: Option<PageHierarchy>,
|
|
396
|
-
}
|
|
397
|
-
|
|
398
|
-
/// Page hierarchy structure containing heading levels and block information.
|
|
399
|
-
///
|
|
400
|
-
/// Used when PDF text hierarchy extraction is enabled. Contains hierarchical
|
|
401
|
-
/// blocks with heading levels (H1-H6) for semantic document structure.
|
|
402
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
403
|
-
pub struct PageHierarchy {
|
|
404
|
-
/// Number of hierarchy blocks on this page
|
|
405
|
-
pub block_count: usize,
|
|
406
|
-
|
|
407
|
-
/// Hierarchical blocks with heading levels
|
|
408
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
409
|
-
pub blocks: Vec<HierarchicalBlock>,
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
/// A text block with hierarchy level assignment.
|
|
413
|
-
///
|
|
414
|
-
/// Represents a block of text with semantic heading information extracted from
|
|
415
|
-
/// font size clustering and hierarchical analysis.
|
|
416
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
417
|
-
pub struct HierarchicalBlock {
|
|
418
|
-
/// The text content of this block
|
|
419
|
-
pub text: String,
|
|
420
|
-
|
|
421
|
-
/// The font size of the text in this block
|
|
422
|
-
pub font_size: f32,
|
|
423
|
-
|
|
424
|
-
/// The hierarchy level of this block (H1-H6 or Body)
|
|
425
|
-
///
|
|
426
|
-
/// Levels correspond to HTML heading tags:
|
|
427
|
-
/// - "h1": Top-level heading
|
|
428
|
-
/// - "h2": Secondary heading
|
|
429
|
-
/// - "h3": Tertiary heading
|
|
430
|
-
/// - "h4": Quaternary heading
|
|
431
|
-
/// - "h5": Quinary heading
|
|
432
|
-
/// - "h6": Senary heading
|
|
433
|
-
/// - "body": Body text (no heading level)
|
|
434
|
-
pub level: String,
|
|
435
|
-
|
|
436
|
-
/// Bounding box information for the block
|
|
437
|
-
///
|
|
438
|
-
/// Contains coordinates as (left, top, right, bottom) in PDF units.
|
|
439
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
440
|
-
pub bbox: Option<(f32, f32, f32, f32)>,
|
|
441
|
-
}
|
|
442
|
-
|
|
443
|
-
/// Excel/spreadsheet metadata.
|
|
444
|
-
///
|
|
445
|
-
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
446
|
-
/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
|
|
447
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
448
|
-
pub struct ExcelMetadata {
|
|
449
|
-
/// Total number of sheets in the workbook
|
|
450
|
-
pub sheet_count: usize,
|
|
451
|
-
/// Names of all sheets in order
|
|
452
|
-
pub sheet_names: Vec<String>,
|
|
453
|
-
}
|
|
454
|
-
|
|
455
|
-
/// Email metadata extracted from .eml and .msg files.
|
|
456
|
-
///
|
|
457
|
-
/// Includes sender/recipient information, message ID, and attachment list.
|
|
458
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
459
|
-
pub struct EmailMetadata {
|
|
460
|
-
/// Sender's email address
|
|
461
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
462
|
-
pub from_email: Option<String>,
|
|
463
|
-
|
|
464
|
-
/// Sender's display name
|
|
465
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
466
|
-
pub from_name: Option<String>,
|
|
467
|
-
|
|
468
|
-
/// Primary recipients
|
|
469
|
-
pub to_emails: Vec<String>,
|
|
470
|
-
/// CC recipients
|
|
471
|
-
pub cc_emails: Vec<String>,
|
|
472
|
-
/// BCC recipients
|
|
473
|
-
pub bcc_emails: Vec<String>,
|
|
474
|
-
|
|
475
|
-
/// Message-ID header value
|
|
476
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
477
|
-
pub message_id: Option<String>,
|
|
478
|
-
|
|
479
|
-
/// List of attachment filenames
|
|
480
|
-
pub attachments: Vec<String>,
|
|
481
|
-
}
|
|
482
|
-
|
|
483
|
-
/// Archive (ZIP/TAR/7Z) metadata.
|
|
484
|
-
///
|
|
485
|
-
/// Extracted from compressed archive files containing file lists and size information.
|
|
486
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
487
|
-
pub struct ArchiveMetadata {
|
|
488
|
-
/// Archive format ("ZIP", "TAR", "7Z", etc.)
|
|
489
|
-
pub format: String,
|
|
490
|
-
/// Total number of files in the archive
|
|
491
|
-
pub file_count: usize,
|
|
492
|
-
/// List of file paths within the archive
|
|
493
|
-
pub file_list: Vec<String>,
|
|
494
|
-
/// Total uncompressed size in bytes
|
|
495
|
-
pub total_size: usize,
|
|
496
|
-
|
|
497
|
-
/// Compressed size in bytes (if available)
|
|
498
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
499
|
-
pub compressed_size: Option<usize>,
|
|
500
|
-
}
|
|
501
|
-
|
|
502
|
-
/// Image metadata extracted from image files.
|
|
503
|
-
///
|
|
504
|
-
/// Includes dimensions, format, and EXIF data.
|
|
505
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
506
|
-
pub struct ImageMetadata {
|
|
507
|
-
/// Image width in pixels
|
|
508
|
-
pub width: u32,
|
|
509
|
-
/// Image height in pixels
|
|
510
|
-
pub height: u32,
|
|
511
|
-
/// Image format (e.g., "PNG", "JPEG", "TIFF")
|
|
512
|
-
pub format: String,
|
|
513
|
-
/// EXIF metadata tags
|
|
514
|
-
pub exif: HashMap<String, String>,
|
|
515
|
-
}
|
|
516
|
-
|
|
517
|
-
/// XML metadata extracted during XML parsing.
|
|
518
|
-
///
|
|
519
|
-
/// Provides statistics about XML document structure.
|
|
520
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
521
|
-
pub struct XmlMetadata {
|
|
522
|
-
/// Total number of XML elements processed
|
|
523
|
-
pub element_count: usize,
|
|
524
|
-
/// List of unique element tag names (sorted)
|
|
525
|
-
pub unique_elements: Vec<String>,
|
|
526
|
-
}
|
|
527
|
-
|
|
528
|
-
/// Text/Markdown metadata.
|
|
529
|
-
///
|
|
530
|
-
/// Extracted from plain text and Markdown files. Includes word counts and,
|
|
531
|
-
/// for Markdown, structural elements like headers and links.
|
|
532
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
533
|
-
pub struct TextMetadata {
|
|
534
|
-
/// Number of lines in the document
|
|
535
|
-
pub line_count: usize,
|
|
536
|
-
/// Number of words
|
|
537
|
-
pub word_count: usize,
|
|
538
|
-
/// Number of characters
|
|
539
|
-
pub character_count: usize,
|
|
540
|
-
|
|
541
|
-
/// Markdown headers (headings text only, for Markdown files)
|
|
542
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
543
|
-
pub headers: Option<Vec<String>>,
|
|
544
|
-
|
|
545
|
-
/// Markdown links as (text, url) tuples (for Markdown files)
|
|
546
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
547
|
-
pub links: Option<Vec<(String, String)>>,
|
|
548
|
-
|
|
549
|
-
/// Code blocks as (language, code) tuples (for Markdown files)
|
|
550
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
551
|
-
pub code_blocks: Option<Vec<(String, String)>>,
|
|
552
|
-
}
|
|
553
|
-
|
|
554
|
-
/// Text direction enumeration for HTML documents.
|
|
555
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
556
|
-
#[serde(rename_all = "lowercase")]
|
|
557
|
-
pub enum TextDirection {
|
|
558
|
-
/// Left-to-right text direction
|
|
559
|
-
#[serde(rename = "ltr")]
|
|
560
|
-
LeftToRight,
|
|
561
|
-
/// Right-to-left text direction
|
|
562
|
-
#[serde(rename = "rtl")]
|
|
563
|
-
RightToLeft,
|
|
564
|
-
/// Automatic text direction detection
|
|
565
|
-
#[serde(rename = "auto")]
|
|
566
|
-
Auto,
|
|
567
|
-
}
|
|
568
|
-
|
|
569
|
-
/// Header/heading element metadata.
|
|
570
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
571
|
-
pub struct HeaderMetadata {
|
|
572
|
-
/// Header level: 1 (h1) through 6 (h6)
|
|
573
|
-
pub level: u8,
|
|
574
|
-
/// Normalized text content of the header
|
|
575
|
-
pub text: String,
|
|
576
|
-
/// HTML id attribute if present
|
|
577
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
578
|
-
pub id: Option<String>,
|
|
579
|
-
/// Document tree depth at the header element
|
|
580
|
-
pub depth: usize,
|
|
581
|
-
/// Byte offset in original HTML document
|
|
582
|
-
pub html_offset: usize,
|
|
583
|
-
}
|
|
584
|
-
|
|
585
|
-
/// Link element metadata.
|
|
586
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
587
|
-
pub struct LinkMetadata {
|
|
588
|
-
/// The href URL value
|
|
589
|
-
pub href: String,
|
|
590
|
-
/// Link text content (normalized)
|
|
591
|
-
pub text: String,
|
|
592
|
-
/// Optional title attribute
|
|
593
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
594
|
-
pub title: Option<String>,
|
|
595
|
-
/// Link type classification
|
|
596
|
-
pub link_type: LinkType,
|
|
597
|
-
/// Rel attribute values
|
|
598
|
-
pub rel: Vec<String>,
|
|
599
|
-
/// Additional attributes as key-value pairs
|
|
600
|
-
pub attributes: HashMap<String, String>,
|
|
601
|
-
}
|
|
602
|
-
|
|
603
|
-
/// Link type classification.
|
|
604
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
605
|
-
#[serde(rename_all = "lowercase")]
|
|
606
|
-
pub enum LinkType {
|
|
607
|
-
/// Anchor link (#section)
|
|
608
|
-
Anchor,
|
|
609
|
-
/// Internal link (same domain)
|
|
610
|
-
Internal,
|
|
611
|
-
/// External link (different domain)
|
|
612
|
-
External,
|
|
613
|
-
/// Email link (mailto:)
|
|
614
|
-
Email,
|
|
615
|
-
/// Phone link (tel:)
|
|
616
|
-
Phone,
|
|
617
|
-
/// Other link type
|
|
618
|
-
Other,
|
|
619
|
-
}
|
|
620
|
-
|
|
621
|
-
/// Image element metadata.
|
|
622
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
623
|
-
pub struct ImageMetadataType {
|
|
624
|
-
/// Image source (URL, data URI, or SVG content)
|
|
625
|
-
pub src: String,
|
|
626
|
-
/// Alternative text from alt attribute
|
|
627
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
628
|
-
pub alt: Option<String>,
|
|
629
|
-
/// Title attribute
|
|
630
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
631
|
-
pub title: Option<String>,
|
|
632
|
-
/// Image dimensions as (width, height) if available
|
|
633
|
-
pub dimensions: Option<(u32, u32)>,
|
|
634
|
-
/// Image type classification
|
|
635
|
-
pub image_type: ImageType,
|
|
636
|
-
/// Additional attributes as key-value pairs
|
|
637
|
-
pub attributes: HashMap<String, String>,
|
|
638
|
-
}
|
|
639
|
-
|
|
640
|
-
/// Image type classification.
|
|
641
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
642
|
-
#[serde(rename_all = "lowercase")]
|
|
643
|
-
pub enum ImageType {
|
|
644
|
-
/// Data URI image
|
|
645
|
-
#[serde(rename = "data-uri")]
|
|
646
|
-
DataUri,
|
|
647
|
-
/// Inline SVG
|
|
648
|
-
#[serde(rename = "inline-svg")]
|
|
649
|
-
InlineSvg,
|
|
650
|
-
/// External image URL
|
|
651
|
-
External,
|
|
652
|
-
/// Relative path image
|
|
653
|
-
Relative,
|
|
654
|
-
}
|
|
655
|
-
|
|
656
|
-
/// Structured data (Schema.org, microdata, RDFa) block.
|
|
657
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
658
|
-
pub struct StructuredData {
|
|
659
|
-
/// Type of structured data
|
|
660
|
-
pub data_type: StructuredDataType,
|
|
661
|
-
/// Raw JSON string representation
|
|
662
|
-
pub raw_json: String,
|
|
663
|
-
/// Schema type if detectable (e.g., "Article", "Event", "Product")
|
|
664
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
665
|
-
pub schema_type: Option<String>,
|
|
666
|
-
}
|
|
667
|
-
|
|
668
|
-
/// Structured data type classification.
|
|
669
|
-
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
670
|
-
#[serde(rename_all = "lowercase")]
|
|
671
|
-
pub enum StructuredDataType {
|
|
672
|
-
/// JSON-LD structured data
|
|
673
|
-
#[serde(rename = "json-ld")]
|
|
674
|
-
JsonLd,
|
|
675
|
-
/// Microdata
|
|
676
|
-
Microdata,
|
|
677
|
-
/// RDFa
|
|
678
|
-
#[serde(rename = "rdfa")]
|
|
679
|
-
RDFa,
|
|
680
|
-
}
|
|
681
|
-
|
|
682
|
-
/// HTML metadata extracted from HTML documents.
|
|
683
|
-
///
|
|
684
|
-
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
|
685
|
-
/// and extracted structural elements (headers, links, images, structured data).
|
|
686
|
-
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
687
|
-
pub struct HtmlMetadata {
|
|
688
|
-
/// Document title from `<title>` tag
|
|
689
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
690
|
-
pub title: Option<String>,
|
|
691
|
-
|
|
692
|
-
/// Document description from `<meta name="description">` tag
|
|
693
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
694
|
-
pub description: Option<String>,
|
|
695
|
-
|
|
696
|
-
/// Document keywords from `<meta name="keywords">` tag, split on commas
|
|
697
|
-
#[serde(default)]
|
|
698
|
-
pub keywords: Vec<String>,
|
|
699
|
-
|
|
700
|
-
/// Document author from `<meta name="author">` tag
|
|
701
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
702
|
-
pub author: Option<String>,
|
|
703
|
-
|
|
704
|
-
/// Canonical URL from `<link rel="canonical">` tag
|
|
705
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
706
|
-
pub canonical_url: Option<String>,
|
|
707
|
-
|
|
708
|
-
/// Base URL from `<base href="">` tag for resolving relative URLs
|
|
709
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
710
|
-
pub base_href: Option<String>,
|
|
711
|
-
|
|
712
|
-
/// Document language from `lang` attribute
|
|
713
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
714
|
-
pub language: Option<String>,
|
|
715
|
-
|
|
716
|
-
/// Document text direction from `dir` attribute
|
|
717
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
718
|
-
pub text_direction: Option<TextDirection>,
|
|
719
|
-
|
|
720
|
-
/// Open Graph metadata (og:* properties) for social media
|
|
721
|
-
/// Keys like "title", "description", "image", "url", etc.
|
|
722
|
-
#[serde(default)]
|
|
723
|
-
pub open_graph: BTreeMap<String, String>,
|
|
724
|
-
|
|
725
|
-
/// Twitter Card metadata (twitter:* properties)
|
|
726
|
-
/// Keys like "card", "site", "creator", "title", "description", "image", etc.
|
|
727
|
-
#[serde(default)]
|
|
728
|
-
pub twitter_card: BTreeMap<String, String>,
|
|
729
|
-
|
|
730
|
-
/// Additional meta tags not covered by specific fields
|
|
731
|
-
/// Keys are meta name/property attributes, values are content
|
|
732
|
-
#[serde(default)]
|
|
733
|
-
pub meta_tags: BTreeMap<String, String>,
|
|
734
|
-
|
|
735
|
-
/// Extracted header elements with hierarchy
|
|
736
|
-
#[serde(default)]
|
|
737
|
-
pub headers: Vec<HeaderMetadata>,
|
|
738
|
-
|
|
739
|
-
/// Extracted hyperlinks with type classification
|
|
740
|
-
#[serde(default)]
|
|
741
|
-
pub links: Vec<LinkMetadata>,
|
|
742
|
-
|
|
743
|
-
/// Extracted images with source and dimensions
|
|
744
|
-
#[serde(default)]
|
|
745
|
-
pub images: Vec<ImageMetadataType>,
|
|
746
|
-
|
|
747
|
-
/// Extracted structured data blocks
|
|
748
|
-
#[serde(default)]
|
|
749
|
-
pub structured_data: Vec<StructuredData>,
|
|
750
|
-
}
|
|
751
|
-
|
|
752
|
-
impl HtmlMetadata {
|
|
753
|
-
/// Check if metadata is empty (no meaningful content extracted).
|
|
754
|
-
pub fn is_empty(&self) -> bool {
|
|
755
|
-
self.title.is_none()
|
|
756
|
-
&& self.description.is_none()
|
|
757
|
-
&& self.keywords.is_empty()
|
|
758
|
-
&& self.author.is_none()
|
|
759
|
-
&& self.canonical_url.is_none()
|
|
760
|
-
&& self.base_href.is_none()
|
|
761
|
-
&& self.language.is_none()
|
|
762
|
-
&& self.text_direction.is_none()
|
|
763
|
-
&& self.open_graph.is_empty()
|
|
764
|
-
&& self.twitter_card.is_empty()
|
|
765
|
-
&& self.meta_tags.is_empty()
|
|
766
|
-
&& self.headers.is_empty()
|
|
767
|
-
&& self.links.is_empty()
|
|
768
|
-
&& self.images.is_empty()
|
|
769
|
-
&& self.structured_data.is_empty()
|
|
770
|
-
}
|
|
771
|
-
}
|
|
772
|
-
|
|
773
|
-
#[cfg(feature = "html")]
impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
    /// Builds an [`HtmlMetadata`] from the metadata type of
    /// `html_to_markdown_rs`.
    ///
    /// Document-level fields are moved across unchanged, each foreign enum is
    /// mapped variant-for-variant onto its local counterpart, and attribute
    /// collections are re-collected into the local map type.
    fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
        /// Maps the upstream text direction onto the local enum.
        fn convert_direction(direction: html_to_markdown_rs::TextDirection) -> TextDirection {
            match direction {
                html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
                html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
                html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
            }
        }

        /// Maps the upstream link classification onto the local enum.
        fn convert_link_type(kind: html_to_markdown_rs::LinkType) -> LinkType {
            match kind {
                html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
                html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
                html_to_markdown_rs::LinkType::External => LinkType::External,
                html_to_markdown_rs::LinkType::Email => LinkType::Email,
                html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
                html_to_markdown_rs::LinkType::Other => LinkType::Other,
            }
        }

        /// Maps the upstream image classification onto the local enum.
        fn convert_image_type(kind: html_to_markdown_rs::ImageType) -> ImageType {
            match kind {
                html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
                html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
                html_to_markdown_rs::ImageType::External => ImageType::External,
                html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
            }
        }

        /// Maps the upstream structured-data format onto the local enum.
        fn convert_data_type(kind: html_to_markdown_rs::StructuredDataType) -> StructuredDataType {
            match kind {
                html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
                html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
                html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
            }
        }

        // Move the document-level section out first; the element lists are
        // separate fields of `metadata` and remain available afterwards.
        let document = metadata.document;

        let headers: Vec<HeaderMetadata> = metadata
            .headers
            .into_iter()
            .map(|header| HeaderMetadata {
                level: header.level,
                text: header.text,
                id: header.id,
                depth: header.depth,
                html_offset: header.html_offset,
            })
            .collect();

        let links: Vec<LinkMetadata> = metadata
            .links
            .into_iter()
            .map(|link| LinkMetadata {
                href: link.href,
                text: link.text,
                title: link.title,
                link_type: convert_link_type(link.link_type),
                rel: link.rel,
                attributes: link.attributes.into_iter().collect(),
            })
            .collect();

        let images: Vec<ImageMetadataType> = metadata
            .images
            .into_iter()
            .map(|image| ImageMetadataType {
                src: image.src,
                alt: image.alt,
                title: image.title,
                dimensions: image.dimensions,
                image_type: convert_image_type(image.image_type),
                attributes: image.attributes.into_iter().collect(),
            })
            .collect();

        let structured_data: Vec<StructuredData> = metadata
            .structured_data
            .into_iter()
            .map(|block| StructuredData {
                data_type: convert_data_type(block.data_type),
                raw_json: block.raw_json,
                schema_type: block.schema_type,
            })
            .collect();

        HtmlMetadata {
            title: document.title,
            description: document.description,
            keywords: document.keywords,
            author: document.author,
            canonical_url: document.canonical_url,
            base_href: document.base_href,
            language: document.language,
            text_direction: document.text_direction.map(convert_direction),
            open_graph: document.open_graph,
            twitter_card: document.twitter_card,
            meta_tags: document.meta_tags,
            headers,
            links,
            images,
            structured_data,
        }
    }
}
|
|
857
|
-
|
|
858
|
-
/// OCR processing metadata.
|
|
859
|
-
///
|
|
860
|
-
/// Captures information about OCR processing configuration and results.
|
|
861
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
862
|
-
pub struct OcrMetadata {
|
|
863
|
-
/// OCR language code(s) used
|
|
864
|
-
pub language: String,
|
|
865
|
-
/// Tesseract Page Segmentation Mode (PSM)
|
|
866
|
-
pub psm: i32,
|
|
867
|
-
/// Output format (e.g., "text", "hocr")
|
|
868
|
-
pub output_format: String,
|
|
869
|
-
/// Number of tables detected
|
|
870
|
-
pub table_count: usize,
|
|
871
|
-
|
|
872
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
873
|
-
pub table_rows: Option<usize>,
|
|
874
|
-
|
|
875
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
876
|
-
pub table_cols: Option<usize>,
|
|
877
|
-
}
|
|
878
|
-
|
|
879
|
-
/// Error metadata (for batch operations).
|
|
880
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
881
|
-
pub struct ErrorMetadata {
|
|
882
|
-
pub error_type: String,
|
|
883
|
-
pub message: String,
|
|
884
|
-
}
|
|
885
|
-
|
|
886
|
-
/// Extracted table structure.
|
|
887
|
-
///
|
|
888
|
-
/// Represents a table detected and extracted from a document (PDF, image, etc.).
|
|
889
|
-
/// Tables are converted to both structured cell data and Markdown format.
|
|
890
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
891
|
-
pub struct Table {
|
|
892
|
-
/// Table cells as a 2D vector (rows × columns)
|
|
893
|
-
pub cells: Vec<Vec<String>>,
|
|
894
|
-
/// Markdown representation of the table
|
|
895
|
-
pub markdown: String,
|
|
896
|
-
/// Page number where the table was found (1-indexed)
|
|
897
|
-
pub page_number: usize,
|
|
898
|
-
}
|
|
899
|
-
|
|
900
|
-
/// A text chunk with optional embedding and metadata.
|
|
901
|
-
///
|
|
902
|
-
/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
|
903
|
-
/// contains the text content, optional embedding vector (if embedding generation
|
|
904
|
-
/// is configured), and metadata about its position in the document.
|
|
905
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
906
|
-
pub struct Chunk {
|
|
907
|
-
/// The text content of this chunk.
|
|
908
|
-
pub content: String,
|
|
909
|
-
|
|
910
|
-
/// Optional embedding vector for this chunk.
|
|
911
|
-
///
|
|
912
|
-
/// Only populated when `EmbeddingConfig` is provided in chunking configuration.
|
|
913
|
-
/// The dimensionality depends on the chosen embedding model.
|
|
914
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
915
|
-
pub embedding: Option<Vec<f32>>,
|
|
916
|
-
|
|
917
|
-
/// Metadata about this chunk's position and properties.
|
|
918
|
-
pub metadata: ChunkMetadata,
|
|
919
|
-
}
|
|
920
|
-
|
|
921
|
-
/// Metadata about a chunk's position in the original document.
|
|
922
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
923
|
-
pub struct ChunkMetadata {
|
|
924
|
-
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|
925
|
-
pub byte_start: usize,
|
|
926
|
-
|
|
927
|
-
/// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
|
928
|
-
pub byte_end: usize,
|
|
929
|
-
|
|
930
|
-
/// Number of tokens in this chunk (if available).
|
|
931
|
-
///
|
|
932
|
-
/// This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
|
933
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
934
|
-
pub token_count: Option<usize>,
|
|
935
|
-
|
|
936
|
-
/// Zero-based index of this chunk in the document.
|
|
937
|
-
pub chunk_index: usize,
|
|
938
|
-
|
|
939
|
-
/// Total number of chunks in the document.
|
|
940
|
-
pub total_chunks: usize,
|
|
941
|
-
|
|
942
|
-
/// First page number this chunk spans (1-indexed).
|
|
943
|
-
///
|
|
944
|
-
/// Only populated when page tracking is enabled in extraction configuration.
|
|
945
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
946
|
-
pub first_page: Option<usize>,
|
|
947
|
-
|
|
948
|
-
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|
949
|
-
///
|
|
950
|
-
/// Only populated when page tracking is enabled in extraction configuration.
|
|
951
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
952
|
-
pub last_page: Option<usize>,
|
|
953
|
-
}
|
|
954
|
-
|
|
955
|
-
/// Extracted image from a document.
|
|
956
|
-
///
|
|
957
|
-
/// Contains raw image data, metadata, and optional nested OCR results.
|
|
958
|
-
/// Raw bytes allow cross-language compatibility - users can convert to
|
|
959
|
-
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|
960
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
961
|
-
pub struct ExtractedImage {
|
|
962
|
-
/// Raw image data (PNG, JPEG, WebP, etc. bytes)
|
|
963
|
-
pub data: Vec<u8>,
|
|
964
|
-
|
|
965
|
-
/// Image format (e.g., "jpeg", "png", "webp")
|
|
966
|
-
pub format: String,
|
|
967
|
-
|
|
968
|
-
/// Zero-indexed position of this image in the document/page
|
|
969
|
-
pub image_index: usize,
|
|
970
|
-
|
|
971
|
-
/// Page/slide number where image was found (1-indexed)
|
|
972
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
973
|
-
pub page_number: Option<usize>,
|
|
974
|
-
|
|
975
|
-
/// Image width in pixels
|
|
976
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
977
|
-
pub width: Option<u32>,
|
|
978
|
-
|
|
979
|
-
/// Image height in pixels
|
|
980
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
981
|
-
pub height: Option<u32>,
|
|
982
|
-
|
|
983
|
-
/// Colorspace information (e.g., "RGB", "CMYK", "Gray")
|
|
984
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
985
|
-
pub colorspace: Option<String>,
|
|
986
|
-
|
|
987
|
-
/// Bits per color component (e.g., 8, 16)
|
|
988
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
989
|
-
pub bits_per_component: Option<u32>,
|
|
990
|
-
|
|
991
|
-
/// Whether this image is a mask image
|
|
992
|
-
#[serde(default)]
|
|
993
|
-
pub is_mask: bool,
|
|
994
|
-
|
|
995
|
-
/// Optional description of the image
|
|
996
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
997
|
-
pub description: Option<String>,
|
|
998
|
-
|
|
999
|
-
/// Nested OCR extraction result (if image was OCRed)
|
|
1000
|
-
///
|
|
1001
|
-
/// When OCR is performed on this image, the result is embedded here
|
|
1002
|
-
/// rather than in a separate collection, making the relationship explicit.
|
|
1003
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1004
|
-
pub ocr_result: Option<Box<ExtractionResult>>,
|
|
1005
|
-
}
|
|
1006
|
-
|
|
1007
|
-
/// Excel workbook representation.
|
|
1008
|
-
///
|
|
1009
|
-
/// Contains all sheets from an Excel file (.xlsx, .xls, etc.) with
|
|
1010
|
-
/// extracted content and metadata.
|
|
1011
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1012
|
-
pub struct ExcelWorkbook {
|
|
1013
|
-
/// All sheets in the workbook
|
|
1014
|
-
pub sheets: Vec<ExcelSheet>,
|
|
1015
|
-
/// Workbook-level metadata (author, creation date, etc.)
|
|
1016
|
-
pub metadata: HashMap<String, String>,
|
|
1017
|
-
}
|
|
1018
|
-
|
|
1019
|
-
/// Single Excel worksheet.
|
|
1020
|
-
///
|
|
1021
|
-
/// Represents one sheet from an Excel workbook with its content
|
|
1022
|
-
/// converted to Markdown format and dimensional statistics.
|
|
1023
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1024
|
-
pub struct ExcelSheet {
|
|
1025
|
-
/// Sheet name as it appears in Excel
|
|
1026
|
-
pub name: String,
|
|
1027
|
-
/// Sheet content converted to Markdown tables
|
|
1028
|
-
pub markdown: String,
|
|
1029
|
-
/// Number of rows
|
|
1030
|
-
pub row_count: usize,
|
|
1031
|
-
/// Number of columns
|
|
1032
|
-
pub col_count: usize,
|
|
1033
|
-
/// Total number of non-empty cells
|
|
1034
|
-
pub cell_count: usize,
|
|
1035
|
-
/// Pre-extracted table cells (2D vector of cell values)
|
|
1036
|
-
/// Populated during markdown generation to avoid re-parsing markdown.
|
|
1037
|
-
/// None for empty sheets.
|
|
1038
|
-
#[serde(skip)]
|
|
1039
|
-
pub table_cells: Option<Vec<Vec<String>>>,
|
|
1040
|
-
}
|
|
1041
|
-
|
|
1042
|
-
/// XML extraction result.
|
|
1043
|
-
///
|
|
1044
|
-
/// Contains extracted text content from XML files along with
|
|
1045
|
-
/// structural statistics about the XML document.
|
|
1046
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1047
|
-
pub struct XmlExtractionResult {
|
|
1048
|
-
/// Extracted text content (XML structure filtered out)
|
|
1049
|
-
pub content: String,
|
|
1050
|
-
/// Total number of XML elements processed
|
|
1051
|
-
pub element_count: usize,
|
|
1052
|
-
/// List of unique element names found (sorted)
|
|
1053
|
-
pub unique_elements: Vec<String>,
|
|
1054
|
-
}
|
|
1055
|
-
|
|
1056
|
-
/// Plain text and Markdown extraction result.
|
|
1057
|
-
///
|
|
1058
|
-
/// Contains the extracted text along with statistics and,
|
|
1059
|
-
/// for Markdown files, structural elements like headers and links.
|
|
1060
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1061
|
-
pub struct TextExtractionResult {
|
|
1062
|
-
/// Extracted text content
|
|
1063
|
-
pub content: String,
|
|
1064
|
-
/// Number of lines
|
|
1065
|
-
pub line_count: usize,
|
|
1066
|
-
/// Number of words
|
|
1067
|
-
pub word_count: usize,
|
|
1068
|
-
/// Number of characters
|
|
1069
|
-
pub character_count: usize,
|
|
1070
|
-
/// Markdown headers (text only, Markdown files only)
|
|
1071
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1072
|
-
pub headers: Option<Vec<String>>,
|
|
1073
|
-
/// Markdown links as (text, URL) tuples (Markdown files only)
|
|
1074
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1075
|
-
pub links: Option<Vec<(String, String)>>,
|
|
1076
|
-
/// Code blocks as (language, code) tuples (Markdown files only)
|
|
1077
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1078
|
-
pub code_blocks: Option<Vec<(String, String)>>,
|
|
1079
|
-
}
|
|
1080
|
-
|
|
1081
|
-
/// PowerPoint (PPTX) extraction result.
|
|
1082
|
-
///
|
|
1083
|
-
/// Contains extracted slide content, metadata, and embedded images/tables.
|
|
1084
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1085
|
-
pub struct PptxExtractionResult {
|
|
1086
|
-
/// Extracted text content from all slides
|
|
1087
|
-
pub content: String,
|
|
1088
|
-
/// Presentation metadata
|
|
1089
|
-
pub metadata: PptxMetadata,
|
|
1090
|
-
/// Total number of slides
|
|
1091
|
-
pub slide_count: usize,
|
|
1092
|
-
/// Total number of embedded images
|
|
1093
|
-
pub image_count: usize,
|
|
1094
|
-
/// Total number of tables
|
|
1095
|
-
pub table_count: usize,
|
|
1096
|
-
/// Extracted images from the presentation
|
|
1097
|
-
pub images: Vec<ExtractedImage>,
|
|
1098
|
-
/// Slide structure with boundaries (when page tracking is enabled)
|
|
1099
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1100
|
-
pub page_structure: Option<PageStructure>,
|
|
1101
|
-
/// Per-slide content (when page tracking is enabled)
|
|
1102
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1103
|
-
pub page_contents: Option<Vec<PageContent>>,
|
|
1104
|
-
}
|
|
1105
|
-
|
|
1106
|
-
/// PowerPoint presentation metadata.
|
|
1107
|
-
///
|
|
1108
|
-
/// Contains PPTX-specific metadata. Common fields like title, author, and description
|
|
1109
|
-
/// are now in the base `Metadata` struct.
|
|
1110
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1111
|
-
pub struct PptxMetadata {
|
|
1112
|
-
/// List of fonts used in the presentation
|
|
1113
|
-
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
1114
|
-
pub fonts: Vec<String>,
|
|
1115
|
-
}
|
|
1116
|
-
|
|
1117
|
-
/// Email extraction result.
|
|
1118
|
-
///
|
|
1119
|
-
/// Complete representation of an extracted email message (.eml or .msg)
|
|
1120
|
-
/// including headers, body content, and attachments.
|
|
1121
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1122
|
-
pub struct EmailExtractionResult {
|
|
1123
|
-
/// Email subject line
|
|
1124
|
-
pub subject: Option<String>,
|
|
1125
|
-
/// Sender email address
|
|
1126
|
-
pub from_email: Option<String>,
|
|
1127
|
-
/// Primary recipient email addresses
|
|
1128
|
-
pub to_emails: Vec<String>,
|
|
1129
|
-
/// CC recipient email addresses
|
|
1130
|
-
pub cc_emails: Vec<String>,
|
|
1131
|
-
/// BCC recipient email addresses
|
|
1132
|
-
pub bcc_emails: Vec<String>,
|
|
1133
|
-
/// Email date/timestamp
|
|
1134
|
-
pub date: Option<String>,
|
|
1135
|
-
/// Message-ID header value
|
|
1136
|
-
pub message_id: Option<String>,
|
|
1137
|
-
/// Plain text version of the email body
|
|
1138
|
-
pub plain_text: Option<String>,
|
|
1139
|
-
/// HTML version of the email body
|
|
1140
|
-
pub html_content: Option<String>,
|
|
1141
|
-
/// Cleaned/processed text content
|
|
1142
|
-
pub cleaned_text: String,
|
|
1143
|
-
/// List of email attachments
|
|
1144
|
-
pub attachments: Vec<EmailAttachment>,
|
|
1145
|
-
/// Additional email headers and metadata
|
|
1146
|
-
pub metadata: HashMap<String, String>,
|
|
1147
|
-
}
|
|
1148
|
-
|
|
1149
|
-
/// Email attachment representation.
|
|
1150
|
-
///
|
|
1151
|
-
/// Contains metadata and optionally the content of an email attachment.
|
|
1152
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1153
|
-
pub struct EmailAttachment {
|
|
1154
|
-
/// Attachment name (from Content-Disposition header)
|
|
1155
|
-
pub name: Option<String>,
|
|
1156
|
-
/// Filename of the attachment
|
|
1157
|
-
pub filename: Option<String>,
|
|
1158
|
-
/// MIME type of the attachment
|
|
1159
|
-
pub mime_type: Option<String>,
|
|
1160
|
-
/// Size in bytes
|
|
1161
|
-
pub size: Option<usize>,
|
|
1162
|
-
/// Whether this attachment is an image
|
|
1163
|
-
pub is_image: bool,
|
|
1164
|
-
/// Attachment data (if extracted)
|
|
1165
|
-
pub data: Option<Vec<u8>>,
|
|
1166
|
-
}
|
|
1167
|
-
|
|
1168
|
-
/// OCR extraction result.
|
|
1169
|
-
///
|
|
1170
|
-
/// Result of performing OCR on an image or scanned document,
|
|
1171
|
-
/// including recognized text and detected tables.
|
|
1172
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1173
|
-
pub struct OcrExtractionResult {
|
|
1174
|
-
/// Recognized text content
|
|
1175
|
-
pub content: String,
|
|
1176
|
-
/// Original MIME type of the processed image
|
|
1177
|
-
pub mime_type: String,
|
|
1178
|
-
/// OCR processing metadata (confidence scores, language, etc.)
|
|
1179
|
-
pub metadata: HashMap<String, serde_json::Value>,
|
|
1180
|
-
/// Tables detected and extracted via OCR
|
|
1181
|
-
pub tables: Vec<OcrTable>,
|
|
1182
|
-
}
|
|
1183
|
-
|
|
1184
|
-
/// Table detected via OCR.
|
|
1185
|
-
///
|
|
1186
|
-
/// Represents a table structure recognized during OCR processing.
|
|
1187
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1188
|
-
pub struct OcrTable {
|
|
1189
|
-
/// Table cells as a 2D vector (rows × columns)
|
|
1190
|
-
pub cells: Vec<Vec<String>>,
|
|
1191
|
-
/// Markdown representation of the table
|
|
1192
|
-
pub markdown: String,
|
|
1193
|
-
/// Page number where the table was found (1-indexed)
|
|
1194
|
-
pub page_number: usize,
|
|
1195
|
-
}
|
|
1196
|
-
|
|
1197
|
-
/// Image preprocessing configuration for OCR.
|
|
1198
|
-
///
|
|
1199
|
-
/// These settings control how images are preprocessed before OCR to improve
|
|
1200
|
-
/// text recognition quality. Different preprocessing strategies work better
|
|
1201
|
-
/// for different document types.
|
|
1202
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1203
|
-
#[serde(default)]
|
|
1204
|
-
pub struct ImagePreprocessingConfig {
|
|
1205
|
-
/// Target DPI for the image (300 is standard, 600 for small text).
|
|
1206
|
-
pub target_dpi: i32,
|
|
1207
|
-
|
|
1208
|
-
/// Auto-detect and correct image rotation.
|
|
1209
|
-
pub auto_rotate: bool,
|
|
1210
|
-
|
|
1211
|
-
/// Correct skew (tilted images).
|
|
1212
|
-
pub deskew: bool,
|
|
1213
|
-
|
|
1214
|
-
/// Remove noise from the image.
|
|
1215
|
-
pub denoise: bool,
|
|
1216
|
-
|
|
1217
|
-
/// Enhance contrast for better text visibility.
|
|
1218
|
-
pub contrast_enhance: bool,
|
|
1219
|
-
|
|
1220
|
-
/// Binarization method: "otsu", "sauvola", "adaptive".
|
|
1221
|
-
pub binarization_method: String,
|
|
1222
|
-
|
|
1223
|
-
/// Invert colors (white text on black → black on white).
|
|
1224
|
-
pub invert_colors: bool,
|
|
1225
|
-
}
|
|
1226
|
-
|
|
1227
|
-
impl Default for ImagePreprocessingConfig {
|
|
1228
|
-
fn default() -> Self {
|
|
1229
|
-
Self {
|
|
1230
|
-
target_dpi: 300,
|
|
1231
|
-
auto_rotate: true,
|
|
1232
|
-
deskew: true,
|
|
1233
|
-
denoise: false,
|
|
1234
|
-
contrast_enhance: false,
|
|
1235
|
-
binarization_method: "otsu".to_string(),
|
|
1236
|
-
invert_colors: false,
|
|
1237
|
-
}
|
|
1238
|
-
}
|
|
1239
|
-
}
|
|
1240
|
-
|
|
1241
|
-
/// Tesseract OCR configuration.
|
|
1242
|
-
///
|
|
1243
|
-
/// Provides fine-grained control over Tesseract OCR engine parameters.
|
|
1244
|
-
/// Most users can use the defaults, but these settings allow optimization
|
|
1245
|
-
/// for specific document types (invoices, handwriting, etc.).
|
|
1246
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1247
|
-
#[serde(default)]
|
|
1248
|
-
pub struct TesseractConfig {
|
|
1249
|
-
/// Language code (e.g., "eng", "deu", "fra")
|
|
1250
|
-
pub language: String,
|
|
1251
|
-
|
|
1252
|
-
/// Page Segmentation Mode (0-13).
|
|
1253
|
-
///
|
|
1254
|
-
/// Common values:
|
|
1255
|
-
/// - 3: Fully automatic page segmentation (default)
|
|
1256
|
-
/// - 6: Assume a single uniform block of text
|
|
1257
|
-
/// - 11: Sparse text with no particular order
|
|
1258
|
-
pub psm: i32,
|
|
1259
|
-
|
|
1260
|
-
/// Output format ("text" or "markdown")
|
|
1261
|
-
pub output_format: String,
|
|
1262
|
-
|
|
1263
|
-
/// OCR Engine Mode (0-3).
|
|
1264
|
-
///
|
|
1265
|
-
/// - 0: Legacy engine only
|
|
1266
|
-
/// - 1: Neural nets (LSTM) only (usually best)
|
|
1267
|
-
/// - 2: Legacy + LSTM
|
|
1268
|
-
/// - 3: Default (based on what's available)
|
|
1269
|
-
pub oem: i32,
|
|
1270
|
-
|
|
1271
|
-
/// Minimum confidence threshold (0.0-100.0).
|
|
1272
|
-
///
|
|
1273
|
-
/// Words with confidence below this threshold may be rejected or flagged.
|
|
1274
|
-
pub min_confidence: f64,
|
|
1275
|
-
|
|
1276
|
-
/// Image preprocessing configuration.
|
|
1277
|
-
///
|
|
1278
|
-
/// Controls how images are preprocessed before OCR. Can significantly
|
|
1279
|
-
/// improve quality for scanned documents or low-quality images.
|
|
1280
|
-
#[serde(skip_serializing_if = "Option::is_none")]
|
|
1281
|
-
pub preprocessing: Option<ImagePreprocessingConfig>,
|
|
1282
|
-
|
|
1283
|
-
/// Enable automatic table detection and reconstruction
|
|
1284
|
-
pub enable_table_detection: bool,
|
|
1285
|
-
|
|
1286
|
-
/// Minimum confidence threshold for table detection (0.0-1.0)
|
|
1287
|
-
pub table_min_confidence: f64,
|
|
1288
|
-
|
|
1289
|
-
/// Column threshold for table detection (pixels)
|
|
1290
|
-
pub table_column_threshold: i32,
|
|
1291
|
-
|
|
1292
|
-
/// Row threshold ratio for table detection (0.0-1.0)
|
|
1293
|
-
pub table_row_threshold_ratio: f64,
|
|
1294
|
-
|
|
1295
|
-
/// Enable OCR result caching
|
|
1296
|
-
pub use_cache: bool,
|
|
1297
|
-
|
|
1298
|
-
/// Use pre-adapted templates for character classification
|
|
1299
|
-
pub classify_use_pre_adapted_templates: bool,
|
|
1300
|
-
|
|
1301
|
-
/// Enable N-gram language model
|
|
1302
|
-
pub language_model_ngram_on: bool,
|
|
1303
|
-
|
|
1304
|
-
/// Don't reject good words during block-level processing
|
|
1305
|
-
pub tessedit_dont_blkrej_good_wds: bool,
|
|
1306
|
-
|
|
1307
|
-
/// Don't reject good words during row-level processing
|
|
1308
|
-
pub tessedit_dont_rowrej_good_wds: bool,
|
|
1309
|
-
|
|
1310
|
-
/// Enable dictionary correction
|
|
1311
|
-
pub tessedit_enable_dict_correction: bool,
|
|
1312
|
-
|
|
1313
|
-
/// Whitelist of allowed characters (empty = all allowed)
|
|
1314
|
-
pub tessedit_char_whitelist: String,
|
|
1315
|
-
|
|
1316
|
-
/// Blacklist of forbidden characters (empty = none forbidden)
|
|
1317
|
-
pub tessedit_char_blacklist: String,
|
|
1318
|
-
|
|
1319
|
-
/// Use primary language params model
|
|
1320
|
-
pub tessedit_use_primary_params_model: bool,
|
|
1321
|
-
|
|
1322
|
-
/// Variable-width space detection
|
|
1323
|
-
pub textord_space_size_is_variable: bool,
|
|
1324
|
-
|
|
1325
|
-
/// Use adaptive thresholding method
|
|
1326
|
-
pub thresholding_method: bool,
|
|
1327
|
-
}
|
|
1328
|
-
|
|
1329
|
-
impl Default for TesseractConfig {
|
|
1330
|
-
fn default() -> Self {
|
|
1331
|
-
Self {
|
|
1332
|
-
language: "eng".to_string(),
|
|
1333
|
-
psm: 3,
|
|
1334
|
-
output_format: "markdown".to_string(),
|
|
1335
|
-
oem: 3,
|
|
1336
|
-
min_confidence: 0.0,
|
|
1337
|
-
preprocessing: None,
|
|
1338
|
-
enable_table_detection: true,
|
|
1339
|
-
table_min_confidence: 0.0,
|
|
1340
|
-
table_column_threshold: 50,
|
|
1341
|
-
table_row_threshold_ratio: 0.5,
|
|
1342
|
-
use_cache: true,
|
|
1343
|
-
classify_use_pre_adapted_templates: true,
|
|
1344
|
-
language_model_ngram_on: false,
|
|
1345
|
-
tessedit_dont_blkrej_good_wds: true,
|
|
1346
|
-
tessedit_dont_rowrej_good_wds: true,
|
|
1347
|
-
tessedit_enable_dict_correction: true,
|
|
1348
|
-
tessedit_char_whitelist: String::new(),
|
|
1349
|
-
tessedit_char_blacklist: String::new(),
|
|
1350
|
-
tessedit_use_primary_params_model: true,
|
|
1351
|
-
textord_space_size_is_variable: true,
|
|
1352
|
-
thresholding_method: false,
|
|
1353
|
-
}
|
|
1354
|
-
}
|
|
1355
|
-
}
|
|
1356
|
-
|
|
1357
|
-
/// Image preprocessing metadata.
|
|
1358
|
-
///
|
|
1359
|
-
/// Tracks the transformations applied to an image during OCR preprocessing,
|
|
1360
|
-
/// including DPI normalization, resizing, and resampling.
|
|
1361
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1362
|
-
pub struct ImagePreprocessingMetadata {
|
|
1363
|
-
/// Original image dimensions (width, height) in pixels
|
|
1364
|
-
pub original_dimensions: (usize, usize),
|
|
1365
|
-
/// Original image DPI (horizontal, vertical)
|
|
1366
|
-
pub original_dpi: (f64, f64),
|
|
1367
|
-
/// Target DPI from configuration
|
|
1368
|
-
pub target_dpi: i32,
|
|
1369
|
-
/// Scaling factor applied to the image
|
|
1370
|
-
pub scale_factor: f64,
|
|
1371
|
-
/// Whether DPI was auto-adjusted based on content
|
|
1372
|
-
pub auto_adjusted: bool,
|
|
1373
|
-
/// Final DPI after processing
|
|
1374
|
-
pub final_dpi: i32,
|
|
1375
|
-
/// New dimensions after resizing (if resized)
|
|
1376
|
-
pub new_dimensions: Option<(usize, usize)>,
|
|
1377
|
-
/// Resampling algorithm used ("LANCZOS3", "CATMULLROM", etc.)
|
|
1378
|
-
pub resample_method: String,
|
|
1379
|
-
/// Whether dimensions were clamped to max_image_dimension
|
|
1380
|
-
pub dimension_clamped: bool,
|
|
1381
|
-
/// Calculated optimal DPI (if auto_adjust_dpi enabled)
|
|
1382
|
-
pub calculated_dpi: Option<i32>,
|
|
1383
|
-
/// Whether resize was skipped (dimensions already optimal)
|
|
1384
|
-
pub skipped_resize: bool,
|
|
1385
|
-
/// Error message if resize failed
|
|
1386
|
-
pub resize_error: Option<String>,
|
|
1387
|
-
}
|
|
1388
|
-
|
|
1389
|
-
/// Image extraction configuration (internal use).
|
|
1390
|
-
///
|
|
1391
|
-
/// **Note:** This is an internal type used for image preprocessing.
|
|
1392
|
-
/// For the main extraction configuration, see [`crate::core::config::ExtractionConfig`].
|
|
1393
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1394
|
-
pub struct ExtractionConfig {
|
|
1395
|
-
/// Target DPI for image normalization
|
|
1396
|
-
pub target_dpi: i32,
|
|
1397
|
-
/// Maximum image dimension (width or height)
|
|
1398
|
-
pub max_image_dimension: i32,
|
|
1399
|
-
/// Whether to auto-adjust DPI based on content
|
|
1400
|
-
pub auto_adjust_dpi: bool,
|
|
1401
|
-
/// Minimum DPI threshold
|
|
1402
|
-
pub min_dpi: i32,
|
|
1403
|
-
/// Maximum DPI threshold
|
|
1404
|
-
pub max_dpi: i32,
|
|
1405
|
-
}
|
|
1406
|
-
|
|
1407
|
-
impl Default for ExtractionConfig {
|
|
1408
|
-
fn default() -> Self {
|
|
1409
|
-
Self {
|
|
1410
|
-
target_dpi: 300,
|
|
1411
|
-
max_image_dimension: 4096,
|
|
1412
|
-
auto_adjust_dpi: true,
|
|
1413
|
-
min_dpi: 72,
|
|
1414
|
-
max_dpi: 600,
|
|
1415
|
-
}
|
|
1416
|
-
}
|
|
1417
|
-
}
|
|
1418
|
-
|
|
1419
|
-
/// Cache statistics.
|
|
1420
|
-
///
|
|
1421
|
-
/// Provides information about the extraction result cache,
|
|
1422
|
-
/// including size, file count, and age distribution.
|
|
1423
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1424
|
-
pub struct CacheStats {
|
|
1425
|
-
/// Total number of cached files
|
|
1426
|
-
pub total_files: usize,
|
|
1427
|
-
/// Total cache size in megabytes
|
|
1428
|
-
pub total_size_mb: f64,
|
|
1429
|
-
/// Available disk space in megabytes
|
|
1430
|
-
pub available_space_mb: f64,
|
|
1431
|
-
/// Age of the oldest cached file in days
|
|
1432
|
-
pub oldest_file_age_days: f64,
|
|
1433
|
-
/// Age of the newest cached file in days
|
|
1434
|
-
pub newest_file_age_days: f64,
|
|
1435
|
-
}
|
|
1436
|
-
|
|
1437
|
-
/// LibreOffice conversion result.
|
|
1438
|
-
///
|
|
1439
|
-
/// Result of converting a legacy office document (e.g., .doc, .ppt)
|
|
1440
|
-
/// to a modern format using LibreOffice.
|
|
1441
|
-
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
1442
|
-
pub struct LibreOfficeConversionResult {
|
|
1443
|
-
/// Converted file bytes
|
|
1444
|
-
pub converted_bytes: Vec<u8>,
|
|
1445
|
-
/// Original format identifier
|
|
1446
|
-
pub original_format: String,
|
|
1447
|
-
/// Target format identifier
|
|
1448
|
-
pub target_format: String,
|
|
1449
|
-
/// Target MIME type after conversion
|
|
1450
|
-
pub target_mime: String,
|
|
1451
|
-
}
|
|
1452
|
-
|
|
1453
|
-
#[cfg(test)]
|
|
1454
|
-
mod tests {
|
|
1455
|
-
use super::*;
|
|
1456
|
-
|
|
1457
|
-
#[test]
|
|
1458
|
-
fn test_metadata_serialization_with_format() {
|
|
1459
|
-
let mut metadata = Metadata {
|
|
1460
|
-
format: Some(FormatMetadata::Text(TextMetadata {
|
|
1461
|
-
line_count: 1,
|
|
1462
|
-
word_count: 2,
|
|
1463
|
-
character_count: 13,
|
|
1464
|
-
headers: None,
|
|
1465
|
-
links: None,
|
|
1466
|
-
code_blocks: None,
|
|
1467
|
-
})),
|
|
1468
|
-
..Default::default()
|
|
1469
|
-
};
|
|
1470
|
-
|
|
1471
|
-
metadata
|
|
1472
|
-
.additional
|
|
1473
|
-
.insert("quality_score".to_string(), serde_json::json!(1.0));
|
|
1474
|
-
|
|
1475
|
-
let json = serde_json::to_value(&metadata).unwrap();
|
|
1476
|
-
println!("Serialized metadata: {}", serde_json::to_string_pretty(&json).unwrap());
|
|
1477
|
-
|
|
1478
|
-
assert!(
|
|
1479
|
-
json.get("format_type").is_some(),
|
|
1480
|
-
"format_type should be present in serialized JSON"
|
|
1481
|
-
);
|
|
1482
|
-
assert_eq!(json.get("format_type").unwrap(), "text");
|
|
1483
|
-
|
|
1484
|
-
assert_eq!(json.get("line_count").unwrap(), 1);
|
|
1485
|
-
assert_eq!(json.get("word_count").unwrap(), 2);
|
|
1486
|
-
assert_eq!(json.get("character_count").unwrap(), 13);
|
|
1487
|
-
|
|
1488
|
-
assert_eq!(json.get("quality_score").unwrap(), 1.0);
|
|
1489
|
-
}
|
|
1490
|
-
|
|
1491
|
-
#[test]
|
|
1492
|
-
fn test_arc_table_serialization_format() {
|
|
1493
|
-
let table = Table {
|
|
1494
|
-
cells: vec![vec!["A".to_string(), "B".to_string()]],
|
|
1495
|
-
markdown: "| A | B |\n|---|---|\n".to_string(),
|
|
1496
|
-
page_number: 1,
|
|
1497
|
-
};
|
|
1498
|
-
|
|
1499
|
-
let json = serde_json::to_value(&table).unwrap();
|
|
1500
|
-
|
|
1501
|
-
assert_eq!(json.get("cells").unwrap()[0][0], "A");
|
|
1502
|
-
assert_eq!(json.get("markdown").unwrap(), "| A | B |\n|---|---|\n");
|
|
1503
|
-
assert_eq!(json.get("page_number").unwrap(), 1);
|
|
1504
|
-
}
|
|
1505
|
-
|
|
1506
|
-
#[test]
|
|
1507
|
-
fn test_arc_table_roundtrip() {
|
|
1508
|
-
let original = Table {
|
|
1509
|
-
cells: vec![
|
|
1510
|
-
vec!["X".to_string(), "Y".to_string()],
|
|
1511
|
-
vec!["1".to_string(), "2".to_string()],
|
|
1512
|
-
],
|
|
1513
|
-
markdown: "| X | Y |\n|---|---|\n| 1 | 2 |\n".to_string(),
|
|
1514
|
-
page_number: 5,
|
|
1515
|
-
};
|
|
1516
|
-
|
|
1517
|
-
let json = serde_json::to_string(&original).unwrap();
|
|
1518
|
-
let deserialized: Table = serde_json::from_str(&json).unwrap();
|
|
1519
|
-
|
|
1520
|
-
assert_eq!(deserialized.cells, original.cells);
|
|
1521
|
-
assert_eq!(deserialized.markdown, original.markdown);
|
|
1522
|
-
assert_eq!(deserialized.page_number, original.page_number);
|
|
1523
|
-
}
|
|
1524
|
-
|
|
1525
|
-
#[test]
|
|
1526
|
-
fn test_arc_sharing_preserved_before_serialization() {
|
|
1527
|
-
let shared_table = Arc::new(Table {
|
|
1528
|
-
cells: vec![vec!["shared".to_string()]],
|
|
1529
|
-
markdown: "| shared |".to_string(),
|
|
1530
|
-
page_number: 1,
|
|
1531
|
-
});
|
|
1532
|
-
|
|
1533
|
-
let tables_before = [Arc::clone(&shared_table), Arc::clone(&shared_table)].to_vec();
|
|
1534
|
-
assert_eq!(Arc::strong_count(&tables_before[0]), 3);
|
|
1535
|
-
assert_eq!(Arc::strong_count(&tables_before[1]), 3);
|
|
1536
|
-
assert!(Arc::ptr_eq(&tables_before[0], &tables_before[1]));
|
|
1537
|
-
}
|
|
1538
|
-
|
|
1539
|
-
#[test]
|
|
1540
|
-
fn test_vec_arc_table_serialization_format() {
|
|
1541
|
-
let tables = vec![
|
|
1542
|
-
Table {
|
|
1543
|
-
cells: vec![vec!["A".to_string()]],
|
|
1544
|
-
markdown: "| A |".to_string(),
|
|
1545
|
-
page_number: 1,
|
|
1546
|
-
},
|
|
1547
|
-
Table {
|
|
1548
|
-
cells: vec![vec!["B".to_string()]],
|
|
1549
|
-
markdown: "| B |".to_string(),
|
|
1550
|
-
page_number: 2,
|
|
1551
|
-
},
|
|
1552
|
-
];
|
|
1553
|
-
|
|
1554
|
-
let json = serde_json::to_string(&tables).unwrap();
|
|
1555
|
-
let parsed: serde_json::Value = serde_json::from_str(&json).unwrap();
|
|
1556
|
-
|
|
1557
|
-
assert!(parsed.is_array());
|
|
1558
|
-
assert_eq!(parsed.as_array().unwrap().len(), 2);
|
|
1559
|
-
assert_eq!(parsed[0]["cells"][0][0], "A");
|
|
1560
|
-
assert_eq!(parsed[1]["cells"][0][0], "B");
|
|
1561
|
-
}
|
|
1562
|
-
|
|
1563
|
-
#[test]
|
|
1564
|
-
fn test_page_content_arc_tables_roundtrip() {
|
|
1565
|
-
let page = PageContent {
|
|
1566
|
-
page_number: 3,
|
|
1567
|
-
content: "Page 3 content".to_string(),
|
|
1568
|
-
tables: vec![
|
|
1569
|
-
Arc::new(Table {
|
|
1570
|
-
cells: vec![vec!["Table1".to_string()]],
|
|
1571
|
-
markdown: "| Table1 |".to_string(),
|
|
1572
|
-
page_number: 3,
|
|
1573
|
-
}),
|
|
1574
|
-
Arc::new(Table {
|
|
1575
|
-
cells: vec![vec!["Table2".to_string()]],
|
|
1576
|
-
markdown: "| Table2 |".to_string(),
|
|
1577
|
-
page_number: 3,
|
|
1578
|
-
}),
|
|
1579
|
-
],
|
|
1580
|
-
images: Vec::new(),
|
|
1581
|
-
hierarchy: None,
|
|
1582
|
-
};
|
|
1583
|
-
|
|
1584
|
-
let json = serde_json::to_string(&page).unwrap();
|
|
1585
|
-
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
1586
|
-
|
|
1587
|
-
assert_eq!(deserialized.page_number, 3);
|
|
1588
|
-
assert_eq!(deserialized.content, "Page 3 content");
|
|
1589
|
-
assert_eq!(deserialized.tables.len(), 2);
|
|
1590
|
-
assert_eq!(deserialized.tables[0].cells[0][0], "Table1");
|
|
1591
|
-
assert_eq!(deserialized.tables[1].cells[0][0], "Table2");
|
|
1592
|
-
}
|
|
1593
|
-
|
|
1594
|
-
#[test]
|
|
1595
|
-
fn test_page_content_arc_images_roundtrip() {
|
|
1596
|
-
let image1 = Arc::new(ExtractedImage {
|
|
1597
|
-
data: vec![0xFF, 0xD8, 0xFF],
|
|
1598
|
-
format: "jpeg".to_string(),
|
|
1599
|
-
image_index: 0,
|
|
1600
|
-
page_number: Some(1),
|
|
1601
|
-
width: Some(100),
|
|
1602
|
-
height: Some(200),
|
|
1603
|
-
colorspace: Some("RGB".to_string()),
|
|
1604
|
-
bits_per_component: Some(8),
|
|
1605
|
-
is_mask: false,
|
|
1606
|
-
description: Some("Image 1".to_string()),
|
|
1607
|
-
ocr_result: None,
|
|
1608
|
-
});
|
|
1609
|
-
|
|
1610
|
-
let image2 = Arc::new(ExtractedImage {
|
|
1611
|
-
data: vec![0x89, 0x50, 0x4E],
|
|
1612
|
-
format: "png".to_string(),
|
|
1613
|
-
image_index: 1,
|
|
1614
|
-
page_number: Some(1),
|
|
1615
|
-
width: Some(300),
|
|
1616
|
-
height: Some(400),
|
|
1617
|
-
colorspace: Some("RGBA".to_string()),
|
|
1618
|
-
bits_per_component: Some(8),
|
|
1619
|
-
is_mask: false,
|
|
1620
|
-
description: Some("Image 2".to_string()),
|
|
1621
|
-
ocr_result: None,
|
|
1622
|
-
});
|
|
1623
|
-
|
|
1624
|
-
let page = PageContent {
|
|
1625
|
-
page_number: 1,
|
|
1626
|
-
content: "Page with images".to_string(),
|
|
1627
|
-
tables: Vec::new(),
|
|
1628
|
-
images: vec![image1, image2],
|
|
1629
|
-
hierarchy: None,
|
|
1630
|
-
};
|
|
1631
|
-
|
|
1632
|
-
let json = serde_json::to_string(&page).unwrap();
|
|
1633
|
-
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
1634
|
-
|
|
1635
|
-
assert_eq!(deserialized.images.len(), 2);
|
|
1636
|
-
assert_eq!(deserialized.images[0].format, "jpeg");
|
|
1637
|
-
assert_eq!(deserialized.images[0].width, Some(100));
|
|
1638
|
-
assert_eq!(deserialized.images[1].format, "png");
|
|
1639
|
-
assert_eq!(deserialized.images[1].height, Some(400));
|
|
1640
|
-
}
|
|
1641
|
-
|
|
1642
|
-
#[test]
|
|
1643
|
-
fn test_arc_sharing_loss_with_page_content() {
|
|
1644
|
-
let shared_table = Arc::new(Table {
|
|
1645
|
-
cells: vec![vec!["shared across pages".to_string()]],
|
|
1646
|
-
markdown: "| shared across pages |".to_string(),
|
|
1647
|
-
page_number: 0,
|
|
1648
|
-
});
|
|
1649
|
-
|
|
1650
|
-
let page1 = PageContent {
|
|
1651
|
-
page_number: 1,
|
|
1652
|
-
content: "Page 1".to_string(),
|
|
1653
|
-
tables: vec![Arc::clone(&shared_table)],
|
|
1654
|
-
images: Vec::new(),
|
|
1655
|
-
hierarchy: None,
|
|
1656
|
-
};
|
|
1657
|
-
|
|
1658
|
-
let page2 = PageContent {
|
|
1659
|
-
page_number: 2,
|
|
1660
|
-
content: "Page 2".to_string(),
|
|
1661
|
-
tables: vec![Arc::clone(&shared_table)],
|
|
1662
|
-
images: Vec::new(),
|
|
1663
|
-
hierarchy: None,
|
|
1664
|
-
};
|
|
1665
|
-
|
|
1666
|
-
assert!(Arc::ptr_eq(&page1.tables[0], &page2.tables[0]));
|
|
1667
|
-
|
|
1668
|
-
let pages = vec![page1, page2];
|
|
1669
|
-
let json = serde_json::to_string(&pages).unwrap();
|
|
1670
|
-
let deserialized: Vec<PageContent> = serde_json::from_str(&json).unwrap();
|
|
1671
|
-
|
|
1672
|
-
assert_eq!(deserialized.len(), 2);
|
|
1673
|
-
assert_eq!(deserialized[0].tables[0].cells, deserialized[1].tables[0].cells);
|
|
1674
|
-
assert!(!Arc::ptr_eq(&deserialized[0].tables[0], &deserialized[1].tables[0]));
|
|
1675
|
-
}
|
|
1676
|
-
|
|
1677
|
-
#[test]
|
|
1678
|
-
fn test_empty_page_content_arcs() {
|
|
1679
|
-
let page = PageContent {
|
|
1680
|
-
page_number: 5,
|
|
1681
|
-
content: "No tables or images".to_string(),
|
|
1682
|
-
tables: Vec::new(),
|
|
1683
|
-
images: Vec::new(),
|
|
1684
|
-
hierarchy: None,
|
|
1685
|
-
};
|
|
1686
|
-
|
|
1687
|
-
let json = serde_json::to_string(&page).unwrap();
|
|
1688
|
-
let deserialized: PageContent = serde_json::from_str(&json).unwrap();
|
|
1689
|
-
|
|
1690
|
-
assert_eq!(deserialized.page_number, 5);
|
|
1691
|
-
assert_eq!(deserialized.tables.len(), 0);
|
|
1692
|
-
assert_eq!(deserialized.images.len(), 0);
|
|
1693
|
-
}
|
|
1694
|
-
|
|
1695
|
-
#[test]
|
|
1696
|
-
fn test_serde_vec_arc_module_behavior() {
|
|
1697
|
-
let table1 = Table {
|
|
1698
|
-
cells: vec![vec!["A".to_string()]],
|
|
1699
|
-
markdown: "| A |".to_string(),
|
|
1700
|
-
page_number: 1,
|
|
1701
|
-
};
|
|
1702
|
-
|
|
1703
|
-
let table2 = Table {
|
|
1704
|
-
cells: vec![vec!["B".to_string()]],
|
|
1705
|
-
markdown: "| B |".to_string(),
|
|
1706
|
-
page_number: 2,
|
|
1707
|
-
};
|
|
1708
|
-
|
|
1709
|
-
let json = serde_json::to_string(&vec![table1, table2]).unwrap();
|
|
1710
|
-
assert!(json.contains("\"A\""));
|
|
1711
|
-
assert!(json.contains("\"B\""));
|
|
1712
|
-
}
|
|
1713
|
-
}
|