kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
//! Djot document types.
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines types for representing Djot document structures.
|
|
4
|
+
|
|
5
|
+
use serde::{Deserialize, Serialize};
|
|
6
|
+
use std::collections::HashMap;
|
|
7
|
+
|
|
8
|
+
// Import Metadata and Table from parent module
|
|
9
|
+
use super::Table;
|
|
10
|
+
use super::metadata::Metadata;
|
|
11
|
+
|
|
12
|
+
/// Comprehensive Djot document structure with semantic preservation.
|
|
13
|
+
///
|
|
14
|
+
/// This type captures the full richness of Djot markup, including:
|
|
15
|
+
/// - Block-level structures (headings, lists, blockquotes, code blocks, etc.)
|
|
16
|
+
/// - Inline formatting (emphasis, strong, highlight, subscript, superscript, etc.)
|
|
17
|
+
/// - Attributes (classes, IDs, key-value pairs)
|
|
18
|
+
/// - Links, images, footnotes
|
|
19
|
+
/// - Math expressions (inline and display)
|
|
20
|
+
/// - Tables with full structure
|
|
21
|
+
///
|
|
22
|
+
/// Available when the `djot` feature is enabled.
|
|
23
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
24
|
+
pub struct DjotContent {
|
|
25
|
+
/// Plain text representation for backwards compatibility
|
|
26
|
+
pub plain_text: String,
|
|
27
|
+
|
|
28
|
+
/// Structured block-level content
|
|
29
|
+
pub blocks: Vec<FormattedBlock>,
|
|
30
|
+
|
|
31
|
+
/// Metadata from YAML frontmatter
|
|
32
|
+
pub metadata: Metadata,
|
|
33
|
+
|
|
34
|
+
/// Extracted tables as structured data
|
|
35
|
+
pub tables: Vec<Table>,
|
|
36
|
+
|
|
37
|
+
/// Extracted images with metadata
|
|
38
|
+
pub images: Vec<DjotImage>,
|
|
39
|
+
|
|
40
|
+
/// Extracted links with URLs
|
|
41
|
+
pub links: Vec<DjotLink>,
|
|
42
|
+
|
|
43
|
+
/// Footnote definitions
|
|
44
|
+
pub footnotes: Vec<Footnote>,
|
|
45
|
+
|
|
46
|
+
/// Attributes mapped by element identifier (if present)
|
|
47
|
+
#[serde(skip_serializing_if = "HashMap::is_empty", default)]
|
|
48
|
+
pub attributes: HashMap<String, Attributes>,
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Block-level element in a Djot document.
|
|
52
|
+
///
|
|
53
|
+
/// Represents structural elements like headings, paragraphs, lists, code blocks, etc.
|
|
54
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
55
|
+
pub struct FormattedBlock {
|
|
56
|
+
/// Type of block element
|
|
57
|
+
pub block_type: BlockType,
|
|
58
|
+
|
|
59
|
+
/// Heading level (1-6) for headings, or nesting level for lists
|
|
60
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
61
|
+
pub level: Option<usize>,
|
|
62
|
+
|
|
63
|
+
/// Inline content within the block
|
|
64
|
+
pub inline_content: Vec<InlineElement>,
|
|
65
|
+
|
|
66
|
+
/// Element attributes (classes, IDs, key-value pairs)
|
|
67
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
68
|
+
pub attributes: Option<Attributes>,
|
|
69
|
+
|
|
70
|
+
/// Language identifier for code blocks
|
|
71
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
72
|
+
pub language: Option<String>,
|
|
73
|
+
|
|
74
|
+
/// Raw code content for code blocks
|
|
75
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
76
|
+
pub code: Option<String>,
|
|
77
|
+
|
|
78
|
+
/// Nested blocks for containers (blockquotes, list items, divs)
|
|
79
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
80
|
+
pub children: Vec<FormattedBlock>,
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
/// Types of block-level elements in Djot.
|
|
84
|
+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
85
|
+
#[serde(rename_all = "snake_case")]
|
|
86
|
+
pub enum BlockType {
|
|
87
|
+
Paragraph,
|
|
88
|
+
Heading,
|
|
89
|
+
Blockquote,
|
|
90
|
+
CodeBlock,
|
|
91
|
+
ListItem,
|
|
92
|
+
OrderedList,
|
|
93
|
+
BulletList,
|
|
94
|
+
TaskList,
|
|
95
|
+
DefinitionList,
|
|
96
|
+
DefinitionTerm,
|
|
97
|
+
DefinitionDescription,
|
|
98
|
+
Div,
|
|
99
|
+
Section,
|
|
100
|
+
ThematicBreak,
|
|
101
|
+
RawBlock,
|
|
102
|
+
MathDisplay,
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/// Inline element within a block.
|
|
106
|
+
///
|
|
107
|
+
/// Represents text with formatting, links, images, etc.
|
|
108
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
109
|
+
pub struct InlineElement {
|
|
110
|
+
/// Type of inline element
|
|
111
|
+
pub element_type: InlineType,
|
|
112
|
+
|
|
113
|
+
/// Text content
|
|
114
|
+
pub content: String,
|
|
115
|
+
|
|
116
|
+
/// Element attributes
|
|
117
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
118
|
+
pub attributes: Option<Attributes>,
|
|
119
|
+
|
|
120
|
+
/// Additional metadata (e.g., href for links, src/alt for images)
|
|
121
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
122
|
+
pub metadata: Option<HashMap<String, String>>,
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
/// Types of inline elements in Djot.
|
|
126
|
+
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
|
127
|
+
#[serde(rename_all = "snake_case")]
|
|
128
|
+
pub enum InlineType {
|
|
129
|
+
Text,
|
|
130
|
+
Strong,
|
|
131
|
+
Emphasis,
|
|
132
|
+
Highlight,
|
|
133
|
+
Subscript,
|
|
134
|
+
Superscript,
|
|
135
|
+
Insert,
|
|
136
|
+
Delete,
|
|
137
|
+
Code,
|
|
138
|
+
Link,
|
|
139
|
+
Image,
|
|
140
|
+
Span,
|
|
141
|
+
Math,
|
|
142
|
+
RawInline,
|
|
143
|
+
FootnoteRef,
|
|
144
|
+
Symbol,
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
/// Element attributes in Djot.
|
|
148
|
+
///
|
|
149
|
+
/// Represents the attributes attached to elements using {.class #id key="value"} syntax.
|
|
150
|
+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
151
|
+
pub struct Attributes {
|
|
152
|
+
/// Element ID (#identifier)
|
|
153
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
154
|
+
pub id: Option<String>,
|
|
155
|
+
|
|
156
|
+
/// CSS classes (.class1 .class2)
|
|
157
|
+
#[serde(skip_serializing_if = "Vec::is_empty", default)]
|
|
158
|
+
pub classes: Vec<String>,
|
|
159
|
+
|
|
160
|
+
/// Key-value pairs (key="value")
|
|
161
|
+
#[serde(skip_serializing_if = "HashMap::is_empty", default)]
|
|
162
|
+
pub key_values: HashMap<String, String>,
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
/// Image element in Djot.
|
|
166
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
167
|
+
pub struct DjotImage {
|
|
168
|
+
/// Image source URL or path
|
|
169
|
+
pub src: String,
|
|
170
|
+
|
|
171
|
+
/// Alternative text
|
|
172
|
+
pub alt: String,
|
|
173
|
+
|
|
174
|
+
/// Optional title
|
|
175
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
176
|
+
pub title: Option<String>,
|
|
177
|
+
|
|
178
|
+
/// Element attributes
|
|
179
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
180
|
+
pub attributes: Option<Attributes>,
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
/// Link element in Djot.
|
|
184
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
185
|
+
pub struct DjotLink {
|
|
186
|
+
/// Link URL
|
|
187
|
+
pub url: String,
|
|
188
|
+
|
|
189
|
+
/// Link text content
|
|
190
|
+
pub text: String,
|
|
191
|
+
|
|
192
|
+
/// Optional title
|
|
193
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
194
|
+
pub title: Option<String>,
|
|
195
|
+
|
|
196
|
+
/// Element attributes
|
|
197
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
198
|
+
pub attributes: Option<Attributes>,
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/// Footnote in Djot.
|
|
202
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
203
|
+
pub struct Footnote {
|
|
204
|
+
/// Footnote label
|
|
205
|
+
pub label: String,
|
|
206
|
+
|
|
207
|
+
/// Footnote content blocks
|
|
208
|
+
pub content: Vec<FormattedBlock>,
|
|
209
|
+
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
//! Core extraction types and results.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
|
|
6
|
+
use super::djot::DjotContent;
|
|
7
|
+
use super::metadata::Metadata;
|
|
8
|
+
use super::page::PageContent;
|
|
9
|
+
use super::tables::Table;
|
|
10
|
+
|
|
11
|
+
/// General extraction result used by the core extraction API.
|
|
12
|
+
///
|
|
13
|
+
/// This is the main result type returned by all extraction functions.
|
|
14
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
15
|
+
pub struct ExtractionResult {
|
|
16
|
+
pub content: String,
|
|
17
|
+
pub mime_type: String,
|
|
18
|
+
pub metadata: Metadata,
|
|
19
|
+
pub tables: Vec<Table>,
|
|
20
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
21
|
+
pub detected_languages: Option<Vec<String>>,
|
|
22
|
+
|
|
23
|
+
/// Text chunks when chunking is enabled.
|
|
24
|
+
///
|
|
25
|
+
/// When chunking configuration is provided, the content is split into
|
|
26
|
+
/// overlapping chunks for efficient processing. Each chunk contains the text,
|
|
27
|
+
/// optional embeddings (if enabled), and metadata about its position.
|
|
28
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
29
|
+
pub chunks: Option<Vec<Chunk>>,
|
|
30
|
+
|
|
31
|
+
/// Extracted images from the document.
|
|
32
|
+
///
|
|
33
|
+
/// When image extraction is enabled via `ImageExtractionConfig`, this field
|
|
34
|
+
/// contains all images found in the document with their raw data and metadata.
|
|
35
|
+
/// Each image may optionally contain a nested `ocr_result` if OCR was performed.
|
|
36
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
37
|
+
pub images: Option<Vec<ExtractedImage>>,
|
|
38
|
+
|
|
39
|
+
/// Per-page content when page extraction is enabled.
|
|
40
|
+
///
|
|
41
|
+
/// When page extraction is configured, the document is split into per-page content
|
|
42
|
+
/// with tables and images mapped to their respective pages.
|
|
43
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
44
|
+
pub pages: Option<Vec<PageContent>>,
|
|
45
|
+
|
|
46
|
+
/// Semantic elements when element-based output format is enabled.
|
|
47
|
+
///
|
|
48
|
+
/// When output_format is set to ElementBased, this field contains semantic
|
|
49
|
+
/// elements with type classification, unique identifiers, and metadata for
|
|
50
|
+
/// Unstructured-compatible element-based processing.
|
|
51
|
+
#[serde(skip_serializing_if = "Option::is_none", default)]
|
|
52
|
+
pub elements: Option<Vec<Element>>,
|
|
53
|
+
|
|
54
|
+
/// Rich Djot content structure (when extracting Djot documents).
|
|
55
|
+
///
|
|
56
|
+
/// When extracting Djot documents with structured extraction enabled,
|
|
57
|
+
/// this field contains the full semantic structure including:
|
|
58
|
+
/// - Block-level elements with nesting
|
|
59
|
+
/// - Inline formatting with attributes
|
|
60
|
+
/// - Links, images, footnotes
|
|
61
|
+
/// - Math expressions
|
|
62
|
+
/// - Complete attribute information
|
|
63
|
+
///
|
|
64
|
+
/// The `content` field still contains plain text for backward compatibility.
|
|
65
|
+
///
|
|
66
|
+
/// Always `None` for non-Djot documents.
|
|
67
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
68
|
+
#[serde(default)]
|
|
69
|
+
pub djot_content: Option<DjotContent>,
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/// A text chunk with optional embedding and metadata.
|
|
73
|
+
///
|
|
74
|
+
/// Chunks are created when chunking is enabled in `ExtractionConfig`. Each chunk
|
|
75
|
+
/// contains the text content, optional embedding vector (if embedding generation
|
|
76
|
+
/// is configured), and metadata about its position in the document.
|
|
77
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
78
|
+
pub struct Chunk {
|
|
79
|
+
/// The text content of this chunk.
|
|
80
|
+
pub content: String,
|
|
81
|
+
|
|
82
|
+
/// Optional embedding vector for this chunk.
|
|
83
|
+
///
|
|
84
|
+
/// Only populated when `EmbeddingConfig` is provided in chunking configuration.
|
|
85
|
+
/// The dimensionality depends on the chosen embedding model.
|
|
86
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
87
|
+
pub embedding: Option<Vec<f32>>,
|
|
88
|
+
|
|
89
|
+
/// Metadata about this chunk's position and properties.
|
|
90
|
+
pub metadata: ChunkMetadata,
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
/// Metadata about a chunk's position in the original document.
|
|
94
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
95
|
+
pub struct ChunkMetadata {
|
|
96
|
+
/// Byte offset where this chunk starts in the original text (UTF-8 valid boundary).
|
|
97
|
+
pub byte_start: usize,
|
|
98
|
+
|
|
99
|
+
/// Byte offset where this chunk ends in the original text (UTF-8 valid boundary).
|
|
100
|
+
pub byte_end: usize,
|
|
101
|
+
|
|
102
|
+
/// Number of tokens in this chunk (if available).
|
|
103
|
+
///
|
|
104
|
+
/// This is calculated by the embedding model's tokenizer if embeddings are enabled.
|
|
105
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
106
|
+
pub token_count: Option<usize>,
|
|
107
|
+
|
|
108
|
+
/// Zero-based index of this chunk in the document.
|
|
109
|
+
pub chunk_index: usize,
|
|
110
|
+
|
|
111
|
+
/// Total number of chunks in the document.
|
|
112
|
+
pub total_chunks: usize,
|
|
113
|
+
|
|
114
|
+
/// First page number this chunk spans (1-indexed).
|
|
115
|
+
///
|
|
116
|
+
/// Only populated when page tracking is enabled in extraction configuration.
|
|
117
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
118
|
+
pub first_page: Option<usize>,
|
|
119
|
+
|
|
120
|
+
/// Last page number this chunk spans (1-indexed, equal to first_page for single-page chunks).
|
|
121
|
+
///
|
|
122
|
+
/// Only populated when page tracking is enabled in extraction configuration.
|
|
123
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
124
|
+
pub last_page: Option<usize>,
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
/// Extracted image from a document.
|
|
128
|
+
///
|
|
129
|
+
/// Contains raw image data, metadata, and optional nested OCR results.
|
|
130
|
+
/// Raw bytes allow cross-language compatibility - users can convert to
|
|
131
|
+
/// PIL.Image (Python), Sharp (Node.js), or other formats as needed.
|
|
132
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
133
|
+
pub struct ExtractedImage {
|
|
134
|
+
/// Raw image data (PNG, JPEG, WebP, etc. bytes)
|
|
135
|
+
pub data: Vec<u8>,
|
|
136
|
+
|
|
137
|
+
/// Image format (e.g., "jpeg", "png", "webp")
|
|
138
|
+
pub format: String,
|
|
139
|
+
|
|
140
|
+
/// Zero-indexed position of this image in the document/page
|
|
141
|
+
pub image_index: usize,
|
|
142
|
+
|
|
143
|
+
/// Page/slide number where image was found (1-indexed)
|
|
144
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
145
|
+
pub page_number: Option<usize>,
|
|
146
|
+
|
|
147
|
+
/// Image width in pixels
|
|
148
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
149
|
+
pub width: Option<u32>,
|
|
150
|
+
|
|
151
|
+
/// Image height in pixels
|
|
152
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
153
|
+
pub height: Option<u32>,
|
|
154
|
+
|
|
155
|
+
/// Colorspace information (e.g., "RGB", "CMYK", "Gray")
|
|
156
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
157
|
+
pub colorspace: Option<String>,
|
|
158
|
+
|
|
159
|
+
/// Bits per color component (e.g., 8, 16)
|
|
160
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
161
|
+
pub bits_per_component: Option<u32>,
|
|
162
|
+
|
|
163
|
+
/// Whether this image is a mask image
|
|
164
|
+
#[serde(default)]
|
|
165
|
+
pub is_mask: bool,
|
|
166
|
+
|
|
167
|
+
/// Optional description of the image
|
|
168
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
169
|
+
pub description: Option<String>,
|
|
170
|
+
|
|
171
|
+
/// Nested OCR extraction result (if image was OCRed)
|
|
172
|
+
///
|
|
173
|
+
/// When OCR is performed on this image, the result is embedded here
|
|
174
|
+
/// rather than in a separate collection, making the relationship explicit.
|
|
175
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
176
|
+
pub ocr_result: Option<Box<ExtractionResult>>,
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
// ============================================================================
|
|
180
|
+
// Element-based Output Format Types (Unstructured-compatible)
|
|
181
|
+
// ============================================================================
|
|
182
|
+
|
|
183
|
+
/// Output format selection for extraction results.
|
|
184
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
|
|
185
|
+
#[serde(rename_all = "snake_case")]
|
|
186
|
+
pub enum OutputFormat {
|
|
187
|
+
/// Unified format with all content in `content` field
|
|
188
|
+
#[default]
|
|
189
|
+
Unified,
|
|
190
|
+
/// Element-based format with semantic element extraction
|
|
191
|
+
ElementBased,
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
/// Unique identifier for semantic elements.
|
|
195
|
+
///
|
|
196
|
+
/// Wraps a string identifier that is deterministically generated
|
|
197
|
+
/// from element type, content, and page number.
|
|
198
|
+
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
|
199
|
+
pub struct ElementId(String);
|
|
200
|
+
|
|
201
|
+
impl ElementId {
|
|
202
|
+
/// Create a new ElementId from a string.
|
|
203
|
+
///
|
|
204
|
+
/// # Errors
|
|
205
|
+
///
|
|
206
|
+
/// Returns error if the string is not valid.
|
|
207
|
+
pub fn new(hex_str: impl Into<String>) -> std::result::Result<Self, String> {
|
|
208
|
+
let s = hex_str.into();
|
|
209
|
+
if s.is_empty() {
|
|
210
|
+
return Err("ElementId cannot be empty".to_string());
|
|
211
|
+
}
|
|
212
|
+
Ok(ElementId(s))
|
|
213
|
+
}
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
impl AsRef<str> for ElementId {
|
|
217
|
+
fn as_ref(&self) -> &str {
|
|
218
|
+
&self.0
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
impl std::fmt::Display for ElementId {
|
|
223
|
+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
224
|
+
write!(f, "{}", self.0)
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
/// Semantic element type classification.
|
|
229
|
+
///
|
|
230
|
+
/// Categorizes text content into semantic units for downstream processing.
|
|
231
|
+
/// Supports the element types commonly found in Unstructured documents.
|
|
232
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
|
233
|
+
#[serde(rename_all = "snake_case")]
|
|
234
|
+
pub enum ElementType {
|
|
235
|
+
/// Document title
|
|
236
|
+
Title,
|
|
237
|
+
/// Main narrative text body
|
|
238
|
+
NarrativeText,
|
|
239
|
+
/// Section heading
|
|
240
|
+
Heading,
|
|
241
|
+
/// List item (bullet, numbered, etc.)
|
|
242
|
+
ListItem,
|
|
243
|
+
/// Table element
|
|
244
|
+
Table,
|
|
245
|
+
/// Image element
|
|
246
|
+
Image,
|
|
247
|
+
/// Page break marker
|
|
248
|
+
PageBreak,
|
|
249
|
+
/// Code block
|
|
250
|
+
CodeBlock,
|
|
251
|
+
/// Block quote
|
|
252
|
+
BlockQuote,
|
|
253
|
+
/// Footer text
|
|
254
|
+
Footer,
|
|
255
|
+
/// Header text
|
|
256
|
+
Header,
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
/// Bounding box coordinates for element positioning.
|
|
260
|
+
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
|
|
261
|
+
pub struct BoundingBox {
|
|
262
|
+
/// Left x-coordinate
|
|
263
|
+
pub x0: f64,
|
|
264
|
+
/// Bottom y-coordinate
|
|
265
|
+
pub y0: f64,
|
|
266
|
+
/// Right x-coordinate
|
|
267
|
+
pub x1: f64,
|
|
268
|
+
/// Top y-coordinate
|
|
269
|
+
pub y1: f64,
|
|
270
|
+
}
|
|
271
|
+
|
|
272
|
+
/// Metadata for a semantic element.
|
|
273
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
274
|
+
pub struct ElementMetadata {
|
|
275
|
+
/// Page number (1-indexed)
|
|
276
|
+
pub page_number: Option<usize>,
|
|
277
|
+
/// Source filename or document name
|
|
278
|
+
pub filename: Option<String>,
|
|
279
|
+
/// Bounding box coordinates if available
|
|
280
|
+
pub coordinates: Option<BoundingBox>,
|
|
281
|
+
/// Position index in the element sequence
|
|
282
|
+
pub element_index: Option<usize>,
|
|
283
|
+
/// Additional custom metadata
|
|
284
|
+
pub additional: HashMap<String, String>,
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
/// Semantic element extracted from document.
|
|
288
|
+
///
|
|
289
|
+
/// Represents a logical unit of content with semantic classification,
|
|
290
|
+
/// unique identifier, and metadata for tracking origin and position.
|
|
291
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
292
|
+
pub struct Element {
|
|
293
|
+
/// Unique element identifier
|
|
294
|
+
pub element_id: ElementId,
|
|
295
|
+
/// Semantic type of this element
|
|
296
|
+
pub element_type: ElementType,
|
|
297
|
+
/// Text content of the element
|
|
298
|
+
pub text: String,
|
|
299
|
+
/// Metadata about the element
|
|
300
|
+
pub metadata: ElementMetadata,
|
|
301
|
+
}
|