kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,560 @@
|
|
|
1
|
+
//! Metadata types for extraction results.
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines metadata structures for various document formats.
|
|
4
|
+
|
|
5
|
+
use serde::{Deserialize, Serialize};
|
|
6
|
+
use std::collections::{BTreeMap, HashMap};
|
|
7
|
+
|
|
8
|
+
#[cfg(feature = "pdf")]
|
|
9
|
+
use crate::pdf::metadata::PdfMetadata;
|
|
10
|
+
|
|
11
|
+
use super::formats::ImagePreprocessingMetadata;
|
|
12
|
+
use super::page::PageStructure;
|
|
13
|
+
|
|
14
|
+
/// Format-specific metadata (discriminated union).
|
|
15
|
+
///
|
|
16
|
+
/// Only one format type can exist per extraction result. This provides
|
|
17
|
+
/// type-safe, clean metadata without nested optionals.
|
|
18
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
19
|
+
#[serde(tag = "format_type", rename_all = "snake_case")]
|
|
20
|
+
pub enum FormatMetadata {
|
|
21
|
+
#[cfg(feature = "pdf")]
|
|
22
|
+
Pdf(PdfMetadata),
|
|
23
|
+
Excel(ExcelMetadata),
|
|
24
|
+
Email(EmailMetadata),
|
|
25
|
+
Pptx(PptxMetadata),
|
|
26
|
+
Archive(ArchiveMetadata),
|
|
27
|
+
Image(ImageMetadata),
|
|
28
|
+
Xml(XmlMetadata),
|
|
29
|
+
Text(TextMetadata),
|
|
30
|
+
Html(Box<HtmlMetadata>),
|
|
31
|
+
Ocr(OcrMetadata),
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/// Extraction result metadata.
|
|
35
|
+
///
|
|
36
|
+
/// Contains common fields applicable to all formats, format-specific metadata
|
|
37
|
+
/// via a discriminated union, and additional custom fields from postprocessors.
|
|
38
|
+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
39
|
+
pub struct Metadata {
|
|
40
|
+
/// Document title
|
|
41
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
42
|
+
pub title: Option<String>,
|
|
43
|
+
|
|
44
|
+
/// Document subject or description
|
|
45
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
46
|
+
pub subject: Option<String>,
|
|
47
|
+
|
|
48
|
+
/// Primary author(s) - always Vec for consistency
|
|
49
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
50
|
+
pub authors: Option<Vec<String>>,
|
|
51
|
+
|
|
52
|
+
/// Keywords/tags - always Vec for consistency
|
|
53
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
54
|
+
pub keywords: Option<Vec<String>>,
|
|
55
|
+
|
|
56
|
+
/// Primary language (ISO 639 code)
|
|
57
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
58
|
+
pub language: Option<String>,
|
|
59
|
+
|
|
60
|
+
/// Creation timestamp (ISO 8601 format)
|
|
61
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
62
|
+
pub created_at: Option<String>,
|
|
63
|
+
|
|
64
|
+
/// Last modification timestamp (ISO 8601 format)
|
|
65
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
66
|
+
pub modified_at: Option<String>,
|
|
67
|
+
|
|
68
|
+
/// User who created the document
|
|
69
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
70
|
+
pub created_by: Option<String>,
|
|
71
|
+
|
|
72
|
+
/// User who last modified the document
|
|
73
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
74
|
+
pub modified_by: Option<String>,
|
|
75
|
+
|
|
76
|
+
/// Page/slide/sheet structure with boundaries
|
|
77
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
78
|
+
pub pages: Option<PageStructure>,
|
|
79
|
+
|
|
80
|
+
/// Format-specific metadata (discriminated union)
|
|
81
|
+
///
|
|
82
|
+
/// Contains detailed metadata specific to the document format.
|
|
83
|
+
/// Serializes with a `format_type` discriminator field.
|
|
84
|
+
#[serde(flatten, skip_serializing_if = "Option::is_none")]
|
|
85
|
+
pub format: Option<FormatMetadata>,
|
|
86
|
+
|
|
87
|
+
/// Image preprocessing metadata (when OCR preprocessing was applied)
|
|
88
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
89
|
+
pub image_preprocessing: Option<ImagePreprocessingMetadata>,
|
|
90
|
+
|
|
91
|
+
/// JSON schema (for structured data extraction)
|
|
92
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
93
|
+
pub json_schema: Option<serde_json::Value>,
|
|
94
|
+
|
|
95
|
+
/// Error metadata (for batch operations)
|
|
96
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
97
|
+
pub error: Option<ErrorMetadata>,
|
|
98
|
+
|
|
99
|
+
/// Additional custom fields from postprocessors.
|
|
100
|
+
///
|
|
101
|
+
/// This flattened HashMap allows Python/TypeScript postprocessors to add
|
|
102
|
+
/// arbitrary fields (entity extraction, keyword extraction, etc.).
|
|
103
|
+
/// Fields are merged at the root level during serialization.
|
|
104
|
+
#[serde(flatten)]
|
|
105
|
+
pub additional: HashMap<String, serde_json::Value>,
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Excel/spreadsheet metadata.
|
|
109
|
+
///
|
|
110
|
+
/// Contains information about sheets in Excel, LibreOffice Calc, and other
|
|
111
|
+
/// spreadsheet formats (.xlsx, .xls, .ods, etc.).
|
|
112
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
113
|
+
pub struct ExcelMetadata {
|
|
114
|
+
/// Total number of sheets in the workbook
|
|
115
|
+
pub sheet_count: usize,
|
|
116
|
+
/// Names of all sheets in order
|
|
117
|
+
pub sheet_names: Vec<String>,
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
/// Email metadata extracted from .eml and .msg files.
|
|
121
|
+
///
|
|
122
|
+
/// Includes sender/recipient information, message ID, and attachment list.
|
|
123
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
124
|
+
pub struct EmailMetadata {
|
|
125
|
+
/// Sender's email address
|
|
126
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
127
|
+
pub from_email: Option<String>,
|
|
128
|
+
|
|
129
|
+
/// Sender's display name
|
|
130
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
131
|
+
pub from_name: Option<String>,
|
|
132
|
+
|
|
133
|
+
/// Primary recipients
|
|
134
|
+
pub to_emails: Vec<String>,
|
|
135
|
+
/// CC recipients
|
|
136
|
+
pub cc_emails: Vec<String>,
|
|
137
|
+
/// BCC recipients
|
|
138
|
+
pub bcc_emails: Vec<String>,
|
|
139
|
+
|
|
140
|
+
/// Message-ID header value
|
|
141
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
142
|
+
pub message_id: Option<String>,
|
|
143
|
+
|
|
144
|
+
/// List of attachment filenames
|
|
145
|
+
pub attachments: Vec<String>,
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Archive (ZIP/TAR/7Z) metadata.
|
|
149
|
+
///
|
|
150
|
+
/// Extracted from compressed archive files containing file lists and size information.
|
|
151
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
152
|
+
pub struct ArchiveMetadata {
|
|
153
|
+
/// Archive format ("ZIP", "TAR", "7Z", etc.)
|
|
154
|
+
pub format: String,
|
|
155
|
+
/// Total number of files in the archive
|
|
156
|
+
pub file_count: usize,
|
|
157
|
+
/// List of file paths within the archive
|
|
158
|
+
pub file_list: Vec<String>,
|
|
159
|
+
/// Total uncompressed size in bytes
|
|
160
|
+
pub total_size: usize,
|
|
161
|
+
|
|
162
|
+
/// Compressed size in bytes (if available)
|
|
163
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
164
|
+
pub compressed_size: Option<usize>,
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/// Image metadata extracted from image files.
|
|
168
|
+
///
|
|
169
|
+
/// Includes dimensions, format, and EXIF data.
|
|
170
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
171
|
+
pub struct ImageMetadata {
|
|
172
|
+
/// Image width in pixels
|
|
173
|
+
pub width: u32,
|
|
174
|
+
/// Image height in pixels
|
|
175
|
+
pub height: u32,
|
|
176
|
+
/// Image format (e.g., "PNG", "JPEG", "TIFF")
|
|
177
|
+
pub format: String,
|
|
178
|
+
/// EXIF metadata tags
|
|
179
|
+
pub exif: HashMap<String, String>,
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
/// XML metadata extracted during XML parsing.
|
|
183
|
+
///
|
|
184
|
+
/// Provides statistics about XML document structure.
|
|
185
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
186
|
+
pub struct XmlMetadata {
|
|
187
|
+
/// Total number of XML elements processed
|
|
188
|
+
pub element_count: usize,
|
|
189
|
+
/// List of unique element tag names (sorted)
|
|
190
|
+
pub unique_elements: Vec<String>,
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
/// Text/Markdown metadata.
|
|
194
|
+
///
|
|
195
|
+
/// Extracted from plain text and Markdown files. Includes word counts and,
|
|
196
|
+
/// for Markdown, structural elements like headers and links.
|
|
197
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
198
|
+
pub struct TextMetadata {
|
|
199
|
+
/// Number of lines in the document
|
|
200
|
+
pub line_count: usize,
|
|
201
|
+
/// Number of words
|
|
202
|
+
pub word_count: usize,
|
|
203
|
+
/// Number of characters
|
|
204
|
+
pub character_count: usize,
|
|
205
|
+
|
|
206
|
+
/// Markdown headers (headings text only, for Markdown files)
|
|
207
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
208
|
+
pub headers: Option<Vec<String>>,
|
|
209
|
+
|
|
210
|
+
/// Markdown links as (text, url) tuples (for Markdown files)
|
|
211
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
212
|
+
pub links: Option<Vec<(String, String)>>,
|
|
213
|
+
|
|
214
|
+
/// Code blocks as (language, code) tuples (for Markdown files)
|
|
215
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
216
|
+
pub code_blocks: Option<Vec<(String, String)>>,
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
/// Text direction enumeration for HTML documents.
|
|
220
|
+
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
221
|
+
#[serde(rename_all = "lowercase")]
|
|
222
|
+
pub enum TextDirection {
|
|
223
|
+
/// Left-to-right text direction
|
|
224
|
+
#[serde(rename = "ltr")]
|
|
225
|
+
LeftToRight,
|
|
226
|
+
/// Right-to-left text direction
|
|
227
|
+
#[serde(rename = "rtl")]
|
|
228
|
+
RightToLeft,
|
|
229
|
+
/// Automatic text direction detection
|
|
230
|
+
#[serde(rename = "auto")]
|
|
231
|
+
Auto,
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
/// Header/heading element metadata.
|
|
235
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
236
|
+
pub struct HeaderMetadata {
|
|
237
|
+
/// Header level: 1 (h1) through 6 (h6)
|
|
238
|
+
pub level: u8,
|
|
239
|
+
/// Normalized text content of the header
|
|
240
|
+
pub text: String,
|
|
241
|
+
/// HTML id attribute if present
|
|
242
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
243
|
+
pub id: Option<String>,
|
|
244
|
+
/// Document tree depth at the header element
|
|
245
|
+
pub depth: usize,
|
|
246
|
+
/// Byte offset in original HTML document
|
|
247
|
+
pub html_offset: usize,
|
|
248
|
+
}
|
|
249
|
+
|
|
250
|
+
/// Link element metadata.
|
|
251
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
252
|
+
pub struct LinkMetadata {
|
|
253
|
+
/// The href URL value
|
|
254
|
+
pub href: String,
|
|
255
|
+
/// Link text content (normalized)
|
|
256
|
+
pub text: String,
|
|
257
|
+
/// Optional title attribute
|
|
258
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
259
|
+
pub title: Option<String>,
|
|
260
|
+
/// Link type classification
|
|
261
|
+
pub link_type: LinkType,
|
|
262
|
+
/// Rel attribute values
|
|
263
|
+
pub rel: Vec<String>,
|
|
264
|
+
/// Additional attributes as key-value pairs
|
|
265
|
+
pub attributes: HashMap<String, String>,
|
|
266
|
+
}
|
|
267
|
+
|
|
268
|
+
/// Link type classification.
|
|
269
|
+
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
270
|
+
#[serde(rename_all = "lowercase")]
|
|
271
|
+
pub enum LinkType {
|
|
272
|
+
/// Anchor link (#section)
|
|
273
|
+
Anchor,
|
|
274
|
+
/// Internal link (same domain)
|
|
275
|
+
Internal,
|
|
276
|
+
/// External link (different domain)
|
|
277
|
+
External,
|
|
278
|
+
/// Email link (mailto:)
|
|
279
|
+
Email,
|
|
280
|
+
/// Phone link (tel:)
|
|
281
|
+
Phone,
|
|
282
|
+
/// Other link type
|
|
283
|
+
Other,
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
/// Image element metadata.
|
|
287
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
288
|
+
pub struct ImageMetadataType {
|
|
289
|
+
/// Image source (URL, data URI, or SVG content)
|
|
290
|
+
pub src: String,
|
|
291
|
+
/// Alternative text from alt attribute
|
|
292
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
293
|
+
pub alt: Option<String>,
|
|
294
|
+
/// Title attribute
|
|
295
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
296
|
+
pub title: Option<String>,
|
|
297
|
+
/// Image dimensions as (width, height) if available
|
|
298
|
+
pub dimensions: Option<(u32, u32)>,
|
|
299
|
+
/// Image type classification
|
|
300
|
+
pub image_type: ImageType,
|
|
301
|
+
/// Additional attributes as key-value pairs
|
|
302
|
+
pub attributes: HashMap<String, String>,
|
|
303
|
+
}
|
|
304
|
+
|
|
305
|
+
/// Image type classification.
|
|
306
|
+
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
307
|
+
#[serde(rename_all = "lowercase")]
|
|
308
|
+
pub enum ImageType {
|
|
309
|
+
/// Data URI image
|
|
310
|
+
#[serde(rename = "data-uri")]
|
|
311
|
+
DataUri,
|
|
312
|
+
/// Inline SVG
|
|
313
|
+
#[serde(rename = "inline-svg")]
|
|
314
|
+
InlineSvg,
|
|
315
|
+
/// External image URL
|
|
316
|
+
External,
|
|
317
|
+
/// Relative path image
|
|
318
|
+
Relative,
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
/// Structured data (Schema.org, microdata, RDFa) block.
|
|
322
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
323
|
+
pub struct StructuredData {
|
|
324
|
+
/// Type of structured data
|
|
325
|
+
pub data_type: StructuredDataType,
|
|
326
|
+
/// Raw JSON string representation
|
|
327
|
+
pub raw_json: String,
|
|
328
|
+
/// Schema type if detectable (e.g., "Article", "Event", "Product")
|
|
329
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
330
|
+
pub schema_type: Option<String>,
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
/// Structured data type classification.
|
|
334
|
+
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
|
|
335
|
+
#[serde(rename_all = "lowercase")]
|
|
336
|
+
pub enum StructuredDataType {
|
|
337
|
+
/// JSON-LD structured data
|
|
338
|
+
#[serde(rename = "json-ld")]
|
|
339
|
+
JsonLd,
|
|
340
|
+
/// Microdata
|
|
341
|
+
Microdata,
|
|
342
|
+
/// RDFa
|
|
343
|
+
#[serde(rename = "rdfa")]
|
|
344
|
+
RDFa,
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/// HTML metadata extracted from HTML documents.
|
|
348
|
+
///
|
|
349
|
+
/// Includes document-level metadata, Open Graph data, Twitter Card metadata,
|
|
350
|
+
/// and extracted structural elements (headers, links, images, structured data).
|
|
351
|
+
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
|
|
352
|
+
pub struct HtmlMetadata {
|
|
353
|
+
/// Document title from `<title>` tag
|
|
354
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
355
|
+
pub title: Option<String>,
|
|
356
|
+
|
|
357
|
+
/// Document description from `<meta name="description">` tag
|
|
358
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
359
|
+
pub description: Option<String>,
|
|
360
|
+
|
|
361
|
+
/// Document keywords from `<meta name="keywords">` tag, split on commas
|
|
362
|
+
#[serde(default)]
|
|
363
|
+
pub keywords: Vec<String>,
|
|
364
|
+
|
|
365
|
+
/// Document author from `<meta name="author">` tag
|
|
366
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
367
|
+
pub author: Option<String>,
|
|
368
|
+
|
|
369
|
+
/// Canonical URL from `<link rel="canonical">` tag
|
|
370
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
371
|
+
pub canonical_url: Option<String>,
|
|
372
|
+
|
|
373
|
+
/// Base URL from `<base href="">` tag for resolving relative URLs
|
|
374
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
375
|
+
pub base_href: Option<String>,
|
|
376
|
+
|
|
377
|
+
/// Document language from `lang` attribute
|
|
378
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
379
|
+
pub language: Option<String>,
|
|
380
|
+
|
|
381
|
+
/// Document text direction from `dir` attribute
|
|
382
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
383
|
+
pub text_direction: Option<TextDirection>,
|
|
384
|
+
|
|
385
|
+
/// Open Graph metadata (og:* properties) for social media
|
|
386
|
+
/// Keys like "title", "description", "image", "url", etc.
|
|
387
|
+
#[serde(default)]
|
|
388
|
+
pub open_graph: BTreeMap<String, String>,
|
|
389
|
+
|
|
390
|
+
/// Twitter Card metadata (twitter:* properties)
|
|
391
|
+
/// Keys like "card", "site", "creator", "title", "description", "image", etc.
|
|
392
|
+
#[serde(default)]
|
|
393
|
+
pub twitter_card: BTreeMap<String, String>,
|
|
394
|
+
|
|
395
|
+
/// Additional meta tags not covered by specific fields
|
|
396
|
+
/// Keys are meta name/property attributes, values are content
|
|
397
|
+
#[serde(default)]
|
|
398
|
+
pub meta_tags: BTreeMap<String, String>,
|
|
399
|
+
|
|
400
|
+
/// Extracted header elements with hierarchy
|
|
401
|
+
#[serde(default)]
|
|
402
|
+
pub headers: Vec<HeaderMetadata>,
|
|
403
|
+
|
|
404
|
+
/// Extracted hyperlinks with type classification
|
|
405
|
+
#[serde(default)]
|
|
406
|
+
pub links: Vec<LinkMetadata>,
|
|
407
|
+
|
|
408
|
+
/// Extracted images with source and dimensions
|
|
409
|
+
#[serde(default)]
|
|
410
|
+
pub images: Vec<ImageMetadataType>,
|
|
411
|
+
|
|
412
|
+
/// Extracted structured data blocks
|
|
413
|
+
#[serde(default)]
|
|
414
|
+
pub structured_data: Vec<StructuredData>,
|
|
415
|
+
}
|
|
416
|
+
|
|
417
|
+
impl HtmlMetadata {
|
|
418
|
+
/// Check if metadata is empty (no meaningful content extracted).
|
|
419
|
+
pub fn is_empty(&self) -> bool {
|
|
420
|
+
self.title.is_none()
|
|
421
|
+
&& self.description.is_none()
|
|
422
|
+
&& self.keywords.is_empty()
|
|
423
|
+
&& self.author.is_none()
|
|
424
|
+
&& self.canonical_url.is_none()
|
|
425
|
+
&& self.base_href.is_none()
|
|
426
|
+
&& self.language.is_none()
|
|
427
|
+
&& self.text_direction.is_none()
|
|
428
|
+
&& self.open_graph.is_empty()
|
|
429
|
+
&& self.twitter_card.is_empty()
|
|
430
|
+
&& self.meta_tags.is_empty()
|
|
431
|
+
&& self.headers.is_empty()
|
|
432
|
+
&& self.links.is_empty()
|
|
433
|
+
&& self.images.is_empty()
|
|
434
|
+
&& self.structured_data.is_empty()
|
|
435
|
+
}
|
|
436
|
+
}
|
|
437
|
+
|
|
438
|
+
#[cfg(feature = "html")]
impl From<html_to_markdown_rs::ExtendedMetadata> for HtmlMetadata {
    /// Convert metadata produced by `html_to_markdown_rs` into this crate's types.
    ///
    /// Document-level fields are moved across unchanged. Each enum from the
    /// other crate is mapped variant-for-variant onto the local enum of the
    /// same name, and link/image attributes are re-collected into the local
    /// container type.
    fn from(metadata: html_to_markdown_rs::ExtendedMetadata) -> Self {
        // Variant-for-variant enum translations. The matches are exhaustive
        // on purpose, so a new variant in the other crate is a compile error
        // here instead of a silent fallback.
        fn text_direction(dir: html_to_markdown_rs::TextDirection) -> TextDirection {
            match dir {
                html_to_markdown_rs::TextDirection::LeftToRight => TextDirection::LeftToRight,
                html_to_markdown_rs::TextDirection::RightToLeft => TextDirection::RightToLeft,
                html_to_markdown_rs::TextDirection::Auto => TextDirection::Auto,
            }
        }

        fn link_type(kind: html_to_markdown_rs::LinkType) -> LinkType {
            match kind {
                html_to_markdown_rs::LinkType::Anchor => LinkType::Anchor,
                html_to_markdown_rs::LinkType::Internal => LinkType::Internal,
                html_to_markdown_rs::LinkType::External => LinkType::External,
                html_to_markdown_rs::LinkType::Email => LinkType::Email,
                html_to_markdown_rs::LinkType::Phone => LinkType::Phone,
                html_to_markdown_rs::LinkType::Other => LinkType::Other,
            }
        }

        fn image_type(kind: html_to_markdown_rs::ImageType) -> ImageType {
            match kind {
                html_to_markdown_rs::ImageType::DataUri => ImageType::DataUri,
                html_to_markdown_rs::ImageType::InlineSvg => ImageType::InlineSvg,
                html_to_markdown_rs::ImageType::External => ImageType::External,
                html_to_markdown_rs::ImageType::Relative => ImageType::Relative,
            }
        }

        fn structured_data_type(kind: html_to_markdown_rs::StructuredDataType) -> StructuredDataType {
            match kind {
                html_to_markdown_rs::StructuredDataType::JsonLd => StructuredDataType::JsonLd,
                html_to_markdown_rs::StructuredDataType::Microdata => StructuredDataType::Microdata,
                html_to_markdown_rs::StructuredDataType::RDFa => StructuredDataType::RDFa,
            }
        }

        let document = metadata.document;

        let headers: Vec<HeaderMetadata> = metadata
            .headers
            .into_iter()
            .map(|header| HeaderMetadata {
                level: header.level,
                text: header.text,
                id: header.id,
                depth: header.depth,
                html_offset: header.html_offset,
            })
            .collect();

        let links: Vec<LinkMetadata> = metadata
            .links
            .into_iter()
            .map(|link| LinkMetadata {
                href: link.href,
                text: link.text,
                title: link.title,
                link_type: link_type(link.link_type),
                rel: link.rel,
                attributes: link.attributes.into_iter().collect(),
            })
            .collect();

        let images: Vec<ImageMetadataType> = metadata
            .images
            .into_iter()
            .map(|image| ImageMetadataType {
                src: image.src,
                alt: image.alt,
                title: image.title,
                dimensions: image.dimensions,
                image_type: image_type(image.image_type),
                attributes: image.attributes.into_iter().collect(),
            })
            .collect();

        let structured_data: Vec<StructuredData> = metadata
            .structured_data
            .into_iter()
            .map(|block| StructuredData {
                data_type: structured_data_type(block.data_type),
                raw_json: block.raw_json,
                schema_type: block.schema_type,
            })
            .collect();

        Self {
            title: document.title,
            description: document.description,
            keywords: document.keywords,
            author: document.author,
            canonical_url: document.canonical_url,
            base_href: document.base_href,
            language: document.language,
            text_direction: document.text_direction.map(text_direction),
            open_graph: document.open_graph,
            twitter_card: document.twitter_card,
            meta_tags: document.meta_tags,
            headers,
            links,
            images,
            structured_data,
        }
    }
}
|
|
522
|
+
|
|
523
|
+
/// OCR processing metadata.
|
|
524
|
+
///
|
|
525
|
+
/// Captures information about OCR processing configuration and results.
|
|
526
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
527
|
+
pub struct OcrMetadata {
|
|
528
|
+
/// OCR language code(s) used
|
|
529
|
+
pub language: String,
|
|
530
|
+
/// Tesseract Page Segmentation Mode (PSM)
|
|
531
|
+
pub psm: i32,
|
|
532
|
+
/// Output format (e.g., "text", "hocr")
|
|
533
|
+
pub output_format: String,
|
|
534
|
+
/// Number of tables detected
|
|
535
|
+
pub table_count: usize,
|
|
536
|
+
|
|
537
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
538
|
+
pub table_rows: Option<usize>,
|
|
539
|
+
|
|
540
|
+
#[serde(skip_serializing_if = "Option::is_none")]
|
|
541
|
+
pub table_cols: Option<usize>,
|
|
542
|
+
}
|
|
543
|
+
|
|
544
|
+
/// Error metadata (for batch operations).
|
|
545
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
546
|
+
pub struct ErrorMetadata {
|
|
547
|
+
pub error_type: String,
|
|
548
|
+
pub message: String,
|
|
549
|
+
}
|
|
550
|
+
|
|
551
|
+
/// PowerPoint presentation metadata.
|
|
552
|
+
///
|
|
553
|
+
/// Extracted from PPTX files containing slide counts and presentation details.
|
|
554
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
555
|
+
pub struct PptxMetadata {
|
|
556
|
+
/// Total number of slides in the presentation
|
|
557
|
+
pub slide_count: usize,
|
|
558
|
+
/// Names of slides (if available)
|
|
559
|
+
pub slide_names: Vec<String>,
|
|
560
|
+
}
|