kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
//! Element generation and list detection utilities.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functions for detecting semantic elements in text,
|
|
4
|
+
//! including list items, and generating unique element IDs.
|
|
5
|
+
|
|
6
|
+
use crate::types::{Element, ElementId, ElementMetadata, ElementType};
|
|
7
|
+
use std::collections::HashMap;
|
|
8
|
+
|
|
9
|
+
use super::types::{ListItemMetadata, ListType};
|
|
10
|
+
|
|
11
|
+
/// Detect list items in text with support for multiple formats.
|
|
12
|
+
///
|
|
13
|
+
/// Identifies bullet points, numbered items, and indented items.
|
|
14
|
+
/// Supports formats like:
|
|
15
|
+
/// - `- bullet item`
|
|
16
|
+
/// - `* bullet item`
|
|
17
|
+
/// - `• bullet item`
|
|
18
|
+
/// - `1. numbered item`
|
|
19
|
+
/// - `a. lettered item`
|
|
20
|
+
/// - Indented items with leading whitespace
|
|
21
|
+
///
|
|
22
|
+
/// # Arguments
|
|
23
|
+
///
|
|
24
|
+
/// * `text` - The text to search for list items
|
|
25
|
+
///
|
|
26
|
+
/// # Returns
|
|
27
|
+
///
|
|
28
|
+
/// A vector of ListItemMetadata structs describing detected list items
|
|
29
|
+
pub fn detect_list_items(text: &str) -> Vec<ListItemMetadata> {
|
|
30
|
+
let mut items = Vec::new();
|
|
31
|
+
let lines: Vec<&str> = text.lines().collect();
|
|
32
|
+
|
|
33
|
+
let mut current_byte_offset = 0;
|
|
34
|
+
|
|
35
|
+
for line in lines {
|
|
36
|
+
let line_start_offset = current_byte_offset;
|
|
37
|
+
let trimmed = line.trim_start();
|
|
38
|
+
let indent_level = (line.len() - trimmed.len()) / 2; // Estimate indent level
|
|
39
|
+
|
|
40
|
+
// Check for bullet points
|
|
41
|
+
if let Some(stripped) = trimmed.strip_prefix('-')
|
|
42
|
+
&& (stripped.starts_with(' ') || stripped.is_empty())
|
|
43
|
+
{
|
|
44
|
+
let byte_end = line_start_offset + line.len();
|
|
45
|
+
items.push(ListItemMetadata {
|
|
46
|
+
list_type: ListType::Bullet,
|
|
47
|
+
byte_start: line_start_offset,
|
|
48
|
+
byte_end,
|
|
49
|
+
indent_level: indent_level as u32,
|
|
50
|
+
});
|
|
51
|
+
current_byte_offset = byte_end + 1; // +1 for newline
|
|
52
|
+
continue;
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
if let Some(stripped) = trimmed.strip_prefix('*')
|
|
56
|
+
&& (stripped.starts_with(' ') || stripped.is_empty())
|
|
57
|
+
{
|
|
58
|
+
let byte_end = line_start_offset + line.len();
|
|
59
|
+
items.push(ListItemMetadata {
|
|
60
|
+
list_type: ListType::Bullet,
|
|
61
|
+
byte_start: line_start_offset,
|
|
62
|
+
byte_end,
|
|
63
|
+
indent_level: indent_level as u32,
|
|
64
|
+
});
|
|
65
|
+
current_byte_offset = byte_end + 1;
|
|
66
|
+
continue;
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if let Some(stripped) = trimmed.strip_prefix('•')
|
|
70
|
+
&& (stripped.starts_with(' ') || stripped.is_empty())
|
|
71
|
+
{
|
|
72
|
+
let byte_end = line_start_offset + line.len();
|
|
73
|
+
items.push(ListItemMetadata {
|
|
74
|
+
list_type: ListType::Bullet,
|
|
75
|
+
byte_start: line_start_offset,
|
|
76
|
+
byte_end,
|
|
77
|
+
indent_level: indent_level as u32,
|
|
78
|
+
});
|
|
79
|
+
current_byte_offset = byte_end + 1;
|
|
80
|
+
continue;
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
// Check for numbered lists (e.g., "1.", "2.", etc.)
|
|
84
|
+
if let Some(pos) = trimmed.find('.') {
|
|
85
|
+
let prefix = &trimmed[..pos];
|
|
86
|
+
if prefix.chars().all(|c| c.is_ascii_digit())
|
|
87
|
+
&& pos > 0
|
|
88
|
+
&& pos < 3
|
|
89
|
+
&& trimmed.len() > pos + 1
|
|
90
|
+
&& trimmed[pos + 1..].starts_with(' ')
|
|
91
|
+
{
|
|
92
|
+
let byte_end = line_start_offset + line.len();
|
|
93
|
+
items.push(ListItemMetadata {
|
|
94
|
+
list_type: ListType::Numbered,
|
|
95
|
+
byte_start: line_start_offset,
|
|
96
|
+
byte_end,
|
|
97
|
+
indent_level: indent_level as u32,
|
|
98
|
+
});
|
|
99
|
+
current_byte_offset = byte_end + 1;
|
|
100
|
+
continue;
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
// Check for lettered lists (e.g., "a.", "b.", "A.", "B.")
|
|
105
|
+
if let Some(pos) = trimmed.find('.') {
|
|
106
|
+
let prefix = &trimmed[..pos];
|
|
107
|
+
if prefix.len() == 1
|
|
108
|
+
&& prefix.chars().all(|c| c.is_alphabetic())
|
|
109
|
+
&& pos > 0
|
|
110
|
+
&& trimmed.len() > pos + 1
|
|
111
|
+
&& trimmed[pos + 1..].starts_with(' ')
|
|
112
|
+
{
|
|
113
|
+
let byte_end = line_start_offset + line.len();
|
|
114
|
+
items.push(ListItemMetadata {
|
|
115
|
+
list_type: ListType::Lettered,
|
|
116
|
+
byte_start: line_start_offset,
|
|
117
|
+
byte_end,
|
|
118
|
+
indent_level: indent_level as u32,
|
|
119
|
+
});
|
|
120
|
+
current_byte_offset = byte_end + 1;
|
|
121
|
+
continue;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
// Check for indented items (more than 4 spaces)
|
|
126
|
+
if indent_level >= 2 && !trimmed.is_empty() {
|
|
127
|
+
let byte_end = line_start_offset + line.len();
|
|
128
|
+
items.push(ListItemMetadata {
|
|
129
|
+
list_type: ListType::Indented,
|
|
130
|
+
byte_start: line_start_offset,
|
|
131
|
+
byte_end,
|
|
132
|
+
indent_level: indent_level as u32,
|
|
133
|
+
});
|
|
134
|
+
current_byte_offset = byte_end + 1;
|
|
135
|
+
continue;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
current_byte_offset = line_start_offset + line.len() + 1; // +1 for newline
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
items
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
/// Generate a unique element ID for semantic content.
|
|
145
|
+
///
|
|
146
|
+
/// Creates a deterministic hash-based ID from the element type, text content,
|
|
147
|
+
/// and page number. Uses a simple wrapping multiplication algorithm for
|
|
148
|
+
/// consistent ID generation without external dependencies.
|
|
149
|
+
///
|
|
150
|
+
/// # Arguments
|
|
151
|
+
///
|
|
152
|
+
/// * `text` - The element text content
|
|
153
|
+
/// * `element_type` - The semantic element type
|
|
154
|
+
/// * `page_number` - Optional page number for multi-page documents
|
|
155
|
+
///
|
|
156
|
+
/// # Returns
|
|
157
|
+
///
|
|
158
|
+
/// An ElementId suitable for referencing this semantic element
|
|
159
|
+
pub fn generate_element_id(text: &str, element_type: ElementType, page_number: Option<usize>) -> ElementId {
|
|
160
|
+
// Simple deterministic hash using wrapping multiplication
|
|
161
|
+
let type_hash = format!("{:?}", element_type)
|
|
162
|
+
.bytes()
|
|
163
|
+
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
|
|
164
|
+
|
|
165
|
+
let text_hash = text
|
|
166
|
+
.bytes()
|
|
167
|
+
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
|
|
168
|
+
|
|
169
|
+
let page_hash = page_number
|
|
170
|
+
.unwrap_or(1)
|
|
171
|
+
.to_string()
|
|
172
|
+
.bytes()
|
|
173
|
+
.fold(0u64, |acc, b| acc.wrapping_mul(31).wrapping_add(b as u64));
|
|
174
|
+
|
|
175
|
+
let combined = type_hash
|
|
176
|
+
.wrapping_mul(65599)
|
|
177
|
+
.wrapping_add(text_hash)
|
|
178
|
+
.wrapping_mul(65599)
|
|
179
|
+
.wrapping_add(page_hash);
|
|
180
|
+
|
|
181
|
+
ElementId::new(format!("elem-{:x}", combined)).expect("ElementId creation failed")
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
/// Add paragraphs as NarrativeText elements, splitting on double newlines.
|
|
185
|
+
pub(super) fn add_paragraphs(elements: &mut Vec<Element>, text: &str, page_number: usize, title: &Option<String>) {
|
|
186
|
+
if text.is_empty() {
|
|
187
|
+
return;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// Split on double newlines to detect paragraph boundaries
|
|
191
|
+
for paragraph in text.split("\n\n").filter(|p| !p.trim().is_empty()) {
|
|
192
|
+
let para_text = paragraph.trim();
|
|
193
|
+
if para_text.is_empty() {
|
|
194
|
+
continue;
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
let element_id = generate_element_id(para_text, ElementType::NarrativeText, Some(page_number));
|
|
198
|
+
elements.push(Element {
|
|
199
|
+
element_id,
|
|
200
|
+
element_type: ElementType::NarrativeText,
|
|
201
|
+
text: para_text.to_string(),
|
|
202
|
+
metadata: ElementMetadata {
|
|
203
|
+
page_number: Some(page_number),
|
|
204
|
+
filename: title.clone(),
|
|
205
|
+
coordinates: None,
|
|
206
|
+
element_index: Some(elements.len()),
|
|
207
|
+
additional: HashMap::new(),
|
|
208
|
+
},
|
|
209
|
+
});
|
|
210
|
+
}
|
|
211
|
+
}
|
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
//! Transformation utilities for converting extraction results into semantic elements.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides post-processing functions to transform raw extraction results
|
|
4
|
+
//! into element-based output format, suitable for downstream processing and analysis.
|
|
5
|
+
//! Key functionality includes:
|
|
6
|
+
//!
|
|
7
|
+
//! - Semantic element generation from text content
|
|
8
|
+
//! - List item detection with support for multiple formats
|
|
9
|
+
//! - PageBreak interleaving with reverse byte-order processing
|
|
10
|
+
//! - Safe bounds checking for text ranges
|
|
11
|
+
|
|
12
|
+
mod content;
|
|
13
|
+
mod elements;
|
|
14
|
+
mod types;
|
|
15
|
+
|
|
16
|
+
// Re-export public API
|
|
17
|
+
pub use elements::{detect_list_items, generate_element_id};
|
|
18
|
+
pub use types::{ListItemMetadata, ListType};
|
|
19
|
+
|
|
20
|
+
use crate::types::{Element, ExtractionResult};
|
|
21
|
+
use content::{
|
|
22
|
+
add_page_break, format_table_as_text, process_content, process_hierarchy, process_images, process_tables,
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
/// Transform an extraction result into semantic elements.
|
|
26
|
+
///
|
|
27
|
+
/// This function takes a reference to an ExtractionResult and generates
|
|
28
|
+
/// a vector of Element structs representing semantic blocks in the document.
|
|
29
|
+
/// It detects content sections, list items, page breaks, and other structural
|
|
30
|
+
/// elements to create an Unstructured-compatible element-based output.
|
|
31
|
+
///
|
|
32
|
+
/// Handles:
|
|
33
|
+
/// - PDF hierarchy → Title/Heading elements
|
|
34
|
+
/// - Multi-page documents with correct page numbers
|
|
35
|
+
/// - Table and Image extraction
|
|
36
|
+
/// - PageBreak interleaving
|
|
37
|
+
/// - Bounding box coordinates
|
|
38
|
+
/// - Paragraph detection for NarrativeText
|
|
39
|
+
///
|
|
40
|
+
/// # Arguments
|
|
41
|
+
///
|
|
42
|
+
/// * `result` - Reference to the ExtractionResult to transform
|
|
43
|
+
///
|
|
44
|
+
/// # Returns
|
|
45
|
+
///
|
|
46
|
+
/// A vector of Elements with proper semantic types and metadata.
|
|
47
|
+
pub fn transform_extraction_result_to_elements(result: &ExtractionResult) -> Vec<Element> {
|
|
48
|
+
let mut elements = Vec::new();
|
|
49
|
+
|
|
50
|
+
// If pages are available, process per-page with hierarchy, tables, images
|
|
51
|
+
if let Some(ref pages) = result.pages {
|
|
52
|
+
for page in pages {
|
|
53
|
+
let page_number = page.page_number;
|
|
54
|
+
|
|
55
|
+
// 1. Process hierarchy blocks (PDF headings)
|
|
56
|
+
if let Some(ref hierarchy) = page.hierarchy {
|
|
57
|
+
process_hierarchy(&mut elements, hierarchy, page_number, &result.metadata.title);
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
// 2. Process tables on this page
|
|
61
|
+
process_tables(&mut elements, &page.tables, page_number, &result.metadata.title);
|
|
62
|
+
|
|
63
|
+
// 3. Process images on this page
|
|
64
|
+
process_images(&mut elements, &page.images, page_number, &result.metadata.title);
|
|
65
|
+
|
|
66
|
+
// 4. Process page content (body text, list items, paragraphs)
|
|
67
|
+
process_content(&mut elements, &page.content, page_number, &result.metadata.title);
|
|
68
|
+
|
|
69
|
+
// 5. Add PageBreak after each page (except the last)
|
|
70
|
+
if page_number < pages.len() {
|
|
71
|
+
add_page_break(&mut elements, page_number, page_number + 1, &result.metadata.title);
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
} else {
|
|
75
|
+
// Fallback: No pages, process unified content with page 1
|
|
76
|
+
process_content(&mut elements, &result.content, 1, &result.metadata.title);
|
|
77
|
+
|
|
78
|
+
// Process global tables (if any)
|
|
79
|
+
for table in &result.tables {
|
|
80
|
+
let table_text = format_table_as_text(table);
|
|
81
|
+
let element_id = elements::generate_element_id(&table_text, crate::types::ElementType::Table, Some(1));
|
|
82
|
+
elements.push(Element {
|
|
83
|
+
element_id,
|
|
84
|
+
element_type: crate::types::ElementType::Table,
|
|
85
|
+
text: table_text,
|
|
86
|
+
metadata: crate::types::ElementMetadata {
|
|
87
|
+
page_number: Some(1),
|
|
88
|
+
filename: result.metadata.title.clone(),
|
|
89
|
+
coordinates: None,
|
|
90
|
+
element_index: Some(elements.len()),
|
|
91
|
+
additional: std::collections::HashMap::new(),
|
|
92
|
+
},
|
|
93
|
+
});
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// Process global images (if any)
|
|
97
|
+
if let Some(ref images) = result.images {
|
|
98
|
+
for image in images {
|
|
99
|
+
let image_text = format!(
|
|
100
|
+
"Image: {} ({}x{})",
|
|
101
|
+
image.format,
|
|
102
|
+
image.width.unwrap_or(0),
|
|
103
|
+
image.height.unwrap_or(0)
|
|
104
|
+
);
|
|
105
|
+
let page_num = image.page_number.unwrap_or(1);
|
|
106
|
+
|
|
107
|
+
let element_id =
|
|
108
|
+
elements::generate_element_id(&image_text, crate::types::ElementType::Image, Some(page_num));
|
|
109
|
+
elements.push(Element {
|
|
110
|
+
element_id,
|
|
111
|
+
element_type: crate::types::ElementType::Image,
|
|
112
|
+
text: image_text,
|
|
113
|
+
metadata: crate::types::ElementMetadata {
|
|
114
|
+
page_number: Some(page_num),
|
|
115
|
+
filename: result.metadata.title.clone(),
|
|
116
|
+
coordinates: None,
|
|
117
|
+
element_index: Some(elements.len()),
|
|
118
|
+
additional: {
|
|
119
|
+
let mut m = std::collections::HashMap::new();
|
|
120
|
+
m.insert("format".to_string(), image.format.clone());
|
|
121
|
+
if let Some(width) = image.width {
|
|
122
|
+
m.insert("width".to_string(), width.to_string());
|
|
123
|
+
}
|
|
124
|
+
if let Some(height) = image.height {
|
|
125
|
+
m.insert("height".to_string(), height.to_string());
|
|
126
|
+
}
|
|
127
|
+
m
|
|
128
|
+
},
|
|
129
|
+
},
|
|
130
|
+
});
|
|
131
|
+
}
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
elements
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
#[cfg(test)]
|
|
139
|
+
mod tests {
|
|
140
|
+
use super::*;
|
|
141
|
+
|
|
142
|
+
// Three dash-prefixed lines must each be reported as a bullet list item.
#[test]
fn test_detect_bullet_items() {
    let detected = detect_list_items("- First item\n- Second item\n- Third item");

    assert_eq!(detected.len(), 3);
    // The length check above guarantees this loop visits exactly indices 0..=2.
    for entry in detected.iter() {
        assert_eq!(entry.list_type, ListType::Bullet);
    }
}
|
|
151
|
+
|
|
152
|
+
// Lines starting with `1.`, `2.`, `3.` must all be classified as numbered items.
#[test]
fn test_detect_numbered_items() {
    let detected = detect_list_items("1. First\n2. Second\n3. Third");

    assert_eq!(detected.len(), 3);
    for entry in detected.iter() {
        assert!(entry.list_type == ListType::Numbered);
    }
}
|
|
159
|
+
|
|
160
|
+
// Lines starting with `a.`, `b.`, `c.` must all be classified as lettered items.
#[test]
fn test_detect_lettered_items() {
    let detected = detect_list_items("a. First\nb. Second\nc. Third");

    assert_eq!(detected.len(), 3);
    // Every detected item is lettered exactly when the lettered count equals the total.
    let lettered = detected
        .iter()
        .filter(|entry| entry.list_type == ListType::Lettered)
        .count();
    assert!(lettered == detected.len());
}
|
|
167
|
+
|
|
168
|
+
// Plain text surrounding a bullet line and a numbered line: exactly those two
// lines are expected back, in document order.
#[test]
fn test_detect_mixed_items() {
    let detected = detect_list_items("Some text\n- Bullet\n1. Numbered\nMore text");

    assert_eq!(detected.len(), 2);
    let expected_kinds = [ListType::Bullet, ListType::Numbered];
    for (entry, expected) in detected.iter().zip(expected_kinds) {
        assert_eq!(entry.list_type, expected);
    }
}
|
|
176
|
+
|
|
177
|
+
// Element ids must be stable for identical input and differ when the text differs.
#[test]
fn test_element_id_generation() {
    use crate::types::ElementType;

    // All ids in this test share the same element type and page; only the text varies.
    let title_id_for = |text: &str| generate_element_id(text, ElementType::Title, Some(1));

    let first = title_id_for("test");
    let repeat = title_id_for("test");
    assert_eq!(first.as_ref(), repeat.as_ref());

    let other = title_id_for("different");
    assert_ne!(first.as_ref(), other.as_ref());
}
|
|
187
|
+
|
|
188
|
+
// Sorting (byte offset, label) pairs by descending offset must yield 100, 75, 50.
// NOTE(review): this only exercises the standard-library sort on sample data; it
// does not call the production page-break code — presumably it mirrors the
// ordering used there, confirm against the transformer.
#[test]
fn test_page_break_interleaving_reverse_order() {
    let mut breaks = vec![(100, "page_break_1"), (50, "page_break_2"), (75, "page_break_3")];

    // Highest offset first.
    breaks.sort_by(|lhs, rhs| rhs.0.cmp(&lhs.0));

    let offsets: Vec<i32> = breaks.iter().map(|&(offset, _)| offset).collect();
    assert_eq!(offsets, [100, 75, 50]);
}
|
|
202
|
+
|
|
203
|
+
// Compares the byte range stored in a `ListItemMetadata` against the length of a
// sample string: one item that fits inside it and one whose end overshoots it.
#[test]
fn test_bounds_checking() {
    let limit = "Hello world".len();

    // Both items are bullets starting at byte 0 with no indentation; only the end varies.
    let bullet_ending_at = |byte_end: usize| ListItemMetadata {
        list_type: ListType::Bullet,
        byte_start: 0,
        byte_end,
        indent_level: 0,
    };

    // Range 0..5 lies inside the 11-byte sample.
    let in_range = bullet_ending_at(5);
    assert!(in_range.byte_start <= limit);
    assert!(in_range.byte_end <= limit);
    assert!(in_range.byte_start <= in_range.byte_end);

    // An end of 100 is past the end of the sample.
    let overlong = bullet_ending_at(100);
    assert!(overlong.byte_end > limit);
}
|
|
227
|
+
|
|
228
|
+
// A bullet preceded by leading whitespace must be detected with an indent level of at least 1.
#[test]
fn test_indent_level_detection() {
    let detected = detect_list_items(" - Indented item");

    assert_eq!(detected.len(), 1);
    let only_item = &detected[0];
    assert!(only_item.indent_level >= 1);
}
|
|
235
|
+
|
|
236
|
+
// Helper to create minimal Metadata for tests
|
|
237
|
+
fn test_metadata(title: Option<String>) -> crate::types::Metadata {
|
|
238
|
+
crate::types::Metadata {
|
|
239
|
+
title,
|
|
240
|
+
subject: None,
|
|
241
|
+
authors: None,
|
|
242
|
+
keywords: None,
|
|
243
|
+
language: None,
|
|
244
|
+
created_at: None,
|
|
245
|
+
modified_at: None,
|
|
246
|
+
created_by: None,
|
|
247
|
+
modified_by: None,
|
|
248
|
+
pages: None,
|
|
249
|
+
format: None,
|
|
250
|
+
image_preprocessing: None,
|
|
251
|
+
json_schema: None,
|
|
252
|
+
error: None,
|
|
253
|
+
additional: Default::default(),
|
|
254
|
+
}
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
// Integration tests for full transformation
|
|
258
|
+
// Two-page document: page 1 carries a hierarchy with two heading blocks, page 2
// carries a two-item bullet list. The transform is expected to emit two Title
// elements (with coordinates) on page 1, two ListItem elements on page 2, and a
// single PageBreak between the pages.
#[test]
fn test_transform_with_pages_and_hierarchy() {
    use crate::types::{ElementType, ExtractionResult, HierarchicalBlock, PageContent, PageHierarchy};

    // Page 1: two paragraphs plus a hierarchy describing an h1 and an h2 block.
    let heading_blocks = vec![
        HierarchicalBlock {
            text: String::from("Main Title"),
            font_size: 24.0,
            level: String::from("h1"),
            bbox: Some((10.0, 20.0, 100.0, 50.0)),
        },
        HierarchicalBlock {
            text: String::from("Subtitle"),
            font_size: 16.0,
            level: String::from("h2"),
            bbox: Some((10.0, 60.0, 100.0, 80.0)),
        },
    ];
    let titled_page = PageContent {
        page_number: 1,
        content: String::from("This is a test paragraph.\n\nAnother paragraph here."),
        tables: Vec::new(),
        images: Vec::new(),
        hierarchy: Some(PageHierarchy {
            block_count: 2,
            blocks: heading_blocks,
        }),
    };

    // Page 2: a bullet list and no hierarchy information.
    let list_page = PageContent {
        page_number: 2,
        content: String::from("- List item 1\n- List item 2"),
        tables: Vec::new(),
        images: Vec::new(),
        hierarchy: None,
    };

    let result = ExtractionResult {
        content: String::from("Full document content"),
        mime_type: String::from("application/pdf"),
        metadata: test_metadata(Some(String::from("Test Document"))),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: Some(vec![titled_page, list_page]),
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&result);
    assert!(!elements.is_empty());

    // Collects references to every produced element of the requested type, in output order.
    let of_kind = |kind: ElementType| {
        elements
            .iter()
            .filter(|e| e.element_type == kind)
            .collect::<Vec<_>>()
    };

    // Titles come from the hierarchy blocks of page 1.
    let titles = of_kind(ElementType::Title);
    assert_eq!(titles.len(), 2, "Should have 2 title elements from hierarchy");
    let title_texts: Vec<&str> = titles.iter().map(|t| t.text.as_str()).collect();
    assert_eq!(title_texts, ["Main Title", "Subtitle"]);
    assert!(titles.iter().all(|t| t.metadata.page_number == Some(1)));
    assert!(titles.iter().all(|t| t.metadata.coordinates.is_some()));

    // List items come from the content of page 2.
    let list_items = of_kind(ElementType::ListItem);
    assert_eq!(list_items.len(), 2, "Should have 2 list items");
    assert!(list_items.iter().all(|item| item.metadata.page_number == Some(2)));

    // Exactly one break separates the two pages.
    let page_breaks = of_kind(ElementType::PageBreak);
    assert_eq!(page_breaks.len(), 1, "Should have 1 page break between pages");
}
|
|
345
|
+
|
|
346
|
+
// Single page holding one table and one image: the transform is expected to emit
// one Table element whose text contains the cell values and one Image element
// whose text mentions the format and dimensions, tagged with page 1.
#[test]
fn test_transform_with_tables_and_images() {
    use crate::types::{ElementType, ExtractedImage, ExtractionResult, PageContent, Table};
    use std::sync::Arc;

    // 2x2 table: a header row followed by one data row.
    let header_row = vec![String::from("Header1"), String::from("Header2")];
    let data_row = vec![String::from("Cell1"), String::from("Cell2")];
    let page_table = Arc::new(Table {
        cells: vec![header_row, data_row],
        markdown: String::from("| Header1 | Header2 |\n| Cell1 | Cell2 |"),
        page_number: 1,
    });

    // 640x480 JPEG attached to page 1; the payload bytes are arbitrary.
    let page_image = Arc::new(ExtractedImage {
        data: vec![1, 2, 3, 4],
        format: String::from("jpeg"),
        image_index: 0,
        page_number: Some(1),
        width: Some(640),
        height: Some(480),
        colorspace: Some(String::from("RGB")),
        bits_per_component: Some(8),
        is_mask: false,
        description: None,
        ocr_result: None,
    });

    let only_page = PageContent {
        page_number: 1,
        content: String::from("Some text"),
        tables: vec![page_table],
        images: vec![page_image],
        hierarchy: None,
    };

    // The table and image live on the page only; the document-level lists stay empty.
    let result = ExtractionResult {
        content: String::from("Test content"),
        mime_type: String::from("application/pdf"),
        metadata: test_metadata(Some(String::from("Test"))),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: Some(vec![only_page]),
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&result);

    // Table element: exactly one, and its text carries the cell contents.
    let table_elements: Vec<_> = elements
        .iter()
        .filter(|e| e.element_type == ElementType::Table)
        .collect();
    assert_eq!(table_elements.len(), 1, "Should have 1 table element");
    for needle in ["Header1", "Cell2"] {
        assert!(table_elements[0].text.contains(needle));
    }

    // Image element: exactly one, describing format and size, on page 1.
    let image_elements: Vec<_> = elements
        .iter()
        .filter(|e| e.element_type == ElementType::Image)
        .collect();
    assert_eq!(image_elements.len(), 1, "Should have 1 image element");
    for needle in ["jpeg", "640", "480"] {
        assert!(image_elements[0].text.contains(needle));
    }
    assert_eq!(image_elements[0].metadata.page_number, Some(1));
}
|
|
416
|
+
|
|
417
|
+
// Without per-page data the transform falls back to the unified content: at least
// one NarrativeText element must be produced and every element must report page 1.
#[test]
fn test_transform_fallback_no_pages() {
    use crate::types::{ElementType, ExtractionResult};

    // `pages: None` is what triggers the fallback path under test.
    let pageless = ExtractionResult {
        content: String::from("Simple text content\n\nSecond paragraph"),
        mime_type: String::from("text/plain"),
        metadata: test_metadata(Some(String::from("Simple Doc"))),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&pageless);

    let has_narrative = elements
        .iter()
        .any(|e| e.element_type == ElementType::NarrativeText);
    assert!(has_narrative, "Should have narrative text elements");

    // Fallback elements are all attributed to page 1.
    assert!(elements.iter().all(|e| e.metadata.page_number == Some(1)));
}
|
|
449
|
+
|
|
450
|
+
// Content with blank-line separators must come back as one NarrativeText element
// per paragraph, in order and without the separators.
#[test]
fn test_paragraph_splitting() {
    use crate::types::{ElementType, ExtractionResult};

    let three_paragraphs = ExtractionResult {
        content: String::from("First paragraph.\n\nSecond paragraph.\n\nThird paragraph."),
        mime_type: String::from("text/plain"),
        metadata: test_metadata(None),
        tables: Vec::new(),
        detected_languages: None,
        chunks: None,
        images: None,
        djot_content: None,
        pages: None,
        elements: None,
    };

    let elements = transform_extraction_result_to_elements(&three_paragraphs);

    // Keep only the text of the narrative elements, preserving output order.
    let paragraph_texts: Vec<&str> = elements
        .iter()
        .filter(|e| e.element_type == ElementType::NarrativeText)
        .map(|e| e.text.as_str())
        .collect();

    assert_eq!(paragraph_texts.len(), 3, "Should split into 3 paragraphs");
    assert_eq!(
        paragraph_texts,
        ["First paragraph.", "Second paragraph.", "Third paragraph."]
    );
}
|
|
480
|
+
}
|