kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
//! Stack management for HTML extraction with support for large documents.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles the specialized concern of managing stack size for HTML conversion,
|
|
4
|
+
//! particularly for large HTML documents that may require more stack space than the default.
|
|
5
|
+
//! On WASM, stack size is limited and cannot be increased, so size limits are enforced.
|
|
6
|
+
//! On native platforms, dedicated threads with larger stacks are used for large HTML.
|
|
7
|
+
|
|
8
|
+
use crate::error::{KreuzbergError, Result};
|
|
9
|
+
|
|
10
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
11
|
+
use std::{any::Any, thread};
|
|
12
|
+
|
|
13
|
+
/// Largest HTML input, in bytes, accepted on WASM targets (2 MiB).
#[cfg(target_arch = "wasm32")]
pub const MAX_HTML_SIZE_BYTES: usize = 2 << 20;

/// Input size, in bytes, from which native builds treat HTML as "large" (512 KiB).
#[cfg(not(target_arch = "wasm32"))]
pub const LARGE_HTML_STACK_THRESHOLD_BYTES: usize = 512 << 10;

/// Stack size, in bytes, given to the dedicated HTML conversion thread (16 MiB).
#[cfg(not(target_arch = "wasm32"))]
pub const HTML_CONVERSION_STACK_SIZE_BYTES: usize = 16 << 20;
|
|
21
|
+
|
|
22
|
+
/// Check if HTML size exceeds WASM limit and return error if so.
|
|
23
|
+
///
|
|
24
|
+
/// WASM builds have a fixed stack size that cannot be increased, so we enforce
|
|
25
|
+
/// a 2MB limit to prevent stack overflow during HTML conversion.
|
|
26
|
+
#[cfg(target_arch = "wasm32")]
|
|
27
|
+
pub fn check_wasm_size_limit(html: &str) -> Result<()> {
|
|
28
|
+
if html.len() > MAX_HTML_SIZE_BYTES {
|
|
29
|
+
return Err(KreuzbergError::validation(format!(
|
|
30
|
+
"HTML file size ({} bytes) exceeds WASM limit of {} bytes (2MB). \
|
|
31
|
+
Large HTML files cannot be processed in WASM due to stack constraints. \
|
|
32
|
+
Consider using the native library for files of this size.",
|
|
33
|
+
html.len(),
|
|
34
|
+
MAX_HTML_SIZE_BYTES
|
|
35
|
+
)));
|
|
36
|
+
}
|
|
37
|
+
Ok(())
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/// Check if HTML size exceeds WASM limit and return error if so.
|
|
41
|
+
///
|
|
42
|
+
/// No-op on non-WASM platforms.
|
|
43
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
44
|
+
pub fn check_wasm_size_limit(_html: &str) -> Result<()> {
|
|
45
|
+
Ok(())
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
/// Determine if HTML requires a dedicated stack due to size.
|
|
49
|
+
///
|
|
50
|
+
/// On native platforms, HTML larger than the threshold will be processed
|
|
51
|
+
/// on a dedicated thread with a larger stack to prevent overflow.
|
|
52
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
53
|
+
pub fn html_requires_large_stack(len: usize) -> bool {
|
|
54
|
+
len >= LARGE_HTML_STACK_THRESHOLD_BYTES
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/// Run a job on a dedicated thread with a large stack.
|
|
58
|
+
///
|
|
59
|
+
/// This is useful for HTML conversion of large documents that might
|
|
60
|
+
/// overflow the default thread stack on native platforms.
|
|
61
|
+
///
|
|
62
|
+
/// # Arguments
|
|
63
|
+
///
|
|
64
|
+
/// * `job` - The closure to execute on the dedicated thread
|
|
65
|
+
///
|
|
66
|
+
/// # Returns
|
|
67
|
+
///
|
|
68
|
+
/// The result of the job execution, or an error if the thread panics
|
|
69
|
+
#[cfg(not(target_arch = "wasm32"))]
|
|
70
|
+
pub fn run_on_dedicated_stack<T, F>(job: F) -> Result<T>
|
|
71
|
+
where
|
|
72
|
+
T: Send + 'static,
|
|
73
|
+
F: FnOnce() -> Result<T> + Send + 'static,
|
|
74
|
+
{
|
|
75
|
+
let handle = thread::Builder::new()
|
|
76
|
+
.name("kreuzberg-html-conversion".to_string())
|
|
77
|
+
.stack_size(HTML_CONVERSION_STACK_SIZE_BYTES)
|
|
78
|
+
.spawn(job)
|
|
79
|
+
.map_err(|err| KreuzbergError::Other(format!("Failed to spawn HTML conversion thread: {}", err)))?;
|
|
80
|
+
|
|
81
|
+
match handle.join() {
|
|
82
|
+
Ok(result) => result,
|
|
83
|
+
Err(panic) => {
|
|
84
|
+
let reason = extract_panic_reason(&panic);
|
|
85
|
+
Err(KreuzbergError::Other(format!("HTML conversion panicked: {}", reason)))
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
/// Turn a panic payload into a printable message.
///
/// Payloads holding a `&str` or a `String` are returned as text; any other
/// payload type yields the fixed message `"unknown panic"`.
#[cfg(not(target_arch = "wasm32"))]
fn extract_panic_reason(panic: &Box<dyn Any + Send + 'static>) -> String {
    panic
        .downcast_ref::<&str>()
        .map(|text| (*text).to_string())
        .or_else(|| panic.downcast_ref::<String>().cloned())
        .unwrap_or_else(|| "unknown panic".to_string())
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
//! Type definitions for HTML extraction.
|
|
2
|
+
|
|
3
|
+
use serde::{Deserialize, Serialize};
|
|
4
|
+
use std::collections::HashMap;
|
|
5
|
+
|
|
6
|
+
pub use html_to_markdown_rs::{
|
|
7
|
+
CodeBlockStyle, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingOptions,
|
|
8
|
+
PreprocessingPreset, WhitespaceMode,
|
|
9
|
+
};
|
|
10
|
+
|
|
11
|
+
/// Result of HTML extraction with optional images and warnings.
|
|
12
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
13
|
+
pub struct HtmlExtractionResult {
|
|
14
|
+
pub markdown: String,
|
|
15
|
+
pub images: Vec<ExtractedInlineImage>,
|
|
16
|
+
pub warnings: Vec<String>,
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
/// Extracted inline image with metadata.
|
|
20
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
21
|
+
pub struct ExtractedInlineImage {
|
|
22
|
+
pub data: Vec<u8>,
|
|
23
|
+
pub format: String,
|
|
24
|
+
pub filename: Option<String>,
|
|
25
|
+
pub description: Option<String>,
|
|
26
|
+
pub dimensions: Option<(u32, u32)>,
|
|
27
|
+
pub attributes: HashMap<String, String>,
|
|
28
|
+
}
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
pub mod structured;
|
|
2
2
|
pub mod text;
|
|
3
|
+
pub mod transform;
|
|
3
4
|
|
|
4
5
|
#[cfg(feature = "ocr")]
|
|
5
6
|
pub mod image;
|
|
@@ -41,11 +42,14 @@ pub mod table;
|
|
|
41
42
|
#[cfg(feature = "xml")]
|
|
42
43
|
pub mod xml;
|
|
43
44
|
|
|
44
|
-
#[cfg(any(feature = "office", feature = "html"))]
|
|
45
|
+
#[cfg(any(feature = "office", feature = "html", feature = "xml"))]
|
|
45
46
|
pub mod markdown;
|
|
46
47
|
|
|
47
48
|
pub use structured::{JsonExtractionConfig, StructuredDataResult, parse_json, parse_toml, parse_yaml};
|
|
48
49
|
pub use text::parse_text;
|
|
50
|
+
pub use transform::{
|
|
51
|
+
ListItemMetadata, ListType, detect_list_items, generate_element_id, transform_extraction_result_to_elements,
|
|
52
|
+
};
|
|
49
53
|
|
|
50
54
|
#[cfg(feature = "ocr")]
|
|
51
55
|
pub use image::{ImageMetadata, extract_image_metadata};
|
|
@@ -84,7 +88,7 @@ pub use table::table_from_arrow_to_markdown;
|
|
|
84
88
|
#[cfg(feature = "xml")]
|
|
85
89
|
pub use xml::parse_xml;
|
|
86
90
|
|
|
87
|
-
#[cfg(any(feature = "office", feature = "html"))]
|
|
91
|
+
#[cfg(any(feature = "office", feature = "html", feature = "xml"))]
|
|
88
92
|
pub use markdown::cells_to_markdown;
|
|
89
93
|
|
|
90
94
|
pub use capacity::{
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
//! PPTX container and ZIP archive management.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles opening PPTX files, reading files from the ZIP archive,
|
|
4
|
+
//! finding slide paths, and iterating through slides.
|
|
5
|
+
|
|
6
|
+
use std::collections::HashMap;
|
|
7
|
+
use std::fs::File;
|
|
8
|
+
use std::io::Read;
|
|
9
|
+
use std::path::Path;
|
|
10
|
+
use zip::ZipArchive;
|
|
11
|
+
|
|
12
|
+
use super::elements::Slide;
|
|
13
|
+
use super::image_handling::get_full_image_path;
|
|
14
|
+
use crate::error::{KreuzbergError, Result};
|
|
15
|
+
|
|
16
|
+
pub(super) struct PptxContainer {
|
|
17
|
+
pub(super) archive: ZipArchive<File>,
|
|
18
|
+
slide_paths: Vec<String>,
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
impl PptxContainer {
|
|
22
|
+
pub(super) fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
|
|
23
|
+
// IO errors must bubble up unchanged - file access issues need user reports ~keep
|
|
24
|
+
let file = File::open(path)?;
|
|
25
|
+
|
|
26
|
+
let mut archive = match ZipArchive::new(file) {
|
|
27
|
+
Ok(arc) => arc,
|
|
28
|
+
Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
|
|
29
|
+
Err(e) => {
|
|
30
|
+
return Err(KreuzbergError::parsing(format!(
|
|
31
|
+
"Failed to read PPTX archive (invalid format): {}",
|
|
32
|
+
e
|
|
33
|
+
)));
|
|
34
|
+
}
|
|
35
|
+
};
|
|
36
|
+
|
|
37
|
+
let slide_paths = Self::find_slide_paths(&mut archive)?;
|
|
38
|
+
|
|
39
|
+
Ok(Self { archive, slide_paths })
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
pub(super) fn slide_paths(&self) -> &[String] {
|
|
43
|
+
&self.slide_paths
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
pub(super) fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
|
|
47
|
+
match self.archive.by_name(path) {
|
|
48
|
+
Ok(mut file) => {
|
|
49
|
+
let mut contents = Vec::new();
|
|
50
|
+
// IO errors must bubble up - file read issues need user reports ~keep
|
|
51
|
+
file.read_to_end(&mut contents)?;
|
|
52
|
+
Ok(contents)
|
|
53
|
+
}
|
|
54
|
+
Err(zip::result::ZipError::FileNotFound) => {
|
|
55
|
+
Err(KreuzbergError::parsing("File not found in archive".to_string()))
|
|
56
|
+
}
|
|
57
|
+
Err(zip::result::ZipError::Io(io_err)) => Err(io_err.into()), // Bubble up IO errors ~keep
|
|
58
|
+
Err(e) => Err(KreuzbergError::parsing(format!("Zip error: {}", e))),
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
pub(super) fn get_slide_rels_path(&self, slide_path: &str) -> String {
|
|
63
|
+
super::image_handling::get_slide_rels_path(slide_path)
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
fn find_slide_paths(archive: &mut ZipArchive<File>) -> Result<Vec<String>> {
|
|
67
|
+
if let Ok(rels_data) = Self::read_file_from_archive(archive, "ppt/_rels/presentation.xml.rels")
|
|
68
|
+
&& let Ok(paths) = super::parser::parse_presentation_rels(&rels_data)
|
|
69
|
+
{
|
|
70
|
+
return Ok(paths);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
let mut slide_paths = Vec::new();
|
|
74
|
+
for i in 0..archive.len() {
|
|
75
|
+
if let Ok(file) = archive.by_index(i) {
|
|
76
|
+
let name = file.name();
|
|
77
|
+
if name.starts_with("ppt/slides/slide") && name.ends_with(".xml") {
|
|
78
|
+
slide_paths.push(name.to_string());
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
slide_paths.sort();
|
|
84
|
+
Ok(slide_paths)
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
fn read_file_from_archive(archive: &mut ZipArchive<File>, path: &str) -> Result<Vec<u8>> {
|
|
88
|
+
let mut file = match archive.by_name(path) {
|
|
89
|
+
Ok(f) => f,
|
|
90
|
+
Err(zip::result::ZipError::Io(io_err)) => return Err(io_err.into()), // Bubble up IO errors ~keep
|
|
91
|
+
Err(e) => {
|
|
92
|
+
return Err(KreuzbergError::parsing(format!(
|
|
93
|
+
"Failed to read file from archive: {}",
|
|
94
|
+
e
|
|
95
|
+
)));
|
|
96
|
+
}
|
|
97
|
+
};
|
|
98
|
+
let mut contents = Vec::new();
|
|
99
|
+
// IO errors must bubble up - file read issues need user reports ~keep
|
|
100
|
+
file.read_to_end(&mut contents)?;
|
|
101
|
+
Ok(contents)
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
pub(super) struct SlideIterator {
|
|
106
|
+
container: PptxContainer,
|
|
107
|
+
current_index: usize,
|
|
108
|
+
total_slides: usize,
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
impl SlideIterator {
|
|
112
|
+
pub(super) fn new(container: PptxContainer) -> Self {
|
|
113
|
+
let total_slides = container.slide_paths().len();
|
|
114
|
+
Self {
|
|
115
|
+
container,
|
|
116
|
+
current_index: 0,
|
|
117
|
+
total_slides,
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
pub(super) fn slide_count(&self) -> usize {
|
|
122
|
+
self.total_slides
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
pub(super) fn next_slide(&mut self) -> Result<Option<Slide>> {
|
|
126
|
+
if self.current_index >= self.total_slides {
|
|
127
|
+
return Ok(None);
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
let slide_path = &self.container.slide_paths()[self.current_index].clone();
|
|
131
|
+
let slide_number = (self.current_index + 1) as u32;
|
|
132
|
+
|
|
133
|
+
let xml_data = self.container.read_file(slide_path)?;
|
|
134
|
+
|
|
135
|
+
let rels_path = self.container.get_slide_rels_path(slide_path);
|
|
136
|
+
let rels_data = self.container.read_file(&rels_path).ok();
|
|
137
|
+
|
|
138
|
+
let slide = Slide::from_xml(slide_number, &xml_data, rels_data.as_deref())?;
|
|
139
|
+
|
|
140
|
+
self.current_index += 1;
|
|
141
|
+
|
|
142
|
+
Ok(Some(slide))
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
pub(super) fn get_slide_images(&mut self, slide: &Slide) -> Result<HashMap<String, Vec<u8>>> {
|
|
146
|
+
let mut image_data = HashMap::new();
|
|
147
|
+
|
|
148
|
+
for img_ref in &slide.images {
|
|
149
|
+
let slide_path = &self.container.slide_paths()[slide.slide_number as usize - 1];
|
|
150
|
+
let full_path = get_full_image_path(slide_path, &img_ref.target);
|
|
151
|
+
|
|
152
|
+
if let Ok(data) = self.container.read_file(&full_path) {
|
|
153
|
+
image_data.insert(img_ref.id.clone(), data);
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
Ok(image_data)
|
|
158
|
+
}
|
|
159
|
+
}
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
//! Content builder for accumulating slide output.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides utilities for building the final markdown content
|
|
4
|
+
//! from slide elements and managing page boundaries.
|
|
5
|
+
|
|
6
|
+
pub(super) struct ContentBuilder {
|
|
7
|
+
pub(super) content: String,
|
|
8
|
+
pub(super) boundaries: Vec<crate::types::PageBoundary>,
|
|
9
|
+
pub(super) page_contents: Vec<crate::types::PageContent>,
|
|
10
|
+
pub(super) config: Option<crate::core::config::PageConfig>,
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
impl ContentBuilder {
|
|
14
|
+
pub(super) fn new() -> Self {
|
|
15
|
+
Self {
|
|
16
|
+
content: String::with_capacity(8192),
|
|
17
|
+
boundaries: Vec::new(),
|
|
18
|
+
page_contents: Vec::new(),
|
|
19
|
+
config: None,
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
pub(super) fn with_page_config(capacity: usize, config: Option<crate::core::config::PageConfig>) -> Self {
|
|
24
|
+
Self {
|
|
25
|
+
content: String::with_capacity(capacity),
|
|
26
|
+
boundaries: if config.is_some() {
|
|
27
|
+
Vec::new()
|
|
28
|
+
} else {
|
|
29
|
+
Vec::with_capacity(0)
|
|
30
|
+
},
|
|
31
|
+
page_contents: if config.is_some() {
|
|
32
|
+
Vec::new()
|
|
33
|
+
} else {
|
|
34
|
+
Vec::with_capacity(0)
|
|
35
|
+
},
|
|
36
|
+
config,
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
pub(super) fn start_slide(&mut self, slide_number: u32) -> usize {
|
|
41
|
+
let byte_start = self.content.len();
|
|
42
|
+
|
|
43
|
+
if let Some(ref cfg) = self.config
|
|
44
|
+
&& cfg.insert_page_markers
|
|
45
|
+
{
|
|
46
|
+
let marker = cfg.marker_format.replace("{page_num}", &slide_number.to_string());
|
|
47
|
+
self.content.push_str(&marker);
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
byte_start
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
pub(super) fn end_slide(&mut self, slide_number: u32, byte_start: usize, slide_content: String) {
|
|
54
|
+
let byte_end = self.content.len();
|
|
55
|
+
|
|
56
|
+
if self.config.is_some() {
|
|
57
|
+
self.boundaries.push(crate::types::PageBoundary {
|
|
58
|
+
byte_start,
|
|
59
|
+
byte_end,
|
|
60
|
+
page_number: slide_number as usize,
|
|
61
|
+
});
|
|
62
|
+
|
|
63
|
+
self.page_contents.push(crate::types::PageContent {
|
|
64
|
+
page_number: slide_number as usize,
|
|
65
|
+
content: slide_content,
|
|
66
|
+
tables: Vec::new(),
|
|
67
|
+
images: Vec::new(),
|
|
68
|
+
hierarchy: None,
|
|
69
|
+
});
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
pub(super) fn add_slide_header(&mut self, slide_number: u32) {
|
|
74
|
+
self.content.reserve(50);
|
|
75
|
+
self.content.push_str("\n\n<!-- Slide number: ");
|
|
76
|
+
self.content.push_str(&slide_number.to_string());
|
|
77
|
+
self.content.push_str(" -->\n");
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
pub(super) fn add_text(&mut self, text: &str) {
|
|
81
|
+
if !text.trim().is_empty() {
|
|
82
|
+
self.content.push_str(text);
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
pub(super) fn add_title(&mut self, title: &str) {
|
|
87
|
+
if !title.trim().is_empty() {
|
|
88
|
+
self.content.push_str("# ");
|
|
89
|
+
self.content.push_str(title.trim());
|
|
90
|
+
self.content.push('\n');
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
pub(super) fn add_table(&mut self, rows: &[Vec<String>]) {
|
|
95
|
+
if rows.is_empty() {
|
|
96
|
+
return;
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
self.content.push_str("\n<table>");
|
|
100
|
+
for (i, row) in rows.iter().enumerate() {
|
|
101
|
+
self.content.push_str("<tr>");
|
|
102
|
+
let tag = if i == 0 { "th" } else { "td" };
|
|
103
|
+
|
|
104
|
+
for cell in row {
|
|
105
|
+
self.content.push('<');
|
|
106
|
+
self.content.push_str(tag);
|
|
107
|
+
self.content.push('>');
|
|
108
|
+
self.content.push_str(&super::image_handling::html_escape(cell));
|
|
109
|
+
self.content.push_str("</");
|
|
110
|
+
self.content.push_str(tag);
|
|
111
|
+
self.content.push('>');
|
|
112
|
+
}
|
|
113
|
+
self.content.push_str("</tr>");
|
|
114
|
+
}
|
|
115
|
+
self.content.push_str("</table>\n");
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
pub(super) fn add_list_item(&mut self, level: u32, is_ordered: bool, text: &str) {
|
|
119
|
+
let indent_count = level.saturating_sub(1) as usize;
|
|
120
|
+
for _ in 0..indent_count {
|
|
121
|
+
self.content.push_str(" ");
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
let marker = if is_ordered { "1." } else { "-" };
|
|
125
|
+
self.content.push_str(marker);
|
|
126
|
+
self.content.push(' ');
|
|
127
|
+
self.content.push_str(text.trim());
|
|
128
|
+
self.content.push('\n');
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
pub(super) fn add_image(&mut self, image_id: &str, slide_number: u32) {
|
|
132
|
+
let filename = format!("slide_{}_image_{}.jpg", slide_number, image_id);
|
|
133
|
+
self.content.push_str(";
|
|
136
|
+
self.content.push_str(&filename);
|
|
137
|
+
self.content.push_str(")\n");
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
pub(super) fn add_notes(&mut self, notes: &str) {
|
|
141
|
+
if !notes.trim().is_empty() {
|
|
142
|
+
self.content.push_str("\n\n### Notes:\n");
|
|
143
|
+
self.content.push_str(notes);
|
|
144
|
+
self.content.push('\n');
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
pub(super) fn build(
|
|
149
|
+
self,
|
|
150
|
+
) -> (
|
|
151
|
+
String,
|
|
152
|
+
Option<Vec<crate::types::PageBoundary>>,
|
|
153
|
+
Option<Vec<crate::types::PageContent>>,
|
|
154
|
+
) {
|
|
155
|
+
let content = self.content.trim().to_string();
|
|
156
|
+
let boundaries = if self.config.is_some() && !self.boundaries.is_empty() {
|
|
157
|
+
Some(self.boundaries)
|
|
158
|
+
} else {
|
|
159
|
+
None
|
|
160
|
+
};
|
|
161
|
+
let pages = if self.config.is_some() && !self.page_contents.is_empty() {
|
|
162
|
+
Some(self.page_contents)
|
|
163
|
+
} else {
|
|
164
|
+
None
|
|
165
|
+
};
|
|
166
|
+
(content, boundaries, pages)
|
|
167
|
+
}
|
|
168
|
+
}
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
//! Internal types for PPTX extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! This module defines the internal data structures used to represent
|
|
4
|
+
//! slide elements, formatting, and text runs during XML parsing.
|
|
5
|
+
|
|
6
|
+
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, PartialOrd, Ord)]
|
|
7
|
+
pub(super) struct ElementPosition {
|
|
8
|
+
pub(super) x: i64,
|
|
9
|
+
pub(super) y: i64,
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
#[derive(Debug, Clone, Default)]
|
|
13
|
+
pub(super) struct Formatting {
|
|
14
|
+
pub(super) bold: bool,
|
|
15
|
+
pub(super) italic: bool,
|
|
16
|
+
pub(super) underlined: bool,
|
|
17
|
+
pub(super) lang: String,
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
#[derive(Debug, Clone)]
|
|
21
|
+
pub(super) struct Run {
|
|
22
|
+
pub(super) text: String,
|
|
23
|
+
pub(super) formatting: Formatting,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
impl Run {
|
|
27
|
+
pub(super) fn extract(&self) -> String {
|
|
28
|
+
self.text.clone()
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
pub(super) fn render_as_md(&self) -> String {
|
|
32
|
+
let mut result = self.text.clone();
|
|
33
|
+
|
|
34
|
+
if self.formatting.bold {
|
|
35
|
+
result = format!("**{}**", result);
|
|
36
|
+
}
|
|
37
|
+
if self.formatting.italic {
|
|
38
|
+
result = format!("*{}*", result);
|
|
39
|
+
}
|
|
40
|
+
if self.formatting.underlined {
|
|
41
|
+
result = format!("<u>{}</u>", result);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
result
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
#[derive(Debug, Clone)]
|
|
49
|
+
pub(super) struct TextElement {
|
|
50
|
+
pub(super) runs: Vec<Run>,
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
#[derive(Debug, Clone)]
|
|
54
|
+
pub(super) struct ListItem {
|
|
55
|
+
pub(super) level: u32,
|
|
56
|
+
pub(super) is_ordered: bool,
|
|
57
|
+
pub(super) runs: Vec<Run>,
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
#[derive(Debug, Clone)]
|
|
61
|
+
pub(super) struct ListElement {
|
|
62
|
+
pub(super) items: Vec<ListItem>,
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
#[derive(Debug, Clone)]
|
|
66
|
+
pub(super) struct TableCell {
|
|
67
|
+
pub(super) runs: Vec<Run>,
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
#[derive(Debug, Clone)]
|
|
71
|
+
pub(super) struct TableRow {
|
|
72
|
+
pub(super) cells: Vec<TableCell>,
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
#[derive(Debug, Clone)]
|
|
76
|
+
pub(super) struct TableElement {
|
|
77
|
+
pub(super) rows: Vec<TableRow>,
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
#[derive(Debug, Clone)]
|
|
81
|
+
pub(super) struct ImageReference {
|
|
82
|
+
pub(super) id: String,
|
|
83
|
+
pub(super) target: String,
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
#[derive(Debug, Clone)]
|
|
87
|
+
pub(super) enum SlideElement {
|
|
88
|
+
Text(TextElement, ElementPosition),
|
|
89
|
+
Table(TableElement, ElementPosition),
|
|
90
|
+
Image(ImageReference, ElementPosition),
|
|
91
|
+
List(ListElement, ElementPosition),
|
|
92
|
+
Unknown,
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
impl SlideElement {
|
|
96
|
+
pub(super) fn position(&self) -> ElementPosition {
|
|
97
|
+
match self {
|
|
98
|
+
SlideElement::Text(_, pos)
|
|
99
|
+
| SlideElement::Table(_, pos)
|
|
100
|
+
| SlideElement::Image(_, pos)
|
|
101
|
+
| SlideElement::List(_, pos) => *pos,
|
|
102
|
+
SlideElement::Unknown => ElementPosition::default(),
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
#[derive(Debug)]
|
|
108
|
+
pub(super) struct Slide {
|
|
109
|
+
pub(super) slide_number: u32,
|
|
110
|
+
pub(super) elements: Vec<SlideElement>,
|
|
111
|
+
pub(super) images: Vec<ImageReference>,
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
#[derive(Debug, Clone)]
|
|
115
|
+
pub(super) struct ParserConfig {
|
|
116
|
+
pub(super) extract_images: bool,
|
|
117
|
+
pub(super) include_slide_comment: bool,
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
impl Default for ParserConfig {
|
|
121
|
+
fn default() -> Self {
|
|
122
|
+
Self {
|
|
123
|
+
extract_images: true,
|
|
124
|
+
include_slide_comment: false,
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
pub(super) enum ParsedContent {
|
|
130
|
+
Text(TextElement),
|
|
131
|
+
List(ListElement),
|
|
132
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
//! Image parsing and format detection.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles image-related parsing from slide XML and
|
|
4
|
+
//! detection of image formats from file data.
|
|
5
|
+
|
|
6
|
+
pub(super) fn html_escape(text: &str) -> String {
|
|
7
|
+
text.replace('&', "&")
|
|
8
|
+
.replace('<', "<")
|
|
9
|
+
.replace('>', ">")
|
|
10
|
+
.replace('"', """)
|
|
11
|
+
.replace('\'', "'")
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
pub(super) fn detect_image_format(data: &[u8]) -> String {
|
|
15
|
+
if data.starts_with(&[0xFF, 0xD8, 0xFF]) {
|
|
16
|
+
"jpeg".to_string()
|
|
17
|
+
} else if data.starts_with(&[0x89, 0x50, 0x4E, 0x47]) {
|
|
18
|
+
"png".to_string()
|
|
19
|
+
} else if data.starts_with(b"GIF") {
|
|
20
|
+
"gif".to_string()
|
|
21
|
+
} else if data.starts_with(b"BM") {
|
|
22
|
+
"bmp".to_string()
|
|
23
|
+
} else if data.starts_with(b"<svg") || data.starts_with(b"<?xml") {
|
|
24
|
+
"svg".to_string()
|
|
25
|
+
} else if data.starts_with(b"II\x2A\x00") || data.starts_with(b"MM\x00\x2A") {
|
|
26
|
+
"tiff".to_string()
|
|
27
|
+
} else {
|
|
28
|
+
"unknown".to_string()
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
pub(super) fn get_slide_rels_path(slide_path: &str) -> String {
|
|
33
|
+
let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
|
|
34
|
+
if parts.len() == 2 {
|
|
35
|
+
format!("{}/_rels/{}.rels", parts[1], parts[0])
|
|
36
|
+
} else {
|
|
37
|
+
format!("_rels/{}.rels", slide_path)
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
pub(super) fn get_full_image_path(slide_path: &str, image_target: &str) -> String {
|
|
42
|
+
if image_target.starts_with("..") {
|
|
43
|
+
let parts: Vec<&str> = slide_path.rsplitn(3, '/').collect();
|
|
44
|
+
if parts.len() >= 3 {
|
|
45
|
+
format!("{}/{}", parts[2], &image_target[3..])
|
|
46
|
+
} else {
|
|
47
|
+
format!("ppt/{}", &image_target[3..])
|
|
48
|
+
}
|
|
49
|
+
} else {
|
|
50
|
+
let parts: Vec<&str> = slide_path.rsplitn(2, '/').collect();
|
|
51
|
+
if parts.len() == 2 {
|
|
52
|
+
format!("{}/{}", parts[1], image_target)
|
|
53
|
+
} else {
|
|
54
|
+
format!("ppt/slides/{}", image_target)
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
}
|