kreuzberg 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -16
- data/README.md +4 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,479 @@
|
|
|
1
|
+
//! OPML parsing and content extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles XML parsing, metadata extraction from the `<head>` section,
|
|
4
|
+
//! and recursive processing of `<outline>` elements in the `<body>` section.
|
|
5
|
+
|
|
6
|
+
use crate::Result;
|
|
7
|
+
use std::collections::HashMap;
|
|
8
|
+
|
|
9
|
+
#[cfg(feature = "office")]
|
|
10
|
+
use roxmltree::Node;
|
|
11
|
+
|
|
12
|
+
/// Parse an OPML document from raw bytes into plain text plus head metadata.
///
/// The bytes are decoded as UTF-8 and parsed as XML. Within the `<opml>`
/// element, the `<head>` section (if any) is turned into metadata and the
/// `<outline>` children of `<body>` (if any) are rendered as indented text.
/// When a `title` metadata entry exists and a `<body>` is present, the title
/// is emitted first, followed by a blank line.
///
/// # Returns
///
/// A tuple of:
/// - the rendered outline text, with surrounding whitespace trimmed
/// - the metadata map collected from the `<head>` section
///
/// # Errors
///
/// Returns `KreuzbergError::Other` when the input is not valid UTF-8 or is
/// not well-formed XML.
#[cfg(feature = "office")]
pub(crate) fn extract_content_and_metadata(content: &[u8]) -> Result<(String, HashMap<String, serde_json::Value>)> {
    let xml = std::str::from_utf8(content)
        .map_err(|e| crate::KreuzbergError::Other(format!("Invalid UTF-8 in OPML: {}", e)))?;
    let doc = roxmltree::Document::parse(xml)
        .map_err(|e| crate::KreuzbergError::Other(format!("Failed to parse OPML: {}", e)))?;

    // Locate the sections up front; a missing `<opml>` root simply yields no
    // head and no body, which produces empty content and empty metadata.
    let opml = doc.root().children().find(|n| n.tag_name().name() == "opml");
    let head = opml.and_then(|root| root.children().find(|n| n.tag_name().name() == "head"));
    let body = opml.and_then(|root| root.children().find(|n| n.tag_name().name() == "body"));

    let mut metadata = HashMap::new();
    if let Some(head) = head {
        extract_metadata_from_head(head, &mut metadata);
    }

    let mut rendered = String::new();
    if let Some(body) = body {
        // The title heads the text output only when there is a body to follow it.
        if let Some(title) = metadata.get("title").and_then(|v| v.as_str()) {
            rendered.push_str(title);
            rendered.push_str("\n\n");
        }

        body.children()
            .filter(|n| n.tag_name().name() == "outline")
            .for_each(|outline| process_outline(outline, 0, &mut rendered));
    }

    Ok((rendered.trim().to_string(), metadata))
}
|
|
53
|
+
|
|
54
|
+
/// Collect the recognised metadata fields from an OPML `<head>` element.
///
/// Only element children are inspected. A child contributes an entry when its
/// tag is one of the fields below and its text is non-empty after trimming:
/// - `title`: the document title
/// - `dateCreated`: creation date
/// - `dateModified`: last modification date
/// - `ownerName`: document owner's name
/// - `ownerEmail`: document owner's email
///
/// Each entry is stored under the tag name with the trimmed text as a JSON
/// string. All other tags are ignored; if a tag repeats, the later value
/// replaces the earlier one.
#[cfg(feature = "office")]
fn extract_metadata_from_head(head: Node, metadata: &mut HashMap<String, serde_json::Value>) {
    let entries = head.children().filter(|n| n.is_element()).filter_map(|element| {
        let field = element.tag_name().name();
        let value = element.text().unwrap_or("").trim();

        let recognised = matches!(
            field,
            "title" | "dateCreated" | "dateModified" | "ownerName" | "ownerEmail"
        );

        if recognised && !value.is_empty() {
            Some((field.to_string(), serde_json::json!(value)))
        } else {
            None
        }
    });

    // `extend` inserts in document order, so duplicates resolve last-wins.
    metadata.extend(entries);
}
|
|
92
|
+
|
|
93
|
+
/// Render an outline element and all of its nested outlines as indented text.
///
/// Each outline whose trimmed `text` attribute is non-empty contributes one
/// line: the indent string repeated once per nesting level, the text, and a
/// newline. Outlines with a missing or blank `text` emit nothing, but their
/// `<outline>` children are still visited one level deeper. Only the `text`
/// attribute is read, so URL attributes never reach the output.
///
/// The traversal is pre-order (parent before children, siblings in document
/// order). It uses an explicit heap-allocated work stack rather than native
/// recursion, so nesting depth in an untrusted document cannot exhaust the
/// call stack.
///
/// # Arguments
///
/// * `node` - The outline node to process
/// * `depth` - Nesting depth of `node` (number of indent repetitions)
/// * `output` - Output string buffer to append content to
#[cfg(feature = "office")]
pub(crate) fn process_outline(node: Node, depth: usize, output: &mut String) {
    let mut pending = vec![(node, depth)];

    while let Some((current, level)) = pending.pop() {
        let text = current.attribute("text").unwrap_or("").trim();

        if !text.is_empty() {
            output.push_str(&" ".repeat(level));
            output.push_str(text);
            output.push('\n');
        }

        // Children are pushed last-to-first so the first child is popped
        // next, which keeps the output in document order.
        pending.extend(
            current
                .children()
                .rev()
                .filter(|n| n.tag_name().name() == "outline")
                .map(|child| (child, level + 1)),
        );
    }
}
|
|
118
|
+
|
|
119
|
+
#[cfg(all(test, feature = "office"))]
|
|
120
|
+
mod tests {
|
|
121
|
+
use super::*;
|
|
122
|
+
|
|
123
|
+
#[test]
|
|
124
|
+
fn test_simple_outline_parsing() {
|
|
125
|
+
let opml = br#"<?xml version="1.0"?>
|
|
126
|
+
<opml version="2.0">
|
|
127
|
+
<head>
|
|
128
|
+
<title>Test</title>
|
|
129
|
+
</head>
|
|
130
|
+
<body>
|
|
131
|
+
<outline text="Item 1" />
|
|
132
|
+
<outline text="Item 2" />
|
|
133
|
+
</body>
|
|
134
|
+
</opml>"#;
|
|
135
|
+
|
|
136
|
+
let (content, metadata) = extract_content_and_metadata(opml).expect("Should parse simple OPML");
|
|
137
|
+
|
|
138
|
+
assert!(content.contains("Item 1"), "Should extract first item");
|
|
139
|
+
assert!(content.contains("Item 2"), "Should extract second item");
|
|
140
|
+
assert_eq!(
|
|
141
|
+
metadata.get("title").and_then(|v| v.as_str()),
|
|
142
|
+
Some("Test"),
|
|
143
|
+
"Should extract title"
|
|
144
|
+
);
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
#[test]
|
|
148
|
+
fn test_nested_hierarchy() {
|
|
149
|
+
let opml = br#"<?xml version="1.0"?>
|
|
150
|
+
<opml version="2.0">
|
|
151
|
+
<head>
|
|
152
|
+
<title>Hierarchy Test</title>
|
|
153
|
+
</head>
|
|
154
|
+
<body>
|
|
155
|
+
<outline text="Category">
|
|
156
|
+
<outline text="Subcategory">
|
|
157
|
+
<outline text="Item" />
|
|
158
|
+
</outline>
|
|
159
|
+
</outline>
|
|
160
|
+
</body>
|
|
161
|
+
</opml>"#;
|
|
162
|
+
|
|
163
|
+
let (content, _) = extract_content_and_metadata(opml).expect("Should parse nested OPML");
|
|
164
|
+
|
|
165
|
+
assert!(content.contains("Category"), "Should contain top level");
|
|
166
|
+
assert!(content.contains("Subcategory"), "Should contain nested level");
|
|
167
|
+
assert!(content.contains("Item"), "Should contain deep item");
|
|
168
|
+
assert!(content.contains(" "), "Should have indentation for nested items");
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
#[test]
|
|
172
|
+
fn test_rss_feeds() {
|
|
173
|
+
let opml = br#"<?xml version="1.0"?>
|
|
174
|
+
<opml version="2.0">
|
|
175
|
+
<head>
|
|
176
|
+
<title>Feeds</title>
|
|
177
|
+
</head>
|
|
178
|
+
<body>
|
|
179
|
+
<outline text="Tech">
|
|
180
|
+
<outline text="Hacker News" type="rss" xmlUrl="https://news.ycombinator.com/rss" htmlUrl="https://news.ycombinator.com/" />
|
|
181
|
+
<outline text="TechCrunch" type="rss" xmlUrl="https://techcrunch.com/feed/" />
|
|
182
|
+
</outline>
|
|
183
|
+
</body>
|
|
184
|
+
</opml>"#;
|
|
185
|
+
|
|
186
|
+
let (content, _) = extract_content_and_metadata(opml).expect("Should parse RSS OPML");
|
|
187
|
+
|
|
188
|
+
assert!(content.contains("Hacker News"), "Should extract feed title");
|
|
189
|
+
assert!(
|
|
190
|
+
!content.contains("https://"),
|
|
191
|
+
"Should NOT extract feed URLs (text-only extraction)"
|
|
192
|
+
);
|
|
193
|
+
assert!(content.contains("TechCrunch"), "Should extract multiple feeds");
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
#[test]
|
|
197
|
+
fn test_metadata_extraction() {
|
|
198
|
+
let opml = br#"<?xml version="1.0"?>
|
|
199
|
+
<opml version="2.0">
|
|
200
|
+
<head>
|
|
201
|
+
<title>My Feeds</title>
|
|
202
|
+
<dateCreated>Mon, 06 Nov 2023 00:00:00 GMT</dateCreated>
|
|
203
|
+
<dateModified>Fri, 01 Dec 2023 12:00:00 GMT</dateModified>
|
|
204
|
+
<ownerName>John Doe</ownerName>
|
|
205
|
+
<ownerEmail>john@example.com</ownerEmail>
|
|
206
|
+
</head>
|
|
207
|
+
<body>
|
|
208
|
+
<outline text="Item" />
|
|
209
|
+
</body>
|
|
210
|
+
</opml>"#;
|
|
211
|
+
|
|
212
|
+
let (_content, metadata) = extract_content_and_metadata(opml).expect("Should extract metadata");
|
|
213
|
+
|
|
214
|
+
assert_eq!(metadata.get("title").and_then(|v| v.as_str()), Some("My Feeds"));
|
|
215
|
+
assert_eq!(metadata.get("ownerName").and_then(|v| v.as_str()), Some("John Doe"));
|
|
216
|
+
assert_eq!(
|
|
217
|
+
metadata.get("ownerEmail").and_then(|v| v.as_str()),
|
|
218
|
+
Some("john@example.com")
|
|
219
|
+
);
|
|
220
|
+
assert!(metadata.contains_key("dateCreated"));
|
|
221
|
+
assert!(metadata.contains_key("dateModified"));
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
#[test]
|
|
225
|
+
fn test_with_special_characters() {
|
|
226
|
+
let opml = br#"<?xml version="1.0"?>
|
|
227
|
+
<opml version="2.0">
|
|
228
|
+
<head>
|
|
229
|
+
<title>Test & Special</title>
|
|
230
|
+
</head>
|
|
231
|
+
<body>
|
|
232
|
+
<outline text="Business & Startups" />
|
|
233
|
+
<outline text="Science <Advanced>" />
|
|
234
|
+
</body>
|
|
235
|
+
</opml>"#;
|
|
236
|
+
|
|
237
|
+
let (content, metadata) = extract_content_and_metadata(opml).expect("Should handle special characters");
|
|
238
|
+
|
|
239
|
+
assert!(
|
|
240
|
+
content.contains("Business") && content.contains("Startups"),
|
|
241
|
+
"Should decode HTML entities"
|
|
242
|
+
);
|
|
243
|
+
let title = metadata.get("title").and_then(|v| v.as_str()).unwrap_or("");
|
|
244
|
+
assert!(!title.is_empty(), "Should extract title");
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
#[test]
|
|
248
|
+
fn test_empty_body() {
|
|
249
|
+
let opml = br#"<?xml version="1.0"?>
|
|
250
|
+
<opml version="2.0">
|
|
251
|
+
<head>
|
|
252
|
+
<title>Empty</title>
|
|
253
|
+
</head>
|
|
254
|
+
<body>
|
|
255
|
+
</body>
|
|
256
|
+
</opml>"#;
|
|
257
|
+
|
|
258
|
+
let (_content, metadata) = extract_content_and_metadata(opml).expect("Should handle empty body");
|
|
259
|
+
|
|
260
|
+
assert_eq!(metadata.get("title").and_then(|v| v.as_str()), Some("Empty"));
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
#[test]
|
|
264
|
+
fn test_malformed_missing_closing_tag() {
|
|
265
|
+
let opml = br#"<?xml version="1.0"?>
|
|
266
|
+
<opml version="2.0">
|
|
267
|
+
<head>
|
|
268
|
+
<title>Broken</title>
|
|
269
|
+
</head>
|
|
270
|
+
<body>
|
|
271
|
+
<outline text="Unclosed"
|
|
272
|
+
</body>
|
|
273
|
+
</opml>"#;
|
|
274
|
+
|
|
275
|
+
let result = extract_content_and_metadata(opml);
|
|
276
|
+
assert!(result.is_err(), "Should fail to parse OPML with missing closing tags");
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
#[test]
|
|
280
|
+
fn test_malformed_invalid_nesting() {
|
|
281
|
+
let opml = br#"<?xml version="1.0"?>
|
|
282
|
+
<opml version="2.0">
|
|
283
|
+
<head>
|
|
284
|
+
<title>Invalid Nesting</title>
|
|
285
|
+
<body>
|
|
286
|
+
<outline text="Item" />
|
|
287
|
+
</body>
|
|
288
|
+
</opml>"#;
|
|
289
|
+
|
|
290
|
+
let result = extract_content_and_metadata(opml);
|
|
291
|
+
assert!(result.is_err(), "Should fail to parse OPML with invalid nesting");
|
|
292
|
+
}
|
|
293
|
+
|
|
294
|
+
#[test]
|
|
295
|
+
fn test_empty_outline_elements() {
|
|
296
|
+
let opml = br#"<?xml version="1.0"?>
|
|
297
|
+
<opml version="2.0">
|
|
298
|
+
<head>
|
|
299
|
+
<title>Empty Outlines</title>
|
|
300
|
+
</head>
|
|
301
|
+
<body>
|
|
302
|
+
<outline text="" />
|
|
303
|
+
<outline />
|
|
304
|
+
<outline text="Valid Item">
|
|
305
|
+
<outline text="" />
|
|
306
|
+
<outline text="Another Valid" />
|
|
307
|
+
</outline>
|
|
308
|
+
</body>
|
|
309
|
+
</opml>"#;
|
|
310
|
+
|
|
311
|
+
let (content, metadata) = extract_content_and_metadata(opml).expect("Should handle empty outline elements");
|
|
312
|
+
|
|
313
|
+
assert!(content.contains("Valid Item"), "Should extract valid items");
|
|
314
|
+
assert!(content.contains("Another Valid"), "Should extract nested valid items");
|
|
315
|
+
let empty_count = content.matches("\n\n").count();
|
|
316
|
+
assert!(empty_count < 3, "Should skip empty outline elements");
|
|
317
|
+
|
|
318
|
+
assert_eq!(metadata.get("title").and_then(|v| v.as_str()), Some("Empty Outlines"));
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
#[test]
|
|
322
|
+
fn test_deeply_nested_empty_nodes() {
|
|
323
|
+
let opml = br#"<?xml version="1.0"?>
|
|
324
|
+
<opml version="2.0">
|
|
325
|
+
<head>
|
|
326
|
+
<title>Deep Nesting</title>
|
|
327
|
+
</head>
|
|
328
|
+
<body>
|
|
329
|
+
<outline text="Level 1">
|
|
330
|
+
<outline text="">
|
|
331
|
+
<outline text="">
|
|
332
|
+
<outline text="Deep Item">
|
|
333
|
+
<outline text="" />
|
|
334
|
+
</outline>
|
|
335
|
+
</outline>
|
|
336
|
+
</outline>
|
|
337
|
+
<outline text="Level 1 Sibling" />
|
|
338
|
+
</outline>
|
|
339
|
+
</body>
|
|
340
|
+
</opml>"#;
|
|
341
|
+
|
|
342
|
+
let (content, _) = extract_content_and_metadata(opml).expect("Should handle deeply nested structures");
|
|
343
|
+
|
|
344
|
+
assert!(content.contains("Level 1"), "Should extract top-level item");
|
|
345
|
+
assert!(content.contains("Deep Item"), "Should extract deeply nested item");
|
|
346
|
+
assert!(content.contains("Level 1 Sibling"), "Should extract sibling items");
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
/// Outlines that carry no `text` attribute must not break extraction, and
/// their other attributes (such as `xmlUrl`) must not appear in the content.
#[test]
fn test_outline_with_missing_text_attribute() {
    let document = br#"<?xml version="1.0"?>
<opml version="2.0">
  <head>
    <title>Missing Attributes</title>
  </head>
  <body>
    <outline type="folder" />
    <outline text="Valid Item" type="rss" />
    <outline type="rss" xmlUrl="https://example.com/feed" />
  </body>
</opml>"#;

    let extracted = extract_content_and_metadata(document);
    let (content, metadata) = extracted.expect("Should handle outline with missing text attribute");

    let has_valid_item = content.contains("Valid Item");
    assert!(has_valid_item, "Should extract item with text");

    let leaks_url = content.contains("https://");
    assert!(!leaks_url, "Should not extract URLs");

    let title = metadata.get("title").and_then(|value| value.as_str());
    assert_eq!(title, Some("Missing Attributes"));
}
|
|
374
|
+
|
|
375
|
+
/// Outlines whose `text` attribute holds only whitespace (a space, or a line
/// break) must not prevent the title and the real item from being extracted.
#[test]
fn test_whitespace_only_text_attribute() {
    let document = br#"<?xml version="1.0"?>
<opml version="2.0">
  <head>
    <title>Whitespace Test</title>
  </head>
  <body>
    <outline text=" " />
    <outline text="
    " />
    <outline text="Real Content" />
  </body>
</opml>"#;

    let (content, _) = extract_content_and_metadata(document).expect("Should handle whitespace-only text");

    let has_real_content = content.contains("Real Content");
    assert!(has_real_content, "Should extract non-whitespace content");

    // Repeat the checks on the trimmed output so surrounding blank space
    // cannot be what makes them pass.
    let body = content.trim();
    for (needle, message) in [
        ("Whitespace Test", "Should have title"),
        ("Real Content", "Should have real content"),
    ] {
        assert!(body.contains(needle), "{}", message);
    }
}
|
|
400
|
+
|
|
401
|
+
/// Predefined XML entities in the title and in nested outline text must be
/// accepted by the parser and decoded in the extracted output.
///
/// The fixture writes `&`, `<` and `>` as `&amp;`, `&lt;` and `&gt;`: a bare
/// `&` or `<` inside element text or an attribute value is not well-formed
/// XML, so an unescaped fixture would test parser error handling instead of
/// entity decoding.
#[test]
fn test_html_entity_in_nested_structure() {
    let opml = br#"<?xml version="1.0"?>
<opml version="2.0">
  <head>
    <title>Entities &amp; Nesting</title>
  </head>
  <body>
    <outline text="News &amp; Updates">
      <outline text="Tech &lt; Science" />
      <outline text="Health &gt; Wealth" />
    </outline>
  </body>
</opml>"#;

    let (content, metadata) = extract_content_and_metadata(opml).expect("Should handle HTML entities");

    assert!(
        content.contains("News") && content.contains("Updates"),
        "Should decode &amp; entity"
    );
    assert!(content.contains("Tech"), "Should handle &lt; entity");
    assert!(content.contains("Science"), "Should decode entity properly");

    // The title is expected to hold the decoded ampersand, not the escape.
    let title = metadata.get("title").and_then(|v| v.as_str()).unwrap_or("");
    assert!(
        title.contains("&") && title.contains("Nesting"),
        "Title should have decoded entity"
    );
}
|
|
431
|
+
|
|
432
|
+
/// The smallest useful document: one title and one childless outline.
#[test]
fn test_single_outline_no_children() {
    let document = br#"<?xml version="1.0"?>
<opml version="2.0">
  <head>
    <title>Single</title>
  </head>
  <body>
    <outline text="Only Item" />
  </body>
</opml>"#;

    let extracted = extract_content_and_metadata(document);
    let (content, metadata) = extracted.expect("Should handle single outline");

    let has_item = content.contains("Only Item");
    assert!(has_item, "Should extract single item");

    let title = metadata.get("title").and_then(|value| value.as_str());
    assert_eq!(title, Some("Single"));
}
|
|
449
|
+
|
|
450
|
+
/// A document with a `<head>` but no `<body>` still yields its title as
/// metadata; the content is either empty or just the title.
#[test]
fn test_head_without_body() {
    let document = br#"<?xml version="1.0"?>
<opml version="2.0">
  <head>
    <title>No Body</title>
  </head>
</opml>"#;

    let extracted = extract_content_and_metadata(document);
    let (content, metadata) = extracted.expect("Should handle OPML without body");

    let title = metadata.get("title").and_then(|value| value.as_str());
    assert_eq!(title, Some("No Body"));

    let only_title = content.trim() == "No Body";
    assert!(content.is_empty() || only_title);
}
|
|
464
|
+
|
|
465
|
+
/// A document with a `<body>` but no `<head>` yields the outline text and an
/// empty metadata collection.
#[test]
fn test_body_without_head() {
    let document = br#"<?xml version="1.0"?>
<opml version="2.0">
  <body>
    <outline text="Item" />
  </body>
</opml>"#;

    let extracted = extract_content_and_metadata(document);
    let (content, metadata) = extracted.expect("Should handle OPML without head");

    let has_item = content.contains("Item");
    assert!(has_item, "Should extract body content");

    let no_metadata = metadata.is_empty();
    assert!(no_metadata, "Should have no metadata without head");
}
|
|
479
|
+
}
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
//! Core PDF extraction functionality.
|
|
2
|
+
//!
|
|
3
|
+
//! Handles document loading, text extraction, metadata parsing, and table detection.
|
|
4
|
+
|
|
5
|
+
use crate::Result;
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
use crate::types::PageContent;
|
|
8
|
+
|
|
9
|
+
#[cfg(feature = "pdf")]
|
|
10
|
+
use crate::types::Table;
|
|
11
|
+
#[cfg(feature = "pdf")]
|
|
12
|
+
use pdfium_render::prelude::*;
|
|
13
|
+
|
|
14
|
+
/// Combined output of the single-pass PDF extraction phase, as returned by
/// `extract_all_from_document`.
///
/// Tuple slots, in order:
/// 1. `PdfExtractionMetadata` - metadata gathered for the document.
/// 2. `String` - the natively extracted text.
/// 3. `Vec<Table>` - the tables found in the document.
/// 4. `Option<Vec<PageContent>>` - per-page content, when it was produced.
#[cfg(feature = "pdf")]
pub(crate) type PdfExtractionPhaseResult = (
    crate::pdf::metadata::PdfExtractionMetadata,
    String,
    Vec<Table>,
    Option<Vec<PageContent>>,
);
|
|
21
|
+
|
|
22
|
+
/// Run every PDF extraction phase against one already-loaded document.
///
/// Text and metadata come from
/// `crate::pdf::text::extract_text_and_metadata_from_pdf_document`; tables
/// come from `extract_tables_from_document`. Both receive the same
/// `PdfDocument` reference, so the document is not parsed again and pdfium
/// is not re-initialised between phases.
///
/// # Performance
///
/// Sharing one document instance removes:
/// - duplicate document parsing (25-40ms saved)
/// - redundant pdfium bindings initialization
/// - repeated page tree traversals
///
/// Expected improvement: 20-30% faster PDF processing.
///
/// # Returns
///
/// A tuple of, in order:
/// - PDF metadata (title, authors, dates, page structure, etc.)
/// - natively extracted text (or empty if using OCR)
/// - extracted tables (always empty unless the `ocr` feature is enabled)
/// - per-page content (if page extraction is configured)
///
/// # Errors
///
/// Returns the first error produced by either extraction phase.
#[cfg(feature = "pdf")]
pub(crate) fn extract_all_from_document(
    document: &PdfDocument,
    config: &ExtractionConfig,
) -> Result<PdfExtractionPhaseResult> {
    // The second tuple slot (boundaries) is not part of this phase's result.
    let (text, _, pages, metadata) =
        crate::pdf::text::extract_text_and_metadata_from_pdf_document(document, Some(config))?;

    extract_tables_from_document(document, &metadata).map(|tables| (metadata, text, tables, pages))
}
|
|
56
|
+
|
|
57
|
+
/// Extract tables from a PDF document using native text positions.
///
/// For each page, the words returned by `extract_words_from_page` (PDF
/// character positions converted to the HocrWord format) are handed to the
/// existing table reconstruction logic. Every page that produces a non-empty
/// cell grid contributes one [`Table`], rendered to markdown and tagged with
/// its 1-based page number.
///
/// # Arguments
///
/// * `document` - Borrowed, already-loaded document; its pages are visited in order.
/// * `_metadata` - Currently unused; the parameter list matches the non-OCR fallback.
///
/// # Errors
///
/// Propagates the first error returned by `extract_words_from_page`.
#[cfg(all(feature = "pdf", feature = "ocr"))]
fn extract_tables_from_document(
    document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<Table>> {
    use crate::ocr::table::{reconstruct_table, table_to_markdown};
    use crate::pdf::table::extract_words_from_page;

    // Reconstruction parameters are the same for every page, so bind them once.
    let column_threshold = 50;
    let row_threshold_ratio = 0.5;

    let mut all_tables = Vec::new();

    for (page_index, page) in document.pages().iter().enumerate() {
        // NOTE(review): the 0.0 argument is presumably a minimum-confidence cutoff
        // that accepts every word; confirm against `extract_words_from_page`.
        let words = extract_words_from_page(&page, 0.0)?;
        if words.is_empty() {
            continue;
        }

        let table_cells = reconstruct_table(&words, column_threshold, row_threshold_ratio);
        if table_cells.is_empty() {
            continue;
        }

        let markdown = table_to_markdown(&table_cells);
        all_tables.push(Table {
            cells: table_cells,
            markdown,
            // `enumerate` is 0-based; reported page numbers start at 1.
            page_number: page_index + 1,
        });
    }

    Ok(all_tables)
}
|
|
98
|
+
|
|
99
|
+
/// Table extraction stand-in used when the `ocr` feature is disabled.
///
/// Neither argument is used; the result is always an empty table list, so
/// callers can run the same code path with or without the `ocr` feature.
#[cfg(all(feature = "pdf", not(feature = "ocr")))]
fn extract_tables_from_document(
    _document: &PdfDocument,
    _metadata: &crate::pdf::metadata::PdfExtractionMetadata,
) -> Result<Vec<crate::types::Table>> {
    Ok(Vec::new())
}
|