kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -12,6 +12,10 @@
|
|
|
12
12
|
//!
|
|
13
13
|
//! Requires the `office` feature (which includes `pulldown-cmark`).
|
|
14
14
|
|
|
15
|
+
#[cfg(feature = "office")]
|
|
16
|
+
use super::frontmatter_utils::{
|
|
17
|
+
cells_to_markdown, extract_frontmatter, extract_metadata_from_yaml, extract_title_from_content,
|
|
18
|
+
};
|
|
15
19
|
#[cfg(feature = "office")]
|
|
16
20
|
use crate::Result;
|
|
17
21
|
#[cfg(feature = "office")]
|
|
@@ -24,8 +28,6 @@ use crate::types::{ExtractionResult, Metadata, Table};
|
|
|
24
28
|
use async_trait::async_trait;
|
|
25
29
|
#[cfg(feature = "office")]
|
|
26
30
|
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
|
|
27
|
-
#[cfg(feature = "office")]
|
|
28
|
-
use serde_yaml_ng::Value as YamlValue;
|
|
29
31
|
|
|
30
32
|
/// Enhanced Markdown extractor with metadata and table support.
|
|
31
33
|
///
|
|
@@ -44,102 +46,7 @@ impl MarkdownExtractor {
|
|
|
44
46
|
Self
|
|
45
47
|
}
|
|
46
48
|
|
|
47
|
-
|
|
48
|
-
///
|
|
49
|
-
/// Frontmatter is expected to be delimited by `---` at the start of the document.
|
|
50
|
-
/// Returns the remaining content after frontmatter.
|
|
51
|
-
fn extract_frontmatter(content: &str) -> (Option<YamlValue>, String) {
|
|
52
|
-
if !content.starts_with("---") {
|
|
53
|
-
return (None, content.to_string());
|
|
54
|
-
}
|
|
55
|
-
|
|
56
|
-
let rest = &content[3..];
|
|
57
|
-
if let Some(end_pos) = rest.find("\n---") {
|
|
58
|
-
let frontmatter_str = &rest[..end_pos];
|
|
59
|
-
let remaining = &rest[end_pos + 4..];
|
|
60
|
-
|
|
61
|
-
match serde_yaml_ng::from_str::<YamlValue>(frontmatter_str) {
|
|
62
|
-
Ok(value) => (Some(value), remaining.to_string()),
|
|
63
|
-
Err(_) => (None, content.to_string()),
|
|
64
|
-
}
|
|
65
|
-
} else {
|
|
66
|
-
(None, content.to_string())
|
|
67
|
-
}
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
/// Extract metadata from YAML frontmatter.
|
|
71
|
-
///
|
|
72
|
-
/// Extracts the following YAML fields:
|
|
73
|
-
/// - Standard fields: title, author, date, description (as subject)
|
|
74
|
-
/// - Extended fields: abstract, subject, category, tags, language, version
|
|
75
|
-
/// - Array fields (keywords, tags): converted to comma-separated strings
|
|
76
|
-
fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
77
|
-
let mut metadata = Metadata::default();
|
|
78
|
-
|
|
79
|
-
if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
|
|
80
|
-
metadata.additional.insert("title".to_string(), title.into());
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
|
|
84
|
-
metadata.additional.insert("author".to_string(), author.into());
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
if let Some(date) = yaml.get("date").and_then(|v| v.as_str()) {
|
|
88
|
-
metadata.created_at = Some(date.to_string());
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
if let Some(keywords) = yaml.get("keywords") {
|
|
92
|
-
match keywords {
|
|
93
|
-
YamlValue::String(s) => {
|
|
94
|
-
metadata.additional.insert("keywords".to_string(), s.clone().into());
|
|
95
|
-
}
|
|
96
|
-
YamlValue::Sequence(seq) => {
|
|
97
|
-
let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
|
|
98
|
-
metadata.additional.insert("keywords".to_string(), keywords_str.into());
|
|
99
|
-
}
|
|
100
|
-
_ => {}
|
|
101
|
-
}
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
if let Some(description) = yaml.get("description").and_then(|v| v.as_str()) {
|
|
105
|
-
metadata.subject = Some(description.to_string());
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
|
|
109
|
-
metadata.additional.insert("abstract".to_string(), abstract_text.into());
|
|
110
|
-
}
|
|
111
|
-
|
|
112
|
-
if let Some(subject) = yaml.get("subject").and_then(|v| v.as_str()) {
|
|
113
|
-
metadata.subject = Some(subject.to_string());
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
|
|
117
|
-
metadata.additional.insert("category".to_string(), category.into());
|
|
118
|
-
}
|
|
119
|
-
|
|
120
|
-
if let Some(tags) = yaml.get("tags") {
|
|
121
|
-
match tags {
|
|
122
|
-
YamlValue::String(s) => {
|
|
123
|
-
metadata.additional.insert("tags".to_string(), s.clone().into());
|
|
124
|
-
}
|
|
125
|
-
YamlValue::Sequence(seq) => {
|
|
126
|
-
let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
|
|
127
|
-
metadata.additional.insert("tags".to_string(), tags_str.into());
|
|
128
|
-
}
|
|
129
|
-
_ => {}
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
|
|
134
|
-
metadata.additional.insert("language".to_string(), language.into());
|
|
135
|
-
}
|
|
136
|
-
|
|
137
|
-
if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
|
|
138
|
-
metadata.additional.insert("version".to_string(), version.into());
|
|
139
|
-
}
|
|
140
|
-
|
|
141
|
-
metadata
|
|
142
|
-
}
|
|
49
|
+
// Frontmatter utilities moved to shared frontmatter_utils module
|
|
143
50
|
|
|
144
51
|
/// Extract plain text from markdown AST.
|
|
145
52
|
fn extract_text_from_events(events: &[Event]) -> String {
|
|
@@ -222,7 +129,7 @@ impl MarkdownExtractor {
|
|
|
222
129
|
if let Some((cells, idx)) = current_table.take()
|
|
223
130
|
&& !cells.is_empty()
|
|
224
131
|
{
|
|
225
|
-
let markdown =
|
|
132
|
+
let markdown = cells_to_markdown(&cells);
|
|
226
133
|
tables.push(Table {
|
|
227
134
|
cells,
|
|
228
135
|
markdown,
|
|
@@ -238,50 +145,7 @@ impl MarkdownExtractor {
|
|
|
238
145
|
tables
|
|
239
146
|
}
|
|
240
147
|
|
|
241
|
-
|
|
242
|
-
fn cells_to_markdown(cells: &[Vec<String>]) -> String {
|
|
243
|
-
if cells.is_empty() {
|
|
244
|
-
return String::new();
|
|
245
|
-
}
|
|
246
|
-
|
|
247
|
-
let mut md = String::new();
|
|
248
|
-
|
|
249
|
-
md.push('|');
|
|
250
|
-
for cell in &cells[0] {
|
|
251
|
-
md.push(' ');
|
|
252
|
-
md.push_str(cell);
|
|
253
|
-
md.push_str(" |");
|
|
254
|
-
}
|
|
255
|
-
md.push('\n');
|
|
256
|
-
|
|
257
|
-
md.push('|');
|
|
258
|
-
for _ in &cells[0] {
|
|
259
|
-
md.push_str(" --- |");
|
|
260
|
-
}
|
|
261
|
-
md.push('\n');
|
|
262
|
-
|
|
263
|
-
for row in &cells[1..] {
|
|
264
|
-
md.push('|');
|
|
265
|
-
for cell in row {
|
|
266
|
-
md.push(' ');
|
|
267
|
-
md.push_str(cell);
|
|
268
|
-
md.push_str(" |");
|
|
269
|
-
}
|
|
270
|
-
md.push('\n');
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
md
|
|
274
|
-
}
|
|
275
|
-
|
|
276
|
-
/// Extract first heading as title if not in frontmatter.
|
|
277
|
-
fn extract_title_from_content(content: &str) -> Option<String> {
|
|
278
|
-
for line in content.lines() {
|
|
279
|
-
if let Some(heading) = line.strip_prefix("# ") {
|
|
280
|
-
return Some(heading.trim().to_string());
|
|
281
|
-
}
|
|
282
|
-
}
|
|
283
|
-
None
|
|
284
|
-
}
|
|
148
|
+
// cells_to_markdown and extract_title_from_content moved to shared frontmatter_utils module
|
|
285
149
|
}
|
|
286
150
|
|
|
287
151
|
#[cfg(feature = "office")]
|
|
@@ -336,16 +200,16 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
336
200
|
) -> Result<ExtractionResult> {
|
|
337
201
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
338
202
|
|
|
339
|
-
let (yaml, remaining_content) =
|
|
203
|
+
let (yaml, remaining_content) = extract_frontmatter(&text);
|
|
340
204
|
|
|
341
205
|
let mut metadata = if let Some(ref yaml_value) = yaml {
|
|
342
|
-
|
|
206
|
+
extract_metadata_from_yaml(yaml_value)
|
|
343
207
|
} else {
|
|
344
208
|
Metadata::default()
|
|
345
209
|
};
|
|
346
210
|
|
|
347
211
|
if !metadata.additional.contains_key("title")
|
|
348
|
-
&& let Some(title) =
|
|
212
|
+
&& let Some(title) = extract_title_from_content(&remaining_content)
|
|
349
213
|
{
|
|
350
214
|
metadata.additional.insert("title".to_string(), title.into());
|
|
351
215
|
}
|
|
@@ -365,7 +229,9 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
365
229
|
detected_languages: None,
|
|
366
230
|
chunks: None,
|
|
367
231
|
images: None,
|
|
232
|
+
djot_content: None,
|
|
368
233
|
pages: None,
|
|
234
|
+
elements: None,
|
|
369
235
|
})
|
|
370
236
|
}
|
|
371
237
|
|
|
@@ -380,7 +246,9 @@ impl DocumentExtractor for MarkdownExtractor {
|
|
|
380
246
|
|
|
381
247
|
#[cfg(all(test, feature = "office"))]
|
|
382
248
|
mod tests {
|
|
249
|
+
use super::super::frontmatter_utils::{cells_to_markdown, extract_frontmatter, extract_metadata_from_yaml};
|
|
383
250
|
use super::*;
|
|
251
|
+
use serde_yaml_ng::Value as YamlValue;
|
|
384
252
|
|
|
385
253
|
#[test]
|
|
386
254
|
fn test_can_extract_markdown_mime_types() {
|
|
@@ -399,7 +267,7 @@ mod tests {
|
|
|
399
267
|
b"# Header\n\nThis is a paragraph with **bold** and *italic* text.\n\n## Subheading\n\nMore content here.";
|
|
400
268
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
401
269
|
|
|
402
|
-
let (yaml, remaining) =
|
|
270
|
+
let (yaml, remaining) = extract_frontmatter(&text);
|
|
403
271
|
assert!(yaml.is_none());
|
|
404
272
|
assert!(!remaining.is_empty());
|
|
405
273
|
|
|
@@ -419,19 +287,25 @@ mod tests {
|
|
|
419
287
|
|
|
420
288
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
421
289
|
|
|
422
|
-
let (yaml_opt, remaining) =
|
|
290
|
+
let (yaml_opt, remaining) = extract_frontmatter(&text);
|
|
423
291
|
assert!(yaml_opt.is_some());
|
|
424
292
|
assert!(remaining.contains("# Content"));
|
|
425
293
|
|
|
426
294
|
let yaml = yaml_opt.expect("Should extract YAML frontmatter");
|
|
427
|
-
let metadata =
|
|
295
|
+
let metadata = extract_metadata_from_yaml(&yaml);
|
|
428
296
|
|
|
429
297
|
assert_eq!(
|
|
430
|
-
metadata
|
|
298
|
+
metadata
|
|
299
|
+
.additional
|
|
300
|
+
.get("title")
|
|
301
|
+
.and_then(|v: &serde_json::Value| v.as_str()),
|
|
431
302
|
Some("My Document")
|
|
432
303
|
);
|
|
433
304
|
assert_eq!(
|
|
434
|
-
metadata
|
|
305
|
+
metadata
|
|
306
|
+
.additional
|
|
307
|
+
.get("author")
|
|
308
|
+
.and_then(|v: &serde_json::Value| v.as_str()),
|
|
435
309
|
Some("John Doe")
|
|
436
310
|
);
|
|
437
311
|
assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
|
|
@@ -450,13 +324,16 @@ mod tests {
|
|
|
450
324
|
let content = b"---\ntitle: Document\nkeywords:\n - rust\n - markdown\n - parsing\n---\n\nContent";
|
|
451
325
|
|
|
452
326
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
453
|
-
let (yaml_opt, _remaining) =
|
|
327
|
+
let (yaml_opt, _remaining) = extract_frontmatter(&text);
|
|
454
328
|
|
|
455
329
|
assert!(yaml_opt.is_some());
|
|
456
330
|
let yaml = yaml_opt.expect("Should extract YAML frontmatter");
|
|
457
|
-
let metadata =
|
|
331
|
+
let metadata = extract_metadata_from_yaml(&yaml);
|
|
458
332
|
|
|
459
|
-
let keywords = metadata
|
|
333
|
+
let keywords = metadata
|
|
334
|
+
.additional
|
|
335
|
+
.get("keywords")
|
|
336
|
+
.and_then(|v: &serde_json::Value| v.as_str());
|
|
460
337
|
assert!(keywords.is_some());
|
|
461
338
|
let keywords_str = keywords.expect("Should extract keywords from metadata");
|
|
462
339
|
assert!(keywords_str.contains("rust"));
|
|
@@ -485,11 +362,11 @@ mod tests {
|
|
|
485
362
|
let content = b"# Main Title\n\nSome content\n\nMore text";
|
|
486
363
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
487
364
|
|
|
488
|
-
let (yaml, remaining) =
|
|
365
|
+
let (yaml, remaining) = extract_frontmatter(&text);
|
|
489
366
|
assert!(yaml.is_none());
|
|
490
367
|
assert_eq!(remaining, text);
|
|
491
368
|
|
|
492
|
-
let title =
|
|
369
|
+
let title = extract_title_from_content(&remaining);
|
|
493
370
|
assert_eq!(title, Some("Main Title".to_string()));
|
|
494
371
|
}
|
|
495
372
|
|
|
@@ -498,7 +375,7 @@ mod tests {
|
|
|
498
375
|
let content = b"";
|
|
499
376
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
500
377
|
|
|
501
|
-
let (yaml, remaining) =
|
|
378
|
+
let (yaml, remaining) = extract_frontmatter(&text);
|
|
502
379
|
assert!(yaml.is_none());
|
|
503
380
|
assert!(remaining.is_empty());
|
|
504
381
|
|
|
@@ -513,7 +390,7 @@ mod tests {
|
|
|
513
390
|
let content = b" \n\n \n";
|
|
514
391
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
515
392
|
|
|
516
|
-
let (yaml, remaining) =
|
|
393
|
+
let (yaml, remaining) = extract_frontmatter(&text);
|
|
517
394
|
assert!(yaml.is_none());
|
|
518
395
|
|
|
519
396
|
let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
|
|
@@ -528,7 +405,7 @@ mod tests {
|
|
|
528
405
|
|
|
529
406
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
530
407
|
|
|
531
|
-
let (yaml, remaining) =
|
|
408
|
+
let (yaml, remaining) = extract_frontmatter(&text);
|
|
532
409
|
assert!(yaml.is_none());
|
|
533
410
|
|
|
534
411
|
let parser = Parser::new_ext(&remaining, Options::ENABLE_TABLES);
|
|
@@ -580,7 +457,7 @@ mod tests {
|
|
|
580
457
|
vec!["Data 3".to_string(), "Data 4".to_string()],
|
|
581
458
|
];
|
|
582
459
|
|
|
583
|
-
let markdown =
|
|
460
|
+
let markdown = cells_to_markdown(&cells);
|
|
584
461
|
assert!(markdown.contains("Header 1"));
|
|
585
462
|
assert!(markdown.contains("Data 1"));
|
|
586
463
|
assert!(markdown.contains("---"));
|
|
@@ -619,7 +496,7 @@ mod tests {
|
|
|
619
496
|
let content = b"---\nthis: is: invalid: yaml:\n---\n\nContent here";
|
|
620
497
|
let text = String::from_utf8_lossy(content).into_owned();
|
|
621
498
|
|
|
622
|
-
let (yaml, _remaining) =
|
|
499
|
+
let (yaml, _remaining) = extract_frontmatter(&text);
|
|
623
500
|
let _ = yaml;
|
|
624
501
|
}
|
|
625
502
|
|
|
@@ -650,7 +527,7 @@ nested:
|
|
|
650
527
|
"#;
|
|
651
528
|
|
|
652
529
|
let yaml: YamlValue = serde_yaml_ng::from_str(yaml_str).expect("Valid YAML");
|
|
653
|
-
let metadata =
|
|
530
|
+
let metadata = extract_metadata_from_yaml(&yaml);
|
|
654
531
|
|
|
655
532
|
assert_eq!(metadata.created_at, Some("2024-01-15".to_string()));
|
|
656
533
|
assert_eq!(
|
|
@@ -64,6 +64,9 @@ pub trait SyncExtractor {
|
|
|
64
64
|
pub mod structured;
|
|
65
65
|
pub mod text;
|
|
66
66
|
|
|
67
|
+
pub mod djot_format;
|
|
68
|
+
pub mod frontmatter_utils;
|
|
69
|
+
|
|
67
70
|
#[cfg(feature = "archives")]
|
|
68
71
|
pub mod security;
|
|
69
72
|
|
|
@@ -166,6 +169,8 @@ pub use epub::EpubExtractor;
|
|
|
166
169
|
#[cfg(feature = "office")]
|
|
167
170
|
pub use fictionbook::FictionBookExtractor;
|
|
168
171
|
|
|
172
|
+
pub use djot_format::DjotExtractor;
|
|
173
|
+
|
|
169
174
|
#[cfg(feature = "office")]
|
|
170
175
|
pub use markdown::MarkdownExtractor as EnhancedMarkdownExtractor;
|
|
171
176
|
|
|
@@ -281,6 +286,8 @@ pub fn register_default_extractors() -> Result<()> {
|
|
|
281
286
|
#[cfg(feature = "excel")]
|
|
282
287
|
registry.register(Arc::new(ExcelExtractor::new()))?;
|
|
283
288
|
|
|
289
|
+
registry.register(Arc::new(DjotExtractor::new()))?;
|
|
290
|
+
|
|
284
291
|
#[cfg(feature = "office")]
|
|
285
292
|
{
|
|
286
293
|
registry.register(Arc::new(EnhancedMarkdownExtractor::new()))?;
|
|
@@ -341,10 +348,11 @@ mod tests {
|
|
|
341
348
|
let extractor_names = reg.list();
|
|
342
349
|
|
|
343
350
|
#[allow(unused_mut)]
|
|
344
|
-
let mut expected_count =
|
|
351
|
+
let mut expected_count = 4; // plain-text, markdown, structured, djot
|
|
345
352
|
assert!(extractor_names.contains(&"plain-text-extractor".to_string()));
|
|
346
353
|
assert!(extractor_names.contains(&"markdown-extractor".to_string()));
|
|
347
354
|
assert!(extractor_names.contains(&"structured-extractor".to_string()));
|
|
355
|
+
assert!(extractor_names.contains(&"djot-extractor".to_string()));
|
|
348
356
|
|
|
349
357
|
#[cfg(feature = "ocr")]
|
|
350
358
|
{
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
//! Core OPML extractor implementation.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides the main `OpmlExtractor` struct and implements the
|
|
4
|
+
//! `Plugin` and `DocumentExtractor` traits for OPML document processing.
|
|
5
|
+
|
|
6
|
+
use crate::Result;
|
|
7
|
+
use crate::core::config::ExtractionConfig;
|
|
8
|
+
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
10
|
+
use async_trait::async_trait;
|
|
11
|
+
|
|
12
|
+
#[cfg(feature = "office")]
|
|
13
|
+
use super::parser;
|
|
14
|
+
|
|
15
|
+
/// OPML format extractor.
///
/// Extracts outline structure and metadata from OPML documents using native Rust parsing.
///
/// The extractor is a field-less unit struct: it carries no state, so every
/// instance is interchangeable and copying it is free.
#[derive(Debug, Clone, Copy)]
pub struct OpmlExtractor;

impl OpmlExtractor {
    /// Create a new OPML extractor.
    ///
    /// Usable in `const` contexts because construction involves no work.
    pub const fn new() -> Self {
        Self
    }
}

impl Default for OpmlExtractor {
    /// Equivalent to [`OpmlExtractor::new`].
    fn default() -> Self {
        Self::new()
    }
}
|
|
32
|
+
|
|
33
|
+
impl Plugin for OpmlExtractor {
|
|
34
|
+
fn name(&self) -> &str {
|
|
35
|
+
"opml-extractor"
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
fn version(&self) -> String {
|
|
39
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
fn initialize(&self) -> Result<()> {
|
|
43
|
+
Ok(())
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
fn shutdown(&self) -> Result<()> {
|
|
47
|
+
Ok(())
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
fn description(&self) -> &str {
|
|
51
|
+
"Extracts content and metadata from OPML (Outline Processor Markup Language) documents"
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
fn author(&self) -> &str {
|
|
55
|
+
"Kreuzberg Team"
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
#[cfg(feature = "office")]
#[async_trait]
impl DocumentExtractor for OpmlExtractor {
    /// Parse raw OPML bytes into an [`ExtractionResult`].
    ///
    /// The bytes are handed to `parser::extract_content_and_metadata`; any error
    /// it reports is propagated unchanged. On success the returned text becomes
    /// `content`, the returned map becomes `metadata.additional`, and the
    /// caller-supplied `mime_type` is echoed back as-is. All remaining result
    /// fields are left empty (`None` / no tables). `_config` is not consulted.
    #[cfg_attr(
        feature = "otel",
        tracing::instrument(
            skip(self, content, _config),
            fields(
                extractor.name = self.name(),
                content.size_bytes = content.len(),
            )
        )
    )]
    async fn extract_bytes(
        &self,
        content: &[u8],
        mime_type: &str,
        _config: &ExtractionConfig,
    ) -> Result<ExtractionResult> {
        let (text, additional) = parser::extract_content_and_metadata(content)?;

        // Only the free-form `additional` map is populated; every typed
        // metadata field keeps its default.
        let metadata = Metadata {
            additional,
            ..Default::default()
        };

        Ok(ExtractionResult {
            content: text,
            mime_type: mime_type.to_owned(),
            metadata,
            pages: None,
            tables: Vec::new(),
            detected_languages: None,
            chunks: None,
            images: None,
            djot_content: None,
            elements: None,
        })
    }

    /// MIME types this extractor declares support for.
    fn supported_mime_types(&self) -> &[&str] {
        &["text/x-opml", "application/xml+opml"]
    }

    /// Priority value reported for this extractor.
    fn priority(&self) -> i32 {
        55
    }
}
|
|
105
|
+
|
|
106
|
+
#[cfg(all(test, feature = "office"))]
mod tests {
    use super::*;

    /// The plugin-facing accessors report the expected constants.
    #[test]
    fn test_opml_extractor_plugin_interface() {
        let extractor = OpmlExtractor::new();
        assert_eq!(extractor.name(), "opml-extractor");
        assert_eq!(extractor.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(extractor.priority(), 55);
        assert!(!extractor.supported_mime_types().is_empty());
    }

    /// An extractor obtained through the `Default` trait behaves like one from `new`.
    #[test]
    fn test_opml_extractor_default() {
        // Fully-qualified call so the `Default` impl itself is exercised rather
        // than the unit-struct literal.
        let extractor = <OpmlExtractor as Default>::default();
        assert_eq!(extractor.name(), "opml-extractor");
    }

    /// `initialize` and `shutdown` are synchronous and always succeed.
    #[test]
    fn test_opml_extractor_initialize_shutdown() {
        // Nothing is awaited here, so no async runtime is needed.
        let extractor = OpmlExtractor::new();
        assert!(extractor.initialize().is_ok());
        assert!(extractor.shutdown().is_ok());
    }

    /// Both declared MIME types are present in the supported list.
    #[test]
    fn test_opml_supported_mime_types() {
        let extractor = OpmlExtractor::new();
        let supported = extractor.supported_mime_types();
        assert!(supported.contains(&"text/x-opml"));
        assert!(supported.contains(&"application/xml+opml"));
    }

    /// End-to-end extraction of a minimal OPML document through the async API.
    #[tokio::test]
    async fn test_opml_extractor_async_extraction() {
        let extractor = OpmlExtractor::new();
        let opml = br#"<?xml version="1.0"?>
<opml version="2.0">
  <head>
    <title>Async Test</title>
  </head>
  <body>
    <outline text="Item" />
  </body>
</opml>"#;

        let result = extractor
            .extract_bytes(opml, "text/x-opml", &ExtractionConfig::default())
            .await
            .expect("Should extract OPML asynchronously");

        assert_eq!(result.mime_type, "text/x-opml");
        assert!(result.content.contains("Item"));
        assert_eq!(
            result.metadata.additional.get("title").and_then(|v| v.as_str()),
            Some("Async Test")
        );
    }
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
//! Native OPML (Outline Processor Markup Language) extractor using the `roxmltree` library.
|
|
2
|
+
//!
|
|
3
|
+
//! This extractor provides native Rust-based OPML extraction, parsing outline structures
|
|
4
|
+
//! commonly used for RSS feed lists, podcast directories, and general outlines.
|
|
5
|
+
//!
|
|
6
|
+
//! Extracts:
|
|
7
|
+
//! - Metadata from `<head>`: title, dateCreated, dateModified, ownerName, ownerEmail
|
|
8
|
+
//! - Content from `<body><outline>` hierarchy using text attributes
|
|
9
|
+
//! - Outline hierarchy structure preserved in plain text format with indentation
|
|
10
|
+
//! - Note: URLs (xmlUrl, htmlUrl) are extracted from attributes but not included in main content
|
|
11
|
+
//!
|
|
12
|
+
//! Example OPML structure:
|
|
13
|
+
//! ```xml
|
|
14
|
+
//! <opml version="2.0">
|
|
15
|
+
//! <head>
|
|
16
|
+
//! <title>My Feeds</title>
|
|
17
|
+
//! <ownerName>John</ownerName>
|
|
18
|
+
//! </head>
|
|
19
|
+
//! <body>
|
|
20
|
+
//! <outline text="Tech" type="folder">
|
|
21
|
+
//! <outline text="Hacker News" type="rss" xmlUrl="https://..." />
|
|
22
|
+
//! </outline>
|
|
23
|
+
//! </body>
|
|
24
|
+
//! </opml>
|
|
25
|
+
//! ```
|
|
26
|
+
|
|
27
|
+
mod core;
|
|
28
|
+
mod parser;
|
|
29
|
+
|
|
30
|
+
// Re-export public API
|
|
31
|
+
pub use core::OpmlExtractor;
|