kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
//! EPUB ZIP archive and XML parsing utilities.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides low-level parsing functionality for EPUB container structure,
|
|
4
|
+
//! including ZIP archive operations and container.xml parsing.
|
|
5
|
+
|
|
6
|
+
use crate::Result;
|
|
7
|
+
use roxmltree;
|
|
8
|
+
use std::io::Cursor;
|
|
9
|
+
use zip::ZipArchive;
|
|
10
|
+
|
|
11
|
+
/// Parse container.xml to find the OPF file path
|
|
12
|
+
pub(super) fn parse_container_xml(xml: &str) -> Result<String> {
|
|
13
|
+
match roxmltree::Document::parse(xml) {
|
|
14
|
+
Ok(doc) => {
|
|
15
|
+
for node in doc.descendants() {
|
|
16
|
+
if node.tag_name().name() == "rootfile"
|
|
17
|
+
&& let Some(full_path) = node.attribute("full-path")
|
|
18
|
+
{
|
|
19
|
+
return Ok(full_path.to_string());
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
Err(crate::KreuzbergError::Parsing {
|
|
23
|
+
message: "No rootfile found in container.xml".to_string(),
|
|
24
|
+
source: None,
|
|
25
|
+
})
|
|
26
|
+
}
|
|
27
|
+
Err(e) => Err(crate::KreuzbergError::Parsing {
|
|
28
|
+
message: format!("Failed to parse container.xml: {}", e),
|
|
29
|
+
source: None,
|
|
30
|
+
}),
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
/// Read a file from the ZIP archive
|
|
35
|
+
pub(super) fn read_file_from_zip(archive: &mut ZipArchive<Cursor<Vec<u8>>>, path: &str) -> Result<String> {
|
|
36
|
+
match archive.by_name(path) {
|
|
37
|
+
Ok(mut file) => {
|
|
38
|
+
let mut content = String::new();
|
|
39
|
+
match std::io::Read::read_to_string(&mut file, &mut content) {
|
|
40
|
+
Ok(_) => Ok(content),
|
|
41
|
+
Err(e) => Err(crate::KreuzbergError::Parsing {
|
|
42
|
+
message: format!("Failed to read file from EPUB: {}", e),
|
|
43
|
+
source: None,
|
|
44
|
+
}),
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
Err(e) => Err(crate::KreuzbergError::Parsing {
|
|
48
|
+
message: format!("File not found in EPUB: {} ({})", path, e),
|
|
49
|
+
source: None,
|
|
50
|
+
}),
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
/// Resolve a relative path within the manifest directory
|
|
55
|
+
pub(super) fn resolve_path(base_dir: &str, relative_path: &str) -> String {
|
|
56
|
+
if relative_path.starts_with('/') {
|
|
57
|
+
relative_path.trim_start_matches('/').to_string()
|
|
58
|
+
} else if base_dir.is_empty() || base_dir == "." {
|
|
59
|
+
relative_path.to_string()
|
|
60
|
+
} else {
|
|
61
|
+
format!("{}/{}", base_dir.trim_end_matches('/'), relative_path)
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain relative path is joined onto the manifest directory.
    #[test]
    fn test_resolve_path_with_base_dir() {
        assert_eq!(resolve_path("OEBPS", "chapter.xhtml"), "OEBPS/chapter.xhtml");
    }

    /// A leading slash makes the path archive-rooted, ignoring the base directory.
    #[test]
    fn test_resolve_path_absolute() {
        assert_eq!(resolve_path("OEBPS", "/chapter.xhtml"), "chapter.xhtml");
    }

    /// With an empty base directory the relative path is returned as is.
    #[test]
    fn test_resolve_path_empty_base() {
        assert_eq!(resolve_path("", "chapter.xhtml"), "chapter.xhtml");
    }
}
|
|
@@ -140,6 +140,8 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
140
140
|
detected_languages: None,
|
|
141
141
|
chunks: None,
|
|
142
142
|
images: None,
|
|
143
|
+
djot_content: None,
|
|
144
|
+
elements: None,
|
|
143
145
|
})
|
|
144
146
|
}
|
|
145
147
|
|
|
@@ -184,6 +186,8 @@ impl DocumentExtractor for ExcelExtractor {
|
|
|
184
186
|
detected_languages: None,
|
|
185
187
|
chunks: None,
|
|
186
188
|
images: None,
|
|
189
|
+
djot_content: None,
|
|
190
|
+
elements: None,
|
|
187
191
|
})
|
|
188
192
|
}
|
|
189
193
|
|
|
@@ -0,0 +1,466 @@
|
|
|
1
|
+
//! Shared frontmatter and metadata utilities for markup extractors.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides common functionality for extractors that process
|
|
4
|
+
//! documents with YAML frontmatter (Markdown, Djot, etc.).
|
|
5
|
+
//!
|
|
6
|
+
//! This is a core module used by the Djot extractor (always available) and
|
|
7
|
+
//! the enhanced Markdown extractor (requires `office` feature).
|
|
8
|
+
|
|
9
|
+
use crate::types::Metadata;
|
|
10
|
+
|
|
11
|
+
use serde_yaml_ng::Value as YamlValue;
|
|
12
|
+
|
|
13
|
+
/// Extract YAML frontmatter from document content.
|
|
14
|
+
///
|
|
15
|
+
/// Frontmatter is expected to be delimited by `---` or `...` at the start of the document.
|
|
16
|
+
/// This implementation properly handles edge cases:
|
|
17
|
+
/// - `---` appearing within YAML strings or arrays
|
|
18
|
+
/// - Both `---` and `...` as end delimiters (YAML spec compliant)
|
|
19
|
+
/// - Multiline YAML values containing dashes
|
|
20
|
+
///
|
|
21
|
+
/// Returns a tuple of (parsed YAML value, remaining content after frontmatter).
|
|
22
|
+
///
|
|
23
|
+
/// # Examples
|
|
24
|
+
///
|
|
25
|
+
/// ```rust,ignore
|
|
26
|
+
/// let content = "---\ntitle: Test\n---\n\n# Content";
|
|
27
|
+
/// let (yaml, remaining) = extract_frontmatter(content);
|
|
28
|
+
/// assert!(yaml.is_some());
|
|
29
|
+
/// assert!(remaining.contains("# Content"));
|
|
30
|
+
/// ```
|
|
31
|
+
pub fn extract_frontmatter(content: &str) -> (Option<YamlValue>, String) {
|
|
32
|
+
// Frontmatter must start at the beginning of the document
|
|
33
|
+
if !content.starts_with("---") {
|
|
34
|
+
return (None, content.to_string());
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
// Skip opening delimiter
|
|
38
|
+
let rest = &content[3..];
|
|
39
|
+
|
|
40
|
+
// Find the closing delimiter
|
|
41
|
+
// We need to find "---" or "..." on its own line (not embedded in YAML content)
|
|
42
|
+
// The delimiter must be preceded by a newline and followed by newline or EOF
|
|
43
|
+
let mut end_pos = None;
|
|
44
|
+
let mut search_start = 0;
|
|
45
|
+
|
|
46
|
+
while let Some(pos) = rest[search_start..].find('\n') {
|
|
47
|
+
let absolute_pos = search_start + pos;
|
|
48
|
+
let after_newline = absolute_pos + 1;
|
|
49
|
+
|
|
50
|
+
if after_newline >= rest.len() {
|
|
51
|
+
break;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
// Check if we have "---" or "..." at the start of a line
|
|
55
|
+
let remaining = &rest[after_newline..];
|
|
56
|
+
if remaining.starts_with("---") || remaining.starts_with("...") {
|
|
57
|
+
// Verify it's on its own line (followed by newline or EOF)
|
|
58
|
+
let delimiter_end = after_newline + 3;
|
|
59
|
+
if delimiter_end >= rest.len() || rest.as_bytes()[delimiter_end] == b'\n' {
|
|
60
|
+
end_pos = Some(absolute_pos);
|
|
61
|
+
break;
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
search_start = after_newline;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
if let Some(end) = end_pos {
|
|
69
|
+
let frontmatter_str = &rest[..end];
|
|
70
|
+
// Skip past the closing delimiter and any following newline
|
|
71
|
+
let after_delimiter = end + 1; // Skip the newline before delimiter
|
|
72
|
+
let remaining_start = if after_delimiter + 3 < rest.len() {
|
|
73
|
+
// Skip "---" or "..."
|
|
74
|
+
let after_delim = after_delimiter + 3;
|
|
75
|
+
// Skip trailing newline after delimiter if present
|
|
76
|
+
if after_delim < rest.len() && rest.as_bytes()[after_delim] == b'\n' {
|
|
77
|
+
after_delim + 1
|
|
78
|
+
} else {
|
|
79
|
+
after_delim
|
|
80
|
+
}
|
|
81
|
+
} else {
|
|
82
|
+
rest.len()
|
|
83
|
+
};
|
|
84
|
+
|
|
85
|
+
let remaining = if remaining_start < rest.len() {
|
|
86
|
+
&rest[remaining_start..]
|
|
87
|
+
} else {
|
|
88
|
+
""
|
|
89
|
+
};
|
|
90
|
+
|
|
91
|
+
// Try to parse the frontmatter as YAML
|
|
92
|
+
match serde_yaml_ng::from_str::<YamlValue>(frontmatter_str) {
|
|
93
|
+
Ok(value) => (Some(value), remaining.to_string()),
|
|
94
|
+
Err(_) => (None, content.to_string()),
|
|
95
|
+
}
|
|
96
|
+
} else {
|
|
97
|
+
// No closing delimiter found
|
|
98
|
+
(None, content.to_string())
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/// Extract metadata from YAML frontmatter.
|
|
103
|
+
///
|
|
104
|
+
/// Extracts the following YAML fields into Kreuzberg metadata:
|
|
105
|
+
/// - **Standard fields**: title, author, date, description (as subject)
|
|
106
|
+
/// - **Extended fields**: abstract, subject, category, tags, language, version
|
|
107
|
+
/// - **Array fields** (keywords, tags): converted to comma-separated strings
|
|
108
|
+
///
|
|
109
|
+
/// # Arguments
|
|
110
|
+
///
|
|
111
|
+
/// * `yaml` - The parsed YAML value from frontmatter
|
|
112
|
+
///
|
|
113
|
+
/// # Returns
|
|
114
|
+
///
|
|
115
|
+
/// A `Metadata` struct populated with extracted fields
|
|
116
|
+
///
|
|
117
|
+
/// # Examples
|
|
118
|
+
///
|
|
119
|
+
/// ```rust,ignore
|
|
120
|
+
/// let yaml = serde_yaml_ng::from_str("title: Test\nauthor: John").unwrap();
|
|
121
|
+
/// let metadata = extract_metadata_from_yaml(&yaml);
|
|
122
|
+
/// assert_eq!(metadata.additional.get("title"), Some(&"Test".into()));
|
|
123
|
+
/// ```
|
|
124
|
+
pub fn extract_metadata_from_yaml(yaml: &YamlValue) -> Metadata {
|
|
125
|
+
let mut metadata = Metadata::default();
|
|
126
|
+
|
|
127
|
+
// Title
|
|
128
|
+
if let Some(title) = yaml.get("title").and_then(|v| v.as_str()) {
|
|
129
|
+
metadata.additional.insert("title".to_string(), title.into());
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// Author
|
|
133
|
+
if let Some(author) = yaml.get("author").and_then(|v| v.as_str()) {
|
|
134
|
+
metadata.additional.insert("author".to_string(), author.into());
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Date (map to created_at)
|
|
138
|
+
if let Some(date) = yaml.get("date").and_then(|v| v.as_str()) {
|
|
139
|
+
metadata.created_at = Some(date.to_string());
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
// Keywords (support both string and array)
|
|
143
|
+
if let Some(keywords) = yaml.get("keywords") {
|
|
144
|
+
match keywords {
|
|
145
|
+
YamlValue::String(s) => {
|
|
146
|
+
metadata.additional.insert("keywords".to_string(), s.clone().into());
|
|
147
|
+
}
|
|
148
|
+
YamlValue::Sequence(seq) => {
|
|
149
|
+
let keywords_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
|
|
150
|
+
metadata.additional.insert("keywords".to_string(), keywords_str.into());
|
|
151
|
+
}
|
|
152
|
+
_ => {}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
// Description (map to subject)
|
|
157
|
+
if let Some(description) = yaml.get("description").and_then(|v| v.as_str()) {
|
|
158
|
+
metadata.subject = Some(description.to_string());
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
// Abstract
|
|
162
|
+
if let Some(abstract_text) = yaml.get("abstract").and_then(|v| v.as_str()) {
|
|
163
|
+
metadata.additional.insert("abstract".to_string(), abstract_text.into());
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
// Subject (overrides description if both present)
|
|
167
|
+
if let Some(subject) = yaml.get("subject").and_then(|v| v.as_str()) {
|
|
168
|
+
metadata.subject = Some(subject.to_string());
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
// Category
|
|
172
|
+
if let Some(category) = yaml.get("category").and_then(|v| v.as_str()) {
|
|
173
|
+
metadata.additional.insert("category".to_string(), category.into());
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
// Tags (support both string and array)
|
|
177
|
+
if let Some(tags) = yaml.get("tags") {
|
|
178
|
+
match tags {
|
|
179
|
+
YamlValue::String(s) => {
|
|
180
|
+
metadata.additional.insert("tags".to_string(), s.clone().into());
|
|
181
|
+
}
|
|
182
|
+
YamlValue::Sequence(seq) => {
|
|
183
|
+
let tags_str = seq.iter().filter_map(|v| v.as_str()).collect::<Vec<_>>().join(", ");
|
|
184
|
+
metadata.additional.insert("tags".to_string(), tags_str.into());
|
|
185
|
+
}
|
|
186
|
+
_ => {}
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// Language
|
|
191
|
+
if let Some(language) = yaml.get("language").and_then(|v| v.as_str()) {
|
|
192
|
+
metadata.additional.insert("language".to_string(), language.into());
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
// Version
|
|
196
|
+
if let Some(version) = yaml.get("version").and_then(|v| v.as_str()) {
|
|
197
|
+
metadata.additional.insert("version".to_string(), version.into());
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
metadata
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
/// Use the first level-1 heading of a document as its title.
///
/// Scans the content line by line for the first line that begins with `# `
/// (hash followed by a space) and returns the rest of that line, trimmed.
///
/// # Arguments
///
/// * `content` - The document content to search
///
/// # Returns
///
/// `Some(title)` if such a line exists, `None` otherwise
///
/// # Examples
///
/// ```rust,ignore
/// let content = "# My Document\n\nContent here";
/// assert_eq!(extract_title_from_content(content), Some("My Document".to_string()));
/// ```
pub fn extract_title_from_content(content: &str) -> Option<String> {
    content
        .lines()
        .find_map(|line| line.strip_prefix("# "))
        .map(|heading| heading.trim().to_string())
}
|
|
230
|
+
|
|
231
|
+
/// Render a grid of cell strings as a markdown table.
///
/// The first row is emitted as the header, followed by a `---` separator
/// with one column per header cell, then every remaining row as data. Cell
/// text is written verbatim and rows are not padded to the header width.
///
/// # Arguments
///
/// * `cells` - A 2D array where cells[0] is the header row
///
/// # Returns
///
/// A string containing the markdown-formatted table, or an empty string when
/// `cells` is empty
///
/// # Examples
///
/// ```rust,ignore
/// let cells = vec![
///     vec!["Name".to_string(), "Age".to_string()],
///     vec!["Alice".to_string(), "30".to_string()],
/// ];
/// let markdown = cells_to_markdown(&cells);
/// assert!(markdown.contains("| Name | Age |"));
/// ```
pub fn cells_to_markdown(cells: &[Vec<String>]) -> String {
    /// Append one `| a | b |` line, including its trailing newline.
    fn push_row(table: &mut String, row: &[String]) {
        table.push('|');
        for cell in row {
            table.push(' ');
            table.push_str(cell);
            table.push_str(" |");
        }
        table.push('\n');
    }

    let Some((header, body)) = cells.split_first() else {
        return String::new();
    };

    let mut table = String::new();

    push_row(&mut table, header);

    // Separator row: one ` --- |` per header column.
    table.push('|');
    table.push_str(&" --- |".repeat(header.len()));
    table.push('\n');

    for row in body {
        push_row(&mut table, row);
    }

    table
}
|
|
290
|
+
|
|
291
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A plain `---` delimited block is split off and its title is parsed.
    #[test]
    fn test_frontmatter_basic() {
        let source = "---\ntitle: Test\n---\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(source);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Content"));

        let meta = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = meta.additional.get("title").and_then(|v| v.as_str());
        assert_eq!(title, Some("Test"));
    }

    /// A `---` that appears inside a block scalar must not end the frontmatter.
    #[test]
    fn test_frontmatter_with_dashes_in_content() {
        let source = "---\ntitle: Test\ndescription: |\n This has ---\n in the middle\n---\n\n# Body";
        let (frontmatter, body) = extract_frontmatter(source);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Body"));
    }

    /// `...` is accepted as the closing delimiter.
    #[test]
    fn test_frontmatter_with_dots_terminator() {
        let source = "---\ntitle: Test\nauthor: John\n...\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(source);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Content"));

        let meta = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = meta.additional.get("title").and_then(|v| v.as_str());
        assert_eq!(title, Some("Test"));
    }

    /// A quoted value containing `---` is kept intact.
    #[test]
    fn test_frontmatter_with_triple_dash_in_string() {
        let source = "---\ntitle: \"Before --- After\"\nauthor: John\n---\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(source);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Content"));

        let meta = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = meta.additional.get("title").and_then(|v| v.as_str());
        assert_eq!(title, Some("Before --- After"));
    }

    /// An indented `---` line inside a multiline string is not a delimiter.
    #[test]
    fn test_frontmatter_multiline_string_with_dashes() {
        let source = "---\ntitle: Test\ndescription: |\n Line 1\n ---\n Line 2\n---\n\n# Body";
        let (frontmatter, body) = extract_frontmatter(source);

        assert!(frontmatter.is_some());
        assert!(body.contains("# Body"));

        let meta = extract_metadata_from_yaml(&frontmatter.unwrap());
        let title = meta.additional.get("title").and_then(|v| v.as_str());
        assert_eq!(title, Some("Test"));
    }

    /// Content without an opening delimiter is returned untouched.
    #[test]
    fn test_no_frontmatter() {
        let source = "# Title\n\nContent without frontmatter";
        let (frontmatter, body) = extract_frontmatter(source);

        assert!(frontmatter.is_none());
        assert_eq!(body, source);
    }

    /// An opening delimiter with no closing one yields no frontmatter.
    #[test]
    fn test_incomplete_frontmatter() {
        let source = "---\ntitle: Test\nauthor: John\n\n# Content";
        let (frontmatter, body) = extract_frontmatter(source);

        // The block is never closed, so nothing may be parsed or stripped.
        assert!(frontmatter.is_none());
        assert_eq!(body, source);
    }

    /// The first level-1 heading becomes the title.
    #[test]
    fn test_extract_title_from_content() {
        let found = extract_title_from_content("# My Document\n\nContent here");
        assert_eq!(found.as_deref(), Some("My Document"));
    }

    /// Text without any heading produces no title.
    #[test]
    fn test_extract_title_from_content_no_heading() {
        let found = extract_title_from_content("Content without heading");
        assert!(found.is_none());
    }

    /// Level-2 headings are not treated as titles.
    #[test]
    fn test_extract_title_from_content_level_2() {
        let found = extract_title_from_content("## Subheading\n\nContent");
        assert!(found.is_none());
    }

    /// Header, separator and data rows all appear in the rendered table.
    #[test]
    fn test_cells_to_markdown() {
        let cells: Vec<Vec<String>> = [["Name", "Age"], ["Alice", "30"], ["Bob", "25"]]
            .iter()
            .map(|row| row.iter().map(|cell| cell.to_string()).collect())
            .collect();

        let rendered = cells_to_markdown(&cells);
        let expected_lines = [
            "| Name | Age |",
            "| Alice | 30 |",
            "| Bob | 25 |",
            "| --- | --- |",
        ];
        for expected in expected_lines.iter() {
            assert!(rendered.contains(*expected));
        }
    }

    /// No rows means no output at all.
    #[test]
    fn test_cells_to_markdown_empty() {
        let cells: Vec<Vec<String>> = Vec::new();
        assert!(cells_to_markdown(&cells).is_empty());
    }

    /// A frontmatter document using every recognised key.
    #[test]
    fn test_metadata_from_yaml_all_fields() {
        let document = r#"
title: Test Document
author: John Doe
date: 2024-01-15
keywords:
- rust
- testing
description: A test document
abstract: This is an abstract
subject: Test Subject
category: Documentation
tags:
- tag1
- tag2
language: en
version: 1.0
"#;

        let parsed: YamlValue = serde_yaml_ng::from_str(document).unwrap();
        let meta = extract_metadata_from_yaml(&parsed);

        let title = meta.additional.get("title").and_then(|v| v.as_str());
        assert_eq!(title, Some("Test Document"));

        let author = meta.additional.get("author").and_then(|v| v.as_str());
        assert_eq!(author, Some("John Doe"));

        assert_eq!(meta.created_at.as_deref(), Some("2024-01-15"));
        assert!(meta.additional.contains_key("keywords"));
        assert_eq!(meta.subject.as_deref(), Some("Test Subject"));
        assert!(meta.additional.contains_key("tags"));
    }

    /// Keywords supplied as a single string are stored verbatim.
    #[test]
    fn test_metadata_from_yaml_string_arrays() {
        let document = r#"
keywords: "single, keyword, string"
tags: "tag1, tag2"
"#;

        let parsed: YamlValue = serde_yaml_ng::from_str(document).unwrap();
        let meta = extract_metadata_from_yaml(&parsed);

        let keywords = meta.additional.get("keywords").and_then(|v| v.as_str());
        assert_eq!(keywords, Some("single, keyword, string"));
    }
}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
//! HTML document extractor.
|
|
2
2
|
|
|
3
3
|
use crate::Result;
|
|
4
|
-
use crate::core::config::ExtractionConfig;
|
|
4
|
+
use crate::core::config::{ExtractionConfig, OutputFormat};
|
|
5
5
|
use crate::extractors::SyncExtractor;
|
|
6
6
|
use crate::plugins::{DocumentExtractor, Plugin};
|
|
7
7
|
use crate::text::utf8_validation;
|
|
@@ -202,16 +202,24 @@ impl SyncExtractor for HtmlExtractor {
|
|
|
202
202
|
.map(|s| s.to_string())
|
|
203
203
|
.unwrap_or_else(|_| String::from_utf8_lossy(content).to_string());
|
|
204
204
|
|
|
205
|
-
let (
|
|
206
|
-
|
|
205
|
+
let (content_text, html_metadata) = crate::extraction::html::convert_html_to_markdown_with_metadata(
|
|
206
|
+
&html,
|
|
207
|
+
config.html_options.clone(),
|
|
208
|
+
Some(config.output_format),
|
|
209
|
+
)?;
|
|
207
210
|
|
|
208
|
-
let tables = extract_html_tables(&
|
|
211
|
+
let tables = extract_html_tables(&content_text)?;
|
|
209
212
|
|
|
210
|
-
|
|
213
|
+
// Set mime_type based on actual output format
|
|
214
|
+
let result_mime_type = match config.output_format {
|
|
215
|
+
OutputFormat::Markdown => "text/markdown",
|
|
216
|
+
OutputFormat::Djot => "text/djot",
|
|
217
|
+
_ => mime_type, // Preserve original mime_type for other formats
|
|
218
|
+
};
|
|
211
219
|
|
|
212
220
|
Ok(ExtractionResult {
|
|
213
|
-
content:
|
|
214
|
-
mime_type:
|
|
221
|
+
content: content_text,
|
|
222
|
+
mime_type: result_mime_type.to_string(),
|
|
215
223
|
metadata: Metadata {
|
|
216
224
|
format: html_metadata.map(|m| crate::types::FormatMetadata::Html(Box::new(m))),
|
|
217
225
|
..Default::default()
|
|
@@ -221,6 +229,8 @@ impl SyncExtractor for HtmlExtractor {
|
|
|
221
229
|
detected_languages: None,
|
|
222
230
|
chunks: None,
|
|
223
231
|
images: None,
|
|
232
|
+
djot_content: None,
|
|
233
|
+
elements: None,
|
|
224
234
|
})
|
|
225
235
|
}
|
|
226
236
|
}
|
|
@@ -275,7 +285,7 @@ mod tests {
|
|
|
275
285
|
|
|
276
286
|
/// Helper function to convert HTML to markdown for testing
|
|
277
287
|
fn html_to_markdown_for_test(html: &str) -> String {
|
|
278
|
-
crate::extraction::html::convert_html_to_markdown(html, None).unwrap()
|
|
288
|
+
crate::extraction::html::convert_html_to_markdown(html, None, None).unwrap()
|
|
279
289
|
}
|
|
280
290
|
|
|
281
291
|
#[test]
|
|
@@ -416,4 +426,66 @@ mod tests {
|
|
|
416
426
|
assert_eq!(table.cells[1], vec!["Alice", "30"]);
|
|
417
427
|
assert_eq!(table.cells[2], vec!["Bob", "25"]);
|
|
418
428
|
}
|
|
429
|
+
|
|
430
|
+
#[tokio::test]
|
|
431
|
+
async fn test_html_extractor_with_djot_output() {
|
|
432
|
+
let html = r#"
|
|
433
|
+
<html>
|
|
434
|
+
<body>
|
|
435
|
+
<h1>Test Page</h1>
|
|
436
|
+
<p>Content with <strong>emphasis</strong>.</p>
|
|
437
|
+
</body>
|
|
438
|
+
</html>
|
|
439
|
+
"#;
|
|
440
|
+
|
|
441
|
+
let extractor = HtmlExtractor::new();
|
|
442
|
+
let config = ExtractionConfig {
|
|
443
|
+
output_format: OutputFormat::Djot,
|
|
444
|
+
..Default::default()
|
|
445
|
+
};
|
|
446
|
+
|
|
447
|
+
let result = extractor
|
|
448
|
+
.extract_bytes(html.as_bytes(), "text/html", &config)
|
|
449
|
+
.await
|
|
450
|
+
.unwrap();
|
|
451
|
+
|
|
452
|
+
assert_eq!(result.mime_type, "text/djot");
|
|
453
|
+
assert!(result.content.contains("# Test Page"));
|
|
454
|
+
assert!(result.content.contains("*emphasis*")); // Djot strong syntax
|
|
455
|
+
}
|
|
456
|
+
|
|
457
|
+
#[tokio::test]
|
|
458
|
+
async fn test_html_extractor_djot_double_conversion_prevention() {
|
|
459
|
+
let html = r#"
|
|
460
|
+
<html>
|
|
461
|
+
<body>
|
|
462
|
+
<h1>Test</h1>
|
|
463
|
+
<p>Content with <strong>bold</strong> text.</p>
|
|
464
|
+
</body>
|
|
465
|
+
</html>
|
|
466
|
+
"#;
|
|
467
|
+
|
|
468
|
+
let extractor = HtmlExtractor::new();
|
|
469
|
+
let config = ExtractionConfig {
|
|
470
|
+
output_format: OutputFormat::Djot,
|
|
471
|
+
..Default::default()
|
|
472
|
+
};
|
|
473
|
+
|
|
474
|
+
let result = extractor
|
|
475
|
+
.extract_bytes(html.as_bytes(), "text/html", &config)
|
|
476
|
+
.await
|
|
477
|
+
.unwrap();
|
|
478
|
+
|
|
479
|
+
// Content should already be in djot format
|
|
480
|
+
assert_eq!(result.mime_type, "text/djot");
|
|
481
|
+
let original_content = result.content.clone();
|
|
482
|
+
|
|
483
|
+
// Simulate pipeline format application
|
|
484
|
+
let mut pipeline_result = result.clone();
|
|
485
|
+
crate::core::pipeline::apply_output_format(&mut pipeline_result, OutputFormat::Djot);
|
|
486
|
+
|
|
487
|
+
// Content should be identical - no re-conversion should occur
|
|
488
|
+
assert_eq!(pipeline_result.content, original_content);
|
|
489
|
+
assert_eq!(pipeline_result.mime_type, "text/djot");
|
|
490
|
+
}
|
|
419
491
|
}
|