kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
//! Type definitions for transformation operations.
|
|
2
|
+
|
|
3
|
+
/// Metadata about a detected list item.
///
/// Records which kind of list marker was detected, the byte range the item
/// occupies in the content string, and how deeply the item is indented.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ListItemMetadata {
    /// Type of list (Bullet, Numbered, etc.)
    pub list_type: ListType,
    /// Starting byte offset in the content string
    pub byte_start: usize,
    /// Ending byte offset in the content string
    pub byte_end: usize,
    /// List item indent level
    pub indent_level: u32,
}

/// Type of list detection.
///
/// `Hash` is derived alongside `Eq` so the variant can be used as a key in
/// hash-based collections (for example, to count items per list type).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ListType {
    /// Bullet points (-, *, •, etc.)
    Bullet,
    /// Numbered lists (1., 2., etc.)
    Numbered,
    /// Lettered lists (a., b., A., B., etc.)
    Lettered,
    /// Indented items
    Indented,
}
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
//! Djot attribute parsing utilities.
|
|
2
|
+
//!
|
|
3
|
+
//! Handles parsing of Djot attributes from jotdown events and string syntax.
|
|
4
|
+
|
|
5
|
+
use std::collections::HashMap;
|
|
6
|
+
|
|
7
|
+
/// Parse jotdown attributes into our Attributes representation.
|
|
8
|
+
///
|
|
9
|
+
/// Converts jotdown's internal attribute representation to Kreuzberg's
|
|
10
|
+
/// standardized Attributes struct, handling IDs, classes, and key-value pairs.
|
|
11
|
+
pub fn parse_jotdown_attributes(attrs: &jotdown::Attributes) -> crate::types::Attributes {
|
|
12
|
+
use crate::types::Attributes;
|
|
13
|
+
use jotdown::AttributeKind;
|
|
14
|
+
|
|
15
|
+
let mut id = None;
|
|
16
|
+
let mut classes = Vec::new();
|
|
17
|
+
let mut key_values = HashMap::new();
|
|
18
|
+
|
|
19
|
+
for (kind, value) in attrs.iter() {
|
|
20
|
+
match kind {
|
|
21
|
+
AttributeKind::Id => {
|
|
22
|
+
// Last ID wins if multiple are specified
|
|
23
|
+
id = Some(value.to_string());
|
|
24
|
+
}
|
|
25
|
+
AttributeKind::Class => {
|
|
26
|
+
classes.push(value.to_string());
|
|
27
|
+
}
|
|
28
|
+
AttributeKind::Pair { key } => {
|
|
29
|
+
key_values.insert(key.to_string(), value.to_string());
|
|
30
|
+
}
|
|
31
|
+
AttributeKind::Comment => {
|
|
32
|
+
// Comments are ignored in our representation
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
Attributes {
|
|
38
|
+
id,
|
|
39
|
+
classes,
|
|
40
|
+
key_values,
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/// Parse djot attribute syntax from string: {.class #id key="value"}
|
|
45
|
+
#[allow(dead_code)]
|
|
46
|
+
pub fn parse_djot_attributes(attr_str: &str) -> crate::types::Attributes {
|
|
47
|
+
use crate::types::Attributes;
|
|
48
|
+
|
|
49
|
+
let mut attrs = Attributes {
|
|
50
|
+
id: None,
|
|
51
|
+
classes: Vec::new(),
|
|
52
|
+
key_values: HashMap::new(),
|
|
53
|
+
};
|
|
54
|
+
|
|
55
|
+
// Simple parser for attribute syntax
|
|
56
|
+
let tokens = attr_str.split_whitespace();
|
|
57
|
+
|
|
58
|
+
for token in tokens {
|
|
59
|
+
if let Some(class) = token.strip_prefix('.') {
|
|
60
|
+
// Class
|
|
61
|
+
attrs.classes.push(class.to_string());
|
|
62
|
+
} else if let Some(id) = token.strip_prefix('#') {
|
|
63
|
+
// ID
|
|
64
|
+
attrs.id = Some(id.to_string());
|
|
65
|
+
} else if token.contains('=') {
|
|
66
|
+
// Key-value pair
|
|
67
|
+
if let Some((key, value)) = token.split_once('=') {
|
|
68
|
+
let clean_value = value.trim_matches('"').trim_matches('\'');
|
|
69
|
+
attrs.key_values.insert(key.to_string(), clean_value.to_string());
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
attrs
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/// Render attributes to djot attribute syntax.
|
|
78
|
+
///
|
|
79
|
+
/// Converts Kreuzberg's Attributes struct back to djot attribute syntax:
|
|
80
|
+
/// {.class #id key="value"}
|
|
81
|
+
pub fn render_attributes(attrs: &crate::types::Attributes) -> String {
|
|
82
|
+
let mut parts = Vec::new();
|
|
83
|
+
|
|
84
|
+
if let Some(ref id) = attrs.id {
|
|
85
|
+
parts.push(format!("#{}", id));
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
for class in &attrs.classes {
|
|
89
|
+
parts.push(format!(".{}", class));
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
for (key, value) in &attrs.key_values {
|
|
93
|
+
parts.push(format!("{}=\"{}\"", key, value));
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
if parts.is_empty() {
|
|
97
|
+
String::new()
|
|
98
|
+
} else {
|
|
99
|
+
format!("{{{}}}", parts.join(" "))
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// With a single key/value pair the rendered string is fully determined,
    /// so pin it exactly instead of only checking for substrings.
    #[test]
    fn test_render_attributes_with_all_parts() {
        let attrs = crate::types::Attributes {
            id: Some("my-id".to_string()),
            classes: vec!["class1".to_string(), "class2".to_string()],
            key_values: HashMap::from([("data-test".to_string(), "value".to_string())]),
        };

        let rendered = render_attributes(&attrs);
        assert_eq!(rendered, "{#my-id .class1 .class2 data-test=\"value\"}");
    }

    #[test]
    fn test_render_attributes_empty() {
        let attrs = crate::types::Attributes {
            id: None,
            classes: vec![],
            key_values: HashMap::new(),
        };

        let rendered = render_attributes(&attrs);
        assert_eq!(rendered, "");
    }

    /// Covers the three token kinds the string parser recognises.
    #[test]
    fn test_parse_djot_attributes_basic_tokens() {
        let attrs = parse_djot_attributes(".note #intro lang=en title=\"Hello\"");

        assert_eq!(attrs.id.as_deref(), Some("intro"));
        assert_eq!(attrs.classes, vec!["note".to_string()]);
        assert_eq!(attrs.key_values.get("lang").map(String::as_str), Some("en"));
        assert_eq!(attrs.key_values.get("title").map(String::as_str), Some("Hello"));
        assert_eq!(attrs.key_values.len(), 2);
    }
}
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
//! Djot content conversion and HTML rendering APIs.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides public APIs for converting between different representations:
|
|
4
|
+
//! - DjotContent to djot markup
|
|
5
|
+
//! - ExtractionResult to djot markup
|
|
6
|
+
//! - Djot markup to HTML
|
|
7
|
+
|
|
8
|
+
use super::rendering::render_block_to_djot;
|
|
9
|
+
use jotdown::Parser;
|
|
10
|
+
|
|
11
|
+
/// Convert DjotContent back to djot markup.
|
|
12
|
+
///
|
|
13
|
+
/// This function takes a `DjotContent` structure and generates valid djot markup
|
|
14
|
+
/// from it, preserving:
|
|
15
|
+
/// - Block structure (headings, code blocks, lists, blockquotes, etc.)
|
|
16
|
+
/// - Inline formatting (strong, emphasis, highlight, subscript, superscript, etc.)
|
|
17
|
+
/// - Attributes where present ({.class #id key="value"})
|
|
18
|
+
///
|
|
19
|
+
/// # Arguments
|
|
20
|
+
///
|
|
21
|
+
/// * `content` - The DjotContent to convert
|
|
22
|
+
///
|
|
23
|
+
/// # Returns
|
|
24
|
+
///
|
|
25
|
+
/// A String containing valid djot markup
|
|
26
|
+
///
|
|
27
|
+
/// # Example
|
|
28
|
+
///
|
|
29
|
+
/// ```ignore
|
|
30
|
+
/// let djot_content = // ... extract from some source
|
|
31
|
+
/// let markup = djot_content_to_djot(&djot_content);
|
|
32
|
+
/// println!("{}", markup);
|
|
33
|
+
/// ```
|
|
34
|
+
pub fn djot_content_to_djot(content: &crate::types::DjotContent) -> String {
|
|
35
|
+
let mut output = String::new();
|
|
36
|
+
|
|
37
|
+
for block in &content.blocks {
|
|
38
|
+
render_block_to_djot(&mut output, block, 0);
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
output
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/// Convert any ExtractionResult to djot format.
|
|
45
|
+
///
|
|
46
|
+
/// This function converts an `ExtractionResult` to djot markup:
|
|
47
|
+
/// - If `djot_content` is `Some`, uses `djot_content_to_djot` for full fidelity conversion
|
|
48
|
+
/// - Otherwise, wraps the plain text content in paragraphs
|
|
49
|
+
///
|
|
50
|
+
/// # Arguments
|
|
51
|
+
///
|
|
52
|
+
/// * `result` - The ExtractionResult to convert
|
|
53
|
+
///
|
|
54
|
+
/// # Returns
|
|
55
|
+
///
|
|
56
|
+
/// A `Result` containing the djot markup string
|
|
57
|
+
///
|
|
58
|
+
/// # Example
|
|
59
|
+
///
|
|
60
|
+
/// ```ignore
|
|
61
|
+
/// let result = extractor.extract_bytes(bytes, "text/plain", &config).await?;
|
|
62
|
+
/// let djot_markup = extraction_result_to_djot(&result)?;
|
|
63
|
+
/// ```
|
|
64
|
+
pub fn extraction_result_to_djot(result: &crate::types::ExtractionResult) -> crate::Result<String> {
|
|
65
|
+
if let Some(ref djot_content) = result.djot_content {
|
|
66
|
+
Ok(djot_content_to_djot(djot_content))
|
|
67
|
+
} else {
|
|
68
|
+
// Convert plain text to basic djot paragraphs
|
|
69
|
+
let mut output = String::new();
|
|
70
|
+
|
|
71
|
+
// Split content by double newlines to create paragraphs
|
|
72
|
+
let paragraphs: Vec<&str> = result.content.split("\n\n").collect();
|
|
73
|
+
|
|
74
|
+
for para in paragraphs {
|
|
75
|
+
let trimmed = para.trim();
|
|
76
|
+
if !trimmed.is_empty() {
|
|
77
|
+
output.push_str(trimmed);
|
|
78
|
+
output.push_str("\n\n");
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
Ok(output)
|
|
83
|
+
}
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
/// Render djot content to HTML.
|
|
87
|
+
///
|
|
88
|
+
/// This function takes djot source text and renders it to HTML using jotdown's
|
|
89
|
+
/// built-in HTML renderer.
|
|
90
|
+
///
|
|
91
|
+
/// # Arguments
|
|
92
|
+
///
|
|
93
|
+
/// * `djot_source` - The djot markup text to render
|
|
94
|
+
///
|
|
95
|
+
/// # Returns
|
|
96
|
+
///
|
|
97
|
+
/// A `Result` containing the rendered HTML string
|
|
98
|
+
///
|
|
99
|
+
/// # Example
|
|
100
|
+
///
|
|
101
|
+
/// ```ignore
|
|
102
|
+
/// let djot = "# Hello\n\nThis is *bold* and _italic_.";
|
|
103
|
+
/// let html = djot_to_html(djot)?;
|
|
104
|
+
/// assert!(html.contains("<h1>"));
|
|
105
|
+
/// assert!(html.contains("<strong>"));
|
|
106
|
+
/// assert!(html.contains("<em>"));
|
|
107
|
+
/// ```
|
|
108
|
+
pub fn djot_to_html(djot_source: &str) -> crate::Result<String> {
|
|
109
|
+
let parser = Parser::new(djot_source);
|
|
110
|
+
let html = jotdown::html::render_to_string(parser);
|
|
111
|
+
Ok(html)
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
#[cfg(test)]
|
|
115
|
+
mod tests {
|
|
116
|
+
use super::*;
|
|
117
|
+
use crate::types::{BlockType, DjotContent, ExtractionResult, FormattedBlock, InlineElement, InlineType, Metadata};
|
|
118
|
+
|
|
119
|
+
#[test]
|
|
120
|
+
fn test_djot_content_to_djot_heading() {
|
|
121
|
+
let content = DjotContent {
|
|
122
|
+
plain_text: "Test Heading".to_string(),
|
|
123
|
+
blocks: vec![FormattedBlock {
|
|
124
|
+
block_type: BlockType::Heading,
|
|
125
|
+
level: Some(1),
|
|
126
|
+
inline_content: vec![InlineElement {
|
|
127
|
+
element_type: InlineType::Text,
|
|
128
|
+
content: "Test Heading".to_string(),
|
|
129
|
+
attributes: None,
|
|
130
|
+
metadata: None,
|
|
131
|
+
}],
|
|
132
|
+
attributes: None,
|
|
133
|
+
language: None,
|
|
134
|
+
code: None,
|
|
135
|
+
children: vec![],
|
|
136
|
+
}],
|
|
137
|
+
metadata: Metadata::default(),
|
|
138
|
+
tables: vec![],
|
|
139
|
+
images: vec![],
|
|
140
|
+
links: vec![],
|
|
141
|
+
footnotes: vec![],
|
|
142
|
+
attributes: Default::default(),
|
|
143
|
+
};
|
|
144
|
+
|
|
145
|
+
let markup = djot_content_to_djot(&content);
|
|
146
|
+
assert!(markup.contains("# Test Heading"));
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
#[test]
|
|
150
|
+
fn test_extraction_result_to_djot_with_djot_content() {
|
|
151
|
+
let result = ExtractionResult {
|
|
152
|
+
content: "Test content".to_string(),
|
|
153
|
+
mime_type: "text/djot".to_string(),
|
|
154
|
+
metadata: Metadata::default(),
|
|
155
|
+
tables: vec![],
|
|
156
|
+
detected_languages: None,
|
|
157
|
+
chunks: None,
|
|
158
|
+
images: None,
|
|
159
|
+
pages: None,
|
|
160
|
+
djot_content: Some(DjotContent {
|
|
161
|
+
plain_text: "Test content".to_string(),
|
|
162
|
+
blocks: vec![FormattedBlock {
|
|
163
|
+
block_type: BlockType::Paragraph,
|
|
164
|
+
level: None,
|
|
165
|
+
inline_content: vec![InlineElement {
|
|
166
|
+
element_type: InlineType::Text,
|
|
167
|
+
content: "Test content".to_string(),
|
|
168
|
+
attributes: None,
|
|
169
|
+
metadata: None,
|
|
170
|
+
}],
|
|
171
|
+
attributes: None,
|
|
172
|
+
language: None,
|
|
173
|
+
code: None,
|
|
174
|
+
children: vec![],
|
|
175
|
+
}],
|
|
176
|
+
metadata: Metadata::default(),
|
|
177
|
+
tables: vec![],
|
|
178
|
+
images: vec![],
|
|
179
|
+
links: vec![],
|
|
180
|
+
footnotes: vec![],
|
|
181
|
+
attributes: Default::default(),
|
|
182
|
+
}),
|
|
183
|
+
elements: None,
|
|
184
|
+
};
|
|
185
|
+
|
|
186
|
+
let markup = extraction_result_to_djot(&result).expect("Should convert");
|
|
187
|
+
assert!(markup.contains("Test content"));
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
#[test]
|
|
191
|
+
fn test_extraction_result_to_djot_without_djot_content() {
|
|
192
|
+
let result = ExtractionResult {
|
|
193
|
+
content: "Paragraph one\n\nParagraph two".to_string(),
|
|
194
|
+
mime_type: "text/plain".to_string(),
|
|
195
|
+
metadata: Metadata::default(),
|
|
196
|
+
tables: vec![],
|
|
197
|
+
detected_languages: None,
|
|
198
|
+
chunks: None,
|
|
199
|
+
images: None,
|
|
200
|
+
pages: None,
|
|
201
|
+
djot_content: None,
|
|
202
|
+
elements: None,
|
|
203
|
+
};
|
|
204
|
+
|
|
205
|
+
let markup = extraction_result_to_djot(&result).expect("Should convert");
|
|
206
|
+
assert!(markup.contains("Paragraph one"));
|
|
207
|
+
assert!(markup.contains("Paragraph two"));
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
/// A level-one Djot heading must render as an `<h1>` element (either letter case).
#[test]
fn test_djot_to_html_heading() {
    let rendered = djot_to_html("# Hello").expect("Should render");
    let has_h1 = ["<h1>", "<H1>"].iter().any(|tag| rendered.contains(*tag));
    assert!(has_h1);
}
|
|
216
|
+
|
|
217
|
+
/// Djot `*bold*` must render as `<strong>` and `_italic_` as `<em>`.
#[test]
fn test_djot_to_html_formatting() {
    let djot = "This is *bold* and _italic_.";
    let html = djot_to_html(djot).expect("Should render");
    // Check each tag on its own: joining the checks with `||` would let the
    // test pass even when one of the two inline styles failed to render.
    assert!(html.contains("<strong>"), "missing <strong> in: {html}");
    assert!(html.contains("<em>"), "missing <em> in: {html}");
}
|
|
223
|
+
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
//! Djot document extractor with plugin integration.
|
|
2
|
+
//!
|
|
3
|
+
//! Implements the DocumentExtractor and Plugin traits for Djot markup files.
|
|
4
|
+
|
|
5
|
+
use super::parsing::{extract_complete_djot_content, extract_tables_from_events, extract_text_from_events};
|
|
6
|
+
use crate::Result;
|
|
7
|
+
use crate::core::config::ExtractionConfig;
|
|
8
|
+
use crate::plugins::{DocumentExtractor, Plugin};
|
|
9
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
10
|
+
use async_trait::async_trait;
|
|
11
|
+
use jotdown::{Event, Parser};
|
|
12
|
+
|
|
13
|
+
/// Extractor for Djot markup documents.
///
/// Handles Djot input that may begin with YAML frontmatter and produces:
/// - metadata read from the frontmatter
/// - the plain-text content
/// - tables as structured data
/// - document structure (headings, links, code blocks)
#[derive(Clone, Debug)]
pub struct DjotExtractor;

impl DjotExtractor {
    /// Build a Djot extractor; the type is a field-less unit struct.
    pub fn new() -> Self {
        DjotExtractor
    }
}

impl Default for DjotExtractor {
    /// Equivalent to [`DjotExtractor::new`].
    fn default() -> Self {
        DjotExtractor::new()
    }
}
|
|
35
|
+
|
|
36
|
+
impl Plugin for DjotExtractor {
|
|
37
|
+
fn name(&self) -> &str {
|
|
38
|
+
"djot-extractor"
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
fn version(&self) -> String {
|
|
42
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
fn initialize(&self) -> Result<()> {
|
|
46
|
+
Ok(())
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
fn shutdown(&self) -> Result<()> {
|
|
50
|
+
Ok(())
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
fn description(&self) -> &str {
|
|
54
|
+
"Extracts content from Djot markup files with YAML frontmatter and table support"
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
fn author(&self) -> &str {
|
|
58
|
+
"Kreuzberg Team"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
#[async_trait]
|
|
63
|
+
impl DocumentExtractor for DjotExtractor {
|
|
64
|
+
#[cfg_attr(
|
|
65
|
+
feature = "otel",
|
|
66
|
+
tracing::instrument(
|
|
67
|
+
skip(self, content, _config),
|
|
68
|
+
fields(
|
|
69
|
+
extractor.name = self.name(),
|
|
70
|
+
content.size_bytes = content.len(),
|
|
71
|
+
)
|
|
72
|
+
)
|
|
73
|
+
)]
|
|
74
|
+
async fn extract_bytes(
|
|
75
|
+
&self,
|
|
76
|
+
content: &[u8],
|
|
77
|
+
mime_type: &str,
|
|
78
|
+
_config: &ExtractionConfig,
|
|
79
|
+
) -> Result<ExtractionResult> {
|
|
80
|
+
let text = String::from_utf8_lossy(content).into_owned();
|
|
81
|
+
|
|
82
|
+
let (yaml, remaining_content) = crate::extractors::frontmatter_utils::extract_frontmatter(&text);
|
|
83
|
+
|
|
84
|
+
let mut metadata = if let Some(ref yaml_value) = yaml {
|
|
85
|
+
crate::extractors::frontmatter_utils::extract_metadata_from_yaml(yaml_value)
|
|
86
|
+
} else {
|
|
87
|
+
Metadata::default()
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
if !metadata.additional.contains_key("title")
|
|
91
|
+
&& let Some(title) = crate::extractors::frontmatter_utils::extract_title_from_content(&remaining_content)
|
|
92
|
+
{
|
|
93
|
+
metadata.additional.insert("title".to_string(), title.into());
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
// Parse with jotdown and collect events once for extraction
|
|
97
|
+
let parser = Parser::new(&remaining_content);
|
|
98
|
+
let events: Vec<Event> = parser.collect();
|
|
99
|
+
|
|
100
|
+
let extracted_text = extract_text_from_events(&events);
|
|
101
|
+
let tables = extract_tables_from_events(&events);
|
|
102
|
+
|
|
103
|
+
// Extract complete djot content with all features
|
|
104
|
+
let djot_content = extract_complete_djot_content(&events, metadata.clone(), tables.clone());
|
|
105
|
+
|
|
106
|
+
Ok(ExtractionResult {
|
|
107
|
+
content: extracted_text,
|
|
108
|
+
mime_type: mime_type.to_string(),
|
|
109
|
+
metadata,
|
|
110
|
+
tables,
|
|
111
|
+
detected_languages: None,
|
|
112
|
+
chunks: None,
|
|
113
|
+
images: None,
|
|
114
|
+
pages: None,
|
|
115
|
+
djot_content: Some(djot_content),
|
|
116
|
+
elements: None,
|
|
117
|
+
})
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
121
|
+
&["text/djot", "text/x-djot"]
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
fn priority(&self) -> i32 {
|
|
125
|
+
50
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// A fresh extractor reports the expected plugin name.
    #[test]
    fn test_djot_extractor_creation() {
        let djot = DjotExtractor::new();
        assert_eq!(djot.name(), "djot-extractor");
    }

    /// Both Djot MIME spellings are listed as supported.
    #[test]
    fn test_can_extract_djot_mime_types() {
        let djot = DjotExtractor::new();
        let advertised = djot.supported_mime_types();

        for mime in ["text/djot", "text/x-djot"] {
            assert!(advertised.contains(&mime));
        }
    }

    /// Author, version and description are all populated.
    #[test]
    fn test_plugin_interface() {
        let djot = DjotExtractor::new();
        assert_eq!(djot.author(), "Kreuzberg Team");
        assert!(!djot.version().is_empty());
        assert!(!djot.description().is_empty());
    }

    /// Headings, paragraph text and inline-styled words all reach the extracted content.
    #[tokio::test]
    async fn test_extract_simple_djot() {
        let source =
            b"# Header\n\nThis is a paragraph with *bold* and _italic_ text.\n\n## Subheading\n\nMore content here.";
        let djot = DjotExtractor::new();
        let config = ExtractionConfig::default();

        let outcome = djot.extract_bytes(source, "text/djot", &config).await;
        assert!(outcome.is_ok());

        let extracted = outcome.unwrap();
        for fragment in ["Header", "This is a paragraph", "bold", "italic"] {
            assert!(extracted.content.contains(fragment));
        }
    }
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
//! Djot markup format extractor and utilities.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides:
|
|
4
|
+
//! - Djot parsing using the jotdown crate
|
|
5
|
+
//! - YAML frontmatter metadata extraction (same as Markdown)
|
|
6
|
+
//! - Table extraction as structured data
|
|
7
|
+
//! - Heading structure preservation
|
|
8
|
+
//! - Code block and link extraction
|
|
9
|
+
//! - Djot content rendering and conversion APIs
|
|
10
|
+
//!
|
|
11
|
+
//! Djot is a modern markup language with simpler parsing rules than CommonMark.
|
|
12
|
+
//! See <https://djot.net> for the specification.
|
|
13
|
+
//!
|
|
14
|
+
//! Requires the `djot` feature.
|
|
15
|
+
|
|
16
|
+
pub mod attributes;
|
|
17
|
+
pub mod conversion;
|
|
18
|
+
pub mod extractor;
|
|
19
|
+
pub mod parsing;
|
|
20
|
+
pub mod rendering;
|
|
21
|
+
|
|
22
|
+
// Re-export public API
|
|
23
|
+
pub use conversion::{djot_content_to_djot, djot_to_html, extraction_result_to_djot};
|
|
24
|
+
pub use extractor::DjotExtractor;
|