kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -41,6 +41,7 @@ impl PptxExtractor {
|
|
|
41
41
|
|
|
42
42
|
let ocr_config = config.ocr.as_ref().unwrap();
|
|
43
43
|
let tess_config = ocr_config.tesseract_config.as_ref().cloned().unwrap_or_default();
|
|
44
|
+
let output_format = config.output_format;
|
|
44
45
|
|
|
45
46
|
for image in &mut images {
|
|
46
47
|
let image_data = image.data.clone();
|
|
@@ -53,7 +54,7 @@ impl PptxExtractor {
|
|
|
53
54
|
|
|
54
55
|
let proc = OcrProcessor::new(cache_dir)?;
|
|
55
56
|
let ocr_tess_config: crate::ocr::types::TesseractConfig = (&tess_config_clone).into();
|
|
56
|
-
proc.
|
|
57
|
+
proc.process_image_with_format(&image_data, &ocr_tess_config, output_format)
|
|
57
58
|
})
|
|
58
59
|
.await
|
|
59
60
|
.map_err(|e| crate::KreuzbergError::Ocr {
|
|
@@ -65,13 +66,15 @@ impl PptxExtractor {
|
|
|
65
66
|
Ok(ocr_extraction) => {
|
|
66
67
|
let extraction_result = ExtractionResult {
|
|
67
68
|
content: ocr_extraction.content,
|
|
68
|
-
mime_type:
|
|
69
|
+
mime_type: ocr_extraction.mime_type,
|
|
69
70
|
metadata: Metadata::default(),
|
|
70
71
|
tables: vec![],
|
|
71
72
|
detected_languages: None,
|
|
72
73
|
chunks: None,
|
|
73
74
|
images: None,
|
|
75
|
+
djot_content: None,
|
|
74
76
|
pages: None,
|
|
77
|
+
elements: None,
|
|
75
78
|
};
|
|
76
79
|
image.ocr_result = Some(Box::new(extraction_result));
|
|
77
80
|
}
|
|
@@ -178,6 +181,8 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
178
181
|
detected_languages: None,
|
|
179
182
|
chunks: None,
|
|
180
183
|
images,
|
|
184
|
+
djot_content: None,
|
|
185
|
+
elements: None,
|
|
181
186
|
})
|
|
182
187
|
}
|
|
183
188
|
|
|
@@ -241,6 +246,8 @@ impl DocumentExtractor for PptxExtractor {
|
|
|
241
246
|
detected_languages: None,
|
|
242
247
|
chunks: None,
|
|
243
248
|
images,
|
|
249
|
+
djot_content: None,
|
|
250
|
+
elements: None,
|
|
244
251
|
})
|
|
245
252
|
}
|
|
246
253
|
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
//! Character encoding utilities for RTF parsing.
|
|
2
|
+
//!
|
|
3
|
+
//! Provides hex byte parsing and Windows-1252 character mapping for the 0x80-0x9F range.
|
|
4
|
+
|
|
5
|
+
/// Map a single hexadecimal digit to the number it denotes.
///
/// Accepts `0`-`9`, `a`-`f` and `A`-`F`. Any other character yields `None`.
#[inline]
pub fn hex_digit_to_u8(c: char) -> Option<u8> {
    // `to_digit(16)` recognises exactly the ASCII hex digits and never yields
    // more than 15, so narrowing to `u8` cannot lose information.
    c.to_digit(16).map(|digit| digit as u8)
}
|
|
17
|
+
|
|
18
|
+
/// Parse a hex-encoded byte from two characters.
|
|
19
|
+
///
|
|
20
|
+
/// Returns the decoded byte if both characters are valid hex digits.
|
|
21
|
+
#[inline]
|
|
22
|
+
pub fn parse_hex_byte(h1: char, h2: char) -> Option<u8> {
|
|
23
|
+
let high = hex_digit_to_u8(h1)?;
|
|
24
|
+
let low = hex_digit_to_u8(h2)?;
|
|
25
|
+
Some((high << 4) | low)
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
/// Translate one Windows-1252 byte into a Unicode character.
///
/// Only the 0x80-0x9F window differs from a direct byte-to-code-point
/// conversion, so that window is resolved through a lookup table. The five
/// slots without a mapping here (0x81, 0x8D, 0x8F, 0x90, 0x9D) come back as
/// `'?'`. Every byte outside the window is returned as the character with the
/// same code point.
#[inline]
pub fn decode_windows_1252(byte: u8) -> char {
    // Replacements for bytes 0x80..=0x9F, indexed by `byte - 0x80`.
    const HIGH_WINDOW: [char; 32] = [
        '\u{20AC}', // 0x80 Euro sign
        '?',        // 0x81 (no mapping)
        '\u{201A}', // 0x82 Single low-9 quotation mark
        '\u{0192}', // 0x83 Latin small letter f with hook
        '\u{201E}', // 0x84 Double low-9 quotation mark
        '\u{2026}', // 0x85 Horizontal ellipsis
        '\u{2020}', // 0x86 Dagger
        '\u{2021}', // 0x87 Double dagger
        '\u{02C6}', // 0x88 Modifier letter circumflex accent
        '\u{2030}', // 0x89 Per mille sign
        '\u{0160}', // 0x8A Latin capital letter S with caron
        '\u{2039}', // 0x8B Single left-pointing angle quotation mark
        '\u{0152}', // 0x8C Latin capital ligature OE
        '?',        // 0x8D (no mapping)
        '\u{017D}', // 0x8E Latin capital letter Z with caron
        '?',        // 0x8F (no mapping)
        '?',        // 0x90 (no mapping)
        '\u{2018}', // 0x91 Left single quotation mark
        '\u{2019}', // 0x92 Right single quotation mark
        '\u{201C}', // 0x93 Left double quotation mark
        '\u{201D}', // 0x94 Right double quotation mark
        '\u{2022}', // 0x95 Bullet
        '\u{2013}', // 0x96 En dash
        '\u{2014}', // 0x97 Em dash
        '\u{02DC}', // 0x98 Small tilde
        '\u{2122}', // 0x99 Trade mark sign
        '\u{0161}', // 0x9A Latin small letter s with caron
        '\u{203A}', // 0x9B Single right-pointing angle quotation mark
        '\u{0153}', // 0x9C Latin small ligature oe
        '?',        // 0x9D (no mapping)
        '\u{017E}', // 0x9E Latin small letter z with caron
        '\u{0178}', // 0x9F Latin capital letter Y with diaeresis
    ];

    match byte {
        0x80..=0x9F => HIGH_WINDOW[usize::from(byte - 0x80)],
        _ => char::from(byte),
    }
}
|
|
71
|
+
|
|
72
|
+
/// Parse an RTF control word and its optional numeric parameter.
///
/// The introducing backslash must already have been consumed by the caller.
/// Characters are then taken from `chars` in three steps:
///
/// 1. a run of ASCII letters forms the control word (it may be empty),
/// 2. a single `-` directly after the word is consumed as a sign,
/// 3. a run of ASCII digits forms the parameter.
///
/// The first character that fits none of these steps is left in the iterator
/// for the caller; this includes a delimiting space.
///
/// Returns a tuple of `(control_word, optional_numeric_value)`. The value is
/// `None` when no digits follow the word; a lone `-` is still consumed in
/// that case. A digit run that does not fit in an `i32` yields `Some(0)`.
pub fn parse_rtf_control_word(chars: &mut std::iter::Peekable<std::str::Chars>) -> (String, Option<i32>) {
    let mut word = String::new();

    // Control words are made of ASCII letters only. A non-ASCII letter that
    // directly follows (raw document text) must terminate the word rather
    // than be absorbed into it, so the Unicode-aware test is not used here.
    while let Some(letter) = chars.next_if(|c| c.is_ascii_alphabetic()) {
        word.push(letter);
    }

    // Optional sign in front of the numeric parameter.
    let is_negative = chars.next_if_eq(&'-').is_some();

    let mut digits = String::new();
    while let Some(digit) = chars.next_if(|c| c.is_ascii_digit()) {
        digits.push(digit);
    }

    let num_value = if digits.is_empty() {
        None
    } else {
        // `digits` holds ASCII digits only, so the parse can fail solely on
        // overflow; that case collapses to 0.
        let magnitude = digits.parse::<i32>().unwrap_or(0);
        Some(if is_negative { -magnitude } else { magnitude })
    };

    (word, num_value)
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
//! Text formatting utilities for RTF content.
|
|
2
|
+
|
|
3
|
+
/// Squeeze every run of whitespace in `s` down to one ASCII space.
///
/// Leading and trailing whitespace is removed entirely, so the result is the
/// whitespace-separated words of `s` joined by single spaces. An input with
/// no non-whitespace characters produces an empty string.
pub fn normalize_whitespace(s: &str) -> String {
    let mut joined = String::with_capacity(s.len());

    for word in s.split_whitespace() {
        // A separator goes in front of every word except the first.
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(word);
    }

    joined
}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
//! Image metadata extraction from RTF documents.
|
|
2
|
+
|
|
3
|
+
use crate::extractors::rtf::encoding::parse_rtf_control_word;
|
|
4
|
+
|
|
5
|
+
/// Extract image metadata from within a \pict group.
|
|
6
|
+
///
|
|
7
|
+
/// Looks for image type (jpegblip, pngblip, etc.) and dimensions.
|
|
8
|
+
pub fn extract_image_metadata(chars: &mut std::iter::Peekable<std::str::Chars>) -> String {
|
|
9
|
+
let mut metadata = String::new();
|
|
10
|
+
let mut image_type: Option<&str> = None;
|
|
11
|
+
let mut width_goal: Option<i32> = None;
|
|
12
|
+
let mut height_goal: Option<i32> = None;
|
|
13
|
+
let mut depth = 0;
|
|
14
|
+
|
|
15
|
+
while let Some(&ch) = chars.peek() {
|
|
16
|
+
match ch {
|
|
17
|
+
'{' => {
|
|
18
|
+
depth += 1;
|
|
19
|
+
chars.next();
|
|
20
|
+
}
|
|
21
|
+
'}' => {
|
|
22
|
+
if depth == 0 {
|
|
23
|
+
break;
|
|
24
|
+
}
|
|
25
|
+
depth -= 1;
|
|
26
|
+
chars.next();
|
|
27
|
+
}
|
|
28
|
+
'\\' => {
|
|
29
|
+
chars.next();
|
|
30
|
+
let (control_word, value) = parse_rtf_control_word(chars);
|
|
31
|
+
|
|
32
|
+
match control_word.as_str() {
|
|
33
|
+
"jpegblip" => image_type = Some("jpg"),
|
|
34
|
+
"pngblip" => image_type = Some("png"),
|
|
35
|
+
"wmetafile" => image_type = Some("wmf"),
|
|
36
|
+
"dibitmap" => image_type = Some("bmp"),
|
|
37
|
+
"picwgoal" => width_goal = value,
|
|
38
|
+
"pichgoal" => height_goal = value,
|
|
39
|
+
"bin" => break,
|
|
40
|
+
_ => {}
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
' ' => {
|
|
44
|
+
chars.next();
|
|
45
|
+
}
|
|
46
|
+
_ => {
|
|
47
|
+
chars.next();
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
if let Some(itype) = image_type {
|
|
53
|
+
metadata.push_str("image.");
|
|
54
|
+
metadata.push_str(itype);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
if let Some(width) = width_goal {
|
|
58
|
+
let width_inches = f64::from(width) / 1440.0;
|
|
59
|
+
metadata.push_str(&format!(" width=\"{:.1}in\"", width_inches));
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
if let Some(height) = height_goal {
|
|
63
|
+
let height_inches = f64::from(height) / 1440.0;
|
|
64
|
+
metadata.push_str(&format!(" height=\"{:.1}in\"", height_inches));
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
if metadata.is_empty() {
|
|
68
|
+
metadata.push_str("image.jpg");
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
metadata
|
|
72
|
+
}
|
|
@@ -0,0 +1,216 @@
|
|
|
1
|
+
//! Metadata extraction from RTF documents.
|
|
2
|
+
|
|
3
|
+
use crate::extractors::rtf::encoding::parse_rtf_control_word;
|
|
4
|
+
use serde_json::Value;
|
|
5
|
+
use std::collections::HashMap;
|
|
6
|
+
|
|
7
|
+
/// Parse a `{\\creatim ...}` or `{\\revtim ...}` RTF info block into ISO 8601 format.
|
|
8
|
+
pub fn parse_rtf_datetime(segment: &str) -> Option<String> {
|
|
9
|
+
let mut year: Option<i32> = None;
|
|
10
|
+
let mut month: Option<i32> = None;
|
|
11
|
+
let mut day: Option<i32> = None;
|
|
12
|
+
let mut hour: Option<i32> = None;
|
|
13
|
+
let mut minute: Option<i32> = None;
|
|
14
|
+
|
|
15
|
+
let mut chars = segment.chars().peekable();
|
|
16
|
+
while let Some(&ch) = chars.peek() {
|
|
17
|
+
if ch != '\\' {
|
|
18
|
+
chars.next();
|
|
19
|
+
continue;
|
|
20
|
+
}
|
|
21
|
+
chars.next();
|
|
22
|
+
let (word, value) = parse_rtf_control_word(&mut chars);
|
|
23
|
+
if let Some(v) = value {
|
|
24
|
+
match word.as_str() {
|
|
25
|
+
"yr" => year = Some(v),
|
|
26
|
+
"mo" => month = Some(v),
|
|
27
|
+
"dy" => day = Some(v),
|
|
28
|
+
"hr" => hour = Some(v),
|
|
29
|
+
"min" => minute = Some(v),
|
|
30
|
+
_ => {}
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
let year = year?;
|
|
36
|
+
let month = month.unwrap_or(1).max(1) as u32;
|
|
37
|
+
let day = day.unwrap_or(1).max(1) as u32;
|
|
38
|
+
let hour = hour.unwrap_or(0).max(0) as u32;
|
|
39
|
+
let minute = minute.unwrap_or(0).max(0) as u32;
|
|
40
|
+
|
|
41
|
+
Some(format!(
|
|
42
|
+
"{:04}-{:02}-{:02}T{:02}:{:02}:00Z",
|
|
43
|
+
year, month, day, hour, minute
|
|
44
|
+
))
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/// Extract metadata from the RTF `\\info` block and augment with computed statistics.
|
|
48
|
+
pub fn extract_rtf_metadata(rtf_content: &str, extracted_text: &str) -> HashMap<String, Value> {
|
|
49
|
+
let mut metadata: HashMap<String, Value> = HashMap::new();
|
|
50
|
+
|
|
51
|
+
if let Some(start) = rtf_content.find("{\\info") {
|
|
52
|
+
let slice = &rtf_content[start..];
|
|
53
|
+
let mut depth = 0usize;
|
|
54
|
+
let mut end_offset: Option<usize> = None;
|
|
55
|
+
|
|
56
|
+
for (idx, ch) in slice.char_indices() {
|
|
57
|
+
match ch {
|
|
58
|
+
'{' => depth += 1,
|
|
59
|
+
'}' => {
|
|
60
|
+
if depth == 0 {
|
|
61
|
+
break;
|
|
62
|
+
}
|
|
63
|
+
depth -= 1;
|
|
64
|
+
if depth == 0 {
|
|
65
|
+
end_offset = Some(idx + 1);
|
|
66
|
+
break;
|
|
67
|
+
}
|
|
68
|
+
}
|
|
69
|
+
_ => {}
|
|
70
|
+
}
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
let info_block = end_offset.map(|end| &slice[..end]).unwrap_or(slice);
|
|
74
|
+
|
|
75
|
+
let mut segments: Vec<String> = Vec::new();
|
|
76
|
+
let mut seg_depth = 0usize;
|
|
77
|
+
let mut current = String::new();
|
|
78
|
+
let mut in_segment = false;
|
|
79
|
+
|
|
80
|
+
for ch in info_block.chars() {
|
|
81
|
+
if ch == '{' {
|
|
82
|
+
seg_depth += 1;
|
|
83
|
+
if seg_depth == 2 {
|
|
84
|
+
in_segment = true;
|
|
85
|
+
current.clear();
|
|
86
|
+
continue;
|
|
87
|
+
}
|
|
88
|
+
} else if ch == '}' {
|
|
89
|
+
if seg_depth == 2 && in_segment {
|
|
90
|
+
segments.push(current.clone());
|
|
91
|
+
in_segment = false;
|
|
92
|
+
}
|
|
93
|
+
seg_depth = seg_depth.saturating_sub(1);
|
|
94
|
+
continue;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
if in_segment {
|
|
98
|
+
current.push(ch);
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
for segment in segments {
|
|
103
|
+
if !segment.starts_with('\\') {
|
|
104
|
+
continue;
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
let cleaned_segment = if segment.starts_with("\\*\\") {
|
|
108
|
+
segment.replacen("\\*\\", "\\", 1)
|
|
109
|
+
} else {
|
|
110
|
+
segment.clone()
|
|
111
|
+
};
|
|
112
|
+
|
|
113
|
+
let mut chars = cleaned_segment.chars().peekable();
|
|
114
|
+
chars.next();
|
|
115
|
+
let (keyword, numeric) = parse_rtf_control_word(&mut chars);
|
|
116
|
+
let remaining: String = chars.collect();
|
|
117
|
+
let trimmed = remaining.trim();
|
|
118
|
+
|
|
119
|
+
match keyword.as_str() {
|
|
120
|
+
"author" => {
|
|
121
|
+
if !trimmed.is_empty() {
|
|
122
|
+
let author = trimmed.to_string();
|
|
123
|
+
metadata.insert("created_by".to_string(), Value::String(author.clone()));
|
|
124
|
+
metadata.insert("authors".to_string(), Value::Array(vec![Value::String(author)]));
|
|
125
|
+
}
|
|
126
|
+
}
|
|
127
|
+
"operator" => {
|
|
128
|
+
if !trimmed.is_empty() {
|
|
129
|
+
metadata.insert("modified_by".to_string(), Value::String(trimmed.to_string()));
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
"title" => {
|
|
133
|
+
if !trimmed.is_empty() {
|
|
134
|
+
metadata.insert("title".to_string(), Value::String(trimmed.to_string()));
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
"subject" => {
|
|
138
|
+
if !trimmed.is_empty() {
|
|
139
|
+
metadata.insert("subject".to_string(), Value::String(trimmed.to_string()));
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
"generator" => {
|
|
143
|
+
if !trimmed.is_empty() {
|
|
144
|
+
metadata.insert("generator".to_string(), Value::String(trimmed.to_string()));
|
|
145
|
+
}
|
|
146
|
+
}
|
|
147
|
+
"creatim" => {
|
|
148
|
+
if let Some(dt) = parse_rtf_datetime(trimmed) {
|
|
149
|
+
metadata.insert("created_at".to_string(), Value::String(dt));
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
"revtim" => {
|
|
153
|
+
if let Some(dt) = parse_rtf_datetime(trimmed) {
|
|
154
|
+
metadata.insert("modified_at".to_string(), Value::String(dt));
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
"version" => {
|
|
158
|
+
if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
|
|
159
|
+
metadata.insert("revision".to_string(), Value::String(val.to_string()));
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
"nofpages" => {
|
|
163
|
+
if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
|
|
164
|
+
metadata.insert("page_count".to_string(), Value::Number(val.into()));
|
|
165
|
+
}
|
|
166
|
+
}
|
|
167
|
+
"nofwords" => {
|
|
168
|
+
if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
|
|
169
|
+
metadata.insert("word_count".to_string(), Value::Number(val.into()));
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
"nofchars" => {
|
|
173
|
+
if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
|
|
174
|
+
metadata.insert("character_count".to_string(), Value::Number(val.into()));
|
|
175
|
+
}
|
|
176
|
+
}
|
|
177
|
+
"lines" => {
|
|
178
|
+
if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
|
|
179
|
+
metadata.insert("line_count".to_string(), Value::Number(val.into()));
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
"paragraphs" => {
|
|
183
|
+
if let Some(val) = numeric.or_else(|| trimmed.parse::<i32>().ok()) {
|
|
184
|
+
metadata.insert("paragraph_count".to_string(), Value::Number(val.into()));
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
_ => {}
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
let cleaned_text = extracted_text.trim();
|
|
193
|
+
if !cleaned_text.is_empty() {
|
|
194
|
+
let word_count = cleaned_text.split_whitespace().count() as i64;
|
|
195
|
+
metadata
|
|
196
|
+
.entry("word_count".to_string())
|
|
197
|
+
.or_insert(Value::Number(word_count.into()));
|
|
198
|
+
|
|
199
|
+
let character_count = cleaned_text.chars().count() as i64;
|
|
200
|
+
metadata
|
|
201
|
+
.entry("character_count".to_string())
|
|
202
|
+
.or_insert(Value::Number(character_count.into()));
|
|
203
|
+
|
|
204
|
+
let line_count = cleaned_text.lines().count() as i64;
|
|
205
|
+
metadata
|
|
206
|
+
.entry("line_count".to_string())
|
|
207
|
+
.or_insert(Value::Number(line_count.into()));
|
|
208
|
+
|
|
209
|
+
let paragraph_count = cleaned_text.split("\n\n").filter(|p| !p.trim().is_empty()).count() as i64;
|
|
210
|
+
metadata
|
|
211
|
+
.entry("paragraph_count".to_string())
|
|
212
|
+
.or_insert(Value::Number(paragraph_count.into()));
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
metadata
|
|
216
|
+
}
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
//! RTF (Rich Text Format) extractor.
|
|
2
|
+
//!
|
|
3
|
+
//! Supports: Rich Text Format (.rtf)
|
|
4
|
+
//!
|
|
5
|
+
//! This native Rust extractor provides text extraction from RTF documents with:
|
|
6
|
+
//! - Character encoding support (Windows-1252 for 0x80-0x9F range)
|
|
7
|
+
//! - Common RTF control words (paragraph breaks, tabs, bullets, quotes, dashes)
|
|
8
|
+
//! - Unicode escape sequences
|
|
9
|
+
//! - Image metadata extraction
|
|
10
|
+
//! - Whitespace normalization
|
|
11
|
+
|
|
12
|
+
mod encoding;
|
|
13
|
+
mod formatting;
|
|
14
|
+
mod images;
|
|
15
|
+
mod metadata;
|
|
16
|
+
mod parser;
|
|
17
|
+
mod tables;
|
|
18
|
+
|
|
19
|
+
// Re-export public functions for backward compatibility
|
|
20
|
+
pub use encoding::{hex_digit_to_u8, parse_hex_byte, parse_rtf_control_word};
|
|
21
|
+
pub use formatting::normalize_whitespace;
|
|
22
|
+
pub use images::extract_image_metadata;
|
|
23
|
+
pub use metadata::{extract_rtf_metadata, parse_rtf_datetime};
|
|
24
|
+
pub use parser::extract_text_from_rtf;
|
|
25
|
+
|
|
26
|
+
use crate::Result;
|
|
27
|
+
use crate::core::config::ExtractionConfig;
|
|
28
|
+
use crate::plugins::{DocumentExtractor, Plugin};
|
|
29
|
+
use crate::types::{ExtractionResult, Metadata};
|
|
30
|
+
use async_trait::async_trait;
|
|
31
|
+
|
|
32
|
+
/// Native Rust RTF extractor.
///
/// Extracts text content, metadata, and structure from RTF documents.
///
/// The type is a stateless unit struct, so it is free to copy and its
/// `Default` value is identical to [`RtfExtractor::new`].
#[derive(Debug, Clone, Copy, Default)]
pub struct RtfExtractor;

impl RtfExtractor {
    /// Create a new RTF extractor.
    #[must_use]
    pub const fn new() -> Self {
        Self
    }
}
|
|
49
|
+
|
|
50
|
+
impl Plugin for RtfExtractor {
|
|
51
|
+
fn name(&self) -> &str {
|
|
52
|
+
"rtf-extractor"
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
fn version(&self) -> String {
|
|
56
|
+
env!("CARGO_PKG_VERSION").to_string()
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
fn initialize(&self) -> Result<()> {
|
|
60
|
+
Ok(())
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
fn shutdown(&self) -> Result<()> {
|
|
64
|
+
Ok(())
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
fn description(&self) -> &str {
|
|
68
|
+
"Extracts content from RTF (Rich Text Format) files with native Rust parsing"
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
fn author(&self) -> &str {
|
|
72
|
+
"Kreuzberg Team"
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
#[async_trait]
|
|
77
|
+
impl DocumentExtractor for RtfExtractor {
|
|
78
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
79
|
+
skip(self, content, _config),
|
|
80
|
+
fields(
|
|
81
|
+
extractor.name = self.name(),
|
|
82
|
+
content.size_bytes = content.len(),
|
|
83
|
+
)
|
|
84
|
+
))]
|
|
85
|
+
async fn extract_bytes(
|
|
86
|
+
&self,
|
|
87
|
+
content: &[u8],
|
|
88
|
+
mime_type: &str,
|
|
89
|
+
_config: &ExtractionConfig,
|
|
90
|
+
) -> Result<ExtractionResult> {
|
|
91
|
+
let rtf_content = String::from_utf8_lossy(content);
|
|
92
|
+
|
|
93
|
+
let (extracted_text, tables) = extract_text_from_rtf(&rtf_content);
|
|
94
|
+
let metadata_map = extract_rtf_metadata(&rtf_content, &extracted_text);
|
|
95
|
+
|
|
96
|
+
Ok(ExtractionResult {
|
|
97
|
+
content: extracted_text,
|
|
98
|
+
mime_type: mime_type.to_string(),
|
|
99
|
+
metadata: Metadata {
|
|
100
|
+
additional: metadata_map,
|
|
101
|
+
..Default::default()
|
|
102
|
+
},
|
|
103
|
+
pages: None,
|
|
104
|
+
tables,
|
|
105
|
+
detected_languages: None,
|
|
106
|
+
chunks: None,
|
|
107
|
+
images: None,
|
|
108
|
+
djot_content: None,
|
|
109
|
+
elements: None,
|
|
110
|
+
})
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
fn supported_mime_types(&self) -> &[&str] {
|
|
114
|
+
&["application/rtf", "text/rtf"]
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
fn priority(&self) -> i32 {
|
|
118
|
+
50
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// The plugin-facing accessors report the expected static values.
    #[tokio::test]
    async fn test_rtf_extractor_plugin_interface() {
        let plugin = RtfExtractor::new();

        assert_eq!(plugin.name(), "rtf-extractor");
        assert_eq!(plugin.version(), env!("CARGO_PKG_VERSION"));
        assert_eq!(plugin.priority(), 50);

        let mime_types = plugin.supported_mime_types();
        assert!(mime_types.contains(&"application/rtf"));
    }

    /// A minimal document yields at least one of its two words.
    #[test]
    fn test_simple_rtf_extraction() {
        let _extractor = RtfExtractor;

        let document = r#"{\rtf1 Hello World}"#;
        let (text, _tables) = extract_text_from_rtf(document);

        let has_hello = text.contains("Hello");
        let has_world = text.contains("World");
        assert!(has_hello || has_world);
    }
}
|