kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
//! Byte array extraction operations.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles extraction from in-memory byte arrays, including:
|
|
4
|
+
//! - MIME type validation
|
|
5
|
+
//! - Legacy format conversion (DOC, PPT)
|
|
6
|
+
//! - Extraction pipeline orchestration
|
|
7
|
+
|
|
8
|
+
#[cfg(not(feature = "office"))]
|
|
9
|
+
use crate::KreuzbergError;
|
|
10
|
+
use crate::Result;
|
|
11
|
+
use crate::core::config::ExtractionConfig;
|
|
12
|
+
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
|
|
13
|
+
#[cfg(feature = "office")]
|
|
14
|
+
use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
|
|
15
|
+
use crate::types::ExtractionResult;
|
|
16
|
+
|
|
17
|
+
#[cfg(feature = "office")]
|
|
18
|
+
use super::file::apply_libreoffice_metadata;
|
|
19
|
+
use super::file::extract_bytes_with_extractor;
|
|
20
|
+
#[cfg(feature = "otel")]
|
|
21
|
+
use super::file::record_error;
|
|
22
|
+
|
|
23
|
+
/// Extract content from a byte array.
|
|
24
|
+
///
|
|
25
|
+
/// This is the main entry point for in-memory extraction. It performs the following steps:
|
|
26
|
+
/// 1. Validate MIME type
|
|
27
|
+
/// 2. Handle legacy format conversion if needed
|
|
28
|
+
/// 3. Select appropriate extractor from registry
|
|
29
|
+
/// 4. Extract content
|
|
30
|
+
/// 5. Run post-processing pipeline
|
|
31
|
+
///
|
|
32
|
+
/// # Arguments
|
|
33
|
+
///
|
|
34
|
+
/// * `content` - The byte array to extract
|
|
35
|
+
/// * `mime_type` - MIME type of the content
|
|
36
|
+
/// * `config` - Extraction configuration
|
|
37
|
+
///
|
|
38
|
+
/// # Returns
|
|
39
|
+
///
|
|
40
|
+
/// An `ExtractionResult` containing the extracted content and metadata.
|
|
41
|
+
///
|
|
42
|
+
/// # Errors
|
|
43
|
+
///
|
|
44
|
+
/// Returns `KreuzbergError::Validation` if MIME type is invalid.
|
|
45
|
+
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|
46
|
+
///
|
|
47
|
+
/// # Example
|
|
48
|
+
///
|
|
49
|
+
/// ```rust,no_run
|
|
50
|
+
/// use kreuzberg::core::extractor::extract_bytes;
|
|
51
|
+
/// use kreuzberg::core::config::ExtractionConfig;
|
|
52
|
+
///
|
|
53
|
+
/// # async fn example() -> kreuzberg::Result<()> {
|
|
54
|
+
/// let config = ExtractionConfig::default();
|
|
55
|
+
/// let bytes = b"Hello, world!";
|
|
56
|
+
/// let result = extract_bytes(bytes, "text/plain", &config).await?;
|
|
57
|
+
/// println!("Content: {}", result.content);
|
|
58
|
+
/// # Ok(())
|
|
59
|
+
/// # }
|
|
60
|
+
/// ```
|
|
61
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
62
|
+
skip(config, content),
|
|
63
|
+
fields(
|
|
64
|
+
extraction.mime_type = mime_type,
|
|
65
|
+
extraction.size_bytes = content.len(),
|
|
66
|
+
)
|
|
67
|
+
))]
|
|
68
|
+
pub async fn extract_bytes(content: &[u8], mime_type: &str, config: &ExtractionConfig) -> Result<ExtractionResult> {
|
|
69
|
+
use crate::core::mime;
|
|
70
|
+
|
|
71
|
+
let result = async {
|
|
72
|
+
let validated_mime = mime::validate_mime_type(mime_type)?;
|
|
73
|
+
|
|
74
|
+
match validated_mime.as_str() {
|
|
75
|
+
#[cfg(feature = "office")]
|
|
76
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
77
|
+
let conversion = convert_doc_to_docx(content).await?;
|
|
78
|
+
let mut result =
|
|
79
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
80
|
+
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
81
|
+
return Ok(result);
|
|
82
|
+
}
|
|
83
|
+
#[cfg(not(feature = "office"))]
|
|
84
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
85
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
86
|
+
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
87
|
+
));
|
|
88
|
+
}
|
|
89
|
+
#[cfg(feature = "office")]
|
|
90
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
91
|
+
let conversion = convert_ppt_to_pptx(content).await?;
|
|
92
|
+
let mut result =
|
|
93
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
94
|
+
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
95
|
+
return Ok(result);
|
|
96
|
+
}
|
|
97
|
+
#[cfg(not(feature = "office"))]
|
|
98
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
99
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
100
|
+
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
101
|
+
));
|
|
102
|
+
}
|
|
103
|
+
_ => {}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
extract_bytes_with_extractor(content, &validated_mime, config).await
|
|
107
|
+
}
|
|
108
|
+
.await;
|
|
109
|
+
|
|
110
|
+
#[cfg(feature = "otel")]
|
|
111
|
+
if let Err(ref e) = result {
|
|
112
|
+
record_error(e);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
result
|
|
116
|
+
}
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
//! File-based extraction operations.
|
|
2
|
+
//!
|
|
3
|
+
//! This module handles extraction from filesystem paths, including:
|
|
4
|
+
//! - MIME type detection and validation
|
|
5
|
+
//! - Legacy format conversion (DOC, PPT)
|
|
6
|
+
//! - File validation and reading
|
|
7
|
+
//! - Extraction pipeline orchestration
|
|
8
|
+
|
|
9
|
+
#[cfg(any(feature = "otel", not(feature = "office")))]
|
|
10
|
+
use crate::KreuzbergError;
|
|
11
|
+
use crate::Result;
|
|
12
|
+
use crate::core::config::ExtractionConfig;
|
|
13
|
+
use crate::core::mime::{LEGACY_POWERPOINT_MIME_TYPE, LEGACY_WORD_MIME_TYPE};
|
|
14
|
+
#[cfg(feature = "office")]
|
|
15
|
+
use crate::extraction::libreoffice::{convert_doc_to_docx, convert_ppt_to_pptx};
|
|
16
|
+
use crate::types::ExtractionResult;
|
|
17
|
+
#[cfg(feature = "office")]
|
|
18
|
+
use crate::types::LibreOfficeConversionResult;
|
|
19
|
+
#[cfg(feature = "office")]
|
|
20
|
+
use serde_json::json;
|
|
21
|
+
use std::path::Path;
|
|
22
|
+
|
|
23
|
+
#[cfg(feature = "office")]
|
|
24
|
+
use super::helpers::pool_mime_type;
|
|
25
|
+
|
|
26
|
+
use super::helpers::get_extractor;
|
|
27
|
+
|
|
28
|
+
/// Sanitize a file path to return only the filename.
|
|
29
|
+
///
|
|
30
|
+
/// This function extracts the filename from a path to avoid recording
|
|
31
|
+
/// potentially sensitive full file paths in telemetry data.
|
|
32
|
+
///
|
|
33
|
+
/// # Arguments
|
|
34
|
+
///
|
|
35
|
+
/// * `path` - The path to sanitize
|
|
36
|
+
///
|
|
37
|
+
/// # Returns
|
|
38
|
+
///
|
|
39
|
+
/// The filename as a string, or "unknown" if extraction fails
|
|
40
|
+
///
|
|
41
|
+
/// # Security
|
|
42
|
+
///
|
|
43
|
+
/// This prevents PII (personally identifiable information) from appearing in
|
|
44
|
+
/// traces by only recording filenames instead of full paths.
|
|
45
|
+
///
|
|
46
|
+
/// # Example
|
|
47
|
+
///
|
|
48
|
+
/// ```rust,ignore
|
|
49
|
+
/// let path = Path::new("/home/user/documents/secret.pdf");
|
|
50
|
+
/// assert_eq!(sanitize_path(path), "secret.pdf");
|
|
51
|
+
/// ```
|
|
52
|
+
#[cfg(feature = "otel")]
|
|
53
|
+
pub(super) fn sanitize_path(path: &Path) -> String {
|
|
54
|
+
path.file_name()
|
|
55
|
+
.and_then(|n| n.to_str())
|
|
56
|
+
.unwrap_or("unknown")
|
|
57
|
+
.to_string()
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
/// Attach error details to the span that is current when this is called.
///
/// Three fields are written: `otel.status_code` (always `"ERROR"`),
/// `error.type` (the error's `Debug` rendering) and `error.message` (the
/// error's `Display` rendering).
///
/// Note: `tracing::Span::record` only stores values for fields the span
/// declared when it was created; any other field name is ignored. Spans that
/// should carry these details therefore need to declare all three fields,
/// e.g. as `tracing::field::Empty`.
///
/// # Arguments
///
/// * `error` - The error to record in the span
///
/// # Example
///
/// ```rust,ignore
/// let result = extract_file("doc.pdf", None, &config).await;
/// #[cfg(feature = "otel")]
/// if let Err(ref e) = result {
///     record_error(e);
/// }
/// result
/// ```
#[cfg(feature = "otel")]
pub(in crate::core::extractor) fn record_error(error: &KreuzbergError) {
    let current = tracing::Span::current();

    // Render both representations up front, then record them in one place.
    let debug_repr = format!("{:?}", error);
    let message = error.to_string();

    current.record("otel.status_code", "ERROR");
    current.record("error.type", debug_repr);
    current.record("error.message", message);
}
|
|
86
|
+
|
|
87
|
+
/// Extract content from a file.
|
|
88
|
+
///
|
|
89
|
+
/// This is the main entry point for file-based extraction. It performs the following steps:
|
|
90
|
+
/// 1. Check cache for existing result (if caching enabled)
|
|
91
|
+
/// 2. Detect or validate MIME type
|
|
92
|
+
/// 3. Select appropriate extractor from registry
|
|
93
|
+
/// 4. Extract content
|
|
94
|
+
/// 5. Run post-processing pipeline
|
|
95
|
+
/// 6. Store result in cache (if caching enabled)
|
|
96
|
+
///
|
|
97
|
+
/// # Arguments
|
|
98
|
+
///
|
|
99
|
+
/// * `path` - Path to the file to extract
|
|
100
|
+
/// * `mime_type` - Optional MIME type override. If None, will be auto-detected
|
|
101
|
+
/// * `config` - Extraction configuration
|
|
102
|
+
///
|
|
103
|
+
/// # Returns
|
|
104
|
+
///
|
|
105
|
+
/// An `ExtractionResult` containing the extracted content and metadata.
|
|
106
|
+
///
|
|
107
|
+
/// # Errors
|
|
108
|
+
///
|
|
109
|
+
/// Returns `KreuzbergError::Validation` if the file doesn't exist or path is invalid.
|
|
110
|
+
/// Returns `KreuzbergError::UnsupportedFormat` if MIME type is not supported.
|
|
111
|
+
/// Returns `KreuzbergError::Io` for file I/O errors (these always bubble up).
|
|
112
|
+
///
|
|
113
|
+
/// # Example
|
|
114
|
+
///
|
|
115
|
+
/// ```rust,no_run
|
|
116
|
+
/// use kreuzberg::core::extractor::extract_file;
|
|
117
|
+
/// use kreuzberg::core::config::ExtractionConfig;
|
|
118
|
+
///
|
|
119
|
+
/// # async fn example() -> kreuzberg::Result<()> {
|
|
120
|
+
/// let config = ExtractionConfig::default();
|
|
121
|
+
/// let result = extract_file("document.pdf", None, &config).await?;
|
|
122
|
+
/// println!("Content: {}", result.content);
|
|
123
|
+
/// # Ok(())
|
|
124
|
+
/// # }
|
|
125
|
+
/// ```
|
|
126
|
+
#[cfg_attr(feature = "otel", tracing::instrument(
|
|
127
|
+
skip(config, path),
|
|
128
|
+
fields(
|
|
129
|
+
extraction.filename = tracing::field::Empty,
|
|
130
|
+
)
|
|
131
|
+
))]
|
|
132
|
+
pub async fn extract_file(
|
|
133
|
+
path: impl AsRef<Path>,
|
|
134
|
+
mime_type: Option<&str>,
|
|
135
|
+
config: &ExtractionConfig,
|
|
136
|
+
) -> Result<ExtractionResult> {
|
|
137
|
+
use crate::core::{io, mime};
|
|
138
|
+
|
|
139
|
+
let path = path.as_ref();
|
|
140
|
+
|
|
141
|
+
#[cfg(feature = "otel")]
|
|
142
|
+
{
|
|
143
|
+
let span = tracing::Span::current();
|
|
144
|
+
span.record("extraction.filename", sanitize_path(path));
|
|
145
|
+
}
|
|
146
|
+
|
|
147
|
+
let result = async {
|
|
148
|
+
io::validate_file_exists(path)?;
|
|
149
|
+
|
|
150
|
+
let detected_mime = mime::detect_or_validate(Some(path), mime_type)?;
|
|
151
|
+
|
|
152
|
+
match detected_mime.as_str() {
|
|
153
|
+
#[cfg(feature = "office")]
|
|
154
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
155
|
+
let original_bytes = tokio::fs::read(path).await?;
|
|
156
|
+
let conversion = convert_doc_to_docx(&original_bytes).await?;
|
|
157
|
+
let mut result =
|
|
158
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
159
|
+
apply_libreoffice_metadata(&mut result, LEGACY_WORD_MIME_TYPE, &conversion);
|
|
160
|
+
return Ok(result);
|
|
161
|
+
}
|
|
162
|
+
#[cfg(not(feature = "office"))]
|
|
163
|
+
LEGACY_WORD_MIME_TYPE => {
|
|
164
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
165
|
+
"Legacy Word conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
166
|
+
));
|
|
167
|
+
}
|
|
168
|
+
#[cfg(feature = "office")]
|
|
169
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
170
|
+
let original_bytes = tokio::fs::read(path).await?;
|
|
171
|
+
let conversion = convert_ppt_to_pptx(&original_bytes).await?;
|
|
172
|
+
let mut result =
|
|
173
|
+
extract_bytes_with_extractor(&conversion.converted_bytes, &conversion.target_mime, config).await?;
|
|
174
|
+
apply_libreoffice_metadata(&mut result, LEGACY_POWERPOINT_MIME_TYPE, &conversion);
|
|
175
|
+
return Ok(result);
|
|
176
|
+
}
|
|
177
|
+
#[cfg(not(feature = "office"))]
|
|
178
|
+
LEGACY_POWERPOINT_MIME_TYPE => {
|
|
179
|
+
return Err(KreuzbergError::UnsupportedFormat(
|
|
180
|
+
"Legacy PowerPoint conversion requires the `office` feature or LibreOffice support".to_string(),
|
|
181
|
+
));
|
|
182
|
+
}
|
|
183
|
+
_ => {}
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
extract_file_with_extractor(path, &detected_mime, config).await
|
|
187
|
+
}
|
|
188
|
+
.await;
|
|
189
|
+
|
|
190
|
+
#[cfg(feature = "otel")]
|
|
191
|
+
if let Err(ref e) = result {
|
|
192
|
+
record_error(e);
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
result
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
pub(in crate::core::extractor) async fn extract_file_with_extractor(
|
|
199
|
+
path: &Path,
|
|
200
|
+
mime_type: &str,
|
|
201
|
+
config: &ExtractionConfig,
|
|
202
|
+
) -> Result<ExtractionResult> {
|
|
203
|
+
crate::extractors::ensure_initialized()?;
|
|
204
|
+
|
|
205
|
+
let extractor = get_extractor(mime_type)?;
|
|
206
|
+
let mut result = extractor.extract_file(path, mime_type, config).await?;
|
|
207
|
+
result = crate::core::pipeline::run_pipeline(result, config).await?;
|
|
208
|
+
Ok(result)
|
|
209
|
+
}
|
|
210
|
+
|
|
211
|
+
pub(in crate::core::extractor) async fn extract_bytes_with_extractor(
|
|
212
|
+
content: &[u8],
|
|
213
|
+
mime_type: &str,
|
|
214
|
+
config: &ExtractionConfig,
|
|
215
|
+
) -> Result<ExtractionResult> {
|
|
216
|
+
crate::extractors::ensure_initialized()?;
|
|
217
|
+
|
|
218
|
+
let extractor = get_extractor(mime_type)?;
|
|
219
|
+
let mut result = extractor.extract_bytes(content, mime_type, config).await?;
|
|
220
|
+
result = crate::core::pipeline::run_pipeline(result, config).await?;
|
|
221
|
+
Ok(result)
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
/// Mark `result` as originating from a LibreOffice conversion.
///
/// Overwrites `result.mime_type` with the pooled form of `legacy_mime` and
/// stores a `"libreoffice_conversion"` entry in `result.metadata.additional`
/// describing the conversion (converter name, original format, target format
/// and target MIME type).
///
/// # Arguments
///
/// * `result` - Extraction result to annotate in place
/// * `legacy_mime` - MIME type to report on the result
/// * `conversion` - Conversion outcome whose format fields are recorded
#[cfg(feature = "office")]
pub(in crate::core::extractor) fn apply_libreoffice_metadata(
    result: &mut ExtractionResult,
    legacy_mime: &str,
    conversion: &LibreOfficeConversionResult,
) {
    result.mime_type = pool_mime_type(legacy_mime);

    let conversion_details = json!({
        "converter": "libreoffice",
        "original_format": conversion.original_format,
        "target_format": conversion.target_format,
        "target_mime": conversion.target_mime,
    });

    // Any value previously stored under this key is replaced.
    result
        .metadata
        .additional
        .insert("libreoffice_conversion".to_string(), conversion_details);
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
//! Helper functions and utilities for extraction operations.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides shared utilities used across extraction modules.
|
|
4
|
+
|
|
5
|
+
use crate::plugins::DocumentExtractor;
|
|
6
|
+
#[cfg(feature = "office")]
|
|
7
|
+
use crate::utils::intern_mime_type;
|
|
8
|
+
use crate::utils::{PoolSizeHint, estimate_pool_size};
|
|
9
|
+
use crate::{KreuzbergError, Result};
|
|
10
|
+
use std::sync::Arc;
|
|
11
|
+
|
|
12
|
+
/// Get an extractor from the registry.
|
|
13
|
+
///
|
|
14
|
+
/// This function acquires the registry read lock and retrieves the appropriate
|
|
15
|
+
/// extractor for the given MIME type.
|
|
16
|
+
///
|
|
17
|
+
/// # Performance
|
|
18
|
+
///
|
|
19
|
+
/// RwLock read + HashMap lookup is ~100ns, fast enough without caching.
|
|
20
|
+
/// Removed thread-local cache to avoid Tokio work-stealing scheduler issues.
|
|
21
|
+
pub(in crate::core::extractor) fn get_extractor(mime_type: &str) -> Result<Arc<dyn DocumentExtractor>> {
|
|
22
|
+
let registry = crate::plugins::registry::get_document_extractor_registry();
|
|
23
|
+
let registry_read = registry
|
|
24
|
+
.read()
|
|
25
|
+
.map_err(|e| KreuzbergError::Other(format!("Document extractor registry lock poisoned: {}", e)))?;
|
|
26
|
+
registry_read.get(mime_type)
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/// Get optimal pool sizing hint for a document.
|
|
30
|
+
///
|
|
31
|
+
/// This function calculates recommended pool sizes based on the document's
|
|
32
|
+
/// file size and MIME type. The hint can be used to create appropriately
|
|
33
|
+
/// sized thread pools for extraction, reducing memory waste from over-allocation.
|
|
34
|
+
///
|
|
35
|
+
/// # Arguments
|
|
36
|
+
///
|
|
37
|
+
/// * `file_size` - The size of the file in bytes
|
|
38
|
+
/// * `mime_type` - The MIME type of the document
|
|
39
|
+
///
|
|
40
|
+
/// # Returns
|
|
41
|
+
///
|
|
42
|
+
/// A `PoolSizeHint` with recommended pool configurations
|
|
43
|
+
///
|
|
44
|
+
/// # Example
|
|
45
|
+
///
|
|
46
|
+
/// ```rust,ignore
|
|
47
|
+
/// use kreuzberg::core::extractor::get_pool_sizing_hint;
|
|
48
|
+
///
|
|
49
|
+
/// let hint = get_pool_sizing_hint(5_000_000, "application/pdf");
|
|
50
|
+
/// println!("Recommended string buffers: {}", hint.string_buffer_count);
|
|
51
|
+
/// ```
|
|
52
|
+
#[inline]
|
|
53
|
+
pub fn get_pool_sizing_hint(file_size: u64, mime_type: &str) -> PoolSizeHint {
|
|
54
|
+
estimate_pool_size(file_size, mime_type)
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
/// Return an owned `String` holding the interned form of `mime_type`.
///
/// The MIME type is first passed through the string interning pool
/// (`intern_mime_type`), so frequently repeated types such as
/// "application/pdf" share one pooled entry. The pooled value is then turned
/// into an owned `String` because that is the type `ExtractionResult::mime_type`
/// expects.
///
/// # Performance
///
/// The interning step is intended to be cheap for common, pre-interned MIME
/// types. Note that `to_string()` still produces a fresh owned copy on every
/// call.
#[cfg(feature = "office")]
pub(in crate::core::extractor) fn pool_mime_type(mime_type: &str) -> String {
    let interned = intern_mime_type(mime_type);
    interned.to_string()
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
//! Legacy synchronous extraction for WASM compatibility.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides truly synchronous extraction implementations
|
|
4
|
+
//! for environments where Tokio runtime is not available (e.g., WASM).
|
|
5
|
+
|
|
6
|
+
/// Synchronous extraction implementation for WASM compatibility.
|
|
7
|
+
///
|
|
8
|
+
/// This function performs extraction without requiring a tokio runtime.
|
|
9
|
+
/// It calls the sync extractor methods directly.
|
|
10
|
+
///
|
|
11
|
+
/// # Arguments
|
|
12
|
+
///
|
|
13
|
+
/// * `content` - The byte content to extract
|
|
14
|
+
/// * `mime_type` - Optional MIME type to validate/use
|
|
15
|
+
/// * `config` - Optional extraction configuration
|
|
16
|
+
///
|
|
17
|
+
/// # Returns
|
|
18
|
+
///
|
|
19
|
+
/// An `ExtractionResult` or a `KreuzbergError`
|
|
20
|
+
///
|
|
21
|
+
/// # Implementation Notes
|
|
22
|
+
///
|
|
23
|
+
/// This is called when the `tokio-runtime` feature is disabled.
|
|
24
|
+
/// It replicates the logic of `extract_bytes` but uses synchronous extractor methods.
|
|
25
|
+
#[cfg(not(feature = "tokio-runtime"))]
|
|
26
|
+
pub(super) fn extract_bytes_sync_impl(
|
|
27
|
+
content: Vec<u8>,
|
|
28
|
+
mime_type: Option<String>,
|
|
29
|
+
config: Option<crate::core::config::ExtractionConfig>,
|
|
30
|
+
) -> crate::Result<crate::types::ExtractionResult> {
|
|
31
|
+
use crate::KreuzbergError;
|
|
32
|
+
use crate::core::extractor::helpers::get_extractor;
|
|
33
|
+
use crate::core::mime;
|
|
34
|
+
|
|
35
|
+
let config = config.unwrap_or_default();
|
|
36
|
+
|
|
37
|
+
let validated_mime = if let Some(mime) = mime_type {
|
|
38
|
+
mime::validate_mime_type(&mime)?
|
|
39
|
+
} else {
|
|
40
|
+
return Err(KreuzbergError::Validation {
|
|
41
|
+
message: "MIME type is required for synchronous extraction".to_string(),
|
|
42
|
+
source: None,
|
|
43
|
+
});
|
|
44
|
+
};
|
|
45
|
+
|
|
46
|
+
crate::extractors::ensure_initialized()?;
|
|
47
|
+
|
|
48
|
+
let extractor = get_extractor(&validated_mime)?;
|
|
49
|
+
|
|
50
|
+
let sync_extractor = extractor.as_sync_extractor().ok_or_else(|| {
|
|
51
|
+
KreuzbergError::UnsupportedFormat(format!(
|
|
52
|
+
"Extractor for '{}' does not support synchronous extraction",
|
|
53
|
+
validated_mime
|
|
54
|
+
))
|
|
55
|
+
})?;
|
|
56
|
+
|
|
57
|
+
let mut result = sync_extractor.extract_sync(&content, &validated_mime, &config)?;
|
|
58
|
+
|
|
59
|
+
result = crate::core::pipeline::run_pipeline_sync(result, &config)?;
|
|
60
|
+
|
|
61
|
+
Ok(result)
|
|
62
|
+
}
|