kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,401 @@
|
|
|
1
|
+
//! Per-section validation functions.
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains validation functions for individual configuration sections
|
|
4
|
+
//! and their specific parameters. Each function validates a specific aspect of
|
|
5
|
+
//! the configuration and returns detailed error messages when validation fails.
|
|
6
|
+
|
|
7
|
+
use crate::{KreuzbergError, Result};
|
|
8
|
+
|
|
9
|
+
/// Valid binarization methods for image preprocessing.
|
|
10
|
+
const VALID_BINARIZATION_METHODS: &[&str] = &["otsu", "adaptive", "sauvola"];
|
|
11
|
+
|
|
12
|
+
/// Valid token reduction levels.
|
|
13
|
+
const VALID_TOKEN_REDUCTION_LEVELS: &[&str] = &["off", "light", "moderate", "aggressive", "maximum"];
|
|
14
|
+
|
|
15
|
+
/// Valid OCR backends.
|
|
16
|
+
const VALID_OCR_BACKENDS: &[&str] = &["tesseract", "easyocr", "paddleocr"];
|
|
17
|
+
|
|
18
|
+
/// Accepted language codes: common ISO 639-1 (2-letter) codes plus 3-letter equivalents for a subset of them.
|
|
19
|
+
/// Covers most major languages and variants used in document processing.
|
|
20
|
+
const VALID_LANGUAGE_CODES: &[&str] = &[
|
|
21
|
+
"en", "de", "fr", "es", "it", "pt", "nl", "pl", "ru", "zh", "ja", "ko", "bg", "cs", "da", "el", "et", "fi", "hu",
|
|
22
|
+
"lt", "lv", "ro", "sk", "sl", "sv", "uk", "ar", "hi", "th", "tr", "vi", "eng", "deu", "fra", "spa", "ita", "por",
|
|
23
|
+
"nld", "pol", "rus", "zho", "jpn", "kor", "ces", "dan", "ell", "est", "fin", "hun", "lit", "lav", "ron", "slk",
|
|
24
|
+
"slv", "swe", "tur",
|
|
25
|
+
];
|
|
26
|
+
|
|
27
|
+
/// Valid tesseract PSM (Page Segmentation Mode) values.
|
|
28
|
+
const VALID_TESSERACT_PSM: &[i32] = &[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13];
|
|
29
|
+
|
|
30
|
+
/// Valid tesseract OEM (OCR Engine Mode) values.
|
|
31
|
+
const VALID_TESSERACT_OEM: &[i32] = &[0, 1, 2, 3];
|
|
32
|
+
|
|
33
|
+
/// Valid output formats for tesseract.
|
|
34
|
+
const VALID_OUTPUT_FORMATS: &[&str] = &["text", "markdown"];
|
|
35
|
+
|
|
36
|
+
/// Validate a binarization method string.
|
|
37
|
+
///
|
|
38
|
+
/// # Arguments
|
|
39
|
+
///
|
|
40
|
+
/// * `method` - The binarization method to validate (e.g., "otsu", "adaptive", "sauvola")
|
|
41
|
+
///
|
|
42
|
+
/// # Returns
|
|
43
|
+
///
|
|
44
|
+
/// `Ok(())` if the method is valid, or a `ValidationError` with details about valid options.
|
|
45
|
+
///
|
|
46
|
+
/// # Examples
|
|
47
|
+
///
|
|
48
|
+
/// ```rust
|
|
49
|
+
/// use kreuzberg::core::config_validation::validate_binarization_method;
|
|
50
|
+
///
|
|
51
|
+
/// assert!(validate_binarization_method("otsu").is_ok());
|
|
52
|
+
/// assert!(validate_binarization_method("adaptive").is_ok());
|
|
53
|
+
/// assert!(validate_binarization_method("invalid").is_err());
|
|
54
|
+
/// ```
|
|
55
|
+
pub fn validate_binarization_method(method: &str) -> Result<()> {
|
|
56
|
+
let method = method.to_lowercase();
|
|
57
|
+
if VALID_BINARIZATION_METHODS.contains(&method.as_str()) {
|
|
58
|
+
Ok(())
|
|
59
|
+
} else {
|
|
60
|
+
Err(KreuzbergError::Validation {
|
|
61
|
+
message: format!(
|
|
62
|
+
"Invalid binarization method '{}'. Valid options are: {}",
|
|
63
|
+
method,
|
|
64
|
+
VALID_BINARIZATION_METHODS.join(", ")
|
|
65
|
+
),
|
|
66
|
+
source: None,
|
|
67
|
+
})
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/// Validate a token reduction level string.
|
|
72
|
+
///
|
|
73
|
+
/// # Arguments
|
|
74
|
+
///
|
|
75
|
+
/// * `level` - The token reduction level to validate (e.g., "off", "light", "moderate")
|
|
76
|
+
///
|
|
77
|
+
/// # Returns
|
|
78
|
+
///
|
|
79
|
+
/// `Ok(())` if the level is valid, or a `ValidationError` with details about valid options.
|
|
80
|
+
///
|
|
81
|
+
/// # Examples
|
|
82
|
+
///
|
|
83
|
+
/// ```rust
|
|
84
|
+
/// use kreuzberg::core::config_validation::validate_token_reduction_level;
|
|
85
|
+
///
|
|
86
|
+
/// assert!(validate_token_reduction_level("off").is_ok());
|
|
87
|
+
/// assert!(validate_token_reduction_level("moderate").is_ok());
|
|
88
|
+
/// assert!(validate_token_reduction_level("extreme").is_err());
|
|
89
|
+
/// ```
|
|
90
|
+
pub fn validate_token_reduction_level(level: &str) -> Result<()> {
|
|
91
|
+
let level = level.to_lowercase();
|
|
92
|
+
if VALID_TOKEN_REDUCTION_LEVELS.contains(&level.as_str()) {
|
|
93
|
+
Ok(())
|
|
94
|
+
} else {
|
|
95
|
+
Err(KreuzbergError::Validation {
|
|
96
|
+
message: format!(
|
|
97
|
+
"Invalid token reduction level '{}'. Valid options are: {}",
|
|
98
|
+
level,
|
|
99
|
+
VALID_TOKEN_REDUCTION_LEVELS.join(", ")
|
|
100
|
+
),
|
|
101
|
+
source: None,
|
|
102
|
+
})
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/// Validate an OCR backend string.
|
|
107
|
+
///
|
|
108
|
+
/// # Arguments
|
|
109
|
+
///
|
|
110
|
+
/// * `backend` - The OCR backend to validate (e.g., "tesseract", "easyocr", "paddleocr")
|
|
111
|
+
///
|
|
112
|
+
/// # Returns
|
|
113
|
+
///
|
|
114
|
+
/// `Ok(())` if the backend is valid, or a `ValidationError` with details about valid options.
|
|
115
|
+
///
|
|
116
|
+
/// # Examples
|
|
117
|
+
///
|
|
118
|
+
/// ```rust
|
|
119
|
+
/// use kreuzberg::core::config_validation::validate_ocr_backend;
|
|
120
|
+
///
|
|
121
|
+
/// assert!(validate_ocr_backend("tesseract").is_ok());
|
|
122
|
+
/// assert!(validate_ocr_backend("easyocr").is_ok());
|
|
123
|
+
/// assert!(validate_ocr_backend("invalid").is_err());
|
|
124
|
+
/// ```
|
|
125
|
+
pub fn validate_ocr_backend(backend: &str) -> Result<()> {
|
|
126
|
+
let backend = backend.to_lowercase();
|
|
127
|
+
if VALID_OCR_BACKENDS.contains(&backend.as_str()) {
|
|
128
|
+
Ok(())
|
|
129
|
+
} else {
|
|
130
|
+
Err(KreuzbergError::Validation {
|
|
131
|
+
message: format!(
|
|
132
|
+
"Invalid OCR backend '{}'. Valid options are: {}",
|
|
133
|
+
backend,
|
|
134
|
+
VALID_OCR_BACKENDS.join(", ")
|
|
135
|
+
),
|
|
136
|
+
source: None,
|
|
137
|
+
})
|
|
138
|
+
}
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
/// Validate a language code (ISO 639-1 or 639-3 format).
|
|
142
|
+
///
|
|
143
|
+
/// Accepts both 2-letter ISO 639-1 codes (e.g., "en", "de") and
|
|
144
|
+
/// 3-letter ISO 639-3 codes (e.g., "eng", "deu") for broader compatibility.
|
|
145
|
+
///
|
|
146
|
+
/// # Arguments
|
|
147
|
+
///
|
|
148
|
+
/// * `code` - The language code to validate
|
|
149
|
+
///
|
|
150
|
+
/// # Returns
|
|
151
|
+
///
|
|
152
|
+
/// `Ok(())` if the code is valid, or a `ValidationError` indicating an invalid language code.
|
|
153
|
+
///
|
|
154
|
+
/// # Examples
|
|
155
|
+
///
|
|
156
|
+
/// ```rust
|
|
157
|
+
/// use kreuzberg::core::config_validation::validate_language_code;
|
|
158
|
+
///
|
|
159
|
+
/// assert!(validate_language_code("en").is_ok());
|
|
160
|
+
/// assert!(validate_language_code("eng").is_ok());
|
|
161
|
+
/// assert!(validate_language_code("de").is_ok());
|
|
162
|
+
/// assert!(validate_language_code("deu").is_ok());
|
|
163
|
+
/// assert!(validate_language_code("invalid").is_err());
|
|
164
|
+
/// ```
|
|
165
|
+
pub fn validate_language_code(code: &str) -> Result<()> {
|
|
166
|
+
let code_lower = code.to_lowercase();
|
|
167
|
+
|
|
168
|
+
if VALID_LANGUAGE_CODES.contains(&code_lower.as_str()) {
|
|
169
|
+
return Ok(());
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
Err(KreuzbergError::Validation {
|
|
173
|
+
message: format!(
|
|
174
|
+
"Invalid language code '{}'. Use ISO 639-1 (2-letter, e.g., 'en', 'de') \
|
|
175
|
+
or ISO 639-3 (3-letter, e.g., 'eng', 'deu') codes. \
|
|
176
|
+
Common codes: en, de, fr, es, it, pt, nl, pl, ru, zh, ja, ko, ar, hi, th.",
|
|
177
|
+
code
|
|
178
|
+
),
|
|
179
|
+
source: None,
|
|
180
|
+
})
|
|
181
|
+
}
|
|
182
|
+
|
|
183
|
+
/// Validate a tesseract Page Segmentation Mode (PSM).
|
|
184
|
+
///
|
|
185
|
+
/// # Arguments
|
|
186
|
+
///
|
|
187
|
+
/// * `psm` - The PSM value to validate (0-13)
|
|
188
|
+
///
|
|
189
|
+
/// # Returns
|
|
190
|
+
///
|
|
191
|
+
/// `Ok(())` if the PSM is valid, or a `ValidationError` with details about valid ranges.
|
|
192
|
+
///
|
|
193
|
+
/// # Examples
|
|
194
|
+
///
|
|
195
|
+
/// ```rust
|
|
196
|
+
/// use kreuzberg::core::config_validation::validate_tesseract_psm;
|
|
197
|
+
///
|
|
198
|
+
/// assert!(validate_tesseract_psm(3).is_ok()); // Fully automatic
|
|
199
|
+
/// assert!(validate_tesseract_psm(6).is_ok()); // Single block of text
|
|
200
|
+
/// assert!(validate_tesseract_psm(14).is_err()); // Out of range
|
|
201
|
+
/// ```
|
|
202
|
+
pub fn validate_tesseract_psm(psm: i32) -> Result<()> {
|
|
203
|
+
if VALID_TESSERACT_PSM.contains(&psm) {
|
|
204
|
+
Ok(())
|
|
205
|
+
} else {
|
|
206
|
+
Err(KreuzbergError::Validation {
|
|
207
|
+
message: format!(
|
|
208
|
+
"Invalid tesseract PSM value '{}'. Valid range is 0-13. \
|
|
209
|
+
Common values: 3 (auto), 6 (single block), 11 (sparse text).",
|
|
210
|
+
psm
|
|
211
|
+
),
|
|
212
|
+
source: None,
|
|
213
|
+
})
|
|
214
|
+
}
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
/// Validate a tesseract OCR Engine Mode (OEM).
|
|
218
|
+
///
|
|
219
|
+
/// # Arguments
|
|
220
|
+
///
|
|
221
|
+
/// * `oem` - The OEM value to validate (0-3)
|
|
222
|
+
///
|
|
223
|
+
/// # Returns
|
|
224
|
+
///
|
|
225
|
+
/// `Ok(())` if the OEM is valid, or a `ValidationError` with details about valid options.
|
|
226
|
+
///
|
|
227
|
+
/// # Examples
|
|
228
|
+
///
|
|
229
|
+
/// ```rust
|
|
230
|
+
/// use kreuzberg::core::config_validation::validate_tesseract_oem;
|
|
231
|
+
///
|
|
232
|
+
/// assert!(validate_tesseract_oem(1).is_ok()); // Neural nets (LSTM)
|
|
233
|
+
/// assert!(validate_tesseract_oem(2).is_ok()); // Legacy + LSTM
|
|
234
|
+
/// assert!(validate_tesseract_oem(4).is_err()); // Out of range
|
|
235
|
+
/// ```
|
|
236
|
+
pub fn validate_tesseract_oem(oem: i32) -> Result<()> {
|
|
237
|
+
if VALID_TESSERACT_OEM.contains(&oem) {
|
|
238
|
+
Ok(())
|
|
239
|
+
} else {
|
|
240
|
+
Err(KreuzbergError::Validation {
|
|
241
|
+
message: format!(
|
|
242
|
+
"Invalid tesseract OEM value '{}'. Valid range is 0-3. \
|
|
243
|
+
0=Legacy, 1=LSTM, 2=Legacy+LSTM, 3=Default",
|
|
244
|
+
oem
|
|
245
|
+
),
|
|
246
|
+
source: None,
|
|
247
|
+
})
|
|
248
|
+
}
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
/// Validate a tesseract output format.
|
|
252
|
+
///
|
|
253
|
+
/// # Arguments
|
|
254
|
+
///
|
|
255
|
+
/// * `format` - The output format to validate (e.g., "text", "markdown")
|
|
256
|
+
///
|
|
257
|
+
/// # Returns
|
|
258
|
+
///
|
|
259
|
+
/// `Ok(())` if the format is valid, or a `ValidationError` with details about valid options.
|
|
260
|
+
///
|
|
261
|
+
/// # Examples
|
|
262
|
+
///
|
|
263
|
+
/// ```rust
|
|
264
|
+
/// use kreuzberg::core::config_validation::validate_output_format;
|
|
265
|
+
///
|
|
266
|
+
/// assert!(validate_output_format("text").is_ok());
|
|
267
|
+
/// assert!(validate_output_format("markdown").is_ok());
|
|
268
|
+
/// assert!(validate_output_format("json").is_err());
|
|
269
|
+
/// ```
|
|
270
|
+
pub fn validate_output_format(format: &str) -> Result<()> {
|
|
271
|
+
let format = format.to_lowercase();
|
|
272
|
+
if VALID_OUTPUT_FORMATS.contains(&format.as_str()) {
|
|
273
|
+
Ok(())
|
|
274
|
+
} else {
|
|
275
|
+
Err(KreuzbergError::Validation {
|
|
276
|
+
message: format!(
|
|
277
|
+
"Invalid output format '{}'. Valid options are: {}",
|
|
278
|
+
format,
|
|
279
|
+
VALID_OUTPUT_FORMATS.join(", ")
|
|
280
|
+
),
|
|
281
|
+
source: None,
|
|
282
|
+
})
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
/// Validate a confidence threshold value.
|
|
287
|
+
///
|
|
288
|
+
/// Confidence thresholds should be between 0.0 and 1.0 inclusive.
|
|
289
|
+
///
|
|
290
|
+
/// # Arguments
|
|
291
|
+
///
|
|
292
|
+
/// * `confidence` - The confidence threshold to validate
|
|
293
|
+
///
|
|
294
|
+
/// # Returns
|
|
295
|
+
///
|
|
296
|
+
/// `Ok(())` if the confidence is valid, or a `ValidationError` with details about valid ranges.
|
|
297
|
+
///
|
|
298
|
+
/// # Examples
|
|
299
|
+
///
|
|
300
|
+
/// ```rust
|
|
301
|
+
/// use kreuzberg::core::config_validation::validate_confidence;
|
|
302
|
+
///
|
|
303
|
+
/// assert!(validate_confidence(0.5).is_ok());
|
|
304
|
+
/// assert!(validate_confidence(0.0).is_ok());
|
|
305
|
+
/// assert!(validate_confidence(1.0).is_ok());
|
|
306
|
+
/// assert!(validate_confidence(1.5).is_err());
|
|
307
|
+
/// assert!(validate_confidence(-0.1).is_err());
|
|
308
|
+
/// ```
|
|
309
|
+
pub fn validate_confidence(confidence: f64) -> Result<()> {
|
|
310
|
+
if (0.0..=1.0).contains(&confidence) {
|
|
311
|
+
Ok(())
|
|
312
|
+
} else {
|
|
313
|
+
Err(KreuzbergError::Validation {
|
|
314
|
+
message: format!(
|
|
315
|
+
"Invalid confidence threshold '{}'. Must be between 0.0 and 1.0.",
|
|
316
|
+
confidence
|
|
317
|
+
),
|
|
318
|
+
source: None,
|
|
319
|
+
})
|
|
320
|
+
}
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
/// Validate a DPI (dots per inch) value.
|
|
324
|
+
///
|
|
325
|
+
/// DPI should be a positive integer, typically 72-600.
|
|
326
|
+
///
|
|
327
|
+
/// # Arguments
|
|
328
|
+
///
|
|
329
|
+
/// * `dpi` - The DPI value to validate
|
|
330
|
+
///
|
|
331
|
+
/// # Returns
|
|
332
|
+
///
|
|
333
|
+
/// `Ok(())` if the DPI is valid, or a `ValidationError` with details about valid ranges.
|
|
334
|
+
///
|
|
335
|
+
/// # Examples
|
|
336
|
+
///
|
|
337
|
+
/// ```rust
|
|
338
|
+
/// use kreuzberg::core::config_validation::validate_dpi;
|
|
339
|
+
///
|
|
340
|
+
/// assert!(validate_dpi(96).is_ok());
|
|
341
|
+
/// assert!(validate_dpi(300).is_ok());
|
|
342
|
+
/// assert!(validate_dpi(0).is_err());
|
|
343
|
+
/// assert!(validate_dpi(-1).is_err());
|
|
344
|
+
/// ```
|
|
345
|
+
pub fn validate_dpi(dpi: i32) -> Result<()> {
|
|
346
|
+
if dpi > 0 && dpi <= 2400 {
|
|
347
|
+
Ok(())
|
|
348
|
+
} else {
|
|
349
|
+
Err(KreuzbergError::Validation {
|
|
350
|
+
message: format!(
|
|
351
|
+
"Invalid DPI value '{}'. Must be a positive integer, typically 72-600.",
|
|
352
|
+
dpi
|
|
353
|
+
),
|
|
354
|
+
source: None,
|
|
355
|
+
})
|
|
356
|
+
}
|
|
357
|
+
}
|
|
358
|
+
|
|
359
|
+
/// Validate chunk size parameters.
|
|
360
|
+
///
|
|
361
|
+
/// Checks that max_chars > 0 and max_overlap < max_chars.
|
|
362
|
+
///
|
|
363
|
+
/// # Arguments
|
|
364
|
+
///
|
|
365
|
+
/// * `max_chars` - The maximum characters per chunk
|
|
366
|
+
/// * `max_overlap` - The maximum overlap between chunks
|
|
367
|
+
///
|
|
368
|
+
/// # Returns
|
|
369
|
+
///
|
|
370
|
+
/// `Ok(())` if the parameters are valid, or a `ValidationError` with details about constraints.
|
|
371
|
+
///
|
|
372
|
+
/// # Examples
|
|
373
|
+
///
|
|
374
|
+
/// ```rust
|
|
375
|
+
/// use kreuzberg::core::config_validation::validate_chunking_params;
|
|
376
|
+
///
|
|
377
|
+
/// assert!(validate_chunking_params(1000, 200).is_ok());
|
|
378
|
+
/// assert!(validate_chunking_params(500, 50).is_ok());
|
|
379
|
+
/// assert!(validate_chunking_params(0, 100).is_err()); // max_chars must be > 0
|
|
380
|
+
/// assert!(validate_chunking_params(100, 150).is_err()); // overlap >= max_chars
|
|
381
|
+
/// ```
|
|
382
|
+
pub fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<()> {
|
|
383
|
+
if max_chars == 0 {
|
|
384
|
+
return Err(KreuzbergError::Validation {
|
|
385
|
+
message: "max_chars must be greater than 0".to_string(),
|
|
386
|
+
source: None,
|
|
387
|
+
});
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
if max_overlap >= max_chars {
|
|
391
|
+
return Err(KreuzbergError::Validation {
|
|
392
|
+
message: format!(
|
|
393
|
+
"max_overlap ({}) must be less than max_chars ({})",
|
|
394
|
+
max_overlap, max_chars
|
|
395
|
+
),
|
|
396
|
+
source: None,
|
|
397
|
+
});
|
|
398
|
+
}
|
|
399
|
+
|
|
400
|
+
Ok(())
|
|
401
|
+
}
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
//! Batch extraction operations for concurrent processing.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides parallel extraction capabilities for processing
|
|
4
|
+
//! multiple files or byte arrays concurrently with automatic resource management.
|
|
5
|
+
|
|
6
|
+
use crate::core::config::ExtractionConfig;
|
|
7
|
+
use crate::types::{ErrorMetadata, ExtractionResult, Metadata};
|
|
8
|
+
use crate::{KreuzbergError, Result};
|
|
9
|
+
use std::path::Path;
|
|
10
|
+
use std::sync::Arc;
|
|
11
|
+
|
|
12
|
+
use super::bytes::extract_bytes;
|
|
13
|
+
use super::file::extract_file;
|
|
14
|
+
|
|
15
|
+
/// Extract content from multiple files concurrently.
///
/// Each path is extracted in its own spawned task. A semaphore bounds how many
/// extractions run at once. The limit is taken from
/// `ExtractionConfig::max_concurrent_extractions`; when that is unset it
/// defaults to `num_cpus * 1.5`, rounded up. The effective limit is always at
/// least 1 and never more than the number of paths.
///
/// # Arguments
///
/// * `paths` - Vector of file paths to extract
/// * `config` - Extraction configuration
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input paths. An
/// empty input yields an empty vector.
///
/// # Errors
///
/// A failure to extract an individual file (I/O errors included) does not
/// abort the batch. The corresponding slot holds an `ExtractionResult` whose
/// `metadata.error` is populated and whose `content` is `"Error: <message>"`.
///
/// The function itself returns `Err` only when a spawned task cannot be joined
/// (for example because it panicked).
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_file;
/// use kreuzberg::core::config::ExtractionConfig;
///
/// # async fn example() -> kreuzberg::Result<()> {
/// let config = ExtractionConfig::default();
/// let paths = vec!["doc1.pdf", "doc2.pdf"];
/// let results = batch_extract_file(paths, &config).await?;
/// println!("Processed {} files", results.len());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio-runtime")]
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, paths),
    fields(
        extraction.batch_size = paths.len(),
    )
))]
pub async fn batch_extract_file(
    paths: Vec<impl AsRef<Path>>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    if paths.is_empty() {
        return Ok(vec![]);
    }

    // Non-zero here because of the early return above.
    let batch_len = paths.len();

    let config = Arc::new(config.clone());

    // Clamp the limit to `1..=batch_len`:
    // - a limit of 0 would create a semaphore with no permits, so every task
    //   would wait forever and the batch would never complete;
    // - more permits than tasks has no effect, and an extremely large value
    //   would make `Semaphore::new` panic.
    let max_concurrent = config
        .max_concurrent_extractions
        .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize)
        .clamp(1, batch_len);
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    let mut tasks = JoinSet::new();

    for (index, path) in paths.into_iter().enumerate() {
        let path_buf = path.as_ref().to_path_buf();
        let config_clone = Arc::clone(&config);
        let semaphore_clone = Arc::clone(&semaphore);

        tasks.spawn(async move {
            // The semaphore is never closed, so acquiring cannot fail.
            let _permit = semaphore_clone.acquire().await.unwrap();
            let result =
                crate::core::batch_mode::with_batch_mode(async { extract_file(&path_buf, None, &config_clone).await })
                    .await;
            // Carry the input position along: tasks finish in arbitrary order.
            (index, result)
        });
    }

    let mut results: Vec<Option<ExtractionResult>> = vec![None; batch_len];

    while let Some(task_result) = tasks.join_next().await {
        match task_result {
            Ok((index, Ok(result))) => {
                results[index] = Some(result);
            }
            Ok((index, Err(e))) => {
                // All errors (including Io) should create error results
                // instead of causing early return that abandons running tasks
                let metadata = Metadata {
                    error: Some(ErrorMetadata {
                        error_type: format!("{:?}", e),
                        message: e.to_string(),
                    }),
                    ..Default::default()
                };

                results[index] = Some(ExtractionResult {
                    content: format!("Error: {}", e),
                    mime_type: "text/plain".to_string(),
                    metadata,
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                    djot_content: None,
                    pages: None,
                    elements: None,
                });
            }
            Err(join_err) => {
                return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
            }
        }
    }

    // Every slot was filled above: each joined task either stored a result at
    // its index or caused an early return.
    #[allow(clippy::unwrap_used)]
    Ok(results.into_iter().map(|r| r.unwrap()).collect())
}
|
|
131
|
+
|
|
132
|
+
/// Extract content from multiple byte arrays concurrently.
///
/// Each `(bytes, mime_type)` pair is extracted in its own spawned task. A
/// semaphore bounds how many extractions run at once. The limit is taken from
/// `ExtractionConfig::max_concurrent_extractions`; when that is unset it
/// defaults to `num_cpus * 1.5`, rounded up. The effective limit is always at
/// least 1 and never more than the number of inputs.
///
/// # Arguments
///
/// * `contents` - Vector of (bytes, mime_type) tuples
/// * `config` - Extraction configuration
///
/// # Returns
///
/// A vector of `ExtractionResult` in the same order as the input. An empty
/// input yields an empty vector.
///
/// # Errors
///
/// A failure to extract an individual item does not abort the batch. The
/// corresponding slot holds an `ExtractionResult` whose `metadata.error` is
/// populated and whose `content` is `"Error: <message>"`.
///
/// The function itself returns `Err` only when a spawned task cannot be joined
/// (for example because it panicked).
///
/// # Example
///
/// ```rust,no_run
/// use kreuzberg::core::extractor::batch_extract_bytes;
/// use kreuzberg::core::config::ExtractionConfig;
///
/// # async fn example() -> kreuzberg::Result<()> {
/// let config = ExtractionConfig::default();
/// let contents = vec![
///     (b"content 1".to_vec(), "text/plain".to_string()),
///     (b"content 2".to_vec(), "text/plain".to_string()),
/// ];
/// let results = batch_extract_bytes(contents, &config).await?;
/// println!("Processed {} items", results.len());
/// # Ok(())
/// # }
/// ```
#[cfg(feature = "tokio-runtime")]
#[cfg_attr(feature = "otel", tracing::instrument(
    skip(config, contents),
    fields(
        extraction.batch_size = contents.len(),
    )
))]
pub async fn batch_extract_bytes(
    contents: Vec<(Vec<u8>, String)>,
    config: &ExtractionConfig,
) -> Result<Vec<ExtractionResult>> {
    use tokio::sync::Semaphore;
    use tokio::task::JoinSet;

    if contents.is_empty() {
        return Ok(vec![]);
    }

    // Non-zero here because of the early return above.
    let batch_len = contents.len();

    let config = Arc::new(config.clone());

    // Clamp the limit to `1..=batch_len`:
    // - a limit of 0 would create a semaphore with no permits, so every task
    //   would wait forever and the batch would never complete;
    // - more permits than tasks has no effect, and an extremely large value
    //   would make `Semaphore::new` panic.
    let max_concurrent = config
        .max_concurrent_extractions
        .unwrap_or_else(|| (num_cpus::get() as f64 * 1.5).ceil() as usize)
        .clamp(1, batch_len);
    let semaphore = Arc::new(Semaphore::new(max_concurrent));

    let mut tasks = JoinSet::new();

    for (index, (bytes, mime_type)) in contents.into_iter().enumerate() {
        let config_clone = Arc::clone(&config);
        let semaphore_clone = Arc::clone(&semaphore);

        tasks.spawn(async move {
            // The semaphore is never closed, so acquiring cannot fail.
            let _permit = semaphore_clone.acquire().await.unwrap();
            let result = crate::core::batch_mode::with_batch_mode(async {
                extract_bytes(&bytes, &mime_type, &config_clone).await
            })
            .await;
            // Carry the input position along: tasks finish in arbitrary order.
            (index, result)
        });
    }

    let mut results: Vec<Option<ExtractionResult>> = vec![None; batch_len];

    while let Some(task_result) = tasks.join_next().await {
        match task_result {
            Ok((index, Ok(result))) => {
                results[index] = Some(result);
            }
            Ok((index, Err(e))) => {
                // All errors (including Io) should create error results
                // instead of causing early return that abandons running tasks
                let metadata = Metadata {
                    error: Some(ErrorMetadata {
                        error_type: format!("{:?}", e),
                        message: e.to_string(),
                    }),
                    ..Default::default()
                };

                results[index] = Some(ExtractionResult {
                    content: format!("Error: {}", e),
                    mime_type: "text/plain".to_string(),
                    metadata,
                    tables: vec![],
                    detected_languages: None,
                    chunks: None,
                    images: None,
                    djot_content: None,
                    pages: None,
                    elements: None,
                });
            }
            Err(join_err) => {
                return Err(KreuzbergError::Other(format!("Task panicked: {}", join_err)));
            }
        }
    }

    // Every slot was filled above: each joined task either stored a result at
    // its index or caused an early return.
    #[allow(clippy::unwrap_used)]
    Ok(results.into_iter().map(|r| r.unwrap()).collect())
}
|