kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
//! Main extraction configuration struct.
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains the main `ExtractionConfig` struct that aggregates all
|
|
4
|
+
//! configuration options for the extraction process.
|
|
5
|
+
|
|
6
|
+
use serde::{Deserialize, Serialize};
|
|
7
|
+
|
|
8
|
+
use super::super::formats::OutputFormat;
|
|
9
|
+
use super::super::ocr::OcrConfig;
|
|
10
|
+
use super::super::page::PageConfig;
|
|
11
|
+
use super::super::processing::{ChunkingConfig, PostProcessorConfig};
|
|
12
|
+
use super::types::{ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
|
|
13
|
+
|
|
14
|
+
/// Main extraction configuration.
|
|
15
|
+
///
|
|
16
|
+
/// This struct contains all configuration options for the extraction process.
|
|
17
|
+
/// It can be loaded from TOML, YAML, or JSON files, or created programmatically.
|
|
18
|
+
///
|
|
19
|
+
/// # Example
|
|
20
|
+
///
|
|
21
|
+
/// ```rust
|
|
22
|
+
/// use kreuzberg::core::config::ExtractionConfig;
|
|
23
|
+
///
|
|
24
|
+
/// // Create with defaults
|
|
25
|
+
/// let config = ExtractionConfig::default();
|
|
26
|
+
///
|
|
27
|
+
/// // Load from TOML file
|
|
28
|
+
/// // let config = ExtractionConfig::from_toml_file("kreuzberg.toml")?;
|
|
29
|
+
/// ```
|
|
30
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
31
|
+
pub struct ExtractionConfig {
|
|
32
|
+
/// Enable caching of extraction results
|
|
33
|
+
#[serde(default = "default_true")]
|
|
34
|
+
pub use_cache: bool,
|
|
35
|
+
|
|
36
|
+
/// Enable quality post-processing
|
|
37
|
+
#[serde(default = "default_true")]
|
|
38
|
+
pub enable_quality_processing: bool,
|
|
39
|
+
|
|
40
|
+
/// OCR configuration (None = OCR disabled)
|
|
41
|
+
#[serde(default)]
|
|
42
|
+
pub ocr: Option<OcrConfig>,
|
|
43
|
+
|
|
44
|
+
/// Force OCR even for searchable PDFs
|
|
45
|
+
#[serde(default)]
|
|
46
|
+
pub force_ocr: bool,
|
|
47
|
+
|
|
48
|
+
/// Text chunking configuration (None = chunking disabled)
|
|
49
|
+
#[serde(default)]
|
|
50
|
+
pub chunking: Option<ChunkingConfig>,
|
|
51
|
+
|
|
52
|
+
/// Image extraction configuration (None = no image extraction)
|
|
53
|
+
#[serde(default)]
|
|
54
|
+
pub images: Option<ImageExtractionConfig>,
|
|
55
|
+
|
|
56
|
+
/// PDF-specific options (None = use defaults)
|
|
57
|
+
#[cfg(feature = "pdf")]
|
|
58
|
+
#[serde(default)]
|
|
59
|
+
pub pdf_options: Option<super::super::pdf::PdfConfig>,
|
|
60
|
+
|
|
61
|
+
/// Token reduction configuration (None = no token reduction)
|
|
62
|
+
#[serde(default)]
|
|
63
|
+
pub token_reduction: Option<TokenReductionConfig>,
|
|
64
|
+
|
|
65
|
+
/// Language detection configuration (None = no language detection)
|
|
66
|
+
#[serde(default)]
|
|
67
|
+
pub language_detection: Option<LanguageDetectionConfig>,
|
|
68
|
+
|
|
69
|
+
/// Page extraction configuration (None = no page tracking)
|
|
70
|
+
#[serde(default)]
|
|
71
|
+
pub pages: Option<PageConfig>,
|
|
72
|
+
|
|
73
|
+
/// Keyword extraction configuration (None = no keyword extraction)
|
|
74
|
+
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
75
|
+
#[serde(default)]
|
|
76
|
+
pub keywords: Option<crate::keywords::KeywordConfig>,
|
|
77
|
+
|
|
78
|
+
/// Post-processor configuration (None = use defaults)
|
|
79
|
+
#[serde(default)]
|
|
80
|
+
pub postprocessor: Option<PostProcessorConfig>,
|
|
81
|
+
|
|
82
|
+
/// HTML to Markdown conversion options (None = use defaults)
|
|
83
|
+
///
|
|
84
|
+
/// Configure how HTML documents are converted to Markdown, including heading styles,
|
|
85
|
+
/// list formatting, code block styles, and preprocessing options.
|
|
86
|
+
#[cfg(feature = "html")]
|
|
87
|
+
#[serde(default)]
|
|
88
|
+
pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
|
|
89
|
+
|
|
90
|
+
/// Maximum concurrent extractions in batch operations (None = num_cpus * 2).
|
|
91
|
+
///
|
|
92
|
+
/// Limits parallelism to prevent resource exhaustion when processing
|
|
93
|
+
/// large batches. Defaults to twice the number of CPU cores.
|
|
94
|
+
#[serde(default)]
|
|
95
|
+
pub max_concurrent_extractions: Option<usize>,
|
|
96
|
+
|
|
97
|
+
/// Result structure format
|
|
98
|
+
///
|
|
99
|
+
/// Controls whether results are returned in unified format (default) with all
|
|
100
|
+
/// content in the `content` field, or element-based format with semantic
|
|
101
|
+
/// elements (for Unstructured-compatible output).
|
|
102
|
+
#[serde(default)]
|
|
103
|
+
pub result_format: crate::types::OutputFormat,
|
|
104
|
+
|
|
105
|
+
/// Content text format (default: Plain).
|
|
106
|
+
///
|
|
107
|
+
/// Controls the format of the extracted content:
|
|
108
|
+
/// - `Plain`: Raw extracted text (default)
|
|
109
|
+
/// - `Markdown`: Markdown formatted output
|
|
110
|
+
/// - `Djot`: Djot markup format (requires djot feature)
|
|
111
|
+
/// - `Html`: HTML formatted output
|
|
112
|
+
///
|
|
113
|
+
/// When set to a structured format, extraction results will include
|
|
114
|
+
/// formatted output. The `formatted_content` field may be populated
|
|
115
|
+
/// when format conversion is applied.
|
|
116
|
+
#[serde(default)]
|
|
117
|
+
pub output_format: OutputFormat,
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
impl Default for ExtractionConfig {
|
|
121
|
+
fn default() -> Self {
|
|
122
|
+
Self {
|
|
123
|
+
use_cache: true,
|
|
124
|
+
enable_quality_processing: true,
|
|
125
|
+
ocr: None,
|
|
126
|
+
force_ocr: false,
|
|
127
|
+
chunking: None,
|
|
128
|
+
images: None,
|
|
129
|
+
#[cfg(feature = "pdf")]
|
|
130
|
+
pdf_options: None,
|
|
131
|
+
token_reduction: None,
|
|
132
|
+
language_detection: None,
|
|
133
|
+
pages: None,
|
|
134
|
+
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
135
|
+
keywords: None,
|
|
136
|
+
postprocessor: None,
|
|
137
|
+
#[cfg(feature = "html")]
|
|
138
|
+
html_options: None,
|
|
139
|
+
max_concurrent_extractions: None,
|
|
140
|
+
result_format: crate::types::OutputFormat::Unified,
|
|
141
|
+
output_format: OutputFormat::Plain,
|
|
142
|
+
}
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
impl ExtractionConfig {
|
|
147
|
+
/// Check if image processing is needed by examining OCR and image extraction settings.
|
|
148
|
+
///
|
|
149
|
+
/// Returns `true` if either OCR is enabled or image extraction is configured,
|
|
150
|
+
/// indicating that image decompression and processing should occur.
|
|
151
|
+
/// Returns `false` if both are disabled, allowing optimization to skip unnecessary
|
|
152
|
+
/// image decompression for text-only extraction workflows.
|
|
153
|
+
///
|
|
154
|
+
/// # Optimization Impact
|
|
155
|
+
/// For text-only extractions (no OCR, no image extraction), skipping image
|
|
156
|
+
/// decompression can improve CPU utilization by 5-10% by avoiding wasteful
|
|
157
|
+
/// image I/O and processing when results won't be used.
|
|
158
|
+
pub fn needs_image_processing(&self) -> bool {
|
|
159
|
+
let ocr_enabled = self.ocr.is_some();
|
|
160
|
+
|
|
161
|
+
let image_extraction_enabled = self.images.as_ref().map(|i| i.extract_images).unwrap_or(false);
|
|
162
|
+
|
|
163
|
+
ocr_enabled || image_extraction_enabled
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
/// Serde default provider that always yields `true`.
///
/// Named by `#[serde(default = "default_true")]` on boolean fields that
/// should be enabled when the key is missing from the input.
fn default_true() -> bool {
    true
}
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
//! Environment variable override support for extraction configuration.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides functionality to apply environment variable overrides
|
|
4
|
+
//! to extraction configuration, allowing runtime configuration changes.
|
|
5
|
+
|
|
6
|
+
use crate::{KreuzbergError, Result};
|
|
7
|
+
|
|
8
|
+
use super::super::ocr::OcrConfig;
|
|
9
|
+
use super::super::processing::ChunkingConfig;
|
|
10
|
+
use super::core::ExtractionConfig;
|
|
11
|
+
use super::types::TokenReductionConfig;
|
|
12
|
+
|
|
13
|
+
impl ExtractionConfig {
|
|
14
|
+
/// Apply environment variable overrides to configuration.
|
|
15
|
+
///
|
|
16
|
+
/// Environment variables have the highest precedence and will override any values
|
|
17
|
+
/// loaded from configuration files. This method supports the following environment variables:
|
|
18
|
+
///
|
|
19
|
+
/// - `KREUZBERG_OCR_LANGUAGE`: OCR language (ISO 639-1 or 639-3 code, e.g., "eng", "fra", "deu")
|
|
20
|
+
/// - `KREUZBERG_OCR_BACKEND`: OCR backend ("tesseract", "easyocr", or "paddleocr")
|
|
21
|
+
/// - `KREUZBERG_CHUNKING_MAX_CHARS`: Maximum characters per chunk (positive integer)
|
|
22
|
+
/// - `KREUZBERG_CHUNKING_MAX_OVERLAP`: Maximum overlap between chunks (non-negative integer)
|
|
23
|
+
/// - `KREUZBERG_CACHE_ENABLED`: Cache enabled flag ("true" or "false")
|
|
24
|
+
/// - `KREUZBERG_TOKEN_REDUCTION_MODE`: Token reduction mode ("off", "light", "moderate", "aggressive", or "maximum")
|
|
25
|
+
///
|
|
26
|
+
/// # Behavior
|
|
27
|
+
///
|
|
28
|
+
/// - If an environment variable is set and valid, it overrides the current configuration value
|
|
29
|
+
/// - If a required parent config is `None` (e.g., `self.ocr` is None), it's created with defaults before applying the override
|
|
30
|
+
/// - Invalid values return a `KreuzbergError::Validation` with helpful error messages
|
|
31
|
+
/// - Missing or unset environment variables are silently ignored
|
|
32
|
+
///
|
|
33
|
+
/// # Example
|
|
34
|
+
///
|
|
35
|
+
/// ```rust
|
|
36
|
+
/// # use kreuzberg::core::config::ExtractionConfig;
|
|
37
|
+
/// # fn example() -> kreuzberg::Result<()> {
|
|
38
|
+
/// let mut config = ExtractionConfig::from_file("config.toml")?;
|
|
39
|
+
/// // Set KREUZBERG_OCR_LANGUAGE=fra before calling
|
|
40
|
+
/// config.apply_env_overrides()?; // OCR language is now "fra"
|
|
41
|
+
/// # Ok(())
|
|
42
|
+
/// # }
|
|
43
|
+
/// ```
|
|
44
|
+
///
|
|
45
|
+
/// # Errors
|
|
46
|
+
///
|
|
47
|
+
/// Returns `KreuzbergError::Validation` if:
|
|
48
|
+
/// - An environment variable contains an invalid value
|
|
49
|
+
/// - A number cannot be parsed as the expected type
|
|
50
|
+
/// - A boolean is not "true" or "false"
|
|
51
|
+
pub fn apply_env_overrides(&mut self) -> Result<()> {
|
|
52
|
+
use crate::core::config_validation::{
|
|
53
|
+
validate_chunking_params, validate_language_code, validate_ocr_backend, validate_token_reduction_level,
|
|
54
|
+
};
|
|
55
|
+
|
|
56
|
+
// KREUZBERG_OCR_LANGUAGE override
|
|
57
|
+
if let Ok(lang) = std::env::var("KREUZBERG_OCR_LANGUAGE") {
|
|
58
|
+
validate_language_code(&lang)?;
|
|
59
|
+
if self.ocr.is_none() {
|
|
60
|
+
self.ocr = Some(OcrConfig::default());
|
|
61
|
+
}
|
|
62
|
+
if let Some(ref mut ocr) = self.ocr {
|
|
63
|
+
ocr.language = lang;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// KREUZBERG_OCR_BACKEND override
|
|
68
|
+
if let Ok(backend) = std::env::var("KREUZBERG_OCR_BACKEND") {
|
|
69
|
+
validate_ocr_backend(&backend)?;
|
|
70
|
+
if self.ocr.is_none() {
|
|
71
|
+
self.ocr = Some(OcrConfig::default());
|
|
72
|
+
}
|
|
73
|
+
if let Some(ref mut ocr) = self.ocr {
|
|
74
|
+
ocr.backend = backend;
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
// KREUZBERG_CHUNKING_MAX_CHARS override
|
|
79
|
+
if let Ok(max_chars_str) = std::env::var("KREUZBERG_CHUNKING_MAX_CHARS") {
|
|
80
|
+
let max_chars: usize = max_chars_str.parse().map_err(|_| KreuzbergError::Validation {
|
|
81
|
+
message: format!(
|
|
82
|
+
"Invalid value for KREUZBERG_CHUNKING_MAX_CHARS: '{}'. Must be a positive integer.",
|
|
83
|
+
max_chars_str
|
|
84
|
+
),
|
|
85
|
+
source: None,
|
|
86
|
+
})?;
|
|
87
|
+
|
|
88
|
+
if max_chars == 0 {
|
|
89
|
+
return Err(KreuzbergError::Validation {
|
|
90
|
+
message: "KREUZBERG_CHUNKING_MAX_CHARS must be greater than 0".to_string(),
|
|
91
|
+
source: None,
|
|
92
|
+
});
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if self.chunking.is_none() {
|
|
96
|
+
self.chunking = Some(ChunkingConfig {
|
|
97
|
+
max_chars: 1000,
|
|
98
|
+
max_overlap: 200,
|
|
99
|
+
embedding: None,
|
|
100
|
+
preset: None,
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
if let Some(ref mut chunking) = self.chunking {
|
|
105
|
+
// Validate against current overlap before updating
|
|
106
|
+
validate_chunking_params(max_chars, chunking.max_overlap)?;
|
|
107
|
+
chunking.max_chars = max_chars;
|
|
108
|
+
}
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// KREUZBERG_CHUNKING_MAX_OVERLAP override
|
|
112
|
+
if let Ok(max_overlap_str) = std::env::var("KREUZBERG_CHUNKING_MAX_OVERLAP") {
|
|
113
|
+
let max_overlap: usize = max_overlap_str.parse().map_err(|_| KreuzbergError::Validation {
|
|
114
|
+
message: format!(
|
|
115
|
+
"Invalid value for KREUZBERG_CHUNKING_MAX_OVERLAP: '{}'. Must be a non-negative integer.",
|
|
116
|
+
max_overlap_str
|
|
117
|
+
),
|
|
118
|
+
source: None,
|
|
119
|
+
})?;
|
|
120
|
+
|
|
121
|
+
if self.chunking.is_none() {
|
|
122
|
+
self.chunking = Some(ChunkingConfig {
|
|
123
|
+
max_chars: 1000,
|
|
124
|
+
max_overlap: 200,
|
|
125
|
+
embedding: None,
|
|
126
|
+
preset: None,
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
if let Some(ref mut chunking) = self.chunking {
|
|
131
|
+
// Validate against current max_chars before updating
|
|
132
|
+
validate_chunking_params(chunking.max_chars, max_overlap)?;
|
|
133
|
+
chunking.max_overlap = max_overlap;
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// KREUZBERG_CACHE_ENABLED override
|
|
138
|
+
if let Ok(cache_str) = std::env::var("KREUZBERG_CACHE_ENABLED") {
|
|
139
|
+
let cache_enabled = match cache_str.to_lowercase().as_str() {
|
|
140
|
+
"true" => true,
|
|
141
|
+
"false" => false,
|
|
142
|
+
_ => {
|
|
143
|
+
return Err(KreuzbergError::Validation {
|
|
144
|
+
message: format!(
|
|
145
|
+
"Invalid value for KREUZBERG_CACHE_ENABLED: '{}'. Must be 'true' or 'false'.",
|
|
146
|
+
cache_str
|
|
147
|
+
),
|
|
148
|
+
source: None,
|
|
149
|
+
});
|
|
150
|
+
}
|
|
151
|
+
};
|
|
152
|
+
self.use_cache = cache_enabled;
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
// KREUZBERG_TOKEN_REDUCTION_MODE override
|
|
156
|
+
if let Ok(mode) = std::env::var("KREUZBERG_TOKEN_REDUCTION_MODE") {
|
|
157
|
+
validate_token_reduction_level(&mode)?;
|
|
158
|
+
if self.token_reduction.is_none() {
|
|
159
|
+
self.token_reduction = Some(TokenReductionConfig {
|
|
160
|
+
mode: "off".to_string(),
|
|
161
|
+
preserve_important_words: true,
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
if let Some(ref mut token_reduction) = self.token_reduction {
|
|
165
|
+
token_reduction.mode = mode;
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
// KREUZBERG_OUTPUT_FORMAT override
|
|
170
|
+
if let Ok(val) = std::env::var("KREUZBERG_OUTPUT_FORMAT") {
|
|
171
|
+
self.output_format = val.parse().map_err(|e: String| KreuzbergError::Validation {
|
|
172
|
+
message: format!("Invalid value for KREUZBERG_OUTPUT_FORMAT: {}", e),
|
|
173
|
+
source: None,
|
|
174
|
+
})?;
|
|
175
|
+
}
|
|
176
|
+
|
|
177
|
+
Ok(())
|
|
178
|
+
}
|
|
179
|
+
}
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
//! Configuration file loading with caching support.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides methods for loading extraction configuration from various
|
|
4
|
+
//! file formats (TOML, YAML, JSON) with automatic caching based on file modification times.
|
|
5
|
+
|
|
6
|
+
use crate::{KreuzbergError, Result};
|
|
7
|
+
use dashmap::DashMap;
|
|
8
|
+
use std::path::{Path, PathBuf};
|
|
9
|
+
use std::sync::{Arc, LazyLock};
|
|
10
|
+
use std::time::SystemTime;
|
|
11
|
+
|
|
12
|
+
use super::core::ExtractionConfig;
|
|
13
|
+
|
|
14
|
+
/// Process-wide cache of parsed configuration files, created lazily on first use.
///
/// Keys are the paths exactly as handed to the loaders in this module. Each value pairs the
/// file's modification time at load with the parsed configuration; a loader reuses an entry
/// only while the file's current modification time equals the stored one. Nothing in this
/// module removes entries.
static CONFIG_CACHE: LazyLock<DashMap<PathBuf, (SystemTime, Arc<ExtractionConfig>)>> = LazyLock::new(DashMap::new);
|
|
15
|
+
|
|
16
|
+
impl ExtractionConfig {
|
|
17
|
+
/// Load configuration from a TOML file.
|
|
18
|
+
///
|
|
19
|
+
/// # Arguments
|
|
20
|
+
///
|
|
21
|
+
/// * `path` - Path to the TOML file
|
|
22
|
+
///
|
|
23
|
+
/// # Errors
|
|
24
|
+
///
|
|
25
|
+
/// Returns `KreuzbergError::Validation` if file doesn't exist or is invalid TOML.
|
|
26
|
+
pub fn from_toml_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
27
|
+
let path = path.as_ref();
|
|
28
|
+
|
|
29
|
+
let metadata = std::fs::metadata(path)
|
|
30
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
31
|
+
let mtime = metadata.modified().map_err(|e| {
|
|
32
|
+
KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
|
|
33
|
+
})?;
|
|
34
|
+
|
|
35
|
+
if let Some(entry) = CONFIG_CACHE.get(path)
|
|
36
|
+
&& entry.0 == mtime
|
|
37
|
+
{
|
|
38
|
+
return Ok((*entry.1).clone());
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
let content = std::fs::read_to_string(path)
|
|
42
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
43
|
+
|
|
44
|
+
let config: Self = toml::from_str(&content)
|
|
45
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid TOML in {}: {}", path.display(), e)))?;
|
|
46
|
+
|
|
47
|
+
let config_arc = Arc::new(config.clone());
|
|
48
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
49
|
+
|
|
50
|
+
Ok(config)
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/// Load configuration from a YAML file.
|
|
54
|
+
pub fn from_yaml_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
55
|
+
let path = path.as_ref();
|
|
56
|
+
|
|
57
|
+
let metadata = std::fs::metadata(path)
|
|
58
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
59
|
+
let mtime = metadata.modified().map_err(|e| {
|
|
60
|
+
KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
|
|
61
|
+
})?;
|
|
62
|
+
|
|
63
|
+
if let Some(entry) = CONFIG_CACHE.get(path)
|
|
64
|
+
&& entry.0 == mtime
|
|
65
|
+
{
|
|
66
|
+
return Ok((*entry.1).clone());
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
let content = std::fs::read_to_string(path)
|
|
70
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
71
|
+
|
|
72
|
+
let config: Self = serde_yaml_ng::from_str(&content)
|
|
73
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid YAML in {}: {}", path.display(), e)))?;
|
|
74
|
+
|
|
75
|
+
let config_arc = Arc::new(config.clone());
|
|
76
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
77
|
+
|
|
78
|
+
Ok(config)
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Load configuration from a JSON file.
|
|
82
|
+
pub fn from_json_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
83
|
+
let path = path.as_ref();
|
|
84
|
+
|
|
85
|
+
let metadata = std::fs::metadata(path)
|
|
86
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
87
|
+
let mtime = metadata.modified().map_err(|e| {
|
|
88
|
+
KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
|
|
89
|
+
})?;
|
|
90
|
+
|
|
91
|
+
if let Some(entry) = CONFIG_CACHE.get(path)
|
|
92
|
+
&& entry.0 == mtime
|
|
93
|
+
{
|
|
94
|
+
return Ok((*entry.1).clone());
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
let content = std::fs::read_to_string(path)
|
|
98
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
99
|
+
|
|
100
|
+
let config: Self = serde_json::from_str(&content)
|
|
101
|
+
.map_err(|e| KreuzbergError::validation(format!("Invalid JSON in {}: {}", path.display(), e)))?;
|
|
102
|
+
|
|
103
|
+
let config_arc = Arc::new(config.clone());
|
|
104
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
105
|
+
|
|
106
|
+
Ok(config)
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/// Load configuration from a file, auto-detecting format by extension.
|
|
110
|
+
///
|
|
111
|
+
/// Supported formats:
|
|
112
|
+
/// - `.toml` - TOML format
|
|
113
|
+
/// - `.yaml` - YAML format
|
|
114
|
+
/// - `.json` - JSON format
|
|
115
|
+
///
|
|
116
|
+
/// # Arguments
|
|
117
|
+
///
|
|
118
|
+
/// * `path` - Path to the configuration file
|
|
119
|
+
///
|
|
120
|
+
/// # Errors
|
|
121
|
+
///
|
|
122
|
+
/// Returns `KreuzbergError::Validation` if:
|
|
123
|
+
/// - File doesn't exist
|
|
124
|
+
/// - File extension is not supported
|
|
125
|
+
/// - File content is invalid for the detected format
|
|
126
|
+
///
|
|
127
|
+
/// # Example
|
|
128
|
+
///
|
|
129
|
+
/// ```rust
|
|
130
|
+
/// use kreuzberg::core::config::ExtractionConfig;
|
|
131
|
+
///
|
|
132
|
+
/// // Auto-detects TOML format
|
|
133
|
+
/// // let config = ExtractionConfig::from_file("kreuzberg.toml")?;
|
|
134
|
+
///
|
|
135
|
+
/// // Auto-detects YAML format
|
|
136
|
+
/// // let config = ExtractionConfig::from_file("kreuzberg.yaml")?;
|
|
137
|
+
/// ```
|
|
138
|
+
pub fn from_file(path: impl AsRef<Path>) -> Result<Self> {
|
|
139
|
+
let path = path.as_ref();
|
|
140
|
+
|
|
141
|
+
let metadata = std::fs::metadata(path)
|
|
142
|
+
.map_err(|e| KreuzbergError::validation(format!("Failed to read config file {}: {}", path.display(), e)))?;
|
|
143
|
+
let mtime = metadata.modified().map_err(|e| {
|
|
144
|
+
KreuzbergError::validation(format!("Failed to get modification time for {}: {}", path.display(), e))
|
|
145
|
+
})?;
|
|
146
|
+
|
|
147
|
+
if let Some(entry) = CONFIG_CACHE.get(path)
|
|
148
|
+
&& entry.0 == mtime
|
|
149
|
+
{
|
|
150
|
+
return Ok((*entry.1).clone());
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
let extension = path.extension().and_then(|ext| ext.to_str()).ok_or_else(|| {
|
|
154
|
+
KreuzbergError::validation(format!(
|
|
155
|
+
"Cannot determine file format: no extension found in {}",
|
|
156
|
+
path.display()
|
|
157
|
+
))
|
|
158
|
+
})?;
|
|
159
|
+
|
|
160
|
+
let config = match extension.to_lowercase().as_str() {
|
|
161
|
+
"toml" => Self::from_toml_file(path)?,
|
|
162
|
+
"yaml" | "yml" => Self::from_yaml_file(path)?,
|
|
163
|
+
"json" => Self::from_json_file(path)?,
|
|
164
|
+
_ => {
|
|
165
|
+
return Err(KreuzbergError::validation(format!(
|
|
166
|
+
"Unsupported config file format: .{}. Supported formats: .toml, .yaml, .json",
|
|
167
|
+
extension
|
|
168
|
+
)));
|
|
169
|
+
}
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
let config_arc = Arc::new(config.clone());
|
|
173
|
+
CONFIG_CACHE.insert(path.to_path_buf(), (mtime, config_arc));
|
|
174
|
+
|
|
175
|
+
Ok(config)
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
/// Discover configuration file in parent directories.
|
|
179
|
+
///
|
|
180
|
+
/// Searches for `kreuzberg.toml` in current directory and parent directories.
|
|
181
|
+
///
|
|
182
|
+
/// # Returns
|
|
183
|
+
///
|
|
184
|
+
/// - `Some(config)` if found
|
|
185
|
+
/// - `None` if no config file found
|
|
186
|
+
pub fn discover() -> Result<Option<Self>> {
|
|
187
|
+
let mut current = std::env::current_dir().map_err(KreuzbergError::Io)?;
|
|
188
|
+
|
|
189
|
+
loop {
|
|
190
|
+
let kreuzberg_toml = current.join("kreuzberg.toml");
|
|
191
|
+
if kreuzberg_toml.exists() {
|
|
192
|
+
return Ok(Some(Self::from_toml_file(kreuzberg_toml)?));
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
if let Some(parent) = current.parent() {
|
|
196
|
+
current = parent.to_path_buf();
|
|
197
|
+
} else {
|
|
198
|
+
break;
|
|
199
|
+
}
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
Ok(None)
|
|
203
|
+
}
|
|
204
|
+
}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
//! Main extraction configuration and environment variable handling.
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains the main `ExtractionConfig` struct and related utilities
|
|
4
|
+
//! for loading configuration from files and applying environment variable overrides.
|
|
5
|
+
//!
|
|
6
|
+
//! The module is organized into focused submodules:
|
|
7
|
+
//! - `types`: Feature-specific configuration types (image, token reduction, language detection)
|
|
8
|
+
//! - `core`: Main ExtractionConfig struct and implementation
|
|
9
|
+
//! - `env`: Environment variable override support
|
|
10
|
+
//! - `loaders`: Configuration file loading with caching
|
|
11
|
+
|
|
12
|
+
mod core;
|
|
13
|
+
mod env;
|
|
14
|
+
mod loaders;
|
|
15
|
+
mod types;
|
|
16
|
+
|
|
17
|
+
// Re-export all public types for backward compatibility
|
|
18
|
+
pub use self::core::ExtractionConfig;
|
|
19
|
+
pub use self::types::{ImageExtractionConfig, LanguageDetectionConfig, TokenReductionConfig};
|
|
20
|
+
|
|
21
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::config::ocr::OcrConfig;

    /// A default configuration has caching and quality processing on and no OCR section.
    #[test]
    fn test_default_config() {
        let defaults = ExtractionConfig::default();

        assert!(defaults.use_cache);
        assert!(defaults.enable_quality_processing);
        assert!(defaults.ocr.is_none());
    }

    /// Image processing is reported as needed once an OCR section is present.
    #[test]
    fn test_needs_image_processing() {
        let mut cfg = ExtractionConfig::default();
        assert!(!cfg.needs_image_processing());

        cfg.ocr = Some(OcrConfig::default());
        assert!(cfg.needs_image_processing());
    }
}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
//! Feature-specific configuration types for extraction.
|
|
2
|
+
//!
|
|
3
|
+
//! This module contains configuration structs for specific extraction features:
|
|
4
|
+
//! - Image extraction and processing
|
|
5
|
+
//! - Token reduction
|
|
6
|
+
//! - Language detection
|
|
7
|
+
|
|
8
|
+
use serde::{Deserialize, Serialize};
|
|
9
|
+
|
|
10
|
+
/// Image extraction configuration.
|
|
11
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
12
|
+
pub struct ImageExtractionConfig {
|
|
13
|
+
/// Extract images from documents
|
|
14
|
+
#[serde(default = "default_true")]
|
|
15
|
+
pub extract_images: bool,
|
|
16
|
+
|
|
17
|
+
/// Target DPI for image normalization
|
|
18
|
+
#[serde(default = "default_target_dpi")]
|
|
19
|
+
pub target_dpi: i32,
|
|
20
|
+
|
|
21
|
+
/// Maximum dimension for images (width or height)
|
|
22
|
+
#[serde(default = "default_max_dimension")]
|
|
23
|
+
pub max_image_dimension: i32,
|
|
24
|
+
|
|
25
|
+
/// Automatically adjust DPI based on image content
|
|
26
|
+
#[serde(default = "default_true")]
|
|
27
|
+
pub auto_adjust_dpi: bool,
|
|
28
|
+
|
|
29
|
+
/// Minimum DPI threshold
|
|
30
|
+
#[serde(default = "default_min_dpi")]
|
|
31
|
+
pub min_dpi: i32,
|
|
32
|
+
|
|
33
|
+
/// Maximum DPI threshold
|
|
34
|
+
#[serde(default = "default_max_dpi")]
|
|
35
|
+
pub max_dpi: i32,
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
/// Token reduction configuration.
|
|
39
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
40
|
+
pub struct TokenReductionConfig {
|
|
41
|
+
/// Reduction mode: "off", "light", "moderate", "aggressive", "maximum"
|
|
42
|
+
#[serde(default = "default_reduction_mode")]
|
|
43
|
+
pub mode: String,
|
|
44
|
+
|
|
45
|
+
/// Preserve important words (capitalized, technical terms)
|
|
46
|
+
#[serde(default = "default_true")]
|
|
47
|
+
pub preserve_important_words: bool,
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/// Language detection configuration.
|
|
51
|
+
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
52
|
+
pub struct LanguageDetectionConfig {
|
|
53
|
+
/// Enable language detection
|
|
54
|
+
#[serde(default = "default_true")]
|
|
55
|
+
pub enabled: bool,
|
|
56
|
+
|
|
57
|
+
/// Minimum confidence threshold (0.0-1.0)
|
|
58
|
+
#[serde(default = "default_confidence")]
|
|
59
|
+
pub min_confidence: f64,
|
|
60
|
+
|
|
61
|
+
/// Detect multiple languages in the document
|
|
62
|
+
#[serde(default)]
|
|
63
|
+
pub detect_multiple: bool,
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
// Default value functions
|
|
67
|
+
/// Serde default for boolean options that are switched on unless a config says otherwise.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
|
|
70
|
+
|
|
71
|
+
/// Serde default for `ImageExtractionConfig::target_dpi`.
fn default_target_dpi() -> i32 {
    const TARGET_DPI: i32 = 300;
    TARGET_DPI
}
|
|
74
|
+
|
|
75
|
+
/// Serde default for `ImageExtractionConfig::max_image_dimension`.
fn default_max_dimension() -> i32 {
    const MAX_DIMENSION: i32 = 4096;
    MAX_DIMENSION
}
|
|
78
|
+
|
|
79
|
+
/// Serde default for `ImageExtractionConfig::min_dpi`.
fn default_min_dpi() -> i32 {
    const MIN_DPI: i32 = 72;
    MIN_DPI
}
|
|
82
|
+
|
|
83
|
+
/// Serde default for `ImageExtractionConfig::max_dpi`.
fn default_max_dpi() -> i32 {
    const MAX_DPI: i32 = 600;
    MAX_DPI
}
|
|
86
|
+
|
|
87
|
+
/// Serde default for `TokenReductionConfig::mode`: the "off" mode.
fn default_reduction_mode() -> String {
    String::from("off")
}
|
|
90
|
+
|
|
91
|
+
/// Serde default for `LanguageDetectionConfig::min_confidence`.
fn default_confidence() -> f64 {
    const MIN_CONFIDENCE: f64 = 0.8;
    MIN_CONFIDENCE
}
|