kreuzberg 4.0.8 → 4.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +99 -2
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/spec/fixtures/config.toml +1 -1
- data/spec/fixtures/config.yaml +1 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +5 -4
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mime.rs +15 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +388 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/pptx_regression_tests.rs +504 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +201 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,318 @@
|
|
|
1
|
+
//! HTML options parsing from JSON
|
|
2
|
+
//!
|
|
3
|
+
//! Handles the complex nested structure of HTML conversion options.
|
|
4
|
+
|
|
5
|
+
use html_to_markdown_rs::options::{
|
|
6
|
+
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle, PreprocessingPreset,
|
|
7
|
+
WhitespaceMode,
|
|
8
|
+
};
|
|
9
|
+
|
|
10
|
+
type FfiResult<T> = std::result::Result<T, String>;
|
|
11
|
+
|
|
12
|
+
/// Parse enum value from optional JSON value
|
|
13
|
+
fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
|
|
14
|
+
where
|
|
15
|
+
F: Fn(&str) -> FfiResult<T>,
|
|
16
|
+
{
|
|
17
|
+
if let Some(raw) = value {
|
|
18
|
+
let text = raw
|
|
19
|
+
.as_str()
|
|
20
|
+
.ok_or_else(|| "Expected string for enum field".to_string())?;
|
|
21
|
+
return parse_fn(text).map(Some);
|
|
22
|
+
}
|
|
23
|
+
Ok(None)
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/// Parse HeadingStyle from string
|
|
27
|
+
fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
|
|
28
|
+
match value.to_lowercase().as_str() {
|
|
29
|
+
"atx" => Ok(HeadingStyle::Atx),
|
|
30
|
+
"underlined" => Ok(HeadingStyle::Underlined),
|
|
31
|
+
"atx_closed" => Ok(HeadingStyle::AtxClosed),
|
|
32
|
+
other => Err(format!(
|
|
33
|
+
"Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
|
|
34
|
+
other
|
|
35
|
+
)),
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/// Parse ListIndentType from string
|
|
40
|
+
fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
|
|
41
|
+
match value.to_lowercase().as_str() {
|
|
42
|
+
"spaces" => Ok(ListIndentType::Spaces),
|
|
43
|
+
"tabs" => Ok(ListIndentType::Tabs),
|
|
44
|
+
other => Err(format!(
|
|
45
|
+
"Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
|
|
46
|
+
other
|
|
47
|
+
)),
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
/// Parse HighlightStyle from string
|
|
52
|
+
fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
|
|
53
|
+
match value.to_lowercase().as_str() {
|
|
54
|
+
"double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
|
|
55
|
+
"html" => Ok(HighlightStyle::Html),
|
|
56
|
+
"bold" => Ok(HighlightStyle::Bold),
|
|
57
|
+
"none" => Ok(HighlightStyle::None),
|
|
58
|
+
other => Err(format!(
|
|
59
|
+
"Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
|
|
60
|
+
other
|
|
61
|
+
)),
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
/// Parse WhitespaceMode from string
|
|
66
|
+
fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
|
|
67
|
+
match value.to_lowercase().as_str() {
|
|
68
|
+
"normalized" => Ok(WhitespaceMode::Normalized),
|
|
69
|
+
"strict" => Ok(WhitespaceMode::Strict),
|
|
70
|
+
other => Err(format!(
|
|
71
|
+
"Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
|
|
72
|
+
other
|
|
73
|
+
)),
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/// Parse NewlineStyle from string
|
|
78
|
+
fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
|
|
79
|
+
match value.to_lowercase().as_str() {
|
|
80
|
+
"spaces" => Ok(NewlineStyle::Spaces),
|
|
81
|
+
"backslash" => Ok(NewlineStyle::Backslash),
|
|
82
|
+
other => Err(format!(
|
|
83
|
+
"Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
|
|
84
|
+
other
|
|
85
|
+
)),
|
|
86
|
+
}
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
/// Parse CodeBlockStyle from string
|
|
90
|
+
fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
|
|
91
|
+
match value.to_lowercase().as_str() {
|
|
92
|
+
"indented" => Ok(CodeBlockStyle::Indented),
|
|
93
|
+
"backticks" => Ok(CodeBlockStyle::Backticks),
|
|
94
|
+
"tildes" => Ok(CodeBlockStyle::Tildes),
|
|
95
|
+
other => Err(format!(
|
|
96
|
+
"Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
|
|
97
|
+
other
|
|
98
|
+
)),
|
|
99
|
+
}
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
/// Parse PreprocessingPreset from string
|
|
103
|
+
#[allow(dead_code)]
|
|
104
|
+
fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
|
|
105
|
+
match value.to_lowercase().as_str() {
|
|
106
|
+
"minimal" => Ok(PreprocessingPreset::Minimal),
|
|
107
|
+
"standard" => Ok(PreprocessingPreset::Standard),
|
|
108
|
+
"aggressive" => Ok(PreprocessingPreset::Aggressive),
|
|
109
|
+
other => Err(format!(
|
|
110
|
+
"Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
|
|
111
|
+
other
|
|
112
|
+
)),
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/// Parse HTML conversion options from JSON value
|
|
117
|
+
pub fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
|
|
118
|
+
let mut opts = ConversionOptions::default();
|
|
119
|
+
let obj = value
|
|
120
|
+
.as_object()
|
|
121
|
+
.ok_or_else(|| "html_options must be an object".to_string())?;
|
|
122
|
+
|
|
123
|
+
if let Some(val) = obj.get("heading_style") {
|
|
124
|
+
opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
if let Some(val) = obj.get("list_indent_type") {
|
|
128
|
+
opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
if let Some(val) = obj.get("list_indent_width") {
|
|
132
|
+
opts.list_indent_width = val
|
|
133
|
+
.as_u64()
|
|
134
|
+
.map(|v| v as usize)
|
|
135
|
+
.ok_or_else(|| "list_indent_width must be an integer".to_string())?;
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
if let Some(val) = obj.get("bullets") {
|
|
139
|
+
opts.bullets = val
|
|
140
|
+
.as_str()
|
|
141
|
+
.map(str::to_string)
|
|
142
|
+
.ok_or_else(|| "bullets must be a string".to_string())?;
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
if let Some(val) = obj.get("strong_em_symbol") {
|
|
146
|
+
let symbol = val
|
|
147
|
+
.as_str()
|
|
148
|
+
.ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
|
|
149
|
+
let mut chars = symbol.chars();
|
|
150
|
+
opts.strong_em_symbol = chars
|
|
151
|
+
.next()
|
|
152
|
+
.ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
if let Some(val) = obj.get("escape_asterisks") {
|
|
156
|
+
opts.escape_asterisks = val
|
|
157
|
+
.as_bool()
|
|
158
|
+
.ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if let Some(val) = obj.get("escape_underscores") {
|
|
162
|
+
opts.escape_underscores = val
|
|
163
|
+
.as_bool()
|
|
164
|
+
.ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
if let Some(val) = obj.get("escape_misc") {
|
|
168
|
+
opts.escape_misc = val
|
|
169
|
+
.as_bool()
|
|
170
|
+
.ok_or_else(|| "escape_misc must be a boolean".to_string())?;
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
if let Some(val) = obj.get("escape_ascii") {
|
|
174
|
+
opts.escape_ascii = val
|
|
175
|
+
.as_bool()
|
|
176
|
+
.ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
if let Some(val) = obj.get("code_language") {
|
|
180
|
+
opts.code_language = val
|
|
181
|
+
.as_str()
|
|
182
|
+
.map(str::to_string)
|
|
183
|
+
.ok_or_else(|| "code_language must be a string".to_string())?;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
if let Some(val) = obj.get("autolinks") {
|
|
187
|
+
opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
if let Some(val) = obj.get("default_title") {
|
|
191
|
+
opts.default_title = val
|
|
192
|
+
.as_bool()
|
|
193
|
+
.ok_or_else(|| "default_title must be a boolean".to_string())?;
|
|
194
|
+
}
|
|
195
|
+
|
|
196
|
+
if let Some(val) = obj.get("br_in_tables") {
|
|
197
|
+
opts.br_in_tables = val
|
|
198
|
+
.as_bool()
|
|
199
|
+
.ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
if let Some(val) = obj.get("hocr_spatial_tables") {
|
|
203
|
+
opts.hocr_spatial_tables = val
|
|
204
|
+
.as_bool()
|
|
205
|
+
.ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
if let Some(val) = obj.get("highlight_style") {
|
|
209
|
+
opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
if let Some(val) = obj.get("extract_metadata") {
|
|
213
|
+
opts.extract_metadata = val
|
|
214
|
+
.as_bool()
|
|
215
|
+
.ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
if let Some(val) = obj.get("whitespace_mode") {
|
|
219
|
+
opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
if let Some(val) = obj.get("strip_newlines") {
|
|
223
|
+
opts.strip_newlines = val
|
|
224
|
+
.as_bool()
|
|
225
|
+
.ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
if let Some(val) = obj.get("wrap") {
|
|
229
|
+
opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
if let Some(val) = obj.get("wrap_width") {
|
|
233
|
+
opts.wrap_width = val
|
|
234
|
+
.as_u64()
|
|
235
|
+
.map(|v| v as usize)
|
|
236
|
+
.ok_or_else(|| "wrap_width must be an integer".to_string())?;
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
if let Some(val) = obj.get("convert_as_inline") {
|
|
240
|
+
opts.convert_as_inline = val
|
|
241
|
+
.as_bool()
|
|
242
|
+
.ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
|
|
243
|
+
}
|
|
244
|
+
|
|
245
|
+
if let Some(val) = obj.get("sub_symbol") {
|
|
246
|
+
opts.sub_symbol = val
|
|
247
|
+
.as_str()
|
|
248
|
+
.map(str::to_string)
|
|
249
|
+
.ok_or_else(|| "sub_symbol must be a string".to_string())?;
|
|
250
|
+
}
|
|
251
|
+
|
|
252
|
+
if let Some(val) = obj.get("sup_symbol") {
|
|
253
|
+
opts.sup_symbol = val
|
|
254
|
+
.as_str()
|
|
255
|
+
.map(str::to_string)
|
|
256
|
+
.ok_or_else(|| "sup_symbol must be a string".to_string())?;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
if let Some(val) = obj.get("newline_style") {
|
|
260
|
+
opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
if let Some(val) = obj.get("code_block_style") {
|
|
264
|
+
opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
if let Some(val) = obj.get("keep_inline_images_in") {
|
|
268
|
+
opts.keep_inline_images_in = val
|
|
269
|
+
.as_array()
|
|
270
|
+
.ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
|
|
271
|
+
.iter()
|
|
272
|
+
.map(|v| {
|
|
273
|
+
v.as_str()
|
|
274
|
+
.map(str::to_string)
|
|
275
|
+
.ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
|
|
276
|
+
})
|
|
277
|
+
.collect::<FfiResult<Vec<_>>>()?;
|
|
278
|
+
}
|
|
279
|
+
|
|
280
|
+
if let Some(val) = obj.get("encoding") {
|
|
281
|
+
opts.encoding = val
|
|
282
|
+
.as_str()
|
|
283
|
+
.map(str::to_string)
|
|
284
|
+
.ok_or_else(|| "encoding must be a string".to_string())?;
|
|
285
|
+
}
|
|
286
|
+
|
|
287
|
+
if let Some(val) = obj.get("debug") {
|
|
288
|
+
opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
|
|
289
|
+
}
|
|
290
|
+
|
|
291
|
+
if let Some(val) = obj.get("strip_tags") {
|
|
292
|
+
opts.strip_tags = val
|
|
293
|
+
.as_array()
|
|
294
|
+
.ok_or_else(|| "strip_tags must be an array".to_string())?
|
|
295
|
+
.iter()
|
|
296
|
+
.map(|v| {
|
|
297
|
+
v.as_str()
|
|
298
|
+
.map(str::to_string)
|
|
299
|
+
.ok_or_else(|| "strip_tags entries must be strings".to_string())
|
|
300
|
+
})
|
|
301
|
+
.collect::<FfiResult<Vec<_>>>()?;
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
if let Some(val) = obj.get("preserve_tags") {
|
|
305
|
+
opts.preserve_tags = val
|
|
306
|
+
.as_array()
|
|
307
|
+
.ok_or_else(|| "preserve_tags must be an array".to_string())?
|
|
308
|
+
.iter()
|
|
309
|
+
.map(|v| {
|
|
310
|
+
v.as_str()
|
|
311
|
+
.map(str::to_string)
|
|
312
|
+
.ok_or_else(|| "preserve_tags entries must be strings".to_string())
|
|
313
|
+
})
|
|
314
|
+
.collect::<FfiResult<Vec<_>>>()?;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
Ok(opts)
|
|
318
|
+
}
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
//! Configuration loading from files
|
|
2
|
+
//!
|
|
3
|
+
//! Handles loading ExtractionConfig from TOML/JSON/YAML files and discovery.
|
|
4
|
+
|
|
5
|
+
use crate::helpers::set_last_error;
|
|
6
|
+
use kreuzberg::KreuzbergError;
|
|
7
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
8
|
+
use std::path::Path;
|
|
9
|
+
|
|
10
|
+
/// Load an ExtractionConfig from a file (returns JSON string).
|
|
11
|
+
///
|
|
12
|
+
/// # Arguments
|
|
13
|
+
///
|
|
14
|
+
/// * `file_path` - Path to the configuration file
|
|
15
|
+
///
|
|
16
|
+
/// # Returns
|
|
17
|
+
///
|
|
18
|
+
/// JSON string representation of the config, or error message.
|
|
19
|
+
pub fn load_config_as_json(file_path: &str) -> Result<String, String> {
|
|
20
|
+
match ExtractionConfig::from_file(file_path) {
|
|
21
|
+
Ok(config) => match serde_json::to_string(&config) {
|
|
22
|
+
Ok(json) => Ok(json),
|
|
23
|
+
Err(e) => Err(format!("Failed to serialize config to JSON: {}", e)),
|
|
24
|
+
},
|
|
25
|
+
Err(e) => Err(e.to_string()),
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/// Load an ExtractionConfig from a file (returns config struct).
|
|
30
|
+
///
|
|
31
|
+
/// # Arguments
|
|
32
|
+
///
|
|
33
|
+
/// * `path` - Path to the configuration file
|
|
34
|
+
///
|
|
35
|
+
/// # Returns
|
|
36
|
+
///
|
|
37
|
+
/// ExtractionConfig on success, or error message.
|
|
38
|
+
pub fn load_config_from_file(path: &Path) -> Result<ExtractionConfig, String> {
|
|
39
|
+
match ExtractionConfig::from_file(path) {
|
|
40
|
+
Ok(config) => Ok(config),
|
|
41
|
+
Err(e) => match &e {
|
|
42
|
+
KreuzbergError::Io(io_err) => Err(format!("IO error loading config: {}", io_err)),
|
|
43
|
+
_ => Err(format!("Failed to load config from file: {}", e)),
|
|
44
|
+
},
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
/// Discover and load an ExtractionConfig (returns JSON string).
|
|
49
|
+
///
|
|
50
|
+
/// Searches the current directory and all parent directories for:
|
|
51
|
+
/// - `kreuzberg.toml`
|
|
52
|
+
/// - `kreuzberg.json`
|
|
53
|
+
///
|
|
54
|
+
/// # Returns
|
|
55
|
+
///
|
|
56
|
+
/// JSON string of the first config file found, or None if not found.
|
|
57
|
+
pub fn discover_config_as_json() -> Option<String> {
|
|
58
|
+
match ExtractionConfig::discover() {
|
|
59
|
+
Ok(Some(config)) => match serde_json::to_string(&config) {
|
|
60
|
+
Ok(json) => Some(json),
|
|
61
|
+
Err(e) => {
|
|
62
|
+
set_last_error(format!("Failed to serialize config: {}", e));
|
|
63
|
+
None
|
|
64
|
+
}
|
|
65
|
+
},
|
|
66
|
+
Ok(None) => None,
|
|
67
|
+
Err(e) => {
|
|
68
|
+
match &e {
|
|
69
|
+
KreuzbergError::Io(io_err) => {
|
|
70
|
+
set_last_error(format!("IO error discovering config: {}", io_err));
|
|
71
|
+
}
|
|
72
|
+
_ => {
|
|
73
|
+
set_last_error(format!("Failed to discover config: {}", e));
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
None
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// List available embedding preset names.
|
|
82
|
+
///
|
|
83
|
+
/// # Returns
|
|
84
|
+
///
|
|
85
|
+
/// JSON array of preset names, or error message.
|
|
86
|
+
pub fn list_embedding_presets() -> Result<String, String> {
|
|
87
|
+
let presets = kreuzberg::embeddings::list_presets();
|
|
88
|
+
match serde_json::to_string(&presets) {
|
|
89
|
+
Ok(json) => Ok(json),
|
|
90
|
+
Err(e) => Err(format!("Failed to serialize presets: {}", e)),
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
/// Get a specific embedding preset by name.
|
|
95
|
+
///
|
|
96
|
+
/// # Arguments
|
|
97
|
+
///
|
|
98
|
+
/// * `preset_name` - Name of the preset to retrieve
|
|
99
|
+
///
|
|
100
|
+
/// # Returns
|
|
101
|
+
///
|
|
102
|
+
/// JSON representation of the preset, or error message.
|
|
103
|
+
pub fn get_embedding_preset(preset_name: &str) -> Result<String, String> {
|
|
104
|
+
let preset = match kreuzberg::embeddings::get_preset(preset_name) {
|
|
105
|
+
Some(preset) => preset,
|
|
106
|
+
None => {
|
|
107
|
+
return Err(format!("Unknown embedding preset: {}", preset_name));
|
|
108
|
+
}
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
let model_name = format!("{:?}", preset.model);
|
|
112
|
+
let serializable = super::serialize::SerializableEmbeddingPreset {
|
|
113
|
+
name: preset.name,
|
|
114
|
+
chunk_size: preset.chunk_size,
|
|
115
|
+
overlap: preset.overlap,
|
|
116
|
+
model_name,
|
|
117
|
+
dimensions: preset.dimensions,
|
|
118
|
+
description: preset.description,
|
|
119
|
+
};
|
|
120
|
+
|
|
121
|
+
match serde_json::to_string(&serializable) {
|
|
122
|
+
Ok(json) => Ok(json),
|
|
123
|
+
Err(e) => Err(format!("Failed to serialize embedding preset: {}", e)),
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// The preset listing must serialize to a JSON array.
    #[test]
    fn test_list_embedding_presets() {
        let outcome = list_embedding_presets();
        assert!(outcome.is_ok());

        let payload = outcome.unwrap();
        assert!(payload.starts_with('['));
        assert!(payload.ends_with(']'));
    }

    /// A name that matches no preset must be reported as an error.
    #[test]
    fn test_get_embedding_preset_unknown() {
        let outcome = get_embedding_preset("nonexistent_preset");
        assert!(outcome.is_err());
    }

    /// The "fast" preset must resolve and expose the expected JSON keys.
    #[test]
    fn test_get_embedding_preset_valid() {
        let outcome = get_embedding_preset("fast");
        assert!(outcome.is_ok());

        let payload = outcome.unwrap();
        assert!(payload.contains("name"));
        assert!(payload.contains("chunk_size"));
    }
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
//! Configuration merging logic
|
|
2
|
+
//!
|
|
3
|
+
//! Provides functionality to merge two ExtractionConfig instances.
|
|
4
|
+
|
|
5
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
6
|
+
|
|
7
|
+
/// Merge two configs (override takes precedence over base).
|
|
8
|
+
///
|
|
9
|
+
/// Performs a shallow merge where fields from `override_config` take
|
|
10
|
+
/// precedence over fields in `base`. The `base` config is modified in-place.
|
|
11
|
+
///
|
|
12
|
+
/// # Arguments
|
|
13
|
+
///
|
|
14
|
+
/// * `base` - Mutable reference to the base config (will be modified)
|
|
15
|
+
/// * `override_config` - Reference to the override config (read-only)
|
|
16
|
+
pub fn merge_configs(base: &mut ExtractionConfig, override_config: &ExtractionConfig) {
|
|
17
|
+
base.use_cache = override_config.use_cache;
|
|
18
|
+
base.enable_quality_processing = override_config.enable_quality_processing;
|
|
19
|
+
base.force_ocr = override_config.force_ocr;
|
|
20
|
+
base.max_concurrent_extractions = override_config.max_concurrent_extractions;
|
|
21
|
+
|
|
22
|
+
if override_config.ocr.is_some() {
|
|
23
|
+
base.ocr = override_config.ocr.clone();
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
if override_config.chunking.is_some() {
|
|
27
|
+
base.chunking = override_config.chunking.clone();
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
if override_config.images.is_some() {
|
|
31
|
+
base.images = override_config.images.clone();
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
#[cfg(feature = "pdf")]
|
|
35
|
+
if override_config.pdf_options.is_some() {
|
|
36
|
+
base.pdf_options = override_config.pdf_options.clone();
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
if override_config.token_reduction.is_some() {
|
|
40
|
+
base.token_reduction = override_config.token_reduction.clone();
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
if override_config.language_detection.is_some() {
|
|
44
|
+
base.language_detection = override_config.language_detection.clone();
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
if override_config.pages.is_some() {
|
|
48
|
+
base.pages = override_config.pages.clone();
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
52
|
+
if override_config.keywords.is_some() {
|
|
53
|
+
base.keywords = override_config.keywords.clone();
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
if override_config.postprocessor.is_some() {
|
|
57
|
+
base.postprocessor = override_config.postprocessor.clone();
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
if override_config.html_options.is_some() {
|
|
61
|
+
base.html_options = override_config.html_options.clone();
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// An override that sets `force_ocr` must flip it on the merged config.
    #[test]
    fn test_merge_configs_simple() {
        let mut merged = ExtractionConfig {
            use_cache: true,
            force_ocr: false,
            ..Default::default()
        };
        let overlay = ExtractionConfig {
            force_ocr: true,
            ..Default::default()
        };

        merge_configs(&mut merged, &overlay);

        assert!(merged.use_cache);
        assert!(merged.force_ocr);
    }

    /// Scalar fields are copied from the override unconditionally.
    #[test]
    fn test_merge_configs_override_to_default() {
        let mut merged = ExtractionConfig {
            use_cache: false,
            ..Default::default()
        };
        let overlay = ExtractionConfig {
            use_cache: true,
            ..Default::default()
        };

        merge_configs(&mut merged, &overlay);

        assert!(merged.use_cache, "override to default value should be applied");
    }
}
|