kreuzberg 4.0.7 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +24 -16
- data/README.md +4 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +26 -353
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +3 -3
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -1,1341 +0,0 @@
|
|
|
1
|
-
//! Centralized FFI configuration parsing module.
|
|
2
|
-
//!
|
|
3
|
-
//! This module consolidates all configuration parsing logic that was previously
|
|
4
|
-
//! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
|
|
5
|
-
//!
|
|
6
|
-
//! Instead of each binding reimplementing config parsing from JSON, they now
|
|
7
|
-
//! call the FFI functions provided here, ensuring:
|
|
8
|
-
//! - Single source of truth for validation rules
|
|
9
|
-
//! - Consistent behavior across all languages
|
|
10
|
-
//! - Elimination of drift/inconsistencies
|
|
11
|
-
//! - Better performance (no JSON round-trips in language bindings)
|
|
12
|
-
|
|
13
|
-
use crate::ffi_panic_guard;
|
|
14
|
-
use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
|
|
15
|
-
use kreuzberg::KreuzbergError;
|
|
16
|
-
use kreuzberg::core::config::ExtractionConfig;
|
|
17
|
-
use serde::Serialize;
|
|
18
|
-
use std::ffi::{CStr, CString};
|
|
19
|
-
use std::os::raw::c_char;
|
|
20
|
-
use std::path::Path;
|
|
21
|
-
use std::ptr;
|
|
22
|
-
|
|
23
|
-
type FfiResult<T> = std::result::Result<T, String>;
|
|
24
|
-
|
|
25
|
-
/// Parse an ExtractionConfig from a JSON string.
|
|
26
|
-
///
|
|
27
|
-
/// This is the primary FFI entry point for all language bindings to parse
|
|
28
|
-
/// configuration from JSON. Replaces the need for each binding to implement
|
|
29
|
-
/// its own JSON parsing logic.
|
|
30
|
-
///
|
|
31
|
-
/// # Arguments
|
|
32
|
-
///
|
|
33
|
-
/// * `json_config` - Null-terminated C string containing JSON configuration
|
|
34
|
-
///
|
|
35
|
-
/// # Returns
|
|
36
|
-
///
|
|
37
|
-
/// A pointer to an ExtractionConfig struct that MUST be freed with
|
|
38
|
-
/// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
|
|
39
|
-
///
|
|
40
|
-
/// # Safety
|
|
41
|
-
///
|
|
42
|
-
/// - `json_config` must be a valid null-terminated C string
|
|
43
|
-
/// - The returned pointer must be freed with `kreuzberg_config_free`
|
|
44
|
-
/// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
|
|
45
|
-
///
|
|
46
|
-
/// # Example (C)
|
|
47
|
-
///
|
|
48
|
-
/// ```c
|
|
49
|
-
/// const char* config_json = "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}";
|
|
50
|
-
/// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
|
|
51
|
-
/// if (config == NULL) {
|
|
52
|
-
/// printf("Error: %s\n", kreuzberg_last_error());
|
|
53
|
-
/// return 1;
|
|
54
|
-
/// }
|
|
55
|
-
///
|
|
56
|
-
/// // Use config...
|
|
57
|
-
/// // char* result = kreuzberg_extract_file_with_config("doc.pdf", config);
|
|
58
|
-
///
|
|
59
|
-
/// kreuzberg_config_free(config);
|
|
60
|
-
/// ```
|
|
61
|
-
#[unsafe(no_mangle)]
|
|
62
|
-
pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
|
|
63
|
-
if json_config.is_null() {
|
|
64
|
-
set_last_error("Config JSON cannot be NULL".to_string());
|
|
65
|
-
return ptr::null_mut();
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
clear_last_error();
|
|
69
|
-
|
|
70
|
-
let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
|
|
71
|
-
Ok(s) => s,
|
|
72
|
-
Err(e) => {
|
|
73
|
-
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
74
|
-
return ptr::null_mut();
|
|
75
|
-
}
|
|
76
|
-
};
|
|
77
|
-
|
|
78
|
-
match parse_extraction_config_from_json(json_str) {
|
|
79
|
-
Ok(config) => Box::into_raw(Box::new(config)),
|
|
80
|
-
Err(e) => {
|
|
81
|
-
set_last_error(e);
|
|
82
|
-
ptr::null_mut()
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
/// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
|
|
88
|
-
///
|
|
89
|
-
/// # Safety
|
|
90
|
-
///
|
|
91
|
-
/// - `config` must be a pointer previously returned by a config creation function
|
|
92
|
-
/// - `config` can be NULL (no-op)
|
|
93
|
-
/// - `config` must not be used after this call
|
|
94
|
-
///
|
|
95
|
-
/// # Example (C)
|
|
96
|
-
///
|
|
97
|
-
/// ```c
|
|
98
|
-
/// ExtractionConfig* config = kreuzberg_config_from_json("{...}");
|
|
99
|
-
/// if (config != NULL) {
|
|
100
|
-
/// // Use config...
|
|
101
|
-
/// kreuzberg_config_free(config);
|
|
102
|
-
/// }
|
|
103
|
-
/// ```
|
|
104
|
-
#[unsafe(no_mangle)]
|
|
105
|
-
pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
|
|
106
|
-
if !config.is_null() {
|
|
107
|
-
let _ = unsafe { Box::from_raw(config) };
|
|
108
|
-
}
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
/// Validate a JSON config string without parsing it.
|
|
112
|
-
///
|
|
113
|
-
/// This function checks if a JSON config string is valid and would parse correctly,
|
|
114
|
-
/// without allocating the full ExtractionConfig structure. Useful for validation
|
|
115
|
-
/// before committing to parsing.
|
|
116
|
-
///
|
|
117
|
-
/// # Arguments
|
|
118
|
-
///
|
|
119
|
-
/// * `json_config` - Null-terminated C string containing JSON configuration
|
|
120
|
-
///
|
|
121
|
-
/// # Returns
|
|
122
|
-
///
|
|
123
|
-
/// - 1 if valid (would parse successfully)
|
|
124
|
-
/// - 0 if invalid (check `kreuzberg_last_error` for details)
|
|
125
|
-
///
|
|
126
|
-
/// # Safety
|
|
127
|
-
///
|
|
128
|
-
/// - `json_config` must be a valid null-terminated C string
|
|
129
|
-
///
|
|
130
|
-
/// # Example (C)
|
|
131
|
-
///
|
|
132
|
-
/// ```c
|
|
133
|
-
/// const char* config_json = "{\"use_cache\": true}";
|
|
134
|
-
/// if (kreuzberg_config_is_valid(config_json)) {
|
|
135
|
-
/// ExtractionConfig* config = kreuzberg_config_from_json(config_json);
|
|
136
|
-
/// // Use config...
|
|
137
|
-
/// kreuzberg_config_free(config);
|
|
138
|
-
/// } else {
|
|
139
|
-
/// printf("Invalid config: %s\n", kreuzberg_last_error());
|
|
140
|
-
/// }
|
|
141
|
-
/// ```
|
|
142
|
-
#[unsafe(no_mangle)]
|
|
143
|
-
pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
|
|
144
|
-
if json_config.is_null() {
|
|
145
|
-
set_last_error("Config JSON cannot be NULL".to_string());
|
|
146
|
-
return 0;
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
clear_last_error();
|
|
150
|
-
|
|
151
|
-
let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
|
|
152
|
-
Ok(s) => s,
|
|
153
|
-
Err(e) => {
|
|
154
|
-
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
155
|
-
return 0;
|
|
156
|
-
}
|
|
157
|
-
};
|
|
158
|
-
|
|
159
|
-
match parse_extraction_config_from_json(json_str) {
|
|
160
|
-
Ok(_) => 1,
|
|
161
|
-
Err(e) => {
|
|
162
|
-
set_last_error(e);
|
|
163
|
-
0
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
}
|
|
167
|
-
|
|
168
|
-
/// Serialize an ExtractionConfig to JSON string.
|
|
169
|
-
///
|
|
170
|
-
/// Converts an ExtractionConfig structure to its JSON representation, allowing
|
|
171
|
-
/// bindings to serialize configs without reimplementing serialization logic.
|
|
172
|
-
///
|
|
173
|
-
/// # Arguments
|
|
174
|
-
///
|
|
175
|
-
/// * `config` - Pointer to an ExtractionConfig structure
|
|
176
|
-
///
|
|
177
|
-
/// # Returns
|
|
178
|
-
///
|
|
179
|
-
/// A pointer to a C string containing JSON that MUST be freed with `kreuzberg_free_string`.
|
|
180
|
-
/// Returns NULL on error (check `kreuzberg_last_error`).
|
|
181
|
-
///
|
|
182
|
-
/// # Safety
|
|
183
|
-
///
|
|
184
|
-
/// - `config` must be a valid pointer to an ExtractionConfig
|
|
185
|
-
/// - `config` cannot be NULL
|
|
186
|
-
/// - The returned pointer must be freed with `kreuzberg_free_string`
|
|
187
|
-
///
|
|
188
|
-
/// # Example (C)
|
|
189
|
-
///
|
|
190
|
-
/// ```c
|
|
191
|
-
/// ExtractionConfig* config = kreuzberg_config_from_json("{\"use_cache\": true}");
|
|
192
|
-
/// if (config != NULL) {
|
|
193
|
-
/// char* json = kreuzberg_config_to_json(config);
|
|
194
|
-
/// if (json != NULL) {
|
|
195
|
-
/// printf("Serialized: %s\n", json);
|
|
196
|
-
/// kreuzberg_free_string(json);
|
|
197
|
-
/// }
|
|
198
|
-
/// kreuzberg_config_free(config);
|
|
199
|
-
/// }
|
|
200
|
-
/// ```
|
|
201
|
-
#[unsafe(no_mangle)]
|
|
202
|
-
pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
|
|
203
|
-
if config.is_null() {
|
|
204
|
-
set_last_error("Config cannot be NULL".to_string());
|
|
205
|
-
return ptr::null_mut();
|
|
206
|
-
}
|
|
207
|
-
|
|
208
|
-
clear_last_error();
|
|
209
|
-
|
|
210
|
-
match serde_json::to_string(unsafe { &*config }) {
|
|
211
|
-
Ok(json) => match std::ffi::CString::new(json) {
|
|
212
|
-
Ok(c_string) => c_string.into_raw(),
|
|
213
|
-
Err(e) => {
|
|
214
|
-
set_last_error(format!("Failed to convert JSON to C string: {}", e));
|
|
215
|
-
ptr::null_mut()
|
|
216
|
-
}
|
|
217
|
-
},
|
|
218
|
-
Err(e) => {
|
|
219
|
-
set_last_error(format!("Failed to serialize config to JSON: {}", e));
|
|
220
|
-
ptr::null_mut()
|
|
221
|
-
}
|
|
222
|
-
}
|
|
223
|
-
}
|
|
224
|
-
|
|
225
|
-
/// Get a specific field from config as JSON string.
|
|
226
|
-
///
|
|
227
|
-
/// Retrieves a nested field from the configuration by path and returns its JSON
|
|
228
|
-
/// representation. Supports dot notation for nested fields (e.g., "ocr.backend").
|
|
229
|
-
///
|
|
230
|
-
/// # Arguments
|
|
231
|
-
///
|
|
232
|
-
/// * `config` - Pointer to an ExtractionConfig structure
|
|
233
|
-
/// * `field_name` - Null-terminated C string with field path (e.g., "use_cache", "ocr.backend")
|
|
234
|
-
///
|
|
235
|
-
/// # Returns
|
|
236
|
-
///
|
|
237
|
-
/// A pointer to a C string containing the field value as JSON, or NULL if:
|
|
238
|
-
/// - The field doesn't exist
|
|
239
|
-
/// - An error occurs during serialization
|
|
240
|
-
///
|
|
241
|
-
/// The returned pointer (if non-NULL) must be freed with `kreuzberg_free_string`.
|
|
242
|
-
///
|
|
243
|
-
/// # Safety
|
|
244
|
-
///
|
|
245
|
-
/// - `config` must be a valid pointer to an ExtractionConfig
|
|
246
|
-
/// - `field_name` must be a valid null-terminated C string
|
|
247
|
-
/// - Neither parameter can be NULL
|
|
248
|
-
///
|
|
249
|
-
/// # Example (C)
|
|
250
|
-
///
|
|
251
|
-
/// ```c
|
|
252
|
-
/// ExtractionConfig* config = kreuzberg_config_from_json(
|
|
253
|
-
/// "{\"use_cache\": true, \"ocr\": {\"backend\": \"tesseract\"}}"
|
|
254
|
-
/// );
|
|
255
|
-
/// if (config != NULL) {
|
|
256
|
-
/// char* use_cache = kreuzberg_config_get_field(config, "use_cache");
|
|
257
|
-
/// char* backend = kreuzberg_config_get_field(config, "ocr.backend");
|
|
258
|
-
///
|
|
259
|
-
/// if (use_cache != NULL) {
|
|
260
|
-
/// printf("use_cache: %s\n", use_cache);
|
|
261
|
-
/// kreuzberg_free_string(use_cache);
|
|
262
|
-
/// }
|
|
263
|
-
///
|
|
264
|
-
/// if (backend != NULL) {
|
|
265
|
-
/// printf("backend: %s\n", backend);
|
|
266
|
-
/// kreuzberg_free_string(backend);
|
|
267
|
-
/// }
|
|
268
|
-
///
|
|
269
|
-
/// kreuzberg_config_free(config);
|
|
270
|
-
/// }
|
|
271
|
-
/// ```
|
|
272
|
-
#[unsafe(no_mangle)]
|
|
273
|
-
pub unsafe extern "C" fn kreuzberg_config_get_field(
|
|
274
|
-
config: *const ExtractionConfig,
|
|
275
|
-
field_name: *const c_char,
|
|
276
|
-
) -> *mut c_char {
|
|
277
|
-
if config.is_null() {
|
|
278
|
-
set_last_error("Config cannot be NULL".to_string());
|
|
279
|
-
return ptr::null_mut();
|
|
280
|
-
}
|
|
281
|
-
|
|
282
|
-
if field_name.is_null() {
|
|
283
|
-
set_last_error("Field name cannot be NULL".to_string());
|
|
284
|
-
return ptr::null_mut();
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
clear_last_error();
|
|
288
|
-
|
|
289
|
-
let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() {
|
|
290
|
-
Ok(s) => s,
|
|
291
|
-
Err(e) => {
|
|
292
|
-
set_last_error(format!("Invalid UTF-8 in field name: {}", e));
|
|
293
|
-
return ptr::null_mut();
|
|
294
|
-
}
|
|
295
|
-
};
|
|
296
|
-
|
|
297
|
-
let json_value = match serde_json::to_value(unsafe { &*config }) {
|
|
298
|
-
Ok(val) => val,
|
|
299
|
-
Err(e) => {
|
|
300
|
-
set_last_error(format!("Failed to serialize config: {}", e));
|
|
301
|
-
return ptr::null_mut();
|
|
302
|
-
}
|
|
303
|
-
};
|
|
304
|
-
|
|
305
|
-
let mut current = &json_value;
|
|
306
|
-
for part in field_str.split('.') {
|
|
307
|
-
if let Some(obj) = current.as_object() {
|
|
308
|
-
match obj.get(part) {
|
|
309
|
-
Some(val) => current = val,
|
|
310
|
-
None => {
|
|
311
|
-
set_last_error(format!("Field '{}' not found in config", field_str));
|
|
312
|
-
return ptr::null_mut();
|
|
313
|
-
}
|
|
314
|
-
}
|
|
315
|
-
} else {
|
|
316
|
-
set_last_error(format!("Cannot access nested field '{}' in non-object", part));
|
|
317
|
-
return ptr::null_mut();
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
|
|
321
|
-
match serde_json::to_string(current) {
|
|
322
|
-
Ok(json) => match std::ffi::CString::new(json) {
|
|
323
|
-
Ok(c_string) => c_string.into_raw(),
|
|
324
|
-
Err(e) => {
|
|
325
|
-
set_last_error(format!("Failed to convert field value to C string: {}", e));
|
|
326
|
-
ptr::null_mut()
|
|
327
|
-
}
|
|
328
|
-
},
|
|
329
|
-
Err(e) => {
|
|
330
|
-
set_last_error(format!("Failed to serialize field value: {}", e));
|
|
331
|
-
ptr::null_mut()
|
|
332
|
-
}
|
|
333
|
-
}
|
|
334
|
-
}
|
|
335
|
-
|
|
336
|
-
/// Merge two configs (override takes precedence over base).
|
|
337
|
-
///
|
|
338
|
-
/// Performs a shallow merge of two ExtractionConfig structures, where fields
|
|
339
|
-
/// from `override_config` take precedence over fields in `base`. The `base`
|
|
340
|
-
/// config is modified in-place.
|
|
341
|
-
///
|
|
342
|
-
/// # Arguments
|
|
343
|
-
///
|
|
344
|
-
/// * `base` - Pointer to the base ExtractionConfig (will be modified)
|
|
345
|
-
/// * `override_config` - Pointer to the override ExtractionConfig (read-only)
|
|
346
|
-
///
|
|
347
|
-
/// # Returns
|
|
348
|
-
///
|
|
349
|
-
/// - 1 on success
|
|
350
|
-
/// - 0 on error (check `kreuzberg_last_error`)
|
|
351
|
-
///
|
|
352
|
-
/// # Safety
|
|
353
|
-
///
|
|
354
|
-
/// - `base` must be a valid mutable pointer to an ExtractionConfig
|
|
355
|
-
/// - `override_config` must be a valid pointer to an ExtractionConfig
|
|
356
|
-
/// - Neither parameter can be NULL
|
|
357
|
-
/// - `base` is modified in-place
|
|
358
|
-
///
|
|
359
|
-
/// # Example (C)
|
|
360
|
-
///
|
|
361
|
-
/// ```c
|
|
362
|
-
/// ExtractionConfig* base = kreuzberg_config_from_json(
|
|
363
|
-
/// "{\"use_cache\": true, \"force_ocr\": false}"
|
|
364
|
-
/// );
|
|
365
|
-
/// ExtractionConfig* override = kreuzberg_config_from_json(
|
|
366
|
-
/// "{\"force_ocr\": true}"
|
|
367
|
-
/// );
|
|
368
|
-
///
|
|
369
|
-
/// if (kreuzberg_config_merge(base, override) == 1) {
|
|
370
|
-
/// // base now has: use_cache=true, force_ocr=true
|
|
371
|
-
/// char* json = kreuzberg_config_to_json(base);
|
|
372
|
-
/// printf("Merged config: %s\n", json);
|
|
373
|
-
/// kreuzberg_free_string(json);
|
|
374
|
-
/// }
|
|
375
|
-
///
|
|
376
|
-
/// kreuzberg_config_free(base);
|
|
377
|
-
/// kreuzberg_config_free(override);
|
|
378
|
-
/// ```
|
|
379
|
-
#[unsafe(no_mangle)]
|
|
380
|
-
pub unsafe extern "C" fn kreuzberg_config_merge(
|
|
381
|
-
base: *mut ExtractionConfig,
|
|
382
|
-
override_config: *const ExtractionConfig,
|
|
383
|
-
) -> i32 {
|
|
384
|
-
if base.is_null() {
|
|
385
|
-
set_last_error("Base config cannot be NULL".to_string());
|
|
386
|
-
return 0;
|
|
387
|
-
}
|
|
388
|
-
|
|
389
|
-
if override_config.is_null() {
|
|
390
|
-
set_last_error("Override config cannot be NULL".to_string());
|
|
391
|
-
return 0;
|
|
392
|
-
}
|
|
393
|
-
|
|
394
|
-
clear_last_error();
|
|
395
|
-
|
|
396
|
-
let base_ref = unsafe { &mut *base };
|
|
397
|
-
let override_ref = unsafe { &*override_config };
|
|
398
|
-
|
|
399
|
-
base_ref.use_cache = override_ref.use_cache;
|
|
400
|
-
base_ref.enable_quality_processing = override_ref.enable_quality_processing;
|
|
401
|
-
base_ref.force_ocr = override_ref.force_ocr;
|
|
402
|
-
base_ref.max_concurrent_extractions = override_ref.max_concurrent_extractions;
|
|
403
|
-
|
|
404
|
-
if override_ref.ocr.is_some() {
|
|
405
|
-
base_ref.ocr = override_ref.ocr.clone();
|
|
406
|
-
}
|
|
407
|
-
|
|
408
|
-
if override_ref.chunking.is_some() {
|
|
409
|
-
base_ref.chunking = override_ref.chunking.clone();
|
|
410
|
-
}
|
|
411
|
-
|
|
412
|
-
if override_ref.images.is_some() {
|
|
413
|
-
base_ref.images = override_ref.images.clone();
|
|
414
|
-
}
|
|
415
|
-
|
|
416
|
-
#[cfg(feature = "pdf")]
|
|
417
|
-
if override_ref.pdf_options.is_some() {
|
|
418
|
-
base_ref.pdf_options = override_ref.pdf_options.clone();
|
|
419
|
-
}
|
|
420
|
-
|
|
421
|
-
if override_ref.token_reduction.is_some() {
|
|
422
|
-
base_ref.token_reduction = override_ref.token_reduction.clone();
|
|
423
|
-
}
|
|
424
|
-
|
|
425
|
-
if override_ref.language_detection.is_some() {
|
|
426
|
-
base_ref.language_detection = override_ref.language_detection.clone();
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
if override_ref.pages.is_some() {
|
|
430
|
-
base_ref.pages = override_ref.pages.clone();
|
|
431
|
-
}
|
|
432
|
-
|
|
433
|
-
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
|
|
434
|
-
if override_ref.keywords.is_some() {
|
|
435
|
-
base_ref.keywords = override_ref.keywords.clone();
|
|
436
|
-
}
|
|
437
|
-
|
|
438
|
-
if override_ref.postprocessor.is_some() {
|
|
439
|
-
base_ref.postprocessor = override_ref.postprocessor.clone();
|
|
440
|
-
}
|
|
441
|
-
|
|
442
|
-
if override_ref.html_options.is_some() {
|
|
443
|
-
base_ref.html_options = override_ref.html_options.clone();
|
|
444
|
-
}
|
|
445
|
-
|
|
446
|
-
1
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
/// Parse ExtractionConfig from JSON string.
|
|
450
|
-
///
|
|
451
|
-
/// This is the core parsing logic shared by all FFI functions that deal with
|
|
452
|
-
/// JSON configuration. It handles:
|
|
453
|
-
/// - JSON deserialization
|
|
454
|
-
/// - All validation rules
|
|
455
|
-
/// - Type conversions
|
|
456
|
-
/// - HTML options parsing (complex nested structure)
|
|
457
|
-
///
|
|
458
|
-
/// The error messages are user-friendly and include guidance on what went wrong.
|
|
459
|
-
fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
|
|
460
|
-
use html_to_markdown_rs::options::{
|
|
461
|
-
CodeBlockStyle, ConversionOptions, HeadingStyle, HighlightStyle, ListIndentType, NewlineStyle,
|
|
462
|
-
PreprocessingPreset, WhitespaceMode,
|
|
463
|
-
};
|
|
464
|
-
|
|
465
|
-
// ~keep: This function performs the JSON parsing and validation that was
|
|
466
|
-
|
|
467
|
-
fn parse_enum<T, F>(value: Option<&serde_json::Value>, parse_fn: F) -> FfiResult<Option<T>>
|
|
468
|
-
where
|
|
469
|
-
F: Fn(&str) -> FfiResult<T>,
|
|
470
|
-
{
|
|
471
|
-
if let Some(raw) = value {
|
|
472
|
-
let text = raw
|
|
473
|
-
.as_str()
|
|
474
|
-
.ok_or_else(|| "Expected string for enum field".to_string())?;
|
|
475
|
-
return parse_fn(text).map(Some);
|
|
476
|
-
}
|
|
477
|
-
Ok(None)
|
|
478
|
-
}
|
|
479
|
-
|
|
480
|
-
fn parse_heading_style(value: &str) -> FfiResult<HeadingStyle> {
|
|
481
|
-
match value.to_lowercase().as_str() {
|
|
482
|
-
"atx" => Ok(HeadingStyle::Atx),
|
|
483
|
-
"underlined" => Ok(HeadingStyle::Underlined),
|
|
484
|
-
"atx_closed" => Ok(HeadingStyle::AtxClosed),
|
|
485
|
-
other => Err(format!(
|
|
486
|
-
"Invalid heading_style '{}'. Expected one of: atx, underlined, atx_closed",
|
|
487
|
-
other
|
|
488
|
-
)),
|
|
489
|
-
}
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
fn parse_list_indent_type(value: &str) -> FfiResult<ListIndentType> {
|
|
493
|
-
match value.to_lowercase().as_str() {
|
|
494
|
-
"spaces" => Ok(ListIndentType::Spaces),
|
|
495
|
-
"tabs" => Ok(ListIndentType::Tabs),
|
|
496
|
-
other => Err(format!(
|
|
497
|
-
"Invalid list_indent_type '{}'. Expected 'spaces' or 'tabs'",
|
|
498
|
-
other
|
|
499
|
-
)),
|
|
500
|
-
}
|
|
501
|
-
}
|
|
502
|
-
|
|
503
|
-
fn parse_highlight_style(value: &str) -> FfiResult<HighlightStyle> {
|
|
504
|
-
match value.to_lowercase().as_str() {
|
|
505
|
-
"double_equal" | "==" | "highlight" => Ok(HighlightStyle::DoubleEqual),
|
|
506
|
-
"html" => Ok(HighlightStyle::Html),
|
|
507
|
-
"bold" => Ok(HighlightStyle::Bold),
|
|
508
|
-
"none" => Ok(HighlightStyle::None),
|
|
509
|
-
other => Err(format!(
|
|
510
|
-
"Invalid highlight_style '{}'. Expected one of: double_equal, html, bold, none",
|
|
511
|
-
other
|
|
512
|
-
)),
|
|
513
|
-
}
|
|
514
|
-
}
|
|
515
|
-
|
|
516
|
-
fn parse_whitespace_mode(value: &str) -> FfiResult<WhitespaceMode> {
|
|
517
|
-
match value.to_lowercase().as_str() {
|
|
518
|
-
"normalized" => Ok(WhitespaceMode::Normalized),
|
|
519
|
-
"strict" => Ok(WhitespaceMode::Strict),
|
|
520
|
-
other => Err(format!(
|
|
521
|
-
"Invalid whitespace_mode '{}'. Expected 'normalized' or 'strict'",
|
|
522
|
-
other
|
|
523
|
-
)),
|
|
524
|
-
}
|
|
525
|
-
}
|
|
526
|
-
|
|
527
|
-
fn parse_newline_style(value: &str) -> FfiResult<NewlineStyle> {
|
|
528
|
-
match value.to_lowercase().as_str() {
|
|
529
|
-
"spaces" => Ok(NewlineStyle::Spaces),
|
|
530
|
-
"backslash" => Ok(NewlineStyle::Backslash),
|
|
531
|
-
other => Err(format!(
|
|
532
|
-
"Invalid newline_style '{}'. Expected 'spaces' or 'backslash'",
|
|
533
|
-
other
|
|
534
|
-
)),
|
|
535
|
-
}
|
|
536
|
-
}
|
|
537
|
-
|
|
538
|
-
fn parse_code_block_style(value: &str) -> FfiResult<CodeBlockStyle> {
|
|
539
|
-
match value.to_lowercase().as_str() {
|
|
540
|
-
"indented" => Ok(CodeBlockStyle::Indented),
|
|
541
|
-
"backticks" => Ok(CodeBlockStyle::Backticks),
|
|
542
|
-
"tildes" => Ok(CodeBlockStyle::Tildes),
|
|
543
|
-
other => Err(format!(
|
|
544
|
-
"Invalid code_block_style '{}'. Expected 'indented', 'backticks', or 'tildes'",
|
|
545
|
-
other
|
|
546
|
-
)),
|
|
547
|
-
}
|
|
548
|
-
}
|
|
549
|
-
|
|
550
|
-
#[allow(dead_code)]
|
|
551
|
-
fn parse_preprocessing_preset(value: &str) -> FfiResult<PreprocessingPreset> {
|
|
552
|
-
match value.to_lowercase().as_str() {
|
|
553
|
-
"minimal" => Ok(PreprocessingPreset::Minimal),
|
|
554
|
-
"standard" => Ok(PreprocessingPreset::Standard),
|
|
555
|
-
"aggressive" => Ok(PreprocessingPreset::Aggressive),
|
|
556
|
-
other => Err(format!(
|
|
557
|
-
"Invalid preprocessing.preset '{}'. Expected one of: minimal, standard, aggressive",
|
|
558
|
-
other
|
|
559
|
-
)),
|
|
560
|
-
}
|
|
561
|
-
}
|
|
562
|
-
|
|
563
|
-
fn parse_html_options(value: &serde_json::Value) -> FfiResult<ConversionOptions> {
|
|
564
|
-
let mut opts = ConversionOptions::default();
|
|
565
|
-
let obj = value
|
|
566
|
-
.as_object()
|
|
567
|
-
.ok_or_else(|| "html_options must be an object".to_string())?;
|
|
568
|
-
|
|
569
|
-
if let Some(val) = obj.get("heading_style") {
|
|
570
|
-
opts.heading_style = parse_enum(Some(val), parse_heading_style)?.unwrap_or(opts.heading_style);
|
|
571
|
-
}
|
|
572
|
-
|
|
573
|
-
if let Some(val) = obj.get("list_indent_type") {
|
|
574
|
-
opts.list_indent_type = parse_enum(Some(val), parse_list_indent_type)?.unwrap_or(opts.list_indent_type);
|
|
575
|
-
}
|
|
576
|
-
|
|
577
|
-
if let Some(val) = obj.get("list_indent_width") {
|
|
578
|
-
opts.list_indent_width = val
|
|
579
|
-
.as_u64()
|
|
580
|
-
.map(|v| v as usize)
|
|
581
|
-
.ok_or_else(|| "list_indent_width must be an integer".to_string())?;
|
|
582
|
-
}
|
|
583
|
-
|
|
584
|
-
if let Some(val) = obj.get("bullets") {
|
|
585
|
-
opts.bullets = val
|
|
586
|
-
.as_str()
|
|
587
|
-
.map(str::to_string)
|
|
588
|
-
.ok_or_else(|| "bullets must be a string".to_string())?;
|
|
589
|
-
}
|
|
590
|
-
|
|
591
|
-
if let Some(val) = obj.get("strong_em_symbol") {
|
|
592
|
-
let symbol = val
|
|
593
|
-
.as_str()
|
|
594
|
-
.ok_or_else(|| "strong_em_symbol must be a string".to_string())?;
|
|
595
|
-
let mut chars = symbol.chars();
|
|
596
|
-
opts.strong_em_symbol = chars
|
|
597
|
-
.next()
|
|
598
|
-
.ok_or_else(|| "strong_em_symbol must not be empty".to_string())?;
|
|
599
|
-
}
|
|
600
|
-
|
|
601
|
-
if let Some(val) = obj.get("escape_asterisks") {
|
|
602
|
-
opts.escape_asterisks = val
|
|
603
|
-
.as_bool()
|
|
604
|
-
.ok_or_else(|| "escape_asterisks must be a boolean".to_string())?;
|
|
605
|
-
}
|
|
606
|
-
|
|
607
|
-
if let Some(val) = obj.get("escape_underscores") {
|
|
608
|
-
opts.escape_underscores = val
|
|
609
|
-
.as_bool()
|
|
610
|
-
.ok_or_else(|| "escape_underscores must be a boolean".to_string())?;
|
|
611
|
-
}
|
|
612
|
-
|
|
613
|
-
if let Some(val) = obj.get("escape_misc") {
|
|
614
|
-
opts.escape_misc = val
|
|
615
|
-
.as_bool()
|
|
616
|
-
.ok_or_else(|| "escape_misc must be a boolean".to_string())?;
|
|
617
|
-
}
|
|
618
|
-
|
|
619
|
-
if let Some(val) = obj.get("escape_ascii") {
|
|
620
|
-
opts.escape_ascii = val
|
|
621
|
-
.as_bool()
|
|
622
|
-
.ok_or_else(|| "escape_ascii must be a boolean".to_string())?;
|
|
623
|
-
}
|
|
624
|
-
|
|
625
|
-
if let Some(val) = obj.get("code_language") {
|
|
626
|
-
opts.code_language = val
|
|
627
|
-
.as_str()
|
|
628
|
-
.map(str::to_string)
|
|
629
|
-
.ok_or_else(|| "code_language must be a string".to_string())?;
|
|
630
|
-
}
|
|
631
|
-
|
|
632
|
-
if let Some(val) = obj.get("autolinks") {
|
|
633
|
-
opts.autolinks = val.as_bool().ok_or_else(|| "autolinks must be a boolean".to_string())?;
|
|
634
|
-
}
|
|
635
|
-
|
|
636
|
-
if let Some(val) = obj.get("default_title") {
|
|
637
|
-
opts.default_title = val
|
|
638
|
-
.as_bool()
|
|
639
|
-
.ok_or_else(|| "default_title must be a boolean".to_string())?;
|
|
640
|
-
}
|
|
641
|
-
|
|
642
|
-
if let Some(val) = obj.get("br_in_tables") {
|
|
643
|
-
opts.br_in_tables = val
|
|
644
|
-
.as_bool()
|
|
645
|
-
.ok_or_else(|| "br_in_tables must be a boolean".to_string())?;
|
|
646
|
-
}
|
|
647
|
-
|
|
648
|
-
if let Some(val) = obj.get("hocr_spatial_tables") {
|
|
649
|
-
opts.hocr_spatial_tables = val
|
|
650
|
-
.as_bool()
|
|
651
|
-
.ok_or_else(|| "hocr_spatial_tables must be a boolean".to_string())?;
|
|
652
|
-
}
|
|
653
|
-
|
|
654
|
-
if let Some(val) = obj.get("highlight_style") {
|
|
655
|
-
opts.highlight_style = parse_enum(Some(val), parse_highlight_style)?.unwrap_or(opts.highlight_style);
|
|
656
|
-
}
|
|
657
|
-
|
|
658
|
-
if let Some(val) = obj.get("extract_metadata") {
|
|
659
|
-
opts.extract_metadata = val
|
|
660
|
-
.as_bool()
|
|
661
|
-
.ok_or_else(|| "extract_metadata must be a boolean".to_string())?;
|
|
662
|
-
}
|
|
663
|
-
|
|
664
|
-
if let Some(val) = obj.get("whitespace_mode") {
|
|
665
|
-
opts.whitespace_mode = parse_enum(Some(val), parse_whitespace_mode)?.unwrap_or(opts.whitespace_mode);
|
|
666
|
-
}
|
|
667
|
-
|
|
668
|
-
if let Some(val) = obj.get("strip_newlines") {
|
|
669
|
-
opts.strip_newlines = val
|
|
670
|
-
.as_bool()
|
|
671
|
-
.ok_or_else(|| "strip_newlines must be a boolean".to_string())?;
|
|
672
|
-
}
|
|
673
|
-
|
|
674
|
-
if let Some(val) = obj.get("wrap") {
|
|
675
|
-
opts.wrap = val.as_bool().ok_or_else(|| "wrap must be a boolean".to_string())?;
|
|
676
|
-
}
|
|
677
|
-
|
|
678
|
-
if let Some(val) = obj.get("wrap_width") {
|
|
679
|
-
opts.wrap_width = val
|
|
680
|
-
.as_u64()
|
|
681
|
-
.map(|v| v as usize)
|
|
682
|
-
.ok_or_else(|| "wrap_width must be an integer".to_string())?;
|
|
683
|
-
}
|
|
684
|
-
|
|
685
|
-
if let Some(val) = obj.get("convert_as_inline") {
|
|
686
|
-
opts.convert_as_inline = val
|
|
687
|
-
.as_bool()
|
|
688
|
-
.ok_or_else(|| "convert_as_inline must be a boolean".to_string())?;
|
|
689
|
-
}
|
|
690
|
-
|
|
691
|
-
if let Some(val) = obj.get("sub_symbol") {
|
|
692
|
-
opts.sub_symbol = val
|
|
693
|
-
.as_str()
|
|
694
|
-
.map(str::to_string)
|
|
695
|
-
.ok_or_else(|| "sub_symbol must be a string".to_string())?;
|
|
696
|
-
}
|
|
697
|
-
|
|
698
|
-
if let Some(val) = obj.get("sup_symbol") {
|
|
699
|
-
opts.sup_symbol = val
|
|
700
|
-
.as_str()
|
|
701
|
-
.map(str::to_string)
|
|
702
|
-
.ok_or_else(|| "sup_symbol must be a string".to_string())?;
|
|
703
|
-
}
|
|
704
|
-
|
|
705
|
-
if let Some(val) = obj.get("newline_style") {
|
|
706
|
-
opts.newline_style = parse_enum(Some(val), parse_newline_style)?.unwrap_or(opts.newline_style);
|
|
707
|
-
}
|
|
708
|
-
|
|
709
|
-
if let Some(val) = obj.get("code_block_style") {
|
|
710
|
-
opts.code_block_style = parse_enum(Some(val), parse_code_block_style)?.unwrap_or(opts.code_block_style);
|
|
711
|
-
}
|
|
712
|
-
|
|
713
|
-
if let Some(val) = obj.get("keep_inline_images_in") {
|
|
714
|
-
opts.keep_inline_images_in = val
|
|
715
|
-
.as_array()
|
|
716
|
-
.ok_or_else(|| "keep_inline_images_in must be an array".to_string())?
|
|
717
|
-
.iter()
|
|
718
|
-
.map(|v| {
|
|
719
|
-
v.as_str()
|
|
720
|
-
.map(str::to_string)
|
|
721
|
-
.ok_or_else(|| "keep_inline_images_in entries must be strings".to_string())
|
|
722
|
-
})
|
|
723
|
-
.collect::<FfiResult<Vec<_>>>()?;
|
|
724
|
-
}
|
|
725
|
-
|
|
726
|
-
if let Some(val) = obj.get("encoding") {
|
|
727
|
-
opts.encoding = val
|
|
728
|
-
.as_str()
|
|
729
|
-
.map(str::to_string)
|
|
730
|
-
.ok_or_else(|| "encoding must be a string".to_string())?;
|
|
731
|
-
}
|
|
732
|
-
|
|
733
|
-
if let Some(val) = obj.get("debug") {
|
|
734
|
-
opts.debug = val.as_bool().ok_or_else(|| "debug must be a boolean".to_string())?;
|
|
735
|
-
}
|
|
736
|
-
|
|
737
|
-
if let Some(val) = obj.get("strip_tags") {
|
|
738
|
-
opts.strip_tags = val
|
|
739
|
-
.as_array()
|
|
740
|
-
.ok_or_else(|| "strip_tags must be an array".to_string())?
|
|
741
|
-
.iter()
|
|
742
|
-
.map(|v| {
|
|
743
|
-
v.as_str()
|
|
744
|
-
.map(str::to_string)
|
|
745
|
-
.ok_or_else(|| "strip_tags entries must be strings".to_string())
|
|
746
|
-
})
|
|
747
|
-
.collect::<FfiResult<Vec<_>>>()?;
|
|
748
|
-
}
|
|
749
|
-
|
|
750
|
-
if let Some(val) = obj.get("preserve_tags") {
|
|
751
|
-
opts.preserve_tags = val
|
|
752
|
-
.as_array()
|
|
753
|
-
.ok_or_else(|| "preserve_tags must be an array".to_string())?
|
|
754
|
-
.iter()
|
|
755
|
-
.map(|v| {
|
|
756
|
-
v.as_str()
|
|
757
|
-
.map(str::to_string)
|
|
758
|
-
.ok_or_else(|| "preserve_tags entries must be strings".to_string())
|
|
759
|
-
})
|
|
760
|
-
.collect::<FfiResult<Vec<_>>>()?;
|
|
761
|
-
}
|
|
762
|
-
|
|
763
|
-
Ok(opts)
|
|
764
|
-
}
|
|
765
|
-
|
|
766
|
-
let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
|
|
767
|
-
|
|
768
|
-
let mut config: ExtractionConfig =
|
|
769
|
-
serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
|
|
770
|
-
|
|
771
|
-
if let Some(html_opts_val) = json_value.get("html_options") {
|
|
772
|
-
config.html_options = Some(parse_html_options(html_opts_val)?);
|
|
773
|
-
}
|
|
774
|
-
|
|
775
|
-
Ok(config)
|
|
776
|
-
}
|
|
777
|
-
|
|
778
|
-
/// SerializableEmbeddingPreset for FFI serialization.
|
|
779
|
-
#[derive(Serialize)]
|
|
780
|
-
struct SerializableEmbeddingPreset<'a> {
|
|
781
|
-
name: &'a str,
|
|
782
|
-
chunk_size: usize,
|
|
783
|
-
overlap: usize,
|
|
784
|
-
model_name: String,
|
|
785
|
-
dimensions: usize,
|
|
786
|
-
description: &'a str,
|
|
787
|
-
}
|
|
788
|
-
|
|
789
|
-
/// Load an ExtractionConfig from a file.
|
|
790
|
-
///
|
|
791
|
-
/// Returns a JSON string representing the loaded configuration.
|
|
792
|
-
///
|
|
793
|
-
/// # Safety
|
|
794
|
-
///
|
|
795
|
-
/// - `file_path` must be a valid null-terminated C string
|
|
796
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
797
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
798
|
-
#[unsafe(no_mangle)]
|
|
799
|
-
pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
|
|
800
|
-
ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
|
|
801
|
-
clear_last_error();
|
|
802
|
-
|
|
803
|
-
if file_path.is_null() {
|
|
804
|
-
set_last_error("file_path cannot be NULL".to_string());
|
|
805
|
-
return ptr::null_mut();
|
|
806
|
-
}
|
|
807
|
-
|
|
808
|
-
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
809
|
-
Ok(s) => s,
|
|
810
|
-
Err(e) => {
|
|
811
|
-
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
812
|
-
return ptr::null_mut();
|
|
813
|
-
}
|
|
814
|
-
};
|
|
815
|
-
|
|
816
|
-
match ExtractionConfig::from_file(path_str) {
|
|
817
|
-
Ok(config) => match serde_json::to_string(&config) {
|
|
818
|
-
Ok(json) => match CString::new(json) {
|
|
819
|
-
Ok(cstr) => cstr.into_raw(),
|
|
820
|
-
Err(e) => {
|
|
821
|
-
set_last_error(format!("Failed to create C string: {}", e));
|
|
822
|
-
ptr::null_mut()
|
|
823
|
-
}
|
|
824
|
-
},
|
|
825
|
-
Err(e) => {
|
|
826
|
-
set_last_error(format!("Failed to serialize config to JSON: {}", e));
|
|
827
|
-
ptr::null_mut()
|
|
828
|
-
}
|
|
829
|
-
},
|
|
830
|
-
Err(e) => {
|
|
831
|
-
set_last_error(e.to_string());
|
|
832
|
-
ptr::null_mut()
|
|
833
|
-
}
|
|
834
|
-
}
|
|
835
|
-
})
|
|
836
|
-
}
|
|
837
|
-
|
|
838
|
-
/// Load an ExtractionConfig from a file (returns pointer to config struct).
|
|
839
|
-
///
|
|
840
|
-
/// # Safety
|
|
841
|
-
///
|
|
842
|
-
/// - `path` must be a valid null-terminated C string
|
|
843
|
-
/// - The returned pointer must be freed with `kreuzberg_config_free`
|
|
844
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
845
|
-
///
|
|
846
|
-
/// # Example (C)
|
|
847
|
-
///
|
|
848
|
-
/// ```c
|
|
849
|
-
/// ExtractionConfig* config = kreuzberg_config_from_file("config.toml");
|
|
850
|
-
/// if (config == NULL) {
|
|
851
|
-
/// printf("Error: %s\n", kreuzberg_last_error());
|
|
852
|
-
/// return 1;
|
|
853
|
-
/// }
|
|
854
|
-
/// kreuzberg_config_free(config);
|
|
855
|
-
/// ```
|
|
856
|
-
#[unsafe(no_mangle)]
|
|
857
|
-
pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
|
|
858
|
-
ffi_panic_guard!("kreuzberg_config_from_file", {
|
|
859
|
-
clear_last_error();
|
|
860
|
-
|
|
861
|
-
if path.is_null() {
|
|
862
|
-
set_last_error("Config path cannot be NULL".to_string());
|
|
863
|
-
return ptr::null_mut();
|
|
864
|
-
}
|
|
865
|
-
|
|
866
|
-
let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
|
|
867
|
-
Ok(s) => s,
|
|
868
|
-
Err(e) => {
|
|
869
|
-
set_last_error(format!("Invalid UTF-8 in config path: {}", e));
|
|
870
|
-
return ptr::null_mut();
|
|
871
|
-
}
|
|
872
|
-
};
|
|
873
|
-
|
|
874
|
-
let path_buf = Path::new(path_str);
|
|
875
|
-
|
|
876
|
-
match ExtractionConfig::from_file(path_buf) {
|
|
877
|
-
Ok(config) => Box::into_raw(Box::new(config)),
|
|
878
|
-
Err(e) => {
|
|
879
|
-
match &e {
|
|
880
|
-
KreuzbergError::Io(io_err) => {
|
|
881
|
-
set_last_error(format!("IO error loading config: {}", io_err));
|
|
882
|
-
}
|
|
883
|
-
_ => {
|
|
884
|
-
set_last_error(format!("Failed to load config from file: {}", e));
|
|
885
|
-
}
|
|
886
|
-
}
|
|
887
|
-
ptr::null_mut()
|
|
888
|
-
}
|
|
889
|
-
}
|
|
890
|
-
})
|
|
891
|
-
}
|
|
892
|
-
|
|
893
|
-
/// Discover and load an ExtractionConfig by searching parent directories.
|
|
894
|
-
///
|
|
895
|
-
/// Searches the current directory and all parent directories for:
|
|
896
|
-
/// - `kreuzberg.toml`
|
|
897
|
-
/// - `kreuzberg.json`
|
|
898
|
-
///
|
|
899
|
-
/// Returns the first config file found as a JSON string.
|
|
900
|
-
///
|
|
901
|
-
/// # Safety
|
|
902
|
-
///
|
|
903
|
-
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
904
|
-
/// - Returns NULL if no config is found or on error
|
|
905
|
-
///
|
|
906
|
-
/// # Example (C)
|
|
907
|
-
///
|
|
908
|
-
/// ```c
|
|
909
|
-
/// char* config_json = kreuzberg_config_discover();
|
|
910
|
-
/// if (config_json != NULL) {
|
|
911
|
-
/// printf("Discovered config: %s\n", config_json);
|
|
912
|
-
/// kreuzberg_free_string(config_json);
|
|
913
|
-
/// }
|
|
914
|
-
/// ```
|
|
915
|
-
#[unsafe(no_mangle)]
|
|
916
|
-
pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
|
|
917
|
-
ffi_panic_guard!("kreuzberg_config_discover", {
|
|
918
|
-
clear_last_error();
|
|
919
|
-
|
|
920
|
-
match ExtractionConfig::discover() {
|
|
921
|
-
Ok(Some(config)) => match serde_json::to_string(&config) {
|
|
922
|
-
Ok(json) => match CString::new(json) {
|
|
923
|
-
Ok(cstr) => cstr.into_raw(),
|
|
924
|
-
Err(e) => {
|
|
925
|
-
set_last_error(format!("Failed to serialize config: {}", e));
|
|
926
|
-
ptr::null_mut()
|
|
927
|
-
}
|
|
928
|
-
},
|
|
929
|
-
Err(e) => {
|
|
930
|
-
set_last_error(format!("Failed to serialize config: {}", e));
|
|
931
|
-
ptr::null_mut()
|
|
932
|
-
}
|
|
933
|
-
},
|
|
934
|
-
Ok(None) => ptr::null_mut(),
|
|
935
|
-
Err(e) => {
|
|
936
|
-
match &e {
|
|
937
|
-
KreuzbergError::Io(io_err) => {
|
|
938
|
-
set_last_error(format!("IO error discovering config: {}", io_err));
|
|
939
|
-
}
|
|
940
|
-
_ => {
|
|
941
|
-
set_last_error(format!("Failed to discover config: {}", e));
|
|
942
|
-
}
|
|
943
|
-
}
|
|
944
|
-
ptr::null_mut()
|
|
945
|
-
}
|
|
946
|
-
}
|
|
947
|
-
})
|
|
948
|
-
}
|
|
949
|
-
|
|
950
|
-
/// List available embedding preset names.
|
|
951
|
-
///
|
|
952
|
-
/// # Safety
|
|
953
|
-
///
|
|
954
|
-
/// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
|
|
955
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
956
|
-
#[unsafe(no_mangle)]
|
|
957
|
-
pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
|
|
958
|
-
ffi_panic_guard!("kreuzberg_list_embedding_presets", {
|
|
959
|
-
clear_last_error();
|
|
960
|
-
|
|
961
|
-
let presets = kreuzberg::embeddings::list_presets();
|
|
962
|
-
match serde_json::to_string(&presets) {
|
|
963
|
-
Ok(json) => match string_to_c_string(json) {
|
|
964
|
-
Ok(ptr) => ptr,
|
|
965
|
-
Err(e) => {
|
|
966
|
-
set_last_error(e);
|
|
967
|
-
ptr::null_mut()
|
|
968
|
-
}
|
|
969
|
-
},
|
|
970
|
-
Err(e) => {
|
|
971
|
-
set_last_error(format!("Failed to serialize presets: {}", e));
|
|
972
|
-
ptr::null_mut()
|
|
973
|
-
}
|
|
974
|
-
}
|
|
975
|
-
})
|
|
976
|
-
}
|
|
977
|
-
|
|
978
|
-
/// Get a specific embedding preset by name.
|
|
979
|
-
///
|
|
980
|
-
/// # Safety
|
|
981
|
-
///
|
|
982
|
-
/// - `name` must be a valid null-terminated C string
|
|
983
|
-
/// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
|
|
984
|
-
/// - Returns NULL on error (check `kreuzberg_last_error`)
|
|
985
|
-
#[unsafe(no_mangle)]
|
|
986
|
-
pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
|
|
987
|
-
ffi_panic_guard!("kreuzberg_get_embedding_preset", {
|
|
988
|
-
clear_last_error();
|
|
989
|
-
|
|
990
|
-
if name.is_null() {
|
|
991
|
-
set_last_error("preset name cannot be NULL".to_string());
|
|
992
|
-
return ptr::null_mut();
|
|
993
|
-
}
|
|
994
|
-
|
|
995
|
-
let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
996
|
-
Ok(s) => s,
|
|
997
|
-
Err(e) => {
|
|
998
|
-
set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
|
|
999
|
-
return ptr::null_mut();
|
|
1000
|
-
}
|
|
1001
|
-
};
|
|
1002
|
-
|
|
1003
|
-
let preset = match kreuzberg::embeddings::get_preset(preset_name) {
|
|
1004
|
-
Some(preset) => preset,
|
|
1005
|
-
None => {
|
|
1006
|
-
set_last_error(format!("Unknown embedding preset: {}", preset_name));
|
|
1007
|
-
return ptr::null_mut();
|
|
1008
|
-
}
|
|
1009
|
-
};
|
|
1010
|
-
|
|
1011
|
-
let model_name = format!("{:?}", preset.model);
|
|
1012
|
-
let serializable = SerializableEmbeddingPreset {
|
|
1013
|
-
name: preset.name,
|
|
1014
|
-
chunk_size: preset.chunk_size,
|
|
1015
|
-
overlap: preset.overlap,
|
|
1016
|
-
model_name,
|
|
1017
|
-
dimensions: preset.dimensions,
|
|
1018
|
-
description: preset.description,
|
|
1019
|
-
};
|
|
1020
|
-
|
|
1021
|
-
match serde_json::to_string(&serializable) {
|
|
1022
|
-
Ok(json) => match string_to_c_string(json) {
|
|
1023
|
-
Ok(ptr) => ptr,
|
|
1024
|
-
Err(e) => {
|
|
1025
|
-
set_last_error(e);
|
|
1026
|
-
ptr::null_mut()
|
|
1027
|
-
}
|
|
1028
|
-
},
|
|
1029
|
-
Err(e) => {
|
|
1030
|
-
set_last_error(format!("Failed to serialize embedding preset: {}", e));
|
|
1031
|
-
ptr::null_mut()
|
|
1032
|
-
}
|
|
1033
|
-
}
|
|
1034
|
-
})
|
|
1035
|
-
}
|
|
1036
|
-
|
|
1037
|
-
#[cfg(test)]
|
|
1038
|
-
mod tests {
|
|
1039
|
-
use super::*;
|
|
1040
|
-
use std::ffi::CStr;
|
|
1041
|
-
|
|
1042
|
-
/// An empty JSON object is expected to parse successfully.
#[test]
fn test_parse_minimal_config() {
    let parsed = parse_extraction_config_from_json("{}");
    assert!(parsed.is_ok());
}
|
|
1048
|
-
|
|
1049
|
-
/// `use_cache: true` in the JSON must end up on the parsed config.
#[test]
fn test_parse_config_with_use_cache() {
    let parsed = parse_extraction_config_from_json(r#"{"use_cache": true}"#);
    assert!(parsed.is_ok());

    let use_cache = parsed.unwrap().use_cache;
    assert!(use_cache);
}
|
|
1057
|
-
|
|
1058
|
-
/// The nested `ocr` section is parsed and keeps its backend and language.
#[test]
fn test_parse_config_with_ocr() {
    let parsed = parse_extraction_config_from_json(r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#);
    assert!(parsed.is_ok());

    let ocr_section = parsed.unwrap().ocr;
    assert!(ocr_section.is_some());

    let ocr = ocr_section.unwrap();
    assert_eq!(ocr.backend, "tesseract");
    assert_eq!(ocr.language, "eng");
}
|
|
1069
|
-
|
|
1070
|
-
/// Malformed JSON must be reported as an error rather than panicking.
#[test]
fn test_parse_invalid_json() {
    let parsed = parse_extraction_config_from_json("{invalid json}");
    assert!(parsed.is_err());
}
|
|
1076
|
-
|
|
1077
|
-
/// A config that mixes scalar fields with nested sections parses successfully.
#[test]
fn test_parse_complex_config() {
    let document = r#"{
        "use_cache": true,
        "enable_quality_processing": true,
        "force_ocr": false,
        "ocr": {
            "backend": "tesseract",
            "language": "eng"
        },
        "chunking": {
            "max_chars": 1024,
            "max_overlap": 128
        },
        "max_concurrent_extractions": 4
    }"#;

    let parsed = parse_extraction_config_from_json(document);
    assert!(parsed.is_ok());
}
|
|
1096
|
-
|
|
1097
|
-
/// Round trip: a config created from JSON serializes back to JSON that still
/// mentions the field and its value.
#[test]
fn test_config_to_json() {
    // Keep the CString in a named local so it clearly outlives the FFI call.
    let input = std::ffi::CString::new(r#"{"use_cache": true}"#).unwrap();
    let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
    assert!(!config_ptr.is_null());

    let json_out = unsafe { kreuzberg_config_to_json(config_ptr) };
    assert!(!json_out.is_null());

    let rendered = unsafe { CStr::from_ptr(json_out) }.to_str().unwrap();
    for needle in ["use_cache", "true"] {
        assert!(rendered.contains(needle));
    }

    unsafe {
        crate::kreuzberg_free_string(json_out);
        kreuzberg_config_free(config_ptr);
    }
}
|
|
1115
|
-
|
|
1116
|
-
/// A NULL config must yield a NULL string instead of crashing.
#[test]
fn test_config_to_json_null_pointer() {
    let json_out = unsafe { kreuzberg_config_to_json(ptr::null()) };
    assert!(json_out.is_null());
}
|
|
1121
|
-
|
|
1122
|
-
/// A top-level field is returned as its JSON rendering.
#[test]
fn test_config_get_field_simple() {
    // Keep the CString in a named local so it clearly outlives the FFI call.
    let input = std::ffi::CString::new(r#"{"use_cache": true}"#).unwrap();
    let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
    assert!(!config_ptr.is_null());

    let field_name = std::ffi::CString::new("use_cache").unwrap();
    let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
    assert!(!field_value.is_null());

    let rendered = unsafe { CStr::from_ptr(field_value) }.to_str().unwrap();
    assert_eq!(rendered, "true");

    unsafe {
        crate::kreuzberg_free_string(field_value);
        kreuzberg_config_free(config_ptr);
    }
}
|
|
1140
|
-
|
|
1141
|
-
/// A dotted path reaches into a nested section; string values come back
/// JSON-quoted.
#[test]
fn test_config_get_field_nested() {
    // Keep the CString in a named local so it clearly outlives the FFI call.
    let input = std::ffi::CString::new(r#"{"ocr": {"backend": "tesseract"}}"#).unwrap();
    let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
    assert!(!config_ptr.is_null());

    let field_name = std::ffi::CString::new("ocr.backend").unwrap();
    let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
    assert!(!field_value.is_null());

    let rendered = unsafe { CStr::from_ptr(field_value) }.to_str().unwrap();
    assert_eq!(rendered, r#""tesseract""#);

    unsafe {
        crate::kreuzberg_free_string(field_value);
        kreuzberg_config_free(config_ptr);
    }
}
|
|
1159
|
-
|
|
1160
|
-
/// Asking for a field that does not exist yields NULL.
#[test]
fn test_config_get_field_missing() {
    // Keep the CString in a named local so it clearly outlives the FFI call.
    let input = std::ffi::CString::new(r#"{"use_cache": true}"#).unwrap();
    let config_ptr = unsafe { kreuzberg_config_from_json(input.as_ptr()) };
    assert!(!config_ptr.is_null());

    let field_name = std::ffi::CString::new("nonexistent").unwrap();
    let field_value = unsafe { kreuzberg_config_get_field(config_ptr, field_name.as_ptr()) };
    assert!(field_value.is_null());

    unsafe { kreuzberg_config_free(config_ptr) };
}
|
|
1174
|
-
|
|
1175
|
-
/// Passing a null config pointer to the field getter must return null.
#[test]
fn test_config_get_field_null_pointer() {
    let key = std::ffi::CString::new("use_cache").unwrap();
    // The null config is the condition under test; `key` is a live C string.
    let lookup = unsafe { kreuzberg_config_get_field(ptr::null(), key.as_ptr()) };
    assert!(lookup.is_null());
}
|
|
1181
|
-
|
|
1182
|
-
/// Merges an override config into a base config and verifies both the
/// serialized output and the resulting field values.
///
/// The base sets `use_cache = true, force_ocr = false`; the override sets only
/// `force_ocr = true`. After a successful merge (return value `1`) the base
/// must still have `use_cache` enabled and must now have `force_ocr` enabled.
#[test]
fn test_config_merge() {
    let base_json = std::ffi::CString::new(r#"{"use_cache": true, "force_ocr": false}"#).unwrap();
    let override_json = std::ffi::CString::new(r#"{"force_ocr": true}"#).unwrap();

    // SAFETY: both CStrings are NUL-terminated and outlive the calls.
    let base_ptr = unsafe { kreuzberg_config_from_json(base_json.as_ptr()) };
    let override_ptr = unsafe { kreuzberg_config_from_json(override_json.as_ptr()) };

    assert!(!base_ptr.is_null());
    assert!(!override_ptr.is_null());

    // SAFETY: both pointers were checked non-null above.
    let result = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
    assert_eq!(result, 1);

    // Checking only for key names in the JSON would pass even if the merge
    // did nothing, so inspect the merged values directly as well.
    {
        // SAFETY: `base_ptr` is non-null and owned by this test; the reference
        // is dropped at the end of this scope, before the config is freed.
        let merged = unsafe { &*base_ptr };
        assert!(merged.use_cache, "base value not named in the override must survive the merge");
        assert!(merged.force_ocr, "override value must be applied to the base");
    }

    // SAFETY: `base_ptr` is still a live, non-null config.
    let merged_json = unsafe { kreuzberg_config_to_json(base_ptr) };
    assert!(!merged_json.is_null());

    // SAFETY: `merged_json` is non-null; presumably a NUL-terminated string
    // valid until freed below (TODO confirm against the serializer).
    let merged_str = unsafe { CStr::from_ptr(merged_json) }.to_str().unwrap();
    assert!(merged_str.contains("use_cache"));
    assert!(merged_str.contains("force_ocr"));

    // Release the serialized string, then both configs.
    unsafe {
        crate::kreuzberg_free_string(merged_json);
        kreuzberg_config_free(base_ptr);
        kreuzberg_config_free(override_ptr);
    }
}
|
|
1210
|
-
|
|
1211
|
-
/// Merging into a null base config must be rejected with a return value of `0`.
///
/// The override config is verified to be non-null first, so the null base is
/// the only invalid argument and a null pointer is never passed to the free call.
#[test]
fn test_config_merge_null_base() {
    let override_json = std::ffi::CString::new(r#"{"force_ocr": true}"#).unwrap();
    // SAFETY: `override_json` is NUL-terminated and outlives the call.
    let override_ptr = unsafe { kreuzberg_config_from_json(override_json.as_ptr()) };
    // Guard the precondition: without it a failed parse would make the test
    // pass for the wrong reason.
    assert!(!override_ptr.is_null());

    // The null base is the condition under test.
    let result = unsafe { kreuzberg_config_merge(ptr::null_mut(), override_ptr) };
    assert_eq!(result, 0);

    // SAFETY: `override_ptr` was checked non-null and has not been freed yet.
    unsafe {
        kreuzberg_config_free(override_ptr);
    }
}
|
|
1224
|
-
|
|
1225
|
-
/// Merging a null override into a valid base config must be rejected with a
/// return value of `0`.
///
/// The base config is verified to be non-null first; otherwise the `0` result
/// could come from a null base and the null-override path would go untested.
#[test]
fn test_config_merge_null_override() {
    let base_json = std::ffi::CString::new(r#"{"use_cache": true}"#).unwrap();
    // SAFETY: `base_json` is NUL-terminated and outlives the call.
    let base_ptr = unsafe { kreuzberg_config_from_json(base_json.as_ptr()) };
    // Guard the precondition so the null override is the only invalid argument.
    assert!(!base_ptr.is_null());

    // The null override is the condition under test.
    let result = unsafe { kreuzberg_config_merge(base_ptr, ptr::null()) };
    assert_eq!(result, 0);

    // SAFETY: `base_ptr` was checked non-null and has not been freed yet.
    unsafe {
        kreuzberg_config_free(base_ptr);
    }
}
|
|
1237
|
-
|
|
1238
|
-
/// An override whose value is `use_cache = true` must still replace a base
/// that has `use_cache = false`; the assertion message below describes `true`
/// as the default value for this field.
#[test]
fn test_config_merge_override_to_default_value() {
    let base_src = std::ffi::CString::new(r#"{"use_cache": false}"#).unwrap();
    let override_src = std::ffi::CString::new(r#"{"use_cache": true}"#).unwrap();

    // SAFETY: both CStrings are NUL-terminated and outlive the calls.
    let base_ptr = unsafe { kreuzberg_config_from_json(base_src.as_ptr()) };
    let override_ptr = unsafe { kreuzberg_config_from_json(override_src.as_ptr()) };

    assert!(!base_ptr.is_null());
    assert!(!override_ptr.is_null());

    // Check the starting state before merging.
    // SAFETY: `base_ptr` is non-null; this borrow is not used after the merge.
    let before = unsafe { &*base_ptr };
    assert!(!before.use_cache);

    // SAFETY: both pointers were checked non-null above.
    let status = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
    assert_eq!(status, 1);

    // Re-borrow after the merge to observe the updated field.
    // SAFETY: `base_ptr` is still a live, non-null config.
    let after = unsafe { &*base_ptr };
    assert!(after.use_cache, "override to default value should be applied");

    unsafe {
        kreuzberg_config_free(base_ptr);
        kreuzberg_config_free(override_ptr);
    }
}
|
|
1264
|
-
|
|
1265
|
-
/// An override with `force_ocr = true` must turn the flag on in a base config
/// that started with `force_ocr = false`.
#[test]
fn test_config_merge_override_force_ocr() {
    let base_src = std::ffi::CString::new(r#"{"force_ocr": false}"#).unwrap();
    let override_src = std::ffi::CString::new(r#"{"force_ocr": true}"#).unwrap();

    // SAFETY: both CStrings are NUL-terminated and outlive the calls.
    let base_ptr = unsafe { kreuzberg_config_from_json(base_src.as_ptr()) };
    let override_ptr = unsafe { kreuzberg_config_from_json(override_src.as_ptr()) };

    assert!(!base_ptr.is_null());
    assert!(!override_ptr.is_null());

    // SAFETY: both pointers were checked non-null above.
    let status = unsafe { kreuzberg_config_merge(base_ptr, override_ptr) };
    assert_eq!(status, 1);

    // SAFETY: `base_ptr` is still a live, non-null config.
    let merged = unsafe { &*base_ptr };
    assert!(merged.force_ocr);

    unsafe {
        kreuzberg_config_free(base_ptr);
        kreuzberg_config_free(override_ptr);
    }
}
|
|
1288
|
-
|
|
1289
|
-
/// The preset listing must come back as a non-null string that is bracketed
/// like a JSON array (`[` ... `]`).
#[test]
fn test_list_embedding_presets() {
    let listing = unsafe { kreuzberg_list_embedding_presets() };
    assert!(!listing.is_null());

    // SAFETY: `listing` is non-null; presumably a NUL-terminated string valid
    // until freed below (TODO confirm against the listing function).
    let text = unsafe { CStr::from_ptr(listing) }.to_str().unwrap();
    assert!(text.starts_with('['));
    assert!(text.ends_with(']'));

    unsafe { crate::kreuzberg_free_string(listing) };
}
|
|
1302
|
-
|
|
1303
|
-
/// A null preset name must produce a null result.
#[test]
fn test_get_embedding_preset_null() {
    // The null name is the condition under test.
    let preset = unsafe { kreuzberg_get_embedding_preset(ptr::null()) };
    assert!(preset.is_null());
}
|
|
1308
|
-
|
|
1309
|
-
/// Looking up a preset name that does not exist must produce a null result.
#[test]
fn test_get_embedding_preset_unknown() {
    let missing = CString::new("nonexistent_preset").unwrap();
    // SAFETY: `missing` is NUL-terminated and outlives the call.
    let preset = unsafe { kreuzberg_get_embedding_preset(missing.as_ptr()) };
    assert!(preset.is_null());
}
|
|
1315
|
-
|
|
1316
|
-
/// The `fast` preset must resolve to a non-null description that mentions
/// both `name` and `chunk_size`.
#[test]
fn test_get_embedding_preset_valid() {
    let preset_name = CString::new("fast").unwrap();
    // SAFETY: `preset_name` is NUL-terminated and outlives the call.
    let preset = unsafe { kreuzberg_get_embedding_preset(preset_name.as_ptr()) };
    assert!(!preset.is_null());

    // SAFETY: `preset` is non-null; presumably a NUL-terminated string valid
    // until freed below (TODO confirm against the preset getter).
    let description = unsafe { CStr::from_ptr(preset) }.to_str().unwrap();
    assert!(description.contains("name"));
    assert!(description.contains("chunk_size"));

    unsafe { crate::kreuzberg_free_string(preset) };
}
|
|
1330
|
-
|
|
1331
|
-
/// Config discovery may or may not find anything; the test only requires that
/// the call completes and that a returned value can be released.
#[test]
fn test_config_discover_null_safe() {
    let discovered = unsafe { kreuzberg_config_discover() };

    // A null result means no config was found, which is valid; nothing to free.
    if discovered.is_null() {
        return;
    }

    unsafe { crate::kreuzberg_free_string(discovered) };
}
|
|
1341
|
-
}
|