kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
//! Centralized FFI configuration parsing module.
|
|
2
|
+
//!
|
|
3
|
+
//! This module consolidates all configuration parsing logic that was previously
|
|
4
|
+
//! duplicated across all language bindings (Python, TypeScript, Ruby, Java, Go, C#).
|
|
5
|
+
//!
|
|
6
|
+
//! Instead of each binding reimplementing config parsing from JSON, they now
|
|
7
|
+
//! call the FFI functions provided here, ensuring:
|
|
8
|
+
//! - Single source of truth for validation rules
|
|
9
|
+
//! - Consistent behavior across all languages
|
|
10
|
+
//! - Elimination of drift/inconsistencies
|
|
11
|
+
//! - Better performance (no JSON round-trips in language bindings)
|
|
12
|
+
|
|
13
|
+
mod html;
|
|
14
|
+
mod loader;
|
|
15
|
+
mod merge;
|
|
16
|
+
mod parse;
|
|
17
|
+
mod serialize;
|
|
18
|
+
|
|
19
|
+
// Re-export key functions for internal use
|
|
20
|
+
pub use loader::{
|
|
21
|
+
discover_config_as_json, get_embedding_preset, list_embedding_presets, load_config_as_json, load_config_from_file,
|
|
22
|
+
};
|
|
23
|
+
pub use merge::merge_configs;
|
|
24
|
+
pub use parse::parse_extraction_config_from_json;
|
|
25
|
+
pub use serialize::{config_to_json_string, get_field_as_json, json_to_c_string};
|
|
26
|
+
|
|
27
|
+
use crate::ffi_panic_guard;
|
|
28
|
+
use crate::helpers::{clear_last_error, set_last_error, string_to_c_string};
|
|
29
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
30
|
+
use std::ffi::{CStr, CString};
|
|
31
|
+
use std::os::raw::c_char;
|
|
32
|
+
use std::path::Path;
|
|
33
|
+
use std::ptr;
|
|
34
|
+
|
|
35
|
+
/// Parse an ExtractionConfig from a JSON string.
|
|
36
|
+
///
|
|
37
|
+
/// This is the primary FFI entry point for all language bindings to parse
|
|
38
|
+
/// configuration from JSON. Replaces the need for each binding to implement
|
|
39
|
+
/// its own JSON parsing logic.
|
|
40
|
+
///
|
|
41
|
+
/// # Arguments
|
|
42
|
+
///
|
|
43
|
+
/// * `json_config` - Null-terminated C string containing JSON configuration
|
|
44
|
+
///
|
|
45
|
+
/// # Returns
|
|
46
|
+
///
|
|
47
|
+
/// A pointer to an ExtractionConfig struct that MUST be freed with
|
|
48
|
+
/// `kreuzberg_config_free`, or NULL on error (check kreuzberg_last_error).
|
|
49
|
+
///
|
|
50
|
+
/// # Safety
|
|
51
|
+
///
|
|
52
|
+
/// - `json_config` must be a valid null-terminated C string
|
|
53
|
+
/// - The returned pointer must be freed with `kreuzberg_config_free`
|
|
54
|
+
/// - Returns NULL if parsing fails (error available via `kreuzberg_last_error`)
|
|
55
|
+
#[unsafe(no_mangle)]
|
|
56
|
+
pub unsafe extern "C" fn kreuzberg_config_from_json(json_config: *const c_char) -> *mut ExtractionConfig {
|
|
57
|
+
if json_config.is_null() {
|
|
58
|
+
set_last_error("Config JSON cannot be NULL".to_string());
|
|
59
|
+
return ptr::null_mut();
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
clear_last_error();
|
|
63
|
+
|
|
64
|
+
let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
|
|
65
|
+
Ok(s) => s,
|
|
66
|
+
Err(e) => {
|
|
67
|
+
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
68
|
+
return ptr::null_mut();
|
|
69
|
+
}
|
|
70
|
+
};
|
|
71
|
+
|
|
72
|
+
match parse_extraction_config_from_json(json_str) {
|
|
73
|
+
Ok(config) => Box::into_raw(Box::new(config)),
|
|
74
|
+
Err(e) => {
|
|
75
|
+
set_last_error(e);
|
|
76
|
+
ptr::null_mut()
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
/// Free an ExtractionConfig allocated by kreuzberg_config_from_json or similar.
|
|
82
|
+
///
|
|
83
|
+
/// # Safety
|
|
84
|
+
///
|
|
85
|
+
/// - `config` must be a pointer previously returned by a config creation function
|
|
86
|
+
/// - `config` can be NULL (no-op)
|
|
87
|
+
/// - `config` must not be used after this call
|
|
88
|
+
#[unsafe(no_mangle)]
|
|
89
|
+
pub unsafe extern "C" fn kreuzberg_config_free(config: *mut ExtractionConfig) {
|
|
90
|
+
if !config.is_null() {
|
|
91
|
+
let _ = unsafe { Box::from_raw(config) };
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/// Validate a JSON config string without parsing it.
|
|
96
|
+
///
|
|
97
|
+
/// # Returns
|
|
98
|
+
///
|
|
99
|
+
/// - 1 if valid (would parse successfully)
|
|
100
|
+
/// - 0 if invalid (check `kreuzberg_last_error` for details)
|
|
101
|
+
///
|
|
102
|
+
/// # Safety
|
|
103
|
+
///
|
|
104
|
+
/// - `json_config` must be a valid null-terminated C string
|
|
105
|
+
#[unsafe(no_mangle)]
|
|
106
|
+
pub unsafe extern "C" fn kreuzberg_config_is_valid(json_config: *const c_char) -> i32 {
|
|
107
|
+
if json_config.is_null() {
|
|
108
|
+
set_last_error("Config JSON cannot be NULL".to_string());
|
|
109
|
+
return 0;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
clear_last_error();
|
|
113
|
+
|
|
114
|
+
let json_str = match unsafe { CStr::from_ptr(json_config) }.to_str() {
|
|
115
|
+
Ok(s) => s,
|
|
116
|
+
Err(e) => {
|
|
117
|
+
set_last_error(format!("Invalid UTF-8 in config JSON: {}", e));
|
|
118
|
+
return 0;
|
|
119
|
+
}
|
|
120
|
+
};
|
|
121
|
+
|
|
122
|
+
match parse_extraction_config_from_json(json_str) {
|
|
123
|
+
Ok(_) => 1,
|
|
124
|
+
Err(e) => {
|
|
125
|
+
set_last_error(e);
|
|
126
|
+
0
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/// Serialize an ExtractionConfig to JSON string.
|
|
132
|
+
///
|
|
133
|
+
/// # Safety
|
|
134
|
+
///
|
|
135
|
+
/// - `config` must be a valid pointer to an ExtractionConfig
|
|
136
|
+
/// - The returned pointer must be freed with `kreuzberg_free_string`
|
|
137
|
+
#[unsafe(no_mangle)]
|
|
138
|
+
pub unsafe extern "C" fn kreuzberg_config_to_json(config: *const ExtractionConfig) -> *mut c_char {
|
|
139
|
+
if config.is_null() {
|
|
140
|
+
set_last_error("Config cannot be NULL".to_string());
|
|
141
|
+
return ptr::null_mut();
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
clear_last_error();
|
|
145
|
+
|
|
146
|
+
match config_to_json_string(unsafe { &*config }) {
|
|
147
|
+
Some(json) => json_to_c_string(json),
|
|
148
|
+
None => ptr::null_mut(),
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
/// Get a specific field from config as JSON string.
|
|
153
|
+
///
|
|
154
|
+
/// # Safety
|
|
155
|
+
///
|
|
156
|
+
/// - `config` must be a valid pointer to an ExtractionConfig
|
|
157
|
+
/// - `field_name` must be a valid null-terminated C string
|
|
158
|
+
#[unsafe(no_mangle)]
|
|
159
|
+
pub unsafe extern "C" fn kreuzberg_config_get_field(
|
|
160
|
+
config: *const ExtractionConfig,
|
|
161
|
+
field_name: *const c_char,
|
|
162
|
+
) -> *mut c_char {
|
|
163
|
+
if config.is_null() {
|
|
164
|
+
set_last_error("Config cannot be NULL".to_string());
|
|
165
|
+
return ptr::null_mut();
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
if field_name.is_null() {
|
|
169
|
+
set_last_error("Field name cannot be NULL".to_string());
|
|
170
|
+
return ptr::null_mut();
|
|
171
|
+
}
|
|
172
|
+
|
|
173
|
+
clear_last_error();
|
|
174
|
+
|
|
175
|
+
let field_str = match unsafe { CStr::from_ptr(field_name) }.to_str() {
|
|
176
|
+
Ok(s) => s,
|
|
177
|
+
Err(e) => {
|
|
178
|
+
set_last_error(format!("Invalid UTF-8 in field name: {}", e));
|
|
179
|
+
return ptr::null_mut();
|
|
180
|
+
}
|
|
181
|
+
};
|
|
182
|
+
|
|
183
|
+
match get_field_as_json(unsafe { &*config }, field_str) {
|
|
184
|
+
Some(json) => json_to_c_string(json),
|
|
185
|
+
None => ptr::null_mut(),
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
/// Merge two configs (override takes precedence over base).
|
|
190
|
+
///
|
|
191
|
+
/// # Returns
|
|
192
|
+
///
|
|
193
|
+
/// - 1 on success
|
|
194
|
+
/// - 0 on error (check `kreuzberg_last_error`)
|
|
195
|
+
///
|
|
196
|
+
/// # Safety
|
|
197
|
+
///
|
|
198
|
+
/// - `base` must be a valid mutable pointer to an ExtractionConfig
|
|
199
|
+
/// - `override_config` must be a valid pointer to an ExtractionConfig
|
|
200
|
+
#[unsafe(no_mangle)]
|
|
201
|
+
pub unsafe extern "C" fn kreuzberg_config_merge(
|
|
202
|
+
base: *mut ExtractionConfig,
|
|
203
|
+
override_config: *const ExtractionConfig,
|
|
204
|
+
) -> i32 {
|
|
205
|
+
if base.is_null() {
|
|
206
|
+
set_last_error("Base config cannot be NULL".to_string());
|
|
207
|
+
return 0;
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
if override_config.is_null() {
|
|
211
|
+
set_last_error("Override config cannot be NULL".to_string());
|
|
212
|
+
return 0;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
clear_last_error();
|
|
216
|
+
|
|
217
|
+
merge_configs(unsafe { &mut *base }, unsafe { &*override_config });
|
|
218
|
+
|
|
219
|
+
1
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
/// Load an ExtractionConfig from a file (returns JSON string).
|
|
223
|
+
///
|
|
224
|
+
/// # Safety
|
|
225
|
+
///
|
|
226
|
+
/// - `file_path` must be a valid null-terminated C string
|
|
227
|
+
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
228
|
+
#[unsafe(no_mangle)]
|
|
229
|
+
pub unsafe extern "C" fn kreuzberg_load_extraction_config_from_file(file_path: *const c_char) -> *mut c_char {
|
|
230
|
+
ffi_panic_guard!("kreuzberg_load_extraction_config_from_file", {
|
|
231
|
+
clear_last_error();
|
|
232
|
+
|
|
233
|
+
if file_path.is_null() {
|
|
234
|
+
set_last_error("file_path cannot be NULL".to_string());
|
|
235
|
+
return ptr::null_mut();
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
let path_str = match unsafe { CStr::from_ptr(file_path) }.to_str() {
|
|
239
|
+
Ok(s) => s,
|
|
240
|
+
Err(e) => {
|
|
241
|
+
set_last_error(format!("Invalid UTF-8 in file path: {}", e));
|
|
242
|
+
return ptr::null_mut();
|
|
243
|
+
}
|
|
244
|
+
};
|
|
245
|
+
|
|
246
|
+
match load_config_as_json(path_str) {
|
|
247
|
+
Ok(json) => match CString::new(json) {
|
|
248
|
+
Ok(cstr) => cstr.into_raw(),
|
|
249
|
+
Err(e) => {
|
|
250
|
+
set_last_error(format!("Failed to create C string: {}", e));
|
|
251
|
+
ptr::null_mut()
|
|
252
|
+
}
|
|
253
|
+
},
|
|
254
|
+
Err(e) => {
|
|
255
|
+
set_last_error(e);
|
|
256
|
+
ptr::null_mut()
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
})
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
/// Load an ExtractionConfig from a file (returns pointer to config struct).
|
|
263
|
+
///
|
|
264
|
+
/// # Safety
|
|
265
|
+
///
|
|
266
|
+
/// - `path` must be a valid null-terminated C string
|
|
267
|
+
/// - The returned pointer must be freed with `kreuzberg_config_free`
|
|
268
|
+
#[unsafe(no_mangle)]
|
|
269
|
+
pub unsafe extern "C" fn kreuzberg_config_from_file(path: *const c_char) -> *mut ExtractionConfig {
|
|
270
|
+
ffi_panic_guard!("kreuzberg_config_from_file", {
|
|
271
|
+
clear_last_error();
|
|
272
|
+
|
|
273
|
+
if path.is_null() {
|
|
274
|
+
set_last_error("Config path cannot be NULL".to_string());
|
|
275
|
+
return ptr::null_mut();
|
|
276
|
+
}
|
|
277
|
+
|
|
278
|
+
let path_str = match unsafe { CStr::from_ptr(path) }.to_str() {
|
|
279
|
+
Ok(s) => s,
|
|
280
|
+
Err(e) => {
|
|
281
|
+
set_last_error(format!("Invalid UTF-8 in config path: {}", e));
|
|
282
|
+
return ptr::null_mut();
|
|
283
|
+
}
|
|
284
|
+
};
|
|
285
|
+
|
|
286
|
+
let path_buf = Path::new(path_str);
|
|
287
|
+
|
|
288
|
+
match load_config_from_file(path_buf) {
|
|
289
|
+
Ok(config) => Box::into_raw(Box::new(config)),
|
|
290
|
+
Err(e) => {
|
|
291
|
+
set_last_error(e);
|
|
292
|
+
ptr::null_mut()
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
})
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
/// Discover and load an ExtractionConfig by searching parent directories.
|
|
299
|
+
///
|
|
300
|
+
/// # Safety
|
|
301
|
+
///
|
|
302
|
+
/// - The returned string must be freed with `kreuzberg_free_string`
|
|
303
|
+
#[unsafe(no_mangle)]
|
|
304
|
+
pub unsafe extern "C" fn kreuzberg_config_discover() -> *mut c_char {
|
|
305
|
+
ffi_panic_guard!("kreuzberg_config_discover", {
|
|
306
|
+
clear_last_error();
|
|
307
|
+
|
|
308
|
+
match discover_config_as_json() {
|
|
309
|
+
Some(json) => match CString::new(json) {
|
|
310
|
+
Ok(cstr) => cstr.into_raw(),
|
|
311
|
+
Err(e) => {
|
|
312
|
+
set_last_error(format!("Failed to serialize config: {}", e));
|
|
313
|
+
ptr::null_mut()
|
|
314
|
+
}
|
|
315
|
+
},
|
|
316
|
+
None => ptr::null_mut(),
|
|
317
|
+
}
|
|
318
|
+
})
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
/// List available embedding preset names.
|
|
322
|
+
///
|
|
323
|
+
/// # Safety
|
|
324
|
+
///
|
|
325
|
+
/// - Returned string is a JSON array and must be freed with `kreuzberg_free_string`
|
|
326
|
+
#[unsafe(no_mangle)]
|
|
327
|
+
pub unsafe extern "C" fn kreuzberg_list_embedding_presets() -> *mut c_char {
|
|
328
|
+
ffi_panic_guard!("kreuzberg_list_embedding_presets", {
|
|
329
|
+
clear_last_error();
|
|
330
|
+
|
|
331
|
+
match list_embedding_presets() {
|
|
332
|
+
Ok(json) => match string_to_c_string(json) {
|
|
333
|
+
Ok(ptr) => ptr,
|
|
334
|
+
Err(e) => {
|
|
335
|
+
set_last_error(e);
|
|
336
|
+
ptr::null_mut()
|
|
337
|
+
}
|
|
338
|
+
},
|
|
339
|
+
Err(e) => {
|
|
340
|
+
set_last_error(e);
|
|
341
|
+
ptr::null_mut()
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
})
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
/// Get a specific embedding preset by name.
|
|
348
|
+
///
|
|
349
|
+
/// # Safety
|
|
350
|
+
///
|
|
351
|
+
/// - `name` must be a valid null-terminated C string
|
|
352
|
+
/// - Returned string is JSON object and must be freed with `kreuzberg_free_string`
|
|
353
|
+
#[unsafe(no_mangle)]
|
|
354
|
+
pub unsafe extern "C" fn kreuzberg_get_embedding_preset(name: *const c_char) -> *mut c_char {
|
|
355
|
+
ffi_panic_guard!("kreuzberg_get_embedding_preset", {
|
|
356
|
+
clear_last_error();
|
|
357
|
+
|
|
358
|
+
if name.is_null() {
|
|
359
|
+
set_last_error("preset name cannot be NULL".to_string());
|
|
360
|
+
return ptr::null_mut();
|
|
361
|
+
}
|
|
362
|
+
|
|
363
|
+
let preset_name = match unsafe { CStr::from_ptr(name) }.to_str() {
|
|
364
|
+
Ok(s) => s,
|
|
365
|
+
Err(e) => {
|
|
366
|
+
set_last_error(format!("Invalid UTF-8 in preset name: {}", e));
|
|
367
|
+
return ptr::null_mut();
|
|
368
|
+
}
|
|
369
|
+
};
|
|
370
|
+
|
|
371
|
+
match get_embedding_preset(preset_name) {
|
|
372
|
+
Ok(json) => match string_to_c_string(json) {
|
|
373
|
+
Ok(ptr) => ptr,
|
|
374
|
+
Err(e) => {
|
|
375
|
+
set_last_error(e);
|
|
376
|
+
ptr::null_mut()
|
|
377
|
+
}
|
|
378
|
+
},
|
|
379
|
+
Err(e) => {
|
|
380
|
+
set_last_error(e);
|
|
381
|
+
ptr::null_mut()
|
|
382
|
+
}
|
|
383
|
+
}
|
|
384
|
+
})
|
|
385
|
+
}
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
//! JSON parsing and validation for ExtractionConfig
|
|
2
|
+
//!
|
|
3
|
+
//! Handles deserialization from JSON strings with comprehensive validation.
|
|
4
|
+
|
|
5
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
6
|
+
|
|
7
|
+
/// Result alias used by this module: errors are plain `String` messages.
type FfiResult<T> = Result<T, String>;
|
|
8
|
+
|
|
9
|
+
/// Parse an ExtractionConfig from a JSON string.
|
|
10
|
+
///
|
|
11
|
+
/// This is the core parsing logic shared by all FFI functions that deal with
|
|
12
|
+
/// JSON configuration. It handles:
|
|
13
|
+
/// - JSON deserialization
|
|
14
|
+
/// - All validation rules
|
|
15
|
+
/// - Type conversions
|
|
16
|
+
/// - HTML options parsing (delegated to html module)
|
|
17
|
+
///
|
|
18
|
+
/// The error messages are user-friendly and include guidance on what went wrong.
|
|
19
|
+
pub fn parse_extraction_config_from_json(json_str: &str) -> FfiResult<ExtractionConfig> {
|
|
20
|
+
let json_value: serde_json::Value = serde_json::from_str(json_str).map_err(|e| format!("Invalid JSON: {}", e))?;
|
|
21
|
+
|
|
22
|
+
let mut config: ExtractionConfig =
|
|
23
|
+
serde_json::from_value(json_value.clone()).map_err(|e| format!("Invalid configuration structure: {}", e))?;
|
|
24
|
+
|
|
25
|
+
// Parse HTML options if present (complex nested structure)
|
|
26
|
+
if let Some(html_opts_val) = json_value.get("html_options") {
|
|
27
|
+
config.html_options = Some(super::html::parse_html_options(html_opts_val)?);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
Ok(config)
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// An empty JSON object is accepted.
    #[test]
    fn test_parse_minimal_config() {
        assert!(parse_extraction_config_from_json("{}").is_ok());
    }

    /// A top-level boolean flag is carried through to the parsed config.
    #[test]
    fn test_parse_config_with_use_cache() {
        let parsed = parse_extraction_config_from_json(r#"{"use_cache": true}"#);
        assert!(parsed.is_ok());
        assert!(parsed.unwrap().use_cache);
    }

    /// A nested `ocr` section is parsed and its fields are preserved.
    #[test]
    fn test_parse_config_with_ocr() {
        let parsed = parse_extraction_config_from_json(r#"{"ocr": {"backend": "tesseract", "language": "eng"}}"#);
        assert!(parsed.is_ok());

        let ocr_section = parsed.unwrap().ocr;
        assert!(ocr_section.is_some());

        let ocr = ocr_section.unwrap();
        assert_eq!(ocr.backend, "tesseract");
        assert_eq!(ocr.language, "eng");
    }

    /// Malformed JSON is rejected.
    #[test]
    fn test_parse_invalid_json() {
        assert!(parse_extraction_config_from_json("{invalid json}").is_err());
    }

    /// A config combining several sections parses successfully.
    #[test]
    fn test_parse_complex_config() {
        let document = r#"{
            "use_cache": true,
            "enable_quality_processing": true,
            "force_ocr": false,
            "ocr": {
                "backend": "tesseract",
                "language": "eng"
            },
            "chunking": {
                "max_chars": 1024,
                "max_overlap": 128
            },
            "max_concurrent_extractions": 4
        }"#;
        assert!(parse_extraction_config_from_json(document).is_ok());
    }
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
//! Serialization and field extraction helpers
|
|
2
|
+
//!
|
|
3
|
+
//! Utilities for converting ExtractionConfig to JSON and extracting specific fields.
|
|
4
|
+
|
|
5
|
+
use crate::helpers::set_last_error;
|
|
6
|
+
use kreuzberg::core::config::ExtractionConfig;
|
|
7
|
+
use serde::Serialize;
|
|
8
|
+
use std::ffi::CString;
|
|
9
|
+
use std::os::raw::c_char;
|
|
10
|
+
use std::ptr;
|
|
11
|
+
|
|
12
|
+
/// SerializableEmbeddingPreset for FFI serialization.
|
|
13
|
+
#[derive(Serialize)]
|
|
14
|
+
pub struct SerializableEmbeddingPreset<'a> {
|
|
15
|
+
pub name: &'a str,
|
|
16
|
+
pub chunk_size: usize,
|
|
17
|
+
pub overlap: usize,
|
|
18
|
+
pub model_name: String,
|
|
19
|
+
pub dimensions: usize,
|
|
20
|
+
pub description: &'a str,
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/// Serialize an ExtractionConfig to JSON string.
|
|
24
|
+
///
|
|
25
|
+
/// # Arguments
|
|
26
|
+
///
|
|
27
|
+
/// * `config` - Reference to an ExtractionConfig
|
|
28
|
+
///
|
|
29
|
+
/// # Returns
|
|
30
|
+
///
|
|
31
|
+
/// JSON string on success, or None on error.
|
|
32
|
+
pub fn config_to_json_string(config: &ExtractionConfig) -> Option<String> {
|
|
33
|
+
serde_json::to_string(config).ok()
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Convert a JSON value to C string pointer
|
|
37
|
+
pub fn json_to_c_string(json: String) -> *mut c_char {
|
|
38
|
+
match CString::new(json) {
|
|
39
|
+
Ok(c_string) => c_string.into_raw(),
|
|
40
|
+
Err(e) => {
|
|
41
|
+
set_last_error(format!("Failed to convert JSON to C string: {}", e));
|
|
42
|
+
ptr::null_mut()
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
/// Extract a specific field from config as JSON string.
|
|
48
|
+
///
|
|
49
|
+
/// Supports dot notation for nested fields (e.g., "ocr.backend").
|
|
50
|
+
///
|
|
51
|
+
/// # Arguments
|
|
52
|
+
///
|
|
53
|
+
/// * `config` - Reference to an ExtractionConfig
|
|
54
|
+
/// * `field_path` - Dot-separated field path
|
|
55
|
+
///
|
|
56
|
+
/// # Returns
|
|
57
|
+
///
|
|
58
|
+
/// JSON string representation of the field value, or None if not found.
|
|
59
|
+
pub fn get_field_as_json(config: &ExtractionConfig, field_path: &str) -> Option<String> {
|
|
60
|
+
let json_value = match serde_json::to_value(config) {
|
|
61
|
+
Ok(val) => val,
|
|
62
|
+
Err(e) => {
|
|
63
|
+
set_last_error(format!("Failed to serialize config: {}", e));
|
|
64
|
+
return None;
|
|
65
|
+
}
|
|
66
|
+
};
|
|
67
|
+
|
|
68
|
+
let mut current = &json_value;
|
|
69
|
+
for part in field_path.split('.') {
|
|
70
|
+
if let Some(obj) = current.as_object() {
|
|
71
|
+
match obj.get(part) {
|
|
72
|
+
Some(val) => current = val,
|
|
73
|
+
None => {
|
|
74
|
+
set_last_error(format!("Field '{}' not found in config", field_path));
|
|
75
|
+
return None;
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
} else {
|
|
79
|
+
set_last_error(format!("Cannot access nested field '{}' in non-object", part));
|
|
80
|
+
return None;
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
match serde_json::to_string(current) {
|
|
85
|
+
Ok(json) => Some(json),
|
|
86
|
+
Err(e) => {
|
|
87
|
+
set_last_error(format!("Failed to serialize field value: {}", e));
|
|
88
|
+
None
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Default config with caching switched on, shared by the tests below.
    fn cached_config() -> ExtractionConfig {
        ExtractionConfig {
            use_cache: true,
            ..Default::default()
        }
    }

    /// The serialized config mentions the `use_cache` key.
    #[test]
    fn test_config_to_json_string() {
        let serialized = config_to_json_string(&cached_config());
        assert!(serialized.is_some());
        assert!(serialized.unwrap().contains("use_cache"));
    }

    /// A top-level field is returned as its JSON text.
    #[test]
    fn test_get_field_as_json() {
        let field = get_field_as_json(&cached_config(), "use_cache");
        assert!(field.is_some());
        assert_eq!(field.unwrap(), "true");
    }
}
|