kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,598 @@
|
|
|
1
|
+
//! Builder pattern API for constructing ExtractionConfig programmatically.
|
|
2
|
+
//!
|
|
3
|
+
//! This module provides a step-by-step builder interface for language bindings
|
|
4
|
+
//! that prefer to construct configurations programmatically rather than via JSON.
|
|
5
|
+
//!
|
|
6
|
+
//! Unlike the JSON-based API in the `config` module, this builder allows incremental
|
|
7
|
+
//! configuration construction with immediate validation at each step.
|
|
8
|
+
|
|
9
|
+
use crate::ffi_panic_guard;
|
|
10
|
+
use crate::ffi_panic_guard_i32;
|
|
11
|
+
use crate::helpers::{clear_last_error, set_last_error};
|
|
12
|
+
use kreuzberg::core::config::{
|
|
13
|
+
ChunkingConfig, ExtractionConfig, ImageExtractionConfig, LanguageDetectionConfig, OcrConfig, PdfConfig,
|
|
14
|
+
PostProcessorConfig,
|
|
15
|
+
};
|
|
16
|
+
use std::ffi::{CStr, c_char};
|
|
17
|
+
use std::ptr;
|
|
18
|
+
|
|
19
|
+
/// Opaque builder struct for constructing ExtractionConfig.
|
|
20
|
+
///
|
|
21
|
+
/// Use kreuzberg_config_builder_new() to create, set fields with setters,
|
|
22
|
+
/// then finalize with kreuzberg_config_builder_build().
|
|
23
|
+
pub struct ConfigBuilder {
|
|
24
|
+
config: ExtractionConfig,
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
impl ConfigBuilder {
|
|
28
|
+
fn new() -> Self {
|
|
29
|
+
Self {
|
|
30
|
+
config: ExtractionConfig::default(),
|
|
31
|
+
}
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
fn set_use_cache(&mut self, use_cache: bool) {
|
|
35
|
+
self.config.use_cache = use_cache;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
fn set_ocr_from_json(&mut self, ocr_json: &str) -> Result<(), String> {
|
|
39
|
+
let ocr_config: OcrConfig =
|
|
40
|
+
serde_json::from_str(ocr_json).map_err(|e| format!("Failed to parse OCR config JSON: {}", e))?;
|
|
41
|
+
self.config.ocr = Some(ocr_config);
|
|
42
|
+
Ok(())
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
fn set_pdf_from_json(&mut self, pdf_json: &str) -> Result<(), String> {
|
|
46
|
+
let pdf_config: PdfConfig =
|
|
47
|
+
serde_json::from_str(pdf_json).map_err(|e| format!("Failed to parse PDF config JSON: {}", e))?;
|
|
48
|
+
self.config.pdf_options = Some(pdf_config);
|
|
49
|
+
Ok(())
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
fn set_chunking_from_json(&mut self, chunking_json: &str) -> Result<(), String> {
|
|
53
|
+
let chunking_config: ChunkingConfig =
|
|
54
|
+
serde_json::from_str(chunking_json).map_err(|e| format!("Failed to parse chunking config JSON: {}", e))?;
|
|
55
|
+
self.config.chunking = Some(chunking_config);
|
|
56
|
+
Ok(())
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
fn set_image_extraction_from_json(&mut self, image_json: &str) -> Result<(), String> {
|
|
60
|
+
let image_config: ImageExtractionConfig = serde_json::from_str(image_json)
|
|
61
|
+
.map_err(|e| format!("Failed to parse image extraction config JSON: {}", e))?;
|
|
62
|
+
self.config.images = Some(image_config);
|
|
63
|
+
Ok(())
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
fn set_post_processor_from_json(&mut self, pp_json: &str) -> Result<(), String> {
|
|
67
|
+
let pp_config: PostProcessorConfig =
|
|
68
|
+
serde_json::from_str(pp_json).map_err(|e| format!("Failed to parse post processor config JSON: {}", e))?;
|
|
69
|
+
self.config.postprocessor = Some(pp_config);
|
|
70
|
+
Ok(())
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
fn set_language_detection_from_json(&mut self, ld_json: &str) -> Result<(), String> {
|
|
74
|
+
let ld_config: LanguageDetectionConfig = serde_json::from_str(ld_json)
|
|
75
|
+
.map_err(|e| format!("Failed to parse language detection config JSON: {}", e))?;
|
|
76
|
+
self.config.language_detection = Some(ld_config);
|
|
77
|
+
Ok(())
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
fn build(self) -> ExtractionConfig {
|
|
81
|
+
self.config
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
/// Create a new config builder.
|
|
86
|
+
///
|
|
87
|
+
/// Returns an opaque pointer to ConfigBuilder. Must be freed with
|
|
88
|
+
/// kreuzberg_config_builder_free() or consumed by kreuzberg_config_builder_build().
|
|
89
|
+
///
|
|
90
|
+
/// # Safety
|
|
91
|
+
///
|
|
92
|
+
/// The returned pointer must be freed with kreuzberg_config_builder_free()
|
|
93
|
+
/// or passed to kreuzberg_config_builder_build().
|
|
94
|
+
///
|
|
95
|
+
/// # Example (C)
|
|
96
|
+
///
|
|
97
|
+
/// ```c
|
|
98
|
+
/// ConfigBuilder* builder = kreuzberg_config_builder_new();
|
|
99
|
+
/// kreuzberg_config_builder_set_use_cache(builder, 1);
|
|
100
|
+
/// ExtractionConfig* config = kreuzberg_config_builder_build(builder);
|
|
101
|
+
/// // builder is now consumed, don't call kreuzberg_config_builder_free
|
|
102
|
+
/// kreuzberg_config_free(config);
|
|
103
|
+
/// ```
|
|
104
|
+
#[unsafe(no_mangle)]
|
|
105
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_new() -> *mut ConfigBuilder {
|
|
106
|
+
ffi_panic_guard!("kreuzberg_config_builder_new", {
|
|
107
|
+
clear_last_error();
|
|
108
|
+
Box::into_raw(Box::new(ConfigBuilder::new()))
|
|
109
|
+
})
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/// Set the use_cache field.
|
|
113
|
+
///
|
|
114
|
+
/// # Arguments
|
|
115
|
+
///
|
|
116
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
117
|
+
/// * `use_cache` - 1 for true, 0 for false
|
|
118
|
+
///
|
|
119
|
+
/// # Returns
|
|
120
|
+
///
|
|
121
|
+
/// 0 on success, -1 on error (NULL builder)
|
|
122
|
+
///
|
|
123
|
+
/// # Safety
|
|
124
|
+
///
|
|
125
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
126
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
127
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
128
|
+
#[unsafe(no_mangle)]
|
|
129
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_use_cache(builder: *mut ConfigBuilder, use_cache: i32) -> i32 {
|
|
130
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_use_cache", {
|
|
131
|
+
if builder.is_null() {
|
|
132
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
133
|
+
return -1;
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
clear_last_error();
|
|
137
|
+
unsafe { (*builder).set_use_cache(use_cache != 0) };
|
|
138
|
+
0
|
|
139
|
+
})
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/// Set OCR configuration from JSON.
|
|
143
|
+
///
|
|
144
|
+
/// # Arguments
|
|
145
|
+
///
|
|
146
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
147
|
+
/// * `ocr_json` - JSON string like `{"backend": "tesseract", "languages": ["en"]}`
|
|
148
|
+
///
|
|
149
|
+
/// # Returns
|
|
150
|
+
///
|
|
151
|
+
/// 0 on success, -1 on error (check kreuzberg_last_error)
|
|
152
|
+
///
|
|
153
|
+
/// # Safety
|
|
154
|
+
///
|
|
155
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
156
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
157
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
158
|
+
/// - `ocr_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
159
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
160
|
+
#[unsafe(no_mangle)]
|
|
161
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_ocr(builder: *mut ConfigBuilder, ocr_json: *const c_char) -> i32 {
|
|
162
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_ocr", {
|
|
163
|
+
if builder.is_null() {
|
|
164
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
165
|
+
return -1;
|
|
166
|
+
}
|
|
167
|
+
if ocr_json.is_null() {
|
|
168
|
+
set_last_error("OCR JSON cannot be NULL".to_string());
|
|
169
|
+
return -1;
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
clear_last_error();
|
|
173
|
+
|
|
174
|
+
let json_str = match unsafe { CStr::from_ptr(ocr_json) }.to_str() {
|
|
175
|
+
Ok(s) => s,
|
|
176
|
+
Err(e) => {
|
|
177
|
+
set_last_error(format!("Invalid UTF-8 in OCR JSON: {}", e));
|
|
178
|
+
return -1;
|
|
179
|
+
}
|
|
180
|
+
};
|
|
181
|
+
|
|
182
|
+
match unsafe { (*builder).set_ocr_from_json(json_str) } {
|
|
183
|
+
Ok(()) => 0,
|
|
184
|
+
Err(e) => {
|
|
185
|
+
set_last_error(e);
|
|
186
|
+
-1
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
})
|
|
190
|
+
}
|
|
191
|
+
|
|
192
|
+
/// Set PDF configuration from JSON.
|
|
193
|
+
///
|
|
194
|
+
/// # Arguments
|
|
195
|
+
///
|
|
196
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
197
|
+
/// * `pdf_json` - JSON string for PDF config
|
|
198
|
+
///
|
|
199
|
+
/// # Returns
|
|
200
|
+
///
|
|
201
|
+
/// 0 on success, -1 on error
|
|
202
|
+
///
|
|
203
|
+
/// # Safety
|
|
204
|
+
///
|
|
205
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
206
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
207
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
208
|
+
/// - `pdf_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
209
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
210
|
+
#[unsafe(no_mangle)]
|
|
211
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_pdf(builder: *mut ConfigBuilder, pdf_json: *const c_char) -> i32 {
|
|
212
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_pdf", {
|
|
213
|
+
if builder.is_null() {
|
|
214
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
215
|
+
return -1;
|
|
216
|
+
}
|
|
217
|
+
if pdf_json.is_null() {
|
|
218
|
+
set_last_error("PDF JSON cannot be NULL".to_string());
|
|
219
|
+
return -1;
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
clear_last_error();
|
|
223
|
+
|
|
224
|
+
let json_str = match unsafe { CStr::from_ptr(pdf_json) }.to_str() {
|
|
225
|
+
Ok(s) => s,
|
|
226
|
+
Err(e) => {
|
|
227
|
+
set_last_error(format!("Invalid UTF-8 in PDF JSON: {}", e));
|
|
228
|
+
return -1;
|
|
229
|
+
}
|
|
230
|
+
};
|
|
231
|
+
|
|
232
|
+
match unsafe { (*builder).set_pdf_from_json(json_str) } {
|
|
233
|
+
Ok(()) => 0,
|
|
234
|
+
Err(e) => {
|
|
235
|
+
set_last_error(e);
|
|
236
|
+
-1
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
})
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
/// Set chunking configuration from JSON.
|
|
243
|
+
///
|
|
244
|
+
/// # Arguments
|
|
245
|
+
///
|
|
246
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
247
|
+
/// * `chunking_json` - JSON string for chunking config
|
|
248
|
+
///
|
|
249
|
+
/// # Returns
|
|
250
|
+
///
|
|
251
|
+
/// 0 on success, -1 on error
|
|
252
|
+
///
|
|
253
|
+
/// # Safety
|
|
254
|
+
///
|
|
255
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
256
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
257
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
258
|
+
/// - `chunking_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
259
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
260
|
+
#[unsafe(no_mangle)]
|
|
261
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_chunking(
|
|
262
|
+
builder: *mut ConfigBuilder,
|
|
263
|
+
chunking_json: *const c_char,
|
|
264
|
+
) -> i32 {
|
|
265
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_chunking", {
|
|
266
|
+
if builder.is_null() {
|
|
267
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
268
|
+
return -1;
|
|
269
|
+
}
|
|
270
|
+
if chunking_json.is_null() {
|
|
271
|
+
set_last_error("Chunking JSON cannot be NULL".to_string());
|
|
272
|
+
return -1;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
clear_last_error();
|
|
276
|
+
|
|
277
|
+
let json_str = match unsafe { CStr::from_ptr(chunking_json) }.to_str() {
|
|
278
|
+
Ok(s) => s,
|
|
279
|
+
Err(e) => {
|
|
280
|
+
set_last_error(format!("Invalid UTF-8 in chunking JSON: {}", e));
|
|
281
|
+
return -1;
|
|
282
|
+
}
|
|
283
|
+
};
|
|
284
|
+
|
|
285
|
+
match unsafe { (*builder).set_chunking_from_json(json_str) } {
|
|
286
|
+
Ok(()) => 0,
|
|
287
|
+
Err(e) => {
|
|
288
|
+
set_last_error(e);
|
|
289
|
+
-1
|
|
290
|
+
}
|
|
291
|
+
}
|
|
292
|
+
})
|
|
293
|
+
}
|
|
294
|
+
|
|
295
|
+
/// Set image extraction configuration from JSON.
|
|
296
|
+
///
|
|
297
|
+
/// # Arguments
|
|
298
|
+
///
|
|
299
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
300
|
+
/// * `image_json` - JSON string for image extraction config
|
|
301
|
+
///
|
|
302
|
+
/// # Returns
|
|
303
|
+
///
|
|
304
|
+
/// 0 on success, -1 on error
|
|
305
|
+
///
|
|
306
|
+
/// # Safety
|
|
307
|
+
///
|
|
308
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
309
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
310
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
311
|
+
/// - `image_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
312
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
313
|
+
#[unsafe(no_mangle)]
|
|
314
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_image_extraction(
|
|
315
|
+
builder: *mut ConfigBuilder,
|
|
316
|
+
image_json: *const c_char,
|
|
317
|
+
) -> i32 {
|
|
318
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_image_extraction", {
|
|
319
|
+
if builder.is_null() {
|
|
320
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
321
|
+
return -1;
|
|
322
|
+
}
|
|
323
|
+
if image_json.is_null() {
|
|
324
|
+
set_last_error("Image extraction JSON cannot be NULL".to_string());
|
|
325
|
+
return -1;
|
|
326
|
+
}
|
|
327
|
+
|
|
328
|
+
clear_last_error();
|
|
329
|
+
|
|
330
|
+
let json_str = match unsafe { CStr::from_ptr(image_json) }.to_str() {
|
|
331
|
+
Ok(s) => s,
|
|
332
|
+
Err(e) => {
|
|
333
|
+
set_last_error(format!("Invalid UTF-8 in image extraction JSON: {}", e));
|
|
334
|
+
return -1;
|
|
335
|
+
}
|
|
336
|
+
};
|
|
337
|
+
|
|
338
|
+
match unsafe { (*builder).set_image_extraction_from_json(json_str) } {
|
|
339
|
+
Ok(()) => 0,
|
|
340
|
+
Err(e) => {
|
|
341
|
+
set_last_error(e);
|
|
342
|
+
-1
|
|
343
|
+
}
|
|
344
|
+
}
|
|
345
|
+
})
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
/// Set post-processor configuration from JSON.
|
|
349
|
+
///
|
|
350
|
+
/// # Arguments
|
|
351
|
+
///
|
|
352
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
353
|
+
/// * `pp_json` - JSON string for post-processor config
|
|
354
|
+
///
|
|
355
|
+
/// # Returns
|
|
356
|
+
///
|
|
357
|
+
/// 0 on success, -1 on error
|
|
358
|
+
///
|
|
359
|
+
/// # Safety
|
|
360
|
+
///
|
|
361
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
362
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
363
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
364
|
+
/// - `pp_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
365
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
366
|
+
#[unsafe(no_mangle)]
|
|
367
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_post_processor(
|
|
368
|
+
builder: *mut ConfigBuilder,
|
|
369
|
+
pp_json: *const c_char,
|
|
370
|
+
) -> i32 {
|
|
371
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_post_processor", {
|
|
372
|
+
if builder.is_null() {
|
|
373
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
374
|
+
return -1;
|
|
375
|
+
}
|
|
376
|
+
if pp_json.is_null() {
|
|
377
|
+
set_last_error("Post-processor JSON cannot be NULL".to_string());
|
|
378
|
+
return -1;
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
clear_last_error();
|
|
382
|
+
|
|
383
|
+
let json_str = match unsafe { CStr::from_ptr(pp_json) }.to_str() {
|
|
384
|
+
Ok(s) => s,
|
|
385
|
+
Err(e) => {
|
|
386
|
+
set_last_error(format!("Invalid UTF-8 in post-processor JSON: {}", e));
|
|
387
|
+
return -1;
|
|
388
|
+
}
|
|
389
|
+
};
|
|
390
|
+
|
|
391
|
+
match unsafe { (*builder).set_post_processor_from_json(json_str) } {
|
|
392
|
+
Ok(()) => 0,
|
|
393
|
+
Err(e) => {
|
|
394
|
+
set_last_error(e);
|
|
395
|
+
-1
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
})
|
|
399
|
+
}
|
|
400
|
+
|
|
401
|
+
/// Set language detection configuration from JSON.
|
|
402
|
+
///
|
|
403
|
+
/// # Arguments
|
|
404
|
+
///
|
|
405
|
+
/// * `builder` - Non-null pointer to ConfigBuilder
|
|
406
|
+
/// * `ld_json` - JSON string for language detection config
|
|
407
|
+
///
|
|
408
|
+
/// # Returns
|
|
409
|
+
///
|
|
410
|
+
/// 0 on success, -1 on error
|
|
411
|
+
///
|
|
412
|
+
/// # Safety
|
|
413
|
+
///
|
|
414
|
+
/// This function is meant to be called from C/FFI code. The caller must ensure:
|
|
415
|
+
/// - `builder` must be a valid, non-null pointer previously returned by `kreuzberg_config_builder_new`
|
|
416
|
+
/// - The pointer must be properly aligned and point to a valid ConfigBuilder instance
|
|
417
|
+
/// - `ld_json` must be a valid, non-null pointer to a null-terminated UTF-8 string
|
|
418
|
+
/// - The string pointer must remain valid for the duration of the function call
|
|
419
|
+
#[unsafe(no_mangle)]
|
|
420
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_set_language_detection(
|
|
421
|
+
builder: *mut ConfigBuilder,
|
|
422
|
+
ld_json: *const c_char,
|
|
423
|
+
) -> i32 {
|
|
424
|
+
ffi_panic_guard_i32!("kreuzberg_config_builder_set_language_detection", {
|
|
425
|
+
if builder.is_null() {
|
|
426
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
427
|
+
return -1;
|
|
428
|
+
}
|
|
429
|
+
if ld_json.is_null() {
|
|
430
|
+
set_last_error("Language detection JSON cannot be NULL".to_string());
|
|
431
|
+
return -1;
|
|
432
|
+
}
|
|
433
|
+
|
|
434
|
+
clear_last_error();
|
|
435
|
+
|
|
436
|
+
let json_str = match unsafe { CStr::from_ptr(ld_json) }.to_str() {
|
|
437
|
+
Ok(s) => s,
|
|
438
|
+
Err(e) => {
|
|
439
|
+
set_last_error(format!("Invalid UTF-8 in language detection JSON: {}", e));
|
|
440
|
+
return -1;
|
|
441
|
+
}
|
|
442
|
+
};
|
|
443
|
+
|
|
444
|
+
match unsafe { (*builder).set_language_detection_from_json(json_str) } {
|
|
445
|
+
Ok(()) => 0,
|
|
446
|
+
Err(e) => {
|
|
447
|
+
set_last_error(e);
|
|
448
|
+
-1
|
|
449
|
+
}
|
|
450
|
+
}
|
|
451
|
+
})
|
|
452
|
+
}
|
|
453
|
+
|
|
454
|
+
/// Build the final ExtractionConfig and consume the builder.
|
|
455
|
+
///
|
|
456
|
+
/// After calling this function, the builder pointer is invalid and must not be used.
|
|
457
|
+
/// The returned ExtractionConfig must be freed with kreuzberg_config_free().
|
|
458
|
+
///
|
|
459
|
+
/// # Arguments
|
|
460
|
+
///
|
|
461
|
+
/// * `builder` - Non-null pointer to ConfigBuilder (will be consumed)
|
|
462
|
+
///
|
|
463
|
+
/// # Returns
|
|
464
|
+
///
|
|
465
|
+
/// Pointer to ExtractionConfig on success, NULL on error
|
|
466
|
+
///
|
|
467
|
+
/// # Safety
|
|
468
|
+
///
|
|
469
|
+
/// - `builder` is consumed and must not be used after this call
|
|
470
|
+
/// - Do NOT call kreuzberg_config_builder_free() after this function
|
|
471
|
+
/// - The returned ExtractionConfig must be freed with kreuzberg_config_free()
|
|
472
|
+
#[unsafe(no_mangle)]
|
|
473
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_build(builder: *mut ConfigBuilder) -> *mut ExtractionConfig {
|
|
474
|
+
ffi_panic_guard!("kreuzberg_config_builder_build", {
|
|
475
|
+
if builder.is_null() {
|
|
476
|
+
set_last_error("ConfigBuilder pointer cannot be NULL".to_string());
|
|
477
|
+
return ptr::null_mut();
|
|
478
|
+
}
|
|
479
|
+
|
|
480
|
+
clear_last_error();
|
|
481
|
+
let builder_box = unsafe { Box::from_raw(builder) };
|
|
482
|
+
let config = builder_box.build();
|
|
483
|
+
Box::into_raw(Box::new(config))
|
|
484
|
+
})
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
/// Free a ConfigBuilder without building.
|
|
488
|
+
///
|
|
489
|
+
/// Use this to discard a builder without creating a config.
|
|
490
|
+
/// Do NOT call this after kreuzberg_config_builder_build() (builder is already consumed).
|
|
491
|
+
///
|
|
492
|
+
/// # Arguments
|
|
493
|
+
///
|
|
494
|
+
/// * `builder` - Pointer to ConfigBuilder, can be NULL (no-op)
|
|
495
|
+
///
|
|
496
|
+
/// # Safety
|
|
497
|
+
///
|
|
498
|
+
/// - `builder` can be NULL (no-op)
|
|
499
|
+
/// - Do NOT call this after kreuzberg_config_builder_build()
|
|
500
|
+
#[unsafe(no_mangle)]
|
|
501
|
+
pub unsafe extern "C" fn kreuzberg_config_builder_free(builder: *mut ConfigBuilder) {
|
|
502
|
+
if !builder.is_null() {
|
|
503
|
+
unsafe { drop(Box::from_raw(builder)) };
|
|
504
|
+
}
|
|
505
|
+
}
|
|
506
|
+
|
|
507
|
+
#[cfg(test)]
mod tests {
    use super::*;
    use std::ffi::CString;

    /// Setting a flag and then building yields a config carrying that flag.
    #[test]
    fn test_builder_basic_flow() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            let status = kreuzberg_config_builder_set_use_cache(handle, 1);
            assert_eq!(status, 0);

            let built = kreuzberg_config_builder_build(handle);
            assert!(!built.is_null());

            assert!((*built).use_cache);

            // Reclaim the config so the test does not leak it.
            drop(Box::from_raw(built));
        }
    }

    /// A valid OCR JSON document is accepted and shows up in the built config.
    #[test]
    fn test_builder_with_ocr() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            let ocr_payload = CString::new(r#"{"backend":"tesseract","languages":["en"]}"#).unwrap();
            let status = kreuzberg_config_builder_set_ocr(handle, ocr_payload.as_ptr());
            assert_eq!(status, 0);

            let built = kreuzberg_config_builder_build(handle);
            assert!(!built.is_null());

            assert!((*built).ocr.is_some());

            // Reclaim the config so the test does not leak it.
            drop(Box::from_raw(built));
        }
    }

    /// NULL builders are rejected by both a setter and `build`.
    #[test]
    fn test_builder_null_checks() {
        unsafe {
            // A setter given a NULL builder must report failure.
            let status = kreuzberg_config_builder_set_use_cache(ptr::null_mut(), 1);
            assert_eq!(status, -1);

            // Building from a NULL builder must yield a NULL config.
            let built = kreuzberg_config_builder_build(ptr::null_mut());
            assert!(built.is_null());
        }
    }

    /// Freeing works both for a live builder and for NULL.
    #[test]
    fn test_builder_free() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            // Discarding an unbuilt builder should not crash.
            kreuzberg_config_builder_free(handle);

            // Freeing NULL should not crash either.
            kreuzberg_config_builder_free(ptr::null_mut());
        }
    }

    /// A rejected JSON payload leaves the builder in a usable state.
    #[test]
    fn test_builder_invalid_json() {
        unsafe {
            let handle = kreuzberg_config_builder_new();
            assert!(!handle.is_null());

            let bad_payload = CString::new("not valid json").unwrap();
            let status = kreuzberg_config_builder_set_ocr(handle, bad_payload.as_ptr());
            assert_eq!(status, -1);

            // The builder should still accept further setters after the failure.
            let status = kreuzberg_config_builder_set_use_cache(handle, 0);
            assert_eq!(status, 0);

            let built = kreuzberg_config_builder_build(handle);
            assert!(!built.is_null());

            // Reclaim the config so the test does not leak it.
            drop(Box::from_raw(built));
        }
    }
}
|
|
@@ -486,22 +486,54 @@ pub extern "C" fn kreuzberg_get_error_details() -> CErrorDetails {
|
|
|
486
486
|
(None, None, 0)
|
|
487
487
|
};
|
|
488
488
|
|
|
489
|
+
// Helper to convert string to C string with proper error handling.
|
|
490
|
+
// On failure, logs the error and returns a fallback heap-allocated string.
|
|
491
|
+
fn string_to_cstring_with_fallback(value: String, fallback: &str, field_name: &str) -> *mut c_char {
|
|
492
|
+
match CString::new(value) {
|
|
493
|
+
Ok(cstr) => cstr.into_raw(),
|
|
494
|
+
Err(e) => {
|
|
495
|
+
log::warn!(
|
|
496
|
+
"kreuzberg_get_error_details: CString creation failed for {}: {} (contains interior NUL byte)",
|
|
497
|
+
field_name,
|
|
498
|
+
e
|
|
499
|
+
);
|
|
500
|
+
// Allocate a proper CString for the fallback so it can be safely freed
|
|
501
|
+
CString::new(fallback).map(CString::into_raw).unwrap_or_else(|_| {
|
|
502
|
+
// This should never happen since fallback is a static string without NUL bytes
|
|
503
|
+
log::warn!(
|
|
504
|
+
"kreuzberg_get_error_details: CRITICAL - fallback CString creation also failed for {}",
|
|
505
|
+
field_name
|
|
506
|
+
);
|
|
507
|
+
ptr::null_mut()
|
|
508
|
+
})
|
|
509
|
+
}
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
// Helper for optional string fields (accepts &str to match panic context types)
|
|
514
|
+
fn optional_str_to_cstring(value: Option<&str>, field_name: &str) -> *mut c_char {
|
|
515
|
+
match value {
|
|
516
|
+
Some(s) => match CString::new(s) {
|
|
517
|
+
Ok(cstr) => cstr.into_raw(),
|
|
518
|
+
Err(e) => {
|
|
519
|
+
log::warn!(
|
|
520
|
+
"kreuzberg_get_error_details: CString creation failed for {}: {} (contains interior NUL byte)",
|
|
521
|
+
field_name,
|
|
522
|
+
e
|
|
523
|
+
);
|
|
524
|
+
ptr::null_mut()
|
|
525
|
+
}
|
|
526
|
+
},
|
|
527
|
+
None => ptr::null_mut(),
|
|
528
|
+
}
|
|
529
|
+
}
|
|
530
|
+
|
|
489
531
|
CErrorDetails {
|
|
490
|
-
message:
|
|
491
|
-
.map(CString::into_raw)
|
|
492
|
-
.unwrap_or_else(|_| "Error message creation failed".as_ptr() as *mut c_char),
|
|
532
|
+
message: string_to_cstring_with_fallback(message, "CString error", "message"),
|
|
493
533
|
error_code,
|
|
494
|
-
error_type:
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
source_file: source_file
|
|
498
|
-
.and_then(|f| CString::new(f).ok())
|
|
499
|
-
.map(CString::into_raw)
|
|
500
|
-
.unwrap_or(ptr::null_mut()),
|
|
501
|
-
source_function: source_function
|
|
502
|
-
.and_then(|f| CString::new(f).ok())
|
|
503
|
-
.map(CString::into_raw)
|
|
504
|
-
.unwrap_or(ptr::null_mut()),
|
|
534
|
+
error_type: string_to_cstring_with_fallback(error_type, "unknown", "error_type"),
|
|
535
|
+
source_file: optional_str_to_cstring(source_file, "source_file"),
|
|
536
|
+
source_function: optional_str_to_cstring(source_function, "source_function"),
|
|
505
537
|
source_line,
|
|
506
538
|
context_info: ptr::null_mut(),
|
|
507
539
|
is_panic,
|