kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
//! Metadata handling and document format detection
|
|
2
|
+
//!
|
|
3
|
+
//! Provides utilities for MIME type detection, format validation, and extension mapping.
|
|
4
|
+
|
|
5
|
+
use crate::error_handling::runtime_error;
|
|
6
|
+
use magnus::Error;
|
|
7
|
+
|
|
8
|
+
/// Detect MIME type from bytes
|
|
9
|
+
pub fn detect_mime_type_from_bytes(bytes: String) -> Result<String, Error> {
|
|
10
|
+
let bytes_vec = bytes.into_bytes();
|
|
11
|
+
kreuzberg::core::mime::detect_mime_type_from_bytes(&bytes_vec)
|
|
12
|
+
.map_err(|e| runtime_error(format!("Failed to detect MIME type: {}", e)))
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
/// Detect MIME type from file path
|
|
16
|
+
pub fn detect_mime_type_from_path_native(path: String) -> Result<String, Error> {
|
|
17
|
+
kreuzberg::core::mime::detect_mime_type(&path, true)
|
|
18
|
+
.map_err(|e| runtime_error(format!("Failed to detect MIME type from path: {}", e)))
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
/// Validate MIME type
|
|
22
|
+
pub fn validate_mime_type_native(mime_type: String) -> Result<String, Error> {
|
|
23
|
+
if kreuzberg::core::mime::validate_mime_type(&mime_type).is_ok() {
|
|
24
|
+
Ok(mime_type)
|
|
25
|
+
} else {
|
|
26
|
+
Err(runtime_error(format!("Unsupported MIME type: {}", mime_type)))
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Get file extensions for a given MIME type
|
|
31
|
+
pub fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Error> {
|
|
32
|
+
kreuzberg::core::mime::get_extensions_for_mime(&mime_type)
|
|
33
|
+
.map_err(|e| runtime_error(format!("Failed to get extensions: {}", e)))
|
|
34
|
+
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
//! Plugin management for Kreuzberg
|
|
2
|
+
//!
|
|
3
|
+
//! Handles registration and management of custom plugins including post-processors,
|
|
4
|
+
//! validators, and OCR backends.
|
|
5
|
+
|
|
6
|
+
pub mod post_processor;
|
|
7
|
+
pub mod validator;
|
|
8
|
+
pub mod ocr_backend;
|
|
9
|
+
|
|
10
|
+
pub use post_processor::register_post_processor;
|
|
11
|
+
pub use validator::register_validator;
|
|
12
|
+
pub use ocr_backend::{register_ocr_backend, unregister_ocr_backend, list_ocr_backends, clear_ocr_backends};
|
|
13
|
+
|
|
14
|
+
// Plugin registry functions
|
|
15
|
+
pub use kreuzberg::get_post_processor_registry;
|
|
16
|
+
|
|
17
|
+
use magnus::Error;
|
|
18
|
+
use kreuzberg::plugins::{
|
|
19
|
+
unregister_validator as kz_unregister_validator,
|
|
20
|
+
clear_validators as kz_clear_validators,
|
|
21
|
+
list_validators as kz_list_validators,
|
|
22
|
+
list_post_processors as kz_list_post_processors,
|
|
23
|
+
list_extractors as kz_list_extractors,
|
|
24
|
+
unregister_extractor as kz_unregister_extractor,
|
|
25
|
+
clear_extractors as kz_clear_extractors,
|
|
26
|
+
};
|
|
27
|
+
|
|
28
|
+
/// Unregister a post-processor plugin by name
|
|
29
|
+
pub fn unregister_post_processor(name: String) -> Result<(), Error> {
|
|
30
|
+
let registry = get_post_processor_registry();
|
|
31
|
+
registry
|
|
32
|
+
.write()
|
|
33
|
+
.map_err(|e| crate::error_handling::runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
34
|
+
.remove(&name)
|
|
35
|
+
.map_err(crate::error_handling::kreuzberg_error)?;
|
|
36
|
+
|
|
37
|
+
Ok(())
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/// Unregister a validator plugin by name
|
|
41
|
+
pub fn unregister_validator(name: String) -> Result<(), Error> {
|
|
42
|
+
kz_unregister_validator(&name)
|
|
43
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
/// Clear all post-processors
|
|
47
|
+
pub fn clear_post_processors() -> Result<(), Error> {
|
|
48
|
+
let registry = get_post_processor_registry();
|
|
49
|
+
registry
|
|
50
|
+
.write()
|
|
51
|
+
.map_err(|e| crate::error_handling::runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
52
|
+
.shutdown_all()
|
|
53
|
+
.map_err(crate::error_handling::kreuzberg_error)?;
|
|
54
|
+
|
|
55
|
+
Ok(())
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
/// Clear all validators
|
|
59
|
+
pub fn clear_validators() -> Result<(), Error> {
|
|
60
|
+
kz_clear_validators()
|
|
61
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
/// List registered post-processors
|
|
65
|
+
pub fn list_post_processors() -> Result<Vec<String>, Error> {
|
|
66
|
+
kz_list_post_processors()
|
|
67
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/// List registered validators
|
|
71
|
+
pub fn list_validators() -> Result<Vec<String>, Error> {
|
|
72
|
+
kz_list_validators()
|
|
73
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
/// List registered document extractors
|
|
77
|
+
pub fn list_document_extractors() -> Result<Vec<String>, Error> {
|
|
78
|
+
kz_list_extractors()
|
|
79
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
/// Unregister a document extractor
|
|
83
|
+
pub fn unregister_document_extractor(name: String) -> Result<(), Error> {
|
|
84
|
+
kz_unregister_extractor(&name)
|
|
85
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
/// Clear all document extractors
|
|
89
|
+
pub fn clear_document_extractors() -> Result<(), Error> {
|
|
90
|
+
kz_clear_extractors()
|
|
91
|
+
.map_err(crate::error_handling::kreuzberg_error)
|
|
92
|
+
}
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
//! OCR backend plugin registration and management
|
|
2
|
+
|
|
3
|
+
use crate::error_handling::{kreuzberg_error, runtime_error};
|
|
4
|
+
use crate::gc_guarded_value::GcGuardedValue;
|
|
5
|
+
use magnus::{Error, Ruby, TryConvert, Value};
|
|
6
|
+
use magnus::value::ReprValue;
|
|
7
|
+
use kreuzberg::plugins::{
|
|
8
|
+
register_ocr_backend as kz_register_ocr_backend,
|
|
9
|
+
unregister_ocr_backend as kz_unregister_ocr_backend,
|
|
10
|
+
list_ocr_backends as kz_list_ocr_backends,
|
|
11
|
+
clear_ocr_backends as kz_clear_ocr_backends,
|
|
12
|
+
OcrBackend, OcrBackendType, Plugin,
|
|
13
|
+
};
|
|
14
|
+
use kreuzberg::types::{ExtractionResult, Metadata};
|
|
15
|
+
use kreuzberg::{OcrConfig, KreuzbergError};
|
|
16
|
+
use async_trait::async_trait;
|
|
17
|
+
use std::path::Path;
|
|
18
|
+
use std::sync::Arc;
|
|
19
|
+
|
|
20
|
+
/// Ruby OCR backend wrapper that implements the OcrBackend trait
|
|
21
|
+
struct RubyOcrBackend {
|
|
22
|
+
name: String,
|
|
23
|
+
backend: GcGuardedValue,
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
// SAFETY: Ruby's GC is handled by GcGuardedValue, and we ensure all Ruby
|
|
27
|
+
// calls happen through proper Magnus/Ruby FFI boundaries
|
|
28
|
+
unsafe impl Send for RubyOcrBackend {}
|
|
29
|
+
unsafe impl Sync for RubyOcrBackend {}
|
|
30
|
+
|
|
31
|
+
impl Plugin for RubyOcrBackend {
|
|
32
|
+
fn name(&self) -> &str {
|
|
33
|
+
&self.name
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
fn version(&self) -> String {
|
|
37
|
+
"1.0.0".to_string()
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
fn initialize(&self) -> kreuzberg::Result<()> {
|
|
41
|
+
Ok(())
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
45
|
+
Ok(())
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
#[async_trait]
|
|
50
|
+
impl OcrBackend for RubyOcrBackend {
|
|
51
|
+
async fn process_image(&self, image_bytes: &[u8], config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
|
|
52
|
+
let backend_name = self.name.clone();
|
|
53
|
+
let backend = self.backend.value();
|
|
54
|
+
let image_data = image_bytes.to_vec();
|
|
55
|
+
let ocr_config = config.clone();
|
|
56
|
+
|
|
57
|
+
tokio::task::block_in_place(|| {
|
|
58
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
59
|
+
|
|
60
|
+
// Convert image bytes to Ruby string (binary)
|
|
61
|
+
let ruby_bytes = ruby.str_from_slice(&image_data);
|
|
62
|
+
|
|
63
|
+
// Convert config to Ruby hash
|
|
64
|
+
let config_hash = ruby.hash_new();
|
|
65
|
+
config_hash.aset("backend", ocr_config.backend.as_str())
|
|
66
|
+
.map_err(|e| KreuzbergError::Plugin {
|
|
67
|
+
message: format!("Failed to set backend in config: {}", e),
|
|
68
|
+
plugin_name: backend_name.clone(),
|
|
69
|
+
})?;
|
|
70
|
+
config_hash.aset("language", ocr_config.language.as_str())
|
|
71
|
+
.map_err(|e| KreuzbergError::Plugin {
|
|
72
|
+
message: format!("Failed to set language in config: {}", e),
|
|
73
|
+
plugin_name: backend_name.clone(),
|
|
74
|
+
})?;
|
|
75
|
+
|
|
76
|
+
// Call Ruby backend's process_image method
|
|
77
|
+
let result: magnus::Value = backend
|
|
78
|
+
.funcall("process_image", (ruby_bytes, config_hash))
|
|
79
|
+
.map_err(|e| KreuzbergError::Plugin {
|
|
80
|
+
message: format!("Ruby OCR backend failed: {}", e),
|
|
81
|
+
plugin_name: backend_name.clone(),
|
|
82
|
+
})?;
|
|
83
|
+
|
|
84
|
+
// Convert result to String
|
|
85
|
+
let content = String::try_convert(result)
|
|
86
|
+
.map_err(|e| KreuzbergError::Plugin {
|
|
87
|
+
message: format!("OCR backend must return a String: {}", e),
|
|
88
|
+
plugin_name: backend_name.clone(),
|
|
89
|
+
})?;
|
|
90
|
+
|
|
91
|
+
Ok(ExtractionResult {
|
|
92
|
+
content,
|
|
93
|
+
mime_type: "text/plain".to_string(),
|
|
94
|
+
metadata: Metadata::default(),
|
|
95
|
+
tables: vec![],
|
|
96
|
+
detected_languages: None,
|
|
97
|
+
chunks: None,
|
|
98
|
+
images: None,
|
|
99
|
+
djot_content: None,
|
|
100
|
+
pages: None,
|
|
101
|
+
elements: None,
|
|
102
|
+
})
|
|
103
|
+
})
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
async fn process_file(&self, path: &Path, config: &OcrConfig) -> kreuzberg::Result<ExtractionResult> {
|
|
107
|
+
let bytes = std::fs::read(path)?;
|
|
108
|
+
self.process_image(&bytes, config).await
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
fn supports_language(&self, _lang: &str) -> bool {
|
|
112
|
+
// Ruby backends are assumed to support all languages by default
|
|
113
|
+
// A more sophisticated implementation could call back to Ruby
|
|
114
|
+
true
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
fn backend_type(&self) -> OcrBackendType {
|
|
118
|
+
OcrBackendType::Custom
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/// Register an OCR backend plugin
|
|
123
|
+
pub fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
|
|
124
|
+
let _ruby = Ruby::get().expect("Ruby not initialized");
|
|
125
|
+
|
|
126
|
+
// Validate that the backend has the required methods
|
|
127
|
+
if !backend.respond_to("name", true)? {
|
|
128
|
+
return Err(runtime_error("OCR backend must implement #name method"));
|
|
129
|
+
}
|
|
130
|
+
if !backend.respond_to("process_image", true)? {
|
|
131
|
+
return Err(runtime_error("OCR backend must implement #process_image(image_bytes, config) method"));
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
let backend_impl = Arc::new(RubyOcrBackend {
|
|
135
|
+
name: name.clone(),
|
|
136
|
+
backend: GcGuardedValue::new(backend),
|
|
137
|
+
});
|
|
138
|
+
|
|
139
|
+
kz_register_ocr_backend(backend_impl)
|
|
140
|
+
.map_err(kreuzberg_error)
|
|
141
|
+
}
|
|
142
|
+
|
|
143
|
+
/// Unregister an OCR backend
|
|
144
|
+
pub fn unregister_ocr_backend(_name: String) -> Result<(), Error> {
|
|
145
|
+
kz_unregister_ocr_backend(_name.as_str())
|
|
146
|
+
.map_err(kreuzberg_error)
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
/// List registered OCR backends
|
|
150
|
+
pub fn list_ocr_backends() -> Result<Vec<String>, Error> {
|
|
151
|
+
kz_list_ocr_backends()
|
|
152
|
+
.map_err(kreuzberg_error)
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/// Clear all OCR backends
|
|
156
|
+
pub fn clear_ocr_backends() -> Result<(), Error> {
|
|
157
|
+
kz_clear_ocr_backends()
|
|
158
|
+
.map_err(kreuzberg_error)
|
|
159
|
+
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
//! Post-processor plugin registration and management
|
|
2
|
+
|
|
3
|
+
use crate::{error_handling::{kreuzberg_error, runtime_error}, gc_guarded_value::GcGuardedValue, helpers::get_kw};
|
|
4
|
+
use magnus::{Error, Ruby, Value, scan_args::scan_args, TryConvert};
|
|
5
|
+
use magnus::value::ReprValue;
|
|
6
|
+
use std::sync::Arc;
|
|
7
|
+
|
|
8
|
+
/// Register a post-processor plugin
|
|
9
|
+
pub fn register_post_processor(args: &[Value]) -> Result<(), Error> {
|
|
10
|
+
let _ruby = Ruby::get().expect("Ruby not initialized");
|
|
11
|
+
let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
|
|
12
|
+
let (name, processor) = args.required;
|
|
13
|
+
let (priority,) = args.optional;
|
|
14
|
+
let priority = priority.unwrap_or(50);
|
|
15
|
+
|
|
16
|
+
if !processor.respond_to("call", true)? {
|
|
17
|
+
return Err(runtime_error("Post-processor must be a Proc or respond to 'call'"));
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
use async_trait::async_trait;
|
|
21
|
+
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
22
|
+
|
|
23
|
+
struct RubyPostProcessor {
|
|
24
|
+
name: String,
|
|
25
|
+
processor: GcGuardedValue,
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
unsafe impl Send for RubyPostProcessor {}
|
|
29
|
+
unsafe impl Sync for RubyPostProcessor {}
|
|
30
|
+
|
|
31
|
+
impl Plugin for RubyPostProcessor {
|
|
32
|
+
fn name(&self) -> &str {
|
|
33
|
+
&self.name
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
fn version(&self) -> String {
|
|
37
|
+
"1.0.0".to_string()
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
fn initialize(&self) -> kreuzberg::Result<()> {
|
|
41
|
+
Ok(())
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
45
|
+
Ok(())
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
#[async_trait]
|
|
50
|
+
impl PostProcessor for RubyPostProcessor {
|
|
51
|
+
async fn process(
|
|
52
|
+
&self,
|
|
53
|
+
result: &mut kreuzberg::ExtractionResult,
|
|
54
|
+
_config: &kreuzberg::ExtractionConfig,
|
|
55
|
+
) -> kreuzberg::Result<()> {
|
|
56
|
+
let processor_name = self.name.clone();
|
|
57
|
+
let processor = self.processor.value();
|
|
58
|
+
let result_clone = result.clone();
|
|
59
|
+
|
|
60
|
+
let updated_result = tokio::task::block_in_place(|| {
|
|
61
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
62
|
+
let result_hash = crate::result::extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
|
|
63
|
+
kreuzberg::KreuzbergError::Plugin {
|
|
64
|
+
message: format!("Failed to convert result to Ruby: {}", e),
|
|
65
|
+
plugin_name: processor_name.clone(),
|
|
66
|
+
}
|
|
67
|
+
})?;
|
|
68
|
+
|
|
69
|
+
let modified = processor
|
|
70
|
+
.funcall::<_, _, magnus::Value>("call", (result_hash,))
|
|
71
|
+
.map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
72
|
+
message: format!("Ruby post-processor failed: {}", e),
|
|
73
|
+
plugin_name: processor_name.clone(),
|
|
74
|
+
})?;
|
|
75
|
+
|
|
76
|
+
let modified_hash =
|
|
77
|
+
magnus::RHash::try_convert(modified).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
78
|
+
message: format!("Post-processor must return a Hash: {}", e),
|
|
79
|
+
plugin_name: processor_name.clone(),
|
|
80
|
+
})?;
|
|
81
|
+
|
|
82
|
+
let mut updated_result = result_clone;
|
|
83
|
+
|
|
84
|
+
if let Some(content_val) = get_kw(&ruby, modified_hash, "content") {
|
|
85
|
+
let new_content =
|
|
86
|
+
String::try_convert(content_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
87
|
+
message: format!("Failed to convert content: {}", e),
|
|
88
|
+
plugin_name: processor_name.clone(),
|
|
89
|
+
})?;
|
|
90
|
+
updated_result.content = new_content;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
if let Some(mime_val) = get_kw(&ruby, modified_hash, "mime_type") {
|
|
94
|
+
let new_mime = String::try_convert(mime_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
95
|
+
message: format!("Failed to convert mime_type: {}", e),
|
|
96
|
+
plugin_name: processor_name.clone(),
|
|
97
|
+
})?;
|
|
98
|
+
updated_result.mime_type = new_mime;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
|
|
102
|
+
})?;
|
|
103
|
+
|
|
104
|
+
*result = updated_result;
|
|
105
|
+
Ok(())
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
fn processing_stage(&self) -> ProcessingStage {
|
|
109
|
+
ProcessingStage::Late
|
|
110
|
+
}
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
let processor_impl = Arc::new(RubyPostProcessor {
|
|
114
|
+
name: name.clone(),
|
|
115
|
+
processor: GcGuardedValue::new(processor),
|
|
116
|
+
});
|
|
117
|
+
|
|
118
|
+
let registry = kreuzberg::get_post_processor_registry();
|
|
119
|
+
registry
|
|
120
|
+
.write()
|
|
121
|
+
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
122
|
+
.register(processor_impl, priority)
|
|
123
|
+
.map_err(kreuzberg_error)?;
|
|
124
|
+
|
|
125
|
+
Ok(())
|
|
126
|
+
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
//! Validator plugin registration and management
|
|
2
|
+
|
|
3
|
+
use crate::{error_handling::{kreuzberg_error, runtime_error}, gc_guarded_value::GcGuardedValue};
|
|
4
|
+
use magnus::{Error, Value, scan_args::scan_args, Ruby};
|
|
5
|
+
use magnus::value::ReprValue;
|
|
6
|
+
use std::sync::Arc;
|
|
7
|
+
|
|
8
|
+
/// Register a validator plugin
|
|
9
|
+
pub fn register_validator(args: &[Value]) -> Result<(), Error> {
|
|
10
|
+
let _ruby = Ruby::get().expect("Ruby not initialized");
|
|
11
|
+
let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
|
|
12
|
+
let (name, validator) = args.required;
|
|
13
|
+
let (priority,) = args.optional;
|
|
14
|
+
let priority = priority.unwrap_or(50);
|
|
15
|
+
|
|
16
|
+
if !validator.respond_to("call", true)? {
|
|
17
|
+
return Err(runtime_error("Validator must be a Proc or respond to 'call'"));
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
use async_trait::async_trait;
|
|
21
|
+
use kreuzberg::plugins::{Plugin, Validator};
|
|
22
|
+
|
|
23
|
+
struct RubyValidator {
|
|
24
|
+
name: String,
|
|
25
|
+
validator: GcGuardedValue,
|
|
26
|
+
priority: i32,
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
unsafe impl Send for RubyValidator {}
|
|
30
|
+
unsafe impl Sync for RubyValidator {}
|
|
31
|
+
|
|
32
|
+
impl Plugin for RubyValidator {
|
|
33
|
+
fn name(&self) -> &str {
|
|
34
|
+
&self.name
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
fn version(&self) -> String {
|
|
38
|
+
"1.0.0".to_string()
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
fn initialize(&self) -> kreuzberg::Result<()> {
|
|
42
|
+
Ok(())
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
46
|
+
Ok(())
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
#[async_trait]
|
|
51
|
+
impl Validator for RubyValidator {
|
|
52
|
+
async fn validate(
|
|
53
|
+
&self,
|
|
54
|
+
result: &kreuzberg::ExtractionResult,
|
|
55
|
+
_config: &kreuzberg::ExtractionConfig,
|
|
56
|
+
) -> kreuzberg::Result<()> {
|
|
57
|
+
let validator_name = self.name.clone();
|
|
58
|
+
let validator = self.validator.value();
|
|
59
|
+
let result_clone = result.clone();
|
|
60
|
+
|
|
61
|
+
tokio::task::block_in_place(|| {
|
|
62
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
63
|
+
let result_hash =
|
|
64
|
+
crate::result::extraction_result_to_ruby(&ruby, result_clone).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
65
|
+
message: format!("Failed to convert result to Ruby: {}", e),
|
|
66
|
+
plugin_name: validator_name.clone(),
|
|
67
|
+
})?;
|
|
68
|
+
|
|
69
|
+
validator
|
|
70
|
+
.funcall::<_, _, magnus::Value>("call", (result_hash,))
|
|
71
|
+
.map_err(|e| kreuzberg::KreuzbergError::Validation {
|
|
72
|
+
message: format!("Validation failed: {}", e),
|
|
73
|
+
source: None,
|
|
74
|
+
})?;
|
|
75
|
+
|
|
76
|
+
Ok(())
|
|
77
|
+
})
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
fn priority(&self) -> i32 {
|
|
81
|
+
self.priority
|
|
82
|
+
}
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
let validator_impl = Arc::new(RubyValidator {
|
|
86
|
+
name: name.clone(),
|
|
87
|
+
validator: GcGuardedValue::new(validator),
|
|
88
|
+
priority,
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
let registry = kreuzberg::get_validator_registry();
|
|
92
|
+
registry
|
|
93
|
+
.write()
|
|
94
|
+
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
95
|
+
.register(validator_impl)
|
|
96
|
+
.map_err(kreuzberg_error)?;
|
|
97
|
+
|
|
98
|
+
Ok(())
|
|
99
|
+
}
|