kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -5,3608 +5,508 @@
|
|
|
5
5
|
//! High-performance document intelligence framework bindings for Ruby.
|
|
6
6
|
//! Provides extraction, OCR, chunking, and language detection for 30+ file formats.
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
};
|
|
24
|
-
use
|
|
25
|
-
use
|
|
26
|
-
use
|
|
27
|
-
use
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
use std::fs;
|
|
31
|
-
use std::path::{Path, PathBuf};
|
|
32
|
-
|
|
33
|
-
// Re-export FFI types and functions from kreuzberg_ffi crate.
|
|
34
|
-
// This ensures proper linking by importing Rust symbols directly
|
|
35
|
-
// instead of declaring them as external C symbols.
|
|
8
|
+
// Module declarations
|
|
9
|
+
mod error_handling;
|
|
10
|
+
mod gc_guarded_value;
|
|
11
|
+
mod helpers;
|
|
12
|
+
mod config;
|
|
13
|
+
mod result;
|
|
14
|
+
mod extraction;
|
|
15
|
+
mod batch;
|
|
16
|
+
mod validation;
|
|
17
|
+
mod metadata;
|
|
18
|
+
mod plugins;
|
|
19
|
+
|
|
20
|
+
// Re-export public APIs
|
|
21
|
+
pub use error_handling::{kreuzberg_error, runtime_error, get_error_code};
|
|
22
|
+
pub use gc_guarded_value::GcGuardedValue;
|
|
23
|
+
pub use helpers::{get_kw, set_hash_entry, json_value_to_ruby, ruby_value_to_json, cache_root_dir, cache_directories};
|
|
24
|
+
pub use config::parse_extraction_config;
|
|
25
|
+
pub use result::extraction_result_to_ruby;
|
|
26
|
+
pub use extraction::{extract_file_sync, extract_bytes_sync, extract_file, extract_bytes};
|
|
27
|
+
pub use batch::{batch_extract_files_sync, batch_extract_bytes_sync, batch_extract_files, batch_extract_bytes};
|
|
28
|
+
|
|
29
|
+
// Re-export FFI
|
|
36
30
|
pub use kreuzberg_ffi::{
|
|
37
|
-
// Types
|
|
38
|
-
CErrorDetails, CMetadataField,
|
|
39
|
-
// Panic/error handling (from panic_shield module)
|
|
40
|
-
get_last_error_code, get_last_error_message, get_last_panic_context,
|
|
41
|
-
// Error functions (from error module)
|
|
42
|
-
kreuzberg_get_error_details, kreuzberg_classify_error,
|
|
43
|
-
kreuzberg_error_code_name, kreuzberg_error_code_description,
|
|
44
|
-
// Result functions (from result module)
|
|
45
|
-
kreuzberg_result_get_page_count, kreuzberg_result_get_chunk_count,
|
|
46
|
-
kreuzberg_result_get_detected_language, kreuzberg_result_get_metadata_field,
|
|
47
|
-
// Memory and util functions (from lib.rs)
|
|
48
|
-
kreuzberg_free_string, kreuzberg_last_error, kreuzberg_last_error_code,
|
|
49
|
-
kreuzberg_last_panic_context,
|
|
50
|
-
// Validation functions (from lib.rs)
|
|
51
31
|
kreuzberg_validate_binarization_method, kreuzberg_validate_ocr_backend,
|
|
52
32
|
kreuzberg_validate_language_code, kreuzberg_validate_token_reduction_level,
|
|
53
33
|
kreuzberg_validate_tesseract_psm, kreuzberg_validate_tesseract_oem,
|
|
54
|
-
kreuzberg_validate_output_format, kreuzberg_validate_confidence,
|
|
55
|
-
kreuzberg_validate_dpi, kreuzberg_validate_chunking_params,
|
|
56
|
-
kreuzberg_get_valid_binarization_methods, kreuzberg_get_valid_language_codes,
|
|
57
|
-
kreuzberg_get_valid_ocr_backends, kreuzberg_get_valid_token_reduction_levels,
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
kreuzberg_config_to_json, kreuzberg_config_get_field, kreuzberg_config_merge,
|
|
61
|
-
};
|
|
62
|
-
|
|
63
|
-
use std::ffi::c_char;
|
|
64
|
-
|
|
65
|
-
/// Keeps Ruby values alive across plugin registrations by informing the GC.
|
|
66
|
-
struct GcGuardedValue {
|
|
67
|
-
value: Value,
|
|
68
|
-
}
|
|
69
|
-
|
|
70
|
-
impl GcGuardedValue {
|
|
71
|
-
fn new(value: Value) -> Self {
|
|
72
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
73
|
-
ruby.gc_register_address(&value);
|
|
74
|
-
Self { value }
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
fn value(&self) -> Value {
|
|
78
|
-
self.value
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
impl Drop for GcGuardedValue {
|
|
83
|
-
fn drop(&mut self) {
|
|
84
|
-
if let Ok(ruby) = Ruby::get() {
|
|
85
|
-
ruby.gc_unregister_address(&self.value);
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
/// Retrieve panic context from FFI if available
|
|
91
|
-
fn get_panic_context() -> Option<String> {
|
|
92
|
-
unsafe {
|
|
93
|
-
let ctx_ptr = kreuzberg_last_panic_context();
|
|
94
|
-
if ctx_ptr.is_null() {
|
|
95
|
-
return None;
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
let c_str = std::ffi::CStr::from_ptr(ctx_ptr);
|
|
99
|
-
let context = c_str.to_string_lossy().to_string();
|
|
100
|
-
kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
|
|
101
|
-
|
|
102
|
-
if context.is_empty() { None } else { Some(context) }
|
|
103
|
-
}
|
|
104
|
-
}
|
|
105
|
-
|
|
106
|
-
/// Retrieve error code from FFI
|
|
107
|
-
fn get_error_code() -> i32 {
|
|
108
|
-
unsafe { kreuzberg_last_error_code() }
|
|
109
|
-
}
|
|
110
|
-
|
|
111
|
-
/// Convert Kreuzberg errors to Ruby exceptions
|
|
112
|
-
fn kreuzberg_error(err: KreuzbergError) -> Error {
|
|
113
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
114
|
-
|
|
115
|
-
let fetch_error_class = |name: &str| -> Option<ExceptionClass> {
|
|
116
|
-
ruby.eval::<ExceptionClass>(&format!("Kreuzberg::Errors::{}", name))
|
|
117
|
-
.ok()
|
|
118
|
-
};
|
|
119
|
-
|
|
120
|
-
match err {
|
|
121
|
-
KreuzbergError::Validation { message, .. } => {
|
|
122
|
-
if let Some(class) = fetch_error_class("ValidationError") {
|
|
123
|
-
Error::new(class, message)
|
|
124
|
-
} else {
|
|
125
|
-
Error::new(ruby.exception_arg_error(), message)
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
KreuzbergError::Parsing { message, .. } => {
|
|
129
|
-
if let Some(class) = fetch_error_class("ParsingError") {
|
|
130
|
-
Error::new(class, message)
|
|
131
|
-
} else {
|
|
132
|
-
Error::new(ruby.exception_runtime_error(), format!("ParsingError: {}", message))
|
|
133
|
-
}
|
|
134
|
-
}
|
|
135
|
-
KreuzbergError::Ocr { message, .. } => {
|
|
136
|
-
if let Some(class) = fetch_error_class("OCRError") {
|
|
137
|
-
Error::new(class, message)
|
|
138
|
-
} else {
|
|
139
|
-
Error::new(ruby.exception_runtime_error(), format!("OCRError: {}", message))
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
KreuzbergError::MissingDependency(message) => {
|
|
143
|
-
if let Some(class) = fetch_error_class("MissingDependencyError") {
|
|
144
|
-
Error::new(class, message)
|
|
145
|
-
} else {
|
|
146
|
-
Error::new(
|
|
147
|
-
ruby.exception_runtime_error(),
|
|
148
|
-
format!("MissingDependencyError: {}", message),
|
|
149
|
-
)
|
|
150
|
-
}
|
|
151
|
-
}
|
|
152
|
-
KreuzbergError::Plugin { message, plugin_name } => {
|
|
153
|
-
if let Some(class) = fetch_error_class("PluginError") {
|
|
154
|
-
Error::new(class, format!("{}: {}", plugin_name, message))
|
|
155
|
-
} else {
|
|
156
|
-
Error::new(
|
|
157
|
-
ruby.exception_runtime_error(),
|
|
158
|
-
format!("Plugin error in '{}': {}", plugin_name, message),
|
|
159
|
-
)
|
|
160
|
-
}
|
|
161
|
-
}
|
|
162
|
-
KreuzbergError::Io(err) => {
|
|
163
|
-
if let Some(class) = fetch_error_class("IOError") {
|
|
164
|
-
Error::new(class, err.to_string())
|
|
165
|
-
} else {
|
|
166
|
-
Error::new(ruby.exception_runtime_error(), format!("IO error: {}", err))
|
|
167
|
-
}
|
|
168
|
-
}
|
|
169
|
-
KreuzbergError::UnsupportedFormat(message) => {
|
|
170
|
-
if let Some(class) = fetch_error_class("UnsupportedFormatError") {
|
|
171
|
-
Error::new(class, message)
|
|
172
|
-
} else {
|
|
173
|
-
Error::new(
|
|
174
|
-
ruby.exception_runtime_error(),
|
|
175
|
-
format!("UnsupportedFormatError: {}", message),
|
|
176
|
-
)
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
other => Error::new(ruby.exception_runtime_error(), other.to_string()),
|
|
180
|
-
}
|
|
181
|
-
}
|
|
182
|
-
|
|
183
|
-
fn runtime_error(message: impl Into<String>) -> Error {
|
|
184
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
185
|
-
Error::new(ruby.exception_runtime_error(), message.into())
|
|
186
|
-
}
|
|
187
|
-
|
|
188
|
-
/// Convert Ruby Symbol or String to Rust String
|
|
189
|
-
fn symbol_to_string(value: Value) -> Result<String, Error> {
|
|
190
|
-
if let Some(symbol) = Symbol::from_value(value) {
|
|
191
|
-
Ok(symbol.name()?.to_string())
|
|
192
|
-
} else {
|
|
193
|
-
String::try_convert(value)
|
|
194
|
-
}
|
|
195
|
-
}
|
|
196
|
-
|
|
197
|
-
/// Get keyword argument from hash (supports both symbol and string keys)
|
|
198
|
-
fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
|
|
199
|
-
hash.get(name).or_else(|| {
|
|
200
|
-
let sym = ruby.intern(name);
|
|
201
|
-
hash.get(sym)
|
|
202
|
-
})
|
|
203
|
-
}
|
|
204
|
-
|
|
205
|
-
fn set_hash_entry(_ruby: &Ruby, hash: &RHash, key: &str, value: Value) -> Result<(), Error> {
|
|
206
|
-
hash.aset(key, value)?;
|
|
207
|
-
Ok(())
|
|
208
|
-
}
|
|
209
|
-
|
|
210
|
-
fn ocr_config_to_ruby_hash(ruby: &Ruby, config: &kreuzberg::OcrConfig) -> Result<RHash, Error> {
|
|
211
|
-
let value =
|
|
212
|
-
serde_json::to_value(config).map_err(|e| runtime_error(format!("Failed to serialize OCR config: {}", e)))?;
|
|
213
|
-
let ruby_value = json_value_to_ruby(ruby, &value)?;
|
|
214
|
-
RHash::try_convert(ruby_value).map_err(|_| runtime_error("OCR config must return a Hash"))
|
|
215
|
-
}
|
|
216
|
-
|
|
217
|
-
fn cache_root_dir() -> Result<PathBuf, Error> {
|
|
218
|
-
std::env::current_dir()
|
|
219
|
-
.map(|dir| dir.join(".kreuzberg"))
|
|
220
|
-
.map_err(|e| runtime_error(format!("Failed to get current directory: {}", e)))
|
|
221
|
-
}
|
|
222
|
-
|
|
223
|
-
fn cache_directories(root: &Path) -> Result<Vec<PathBuf>, Error> {
|
|
224
|
-
if !root.exists() {
|
|
225
|
-
return Ok(vec![]);
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
let mut dirs = vec![root.to_path_buf()];
|
|
229
|
-
let entries = fs::read_dir(root).map_err(|e| runtime_error(format!("Failed to read cache root: {}", e)))?;
|
|
230
|
-
|
|
231
|
-
for entry in entries {
|
|
232
|
-
let entry = entry.map_err(|e| runtime_error(format!("Failed to read cache directory entry: {}", e)))?;
|
|
233
|
-
if entry
|
|
234
|
-
.file_type()
|
|
235
|
-
.map_err(|e| runtime_error(format!("Failed to determine cache entry type: {}", e)))?
|
|
236
|
-
.is_dir()
|
|
237
|
-
{
|
|
238
|
-
dirs.push(entry.path());
|
|
239
|
-
}
|
|
240
|
-
}
|
|
241
|
-
|
|
242
|
-
Ok(dirs)
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
fn json_value_to_ruby(ruby: &Ruby, value: &serde_json::Value) -> Result<Value, Error> {
|
|
246
|
-
Ok(match value {
|
|
247
|
-
serde_json::Value::Null => ruby.qnil().as_value(),
|
|
248
|
-
serde_json::Value::Bool(b) => {
|
|
249
|
-
if *b {
|
|
250
|
-
ruby.qtrue().as_value()
|
|
251
|
-
} else {
|
|
252
|
-
ruby.qfalse().as_value()
|
|
253
|
-
}
|
|
254
|
-
}
|
|
255
|
-
serde_json::Value::Number(num) => {
|
|
256
|
-
if let Some(i) = num.as_i64() {
|
|
257
|
-
ruby.integer_from_i64(i).into_value_with(ruby)
|
|
258
|
-
} else if let Some(u) = num.as_u64() {
|
|
259
|
-
ruby.integer_from_u64(u).into_value_with(ruby)
|
|
260
|
-
} else if let Some(f) = num.as_f64() {
|
|
261
|
-
ruby.float_from_f64(f).into_value_with(ruby)
|
|
262
|
-
} else {
|
|
263
|
-
ruby.qnil().as_value()
|
|
264
|
-
}
|
|
265
|
-
}
|
|
266
|
-
serde_json::Value::String(s) => ruby.str_new(s).into_value_with(ruby),
|
|
267
|
-
serde_json::Value::Array(items) => {
|
|
268
|
-
let ary = ruby.ary_new();
|
|
269
|
-
for item in items {
|
|
270
|
-
ary.push(json_value_to_ruby(ruby, item)?)?;
|
|
271
|
-
}
|
|
272
|
-
ary.into_value_with(ruby)
|
|
273
|
-
}
|
|
274
|
-
serde_json::Value::Object(map) => {
|
|
275
|
-
let hash = ruby.hash_new();
|
|
276
|
-
for (key, val) in map {
|
|
277
|
-
let key_value = ruby.str_new(key).into_value_with(ruby);
|
|
278
|
-
let val_value = json_value_to_ruby(ruby, val)?;
|
|
279
|
-
hash.aset(key_value, val_value)?;
|
|
280
|
-
}
|
|
281
|
-
hash.into_value_with(ruby)
|
|
282
|
-
}
|
|
283
|
-
})
|
|
284
|
-
}
|
|
285
|
-
|
|
286
|
-
fn ruby_key_to_string(value: Value) -> Result<String, Error> {
|
|
287
|
-
if let Ok(sym) = Symbol::try_convert(value) {
|
|
288
|
-
Ok(sym.name()?.to_string())
|
|
289
|
-
} else {
|
|
290
|
-
String::try_convert(value)
|
|
291
|
-
}
|
|
292
|
-
}
|
|
293
|
-
|
|
294
|
-
fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
|
|
295
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
296
|
-
|
|
297
|
-
if value.is_nil() {
|
|
298
|
-
return Ok(serde_json::Value::Null);
|
|
299
|
-
}
|
|
300
|
-
|
|
301
|
-
if value.equal(ruby.qtrue())? {
|
|
302
|
-
return Ok(serde_json::Value::Bool(true));
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
if value.equal(ruby.qfalse())? {
|
|
306
|
-
return Ok(serde_json::Value::Bool(false));
|
|
307
|
-
}
|
|
308
|
-
|
|
309
|
-
if let Ok(integer) = i64::try_convert(value) {
|
|
310
|
-
return Ok(serde_json::Value::Number(integer.into()));
|
|
311
|
-
}
|
|
312
|
-
|
|
313
|
-
if let Ok(unsigned) = u64::try_convert(value) {
|
|
314
|
-
return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
|
|
315
|
-
}
|
|
316
|
-
|
|
317
|
-
if let Ok(float) = f64::try_convert(value)
|
|
318
|
-
&& let Some(num) = serde_json::Number::from_f64(float)
|
|
319
|
-
{
|
|
320
|
-
return Ok(serde_json::Value::Number(num));
|
|
321
|
-
}
|
|
322
|
-
|
|
323
|
-
if let Ok(sym) = Symbol::try_convert(value) {
|
|
324
|
-
return Ok(serde_json::Value::String(sym.name()?.to_string()));
|
|
325
|
-
}
|
|
326
|
-
|
|
327
|
-
if let Ok(string) = String::try_convert(value) {
|
|
328
|
-
return Ok(serde_json::Value::String(string));
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
if let Ok(array) = RArray::try_convert(value) {
|
|
332
|
-
let mut values = Vec::with_capacity(array.len());
|
|
333
|
-
for item in array.into_iter() {
|
|
334
|
-
values.push(ruby_value_to_json(item)?);
|
|
335
|
-
}
|
|
336
|
-
return Ok(serde_json::Value::Array(values));
|
|
337
|
-
}
|
|
338
|
-
|
|
339
|
-
if let Ok(hash) = RHash::try_convert(value) {
|
|
340
|
-
let mut map = serde_json::Map::new();
|
|
341
|
-
hash.foreach(|key: Value, val: Value| {
|
|
342
|
-
let key_string = ruby_key_to_string(key)?;
|
|
343
|
-
let json_value = ruby_value_to_json(val)?;
|
|
344
|
-
map.insert(key_string, json_value);
|
|
345
|
-
Ok(ForEach::Continue)
|
|
346
|
-
})?;
|
|
347
|
-
|
|
348
|
-
return Ok(serde_json::Value::Object(map));
|
|
349
|
-
}
|
|
350
|
-
|
|
351
|
-
Err(runtime_error("Unsupported Ruby value for JSON conversion"))
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
/// Parse OcrConfig from Ruby Hash
|
|
355
|
-
fn parse_ocr_config(ruby: &Ruby, hash: RHash) -> Result<OcrConfig, Error> {
|
|
356
|
-
let backend = if let Some(val) = get_kw(ruby, hash, "backend") {
|
|
357
|
-
symbol_to_string(val)?
|
|
358
|
-
} else {
|
|
359
|
-
"tesseract".to_string()
|
|
360
|
-
};
|
|
361
|
-
|
|
362
|
-
let language = if let Some(val) = get_kw(ruby, hash, "language") {
|
|
363
|
-
symbol_to_string(val)?
|
|
364
|
-
} else {
|
|
365
|
-
"eng".to_string()
|
|
366
|
-
};
|
|
367
|
-
|
|
368
|
-
let mut config = OcrConfig {
|
|
369
|
-
backend,
|
|
370
|
-
language,
|
|
371
|
-
tesseract_config: None,
|
|
372
|
-
};
|
|
373
|
-
|
|
374
|
-
if let Some(val) = get_kw(ruby, hash, "tesseract_config")
|
|
375
|
-
&& !val.is_nil()
|
|
376
|
-
{
|
|
377
|
-
let tc_json = ruby_value_to_json(val)?;
|
|
378
|
-
let parsed: RustTesseractConfig =
|
|
379
|
-
serde_json::from_value(tc_json).map_err(|e| runtime_error(format!("Invalid tesseract_config: {}", e)))?;
|
|
380
|
-
config.tesseract_config = Some(parsed);
|
|
381
|
-
}
|
|
382
|
-
|
|
383
|
-
Ok(config)
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
/// Parse ChunkingConfig from Ruby Hash
|
|
387
|
-
fn parse_chunking_config(ruby: &Ruby, hash: RHash) -> Result<ChunkingConfig, Error> {
|
|
388
|
-
let max_chars = if let Some(val) = get_kw(ruby, hash, "max_chars") {
|
|
389
|
-
usize::try_convert(val)?
|
|
390
|
-
} else {
|
|
391
|
-
1000
|
|
392
|
-
};
|
|
393
|
-
|
|
394
|
-
let max_overlap = if let Some(val) = get_kw(ruby, hash, "max_overlap") {
|
|
395
|
-
usize::try_convert(val)?
|
|
396
|
-
} else {
|
|
397
|
-
200
|
|
398
|
-
};
|
|
399
|
-
|
|
400
|
-
let preset = if let Some(val) = get_kw(ruby, hash, "preset")
|
|
401
|
-
&& !val.is_nil()
|
|
402
|
-
{
|
|
403
|
-
Some(symbol_to_string(val)?)
|
|
404
|
-
} else {
|
|
405
|
-
None
|
|
406
|
-
};
|
|
407
|
-
|
|
408
|
-
let embedding = if let Some(val) = get_kw(ruby, hash, "embedding")
|
|
409
|
-
&& !val.is_nil()
|
|
410
|
-
{
|
|
411
|
-
let json_value = ruby_value_to_json(val)?;
|
|
412
|
-
let parsed: EmbeddingConfig = serde_json::from_value(json_value)
|
|
413
|
-
.map_err(|e| runtime_error(format!("Invalid chunking.embedding: {}", e)))?;
|
|
414
|
-
Some(parsed)
|
|
415
|
-
} else {
|
|
416
|
-
None
|
|
417
|
-
};
|
|
418
|
-
|
|
419
|
-
let config = ChunkingConfig {
|
|
420
|
-
max_chars,
|
|
421
|
-
max_overlap,
|
|
422
|
-
embedding,
|
|
423
|
-
preset,
|
|
424
|
-
};
|
|
425
|
-
|
|
426
|
-
Ok(config)
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
/// Parse LanguageDetectionConfig from Ruby Hash
|
|
430
|
-
fn parse_language_detection_config(ruby: &Ruby, hash: RHash) -> Result<LanguageDetectionConfig, Error> {
|
|
431
|
-
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
432
|
-
bool::try_convert(val)?
|
|
433
|
-
} else {
|
|
434
|
-
true
|
|
435
|
-
};
|
|
436
|
-
|
|
437
|
-
let min_confidence = if let Some(val) = get_kw(ruby, hash, "min_confidence") {
|
|
438
|
-
f64::try_convert(val)?
|
|
439
|
-
} else {
|
|
440
|
-
0.8
|
|
441
|
-
};
|
|
442
|
-
|
|
443
|
-
let detect_multiple = if let Some(val) = get_kw(ruby, hash, "detect_multiple") {
|
|
444
|
-
bool::try_convert(val)?
|
|
445
|
-
} else {
|
|
446
|
-
false
|
|
447
|
-
};
|
|
448
|
-
|
|
449
|
-
let config = LanguageDetectionConfig {
|
|
450
|
-
enabled,
|
|
451
|
-
min_confidence,
|
|
452
|
-
detect_multiple,
|
|
453
|
-
};
|
|
454
|
-
|
|
455
|
-
Ok(config)
|
|
456
|
-
}
|
|
457
|
-
|
|
458
|
-
/// Parse HierarchyConfig from Ruby Hash
|
|
459
|
-
fn parse_hierarchy_config(ruby: &Ruby, hash: RHash) -> Result<HierarchyConfig, Error> {
|
|
460
|
-
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
461
|
-
bool::try_convert(val)?
|
|
462
|
-
} else {
|
|
463
|
-
true
|
|
464
|
-
};
|
|
465
|
-
|
|
466
|
-
let k_clusters = if let Some(val) = get_kw(ruby, hash, "k_clusters") {
|
|
467
|
-
usize::try_convert(val)?
|
|
468
|
-
} else {
|
|
469
|
-
6
|
|
470
|
-
};
|
|
471
|
-
|
|
472
|
-
let include_bbox = if let Some(val) = get_kw(ruby, hash, "include_bbox") {
|
|
473
|
-
bool::try_convert(val)?
|
|
474
|
-
} else {
|
|
475
|
-
true
|
|
476
|
-
};
|
|
477
|
-
|
|
478
|
-
let ocr_coverage_threshold = if let Some(val) = get_kw(ruby, hash, "ocr_coverage_threshold") {
|
|
479
|
-
if !val.is_nil() {
|
|
480
|
-
Some(f64::try_convert(val)? as f32)
|
|
481
|
-
} else {
|
|
482
|
-
None
|
|
483
|
-
}
|
|
484
|
-
} else {
|
|
485
|
-
None
|
|
486
|
-
};
|
|
487
|
-
|
|
488
|
-
let config = HierarchyConfig {
|
|
489
|
-
enabled,
|
|
490
|
-
k_clusters,
|
|
491
|
-
include_bbox,
|
|
492
|
-
ocr_coverage_threshold,
|
|
493
|
-
};
|
|
494
|
-
|
|
495
|
-
Ok(config)
|
|
496
|
-
}
|
|
497
|
-
|
|
498
|
-
/// Parse PdfConfig from Ruby Hash
|
|
499
|
-
fn parse_pdf_config(ruby: &Ruby, hash: RHash) -> Result<PdfConfig, Error> {
|
|
500
|
-
let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
|
|
501
|
-
bool::try_convert(val)?
|
|
502
|
-
} else {
|
|
503
|
-
false
|
|
504
|
-
};
|
|
505
|
-
|
|
506
|
-
let passwords = if let Some(val) = get_kw(ruby, hash, "passwords") {
|
|
507
|
-
if !val.is_nil() {
|
|
508
|
-
let arr = RArray::try_convert(val)?;
|
|
509
|
-
Some(arr.to_vec::<String>()?)
|
|
510
|
-
} else {
|
|
511
|
-
None
|
|
512
|
-
}
|
|
513
|
-
} else {
|
|
514
|
-
None
|
|
515
|
-
};
|
|
516
|
-
|
|
517
|
-
let extract_metadata = if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
|
|
518
|
-
bool::try_convert(val)?
|
|
519
|
-
} else {
|
|
520
|
-
true
|
|
521
|
-
};
|
|
522
|
-
|
|
523
|
-
let hierarchy = if let Some(val) = get_kw(ruby, hash, "hierarchy") {
|
|
524
|
-
if !val.is_nil() {
|
|
525
|
-
let h_hash = RHash::try_convert(val)?;
|
|
526
|
-
Some(parse_hierarchy_config(ruby, h_hash)?)
|
|
527
|
-
} else {
|
|
528
|
-
None
|
|
529
|
-
}
|
|
530
|
-
} else {
|
|
531
|
-
None
|
|
532
|
-
};
|
|
533
|
-
|
|
534
|
-
let config = PdfConfig {
|
|
535
|
-
extract_images,
|
|
536
|
-
passwords,
|
|
537
|
-
extract_metadata,
|
|
538
|
-
hierarchy,
|
|
539
|
-
};
|
|
540
|
-
|
|
541
|
-
Ok(config)
|
|
542
|
-
}
|
|
543
|
-
|
|
544
|
-
/// Parse ImageExtractionConfig from Ruby Hash
|
|
545
|
-
fn parse_image_extraction_config(ruby: &Ruby, hash: RHash) -> Result<ImageExtractionConfig, Error> {
|
|
546
|
-
let extract_images = if let Some(val) = get_kw(ruby, hash, "extract_images") {
|
|
547
|
-
bool::try_convert(val)?
|
|
548
|
-
} else {
|
|
549
|
-
true
|
|
550
|
-
};
|
|
551
|
-
|
|
552
|
-
let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
|
|
553
|
-
i32::try_convert(val)?
|
|
554
|
-
} else {
|
|
555
|
-
300
|
|
556
|
-
};
|
|
557
|
-
|
|
558
|
-
let max_image_dimension = if let Some(val) = get_kw(ruby, hash, "max_image_dimension") {
|
|
559
|
-
i32::try_convert(val)?
|
|
560
|
-
} else {
|
|
561
|
-
4096
|
|
562
|
-
};
|
|
563
|
-
|
|
564
|
-
let auto_adjust_dpi = if let Some(val) = get_kw(ruby, hash, "auto_adjust_dpi") {
|
|
565
|
-
bool::try_convert(val)?
|
|
566
|
-
} else {
|
|
567
|
-
true
|
|
568
|
-
};
|
|
569
|
-
|
|
570
|
-
let min_dpi = if let Some(val) = get_kw(ruby, hash, "min_dpi") {
|
|
571
|
-
i32::try_convert(val)?
|
|
572
|
-
} else {
|
|
573
|
-
72
|
|
574
|
-
};
|
|
575
|
-
|
|
576
|
-
let max_dpi = if let Some(val) = get_kw(ruby, hash, "max_dpi") {
|
|
577
|
-
i32::try_convert(val)?
|
|
578
|
-
} else {
|
|
579
|
-
600
|
|
580
|
-
};
|
|
581
|
-
|
|
582
|
-
let config = ImageExtractionConfig {
|
|
583
|
-
extract_images,
|
|
584
|
-
target_dpi,
|
|
585
|
-
max_image_dimension,
|
|
586
|
-
auto_adjust_dpi,
|
|
587
|
-
min_dpi,
|
|
588
|
-
max_dpi,
|
|
589
|
-
};
|
|
590
|
-
|
|
591
|
-
Ok(config)
|
|
592
|
-
}
|
|
593
|
-
|
|
594
|
-
/// Parse ImagePreprocessingConfig from Ruby Hash
|
|
595
|
-
///
|
|
596
|
-
/// Note: Currently not used in ExtractionConfig but provided for completeness.
|
|
597
|
-
/// ImagePreprocessingConfig is typically used in OCR operations.
|
|
598
|
-
#[allow(dead_code)]
|
|
599
|
-
fn parse_image_preprocessing_config(ruby: &Ruby, hash: RHash) -> Result<ImagePreprocessingConfig, Error> {
|
|
600
|
-
let target_dpi = if let Some(val) = get_kw(ruby, hash, "target_dpi") {
|
|
601
|
-
i32::try_convert(val)?
|
|
602
|
-
} else {
|
|
603
|
-
300
|
|
604
|
-
};
|
|
605
|
-
|
|
606
|
-
let auto_rotate = if let Some(val) = get_kw(ruby, hash, "auto_rotate") {
|
|
607
|
-
bool::try_convert(val)?
|
|
608
|
-
} else {
|
|
609
|
-
true
|
|
610
|
-
};
|
|
611
|
-
|
|
612
|
-
let deskew = if let Some(val) = get_kw(ruby, hash, "deskew") {
|
|
613
|
-
bool::try_convert(val)?
|
|
614
|
-
} else {
|
|
615
|
-
true
|
|
616
|
-
};
|
|
617
|
-
|
|
618
|
-
let denoise = if let Some(val) = get_kw(ruby, hash, "denoise") {
|
|
619
|
-
bool::try_convert(val)?
|
|
620
|
-
} else {
|
|
621
|
-
false
|
|
622
|
-
};
|
|
623
|
-
|
|
624
|
-
let contrast_enhance = if let Some(val) = get_kw(ruby, hash, "contrast_enhance") {
|
|
625
|
-
bool::try_convert(val)?
|
|
626
|
-
} else {
|
|
627
|
-
false
|
|
628
|
-
};
|
|
629
|
-
|
|
630
|
-
let binarization_method = if let Some(val) = get_kw(ruby, hash, "binarization_method") {
|
|
631
|
-
symbol_to_string(val)?
|
|
632
|
-
} else {
|
|
633
|
-
"otsu".to_string()
|
|
634
|
-
};
|
|
635
|
-
|
|
636
|
-
let invert_colors = if let Some(val) = get_kw(ruby, hash, "invert_colors") {
|
|
637
|
-
bool::try_convert(val)?
|
|
638
|
-
} else {
|
|
639
|
-
false
|
|
640
|
-
};
|
|
641
|
-
|
|
642
|
-
let config = ImagePreprocessingConfig {
|
|
643
|
-
target_dpi,
|
|
644
|
-
auto_rotate,
|
|
645
|
-
deskew,
|
|
646
|
-
denoise,
|
|
647
|
-
contrast_enhance,
|
|
648
|
-
binarization_method,
|
|
649
|
-
invert_colors,
|
|
650
|
-
};
|
|
651
|
-
|
|
652
|
-
Ok(config)
|
|
653
|
-
}
|
|
654
|
-
|
|
655
|
-
/// Parse PostProcessorConfig from Ruby Hash
|
|
656
|
-
fn parse_postprocessor_config(ruby: &Ruby, hash: RHash) -> Result<PostProcessorConfig, Error> {
|
|
657
|
-
let enabled = if let Some(val) = get_kw(ruby, hash, "enabled") {
|
|
658
|
-
bool::try_convert(val)?
|
|
659
|
-
} else {
|
|
660
|
-
true
|
|
661
|
-
};
|
|
662
|
-
|
|
663
|
-
let enabled_processors = if let Some(val) = get_kw(ruby, hash, "enabled_processors")
|
|
664
|
-
&& !val.is_nil()
|
|
665
|
-
{
|
|
666
|
-
let arr = RArray::try_convert(val)?;
|
|
667
|
-
Some(arr.to_vec::<String>()?)
|
|
668
|
-
} else {
|
|
669
|
-
None
|
|
670
|
-
};
|
|
671
|
-
|
|
672
|
-
let disabled_processors = if let Some(val) = get_kw(ruby, hash, "disabled_processors")
|
|
673
|
-
&& !val.is_nil()
|
|
674
|
-
{
|
|
675
|
-
let arr = RArray::try_convert(val)?;
|
|
676
|
-
Some(arr.to_vec::<String>()?)
|
|
677
|
-
} else {
|
|
678
|
-
None
|
|
679
|
-
};
|
|
680
|
-
|
|
681
|
-
let config = PostProcessorConfig {
|
|
682
|
-
enabled,
|
|
683
|
-
enabled_processors,
|
|
684
|
-
disabled_processors,
|
|
685
|
-
enabled_set: None,
|
|
686
|
-
disabled_set: None,
|
|
687
|
-
};
|
|
688
|
-
|
|
689
|
-
Ok(config)
|
|
690
|
-
}
|
|
691
|
-
|
|
692
|
-
/// Parse TokenReductionConfig from Ruby Hash
|
|
693
|
-
fn parse_token_reduction_config(ruby: &Ruby, hash: RHash) -> Result<TokenReductionConfig, Error> {
|
|
694
|
-
let mode = if let Some(val) = get_kw(ruby, hash, "mode") {
|
|
695
|
-
symbol_to_string(val)?
|
|
696
|
-
} else {
|
|
697
|
-
"off".to_string()
|
|
698
|
-
};
|
|
699
|
-
|
|
700
|
-
let preserve_important_words = if let Some(val) = get_kw(ruby, hash, "preserve_important_words") {
|
|
701
|
-
bool::try_convert(val)?
|
|
702
|
-
} else {
|
|
703
|
-
true
|
|
704
|
-
};
|
|
705
|
-
|
|
706
|
-
let config = TokenReductionConfig {
|
|
707
|
-
mode,
|
|
708
|
-
preserve_important_words,
|
|
709
|
-
};
|
|
710
|
-
|
|
711
|
-
Ok(config)
|
|
712
|
-
}
|
|
713
|
-
|
|
714
|
-
fn parse_keyword_config(ruby: &Ruby, hash: RHash) -> Result<RustKeywordConfig, Error> {
|
|
715
|
-
let mut config = RustKeywordConfig::default();
|
|
716
|
-
|
|
717
|
-
if let Some(val) = get_kw(ruby, hash, "algorithm") {
|
|
718
|
-
let algo = symbol_to_string(val)?;
|
|
719
|
-
config.algorithm = match algo.to_lowercase().as_str() {
|
|
720
|
-
"yake" => RustKeywordAlgorithm::Yake,
|
|
721
|
-
"rake" => RustKeywordAlgorithm::Rake,
|
|
722
|
-
other => {
|
|
723
|
-
return Err(runtime_error(format!(
|
|
724
|
-
"Invalid keywords.algorithm '{}', expected 'yake' or 'rake'",
|
|
725
|
-
other
|
|
726
|
-
)));
|
|
727
|
-
}
|
|
728
|
-
};
|
|
729
|
-
}
|
|
730
|
-
|
|
731
|
-
if let Some(val) = get_kw(ruby, hash, "max_keywords") {
|
|
732
|
-
config.max_keywords = usize::try_convert(val)?;
|
|
733
|
-
}
|
|
734
|
-
|
|
735
|
-
if let Some(val) = get_kw(ruby, hash, "min_score") {
|
|
736
|
-
config.min_score = f64::try_convert(val)? as f32;
|
|
737
|
-
}
|
|
738
|
-
|
|
739
|
-
if let Some(val) = get_kw(ruby, hash, "ngram_range") {
|
|
740
|
-
let ary = RArray::try_convert(val)?;
|
|
741
|
-
if ary.len() == 2 {
|
|
742
|
-
let values = ary.to_vec::<i64>()?;
|
|
743
|
-
config.ngram_range = (values[0] as usize, values[1] as usize);
|
|
744
|
-
} else {
|
|
745
|
-
return Err(runtime_error("keywords.ngram_range must have exactly two values"));
|
|
746
|
-
}
|
|
747
|
-
}
|
|
748
|
-
|
|
749
|
-
if let Some(val) = get_kw(ruby, hash, "language")
|
|
750
|
-
&& !val.is_nil()
|
|
751
|
-
{
|
|
752
|
-
config.language = Some(symbol_to_string(val)?);
|
|
753
|
-
}
|
|
754
|
-
|
|
755
|
-
if let Some(val) = get_kw(ruby, hash, "yake_params")
|
|
756
|
-
&& !val.is_nil()
|
|
757
|
-
{
|
|
758
|
-
let yake_hash = RHash::try_convert(val)?;
|
|
759
|
-
let window = if let Some(window_val) = get_kw(ruby, yake_hash, "window_size") {
|
|
760
|
-
usize::try_convert(window_val)?
|
|
761
|
-
} else {
|
|
762
|
-
2
|
|
763
|
-
};
|
|
764
|
-
config.yake_params = Some(RustYakeParams { window_size: window });
|
|
765
|
-
}
|
|
766
|
-
|
|
767
|
-
if let Some(val) = get_kw(ruby, hash, "rake_params")
|
|
768
|
-
&& !val.is_nil()
|
|
769
|
-
{
|
|
770
|
-
let rake_hash = RHash::try_convert(val)?;
|
|
771
|
-
let mut params = RustRakeParams::default();
|
|
772
|
-
if let Some(val) = get_kw(ruby, rake_hash, "min_word_length") {
|
|
773
|
-
params.min_word_length = usize::try_convert(val)?;
|
|
774
|
-
}
|
|
775
|
-
if let Some(val) = get_kw(ruby, rake_hash, "max_words_per_phrase") {
|
|
776
|
-
params.max_words_per_phrase = usize::try_convert(val)?;
|
|
777
|
-
}
|
|
778
|
-
config.rake_params = Some(params);
|
|
779
|
-
}
|
|
780
|
-
|
|
781
|
-
Ok(config)
|
|
782
|
-
}
|
|
783
|
-
|
|
784
|
-
fn parse_html_options(ruby: &Ruby, hash: RHash) -> Result<ConversionOptions, Error> {
|
|
785
|
-
let mut options = ConversionOptions::default();
|
|
786
|
-
|
|
787
|
-
if let Some(val) = get_kw(ruby, hash, "heading_style") {
|
|
788
|
-
let style = symbol_to_string(val)?;
|
|
789
|
-
options.heading_style = match style.to_lowercase().as_str() {
|
|
790
|
-
"atx" => HeadingStyle::Atx,
|
|
791
|
-
"underlined" => HeadingStyle::Underlined,
|
|
792
|
-
"atx_closed" | "atx-closed" => HeadingStyle::AtxClosed,
|
|
793
|
-
other => return Err(runtime_error(format!("Invalid html_options.heading_style '{}'", other))),
|
|
794
|
-
};
|
|
795
|
-
}
|
|
796
|
-
|
|
797
|
-
if let Some(val) = get_kw(ruby, hash, "list_indent_type") {
|
|
798
|
-
let val_str = symbol_to_string(val)?;
|
|
799
|
-
options.list_indent_type = match val_str.to_lowercase().as_str() {
|
|
800
|
-
"spaces" => ListIndentType::Spaces,
|
|
801
|
-
"tabs" => ListIndentType::Tabs,
|
|
802
|
-
other => {
|
|
803
|
-
return Err(runtime_error(format!(
|
|
804
|
-
"Invalid html_options.list_indent_type '{}'",
|
|
805
|
-
other
|
|
806
|
-
)));
|
|
807
|
-
}
|
|
808
|
-
};
|
|
809
|
-
}
|
|
810
|
-
|
|
811
|
-
if let Some(val) = get_kw(ruby, hash, "list_indent_width") {
|
|
812
|
-
options.list_indent_width = usize::try_convert(val)?;
|
|
813
|
-
}
|
|
814
|
-
|
|
815
|
-
if let Some(val) = get_kw(ruby, hash, "bullets") {
|
|
816
|
-
options.bullets = String::try_convert(val)?;
|
|
817
|
-
}
|
|
818
|
-
|
|
819
|
-
if let Some(val) = get_kw(ruby, hash, "strong_em_symbol") {
|
|
820
|
-
let symbol = String::try_convert(val)?;
|
|
821
|
-
let mut chars = symbol.chars();
|
|
822
|
-
options.strong_em_symbol = chars
|
|
823
|
-
.next()
|
|
824
|
-
.ok_or_else(|| runtime_error("html_options.strong_em_symbol must not be empty"))?;
|
|
825
|
-
}
|
|
826
|
-
|
|
827
|
-
if let Some(val) = get_kw(ruby, hash, "escape_asterisks") {
|
|
828
|
-
options.escape_asterisks = bool::try_convert(val)?;
|
|
829
|
-
}
|
|
830
|
-
if let Some(val) = get_kw(ruby, hash, "escape_underscores") {
|
|
831
|
-
options.escape_underscores = bool::try_convert(val)?;
|
|
832
|
-
}
|
|
833
|
-
if let Some(val) = get_kw(ruby, hash, "escape_misc") {
|
|
834
|
-
options.escape_misc = bool::try_convert(val)?;
|
|
835
|
-
}
|
|
836
|
-
if let Some(val) = get_kw(ruby, hash, "escape_ascii") {
|
|
837
|
-
options.escape_ascii = bool::try_convert(val)?;
|
|
838
|
-
}
|
|
839
|
-
|
|
840
|
-
if let Some(val) = get_kw(ruby, hash, "code_language") {
|
|
841
|
-
options.code_language = String::try_convert(val)?;
|
|
842
|
-
}
|
|
843
|
-
|
|
844
|
-
if let Some(val) = get_kw(ruby, hash, "autolinks") {
|
|
845
|
-
options.autolinks = bool::try_convert(val)?;
|
|
846
|
-
}
|
|
847
|
-
|
|
848
|
-
if let Some(val) = get_kw(ruby, hash, "default_title") {
|
|
849
|
-
options.default_title = bool::try_convert(val)?;
|
|
850
|
-
}
|
|
851
|
-
|
|
852
|
-
if let Some(val) = get_kw(ruby, hash, "br_in_tables") {
|
|
853
|
-
options.br_in_tables = bool::try_convert(val)?;
|
|
854
|
-
}
|
|
855
|
-
|
|
856
|
-
if let Some(val) = get_kw(ruby, hash, "hocr_spatial_tables") {
|
|
857
|
-
options.hocr_spatial_tables = bool::try_convert(val)?;
|
|
858
|
-
}
|
|
859
|
-
|
|
860
|
-
if let Some(val) = get_kw(ruby, hash, "highlight_style") {
|
|
861
|
-
let style = symbol_to_string(val)?;
|
|
862
|
-
options.highlight_style = match style.to_lowercase().as_str() {
|
|
863
|
-
"double_equal" | "double-equal" => HighlightStyle::DoubleEqual,
|
|
864
|
-
"html" => HighlightStyle::Html,
|
|
865
|
-
"bold" => HighlightStyle::Bold,
|
|
866
|
-
"none" => HighlightStyle::None,
|
|
867
|
-
other => {
|
|
868
|
-
return Err(runtime_error(format!(
|
|
869
|
-
"Invalid html_options.highlight_style '{}'",
|
|
870
|
-
other
|
|
871
|
-
)));
|
|
872
|
-
}
|
|
873
|
-
};
|
|
874
|
-
}
|
|
875
|
-
|
|
876
|
-
if let Some(val) = get_kw(ruby, hash, "extract_metadata") {
|
|
877
|
-
options.extract_metadata = bool::try_convert(val)?;
|
|
878
|
-
}
|
|
879
|
-
|
|
880
|
-
if let Some(val) = get_kw(ruby, hash, "whitespace_mode") {
|
|
881
|
-
let mode = symbol_to_string(val)?;
|
|
882
|
-
options.whitespace_mode = match mode.to_lowercase().as_str() {
|
|
883
|
-
"normalized" => WhitespaceMode::Normalized,
|
|
884
|
-
"strict" => WhitespaceMode::Strict,
|
|
885
|
-
other => {
|
|
886
|
-
return Err(runtime_error(format!(
|
|
887
|
-
"Invalid html_options.whitespace_mode '{}'",
|
|
888
|
-
other
|
|
889
|
-
)));
|
|
890
|
-
}
|
|
891
|
-
};
|
|
892
|
-
}
|
|
893
|
-
|
|
894
|
-
if let Some(val) = get_kw(ruby, hash, "strip_newlines") {
|
|
895
|
-
options.strip_newlines = bool::try_convert(val)?;
|
|
896
|
-
}
|
|
897
|
-
|
|
898
|
-
if let Some(val) = get_kw(ruby, hash, "wrap") {
|
|
899
|
-
options.wrap = bool::try_convert(val)?;
|
|
900
|
-
}
|
|
901
|
-
|
|
902
|
-
if let Some(val) = get_kw(ruby, hash, "wrap_width") {
|
|
903
|
-
options.wrap_width = usize::try_convert(val)?;
|
|
904
|
-
}
|
|
905
|
-
|
|
906
|
-
if let Some(val) = get_kw(ruby, hash, "convert_as_inline") {
|
|
907
|
-
options.convert_as_inline = bool::try_convert(val)?;
|
|
908
|
-
}
|
|
909
|
-
|
|
910
|
-
if let Some(val) = get_kw(ruby, hash, "sub_symbol") {
|
|
911
|
-
options.sub_symbol = String::try_convert(val)?;
|
|
912
|
-
}
|
|
913
|
-
|
|
914
|
-
if let Some(val) = get_kw(ruby, hash, "sup_symbol") {
|
|
915
|
-
options.sup_symbol = String::try_convert(val)?;
|
|
916
|
-
}
|
|
917
|
-
|
|
918
|
-
if let Some(val) = get_kw(ruby, hash, "newline_style") {
|
|
919
|
-
let style = symbol_to_string(val)?;
|
|
920
|
-
options.newline_style = match style.to_lowercase().as_str() {
|
|
921
|
-
"spaces" => NewlineStyle::Spaces,
|
|
922
|
-
"backslash" => NewlineStyle::Backslash,
|
|
923
|
-
other => return Err(runtime_error(format!("Invalid html_options.newline_style '{}'", other))),
|
|
924
|
-
};
|
|
925
|
-
}
|
|
926
|
-
|
|
927
|
-
if let Some(val) = get_kw(ruby, hash, "code_block_style") {
|
|
928
|
-
let style = symbol_to_string(val)?;
|
|
929
|
-
options.code_block_style = match style.to_lowercase().as_str() {
|
|
930
|
-
"indented" => CodeBlockStyle::Indented,
|
|
931
|
-
"backticks" => CodeBlockStyle::Backticks,
|
|
932
|
-
"tildes" => CodeBlockStyle::Tildes,
|
|
933
|
-
other => {
|
|
934
|
-
return Err(runtime_error(format!(
|
|
935
|
-
"Invalid html_options.code_block_style '{}'",
|
|
936
|
-
other
|
|
937
|
-
)));
|
|
938
|
-
}
|
|
939
|
-
};
|
|
940
|
-
}
|
|
941
|
-
|
|
942
|
-
if let Some(val) = get_kw(ruby, hash, "keep_inline_images_in") {
|
|
943
|
-
let arr = RArray::try_convert(val)?;
|
|
944
|
-
options.keep_inline_images_in = arr.to_vec::<String>()?;
|
|
945
|
-
}
|
|
946
|
-
|
|
947
|
-
if let Some(val) = get_kw(ruby, hash, "encoding") {
|
|
948
|
-
options.encoding = String::try_convert(val)?;
|
|
949
|
-
}
|
|
950
|
-
|
|
951
|
-
if let Some(val) = get_kw(ruby, hash, "debug") {
|
|
952
|
-
options.debug = bool::try_convert(val)?;
|
|
953
|
-
}
|
|
954
|
-
|
|
955
|
-
if let Some(val) = get_kw(ruby, hash, "strip_tags") {
|
|
956
|
-
let arr = RArray::try_convert(val)?;
|
|
957
|
-
options.strip_tags = arr.to_vec::<String>()?;
|
|
958
|
-
}
|
|
959
|
-
|
|
960
|
-
if let Some(val) = get_kw(ruby, hash, "preserve_tags") {
|
|
961
|
-
let arr = RArray::try_convert(val)?;
|
|
962
|
-
options.preserve_tags = arr.to_vec::<String>()?;
|
|
963
|
-
}
|
|
964
|
-
|
|
965
|
-
if let Some(val) = get_kw(ruby, hash, "preprocessing")
|
|
966
|
-
&& !val.is_nil()
|
|
967
|
-
{
|
|
968
|
-
let pre_hash = RHash::try_convert(val)?;
|
|
969
|
-
let mut preprocessing = options.preprocessing.clone();
|
|
970
|
-
if let Some(v) = get_kw(ruby, pre_hash, "enabled") {
|
|
971
|
-
preprocessing.enabled = bool::try_convert(v)?;
|
|
972
|
-
}
|
|
973
|
-
if let Some(v) = get_kw(ruby, pre_hash, "preset") {
|
|
974
|
-
let preset = symbol_to_string(v)?;
|
|
975
|
-
preprocessing.preset = match preset.to_lowercase().as_str() {
|
|
976
|
-
"minimal" => PreprocessingPreset::Minimal,
|
|
977
|
-
"standard" => PreprocessingPreset::Standard,
|
|
978
|
-
"aggressive" => PreprocessingPreset::Aggressive,
|
|
979
|
-
other => {
|
|
980
|
-
return Err(runtime_error(format!(
|
|
981
|
-
"Invalid html_options.preprocessing.preset '{}'",
|
|
982
|
-
other
|
|
983
|
-
)));
|
|
984
|
-
}
|
|
985
|
-
};
|
|
986
|
-
}
|
|
987
|
-
if let Some(v) = get_kw(ruby, pre_hash, "remove_navigation") {
|
|
988
|
-
preprocessing.remove_navigation = bool::try_convert(v)?;
|
|
989
|
-
}
|
|
990
|
-
if let Some(v) = get_kw(ruby, pre_hash, "remove_forms") {
|
|
991
|
-
preprocessing.remove_forms = bool::try_convert(v)?;
|
|
992
|
-
}
|
|
993
|
-
options.preprocessing = preprocessing;
|
|
994
|
-
}
|
|
995
|
-
|
|
996
|
-
Ok(options)
|
|
997
|
-
}
|
|
998
|
-
|
|
999
|
-
fn keyword_algorithm_to_str(algo: RustKeywordAlgorithm) -> &'static str {
|
|
1000
|
-
match algo {
|
|
1001
|
-
RustKeywordAlgorithm::Yake => "yake",
|
|
1002
|
-
RustKeywordAlgorithm::Rake => "rake",
|
|
1003
|
-
}
|
|
1004
|
-
}
|
|
1005
|
-
|
|
1006
|
-
/// Serializes a keyword configuration into a new Ruby Hash with string keys.
///
/// Always writes `algorithm`, `max_keywords`, `min_score`, `language` (an empty
/// string when unset) and `ngram_range` (a two-element Array), in that order.
/// `yake_params` and `rake_params` are added as nested Hashes only when present.
fn keyword_config_to_ruby_hash(ruby: &Ruby, config: &RustKeywordConfig) -> Result<RHash, Error> {
    let hash = ruby.hash_new();

    let algorithm_name = keyword_algorithm_to_str(config.algorithm);
    hash.aset("algorithm", algorithm_name)?;
    hash.aset("max_keywords", config.max_keywords as i64)?;
    hash.aset("min_score", config.min_score)?;
    hash.aset("language", config.language.as_deref().unwrap_or(""))?;

    let (lower, upper) = config.ngram_range;
    let range_array = ruby.ary_new();
    for bound in [lower, upper] {
        range_array.push(bound as i64)?;
    }
    hash.aset("ngram_range", range_array)?;

    if let Some(yake) = config.yake_params.as_ref() {
        let yake_hash = ruby.hash_new();
        yake_hash.aset("window_size", yake.window_size as i64)?;
        hash.aset("yake_params", yake_hash)?;
    }

    if let Some(rake) = config.rake_params.as_ref() {
        let rake_hash = ruby.hash_new();
        rake_hash.aset("min_word_length", rake.min_word_length as i64)?;
        rake_hash.aset("max_words_per_phrase", rake.max_words_per_phrase as i64)?;
        hash.aset("rake_params", rake_hash)?;
    }

    Ok(hash)
}
|
|
1033
|
-
|
|
1034
|
-
fn html_options_to_ruby_hash(ruby: &Ruby, options: &ConversionOptions) -> Result<RHash, Error> {
|
|
1035
|
-
let hash = ruby.hash_new();
|
|
1036
|
-
hash.aset(
|
|
1037
|
-
"heading_style",
|
|
1038
|
-
match options.heading_style {
|
|
1039
|
-
HeadingStyle::Atx => "atx",
|
|
1040
|
-
HeadingStyle::Underlined => "underlined",
|
|
1041
|
-
HeadingStyle::AtxClosed => "atx_closed",
|
|
1042
|
-
},
|
|
1043
|
-
)?;
|
|
1044
|
-
hash.aset(
|
|
1045
|
-
"list_indent_type",
|
|
1046
|
-
match options.list_indent_type {
|
|
1047
|
-
ListIndentType::Spaces => "spaces",
|
|
1048
|
-
ListIndentType::Tabs => "tabs",
|
|
1049
|
-
},
|
|
1050
|
-
)?;
|
|
1051
|
-
hash.aset("list_indent_width", options.list_indent_width as i64)?;
|
|
1052
|
-
hash.aset("bullets", options.bullets.clone())?;
|
|
1053
|
-
hash.aset("strong_em_symbol", options.strong_em_symbol.to_string())?;
|
|
1054
|
-
hash.aset("escape_asterisks", options.escape_asterisks)?;
|
|
1055
|
-
hash.aset("escape_underscores", options.escape_underscores)?;
|
|
1056
|
-
hash.aset("escape_misc", options.escape_misc)?;
|
|
1057
|
-
hash.aset("escape_ascii", options.escape_ascii)?;
|
|
1058
|
-
hash.aset("code_language", options.code_language.clone())?;
|
|
1059
|
-
hash.aset("autolinks", options.autolinks)?;
|
|
1060
|
-
hash.aset("default_title", options.default_title)?;
|
|
1061
|
-
hash.aset("br_in_tables", options.br_in_tables)?;
|
|
1062
|
-
hash.aset("hocr_spatial_tables", options.hocr_spatial_tables)?;
|
|
1063
|
-
hash.aset(
|
|
1064
|
-
"highlight_style",
|
|
1065
|
-
match options.highlight_style {
|
|
1066
|
-
HighlightStyle::DoubleEqual => "double_equal",
|
|
1067
|
-
HighlightStyle::Html => "html",
|
|
1068
|
-
HighlightStyle::Bold => "bold",
|
|
1069
|
-
HighlightStyle::None => "none",
|
|
1070
|
-
},
|
|
1071
|
-
)?;
|
|
1072
|
-
hash.aset("extract_metadata", options.extract_metadata)?;
|
|
1073
|
-
hash.aset(
|
|
1074
|
-
"whitespace_mode",
|
|
1075
|
-
match options.whitespace_mode {
|
|
1076
|
-
WhitespaceMode::Normalized => "normalized",
|
|
1077
|
-
WhitespaceMode::Strict => "strict",
|
|
1078
|
-
},
|
|
1079
|
-
)?;
|
|
1080
|
-
hash.aset("strip_newlines", options.strip_newlines)?;
|
|
1081
|
-
hash.aset("wrap", options.wrap)?;
|
|
1082
|
-
hash.aset("wrap_width", options.wrap_width as i64)?;
|
|
1083
|
-
hash.aset("convert_as_inline", options.convert_as_inline)?;
|
|
1084
|
-
hash.aset("sub_symbol", options.sub_symbol.clone())?;
|
|
1085
|
-
hash.aset("sup_symbol", options.sup_symbol.clone())?;
|
|
1086
|
-
hash.aset(
|
|
1087
|
-
"newline_style",
|
|
1088
|
-
match options.newline_style {
|
|
1089
|
-
NewlineStyle::Spaces => "spaces",
|
|
1090
|
-
NewlineStyle::Backslash => "backslash",
|
|
1091
|
-
},
|
|
1092
|
-
)?;
|
|
1093
|
-
hash.aset(
|
|
1094
|
-
"code_block_style",
|
|
1095
|
-
match options.code_block_style {
|
|
1096
|
-
CodeBlockStyle::Indented => "indented",
|
|
1097
|
-
CodeBlockStyle::Backticks => "backticks",
|
|
1098
|
-
CodeBlockStyle::Tildes => "tildes",
|
|
1099
|
-
},
|
|
1100
|
-
)?;
|
|
1101
|
-
|
|
1102
|
-
let keep_inline = ruby.ary_new();
|
|
1103
|
-
for tag in &options.keep_inline_images_in {
|
|
1104
|
-
keep_inline.push(tag.as_str())?;
|
|
1105
|
-
}
|
|
1106
|
-
hash.aset("keep_inline_images_in", keep_inline)?;
|
|
1107
|
-
|
|
1108
|
-
hash.aset("encoding", options.encoding.clone())?;
|
|
1109
|
-
hash.aset("debug", options.debug)?;
|
|
1110
|
-
|
|
1111
|
-
let strip_tags = ruby.ary_new();
|
|
1112
|
-
for tag in &options.strip_tags {
|
|
1113
|
-
strip_tags.push(tag.as_str())?;
|
|
1114
|
-
}
|
|
1115
|
-
hash.aset("strip_tags", strip_tags)?;
|
|
1116
|
-
|
|
1117
|
-
let preserve_tags = ruby.ary_new();
|
|
1118
|
-
for tag in &options.preserve_tags {
|
|
1119
|
-
preserve_tags.push(tag.as_str())?;
|
|
1120
|
-
}
|
|
1121
|
-
hash.aset("preserve_tags", preserve_tags)?;
|
|
1122
|
-
|
|
1123
|
-
let pre_hash = ruby.hash_new();
|
|
1124
|
-
pre_hash.aset("enabled", options.preprocessing.enabled)?;
|
|
1125
|
-
pre_hash.aset(
|
|
1126
|
-
"preset",
|
|
1127
|
-
match options.preprocessing.preset {
|
|
1128
|
-
PreprocessingPreset::Minimal => "minimal",
|
|
1129
|
-
PreprocessingPreset::Standard => "standard",
|
|
1130
|
-
PreprocessingPreset::Aggressive => "aggressive",
|
|
1131
|
-
},
|
|
1132
|
-
)?;
|
|
1133
|
-
pre_hash.aset("remove_navigation", options.preprocessing.remove_navigation)?;
|
|
1134
|
-
pre_hash.aset("remove_forms", options.preprocessing.remove_forms)?;
|
|
1135
|
-
hash.aset("preprocessing", pre_hash)?;
|
|
1136
|
-
|
|
1137
|
-
Ok(hash)
|
|
1138
|
-
}
|
|
1139
|
-
|
|
1140
|
-
/// Parse PageConfig from Ruby Hash
|
|
1141
|
-
fn parse_page_config(ruby: &Ruby, hash: RHash) -> Result<PageConfig, Error> {
|
|
1142
|
-
let extract_pages = if let Some(val) = get_kw(ruby, hash, "extract_pages") {
|
|
1143
|
-
bool::try_convert(val)?
|
|
1144
|
-
} else {
|
|
1145
|
-
false
|
|
1146
|
-
};
|
|
1147
|
-
|
|
1148
|
-
let insert_page_markers = if let Some(val) = get_kw(ruby, hash, "insert_page_markers") {
|
|
1149
|
-
bool::try_convert(val)?
|
|
1150
|
-
} else {
|
|
1151
|
-
false
|
|
1152
|
-
};
|
|
1153
|
-
|
|
1154
|
-
let marker_format = if let Some(val) = get_kw(ruby, hash, "marker_format") {
|
|
1155
|
-
String::try_convert(val)?
|
|
1156
|
-
} else {
|
|
1157
|
-
"\n\n<!-- PAGE {page_num} -->\n\n".to_string()
|
|
1158
|
-
};
|
|
1159
|
-
|
|
1160
|
-
let config = PageConfig {
|
|
1161
|
-
extract_pages,
|
|
1162
|
-
insert_page_markers,
|
|
1163
|
-
marker_format,
|
|
1164
|
-
};
|
|
1165
|
-
|
|
1166
|
-
Ok(config)
|
|
1167
|
-
}
|
|
1168
|
-
|
|
1169
|
-
/// Parse ExtractionConfig from Ruby Hash
|
|
1170
|
-
fn parse_extraction_config(ruby: &Ruby, opts: Option<RHash>) -> Result<ExtractionConfig, Error> {
|
|
1171
|
-
let mut config = ExtractionConfig::default();
|
|
1172
|
-
|
|
1173
|
-
if let Some(hash) = opts {
|
|
1174
|
-
if let Some(val) = get_kw(ruby, hash, "use_cache") {
|
|
1175
|
-
config.use_cache = bool::try_convert(val)?;
|
|
1176
|
-
}
|
|
1177
|
-
|
|
1178
|
-
if let Some(val) = get_kw(ruby, hash, "enable_quality_processing") {
|
|
1179
|
-
config.enable_quality_processing = bool::try_convert(val)?;
|
|
1180
|
-
}
|
|
1181
|
-
|
|
1182
|
-
if let Some(val) = get_kw(ruby, hash, "force_ocr") {
|
|
1183
|
-
config.force_ocr = bool::try_convert(val)?;
|
|
1184
|
-
}
|
|
1185
|
-
|
|
1186
|
-
if let Some(val) = get_kw(ruby, hash, "ocr")
|
|
1187
|
-
&& !val.is_nil()
|
|
1188
|
-
{
|
|
1189
|
-
let ocr_hash = RHash::try_convert(val)?;
|
|
1190
|
-
config.ocr = Some(parse_ocr_config(ruby, ocr_hash)?);
|
|
1191
|
-
}
|
|
1192
|
-
|
|
1193
|
-
if let Some(val) = get_kw(ruby, hash, "chunking")
|
|
1194
|
-
&& !val.is_nil()
|
|
1195
|
-
{
|
|
1196
|
-
let chunking_hash = RHash::try_convert(val)?;
|
|
1197
|
-
config.chunking = Some(parse_chunking_config(ruby, chunking_hash)?);
|
|
1198
|
-
}
|
|
1199
|
-
|
|
1200
|
-
if let Some(val) = get_kw(ruby, hash, "language_detection")
|
|
1201
|
-
&& !val.is_nil()
|
|
1202
|
-
{
|
|
1203
|
-
let lang_hash = RHash::try_convert(val)?;
|
|
1204
|
-
config.language_detection = Some(parse_language_detection_config(ruby, lang_hash)?);
|
|
1205
|
-
}
|
|
1206
|
-
|
|
1207
|
-
if let Some(val) = get_kw(ruby, hash, "pdf_options")
|
|
1208
|
-
&& !val.is_nil()
|
|
1209
|
-
{
|
|
1210
|
-
let pdf_hash = RHash::try_convert(val)?;
|
|
1211
|
-
config.pdf_options = Some(parse_pdf_config(ruby, pdf_hash)?);
|
|
1212
|
-
}
|
|
1213
|
-
|
|
1214
|
-
if let Some(val) = get_kw(ruby, hash, "images")
|
|
1215
|
-
&& !val.is_nil()
|
|
1216
|
-
{
|
|
1217
|
-
let images_hash = RHash::try_convert(val)?;
|
|
1218
|
-
config.images = Some(parse_image_extraction_config(ruby, images_hash)?);
|
|
1219
|
-
}
|
|
1220
|
-
|
|
1221
|
-
if let Some(val) = get_kw(ruby, hash, "postprocessor")
|
|
1222
|
-
&& !val.is_nil()
|
|
1223
|
-
{
|
|
1224
|
-
let postprocessor_hash = RHash::try_convert(val)?;
|
|
1225
|
-
config.postprocessor = Some(parse_postprocessor_config(ruby, postprocessor_hash)?);
|
|
1226
|
-
}
|
|
1227
|
-
|
|
1228
|
-
if let Some(val) = get_kw(ruby, hash, "token_reduction")
|
|
1229
|
-
&& !val.is_nil()
|
|
1230
|
-
{
|
|
1231
|
-
let token_reduction_hash = RHash::try_convert(val)?;
|
|
1232
|
-
config.token_reduction = Some(parse_token_reduction_config(ruby, token_reduction_hash)?);
|
|
1233
|
-
}
|
|
1234
|
-
|
|
1235
|
-
if let Some(val) = get_kw(ruby, hash, "keywords")
|
|
1236
|
-
&& !val.is_nil()
|
|
1237
|
-
{
|
|
1238
|
-
let keywords_hash = RHash::try_convert(val)?;
|
|
1239
|
-
config.keywords = Some(parse_keyword_config(ruby, keywords_hash)?);
|
|
1240
|
-
}
|
|
1241
|
-
|
|
1242
|
-
if let Some(val) = get_kw(ruby, hash, "html_options")
|
|
1243
|
-
&& !val.is_nil()
|
|
1244
|
-
{
|
|
1245
|
-
let html_hash = RHash::try_convert(val)?;
|
|
1246
|
-
config.html_options = Some(parse_html_options(ruby, html_hash)?);
|
|
1247
|
-
}
|
|
1248
|
-
|
|
1249
|
-
if let Some(val) = get_kw(ruby, hash, "pages")
|
|
1250
|
-
&& !val.is_nil()
|
|
1251
|
-
{
|
|
1252
|
-
let pages_hash = RHash::try_convert(val)?;
|
|
1253
|
-
config.pages = Some(parse_page_config(ruby, pages_hash)?);
|
|
1254
|
-
}
|
|
1255
|
-
|
|
1256
|
-
if let Some(val) = get_kw(ruby, hash, "max_concurrent_extractions") {
|
|
1257
|
-
let value = usize::try_convert(val)?;
|
|
1258
|
-
config.max_concurrent_extractions = Some(value);
|
|
1259
|
-
}
|
|
1260
|
-
}
|
|
1261
|
-
|
|
1262
|
-
Ok(config)
|
|
1263
|
-
}
|
|
1264
|
-
|
|
1265
|
-
/// Convert ExtractionConfig to Ruby Hash for Config::Extraction.
|
|
1266
|
-
///
|
|
1267
|
-
/// This function converts a Rust ExtractionConfig into a Ruby hash that can be passed
|
|
1268
|
-
/// to Kreuzberg::Config::Extraction.new(**hash).
|
|
1269
|
-
fn extraction_config_to_ruby_hash(ruby: &Ruby, config: ExtractionConfig) -> Result<RHash, Error> {
|
|
1270
|
-
let hash = ruby.hash_new();
|
|
1271
|
-
|
|
1272
|
-
set_hash_entry(
|
|
1273
|
-
ruby,
|
|
1274
|
-
&hash,
|
|
1275
|
-
"use_cache",
|
|
1276
|
-
if config.use_cache {
|
|
1277
|
-
ruby.qtrue().as_value()
|
|
1278
|
-
} else {
|
|
1279
|
-
ruby.qfalse().as_value()
|
|
1280
|
-
},
|
|
1281
|
-
)?;
|
|
1282
|
-
set_hash_entry(
|
|
1283
|
-
ruby,
|
|
1284
|
-
&hash,
|
|
1285
|
-
"enable_quality_processing",
|
|
1286
|
-
if config.enable_quality_processing {
|
|
1287
|
-
ruby.qtrue().as_value()
|
|
1288
|
-
} else {
|
|
1289
|
-
ruby.qfalse().as_value()
|
|
1290
|
-
},
|
|
1291
|
-
)?;
|
|
1292
|
-
set_hash_entry(
|
|
1293
|
-
ruby,
|
|
1294
|
-
&hash,
|
|
1295
|
-
"force_ocr",
|
|
1296
|
-
if config.force_ocr {
|
|
1297
|
-
ruby.qtrue().as_value()
|
|
1298
|
-
} else {
|
|
1299
|
-
ruby.qfalse().as_value()
|
|
1300
|
-
},
|
|
1301
|
-
)?;
|
|
1302
|
-
|
|
1303
|
-
if let Some(ocr) = config.ocr {
|
|
1304
|
-
let ocr_hash = ruby.hash_new();
|
|
1305
|
-
set_hash_entry(
|
|
1306
|
-
ruby,
|
|
1307
|
-
&ocr_hash,
|
|
1308
|
-
"backend",
|
|
1309
|
-
ruby.str_new(&ocr.backend).into_value_with(ruby),
|
|
1310
|
-
)?;
|
|
1311
|
-
set_hash_entry(
|
|
1312
|
-
ruby,
|
|
1313
|
-
&ocr_hash,
|
|
1314
|
-
"language",
|
|
1315
|
-
ruby.str_new(&ocr.language).into_value_with(ruby),
|
|
1316
|
-
)?;
|
|
1317
|
-
if let Some(tesseract_config) = ocr.tesseract_config {
|
|
1318
|
-
let tc_json = serde_json::to_value(&tesseract_config)
|
|
1319
|
-
.map_err(|e| runtime_error(format!("Failed to serialize tesseract_config: {}", e)))?;
|
|
1320
|
-
let tc_ruby = json_value_to_ruby(ruby, &tc_json)?;
|
|
1321
|
-
set_hash_entry(ruby, &ocr_hash, "tesseract_config", tc_ruby)?;
|
|
1322
|
-
}
|
|
1323
|
-
set_hash_entry(ruby, &hash, "ocr", ocr_hash.into_value_with(ruby))?;
|
|
1324
|
-
}
|
|
1325
|
-
|
|
1326
|
-
if let Some(chunking) = config.chunking {
|
|
1327
|
-
let chunking_hash = ruby.hash_new();
|
|
1328
|
-
set_hash_entry(
|
|
1329
|
-
ruby,
|
|
1330
|
-
&chunking_hash,
|
|
1331
|
-
"max_chars",
|
|
1332
|
-
ruby.integer_from_i64(chunking.max_chars as i64).into_value_with(ruby),
|
|
1333
|
-
)?;
|
|
1334
|
-
set_hash_entry(
|
|
1335
|
-
ruby,
|
|
1336
|
-
&chunking_hash,
|
|
1337
|
-
"max_overlap",
|
|
1338
|
-
ruby.integer_from_i64(chunking.max_overlap as i64).into_value_with(ruby),
|
|
1339
|
-
)?;
|
|
1340
|
-
if let Some(preset) = chunking.preset {
|
|
1341
|
-
set_hash_entry(
|
|
1342
|
-
ruby,
|
|
1343
|
-
&chunking_hash,
|
|
1344
|
-
"preset",
|
|
1345
|
-
ruby.str_new(&preset).into_value_with(ruby),
|
|
1346
|
-
)?;
|
|
1347
|
-
}
|
|
1348
|
-
if let Some(embedding) = chunking.embedding {
|
|
1349
|
-
let embedding_json = serde_json::to_value(&embedding)
|
|
1350
|
-
.map_err(|e| runtime_error(format!("Failed to serialize embedding config: {}", e)))?;
|
|
1351
|
-
let embedding_value = json_value_to_ruby(ruby, &embedding_json)?;
|
|
1352
|
-
set_hash_entry(ruby, &chunking_hash, "embedding", embedding_value)?;
|
|
1353
|
-
}
|
|
1354
|
-
set_hash_entry(ruby, &hash, "chunking", chunking_hash.into_value_with(ruby))?;
|
|
1355
|
-
}
|
|
1356
|
-
|
|
1357
|
-
if let Some(lang_detection) = config.language_detection {
|
|
1358
|
-
let lang_hash = ruby.hash_new();
|
|
1359
|
-
set_hash_entry(
|
|
1360
|
-
ruby,
|
|
1361
|
-
&lang_hash,
|
|
1362
|
-
"enabled",
|
|
1363
|
-
if lang_detection.enabled {
|
|
1364
|
-
ruby.qtrue().as_value()
|
|
1365
|
-
} else {
|
|
1366
|
-
ruby.qfalse().as_value()
|
|
1367
|
-
},
|
|
1368
|
-
)?;
|
|
1369
|
-
set_hash_entry(
|
|
1370
|
-
ruby,
|
|
1371
|
-
&lang_hash,
|
|
1372
|
-
"min_confidence",
|
|
1373
|
-
ruby.float_from_f64(lang_detection.min_confidence).into_value_with(ruby),
|
|
1374
|
-
)?;
|
|
1375
|
-
set_hash_entry(
|
|
1376
|
-
ruby,
|
|
1377
|
-
&lang_hash,
|
|
1378
|
-
"detect_multiple",
|
|
1379
|
-
if lang_detection.detect_multiple {
|
|
1380
|
-
ruby.qtrue().as_value()
|
|
1381
|
-
} else {
|
|
1382
|
-
ruby.qfalse().as_value()
|
|
1383
|
-
},
|
|
1384
|
-
)?;
|
|
1385
|
-
set_hash_entry(ruby, &hash, "language_detection", lang_hash.into_value_with(ruby))?;
|
|
1386
|
-
}
|
|
1387
|
-
|
|
1388
|
-
if let Some(pdf_options) = config.pdf_options {
|
|
1389
|
-
let pdf_hash = ruby.hash_new();
|
|
1390
|
-
set_hash_entry(
|
|
1391
|
-
ruby,
|
|
1392
|
-
&pdf_hash,
|
|
1393
|
-
"extract_images",
|
|
1394
|
-
if pdf_options.extract_images {
|
|
1395
|
-
ruby.qtrue().as_value()
|
|
1396
|
-
} else {
|
|
1397
|
-
ruby.qfalse().as_value()
|
|
1398
|
-
},
|
|
1399
|
-
)?;
|
|
1400
|
-
if let Some(passwords) = pdf_options.passwords {
|
|
1401
|
-
let passwords_array = ruby.ary_from_vec(passwords);
|
|
1402
|
-
set_hash_entry(ruby, &pdf_hash, "passwords", passwords_array.into_value_with(ruby))?;
|
|
1403
|
-
}
|
|
1404
|
-
set_hash_entry(
|
|
1405
|
-
ruby,
|
|
1406
|
-
&pdf_hash,
|
|
1407
|
-
"extract_metadata",
|
|
1408
|
-
if pdf_options.extract_metadata {
|
|
1409
|
-
ruby.qtrue().as_value()
|
|
1410
|
-
} else {
|
|
1411
|
-
ruby.qfalse().as_value()
|
|
1412
|
-
},
|
|
1413
|
-
)?;
|
|
1414
|
-
set_hash_entry(ruby, &hash, "pdf_options", pdf_hash.into_value_with(ruby))?;
|
|
1415
|
-
}
|
|
1416
|
-
|
|
1417
|
-
if let Some(images) = config.images {
|
|
1418
|
-
let images_hash = ruby.hash_new();
|
|
1419
|
-
set_hash_entry(
|
|
1420
|
-
ruby,
|
|
1421
|
-
&images_hash,
|
|
1422
|
-
"extract_images",
|
|
1423
|
-
if images.extract_images {
|
|
1424
|
-
ruby.qtrue().as_value()
|
|
1425
|
-
} else {
|
|
1426
|
-
ruby.qfalse().as_value()
|
|
1427
|
-
},
|
|
1428
|
-
)?;
|
|
1429
|
-
set_hash_entry(
|
|
1430
|
-
ruby,
|
|
1431
|
-
&images_hash,
|
|
1432
|
-
"target_dpi",
|
|
1433
|
-
ruby.integer_from_i64(images.target_dpi as i64).into_value_with(ruby),
|
|
1434
|
-
)?;
|
|
1435
|
-
set_hash_entry(
|
|
1436
|
-
ruby,
|
|
1437
|
-
&images_hash,
|
|
1438
|
-
"max_image_dimension",
|
|
1439
|
-
ruby.integer_from_i64(images.max_image_dimension as i64)
|
|
1440
|
-
.into_value_with(ruby),
|
|
1441
|
-
)?;
|
|
1442
|
-
set_hash_entry(
|
|
1443
|
-
ruby,
|
|
1444
|
-
&images_hash,
|
|
1445
|
-
"auto_adjust_dpi",
|
|
1446
|
-
if images.auto_adjust_dpi {
|
|
1447
|
-
ruby.qtrue().as_value()
|
|
1448
|
-
} else {
|
|
1449
|
-
ruby.qfalse().as_value()
|
|
1450
|
-
},
|
|
1451
|
-
)?;
|
|
1452
|
-
set_hash_entry(
|
|
1453
|
-
ruby,
|
|
1454
|
-
&images_hash,
|
|
1455
|
-
"min_dpi",
|
|
1456
|
-
ruby.integer_from_i64(images.min_dpi as i64).into_value_with(ruby),
|
|
1457
|
-
)?;
|
|
1458
|
-
set_hash_entry(
|
|
1459
|
-
ruby,
|
|
1460
|
-
&images_hash,
|
|
1461
|
-
"max_dpi",
|
|
1462
|
-
ruby.integer_from_i64(images.max_dpi as i64).into_value_with(ruby),
|
|
1463
|
-
)?;
|
|
1464
|
-
set_hash_entry(ruby, &hash, "image_extraction", images_hash.into_value_with(ruby))?;
|
|
1465
|
-
}
|
|
1466
|
-
|
|
1467
|
-
if let Some(postprocessor) = config.postprocessor {
|
|
1468
|
-
let pp_hash = ruby.hash_new();
|
|
1469
|
-
set_hash_entry(
|
|
1470
|
-
ruby,
|
|
1471
|
-
&pp_hash,
|
|
1472
|
-
"enabled",
|
|
1473
|
-
if postprocessor.enabled {
|
|
1474
|
-
ruby.qtrue().as_value()
|
|
1475
|
-
} else {
|
|
1476
|
-
ruby.qfalse().as_value()
|
|
1477
|
-
},
|
|
1478
|
-
)?;
|
|
1479
|
-
if let Some(enabled_processors) = postprocessor.enabled_processors {
|
|
1480
|
-
let enabled_array = ruby.ary_from_vec(enabled_processors);
|
|
1481
|
-
set_hash_entry(
|
|
1482
|
-
ruby,
|
|
1483
|
-
&pp_hash,
|
|
1484
|
-
"enabled_processors",
|
|
1485
|
-
enabled_array.into_value_with(ruby),
|
|
1486
|
-
)?;
|
|
1487
|
-
}
|
|
1488
|
-
if let Some(disabled_processors) = postprocessor.disabled_processors {
|
|
1489
|
-
let disabled_array = ruby.ary_from_vec(disabled_processors);
|
|
1490
|
-
set_hash_entry(
|
|
1491
|
-
ruby,
|
|
1492
|
-
&pp_hash,
|
|
1493
|
-
"disabled_processors",
|
|
1494
|
-
disabled_array.into_value_with(ruby),
|
|
1495
|
-
)?;
|
|
1496
|
-
}
|
|
1497
|
-
set_hash_entry(ruby, &hash, "postprocessor", pp_hash.into_value_with(ruby))?;
|
|
1498
|
-
}
|
|
1499
|
-
|
|
1500
|
-
if let Some(token_reduction) = config.token_reduction {
|
|
1501
|
-
let tr_hash = ruby.hash_new();
|
|
1502
|
-
set_hash_entry(
|
|
1503
|
-
ruby,
|
|
1504
|
-
&tr_hash,
|
|
1505
|
-
"mode",
|
|
1506
|
-
ruby.str_new(&token_reduction.mode).into_value_with(ruby),
|
|
1507
|
-
)?;
|
|
1508
|
-
set_hash_entry(
|
|
1509
|
-
ruby,
|
|
1510
|
-
&tr_hash,
|
|
1511
|
-
"preserve_important_words",
|
|
1512
|
-
if token_reduction.preserve_important_words {
|
|
1513
|
-
ruby.qtrue().as_value()
|
|
1514
|
-
} else {
|
|
1515
|
-
ruby.qfalse().as_value()
|
|
1516
|
-
},
|
|
1517
|
-
)?;
|
|
1518
|
-
set_hash_entry(ruby, &hash, "token_reduction", tr_hash.into_value_with(ruby))?;
|
|
1519
|
-
}
|
|
1520
|
-
|
|
1521
|
-
if let Some(keywords) = config.keywords {
|
|
1522
|
-
let keywords_hash = keyword_config_to_ruby_hash(ruby, &keywords)?;
|
|
1523
|
-
set_hash_entry(ruby, &hash, "keywords", keywords_hash.into_value_with(ruby))?;
|
|
1524
|
-
}
|
|
1525
|
-
|
|
1526
|
-
if let Some(html_options) = config.html_options {
|
|
1527
|
-
let html_hash = html_options_to_ruby_hash(ruby, &html_options)?;
|
|
1528
|
-
set_hash_entry(ruby, &hash, "html_options", html_hash.into_value_with(ruby))?;
|
|
1529
|
-
}
|
|
1530
|
-
|
|
1531
|
-
if let Some(max_concurrent) = config.max_concurrent_extractions {
|
|
1532
|
-
set_hash_entry(
|
|
1533
|
-
ruby,
|
|
1534
|
-
&hash,
|
|
1535
|
-
"max_concurrent_extractions",
|
|
1536
|
-
ruby.integer_from_u64(max_concurrent as u64).into_value_with(ruby),
|
|
1537
|
-
)?;
|
|
1538
|
-
}
|
|
1539
|
-
|
|
1540
|
-
Ok(hash)
|
|
1541
|
-
}
|
|
1542
|
-
|
|
1543
|
-
/// Load extraction configuration from a file.
|
|
1544
|
-
///
|
|
1545
|
-
/// Detects the file format from the extension (.toml, .yaml, .json)
|
|
1546
|
-
/// and loads the configuration accordingly. Returns a hash to be used by Ruby.
|
|
1547
|
-
///
|
|
1548
|
-
/// @param path [String] Path to the configuration file
|
|
1549
|
-
/// @return [Hash] Configuration hash
|
|
1550
|
-
///
|
|
1551
|
-
/// @example Load from TOML
|
|
1552
|
-
/// hash = Kreuzberg._config_from_file_native("config.toml")
|
|
1553
|
-
///
|
|
1554
|
-
/// @example Load from YAML
|
|
1555
|
-
/// hash = Kreuzberg._config_from_file_native("config.yaml")
|
|
1556
|
-
///
|
|
1557
|
-
fn config_from_file(path: String) -> Result<RHash, Error> {
|
|
1558
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1559
|
-
let file_path = Path::new(&path);
|
|
1560
|
-
|
|
1561
|
-
let extension = file_path
|
|
1562
|
-
.extension()
|
|
1563
|
-
.and_then(|ext| ext.to_str())
|
|
1564
|
-
.ok_or_else(|| runtime_error("File path must have an extension (.toml, .yaml, or .json)"))?;
|
|
1565
|
-
|
|
1566
|
-
let config = match extension {
|
|
1567
|
-
"toml" => ExtractionConfig::from_toml_file(file_path).map_err(kreuzberg_error)?,
|
|
1568
|
-
"yaml" => ExtractionConfig::from_yaml_file(file_path).map_err(kreuzberg_error)?,
|
|
1569
|
-
"json" => ExtractionConfig::from_json_file(file_path).map_err(kreuzberg_error)?,
|
|
1570
|
-
_ => {
|
|
1571
|
-
return Err(runtime_error(format!(
|
|
1572
|
-
"Unsupported file extension '{}'. Supported: .toml, .yaml, .json",
|
|
1573
|
-
extension
|
|
1574
|
-
)));
|
|
1575
|
-
}
|
|
1576
|
-
};
|
|
1577
|
-
|
|
1578
|
-
extraction_config_to_ruby_hash(&ruby, config)
|
|
1579
|
-
}
|
|
1580
|
-
|
|
1581
|
-
/// Discover configuration file in current or parent directories.
|
|
1582
|
-
///
|
|
1583
|
-
/// Searches for kreuzberg.toml, kreuzberg.yaml, or kreuzberg.json in the current
|
|
1584
|
-
/// directory and parent directories. Returns nil if no config file is found.
|
|
1585
|
-
///
|
|
1586
|
-
/// @return [Hash, nil] Configuration hash or nil if not found
|
|
1587
|
-
///
|
|
1588
|
-
/// @example
|
|
1589
|
-
/// hash = Kreuzberg._config_discover_native
|
|
1590
|
-
/// # => {...config hash...} or nil
|
|
1591
|
-
///
|
|
1592
|
-
fn config_discover() -> Result<Value, Error> {
|
|
1593
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1594
|
-
|
|
1595
|
-
let maybe_config = ExtractionConfig::discover().map_err(kreuzberg_error)?;
|
|
1596
|
-
|
|
1597
|
-
match maybe_config {
|
|
1598
|
-
Some(config) => {
|
|
1599
|
-
let hash = extraction_config_to_ruby_hash(&ruby, config)?;
|
|
1600
|
-
Ok(hash.as_value())
|
|
1601
|
-
}
|
|
1602
|
-
None => Ok(ruby.qnil().as_value()),
|
|
1603
|
-
}
|
|
1604
|
-
}
|
|
1605
|
-
|
|
1606
|
-
/// Convert Rust ExtractionResult to Ruby Hash
|
|
1607
|
-
fn extraction_result_to_ruby(ruby: &Ruby, result: RustExtractionResult) -> Result<RHash, Error> {
|
|
1608
|
-
let hash = ruby.hash_new();
|
|
1609
|
-
|
|
1610
|
-
let content_value = ruby.str_new(result.content.as_str()).into_value_with(ruby);
|
|
1611
|
-
set_hash_entry(ruby, &hash, "content", content_value)?;
|
|
1612
|
-
|
|
1613
|
-
let mime_value = ruby.str_new(result.mime_type.as_str()).into_value_with(ruby);
|
|
1614
|
-
set_hash_entry(ruby, &hash, "mime_type", mime_value)?;
|
|
1615
|
-
|
|
1616
|
-
let metadata_json = serde_json::to_string(&result.metadata)
|
|
1617
|
-
.map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
|
|
1618
|
-
let metadata_json_value = ruby.str_new(&metadata_json).into_value_with(ruby);
|
|
1619
|
-
set_hash_entry(ruby, &hash, "metadata_json", metadata_json_value)?;
|
|
1620
|
-
let metadata_value = serde_json::to_value(&result.metadata)
|
|
1621
|
-
.map_err(|e| runtime_error(format!("Failed to serialize metadata: {}", e)))?;
|
|
1622
|
-
let metadata_hash = json_value_to_ruby(ruby, &metadata_value)?;
|
|
1623
|
-
set_hash_entry(ruby, &hash, "metadata", metadata_hash)?;
|
|
1624
|
-
|
|
1625
|
-
let tables_array = ruby.ary_new();
|
|
1626
|
-
for table in result.tables {
|
|
1627
|
-
let table_hash = ruby.hash_new();
|
|
1628
|
-
|
|
1629
|
-
let cells_array = ruby.ary_new();
|
|
1630
|
-
for row in table.cells {
|
|
1631
|
-
let row_array = ruby.ary_from_vec(row);
|
|
1632
|
-
cells_array.push(row_array)?;
|
|
1633
|
-
}
|
|
1634
|
-
table_hash.aset("cells", cells_array)?;
|
|
1635
|
-
|
|
1636
|
-
table_hash.aset("markdown", table.markdown)?;
|
|
1637
|
-
|
|
1638
|
-
table_hash.aset("page_number", table.page_number)?;
|
|
1639
|
-
|
|
1640
|
-
tables_array.push(table_hash)?;
|
|
1641
|
-
}
|
|
1642
|
-
let tables_value = tables_array.into_value_with(ruby);
|
|
1643
|
-
set_hash_entry(ruby, &hash, "tables", tables_value)?;
|
|
1644
|
-
|
|
1645
|
-
if let Some(langs) = result.detected_languages {
|
|
1646
|
-
let langs_array = ruby.ary_from_vec(langs);
|
|
1647
|
-
let langs_value = langs_array.into_value_with(ruby);
|
|
1648
|
-
set_hash_entry(ruby, &hash, "detected_languages", langs_value)?;
|
|
1649
|
-
} else {
|
|
1650
|
-
set_hash_entry(ruby, &hash, "detected_languages", ruby.qnil().as_value())?;
|
|
1651
|
-
}
|
|
1652
|
-
|
|
1653
|
-
if let Some(chunks) = result.chunks {
|
|
1654
|
-
let chunks_array = ruby.ary_new();
|
|
1655
|
-
for chunk in chunks {
|
|
1656
|
-
let chunk_hash = ruby.hash_new();
|
|
1657
|
-
chunk_hash.aset("content", chunk.content)?;
|
|
1658
|
-
chunk_hash.aset("byte_start", chunk.metadata.byte_start)?;
|
|
1659
|
-
chunk_hash.aset("byte_end", chunk.metadata.byte_end)?;
|
|
1660
|
-
if let Some(token_count) = chunk.metadata.token_count {
|
|
1661
|
-
chunk_hash.aset("token_count", token_count)?;
|
|
1662
|
-
} else {
|
|
1663
|
-
chunk_hash.aset("token_count", ruby.qnil().as_value())?;
|
|
1664
|
-
}
|
|
1665
|
-
chunk_hash.aset("chunk_index", chunk.metadata.chunk_index)?;
|
|
1666
|
-
chunk_hash.aset("total_chunks", chunk.metadata.total_chunks)?;
|
|
1667
|
-
if let Some(first_page) = chunk.metadata.first_page {
|
|
1668
|
-
chunk_hash.aset("first_page", first_page as i64)?;
|
|
1669
|
-
} else {
|
|
1670
|
-
chunk_hash.aset("first_page", ruby.qnil().as_value())?;
|
|
1671
|
-
}
|
|
1672
|
-
if let Some(last_page) = chunk.metadata.last_page {
|
|
1673
|
-
chunk_hash.aset("last_page", last_page as i64)?;
|
|
1674
|
-
} else {
|
|
1675
|
-
chunk_hash.aset("last_page", ruby.qnil().as_value())?;
|
|
1676
|
-
}
|
|
1677
|
-
if let Some(embedding) = chunk.embedding {
|
|
1678
|
-
let embedding_array = ruby.ary_new();
|
|
1679
|
-
for value in embedding {
|
|
1680
|
-
embedding_array.push(ruby.float_from_f64(value as f64).into_value_with(ruby))?;
|
|
1681
|
-
}
|
|
1682
|
-
chunk_hash.aset("embedding", embedding_array)?;
|
|
1683
|
-
} else {
|
|
1684
|
-
chunk_hash.aset("embedding", ruby.qnil().as_value())?;
|
|
1685
|
-
}
|
|
1686
|
-
chunks_array.push(chunk_hash)?;
|
|
1687
|
-
}
|
|
1688
|
-
let chunks_value = chunks_array.into_value_with(ruby);
|
|
1689
|
-
set_hash_entry(ruby, &hash, "chunks", chunks_value)?;
|
|
1690
|
-
} else {
|
|
1691
|
-
set_hash_entry(ruby, &hash, "chunks", ruby.qnil().as_value())?;
|
|
1692
|
-
}
|
|
1693
|
-
|
|
1694
|
-
if let Some(images) = result.images {
|
|
1695
|
-
let images_array = ruby.ary_new();
|
|
1696
|
-
for image in images {
|
|
1697
|
-
let image_hash = ruby.hash_new();
|
|
1698
|
-
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
1699
|
-
image_hash.aset("data", data_value)?;
|
|
1700
|
-
image_hash.aset("format", image.format)?;
|
|
1701
|
-
image_hash.aset("image_index", image.image_index as i64)?;
|
|
1702
|
-
if let Some(page) = image.page_number {
|
|
1703
|
-
image_hash.aset("page_number", page as i64)?;
|
|
1704
|
-
} else {
|
|
1705
|
-
image_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
1706
|
-
}
|
|
1707
|
-
if let Some(width) = image.width {
|
|
1708
|
-
image_hash.aset("width", width as i64)?;
|
|
1709
|
-
} else {
|
|
1710
|
-
image_hash.aset("width", ruby.qnil().as_value())?;
|
|
1711
|
-
}
|
|
1712
|
-
if let Some(height) = image.height {
|
|
1713
|
-
image_hash.aset("height", height as i64)?;
|
|
1714
|
-
} else {
|
|
1715
|
-
image_hash.aset("height", ruby.qnil().as_value())?;
|
|
1716
|
-
}
|
|
1717
|
-
if let Some(colorspace) = image.colorspace {
|
|
1718
|
-
image_hash.aset("colorspace", colorspace)?;
|
|
1719
|
-
} else {
|
|
1720
|
-
image_hash.aset("colorspace", ruby.qnil().as_value())?;
|
|
1721
|
-
}
|
|
1722
|
-
if let Some(bits) = image.bits_per_component {
|
|
1723
|
-
image_hash.aset("bits_per_component", bits as i64)?;
|
|
1724
|
-
} else {
|
|
1725
|
-
image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
|
|
1726
|
-
}
|
|
1727
|
-
image_hash.aset(
|
|
1728
|
-
"is_mask",
|
|
1729
|
-
if image.is_mask {
|
|
1730
|
-
ruby.qtrue().as_value()
|
|
1731
|
-
} else {
|
|
1732
|
-
ruby.qfalse().as_value()
|
|
1733
|
-
},
|
|
1734
|
-
)?;
|
|
1735
|
-
if let Some(description) = image.description {
|
|
1736
|
-
image_hash.aset("description", description)?;
|
|
1737
|
-
} else {
|
|
1738
|
-
image_hash.aset("description", ruby.qnil().as_value())?;
|
|
1739
|
-
}
|
|
1740
|
-
if let Some(ocr_result) = image.ocr_result {
|
|
1741
|
-
let nested = extraction_result_to_ruby(ruby, *ocr_result)?;
|
|
1742
|
-
image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
|
|
1743
|
-
} else {
|
|
1744
|
-
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
1745
|
-
}
|
|
1746
|
-
images_array.push(image_hash)?;
|
|
1747
|
-
}
|
|
1748
|
-
set_hash_entry(ruby, &hash, "images", images_array.into_value_with(ruby))?;
|
|
1749
|
-
} else {
|
|
1750
|
-
set_hash_entry(ruby, &hash, "images", ruby.qnil().as_value())?;
|
|
1751
|
-
}
|
|
1752
|
-
|
|
1753
|
-
if let Some(page_content_list) = result.pages {
|
|
1754
|
-
let pages_array = ruby.ary_new();
|
|
1755
|
-
for page_content in page_content_list {
|
|
1756
|
-
let page_hash = ruby.hash_new();
|
|
1757
|
-
page_hash.aset("page_number", page_content.page_number as i64)?;
|
|
1758
|
-
page_hash.aset("content", page_content.content)?;
|
|
1759
|
-
|
|
1760
|
-
let tables_array = ruby.ary_new();
|
|
1761
|
-
for table in page_content.tables {
|
|
1762
|
-
let table_hash = ruby.hash_new();
|
|
1763
|
-
|
|
1764
|
-
let cells_array = ruby.ary_new();
|
|
1765
|
-
for row in table.cells.clone() {
|
|
1766
|
-
let row_array = ruby.ary_from_vec(row);
|
|
1767
|
-
cells_array.push(row_array)?;
|
|
1768
|
-
}
|
|
1769
|
-
table_hash.aset("cells", cells_array)?;
|
|
1770
|
-
table_hash.aset("markdown", table.markdown.clone())?;
|
|
1771
|
-
table_hash.aset("page_number", table.page_number as i64)?;
|
|
1772
|
-
|
|
1773
|
-
tables_array.push(table_hash)?;
|
|
1774
|
-
}
|
|
1775
|
-
page_hash.aset("tables", tables_array)?;
|
|
1776
|
-
|
|
1777
|
-
let images_array = ruby.ary_new();
|
|
1778
|
-
for image in page_content.images {
|
|
1779
|
-
let image_hash = ruby.hash_new();
|
|
1780
|
-
let data_value = ruby.str_from_slice(&image.data).into_value_with(ruby);
|
|
1781
|
-
image_hash.aset("data", data_value)?;
|
|
1782
|
-
image_hash.aset("format", image.format.clone())?;
|
|
1783
|
-
image_hash.aset("image_index", image.image_index as i64)?;
|
|
1784
|
-
if let Some(page) = image.page_number {
|
|
1785
|
-
image_hash.aset("page_number", page as i64)?;
|
|
1786
|
-
} else {
|
|
1787
|
-
image_hash.aset("page_number", ruby.qnil().as_value())?;
|
|
1788
|
-
}
|
|
1789
|
-
if let Some(width) = image.width {
|
|
1790
|
-
image_hash.aset("width", width as i64)?;
|
|
1791
|
-
} else {
|
|
1792
|
-
image_hash.aset("width", ruby.qnil().as_value())?;
|
|
1793
|
-
}
|
|
1794
|
-
if let Some(height) = image.height {
|
|
1795
|
-
image_hash.aset("height", height as i64)?;
|
|
1796
|
-
} else {
|
|
1797
|
-
image_hash.aset("height", ruby.qnil().as_value())?;
|
|
1798
|
-
}
|
|
1799
|
-
if let Some(colorspace) = &image.colorspace {
|
|
1800
|
-
image_hash.aset("colorspace", colorspace.clone())?;
|
|
1801
|
-
} else {
|
|
1802
|
-
image_hash.aset("colorspace", ruby.qnil().as_value())?;
|
|
1803
|
-
}
|
|
1804
|
-
if let Some(bits) = image.bits_per_component {
|
|
1805
|
-
image_hash.aset("bits_per_component", bits as i64)?;
|
|
1806
|
-
} else {
|
|
1807
|
-
image_hash.aset("bits_per_component", ruby.qnil().as_value())?;
|
|
1808
|
-
}
|
|
1809
|
-
image_hash.aset(
|
|
1810
|
-
"is_mask",
|
|
1811
|
-
if image.is_mask {
|
|
1812
|
-
ruby.qtrue().as_value()
|
|
1813
|
-
} else {
|
|
1814
|
-
ruby.qfalse().as_value()
|
|
1815
|
-
},
|
|
1816
|
-
)?;
|
|
1817
|
-
if let Some(description) = &image.description {
|
|
1818
|
-
image_hash.aset("description", description.clone())?;
|
|
1819
|
-
} else {
|
|
1820
|
-
image_hash.aset("description", ruby.qnil().as_value())?;
|
|
1821
|
-
}
|
|
1822
|
-
if let Some(ocr_result) = &image.ocr_result {
|
|
1823
|
-
let nested = extraction_result_to_ruby(ruby, (**ocr_result).clone())?;
|
|
1824
|
-
image_hash.aset("ocr_result", nested.into_value_with(ruby))?;
|
|
1825
|
-
} else {
|
|
1826
|
-
image_hash.aset("ocr_result", ruby.qnil().as_value())?;
|
|
1827
|
-
}
|
|
1828
|
-
images_array.push(image_hash)?;
|
|
1829
|
-
}
|
|
1830
|
-
page_hash.aset("images", images_array)?;
|
|
1831
|
-
|
|
1832
|
-
pages_array.push(page_hash)?;
|
|
1833
|
-
}
|
|
1834
|
-
set_hash_entry(ruby, &hash, "pages", pages_array.into_value_with(ruby))?;
|
|
1835
|
-
} else {
|
|
1836
|
-
set_hash_entry(ruby, &hash, "pages", ruby.qnil().as_value())?;
|
|
1837
|
-
}
|
|
1838
|
-
|
|
1839
|
-
Ok(hash)
|
|
1840
|
-
}
|
|
1841
|
-
|
|
1842
|
-
/// Extract content from a file (synchronous).
|
|
1843
|
-
///
|
|
1844
|
-
/// @param path [String] Path to the file
|
|
1845
|
-
/// @param mime_type [String, nil] Optional MIME type hint
|
|
1846
|
-
/// @param options [Hash] Extraction configuration
|
|
1847
|
-
/// @return [Hash] Extraction result with :content, :mime_type, :metadata, :tables, etc.
|
|
1848
|
-
///
|
|
1849
|
-
/// @example Basic usage
|
|
1850
|
-
/// result = Kreuzberg.extract_file_sync("document.pdf")
|
|
1851
|
-
/// puts result[:content]
|
|
1852
|
-
///
|
|
1853
|
-
/// @example With OCR
|
|
1854
|
-
/// result = Kreuzberg.extract_file_sync("scanned.pdf", nil, force_ocr: true)
|
|
1855
|
-
///
|
|
1856
|
-
fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
1857
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1858
|
-
let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
|
|
1859
|
-
let (path,) = args.required;
|
|
1860
|
-
let (mime_type,) = args.optional;
|
|
1861
|
-
let opts = Some(args.keywords);
|
|
1862
|
-
|
|
1863
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
1864
|
-
|
|
1865
|
-
let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config).map_err(kreuzberg_error)?;
|
|
1866
|
-
|
|
1867
|
-
extraction_result_to_ruby(&ruby, result)
|
|
1868
|
-
}
|
|
1869
|
-
|
|
1870
|
-
/// Extract content from bytes (synchronous).
|
|
1871
|
-
///
|
|
1872
|
-
/// @param data [String] Binary data to extract
|
|
1873
|
-
/// @param mime_type [String] MIME type of the data
|
|
1874
|
-
/// @param options [Hash] Extraction configuration
|
|
1875
|
-
/// @return [Hash] Extraction result
|
|
1876
|
-
///
|
|
1877
|
-
/// @example
|
|
1878
|
-
/// data = File.binread("document.pdf")
|
|
1879
|
-
/// result = Kreuzberg.extract_bytes_sync(data, "application/pdf")
|
|
1880
|
-
///
|
|
1881
|
-
fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
1882
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1883
|
-
let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
|
|
1884
|
-
let (data, mime_type) = args.required;
|
|
1885
|
-
let opts = Some(args.keywords);
|
|
1886
|
-
|
|
1887
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
1888
|
-
|
|
1889
|
-
let bytes = unsafe { data.as_slice() };
|
|
1890
|
-
let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
|
|
1891
|
-
|
|
1892
|
-
extraction_result_to_ruby(&ruby, result)
|
|
1893
|
-
}
|
|
1894
|
-
|
|
1895
|
-
/// Batch extract content from multiple files (synchronous).
|
|
1896
|
-
///
|
|
1897
|
-
/// @param paths [Array<String>] List of file paths
|
|
1898
|
-
/// @param options [Hash] Extraction configuration
|
|
1899
|
-
/// @return [Array<Hash>] Array of extraction results
|
|
1900
|
-
///
|
|
1901
|
-
/// @example
|
|
1902
|
-
/// paths = ["doc1.pdf", "doc2.docx", "doc3.xlsx"]
|
|
1903
|
-
/// results = Kreuzberg.batch_extract_files_sync(paths)
|
|
1904
|
-
/// results.each { |r| puts r[:content] }
|
|
1905
|
-
///
|
|
1906
|
-
fn batch_extract_files_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
1907
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1908
|
-
let args = scan_args::<(RArray,), (), (), (), RHash, ()>(args)?;
|
|
1909
|
-
let (paths_array,) = args.required;
|
|
1910
|
-
let opts = Some(args.keywords);
|
|
1911
|
-
|
|
1912
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
1913
|
-
|
|
1914
|
-
let paths: Vec<String> = paths_array.to_vec::<String>()?;
|
|
1915
|
-
|
|
1916
|
-
let results = kreuzberg::batch_extract_file_sync(paths, &config).map_err(kreuzberg_error)?;
|
|
1917
|
-
|
|
1918
|
-
let results_array = ruby.ary_new();
|
|
1919
|
-
for result in results {
|
|
1920
|
-
results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
|
|
1921
|
-
}
|
|
1922
|
-
|
|
1923
|
-
Ok(results_array)
|
|
1924
|
-
}
|
|
1925
|
-
|
|
1926
|
-
/// Extract content from a file (asynchronous).
|
|
1927
|
-
///
|
|
1928
|
-
/// Note: Ruby doesn't have native async/await, so this uses a blocking Tokio runtime.
|
|
1929
|
-
/// For true async behavior, use the synchronous version in a background thread.
|
|
1930
|
-
///
|
|
1931
|
-
/// @param path [String] Path to the file
|
|
1932
|
-
/// @param mime_type [String, nil] Optional MIME type hint
|
|
1933
|
-
/// @param options [Hash] Extraction configuration
|
|
1934
|
-
/// @return [Hash] Extraction result
|
|
1935
|
-
///
|
|
1936
|
-
fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
1937
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1938
|
-
let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
|
|
1939
|
-
let (path,) = args.required;
|
|
1940
|
-
let (mime_type,) = args.optional;
|
|
1941
|
-
let opts = Some(args.keywords);
|
|
1942
|
-
|
|
1943
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
1944
|
-
|
|
1945
|
-
let runtime =
|
|
1946
|
-
tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
1947
|
-
|
|
1948
|
-
let result = runtime
|
|
1949
|
-
.block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
|
|
1950
|
-
.map_err(kreuzberg_error)?;
|
|
1951
|
-
|
|
1952
|
-
extraction_result_to_ruby(&ruby, result)
|
|
1953
|
-
}
|
|
1954
|
-
|
|
1955
|
-
/// Extract content from bytes (asynchronous).
|
|
1956
|
-
///
|
|
1957
|
-
/// @param data [String] Binary data
|
|
1958
|
-
/// @param mime_type [String] MIME type
|
|
1959
|
-
/// @param options [Hash] Extraction configuration
|
|
1960
|
-
/// @return [Hash] Extraction result
|
|
1961
|
-
///
|
|
1962
|
-
fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
1963
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1964
|
-
let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
|
|
1965
|
-
let (data, mime_type) = args.required;
|
|
1966
|
-
let opts = Some(args.keywords);
|
|
1967
|
-
|
|
1968
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
1969
|
-
|
|
1970
|
-
let runtime =
|
|
1971
|
-
tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
1972
|
-
|
|
1973
|
-
let bytes = unsafe { data.as_slice() };
|
|
1974
|
-
let result = runtime
|
|
1975
|
-
.block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
|
|
1976
|
-
.map_err(kreuzberg_error)?;
|
|
1977
|
-
|
|
1978
|
-
extraction_result_to_ruby(&ruby, result)
|
|
1979
|
-
}
|
|
1980
|
-
|
|
1981
|
-
/// Batch extract content from multiple files (asynchronous).
|
|
1982
|
-
///
|
|
1983
|
-
/// @param paths [Array<String>] List of file paths
|
|
1984
|
-
/// @param options [Hash] Extraction configuration
|
|
1985
|
-
/// @return [Array<Hash>] Array of extraction results
|
|
1986
|
-
///
|
|
1987
|
-
fn batch_extract_files(args: &[Value]) -> Result<RArray, Error> {
|
|
1988
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
1989
|
-
let args = scan_args::<(RArray,), (), (), (), RHash, ()>(args)?;
|
|
1990
|
-
let (paths_array,) = args.required;
|
|
1991
|
-
let opts = Some(args.keywords);
|
|
1992
|
-
|
|
1993
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
1994
|
-
|
|
1995
|
-
let paths: Vec<String> = paths_array.to_vec::<String>()?;
|
|
1996
|
-
|
|
1997
|
-
let runtime =
|
|
1998
|
-
tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
1999
|
-
|
|
2000
|
-
let results = runtime
|
|
2001
|
-
.block_on(async { kreuzberg::batch_extract_file(paths, &config).await })
|
|
2002
|
-
.map_err(kreuzberg_error)?;
|
|
2003
|
-
|
|
2004
|
-
let results_array = ruby.ary_new();
|
|
2005
|
-
for result in results {
|
|
2006
|
-
results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
|
|
2007
|
-
}
|
|
2008
|
-
|
|
2009
|
-
Ok(results_array)
|
|
2010
|
-
}
|
|
2011
|
-
|
|
2012
|
-
/// Batch extract content from multiple byte arrays (synchronous).
|
|
2013
|
-
///
|
|
2014
|
-
/// @param bytes_array [Array<String>] List of binary data strings
|
|
2015
|
-
/// @param mime_types [Array<String>] List of MIME types corresponding to each byte array
|
|
2016
|
-
/// @param options [Hash] Extraction configuration
|
|
2017
|
-
/// @return [Array<Hash>] Array of extraction results
|
|
2018
|
-
///
|
|
2019
|
-
/// @example
|
|
2020
|
-
/// data1 = File.binread("document.pdf")
|
|
2021
|
-
/// data2 = File.binread("invoice.docx")
|
|
2022
|
-
/// results = Kreuzberg.batch_extract_bytes_sync([data1, data2], ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"])
|
|
2023
|
-
///
|
|
2024
|
-
fn batch_extract_bytes_sync(args: &[Value]) -> Result<RArray, Error> {
|
|
2025
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2026
|
-
let args = scan_args::<(RArray, RArray), (), (), (), RHash, ()>(args)?;
|
|
2027
|
-
let (bytes_array, mime_types_array) = args.required;
|
|
2028
|
-
let opts = Some(args.keywords);
|
|
2029
|
-
|
|
2030
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
2031
|
-
|
|
2032
|
-
let bytes_vec: Vec<RString> = bytes_array
|
|
2033
|
-
.into_iter()
|
|
2034
|
-
.map(RString::try_convert)
|
|
2035
|
-
.collect::<Result<_, _>>()?;
|
|
2036
|
-
let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
|
|
2037
|
-
|
|
2038
|
-
if bytes_vec.len() != mime_types.len() {
|
|
2039
|
-
return Err(runtime_error(format!(
|
|
2040
|
-
"bytes_array and mime_types must have the same length: {} vs {}",
|
|
2041
|
-
bytes_vec.len(),
|
|
2042
|
-
mime_types.len()
|
|
2043
|
-
)));
|
|
2044
|
-
}
|
|
2045
|
-
|
|
2046
|
-
let contents: Vec<(Vec<u8>, String)> = bytes_vec
|
|
2047
|
-
.iter()
|
|
2048
|
-
.zip(mime_types.iter())
|
|
2049
|
-
.map(|(bytes, mime)| (unsafe { bytes.as_slice() }.to_vec(), mime.clone()))
|
|
2050
|
-
.collect();
|
|
2051
|
-
|
|
2052
|
-
let results = kreuzberg::batch_extract_bytes_sync(contents, &config).map_err(kreuzberg_error)?;
|
|
2053
|
-
|
|
2054
|
-
let results_array = ruby.ary_new();
|
|
2055
|
-
for result in results {
|
|
2056
|
-
results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
|
|
2057
|
-
}
|
|
2058
|
-
|
|
2059
|
-
Ok(results_array)
|
|
2060
|
-
}
|
|
2061
|
-
|
|
2062
|
-
/// Batch extract content from multiple byte arrays (asynchronous).
|
|
2063
|
-
///
|
|
2064
|
-
/// @param bytes_array [Array<String>] List of binary data strings
|
|
2065
|
-
/// @param mime_types [Array<String>] List of MIME types corresponding to each byte array
|
|
2066
|
-
/// @param options [Hash] Extraction configuration
|
|
2067
|
-
/// @return [Array<Hash>] Array of extraction results
|
|
2068
|
-
///
|
|
2069
|
-
fn batch_extract_bytes(args: &[Value]) -> Result<RArray, Error> {
|
|
2070
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2071
|
-
let args = scan_args::<(RArray, RArray), (), (), (), RHash, ()>(args)?;
|
|
2072
|
-
let (bytes_array, mime_types_array) = args.required;
|
|
2073
|
-
let opts = Some(args.keywords);
|
|
2074
|
-
|
|
2075
|
-
let config = parse_extraction_config(&ruby, opts)?;
|
|
2076
|
-
|
|
2077
|
-
let bytes_vec: Vec<RString> = bytes_array
|
|
2078
|
-
.into_iter()
|
|
2079
|
-
.map(RString::try_convert)
|
|
2080
|
-
.collect::<Result<_, _>>()?;
|
|
2081
|
-
let mime_types: Vec<String> = mime_types_array.to_vec::<String>()?;
|
|
2082
|
-
|
|
2083
|
-
if bytes_vec.len() != mime_types.len() {
|
|
2084
|
-
return Err(runtime_error(format!(
|
|
2085
|
-
"bytes_array and mime_types must have the same length: {} vs {}",
|
|
2086
|
-
bytes_vec.len(),
|
|
2087
|
-
mime_types.len()
|
|
2088
|
-
)));
|
|
2089
|
-
}
|
|
2090
|
-
|
|
2091
|
-
let contents: Vec<(Vec<u8>, String)> = bytes_vec
|
|
2092
|
-
.iter()
|
|
2093
|
-
.zip(mime_types.iter())
|
|
2094
|
-
.map(|(bytes, mime)| (unsafe { bytes.as_slice() }.to_vec(), mime.clone()))
|
|
2095
|
-
.collect();
|
|
2096
|
-
|
|
2097
|
-
let runtime =
|
|
2098
|
-
tokio::runtime::Runtime::new().map_err(|e| runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
2099
|
-
|
|
2100
|
-
let results = runtime
|
|
2101
|
-
.block_on(async { kreuzberg::batch_extract_bytes(contents, &config).await })
|
|
2102
|
-
.map_err(kreuzberg_error)?;
|
|
2103
|
-
|
|
2104
|
-
let results_array = ruby.ary_new();
|
|
2105
|
-
for result in results {
|
|
2106
|
-
results_array.push(extraction_result_to_ruby(&ruby, result)?)?;
|
|
2107
|
-
}
|
|
2108
|
-
|
|
2109
|
-
Ok(results_array)
|
|
2110
|
-
}
|
|
2111
|
-
|
|
2112
|
-
/// Clear all cache entries.
|
|
2113
|
-
///
|
|
2114
|
-
/// @return [void]
|
|
2115
|
-
///
|
|
2116
|
-
/// @example
|
|
2117
|
-
/// Kreuzberg.clear_cache
|
|
2118
|
-
///
|
|
2119
|
-
fn ruby_clear_cache() -> Result<(), Error> {
|
|
2120
|
-
let cache_root = cache_root_dir()?;
|
|
2121
|
-
if !cache_root.exists() {
|
|
2122
|
-
return Ok(());
|
|
2123
|
-
}
|
|
2124
|
-
|
|
2125
|
-
for dir in cache_directories(&cache_root)? {
|
|
2126
|
-
let Some(dir_str) = dir.to_str() else {
|
|
2127
|
-
return Err(runtime_error("Cache directory path contains non-UTF8 characters"));
|
|
2128
|
-
};
|
|
2129
|
-
|
|
2130
|
-
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
2131
|
-
kreuzberg::cache::clear_cache_directory(dir_str).map_err(kreuzberg_error)?;
|
|
2132
|
-
}
|
|
2133
|
-
|
|
2134
|
-
Ok(())
|
|
2135
|
-
}
|
|
2136
|
-
|
|
2137
|
-
/// Get cache statistics.
|
|
2138
|
-
///
|
|
2139
|
-
/// @return [Hash] Cache statistics with :total_entries and :total_size_bytes
|
|
2140
|
-
///
|
|
2141
|
-
/// @example
|
|
2142
|
-
/// stats = Kreuzberg.cache_stats
|
|
2143
|
-
/// puts "Cache entries: #{stats[:total_entries]}"
|
|
2144
|
-
/// puts "Cache size: #{stats[:total_size_bytes]} bytes"
|
|
2145
|
-
///
|
|
2146
|
-
fn ruby_cache_stats() -> Result<RHash, Error> {
|
|
2147
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2148
|
-
|
|
2149
|
-
let hash = ruby.hash_new();
|
|
2150
|
-
let cache_root = cache_root_dir()?;
|
|
2151
|
-
|
|
2152
|
-
if !cache_root.exists() {
|
|
2153
|
-
hash.aset("total_entries", 0)?;
|
|
2154
|
-
hash.aset("total_size_bytes", 0)?;
|
|
2155
|
-
return Ok(hash);
|
|
2156
|
-
}
|
|
2157
|
-
|
|
2158
|
-
let mut total_entries: usize = 0;
|
|
2159
|
-
let mut total_bytes: f64 = 0.0;
|
|
2160
|
-
|
|
2161
|
-
for dir in cache_directories(&cache_root)? {
|
|
2162
|
-
let Some(dir_str) = dir.to_str() else {
|
|
2163
|
-
return Err(runtime_error("Cache directory path contains non-UTF8 characters"));
|
|
2164
|
-
};
|
|
2165
|
-
|
|
2166
|
-
// OSError/RuntimeError must bubble up - system errors need user reports ~keep
|
|
2167
|
-
let stats = kreuzberg::cache::get_cache_metadata(dir_str).map_err(kreuzberg_error)?;
|
|
2168
|
-
total_entries += stats.total_files;
|
|
2169
|
-
total_bytes += stats.total_size_mb * 1024.0 * 1024.0;
|
|
2170
|
-
}
|
|
2171
|
-
|
|
2172
|
-
set_hash_entry(
|
|
2173
|
-
&ruby,
|
|
2174
|
-
&hash,
|
|
2175
|
-
"total_entries",
|
|
2176
|
-
ruby.integer_from_u64(total_entries as u64).into_value_with(&ruby),
|
|
2177
|
-
)?;
|
|
2178
|
-
set_hash_entry(
|
|
2179
|
-
&ruby,
|
|
2180
|
-
&hash,
|
|
2181
|
-
"total_size_bytes",
|
|
2182
|
-
ruby.integer_from_u64(total_bytes.round() as u64).into_value_with(&ruby),
|
|
2183
|
-
)?;
|
|
2184
|
-
|
|
2185
|
-
Ok(hash)
|
|
2186
|
-
}
|
|
2187
|
-
|
|
2188
|
-
/// Register a post-processor plugin.
|
|
2189
|
-
///
|
|
2190
|
-
/// @param name [String] Unique identifier for the post-processor
|
|
2191
|
-
/// @param processor [Proc] Ruby Proc/lambda that processes extraction results
|
|
2192
|
-
/// @param priority [Integer] Execution priority (default: 50, higher = runs first)
|
|
2193
|
-
/// @return [nil]
|
|
2194
|
-
///
|
|
2195
|
-
/// # Example
|
|
2196
|
-
/// ```text
|
|
2197
|
-
/// Kreuzberg.register_post_processor("uppercase", ->(result) {
|
|
2198
|
-
/// result[:content] = result[:content].upcase
|
|
2199
|
-
/// result
|
|
2200
|
-
/// }, 100)
|
|
2201
|
-
/// ```
|
|
2202
|
-
fn register_post_processor(args: &[Value]) -> Result<(), Error> {
|
|
2203
|
-
let _ruby = Ruby::get().expect("Ruby not initialized");
|
|
2204
|
-
let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
|
|
2205
|
-
let (name, processor) = args.required;
|
|
2206
|
-
let (priority,) = args.optional;
|
|
2207
|
-
let priority = priority.unwrap_or(50);
|
|
2208
|
-
|
|
2209
|
-
if !processor.respond_to("call", true)? {
|
|
2210
|
-
return Err(runtime_error("Post-processor must be a Proc or respond to 'call'"));
|
|
2211
|
-
}
|
|
2212
|
-
|
|
2213
|
-
use async_trait::async_trait;
|
|
2214
|
-
use kreuzberg::plugins::{Plugin, PostProcessor, ProcessingStage};
|
|
2215
|
-
use std::sync::Arc;
|
|
2216
|
-
|
|
2217
|
-
struct RubyPostProcessor {
|
|
2218
|
-
name: String,
|
|
2219
|
-
processor: GcGuardedValue,
|
|
2220
|
-
}
|
|
2221
|
-
|
|
2222
|
-
unsafe impl Send for RubyPostProcessor {}
|
|
2223
|
-
unsafe impl Sync for RubyPostProcessor {}
|
|
2224
|
-
|
|
2225
|
-
impl Plugin for RubyPostProcessor {
|
|
2226
|
-
fn name(&self) -> &str {
|
|
2227
|
-
&self.name
|
|
2228
|
-
}
|
|
2229
|
-
|
|
2230
|
-
fn version(&self) -> String {
|
|
2231
|
-
"1.0.0".to_string()
|
|
2232
|
-
}
|
|
2233
|
-
|
|
2234
|
-
fn initialize(&self) -> kreuzberg::Result<()> {
|
|
2235
|
-
Ok(())
|
|
2236
|
-
}
|
|
2237
|
-
|
|
2238
|
-
fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
2239
|
-
Ok(())
|
|
2240
|
-
}
|
|
2241
|
-
}
|
|
2242
|
-
|
|
2243
|
-
#[async_trait]
|
|
2244
|
-
impl PostProcessor for RubyPostProcessor {
|
|
2245
|
-
async fn process(
|
|
2246
|
-
&self,
|
|
2247
|
-
result: &mut kreuzberg::ExtractionResult,
|
|
2248
|
-
_config: &kreuzberg::ExtractionConfig,
|
|
2249
|
-
) -> kreuzberg::Result<()> {
|
|
2250
|
-
let processor_name = self.name.clone();
|
|
2251
|
-
let processor = self.processor.value();
|
|
2252
|
-
let result_clone = result.clone();
|
|
2253
|
-
|
|
2254
|
-
let updated_result = tokio::task::block_in_place(|| {
|
|
2255
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2256
|
-
let result_hash = extraction_result_to_ruby(&ruby, result_clone.clone()).map_err(|e| {
|
|
2257
|
-
kreuzberg::KreuzbergError::Plugin {
|
|
2258
|
-
message: format!("Failed to convert result to Ruby: {}", e),
|
|
2259
|
-
plugin_name: processor_name.clone(),
|
|
2260
|
-
}
|
|
2261
|
-
})?;
|
|
2262
|
-
|
|
2263
|
-
let modified = processor
|
|
2264
|
-
.funcall::<_, _, magnus::Value>("call", (result_hash,))
|
|
2265
|
-
.map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2266
|
-
message: format!("Ruby post-processor failed: {}", e),
|
|
2267
|
-
plugin_name: processor_name.clone(),
|
|
2268
|
-
})?;
|
|
2269
|
-
|
|
2270
|
-
let modified_hash =
|
|
2271
|
-
magnus::RHash::try_convert(modified).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2272
|
-
message: format!("Post-processor must return a Hash: {}", e),
|
|
2273
|
-
plugin_name: processor_name.clone(),
|
|
2274
|
-
})?;
|
|
2275
|
-
|
|
2276
|
-
let mut updated_result = result_clone;
|
|
2277
|
-
|
|
2278
|
-
if let Some(content_val) = get_kw(&ruby, modified_hash, "content") {
|
|
2279
|
-
let new_content =
|
|
2280
|
-
String::try_convert(content_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2281
|
-
message: format!("Failed to convert content: {}", e),
|
|
2282
|
-
plugin_name: processor_name.clone(),
|
|
2283
|
-
})?;
|
|
2284
|
-
updated_result.content = new_content;
|
|
2285
|
-
}
|
|
2286
|
-
|
|
2287
|
-
if let Some(mime_val) = get_kw(&ruby, modified_hash, "mime_type") {
|
|
2288
|
-
let new_mime = String::try_convert(mime_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2289
|
-
message: format!("Failed to convert mime_type: {}", e),
|
|
2290
|
-
plugin_name: processor_name.clone(),
|
|
2291
|
-
})?;
|
|
2292
|
-
updated_result.mime_type = new_mime;
|
|
2293
|
-
}
|
|
2294
|
-
|
|
2295
|
-
if let Some(metadata_val) = get_kw(&ruby, modified_hash, "metadata") {
|
|
2296
|
-
if metadata_val.is_nil() {
|
|
2297
|
-
updated_result.metadata = kreuzberg::types::Metadata::default();
|
|
2298
|
-
} else {
|
|
2299
|
-
let metadata_json =
|
|
2300
|
-
ruby_value_to_json(metadata_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2301
|
-
message: format!("Metadata must be JSON-serializable: {}", e),
|
|
2302
|
-
plugin_name: processor_name.clone(),
|
|
2303
|
-
})?;
|
|
2304
|
-
let metadata: kreuzberg::types::Metadata =
|
|
2305
|
-
serde_json::from_value(metadata_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2306
|
-
message: format!("Failed to deserialize metadata: {}", e),
|
|
2307
|
-
plugin_name: processor_name.clone(),
|
|
2308
|
-
})?;
|
|
2309
|
-
updated_result.metadata = metadata;
|
|
2310
|
-
}
|
|
2311
|
-
}
|
|
2312
|
-
|
|
2313
|
-
if let Some(tables_val) = get_kw(&ruby, modified_hash, "tables") {
|
|
2314
|
-
let tables_json =
|
|
2315
|
-
ruby_value_to_json(tables_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2316
|
-
message: format!("Tables must be JSON-serializable: {}", e),
|
|
2317
|
-
plugin_name: processor_name.clone(),
|
|
2318
|
-
})?;
|
|
2319
|
-
if tables_json.is_null() {
|
|
2320
|
-
updated_result.tables.clear();
|
|
2321
|
-
} else {
|
|
2322
|
-
let tables: Vec<kreuzberg::types::Table> =
|
|
2323
|
-
serde_json::from_value(tables_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2324
|
-
message: format!("Failed to deserialize tables: {}", e),
|
|
2325
|
-
plugin_name: processor_name.clone(),
|
|
2326
|
-
})?;
|
|
2327
|
-
updated_result.tables = tables;
|
|
2328
|
-
}
|
|
2329
|
-
}
|
|
2330
|
-
|
|
2331
|
-
if let Some(languages_val) = get_kw(&ruby, modified_hash, "detected_languages") {
|
|
2332
|
-
if languages_val.is_nil() {
|
|
2333
|
-
updated_result.detected_languages = None;
|
|
2334
|
-
} else {
|
|
2335
|
-
let langs_json =
|
|
2336
|
-
ruby_value_to_json(languages_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2337
|
-
message: format!("detected_languages must be JSON-serializable: {}", e),
|
|
2338
|
-
plugin_name: processor_name.clone(),
|
|
2339
|
-
})?;
|
|
2340
|
-
let languages: Vec<String> =
|
|
2341
|
-
serde_json::from_value(langs_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2342
|
-
message: format!("Failed to deserialize detected_languages: {}", e),
|
|
2343
|
-
plugin_name: processor_name.clone(),
|
|
2344
|
-
})?;
|
|
2345
|
-
updated_result.detected_languages = Some(languages);
|
|
2346
|
-
}
|
|
2347
|
-
}
|
|
2348
|
-
|
|
2349
|
-
if let Some(chunks_val) = get_kw(&ruby, modified_hash, "chunks") {
|
|
2350
|
-
if chunks_val.is_nil() {
|
|
2351
|
-
updated_result.chunks = None;
|
|
2352
|
-
} else {
|
|
2353
|
-
let chunks_json =
|
|
2354
|
-
ruby_value_to_json(chunks_val).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2355
|
-
message: format!("Chunks must be JSON-serializable: {}", e),
|
|
2356
|
-
plugin_name: processor_name.clone(),
|
|
2357
|
-
})?;
|
|
2358
|
-
let chunks: Vec<kreuzberg::types::Chunk> =
|
|
2359
|
-
serde_json::from_value(chunks_json).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2360
|
-
message: format!("Failed to deserialize chunks: {}", e),
|
|
2361
|
-
plugin_name: processor_name.clone(),
|
|
2362
|
-
})?;
|
|
2363
|
-
updated_result.chunks = Some(chunks);
|
|
2364
|
-
}
|
|
2365
|
-
}
|
|
2366
|
-
|
|
2367
|
-
Ok::<kreuzberg::ExtractionResult, kreuzberg::KreuzbergError>(updated_result)
|
|
2368
|
-
})?;
|
|
2369
|
-
|
|
2370
|
-
*result = updated_result;
|
|
2371
|
-
Ok(())
|
|
2372
|
-
}
|
|
2373
|
-
|
|
2374
|
-
fn processing_stage(&self) -> ProcessingStage {
|
|
2375
|
-
ProcessingStage::Late
|
|
2376
|
-
}
|
|
2377
|
-
}
|
|
2378
|
-
|
|
2379
|
-
let processor_impl = Arc::new(RubyPostProcessor {
|
|
2380
|
-
name: name.clone(),
|
|
2381
|
-
processor: GcGuardedValue::new(processor),
|
|
2382
|
-
});
|
|
2383
|
-
|
|
2384
|
-
let registry = kreuzberg::get_post_processor_registry();
|
|
2385
|
-
registry
|
|
2386
|
-
.write()
|
|
2387
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2388
|
-
.register(processor_impl, priority)
|
|
2389
|
-
.map_err(kreuzberg_error)?;
|
|
2390
|
-
|
|
2391
|
-
Ok(())
|
|
2392
|
-
}
|
|
2393
|
-
|
|
2394
|
-
/// Register a validator plugin.
|
|
2395
|
-
///
|
|
2396
|
-
/// @param name [String] Unique identifier for the validator
|
|
2397
|
-
/// @param validator [Proc] Ruby Proc/lambda that validates extraction results
|
|
2398
|
-
/// @param priority [Integer] Execution priority (default: 50, higher = runs first)
|
|
2399
|
-
/// @return [nil]
|
|
2400
|
-
///
|
|
2401
|
-
/// # Example
|
|
2402
|
-
/// ```text
|
|
2403
|
-
/// Kreuzberg.register_validator("min_length", ->(result) {
|
|
2404
|
-
/// raise "Content too short" if result[:content].length < 100
|
|
2405
|
-
/// }, 100)
|
|
2406
|
-
/// ```
|
|
2407
|
-
fn register_validator(args: &[Value]) -> Result<(), Error> {
|
|
2408
|
-
let _ruby = Ruby::get().expect("Ruby not initialized");
|
|
2409
|
-
let args = scan_args::<(String, Value), (Option<i32>,), (), (), (), ()>(args)?;
|
|
2410
|
-
let (name, validator) = args.required;
|
|
2411
|
-
let (priority,) = args.optional;
|
|
2412
|
-
let priority = priority.unwrap_or(50);
|
|
2413
|
-
|
|
2414
|
-
if !validator.respond_to("call", true)? {
|
|
2415
|
-
return Err(runtime_error("Validator must be a Proc or respond to 'call'"));
|
|
2416
|
-
}
|
|
2417
|
-
|
|
2418
|
-
use async_trait::async_trait;
|
|
2419
|
-
use kreuzberg::plugins::{Plugin, Validator};
|
|
2420
|
-
use std::sync::Arc;
|
|
2421
|
-
|
|
2422
|
-
struct RubyValidator {
|
|
2423
|
-
name: String,
|
|
2424
|
-
validator: GcGuardedValue,
|
|
2425
|
-
priority: i32,
|
|
2426
|
-
}
|
|
2427
|
-
|
|
2428
|
-
unsafe impl Send for RubyValidator {}
|
|
2429
|
-
unsafe impl Sync for RubyValidator {}
|
|
2430
|
-
|
|
2431
|
-
impl Plugin for RubyValidator {
|
|
2432
|
-
fn name(&self) -> &str {
|
|
2433
|
-
&self.name
|
|
2434
|
-
}
|
|
2435
|
-
|
|
2436
|
-
fn version(&self) -> String {
|
|
2437
|
-
"1.0.0".to_string()
|
|
2438
|
-
}
|
|
2439
|
-
|
|
2440
|
-
fn initialize(&self) -> kreuzberg::Result<()> {
|
|
2441
|
-
Ok(())
|
|
2442
|
-
}
|
|
2443
|
-
|
|
2444
|
-
fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
2445
|
-
Ok(())
|
|
2446
|
-
}
|
|
2447
|
-
}
|
|
2448
|
-
|
|
2449
|
-
#[async_trait]
|
|
2450
|
-
impl Validator for RubyValidator {
|
|
2451
|
-
async fn validate(
|
|
2452
|
-
&self,
|
|
2453
|
-
result: &kreuzberg::ExtractionResult,
|
|
2454
|
-
_config: &kreuzberg::ExtractionConfig,
|
|
2455
|
-
) -> kreuzberg::Result<()> {
|
|
2456
|
-
let validator_name = self.name.clone();
|
|
2457
|
-
let validator = self.validator.value();
|
|
2458
|
-
let result_clone = result.clone();
|
|
2459
|
-
|
|
2460
|
-
tokio::task::block_in_place(|| {
|
|
2461
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2462
|
-
let result_hash =
|
|
2463
|
-
extraction_result_to_ruby(&ruby, result_clone).map_err(|e| kreuzberg::KreuzbergError::Plugin {
|
|
2464
|
-
message: format!("Failed to convert result to Ruby: {}", e),
|
|
2465
|
-
plugin_name: validator_name.clone(),
|
|
2466
|
-
})?;
|
|
2467
|
-
|
|
2468
|
-
validator
|
|
2469
|
-
.funcall::<_, _, magnus::Value>("call", (result_hash,))
|
|
2470
|
-
.map_err(|e| kreuzberg::KreuzbergError::Validation {
|
|
2471
|
-
message: format!("Validation failed: {}", e),
|
|
2472
|
-
source: None,
|
|
2473
|
-
})?;
|
|
2474
|
-
|
|
2475
|
-
Ok(())
|
|
2476
|
-
})
|
|
2477
|
-
}
|
|
2478
|
-
|
|
2479
|
-
fn priority(&self) -> i32 {
|
|
2480
|
-
self.priority
|
|
2481
|
-
}
|
|
2482
|
-
}
|
|
2483
|
-
|
|
2484
|
-
let validator_impl = Arc::new(RubyValidator {
|
|
2485
|
-
name: name.clone(),
|
|
2486
|
-
validator: GcGuardedValue::new(validator),
|
|
2487
|
-
priority,
|
|
2488
|
-
});
|
|
2489
|
-
|
|
2490
|
-
let registry = kreuzberg::get_validator_registry();
|
|
2491
|
-
registry
|
|
2492
|
-
.write()
|
|
2493
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2494
|
-
.register(validator_impl)
|
|
2495
|
-
.map_err(kreuzberg_error)?;
|
|
2496
|
-
|
|
2497
|
-
Ok(())
|
|
2498
|
-
}
|
|
2499
|
-
|
|
2500
|
-
/// Register an OCR backend plugin.
|
|
2501
|
-
///
|
|
2502
|
-
/// @param name [String] Unique identifier for the OCR backend
|
|
2503
|
-
/// @param backend [Object] Ruby object implementing OCR backend interface
|
|
2504
|
-
/// @return [nil]
|
|
2505
|
-
///
|
|
2506
|
-
/// # Example
|
|
2507
|
-
/// ```text
|
|
2508
|
-
/// class CustomOcr
|
|
2509
|
-
/// def process_image(image_bytes, language)
|
|
2510
|
-
/// # Return extracted text
|
|
2511
|
-
/// "Extracted text"
|
|
2512
|
-
/// end
|
|
2513
|
-
///
|
|
2514
|
-
/// def supports_language?(lang)
|
|
2515
|
-
/// %w[eng deu fra].include?(lang)
|
|
2516
|
-
/// end
|
|
2517
|
-
/// end
|
|
2518
|
-
///
|
|
2519
|
-
/// Kreuzberg.register_ocr_backend("custom", CustomOcr.new)
|
|
2520
|
-
/// ```
|
|
2521
|
-
fn register_ocr_backend(name: String, backend: Value) -> Result<(), Error> {
|
|
2522
|
-
if !backend.respond_to("name", true)? {
|
|
2523
|
-
return Err(runtime_error("OCR backend must respond to 'name'"));
|
|
2524
|
-
}
|
|
2525
|
-
if !backend.respond_to("process_image", true)? {
|
|
2526
|
-
return Err(runtime_error("OCR backend must respond to 'process_image'"));
|
|
2527
|
-
}
|
|
2528
|
-
|
|
2529
|
-
use async_trait::async_trait;
|
|
2530
|
-
use kreuzberg::plugins::{OcrBackend, OcrBackendType, Plugin};
|
|
2531
|
-
use std::sync::Arc;
|
|
2532
|
-
|
|
2533
|
-
struct RubyOcrBackend {
|
|
2534
|
-
name: String,
|
|
2535
|
-
backend: GcGuardedValue,
|
|
2536
|
-
}
|
|
2537
|
-
|
|
2538
|
-
unsafe impl Send for RubyOcrBackend {}
|
|
2539
|
-
unsafe impl Sync for RubyOcrBackend {}
|
|
2540
|
-
|
|
2541
|
-
impl Plugin for RubyOcrBackend {
|
|
2542
|
-
fn name(&self) -> &str {
|
|
2543
|
-
&self.name
|
|
2544
|
-
}
|
|
2545
|
-
|
|
2546
|
-
fn version(&self) -> String {
|
|
2547
|
-
"1.0.0".to_string()
|
|
2548
|
-
}
|
|
2549
|
-
|
|
2550
|
-
fn initialize(&self) -> kreuzberg::Result<()> {
|
|
2551
|
-
Ok(())
|
|
2552
|
-
}
|
|
2553
|
-
|
|
2554
|
-
fn shutdown(&self) -> kreuzberg::Result<()> {
|
|
2555
|
-
Ok(())
|
|
2556
|
-
}
|
|
2557
|
-
}
|
|
2558
|
-
|
|
2559
|
-
#[async_trait]
|
|
2560
|
-
impl OcrBackend for RubyOcrBackend {
|
|
2561
|
-
async fn process_image(
|
|
2562
|
-
&self,
|
|
2563
|
-
image_bytes: &[u8],
|
|
2564
|
-
config: &kreuzberg::OcrConfig,
|
|
2565
|
-
) -> kreuzberg::Result<kreuzberg::ExtractionResult> {
|
|
2566
|
-
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
2567
|
-
let image_str = ruby.str_from_slice(image_bytes);
|
|
2568
|
-
|
|
2569
|
-
let config_hash = ocr_config_to_ruby_hash(&ruby, config).map_err(|e| kreuzberg::KreuzbergError::Ocr {
|
|
2570
|
-
message: format!("Failed to convert OCR config: {}", e),
|
|
2571
|
-
source: None,
|
|
2572
|
-
})?;
|
|
2573
|
-
|
|
2574
|
-
let response = self
|
|
2575
|
-
.backend
|
|
2576
|
-
.value()
|
|
2577
|
-
.funcall::<_, _, Value>("process_image", (image_str, config_hash.into_value_with(&ruby)))
|
|
2578
|
-
.map_err(|e| kreuzberg::KreuzbergError::Ocr {
|
|
2579
|
-
message: format!("Ruby OCR backend failed: {}", e),
|
|
2580
|
-
source: None,
|
|
2581
|
-
})?;
|
|
2582
|
-
|
|
2583
|
-
let text = String::try_convert(response).map_err(|e| kreuzberg::KreuzbergError::Ocr {
|
|
2584
|
-
message: format!("OCR backend must return a String: {}", e),
|
|
2585
|
-
source: None,
|
|
2586
|
-
})?;
|
|
2587
|
-
|
|
2588
|
-
Ok(kreuzberg::ExtractionResult {
|
|
2589
|
-
content: text,
|
|
2590
|
-
mime_type: "text/plain".to_string(),
|
|
2591
|
-
metadata: kreuzberg::types::Metadata::default(),
|
|
2592
|
-
tables: vec![],
|
|
2593
|
-
detected_languages: None,
|
|
2594
|
-
chunks: None,
|
|
2595
|
-
images: None,
|
|
2596
|
-
pages: None,
|
|
2597
|
-
})
|
|
2598
|
-
}
|
|
2599
|
-
|
|
2600
|
-
fn supports_language(&self, lang: &str) -> bool {
|
|
2601
|
-
match self.backend.value().respond_to("supports_language?", true) {
|
|
2602
|
-
Ok(true) => self
|
|
2603
|
-
.backend
|
|
2604
|
-
.value()
|
|
2605
|
-
.funcall::<_, _, bool>("supports_language?", (lang,))
|
|
2606
|
-
.unwrap_or(true),
|
|
2607
|
-
_ => true,
|
|
2608
|
-
}
|
|
2609
|
-
}
|
|
2610
|
-
|
|
2611
|
-
fn backend_type(&self) -> OcrBackendType {
|
|
2612
|
-
OcrBackendType::Custom
|
|
2613
|
-
}
|
|
2614
|
-
}
|
|
2615
|
-
|
|
2616
|
-
let backend_impl = Arc::new(RubyOcrBackend {
|
|
2617
|
-
name: name.clone(),
|
|
2618
|
-
backend: GcGuardedValue::new(backend),
|
|
2619
|
-
});
|
|
2620
|
-
|
|
2621
|
-
let registry = kreuzberg::get_ocr_backend_registry();
|
|
2622
|
-
registry
|
|
2623
|
-
.write()
|
|
2624
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2625
|
-
.register(backend_impl)
|
|
2626
|
-
.map_err(kreuzberg_error)?;
|
|
2627
|
-
|
|
2628
|
-
Ok(())
|
|
2629
|
-
}
|
|
2630
|
-
|
|
2631
|
-
/// Unregister a post-processor plugin.
|
|
2632
|
-
///
|
|
2633
|
-
/// @param name [String] Name of the post-processor to remove
|
|
2634
|
-
/// @return [nil]
|
|
2635
|
-
///
|
|
2636
|
-
fn unregister_post_processor(name: String) -> Result<(), Error> {
|
|
2637
|
-
let registry = kreuzberg::get_post_processor_registry();
|
|
2638
|
-
registry
|
|
2639
|
-
.write()
|
|
2640
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2641
|
-
.remove(&name)
|
|
2642
|
-
.map_err(kreuzberg_error)?;
|
|
2643
|
-
Ok(())
|
|
2644
|
-
}
|
|
2645
|
-
|
|
2646
|
-
/// Unregister a validator plugin.
|
|
2647
|
-
///
|
|
2648
|
-
/// @param name [String] Name of the validator to remove
|
|
2649
|
-
/// @return [nil]
|
|
2650
|
-
///
|
|
2651
|
-
fn unregister_validator(name: String) -> Result<(), Error> {
|
|
2652
|
-
let registry = kreuzberg::get_validator_registry();
|
|
2653
|
-
registry
|
|
2654
|
-
.write()
|
|
2655
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2656
|
-
.remove(&name)
|
|
2657
|
-
.map_err(kreuzberg_error)?;
|
|
2658
|
-
Ok(())
|
|
2659
|
-
}
|
|
2660
|
-
|
|
2661
|
-
/// Clear all registered post-processors.
|
|
2662
|
-
///
|
|
2663
|
-
/// @return [nil]
|
|
2664
|
-
///
|
|
2665
|
-
fn clear_post_processors() -> Result<(), Error> {
|
|
2666
|
-
let registry = kreuzberg::get_post_processor_registry();
|
|
2667
|
-
registry
|
|
2668
|
-
.write()
|
|
2669
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2670
|
-
.shutdown_all()
|
|
2671
|
-
.map_err(kreuzberg_error)?;
|
|
2672
|
-
Ok(())
|
|
2673
|
-
}
|
|
2674
|
-
|
|
2675
|
-
/// Clear all registered validators.
|
|
2676
|
-
///
|
|
2677
|
-
/// @return [nil]
|
|
2678
|
-
///
|
|
2679
|
-
fn clear_validators() -> Result<(), Error> {
|
|
2680
|
-
let registry = kreuzberg::get_validator_registry();
|
|
2681
|
-
registry
|
|
2682
|
-
.write()
|
|
2683
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2684
|
-
.shutdown_all()
|
|
2685
|
-
.map_err(kreuzberg_error)?;
|
|
2686
|
-
Ok(())
|
|
2687
|
-
}
|
|
2688
|
-
|
|
2689
|
-
/// List all registered validators.
|
|
2690
|
-
///
|
|
2691
|
-
/// @return [Array<String>] Array of validator names
|
|
2692
|
-
///
|
|
2693
|
-
fn list_validators() -> Result<Vec<String>, Error> {
|
|
2694
|
-
let registry = kreuzberg::get_validator_registry();
|
|
2695
|
-
let validators = registry
|
|
2696
|
-
.read()
|
|
2697
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2698
|
-
.list();
|
|
2699
|
-
Ok(validators)
|
|
2700
|
-
}
|
|
2701
|
-
|
|
2702
|
-
/// List all registered post-processors.
|
|
2703
|
-
///
|
|
2704
|
-
/// @return [Array<String>] Array of post-processor names
|
|
2705
|
-
///
|
|
2706
|
-
fn list_post_processors() -> Result<Vec<String>, Error> {
|
|
2707
|
-
let registry = kreuzberg::get_post_processor_registry();
|
|
2708
|
-
let processors = registry
|
|
2709
|
-
.read()
|
|
2710
|
-
.map_err(|e| runtime_error(format!("Failed to acquire registry lock: {}", e)))?
|
|
2711
|
-
.list();
|
|
2712
|
-
Ok(processors)
|
|
2713
|
-
}
|
|
2714
|
-
|
|
2715
|
-
/// Unregister an OCR backend by name.
|
|
2716
|
-
///
|
|
2717
|
-
/// Removes a previously registered OCR backend from the global registry.
|
|
2718
|
-
///
|
|
2719
|
-
/// @param name [String] Backend name to unregister
|
|
2720
|
-
/// @return [void]
|
|
2721
|
-
///
|
|
2722
|
-
/// @example
|
|
2723
|
-
/// Kreuzberg.unregister_ocr_backend("my_ocr")
|
|
2724
|
-
///
|
|
2725
|
-
fn unregister_ocr_backend(name: String) -> Result<(), Error> {
|
|
2726
|
-
kreuzberg::plugins::unregister_ocr_backend(&name).map_err(|e| runtime_error(e.to_string()))
|
|
2727
|
-
}
|
|
2728
|
-
|
|
2729
|
-
/// List all registered OCR backend names.
|
|
2730
|
-
///
|
|
2731
|
-
/// Returns an array of all OCR backend names currently registered in the global registry.
|
|
2732
|
-
///
|
|
2733
|
-
/// @return [Array<String>] Array of OCR backend names
|
|
2734
|
-
///
|
|
2735
|
-
/// @example
|
|
2736
|
-
/// backends = Kreuzberg.list_ocr_backends
|
|
2737
|
-
/// #=> ["tesseract", "my_custom_ocr"]
|
|
2738
|
-
///
|
|
2739
|
-
fn list_ocr_backends() -> Result<Vec<String>, Error> {
|
|
2740
|
-
kreuzberg::plugins::list_ocr_backends().map_err(|e| runtime_error(e.to_string()))
|
|
2741
|
-
}
|
|
2742
|
-
|
|
2743
|
-
/// Clear all registered OCR backends.
|
|
2744
|
-
///
|
|
2745
|
-
/// Removes all OCR backends from the global registry and calls their shutdown methods.
|
|
2746
|
-
///
|
|
2747
|
-
/// @return [void]
|
|
2748
|
-
///
|
|
2749
|
-
/// @example
|
|
2750
|
-
/// Kreuzberg.clear_ocr_backends
|
|
2751
|
-
///
|
|
2752
|
-
fn clear_ocr_backends() -> Result<(), Error> {
|
|
2753
|
-
kreuzberg::plugins::clear_ocr_backends().map_err(|e| runtime_error(e.to_string()))
|
|
2754
|
-
}
|
|
2755
|
-
|
|
2756
|
-
/// List all registered document extractor names.
|
|
2757
|
-
///
|
|
2758
|
-
/// Returns an array of all document extractor names currently registered in the global registry.
|
|
2759
|
-
///
|
|
2760
|
-
/// @return [Array<String>] Array of document extractor names
|
|
2761
|
-
///
|
|
2762
|
-
/// @example
|
|
2763
|
-
/// extractors = Kreuzberg.list_document_extractors
|
|
2764
|
-
/// #=> ["pdf", "docx", "txt"]
|
|
2765
|
-
///
|
|
2766
|
-
fn list_document_extractors() -> Result<Vec<String>, Error> {
|
|
2767
|
-
kreuzberg::plugins::list_extractors().map_err(|e| runtime_error(e.to_string()))
|
|
2768
|
-
}
|
|
2769
|
-
|
|
2770
|
-
/// Unregister a document extractor by name.
|
|
2771
|
-
///
|
|
2772
|
-
/// Removes a previously registered document extractor from the global registry.
|
|
2773
|
-
///
|
|
2774
|
-
/// @param name [String] Extractor name to unregister
|
|
2775
|
-
/// @return [void]
|
|
2776
|
-
///
|
|
2777
|
-
/// @example
|
|
2778
|
-
/// Kreuzberg.unregister_document_extractor("my_extractor")
|
|
2779
|
-
///
|
|
2780
|
-
fn unregister_document_extractor(name: String) -> Result<(), Error> {
|
|
2781
|
-
kreuzberg::plugins::unregister_extractor(&name).map_err(|e| runtime_error(e.to_string()))
|
|
2782
|
-
}
|
|
2783
|
-
|
|
2784
|
-
/// Clear all registered document extractors.
|
|
2785
|
-
///
|
|
2786
|
-
/// Removes all document extractors from the global registry and calls their shutdown methods.
|
|
2787
|
-
///
|
|
2788
|
-
/// @return [void]
|
|
2789
|
-
///
|
|
2790
|
-
/// @example
|
|
2791
|
-
/// Kreuzberg.clear_document_extractors
|
|
2792
|
-
///
|
|
2793
|
-
fn clear_document_extractors() -> Result<(), Error> {
|
|
2794
|
-
kreuzberg::plugins::clear_extractors().map_err(|e| runtime_error(e.to_string()))
|
|
2795
|
-
}
|
|
34
|
+
kreuzberg_validate_output_format, kreuzberg_validate_confidence,
|
|
35
|
+
kreuzberg_validate_dpi, kreuzberg_validate_chunking_params,
|
|
36
|
+
kreuzberg_get_valid_binarization_methods, kreuzberg_get_valid_language_codes,
|
|
37
|
+
kreuzberg_get_valid_ocr_backends, kreuzberg_get_valid_token_reduction_levels,
|
|
38
|
+
kreuzberg_free_string,
|
|
39
|
+
};
|
|
2796
40
|
|
|
2797
|
-
|
|
2798
|
-
|
|
2799
|
-
/// @param mime_type [String] The MIME type to validate
|
|
2800
|
-
/// @return [String] The validated MIME type (may be normalized)
|
|
2801
|
-
///
|
|
2802
|
-
/// @example
|
|
2803
|
-
/// validated = Kreuzberg.validate_mime_type("application/pdf")
|
|
2804
|
-
/// #=> "application/pdf"
|
|
2805
|
-
///
|
|
2806
|
-
/// @example Validate image MIME type
|
|
2807
|
-
/// validated = Kreuzberg.validate_mime_type("image/jpeg")
|
|
2808
|
-
/// #=> "image/jpeg"
|
|
2809
|
-
///
|
|
2810
|
-
fn validate_mime_type_native(mime_type: String) -> Result<String, Error> {
|
|
2811
|
-
kreuzberg::validate_mime_type(&mime_type).map_err(kreuzberg_error)
|
|
2812
|
-
}
|
|
41
|
+
use magnus::{Error, Ruby, RHash, Value, function, IntoValue, TryConvert};
|
|
42
|
+
use magnus::value::ReprValue;
|
|
2813
43
|
|
|
2814
|
-
///
|
|
2815
|
-
|
|
2816
|
-
|
|
2817
|
-
|
|
2818
|
-
|
|
2819
|
-
|
|
2820
|
-
///
|
|
2821
|
-
/// @example
|
|
2822
|
-
/// pdf_bytes = "%PDF-1.4\n"
|
|
2823
|
-
/// mime = Kreuzberg.detect_mime_type(pdf_bytes)
|
|
2824
|
-
/// #=> "application/pdf"
|
|
2825
|
-
///
|
|
2826
|
-
fn detect_mime_type_from_bytes(bytes: String) -> Result<String, Error> {
|
|
2827
|
-
let mime_type = kreuzberg::detect_mime_type_from_bytes(bytes.as_bytes()).map_err(kreuzberg_error)?;
|
|
2828
|
-
Ok(mime_type)
|
|
2829
|
-
}
|
|
44
|
+
/// Clear the extraction cache
|
|
45
|
+
pub fn ruby_clear_cache() -> Result<(), Error> {
|
|
46
|
+
let cache_root = cache_root_dir()?;
|
|
47
|
+
if !cache_root.exists() {
|
|
48
|
+
return Ok(());
|
|
49
|
+
}
|
|
2830
50
|
|
|
2831
|
-
|
|
2832
|
-
|
|
2833
|
-
|
|
2834
|
-
|
|
2835
|
-
|
|
2836
|
-
|
|
2837
|
-
///
|
|
2838
|
-
/// @example
|
|
2839
|
-
/// mime = Kreuzberg.detect_mime_type_from_path("document.pdf")
|
|
2840
|
-
/// #=> "application/pdf"
|
|
2841
|
-
///
|
|
2842
|
-
fn detect_mime_type_from_path_native(path: String) -> Result<String, Error> {
|
|
2843
|
-
let content = fs::read(&path).map_err(KreuzbergError::Io).map_err(kreuzberg_error)?;
|
|
2844
|
-
let mime_type = kreuzberg::detect_mime_type_from_bytes(&content).map_err(kreuzberg_error)?;
|
|
2845
|
-
Ok(mime_type)
|
|
2846
|
-
}
|
|
51
|
+
for dir in cache_directories(&cache_root)? {
|
|
52
|
+
let Some(dir_str) = dir.to_str() else {
|
|
53
|
+
return Err(runtime_error("Cache directory path contains non-UTF8 characters"));
|
|
54
|
+
};
|
|
55
|
+
kreuzberg::cache::clear_cache_directory(dir_str).map_err(kreuzberg_error)?;
|
|
56
|
+
}
|
|
2847
57
|
|
|
2848
|
-
|
|
2849
|
-
///
|
|
2850
|
-
/// Returns an array of file extensions commonly associated with the MIME type.
|
|
2851
|
-
///
|
|
2852
|
-
/// @param mime_type [String] The MIME type
|
|
2853
|
-
/// @return [Array<String>] Array of file extensions (without dots)
|
|
2854
|
-
///
|
|
2855
|
-
/// @example
|
|
2856
|
-
/// exts = Kreuzberg.get_extensions_for_mime("application/pdf")
|
|
2857
|
-
/// #=> ["pdf"]
|
|
2858
|
-
///
|
|
2859
|
-
/// @example
|
|
2860
|
-
/// exts = Kreuzberg.get_extensions_for_mime("image/jpeg")
|
|
2861
|
-
/// #=> ["jpg", "jpeg"]
|
|
2862
|
-
///
|
|
2863
|
-
fn get_extensions_for_mime_native(mime_type: String) -> Result<Vec<String>, Error> {
|
|
2864
|
-
kreuzberg::get_extensions_for_mime(&mime_type).map_err(kreuzberg_error)
|
|
58
|
+
Ok(())
|
|
2865
59
|
}
|
|
2866
60
|
|
|
2867
|
-
|
|
2868
|
-
|
|
2869
|
-
|
|
2870
|
-
|
|
2871
|
-
|
|
2872
|
-
/// # Returns
|
|
2873
|
-
///
|
|
2874
|
-
/// Array of 4 preset names: ["fast", "balanced", "quality", "multilingual"]
|
|
2875
|
-
///
|
|
2876
|
-
/// # Example
|
|
2877
|
-
///
|
|
2878
|
-
/// ```ruby
|
|
2879
|
-
/// require 'kreuzberg'
|
|
2880
|
-
///
|
|
2881
|
-
/// presets = Kreuzberg.list_embedding_presets
|
|
2882
|
-
/// puts presets # => ["fast", "balanced", "quality", "multilingual"]
|
|
2883
|
-
/// ```
|
|
2884
|
-
fn list_embedding_presets(ruby: &Ruby) -> Result<RArray, Error> {
|
|
2885
|
-
let presets = kreuzberg::embeddings::list_presets();
|
|
2886
|
-
let array = ruby.ary_new();
|
|
2887
|
-
for name in presets {
|
|
2888
|
-
array.push(name)?;
|
|
2889
|
-
}
|
|
2890
|
-
Ok(array)
|
|
2891
|
-
}
|
|
61
|
+
/// Get cache statistics
|
|
62
|
+
pub fn ruby_cache_stats() -> Result<RHash, Error> {
|
|
63
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
64
|
+
let hash = ruby.hash_new();
|
|
65
|
+
let cache_root = cache_root_dir()?;
|
|
2892
66
|
|
|
2893
|
-
|
|
2894
|
-
|
|
2895
|
-
|
|
2896
|
-
|
|
2897
|
-
///
|
|
2898
|
-
/// # Arguments
|
|
2899
|
-
///
|
|
2900
|
-
/// * `name` - The preset name (case-sensitive)
|
|
2901
|
-
///
|
|
2902
|
-
/// # Returns
|
|
2903
|
-
///
|
|
2904
|
-
/// Hash with preset configuration or nil if not found
|
|
2905
|
-
///
|
|
2906
|
-
/// Available presets:
|
|
2907
|
-
/// - "fast": AllMiniLML6V2Q (384 dimensions) - Quick prototyping, low-latency
|
|
2908
|
-
/// - "balanced": BGEBaseENV15 (768 dimensions) - General-purpose RAG
|
|
2909
|
-
/// - "quality": BGELargeENV15 (1024 dimensions) - High-quality embeddings
|
|
2910
|
-
/// - "multilingual": MultilingualE5Base (768 dimensions) - Multi-language support
|
|
2911
|
-
///
|
|
2912
|
-
/// # Example
|
|
2913
|
-
///
|
|
2914
|
-
/// ```ruby
|
|
2915
|
-
/// require 'kreuzberg'
|
|
2916
|
-
///
|
|
2917
|
-
/// preset = Kreuzberg.get_embedding_preset("balanced")
|
|
2918
|
-
/// if preset
|
|
2919
|
-
/// puts "Model: #{preset[:model_name]}, Dims: #{preset[:dimensions]}"
|
|
2920
|
-
/// # => Model: BGEBaseENV15, Dims: 768
|
|
2921
|
-
/// end
|
|
2922
|
-
/// ```
|
|
2923
|
-
fn get_embedding_preset(ruby: &Ruby, name: String) -> Result<Value, Error> {
|
|
2924
|
-
let preset = kreuzberg::embeddings::get_preset(&name);
|
|
2925
|
-
|
|
2926
|
-
match preset {
|
|
2927
|
-
Some(preset) => {
|
|
2928
|
-
let hash = ruby.hash_new();
|
|
2929
|
-
|
|
2930
|
-
set_hash_entry(ruby, &hash, "name", ruby.str_new(preset.name).as_value())?;
|
|
2931
|
-
set_hash_entry(ruby, &hash, "chunk_size", preset.chunk_size.into_value_with(ruby))?;
|
|
2932
|
-
set_hash_entry(ruby, &hash, "overlap", preset.overlap.into_value_with(ruby))?;
|
|
2933
|
-
|
|
2934
|
-
let model_name = format!("{:?}", preset.model);
|
|
2935
|
-
|
|
2936
|
-
set_hash_entry(ruby, &hash, "model_name", ruby.str_new(&model_name).as_value())?;
|
|
2937
|
-
set_hash_entry(ruby, &hash, "dimensions", preset.dimensions.into_value_with(ruby))?;
|
|
2938
|
-
set_hash_entry(ruby, &hash, "description", ruby.str_new(preset.description).as_value())?;
|
|
2939
|
-
|
|
2940
|
-
Ok(hash.as_value())
|
|
2941
|
-
}
|
|
2942
|
-
None => Ok(ruby.qnil().as_value()),
|
|
67
|
+
if !cache_root.exists() {
|
|
68
|
+
hash.aset("total_entries", 0)?;
|
|
69
|
+
hash.aset("total_size_bytes", 0)?;
|
|
70
|
+
return Ok(hash);
|
|
2943
71
|
}
|
|
2944
|
-
}
|
|
2945
72
|
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
/// Returns an i32 error code indicating the type of error that occurred:
|
|
2949
|
-
/// - 0: Success (no error)
|
|
2950
|
-
/// - 1: GenericError
|
|
2951
|
-
/// - 2: Panic
|
|
2952
|
-
/// - 3: InvalidArgument
|
|
2953
|
-
/// - 4: IoError
|
|
2954
|
-
/// - 5: ParsingError
|
|
2955
|
-
/// - 6: OcrError
|
|
2956
|
-
/// - 7: MissingDependency
|
|
2957
|
-
///
|
|
2958
|
-
/// @return [Integer] The error code
|
|
2959
|
-
fn last_error_code() -> i32 {
|
|
2960
|
-
get_error_code()
|
|
2961
|
-
}
|
|
73
|
+
let mut total_entries: usize = 0;
|
|
74
|
+
let mut total_bytes: f64 = 0.0;
|
|
2962
75
|
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2968
|
-
|
|
2969
|
-
|
|
2970
|
-
/// - line: Line number
|
|
2971
|
-
/// - function: Function name
|
|
2972
|
-
/// - message: Panic message
|
|
2973
|
-
/// - timestamp_secs: Unix timestamp
|
|
2974
|
-
///
|
|
2975
|
-
/// @return [String, nil] JSON string with panic context or nil
|
|
2976
|
-
fn last_panic_context_json(ruby: &Ruby) -> Value {
|
|
2977
|
-
match get_panic_context() {
|
|
2978
|
-
Some(json) => ruby.str_new(&json).as_value(),
|
|
2979
|
-
None => ruby.qnil().as_value(),
|
|
76
|
+
for dir in cache_directories(&cache_root)? {
|
|
77
|
+
let Some(dir_str) = dir.to_str() else {
|
|
78
|
+
return Err(runtime_error("Cache directory path contains non-UTF8 characters"));
|
|
79
|
+
};
|
|
80
|
+
let stats = kreuzberg::cache::get_cache_metadata(dir_str).map_err(kreuzberg_error)?;
|
|
81
|
+
total_entries += stats.total_files;
|
|
82
|
+
total_bytes += stats.total_size_mb * 1024.0 * 1024.0;
|
|
2980
83
|
}
|
|
2981
|
-
}
|
|
2982
84
|
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
85
|
+
set_hash_entry(
|
|
86
|
+
&ruby,
|
|
87
|
+
&hash,
|
|
88
|
+
"total_entries",
|
|
89
|
+
ruby.integer_from_u64(total_entries as u64).into_value_with(&ruby),
|
|
90
|
+
)?;
|
|
91
|
+
set_hash_entry(
|
|
92
|
+
&ruby,
|
|
93
|
+
&hash,
|
|
94
|
+
"total_size_bytes",
|
|
95
|
+
ruby.integer_from_u64(total_bytes.round() as u64).into_value_with(&ruby),
|
|
96
|
+
)?;
|
|
2989
97
|
|
|
2990
|
-
Ok(
|
|
98
|
+
Ok(hash)
|
|
2991
99
|
}
|
|
2992
100
|
|
|
2993
|
-
|
|
2994
|
-
|
|
2995
|
-
|
|
2996
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
2997
|
-
fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
|
|
2998
|
-
let c_backend = std::ffi::CString::new(backend).map_err(|_| runtime_error("Invalid backend string"))?;
|
|
2999
|
-
|
|
3000
|
-
Ok(unsafe { kreuzberg_validate_ocr_backend(c_backend.as_ptr()) })
|
|
101
|
+
// Validation wrapper functions
|
|
102
|
+
pub fn validate_binarization_method(method: String) -> Result<i32, Error> {
|
|
103
|
+
unsafe { Ok(kreuzberg_validate_binarization_method(method.as_ptr() as *const i8)) }
|
|
3001
104
|
}
|
|
3002
105
|
|
|
3003
|
-
|
|
3004
|
-
|
|
3005
|
-
/// @param code [String] The language code (e.g., "en", "eng", "de", "deu")
|
|
3006
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3007
|
-
fn validate_language_code(code: String) -> Result<i32, Error> {
|
|
3008
|
-
let c_code = std::ffi::CString::new(code).map_err(|_| runtime_error("Invalid language code string"))?;
|
|
3009
|
-
|
|
3010
|
-
Ok(unsafe { kreuzberg_validate_language_code(c_code.as_ptr()) })
|
|
106
|
+
pub fn validate_ocr_backend(backend: String) -> Result<i32, Error> {
|
|
107
|
+
unsafe { Ok(kreuzberg_validate_ocr_backend(backend.as_ptr() as *const i8)) }
|
|
3011
108
|
}
|
|
3012
109
|
|
|
3013
|
-
|
|
3014
|
-
|
|
3015
|
-
|
|
3016
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3017
|
-
fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
|
|
3018
|
-
let c_level = std::ffi::CString::new(level).map_err(|_| runtime_error("Invalid token reduction level string"))?;
|
|
110
|
+
pub fn validate_language_code(code: String) -> Result<i32, Error> {
|
|
111
|
+
unsafe { Ok(kreuzberg_validate_language_code(code.as_ptr() as *const i8)) }
|
|
112
|
+
}
|
|
3019
113
|
|
|
3020
|
-
|
|
114
|
+
pub fn validate_token_reduction_level(level: String) -> Result<i32, Error> {
|
|
115
|
+
unsafe { Ok(kreuzberg_validate_token_reduction_level(level.as_ptr() as *const i8)) }
|
|
3021
116
|
}
|
|
3022
117
|
|
|
3023
|
-
|
|
3024
|
-
///
|
|
3025
|
-
/// @param psm [Integer] The PSM value (0-13)
|
|
3026
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3027
|
-
fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
|
|
118
|
+
pub fn validate_tesseract_psm(psm: i32) -> Result<i32, Error> {
|
|
3028
119
|
Ok(kreuzberg_validate_tesseract_psm(psm))
|
|
3029
120
|
}
|
|
3030
121
|
|
|
3031
|
-
|
|
3032
|
-
///
|
|
3033
|
-
/// @param oem [Integer] The OEM value (0-3)
|
|
3034
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3035
|
-
fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
|
|
122
|
+
pub fn validate_tesseract_oem(oem: i32) -> Result<i32, Error> {
|
|
3036
123
|
Ok(kreuzberg_validate_tesseract_oem(oem))
|
|
3037
124
|
}
|
|
3038
125
|
|
|
3039
|
-
|
|
3040
|
-
|
|
3041
|
-
/// @param format [String] The output format (e.g., "text", "markdown")
|
|
3042
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3043
|
-
fn validate_output_format(format: String) -> Result<i32, Error> {
|
|
3044
|
-
let c_format = std::ffi::CString::new(format).map_err(|_| runtime_error("Invalid format string"))?;
|
|
3045
|
-
|
|
3046
|
-
Ok(unsafe { kreuzberg_validate_output_format(c_format.as_ptr()) })
|
|
126
|
+
pub fn validate_output_format(format: String) -> Result<i32, Error> {
|
|
127
|
+
unsafe { Ok(kreuzberg_validate_output_format(format.as_ptr() as *const i8)) }
|
|
3047
128
|
}
|
|
3048
129
|
|
|
3049
|
-
|
|
3050
|
-
///
|
|
3051
|
-
/// @param confidence [Float] The confidence value (0.0-1.0)
|
|
3052
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3053
|
-
fn validate_confidence(confidence: f64) -> Result<i32, Error> {
|
|
130
|
+
pub fn validate_confidence(confidence: f64) -> Result<i32, Error> {
|
|
3054
131
|
Ok(kreuzberg_validate_confidence(confidence))
|
|
3055
132
|
}
|
|
3056
133
|
|
|
3057
|
-
|
|
3058
|
-
///
|
|
3059
|
-
/// @param dpi [Integer] The DPI value
|
|
3060
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3061
|
-
fn validate_dpi(dpi: i32) -> Result<i32, Error> {
|
|
134
|
+
pub fn validate_dpi(dpi: i32) -> Result<i32, Error> {
|
|
3062
135
|
Ok(kreuzberg_validate_dpi(dpi))
|
|
3063
136
|
}
|
|
3064
137
|
|
|
3065
|
-
|
|
3066
|
-
///
|
|
3067
|
-
/// @param max_chars [Integer] Maximum characters per chunk
|
|
3068
|
-
/// @param max_overlap [Integer] Maximum overlap between chunks
|
|
3069
|
-
/// @return [Integer] 1 if valid, 0 if invalid
|
|
3070
|
-
fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<i32, Error> {
|
|
138
|
+
pub fn validate_chunking_params(max_chars: usize, max_overlap: usize) -> Result<i32, Error> {
|
|
3071
139
|
Ok(kreuzberg_validate_chunking_params(max_chars, max_overlap))
|
|
3072
140
|
}
|
|
3073
141
|
|
|
3074
|
-
|
|
3075
|
-
///
|
|
3076
|
-
/// @return [String] JSON array of valid binarization methods
|
|
3077
|
-
fn get_valid_binarization_methods(_ruby: &Ruby) -> Result<String, Error> {
|
|
3078
|
-
let ptr = kreuzberg_get_valid_binarization_methods();
|
|
3079
|
-
if ptr.is_null() {
|
|
3080
|
-
return Err(runtime_error("Failed to get valid binarization methods"));
|
|
3081
|
-
}
|
|
3082
|
-
|
|
3083
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
|
|
3084
|
-
let result = c_str
|
|
3085
|
-
.to_str()
|
|
3086
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in binarization methods"))?
|
|
3087
|
-
.to_string();
|
|
3088
|
-
|
|
142
|
+
pub fn get_valid_binarization_methods(_ruby: &Ruby) -> Result<String, Error> {
|
|
3089
143
|
unsafe {
|
|
3090
|
-
|
|
144
|
+
let ptr = kreuzberg_get_valid_binarization_methods();
|
|
145
|
+
if ptr.is_null() {
|
|
146
|
+
Ok(String::new())
|
|
147
|
+
} else {
|
|
148
|
+
let cstr = std::ffi::CStr::from_ptr(ptr);
|
|
149
|
+
let result = cstr.to_string_lossy().to_string();
|
|
150
|
+
kreuzberg_free_string(ptr as *mut std::ffi::c_char);
|
|
151
|
+
Ok(result)
|
|
152
|
+
}
|
|
3091
153
|
}
|
|
3092
|
-
|
|
3093
|
-
Ok(result)
|
|
3094
154
|
}
|
|
3095
155
|
|
|
3096
|
-
|
|
3097
|
-
///
|
|
3098
|
-
/// @return [String] JSON array of valid language codes
|
|
3099
|
-
fn get_valid_language_codes(_ruby: &Ruby) -> Result<String, Error> {
|
|
3100
|
-
let ptr = kreuzberg_get_valid_language_codes();
|
|
3101
|
-
if ptr.is_null() {
|
|
3102
|
-
return Err(runtime_error("Failed to get valid language codes"));
|
|
3103
|
-
}
|
|
3104
|
-
|
|
3105
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(ptr) };
|
|
3106
|
-
let result = c_str
|
|
3107
|
-
.to_str()
|
|
3108
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in language codes"))?
|
|
3109
|
-
.to_string();
|
|
3110
|
-
|
|
156
|
+
pub fn get_valid_language_codes(_ruby: &Ruby) -> Result<String, Error> {
|
|
3111
157
|
unsafe {
|
|
3112
|
-
|
|
158
|
+
let ptr = kreuzberg_get_valid_language_codes();
|
|
159
|
+
if ptr.is_null() {
|
|
160
|
+
Ok(String::new())
|
|
161
|
+
} else {
|
|
162
|
+
let cstr = std::ffi::CStr::from_ptr(ptr);
|
|
163
|
+
let result = cstr.to_string_lossy().to_string();
|
|
164
|
+
kreuzberg_free_string(ptr as *mut std::ffi::c_char);
|
|
165
|
+
Ok(result)
|
|
166
|
+
}
|
|
3113
167
|
}
|
|
3114
|
-
|
|
3115
|
-
Ok(result)
|
|
3116
168
|
}
|
|
3117
169
|
|
|
3118
|
-
|
|
3119
|
-
|
|
3120
|
-
|
|
3121
|
-
|
|
3122
|
-
|
|
3123
|
-
|
|
3124
|
-
|
|
170
|
+
pub fn get_valid_ocr_backends(_ruby: &Ruby) -> Result<String, Error> {
|
|
171
|
+
unsafe {
|
|
172
|
+
let ptr = kreuzberg_get_valid_ocr_backends();
|
|
173
|
+
if ptr.is_null() {
|
|
174
|
+
Ok(String::new())
|
|
175
|
+
} else {
|
|
176
|
+
let cstr = std::ffi::CStr::from_ptr(ptr);
|
|
177
|
+
let result = cstr.to_string_lossy().to_string();
|
|
178
|
+
kreuzberg_free_string(ptr as *mut std::ffi::c_char);
|
|
179
|
+
Ok(result)
|
|
180
|
+
}
|
|
3125
181
|
}
|
|
182
|
+
}
|
|
3126
183
|
|
|
3127
|
-
|
|
3128
|
-
let result = c_str
|
|
3129
|
-
.to_str()
|
|
3130
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in OCR backends"))?
|
|
3131
|
-
.to_string();
|
|
3132
|
-
|
|
184
|
+
pub fn get_valid_token_reduction_levels(_ruby: &Ruby) -> Result<String, Error> {
|
|
3133
185
|
unsafe {
|
|
3134
|
-
|
|
186
|
+
let ptr = kreuzberg_get_valid_token_reduction_levels();
|
|
187
|
+
if ptr.is_null() {
|
|
188
|
+
Ok(String::new())
|
|
189
|
+
} else {
|
|
190
|
+
let cstr = std::ffi::CStr::from_ptr(ptr);
|
|
191
|
+
let result = cstr.to_string_lossy().to_string();
|
|
192
|
+
kreuzberg_free_string(ptr as *mut std::ffi::c_char);
|
|
193
|
+
Ok(result)
|
|
194
|
+
}
|
|
3135
195
|
}
|
|
196
|
+
}
|
|
3136
197
|
|
|
3137
|
-
|
|
198
|
+
pub fn last_error_code() -> i32 {
|
|
199
|
+
get_error_code()
|
|
3138
200
|
}
|
|
3139
201
|
|
|
3140
|
-
|
|
3141
|
-
|
|
3142
|
-
|
|
3143
|
-
|
|
3144
|
-
|
|
3145
|
-
if ptr.is_null() {
|
|
3146
|
-
return Err(runtime_error("Failed to get valid token reduction levels"));
|
|
202
|
+
pub fn last_panic_context_json(ruby: &Ruby) -> Value {
|
|
203
|
+
if let Some(context) = error_handling::get_panic_context() {
|
|
204
|
+
ruby.str_new(&context).into_value_with(ruby)
|
|
205
|
+
} else {
|
|
206
|
+
ruby.qnil().as_value()
|
|
3147
207
|
}
|
|
208
|
+
}
|
|
3148
209
|
|
|
3149
|
-
|
|
3150
|
-
|
|
3151
|
-
|
|
3152
|
-
|
|
3153
|
-
.to_string();
|
|
210
|
+
// Config wrapper functions
|
|
211
|
+
pub fn config_from_file(path: String) -> Result<RHash, Error> {
|
|
212
|
+
config::config_from_file(path)
|
|
213
|
+
}
|
|
3154
214
|
|
|
3155
|
-
|
|
3156
|
-
|
|
3157
|
-
|
|
215
|
+
pub fn config_discover() -> Result<Value, Error> {
|
|
216
|
+
config::config_discover()
|
|
217
|
+
}
|
|
3158
218
|
|
|
3159
|
-
|
|
219
|
+
pub fn config_to_json_wrapper(_ruby: &Ruby, config_json: String) -> Result<String, Error> {
|
|
220
|
+
Ok(config_json)
|
|
3160
221
|
}
|
|
3161
222
|
|
|
3162
|
-
|
|
3163
|
-
|
|
3164
|
-
|
|
3165
|
-
fn config_to_json_wrapper(_ruby: &Ruby, config_json: String) -> Result<String, Error> {
|
|
3166
|
-
let c_json =
|
|
3167
|
-
std::ffi::CString::new(config_json).map_err(|e| runtime_error(format!("Invalid config JSON: {}", e)))?;
|
|
223
|
+
pub fn config_get_field_wrapper(ruby: &Ruby, config_json: String, field_name: String) -> Result<Value, Error> {
|
|
224
|
+
let json_value: serde_json::Value = serde_json::from_str(&config_json)
|
|
225
|
+
.map_err(|e| runtime_error(format!("Invalid JSON: {}", e)))?;
|
|
3168
226
|
|
|
3169
|
-
let
|
|
3170
|
-
|
|
3171
|
-
|
|
227
|
+
if let Some(field_value) = json_value.get(&field_name) {
|
|
228
|
+
json_value_to_ruby(ruby, field_value)
|
|
229
|
+
} else {
|
|
230
|
+
Ok(ruby.qnil().as_value())
|
|
3172
231
|
}
|
|
232
|
+
}
|
|
3173
233
|
|
|
3174
|
-
|
|
3175
|
-
let
|
|
3176
|
-
|
|
3177
|
-
|
|
3178
|
-
|
|
3179
|
-
let json = c_str
|
|
3180
|
-
.to_str()
|
|
3181
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in serialized config"))?
|
|
3182
|
-
.to_string();
|
|
3183
|
-
unsafe {
|
|
3184
|
-
kreuzberg_free_string(json_ptr as *mut c_char);
|
|
3185
|
-
}
|
|
3186
|
-
Ok(json)
|
|
3187
|
-
};
|
|
234
|
+
pub fn config_merge_wrapper(_ruby: &Ruby, base_json: String, override_json: String) -> Result<String, Error> {
|
|
235
|
+
let mut base: serde_json::Value = serde_json::from_str(&base_json)
|
|
236
|
+
.map_err(|e| runtime_error(format!("Invalid base JSON: {}", e)))?;
|
|
237
|
+
let override_val: serde_json::Value = serde_json::from_str(&override_json)
|
|
238
|
+
.map_err(|e| runtime_error(format!("Invalid override JSON: {}", e)))?;
|
|
3188
239
|
|
|
3189
|
-
|
|
3190
|
-
|
|
240
|
+
if let (Some(base_obj), Some(override_obj)) = (base.as_object_mut(), override_val.as_object()) {
|
|
241
|
+
for (key, value) in override_obj {
|
|
242
|
+
base_obj.insert(key.clone(), value.clone());
|
|
243
|
+
}
|
|
3191
244
|
}
|
|
3192
|
-
|
|
245
|
+
|
|
246
|
+
serde_json::to_string(&base).map_err(|e| runtime_error(format!("Failed to serialize merged config: {}", e)))
|
|
3193
247
|
}
|
|
3194
248
|
|
|
3195
|
-
|
|
3196
|
-
|
|
3197
|
-
/// @param field_name [String] Field name (supports dot notation)
|
|
3198
|
-
/// @return [Object] Parsed JSON value, or nil if field doesn't exist
|
|
3199
|
-
fn config_get_field_wrapper(ruby: &Ruby, config_json: String, field_name: String) -> Result<Value, Error> {
|
|
3200
|
-
let c_json =
|
|
3201
|
-
std::ffi::CString::new(config_json).map_err(|e| runtime_error(format!("Invalid config JSON: {}", e)))?;
|
|
3202
|
-
let c_field =
|
|
3203
|
-
std::ffi::CString::new(field_name).map_err(|e| runtime_error(format!("Invalid field name: {}", e)))?;
|
|
3204
|
-
|
|
3205
|
-
let config_ptr = unsafe { kreuzberg_config_from_json(c_json.as_ptr()) };
|
|
3206
|
-
if config_ptr.is_null() {
|
|
3207
|
-
return Err(runtime_error("Failed to parse config from JSON"));
|
|
3208
|
-
}
|
|
249
|
+
// Result wrapper functions
|
|
250
|
+
// These functions receive a Ruby Hash (the extraction result) and extract specific fields.
|
|
3209
251
|
|
|
3210
|
-
|
|
3211
|
-
|
|
3212
|
-
|
|
3213
|
-
|
|
3214
|
-
|
|
3215
|
-
|
|
3216
|
-
|
|
3217
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in field value"))?;
|
|
3218
|
-
let json_value: serde_json::Value =
|
|
3219
|
-
serde_json::from_str(json_str).map_err(|e| runtime_error(format!("Failed to parse field value: {}", e)))?;
|
|
3220
|
-
unsafe {
|
|
3221
|
-
kreuzberg_free_string(field_ptr as *mut c_char);
|
|
3222
|
-
}
|
|
3223
|
-
json_value_to_ruby(ruby, &json_value)
|
|
252
|
+
/// Get page count from extraction result
|
|
253
|
+
/// Accesses metadata["page_count"] or metadata["sheet_count"] (for Excel) or returns 0
|
|
254
|
+
pub fn result_page_count(_ruby: &Ruby, result: Value) -> Result<i32, Error> {
|
|
255
|
+
// Try to get the result as an RHash
|
|
256
|
+
let hash = match RHash::try_convert(result) {
|
|
257
|
+
Ok(h) => h,
|
|
258
|
+
Err(_) => return Ok(0),
|
|
3224
259
|
};
|
|
3225
260
|
|
|
3226
|
-
|
|
3227
|
-
|
|
3228
|
-
|
|
3229
|
-
|
|
3230
|
-
}
|
|
261
|
+
// Get metadata field
|
|
262
|
+
let metadata = match hash.get("metadata") {
|
|
263
|
+
Some(m) => m,
|
|
264
|
+
None => return Ok(0),
|
|
265
|
+
};
|
|
3231
266
|
|
|
3232
|
-
|
|
3233
|
-
|
|
3234
|
-
|
|
3235
|
-
|
|
3236
|
-
|
|
3237
|
-
let c_base =
|
|
3238
|
-
std::ffi::CString::new(base_json).map_err(|e| runtime_error(format!("Invalid base config JSON: {}", e)))?;
|
|
3239
|
-
let c_override = std::ffi::CString::new(override_json)
|
|
3240
|
-
.map_err(|e| runtime_error(format!("Invalid override config JSON: {}", e)))?;
|
|
3241
|
-
|
|
3242
|
-
let base_ptr = unsafe { kreuzberg_config_from_json(c_base.as_ptr()) };
|
|
3243
|
-
if base_ptr.is_null() {
|
|
3244
|
-
return Err(runtime_error("Failed to parse base config from JSON"));
|
|
3245
|
-
}
|
|
267
|
+
// Try to convert metadata to hash
|
|
268
|
+
let metadata_hash = match RHash::try_convert(metadata) {
|
|
269
|
+
Ok(h) => h,
|
|
270
|
+
Err(_) => return Ok(0),
|
|
271
|
+
};
|
|
3246
272
|
|
|
3247
|
-
|
|
3248
|
-
if
|
|
3249
|
-
|
|
3250
|
-
|
|
273
|
+
// Try page_count first (PDF/PPTX format)
|
|
274
|
+
if let Some(page_count) = metadata_hash.get("page_count") {
|
|
275
|
+
if !page_count.is_nil() {
|
|
276
|
+
if let Ok(count) = i32::try_convert(page_count) {
|
|
277
|
+
return Ok(count);
|
|
278
|
+
}
|
|
3251
279
|
}
|
|
3252
|
-
return Err(runtime_error("Failed to parse override config from JSON"));
|
|
3253
280
|
}
|
|
3254
281
|
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
3258
|
-
|
|
3259
|
-
|
|
3260
|
-
let json_ptr = unsafe { kreuzberg_config_to_json(base_ptr) };
|
|
3261
|
-
if json_ptr.is_null() {
|
|
3262
|
-
Err(runtime_error("Failed to serialize merged config"))
|
|
3263
|
-
} else {
|
|
3264
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(json_ptr) };
|
|
3265
|
-
let json = c_str
|
|
3266
|
-
.to_str()
|
|
3267
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in merged config"))?
|
|
3268
|
-
.to_string();
|
|
3269
|
-
unsafe {
|
|
3270
|
-
kreuzberg_free_string(json_ptr as *mut c_char);
|
|
282
|
+
// Fall back to sheet_count (Excel format)
|
|
283
|
+
if let Some(sheet_count) = metadata_hash.get("sheet_count") {
|
|
284
|
+
if !sheet_count.is_nil() {
|
|
285
|
+
if let Ok(count) = i32::try_convert(sheet_count) {
|
|
286
|
+
return Ok(count);
|
|
3271
287
|
}
|
|
3272
|
-
Ok(json)
|
|
3273
288
|
}
|
|
3274
|
-
};
|
|
3275
|
-
|
|
3276
|
-
unsafe {
|
|
3277
|
-
kreuzberg_config_free(base_ptr);
|
|
3278
|
-
kreuzberg_config_free(override_ptr);
|
|
3279
289
|
}
|
|
3280
|
-
result
|
|
3281
|
-
}
|
|
3282
290
|
|
|
3283
|
-
|
|
3284
|
-
|
|
3285
|
-
/// @return [Integer] Page count, or -1 on error
|
|
3286
|
-
fn result_page_count(_ruby: &Ruby, result_ptr: i64) -> Result<i32, Error> {
|
|
3287
|
-
if result_ptr == 0 {
|
|
3288
|
-
return Err(runtime_error("Invalid result pointer"));
|
|
3289
|
-
}
|
|
291
|
+
Ok(0)
|
|
292
|
+
}
|
|
3290
293
|
|
|
3291
|
-
|
|
294
|
+
/// Get chunk count from extraction result
|
|
295
|
+
/// Returns chunks.length or 0 if nil/empty
|
|
296
|
+
pub fn result_chunk_count(_ruby: &Ruby, result: Value) -> Result<i32, Error> {
|
|
297
|
+
// Try to get the result as an RHash
|
|
298
|
+
let hash = match RHash::try_convert(result) {
|
|
299
|
+
Ok(h) => h,
|
|
300
|
+
Err(_) => return Ok(0),
|
|
301
|
+
};
|
|
3292
302
|
|
|
3293
|
-
|
|
3294
|
-
|
|
303
|
+
// Get chunks field
|
|
304
|
+
let chunks = match hash.get("chunks") {
|
|
305
|
+
Some(c) => c,
|
|
306
|
+
None => return Ok(0),
|
|
307
|
+
};
|
|
3295
308
|
|
|
3296
|
-
|
|
3297
|
-
|
|
3298
|
-
|
|
3299
|
-
fn result_chunk_count(_ruby: &Ruby, result_ptr: i64) -> Result<i32, Error> {
|
|
3300
|
-
if result_ptr == 0 {
|
|
3301
|
-
return Err(runtime_error("Invalid result pointer"));
|
|
309
|
+
// Check if chunks is nil
|
|
310
|
+
if chunks.is_nil() {
|
|
311
|
+
return Ok(0);
|
|
3302
312
|
}
|
|
3303
313
|
|
|
3304
|
-
|
|
314
|
+
// Try to convert chunks to array
|
|
315
|
+
let chunks_array = match magnus::RArray::try_convert(chunks) {
|
|
316
|
+
Ok(a) => a,
|
|
317
|
+
Err(_) => return Ok(0),
|
|
318
|
+
};
|
|
3305
319
|
|
|
3306
|
-
Ok(
|
|
320
|
+
Ok(chunks_array.len() as i32)
|
|
3307
321
|
}
|
|
3308
322
|
|
|
3309
|
-
/// Get detected language from result
|
|
3310
|
-
///
|
|
3311
|
-
|
|
3312
|
-
|
|
3313
|
-
|
|
3314
|
-
|
|
3315
|
-
|
|
3316
|
-
|
|
3317
|
-
let lang_ptr = unsafe { kreuzberg_result_get_detected_language(result_ptr as *const RustExtractionResult) };
|
|
323
|
+
/// Get detected language from extraction result
|
|
324
|
+
/// Returns first element from detected_languages array or metadata["language"]
|
|
325
|
+
pub fn result_detected_language(ruby: &Ruby, result: Value) -> Result<Value, Error> {
|
|
326
|
+
// Try to get the result as an RHash
|
|
327
|
+
let hash = match RHash::try_convert(result) {
|
|
328
|
+
Ok(h) => h,
|
|
329
|
+
Err(_) => return Ok(ruby.qnil().as_value()),
|
|
330
|
+
};
|
|
3318
331
|
|
|
3319
|
-
|
|
3320
|
-
|
|
332
|
+
// First try detected_languages array (primary detection result)
|
|
333
|
+
if let Some(detected_languages) = hash.get("detected_languages") {
|
|
334
|
+
if !detected_languages.is_nil() {
|
|
335
|
+
if let Ok(langs_array) = magnus::RArray::try_convert(detected_languages) {
|
|
336
|
+
if langs_array.len() > 0 {
|
|
337
|
+
if let Ok(first) = langs_array.entry(0) {
|
|
338
|
+
return Ok(first);
|
|
339
|
+
}
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
}
|
|
3321
343
|
}
|
|
3322
344
|
|
|
3323
|
-
|
|
3324
|
-
let
|
|
3325
|
-
|
|
3326
|
-
|
|
3327
|
-
|
|
3328
|
-
|
|
3329
|
-
|
|
3330
|
-
|
|
345
|
+
// Fall back to metadata["language"]
|
|
346
|
+
if let Some(metadata) = hash.get("metadata") {
|
|
347
|
+
if let Ok(metadata_hash) = RHash::try_convert(metadata) {
|
|
348
|
+
if let Some(language) = metadata_hash.get("language") {
|
|
349
|
+
if !language.is_nil() {
|
|
350
|
+
return Ok(language);
|
|
351
|
+
}
|
|
352
|
+
}
|
|
353
|
+
}
|
|
3331
354
|
}
|
|
3332
355
|
|
|
3333
|
-
Ok(
|
|
356
|
+
Ok(ruby.qnil().as_value())
|
|
3334
357
|
}
|
|
3335
358
|
|
|
3336
|
-
/// Get metadata field
|
|
3337
|
-
///
|
|
3338
|
-
|
|
3339
|
-
|
|
3340
|
-
|
|
3341
|
-
|
|
3342
|
-
return
|
|
3343
|
-
}
|
|
3344
|
-
|
|
3345
|
-
let c_field =
|
|
3346
|
-
std::ffi::CString::new(field_name).map_err(|e| runtime_error(format!("Invalid field name: {}", e)))?;
|
|
3347
|
-
|
|
3348
|
-
let field = unsafe { kreuzberg_result_get_metadata_field(result_ptr as *const RustExtractionResult, c_field.as_ptr()) };
|
|
359
|
+
/// Get metadata field by name with dot notation support
|
|
360
|
+
/// Accesses metadata[field_name] using dot notation for nested fields
|
|
361
|
+
pub fn result_metadata_field(ruby: &Ruby, result: Value, field_name: String) -> Result<Value, Error> {
|
|
362
|
+
// Try to get the result as an RHash
|
|
363
|
+
let hash = match RHash::try_convert(result) {
|
|
364
|
+
Ok(h) => h,
|
|
365
|
+
Err(_) => return Ok(ruby.qnil().as_value()),
|
|
366
|
+
};
|
|
3349
367
|
|
|
3350
|
-
|
|
3351
|
-
|
|
3352
|
-
|
|
368
|
+
// Get metadata field
|
|
369
|
+
let metadata = match hash.get("metadata") {
|
|
370
|
+
Some(m) => m,
|
|
371
|
+
None => return Ok(ruby.qnil().as_value()),
|
|
372
|
+
};
|
|
3353
373
|
|
|
3354
|
-
if
|
|
374
|
+
// Check if metadata is nil
|
|
375
|
+
if metadata.is_nil() {
|
|
3355
376
|
return Ok(ruby.qnil().as_value());
|
|
3356
377
|
}
|
|
3357
378
|
|
|
3358
|
-
|
|
3359
|
-
let
|
|
3360
|
-
|
|
3361
|
-
.map_err(|_| runtime_error("Invalid UTF-8 in field value"))?;
|
|
3362
|
-
let json_value: serde_json::Value =
|
|
3363
|
-
serde_json::from_str(json_str).map_err(|e| runtime_error(format!("Failed to parse field value: {}", e)))?;
|
|
3364
|
-
|
|
3365
|
-
unsafe {
|
|
3366
|
-
kreuzberg_free_string(field.json_value);
|
|
3367
|
-
}
|
|
3368
|
-
|
|
3369
|
-
json_value_to_ruby(ruby, &json_value)
|
|
3370
|
-
}
|
|
3371
|
-
|
|
3372
|
-
/// Get structured error details from FFI
|
|
3373
|
-
/// @return [Hash] Error details with keys: :message, :error_code, :error_type, :source_file, :source_function, :source_line, :context_info, :is_panic
|
|
3374
|
-
fn get_error_details_native(ruby: &Ruby) -> Result<Value, Error> {
|
|
3375
|
-
let details = kreuzberg_get_error_details();
|
|
3376
|
-
|
|
3377
|
-
let hash = ruby.hash_new();
|
|
3378
|
-
|
|
3379
|
-
unsafe {
|
|
3380
|
-
let message = if !details.message.is_null() {
|
|
3381
|
-
let c_str = std::ffi::CStr::from_ptr(details.message);
|
|
3382
|
-
let msg = c_str.to_str().unwrap_or("").to_string();
|
|
3383
|
-
kreuzberg_free_string(details.message);
|
|
3384
|
-
msg
|
|
3385
|
-
} else {
|
|
3386
|
-
String::new()
|
|
3387
|
-
};
|
|
3388
|
-
|
|
3389
|
-
let error_type = if !details.error_type.is_null() {
|
|
3390
|
-
let c_str = std::ffi::CStr::from_ptr(details.error_type);
|
|
3391
|
-
let ty = c_str.to_str().unwrap_or("unknown").to_string();
|
|
3392
|
-
kreuzberg_free_string(details.error_type);
|
|
3393
|
-
ty
|
|
3394
|
-
} else {
|
|
3395
|
-
"unknown".to_string()
|
|
3396
|
-
};
|
|
3397
|
-
|
|
3398
|
-
let source_file = if !details.source_file.is_null() {
|
|
3399
|
-
let c_str = std::ffi::CStr::from_ptr(details.source_file);
|
|
3400
|
-
let file = c_str.to_str().ok().map(|s| s.to_string());
|
|
3401
|
-
kreuzberg_free_string(details.source_file);
|
|
3402
|
-
file
|
|
3403
|
-
} else {
|
|
3404
|
-
None
|
|
3405
|
-
};
|
|
379
|
+
// Split field name by dots and traverse
|
|
380
|
+
let parts: Vec<&str> = field_name.split('.').collect();
|
|
381
|
+
let mut current = metadata;
|
|
3406
382
|
|
|
3407
|
-
|
|
3408
|
-
|
|
3409
|
-
|
|
3410
|
-
|
|
3411
|
-
|
|
3412
|
-
} else {
|
|
3413
|
-
None
|
|
383
|
+
for part in parts {
|
|
384
|
+
// Try to convert current to hash
|
|
385
|
+
let current_hash = match RHash::try_convert(current) {
|
|
386
|
+
Ok(h) => h,
|
|
387
|
+
Err(_) => return Ok(ruby.qnil().as_value()),
|
|
3414
388
|
};
|
|
3415
389
|
|
|
3416
|
-
|
|
3417
|
-
|
|
3418
|
-
|
|
3419
|
-
|
|
3420
|
-
ctx
|
|
3421
|
-
} else {
|
|
3422
|
-
None
|
|
390
|
+
// Get the field
|
|
391
|
+
current = match current_hash.get(part) {
|
|
392
|
+
Some(v) => v,
|
|
393
|
+
None => return Ok(ruby.qnil().as_value()),
|
|
3423
394
|
};
|
|
3424
395
|
|
|
3425
|
-
|
|
3426
|
-
|
|
3427
|
-
|
|
3428
|
-
|
|
3429
|
-
if let Some(file) = source_file {
|
|
3430
|
-
hash.aset(ruby.to_symbol("source_file"), ruby.str_new(&file).as_value())?;
|
|
3431
|
-
} else {
|
|
3432
|
-
hash.aset(ruby.to_symbol("source_file"), ruby.qnil().as_value())?;
|
|
3433
|
-
}
|
|
3434
|
-
|
|
3435
|
-
if let Some(func) = source_function {
|
|
3436
|
-
hash.aset(ruby.to_symbol("source_function"), ruby.str_new(&func).as_value())?;
|
|
3437
|
-
} else {
|
|
3438
|
-
hash.aset(ruby.to_symbol("source_function"), ruby.qnil().as_value())?;
|
|
3439
|
-
}
|
|
3440
|
-
|
|
3441
|
-
hash.aset(ruby.to_symbol("source_line"), details.source_line.into_value_with(ruby))?;
|
|
3442
|
-
|
|
3443
|
-
if let Some(ctx) = context_info {
|
|
3444
|
-
hash.aset(ruby.to_symbol("context_info"), ruby.str_new(&ctx).as_value())?;
|
|
3445
|
-
} else {
|
|
3446
|
-
hash.aset(ruby.to_symbol("context_info"), ruby.qnil().as_value())?;
|
|
396
|
+
// Check if current is nil
|
|
397
|
+
if current.is_nil() {
|
|
398
|
+
return Ok(ruby.qnil().as_value());
|
|
3447
399
|
}
|
|
3448
|
-
|
|
3449
|
-
hash.aset(
|
|
3450
|
-
ruby.to_symbol("is_panic"),
|
|
3451
|
-
(details.is_panic != 0).into_value_with(ruby),
|
|
3452
|
-
)?;
|
|
3453
400
|
}
|
|
3454
401
|
|
|
3455
|
-
Ok(
|
|
402
|
+
Ok(current)
|
|
3456
403
|
}
|
|
3457
404
|
|
|
3458
|
-
|
|
3459
|
-
|
|
3460
|
-
|
|
3461
|
-
|
|
3462
|
-
|
|
3463
|
-
|
|
3464
|
-
|
|
3465
|
-
let code = unsafe { kreuzberg_classify_error(c_message.as_ptr()) };
|
|
3466
|
-
|
|
3467
|
-
Ok(code.into_value_with(ruby))
|
|
405
|
+
// Error detail functions
|
|
406
|
+
pub fn get_error_details_native(ruby: &Ruby) -> Result<Value, Error> {
|
|
407
|
+
let hash = ruby.hash_new();
|
|
408
|
+
hash.aset("code", get_error_code())?;
|
|
409
|
+
hash.aset("message", "")?;
|
|
410
|
+
Ok(hash.into_value_with(ruby))
|
|
3468
411
|
}
|
|
3469
412
|
|
|
3470
|
-
|
|
3471
|
-
|
|
3472
|
-
|
|
3473
|
-
|
|
3474
|
-
let name_ptr = kreuzberg_error_code_name(code);
|
|
3475
|
-
|
|
3476
|
-
if name_ptr.is_null() {
|
|
3477
|
-
return Ok(ruby.str_new("unknown").as_value());
|
|
3478
|
-
}
|
|
3479
|
-
|
|
3480
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(name_ptr) };
|
|
3481
|
-
let name = c_str.to_str().unwrap_or("unknown").to_string();
|
|
3482
|
-
|
|
3483
|
-
Ok(ruby.str_new(&name).as_value())
|
|
413
|
+
pub fn classify_error_native(ruby: &Ruby, _message: String) -> Result<Value, Error> {
|
|
414
|
+
let hash = ruby.hash_new();
|
|
415
|
+
hash.aset("type", "unknown")?;
|
|
416
|
+
Ok(hash.into_value_with(ruby))
|
|
3484
417
|
}
|
|
3485
418
|
|
|
3486
|
-
|
|
3487
|
-
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
let desc_ptr = kreuzberg_error_code_description(code);
|
|
3491
|
-
|
|
3492
|
-
if desc_ptr.is_null() {
|
|
3493
|
-
return Ok(ruby.str_new("Unknown error code").as_value());
|
|
3494
|
-
}
|
|
3495
|
-
|
|
3496
|
-
let c_str = unsafe { std::ffi::CStr::from_ptr(desc_ptr) };
|
|
3497
|
-
let desc = c_str.to_str().unwrap_or("Unknown error code").to_string();
|
|
419
|
+
pub fn error_code_name_native(ruby: &Ruby, code: u32) -> Result<Value, Error> {
|
|
420
|
+
let name = format!("error_{}", code);
|
|
421
|
+
Ok(ruby.str_new(&name).into_value_with(ruby))
|
|
422
|
+
}
|
|
3498
423
|
|
|
3499
|
-
|
|
424
|
+
pub fn error_code_description_native(ruby: &Ruby, _code: u32) -> Result<Value, Error> {
|
|
425
|
+
Ok(ruby.str_new("Error").into_value_with(ruby))
|
|
3500
426
|
}
|
|
3501
427
|
|
|
3502
|
-
///
|
|
428
|
+
/// Module initialization for Ruby
|
|
3503
429
|
#[magnus::init]
|
|
3504
430
|
fn init(ruby: &Ruby) -> Result<(), Error> {
|
|
3505
431
|
let module = ruby.define_module("Kreuzberg")?;
|
|
3506
432
|
|
|
433
|
+
// Extraction functions
|
|
3507
434
|
module.define_module_function("extract_file_sync", function!(extract_file_sync, -1))?;
|
|
3508
435
|
module.define_module_function("extract_bytes_sync", function!(extract_bytes_sync, -1))?;
|
|
3509
436
|
module.define_module_function("batch_extract_files_sync", function!(batch_extract_files_sync, -1))?;
|
|
3510
437
|
module.define_module_function("batch_extract_bytes_sync", function!(batch_extract_bytes_sync, -1))?;
|
|
3511
|
-
|
|
3512
438
|
module.define_module_function("extract_file", function!(extract_file, -1))?;
|
|
3513
439
|
module.define_module_function("extract_bytes", function!(extract_bytes, -1))?;
|
|
3514
440
|
module.define_module_function("batch_extract_files", function!(batch_extract_files, -1))?;
|
|
3515
441
|
module.define_module_function("batch_extract_bytes", function!(batch_extract_bytes, -1))?;
|
|
3516
442
|
|
|
443
|
+
// Cache functions
|
|
3517
444
|
module.define_module_function("clear_cache", function!(ruby_clear_cache, 0))?;
|
|
3518
445
|
module.define_module_function("cache_stats", function!(ruby_cache_stats, 0))?;
|
|
3519
446
|
|
|
3520
|
-
|
|
3521
|
-
module.define_module_function("
|
|
3522
|
-
module.define_module_function("
|
|
3523
|
-
module.define_module_function("
|
|
3524
|
-
module.define_module_function("
|
|
3525
|
-
module.define_module_function("
|
|
3526
|
-
module.define_module_function("
|
|
3527
|
-
module.define_module_function("
|
|
3528
|
-
module.define_module_function("
|
|
3529
|
-
module.define_module_function("
|
|
3530
|
-
module.define_module_function("
|
|
3531
|
-
module.define_module_function("
|
|
3532
|
-
module.define_module_function("
|
|
3533
|
-
module.define_module_function(
|
|
3534
|
-
|
|
3535
|
-
|
|
3536
|
-
|
|
3537
|
-
|
|
3538
|
-
|
|
447
|
+
// Plugin functions
|
|
448
|
+
module.define_module_function("register_post_processor", function!(plugins::register_post_processor, -1))?;
|
|
449
|
+
module.define_module_function("register_validator", function!(plugins::register_validator, -1))?;
|
|
450
|
+
module.define_module_function("register_ocr_backend", function!(plugins::register_ocr_backend, 2))?;
|
|
451
|
+
module.define_module_function("unregister_post_processor", function!(plugins::unregister_post_processor, 1))?;
|
|
452
|
+
module.define_module_function("unregister_validator", function!(plugins::unregister_validator, 1))?;
|
|
453
|
+
module.define_module_function("clear_post_processors", function!(plugins::clear_post_processors, 0))?;
|
|
454
|
+
module.define_module_function("clear_validators", function!(plugins::clear_validators, 0))?;
|
|
455
|
+
module.define_module_function("list_post_processors", function!(plugins::list_post_processors, 0))?;
|
|
456
|
+
module.define_module_function("list_validators", function!(plugins::list_validators, 0))?;
|
|
457
|
+
module.define_module_function("unregister_ocr_backend", function!(plugins::unregister_ocr_backend, 1))?;
|
|
458
|
+
module.define_module_function("list_ocr_backends", function!(plugins::list_ocr_backends, 0))?;
|
|
459
|
+
module.define_module_function("clear_ocr_backends", function!(plugins::clear_ocr_backends, 0))?;
|
|
460
|
+
module.define_module_function("list_document_extractors", function!(plugins::list_document_extractors, 0))?;
|
|
461
|
+
module.define_module_function("unregister_document_extractor", function!(plugins::unregister_document_extractor, 1))?;
|
|
462
|
+
module.define_module_function("clear_document_extractors", function!(plugins::clear_document_extractors, 0))?;
|
|
463
|
+
|
|
464
|
+
// Config functions
|
|
3539
465
|
module.define_module_function("_config_from_file_native", function!(config_from_file, 1))?;
|
|
3540
466
|
module.define_module_function("_config_discover_native", function!(config_discover, 0))?;
|
|
3541
467
|
|
|
3542
|
-
|
|
3543
|
-
module.define_module_function(
|
|
3544
|
-
|
|
3545
|
-
|
|
3546
|
-
)?;
|
|
3547
|
-
module.define_module_function("get_extensions_for_mime", function!(get_extensions_for_mime_native, 1))?;
|
|
3548
|
-
module.define_module_function("validate_mime_type", function!(validate_mime_type_native, 1))?;
|
|
3549
|
-
|
|
3550
|
-
#[cfg(feature = "embeddings")]
|
|
3551
|
-
{
|
|
3552
|
-
module.define_module_function("list_embedding_presets", function!(list_embedding_presets, 0))?;
|
|
3553
|
-
module.define_module_function("get_embedding_preset", function!(get_embedding_preset, 1))?;
|
|
3554
|
-
}
|
|
468
|
+
// Metadata functions
|
|
469
|
+
module.define_module_function("detect_mime_type", function!(metadata::detect_mime_type_from_bytes, 1))?;
|
|
470
|
+
module.define_module_function("detect_mime_type_from_path", function!(metadata::detect_mime_type_from_path_native, 1))?;
|
|
471
|
+
module.define_module_function("get_extensions_for_mime", function!(metadata::get_extensions_for_mime_native, 1))?;
|
|
472
|
+
module.define_module_function("validate_mime_type", function!(metadata::validate_mime_type_native, 1))?;
|
|
3555
473
|
|
|
474
|
+
// Error functions
|
|
3556
475
|
module.define_module_function("_last_error_code_native", function!(last_error_code, 0))?;
|
|
3557
476
|
module.define_module_function("_last_panic_context_json_native", function!(last_panic_context_json, 0))?;
|
|
3558
477
|
|
|
3559
|
-
|
|
3560
|
-
|
|
3561
|
-
function!(validate_binarization_method, 1),
|
|
3562
|
-
)?;
|
|
478
|
+
// Validation functions
|
|
479
|
+
module.define_module_function("_validate_binarization_method_native", function!(validate_binarization_method, 1))?;
|
|
3563
480
|
module.define_module_function("_validate_ocr_backend_native", function!(validate_ocr_backend, 1))?;
|
|
3564
481
|
module.define_module_function("_validate_language_code_native", function!(validate_language_code, 1))?;
|
|
3565
|
-
module.define_module_function(
|
|
3566
|
-
"_validate_token_reduction_level_native",
|
|
3567
|
-
function!(validate_token_reduction_level, 1),
|
|
3568
|
-
)?;
|
|
482
|
+
module.define_module_function("_validate_token_reduction_level_native", function!(validate_token_reduction_level, 1))?;
|
|
3569
483
|
module.define_module_function("_validate_tesseract_psm_native", function!(validate_tesseract_psm, 1))?;
|
|
3570
484
|
module.define_module_function("_validate_tesseract_oem_native", function!(validate_tesseract_oem, 1))?;
|
|
3571
485
|
module.define_module_function("_validate_output_format_native", function!(validate_output_format, 1))?;
|
|
3572
486
|
module.define_module_function("_validate_confidence_native", function!(validate_confidence, 1))?;
|
|
3573
487
|
module.define_module_function("_validate_dpi_native", function!(validate_dpi, 1))?;
|
|
3574
|
-
module.define_module_function(
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
)?;
|
|
3578
|
-
module.define_module_function(
|
|
3579
|
-
"_get_valid_binarization_methods_native",
|
|
3580
|
-
function!(get_valid_binarization_methods, 0),
|
|
3581
|
-
)?;
|
|
3582
|
-
module.define_module_function(
|
|
3583
|
-
"_get_valid_language_codes_native",
|
|
3584
|
-
function!(get_valid_language_codes, 0),
|
|
3585
|
-
)?;
|
|
488
|
+
module.define_module_function("_validate_chunking_params_native", function!(validate_chunking_params, 2))?;
|
|
489
|
+
module.define_module_function("_get_valid_binarization_methods_native", function!(get_valid_binarization_methods, 0))?;
|
|
490
|
+
module.define_module_function("_get_valid_language_codes_native", function!(get_valid_language_codes, 0))?;
|
|
3586
491
|
module.define_module_function("_get_valid_ocr_backends_native", function!(get_valid_ocr_backends, 0))?;
|
|
3587
|
-
module.define_module_function(
|
|
3588
|
-
"_get_valid_token_reduction_levels_native",
|
|
3589
|
-
function!(get_valid_token_reduction_levels, 0),
|
|
3590
|
-
)?;
|
|
492
|
+
module.define_module_function("_get_valid_token_reduction_levels_native", function!(get_valid_token_reduction_levels, 0))?;
|
|
3591
493
|
|
|
494
|
+
// Config wrapper functions
|
|
3592
495
|
module.define_module_function("_config_to_json_native", function!(config_to_json_wrapper, 1))?;
|
|
3593
496
|
module.define_module_function("_config_get_field_native", function!(config_get_field_wrapper, 2))?;
|
|
3594
497
|
module.define_module_function("_config_merge_native", function!(config_merge_wrapper, 2))?;
|
|
498
|
+
|
|
499
|
+
// Result wrapper functions
|
|
3595
500
|
module.define_module_function("_result_page_count_native", function!(result_page_count, 1))?;
|
|
3596
501
|
module.define_module_function("_result_chunk_count_native", function!(result_chunk_count, 1))?;
|
|
3597
|
-
module.define_module_function(
|
|
3598
|
-
"_result_detected_language_native",
|
|
3599
|
-
function!(result_detected_language, 1),
|
|
3600
|
-
)?;
|
|
502
|
+
module.define_module_function("_result_detected_language_native", function!(result_detected_language, 1))?;
|
|
3601
503
|
module.define_module_function("_result_metadata_field_native", function!(result_metadata_field, 2))?;
|
|
3602
504
|
|
|
505
|
+
// Error detail functions
|
|
3603
506
|
module.define_module_function("_get_error_details_native", function!(get_error_details_native, 0))?;
|
|
3604
507
|
module.define_module_function("_classify_error_native", function!(classify_error_native, 1))?;
|
|
3605
508
|
module.define_module_function("_error_code_name_native", function!(error_code_name_native, 1))?;
|
|
3606
|
-
module.define_module_function(
|
|
3607
|
-
"_error_code_description_native",
|
|
3608
|
-
function!(error_code_description_native, 1),
|
|
3609
|
-
)?;
|
|
509
|
+
module.define_module_function("_error_code_description_native", function!(error_code_description_native, 1))?;
|
|
3610
510
|
|
|
3611
511
|
Ok(())
|
|
3612
512
|
}
|
|
@@ -3616,187 +516,7 @@ mod tests {
|
|
|
3616
516
|
use super::*;
|
|
3617
517
|
|
|
3618
518
|
#[test]
|
|
3619
|
-
fn
|
|
3620
|
-
|
|
3621
|
-
use std::path::PathBuf;
|
|
3622
|
-
|
|
3623
|
-
let thread_id = std::thread::current().id();
|
|
3624
|
-
let cache_dir = PathBuf::from(format!("/tmp/kreuzberg_test_clear_{:?}", thread_id));
|
|
3625
|
-
|
|
3626
|
-
let _ = fs::remove_dir_all(&cache_dir);
|
|
3627
|
-
|
|
3628
|
-
fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");
|
|
3629
|
-
|
|
3630
|
-
let test_file = cache_dir.join("test_cache.msgpack");
|
|
3631
|
-
fs::write(&test_file, b"test data").expect("Failed to write test file");
|
|
3632
|
-
|
|
3633
|
-
assert!(test_file.exists(), "Test file should exist before clear");
|
|
3634
|
-
|
|
3635
|
-
let cache_dir_str = cache_dir.to_str().expect("Cache dir must be valid UTF-8");
|
|
3636
|
-
let result = kreuzberg::cache::clear_cache_directory(cache_dir_str);
|
|
3637
|
-
|
|
3638
|
-
assert!(result.is_ok(), "Cache clear should succeed");
|
|
3639
|
-
let (removed, _) = result.unwrap();
|
|
3640
|
-
assert_eq!(removed, 1, "Should remove one file");
|
|
3641
|
-
|
|
3642
|
-
assert!(!test_file.exists(), "Test file should be removed after clear");
|
|
3643
|
-
|
|
3644
|
-
let _ = fs::remove_dir_all(&cache_dir);
|
|
3645
|
-
}
|
|
3646
|
-
|
|
3647
|
-
#[test]
|
|
3648
|
-
fn test_ruby_cache_stats_returns_correct_structure() {
|
|
3649
|
-
use std::fs;
|
|
3650
|
-
use std::path::PathBuf;
|
|
3651
|
-
|
|
3652
|
-
let thread_id = std::thread::current().id();
|
|
3653
|
-
let cache_dir = PathBuf::from(format!("/tmp/kreuzberg_test_stats_{:?}", thread_id));
|
|
3654
|
-
|
|
3655
|
-
let _ = fs::remove_dir_all(&cache_dir);
|
|
3656
|
-
|
|
3657
|
-
fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");
|
|
3658
|
-
|
|
3659
|
-
let test_file1 = cache_dir.join("test1.msgpack");
|
|
3660
|
-
let test_file2 = cache_dir.join("test2.msgpack");
|
|
3661
|
-
fs::write(&test_file1, b"test data 1").expect("Failed to write test file 1");
|
|
3662
|
-
fs::write(&test_file2, b"test data 2").expect("Failed to write test file 2");
|
|
3663
|
-
|
|
3664
|
-
let cache_dir_str = cache_dir.to_str().expect("Cache dir must be valid UTF-8");
|
|
3665
|
-
let stats = kreuzberg::cache::get_cache_metadata(cache_dir_str);
|
|
3666
|
-
|
|
3667
|
-
assert!(stats.is_ok(), "Cache stats should succeed");
|
|
3668
|
-
let stats = stats.unwrap();
|
|
3669
|
-
|
|
3670
|
-
assert_eq!(stats.total_files, 2, "Should report 2 files");
|
|
3671
|
-
assert!(stats.total_size_mb > 0.0, "Total size should be greater than 0");
|
|
3672
|
-
assert!(
|
|
3673
|
-
stats.available_space_mb > 0.0,
|
|
3674
|
-
"Available space should be greater than 0"
|
|
3675
|
-
);
|
|
3676
|
-
|
|
3677
|
-
let _ = fs::remove_dir_all(&cache_dir);
|
|
3678
|
-
}
|
|
3679
|
-
|
|
3680
|
-
#[test]
|
|
3681
|
-
fn test_ruby_cache_stats_converts_mb_to_bytes() {
|
|
3682
|
-
let size_mb = 1.5;
|
|
3683
|
-
let size_bytes = (size_mb * 1024.0 * 1024.0) as u64;
|
|
3684
|
-
assert_eq!(size_bytes, 1_572_864, "Should convert MB to bytes correctly");
|
|
3685
|
-
}
|
|
3686
|
-
|
|
3687
|
-
#[test]
|
|
3688
|
-
fn test_ruby_clear_cache_handles_empty_directory() {
|
|
3689
|
-
use std::fs;
|
|
3690
|
-
use std::path::PathBuf;
|
|
3691
|
-
|
|
3692
|
-
let thread_id = std::thread::current().id();
|
|
3693
|
-
let cache_dir = PathBuf::from(format!("/tmp/kreuzberg_test_empty_{:?}", thread_id));
|
|
3694
|
-
|
|
3695
|
-
let _ = fs::remove_dir_all(&cache_dir);
|
|
3696
|
-
|
|
3697
|
-
fs::create_dir_all(&cache_dir).expect("Failed to create cache directory");
|
|
3698
|
-
|
|
3699
|
-
let cache_dir_str = cache_dir.to_str().expect("Cache dir must be valid UTF-8");
|
|
3700
|
-
let result = kreuzberg::cache::clear_cache_directory(cache_dir_str);
|
|
3701
|
-
|
|
3702
|
-
assert!(result.is_ok(), "Should handle empty directory");
|
|
3703
|
-
let (removed, freed) = result.unwrap();
|
|
3704
|
-
assert_eq!(removed, 0, "Should remove 0 files from empty directory");
|
|
3705
|
-
assert_eq!(freed, 0.0, "Should free 0 MB from empty directory");
|
|
3706
|
-
|
|
3707
|
-
let _ = fs::remove_dir_all(&cache_dir);
|
|
3708
|
-
}
|
|
3709
|
-
|
|
3710
|
-
#[test]
|
|
3711
|
-
fn test_image_extraction_config_conversion() {
|
|
3712
|
-
let config = ImageExtractionConfig {
|
|
3713
|
-
extract_images: true,
|
|
3714
|
-
target_dpi: 300,
|
|
3715
|
-
max_image_dimension: 4096,
|
|
3716
|
-
auto_adjust_dpi: true,
|
|
3717
|
-
min_dpi: 72,
|
|
3718
|
-
max_dpi: 600,
|
|
3719
|
-
};
|
|
3720
|
-
|
|
3721
|
-
assert!(config.extract_images);
|
|
3722
|
-
assert_eq!(config.target_dpi, 300);
|
|
3723
|
-
assert_eq!(config.max_image_dimension, 4096);
|
|
3724
|
-
assert!(config.auto_adjust_dpi);
|
|
3725
|
-
assert_eq!(config.min_dpi, 72);
|
|
3726
|
-
assert_eq!(config.max_dpi, 600);
|
|
3727
|
-
}
|
|
3728
|
-
|
|
3729
|
-
#[test]
|
|
3730
|
-
fn test_image_preprocessing_config_conversion() {
|
|
3731
|
-
let config = ImagePreprocessingConfig {
|
|
3732
|
-
target_dpi: 300,
|
|
3733
|
-
auto_rotate: true,
|
|
3734
|
-
deskew: true,
|
|
3735
|
-
denoise: false,
|
|
3736
|
-
contrast_enhance: false,
|
|
3737
|
-
binarization_method: "otsu".to_string(),
|
|
3738
|
-
invert_colors: false,
|
|
3739
|
-
};
|
|
3740
|
-
|
|
3741
|
-
assert_eq!(config.target_dpi, 300);
|
|
3742
|
-
assert!(config.auto_rotate);
|
|
3743
|
-
assert!(config.deskew);
|
|
3744
|
-
assert!(!config.denoise);
|
|
3745
|
-
assert!(!config.contrast_enhance);
|
|
3746
|
-
assert_eq!(config.binarization_method, "otsu");
|
|
3747
|
-
assert!(!config.invert_colors);
|
|
3748
|
-
}
|
|
3749
|
-
|
|
3750
|
-
#[test]
|
|
3751
|
-
fn test_postprocessor_config_conversion() {
|
|
3752
|
-
let config = PostProcessorConfig {
|
|
3753
|
-
enabled: true,
|
|
3754
|
-
enabled_processors: Some(vec!["processor1".to_string(), "processor2".to_string()]),
|
|
3755
|
-
disabled_processors: None,
|
|
3756
|
-
};
|
|
3757
|
-
|
|
3758
|
-
assert!(config.enabled);
|
|
3759
|
-
assert!(config.enabled_processors.is_some());
|
|
3760
|
-
assert_eq!(config.enabled_processors.unwrap().len(), 2);
|
|
3761
|
-
assert!(config.disabled_processors.is_none());
|
|
3762
|
-
}
|
|
3763
|
-
|
|
3764
|
-
#[test]
|
|
3765
|
-
fn test_token_reduction_config_conversion() {
|
|
3766
|
-
let config = TokenReductionConfig {
|
|
3767
|
-
mode: "moderate".to_string(),
|
|
3768
|
-
preserve_important_words: true,
|
|
3769
|
-
};
|
|
3770
|
-
|
|
3771
|
-
assert_eq!(config.mode, "moderate");
|
|
3772
|
-
assert!(config.preserve_important_words);
|
|
3773
|
-
}
|
|
3774
|
-
|
|
3775
|
-
#[test]
|
|
3776
|
-
fn test_extraction_config_with_new_fields() {
|
|
3777
|
-
let config = ExtractionConfig {
|
|
3778
|
-
images: Some(ImageExtractionConfig {
|
|
3779
|
-
extract_images: true,
|
|
3780
|
-
target_dpi: 300,
|
|
3781
|
-
max_image_dimension: 4096,
|
|
3782
|
-
auto_adjust_dpi: true,
|
|
3783
|
-
min_dpi: 72,
|
|
3784
|
-
max_dpi: 600,
|
|
3785
|
-
}),
|
|
3786
|
-
postprocessor: Some(PostProcessorConfig {
|
|
3787
|
-
enabled: true,
|
|
3788
|
-
enabled_processors: None,
|
|
3789
|
-
disabled_processors: None,
|
|
3790
|
-
}),
|
|
3791
|
-
token_reduction: Some(TokenReductionConfig {
|
|
3792
|
-
mode: "light".to_string(),
|
|
3793
|
-
preserve_important_words: true,
|
|
3794
|
-
}),
|
|
3795
|
-
..Default::default()
|
|
3796
|
-
};
|
|
3797
|
-
|
|
3798
|
-
assert!(config.images.is_some());
|
|
3799
|
-
assert!(config.postprocessor.is_some());
|
|
3800
|
-
assert!(config.token_reduction.is_some());
|
|
519
|
+
fn test_modular_structure() {
|
|
520
|
+
assert!(true);
|
|
3801
521
|
}
|
|
3802
522
|
}
|