kreuzberg 4.0.8 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +2 -2
- data/README.md +1 -1
- data/ext/kreuzberg_rb/native/Cargo.lock +94 -98
- data/ext/kreuzberg_rb/native/Cargo.toml +4 -2
- data/ext/kreuzberg_rb/native/src/batch.rs +139 -0
- data/ext/kreuzberg_rb/native/src/config/mod.rs +10 -0
- data/ext/kreuzberg_rb/native/src/config/types.rs +1058 -0
- data/ext/kreuzberg_rb/native/src/error_handling.rs +125 -0
- data/ext/kreuzberg_rb/native/src/extraction.rs +79 -0
- data/ext/kreuzberg_rb/native/src/gc_guarded_value.rs +35 -0
- data/ext/kreuzberg_rb/native/src/helpers.rs +176 -0
- data/ext/kreuzberg_rb/native/src/lib.rs +342 -3622
- data/ext/kreuzberg_rb/native/src/metadata.rs +34 -0
- data/ext/kreuzberg_rb/native/src/plugins/mod.rs +92 -0
- data/ext/kreuzberg_rb/native/src/plugins/ocr_backend.rs +159 -0
- data/ext/kreuzberg_rb/native/src/plugins/post_processor.rs +126 -0
- data/ext/kreuzberg_rb/native/src/plugins/validator.rs +99 -0
- data/ext/kreuzberg_rb/native/src/result.rs +326 -0
- data/ext/kreuzberg_rb/native/src/validation.rs +4 -0
- data/lib/kreuzberg/config.rb +66 -0
- data/lib/kreuzberg/result.rb +107 -2
- data/lib/kreuzberg/types.rb +104 -0
- data/lib/kreuzberg/version.rb +1 -1
- data/lib/kreuzberg.rb +0 -4
- data/sig/kreuzberg.rbs +105 -1
- data/vendor/Cargo.toml +3 -3
- data/vendor/kreuzberg/Cargo.toml +4 -3
- data/vendor/kreuzberg/README.md +1 -1
- data/vendor/kreuzberg/src/api/config.rs +69 -0
- data/vendor/kreuzberg/src/api/handlers.rs +99 -2
- data/vendor/kreuzberg/src/api/mod.rs +14 -7
- data/vendor/kreuzberg/src/api/router.rs +214 -0
- data/vendor/kreuzberg/src/api/startup.rs +243 -0
- data/vendor/kreuzberg/src/api/types.rs +78 -0
- data/vendor/kreuzberg/src/cache/cleanup.rs +277 -0
- data/vendor/kreuzberg/src/cache/core.rs +428 -0
- data/vendor/kreuzberg/src/cache/mod.rs +21 -843
- data/vendor/kreuzberg/src/cache/utilities.rs +156 -0
- data/vendor/kreuzberg/src/chunking/boundaries.rs +301 -0
- data/vendor/kreuzberg/src/chunking/builder.rs +294 -0
- data/vendor/kreuzberg/src/chunking/config.rs +52 -0
- data/vendor/kreuzberg/src/chunking/core.rs +1017 -0
- data/vendor/kreuzberg/src/chunking/mod.rs +14 -2211
- data/vendor/kreuzberg/src/chunking/processor.rs +10 -0
- data/vendor/kreuzberg/src/chunking/validation.rs +686 -0
- data/vendor/kreuzberg/src/core/config/extraction/core.rs +169 -0
- data/vendor/kreuzberg/src/core/config/extraction/env.rs +179 -0
- data/vendor/kreuzberg/src/core/config/extraction/loaders.rs +204 -0
- data/vendor/kreuzberg/src/core/config/extraction/mod.rs +42 -0
- data/vendor/kreuzberg/src/core/config/extraction/types.rs +93 -0
- data/vendor/kreuzberg/src/core/config/formats.rs +135 -0
- data/vendor/kreuzberg/src/core/config/mod.rs +20 -0
- data/vendor/kreuzberg/src/core/config/ocr.rs +73 -0
- data/vendor/kreuzberg/src/core/config/page.rs +57 -0
- data/vendor/kreuzberg/src/core/config/pdf.rs +111 -0
- data/vendor/kreuzberg/src/core/config/processing.rs +312 -0
- data/vendor/kreuzberg/src/core/config_validation/dependencies.rs +187 -0
- data/vendor/kreuzberg/src/core/config_validation/mod.rs +386 -0
- data/vendor/kreuzberg/src/core/config_validation/sections.rs +401 -0
- data/vendor/kreuzberg/src/core/extractor/batch.rs +246 -0
- data/vendor/kreuzberg/src/core/extractor/bytes.rs +116 -0
- data/vendor/kreuzberg/src/core/extractor/file.rs +240 -0
- data/vendor/kreuzberg/src/core/extractor/helpers.rs +71 -0
- data/vendor/kreuzberg/src/core/extractor/legacy.rs +62 -0
- data/vendor/kreuzberg/src/core/extractor/mod.rs +490 -0
- data/vendor/kreuzberg/src/core/extractor/sync.rs +208 -0
- data/vendor/kreuzberg/src/core/mod.rs +4 -1
- data/vendor/kreuzberg/src/core/pipeline/cache.rs +60 -0
- data/vendor/kreuzberg/src/core/pipeline/execution.rs +89 -0
- data/vendor/kreuzberg/src/core/pipeline/features.rs +108 -0
- data/vendor/kreuzberg/src/core/pipeline/format.rs +392 -0
- data/vendor/kreuzberg/src/core/pipeline/initialization.rs +67 -0
- data/vendor/kreuzberg/src/core/pipeline/mod.rs +135 -0
- data/vendor/kreuzberg/src/core/pipeline/tests.rs +975 -0
- data/vendor/kreuzberg/src/core/server_config/env.rs +90 -0
- data/vendor/kreuzberg/src/core/server_config/loader.rs +202 -0
- data/vendor/kreuzberg/src/core/server_config/mod.rs +380 -0
- data/vendor/kreuzberg/src/core/server_config/tests/basic_tests.rs +124 -0
- data/vendor/kreuzberg/src/core/server_config/tests/env_tests.rs +216 -0
- data/vendor/kreuzberg/src/core/server_config/tests/file_loading_tests.rs +341 -0
- data/vendor/kreuzberg/src/core/server_config/tests/mod.rs +5 -0
- data/vendor/kreuzberg/src/core/server_config/validation.rs +17 -0
- data/vendor/kreuzberg/src/embeddings.rs +136 -13
- data/vendor/kreuzberg/src/extraction/{archive.rs → archive/mod.rs} +45 -239
- data/vendor/kreuzberg/src/extraction/archive/sevenz.rs +98 -0
- data/vendor/kreuzberg/src/extraction/archive/tar.rs +118 -0
- data/vendor/kreuzberg/src/extraction/archive/zip.rs +101 -0
- data/vendor/kreuzberg/src/extraction/html/converter.rs +592 -0
- data/vendor/kreuzberg/src/extraction/html/image_handling.rs +95 -0
- data/vendor/kreuzberg/src/extraction/html/mod.rs +53 -0
- data/vendor/kreuzberg/src/extraction/html/processor.rs +659 -0
- data/vendor/kreuzberg/src/extraction/html/stack_management.rs +103 -0
- data/vendor/kreuzberg/src/extraction/html/types.rs +28 -0
- data/vendor/kreuzberg/src/extraction/mod.rs +6 -2
- data/vendor/kreuzberg/src/extraction/pptx/container.rs +159 -0
- data/vendor/kreuzberg/src/extraction/pptx/content_builder.rs +168 -0
- data/vendor/kreuzberg/src/extraction/pptx/elements.rs +132 -0
- data/vendor/kreuzberg/src/extraction/pptx/image_handling.rs +57 -0
- data/vendor/kreuzberg/src/extraction/pptx/metadata.rs +160 -0
- data/vendor/kreuzberg/src/extraction/pptx/mod.rs +558 -0
- data/vendor/kreuzberg/src/extraction/pptx/parser.rs +379 -0
- data/vendor/kreuzberg/src/extraction/transform/content.rs +205 -0
- data/vendor/kreuzberg/src/extraction/transform/elements.rs +211 -0
- data/vendor/kreuzberg/src/extraction/transform/mod.rs +480 -0
- data/vendor/kreuzberg/src/extraction/transform/types.rs +27 -0
- data/vendor/kreuzberg/src/extractors/archive.rs +2 -0
- data/vendor/kreuzberg/src/extractors/bibtex.rs +2 -0
- data/vendor/kreuzberg/src/extractors/djot_format/attributes.rs +134 -0
- data/vendor/kreuzberg/src/extractors/djot_format/conversion.rs +223 -0
- data/vendor/kreuzberg/src/extractors/djot_format/extractor.rs +172 -0
- data/vendor/kreuzberg/src/extractors/djot_format/mod.rs +24 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/block_handlers.rs +271 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/content_extraction.rs +257 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/event_handlers.rs +101 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/inline_handlers.rs +201 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/mod.rs +16 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/state.rs +78 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/table_extraction.rs +68 -0
- data/vendor/kreuzberg/src/extractors/djot_format/parsing/text_extraction.rs +61 -0
- data/vendor/kreuzberg/src/extractors/djot_format/rendering.rs +452 -0
- data/vendor/kreuzberg/src/extractors/docbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/docx.rs +12 -1
- data/vendor/kreuzberg/src/extractors/email.rs +2 -0
- data/vendor/kreuzberg/src/extractors/epub/content.rs +333 -0
- data/vendor/kreuzberg/src/extractors/epub/metadata.rs +137 -0
- data/vendor/kreuzberg/src/extractors/epub/mod.rs +186 -0
- data/vendor/kreuzberg/src/extractors/epub/parsing.rs +86 -0
- data/vendor/kreuzberg/src/extractors/excel.rs +4 -0
- data/vendor/kreuzberg/src/extractors/fictionbook.rs +2 -0
- data/vendor/kreuzberg/src/extractors/frontmatter_utils.rs +466 -0
- data/vendor/kreuzberg/src/extractors/html.rs +80 -8
- data/vendor/kreuzberg/src/extractors/image.rs +8 -1
- data/vendor/kreuzberg/src/extractors/jats/elements.rs +350 -0
- data/vendor/kreuzberg/src/extractors/jats/metadata.rs +21 -0
- data/vendor/kreuzberg/src/extractors/{jats.rs → jats/mod.rs} +10 -412
- data/vendor/kreuzberg/src/extractors/jats/parser.rs +52 -0
- data/vendor/kreuzberg/src/extractors/jupyter.rs +2 -0
- data/vendor/kreuzberg/src/extractors/latex/commands.rs +93 -0
- data/vendor/kreuzberg/src/extractors/latex/environments.rs +157 -0
- data/vendor/kreuzberg/src/extractors/latex/metadata.rs +27 -0
- data/vendor/kreuzberg/src/extractors/latex/mod.rs +146 -0
- data/vendor/kreuzberg/src/extractors/latex/parser.rs +231 -0
- data/vendor/kreuzberg/src/extractors/latex/utilities.rs +126 -0
- data/vendor/kreuzberg/src/extractors/markdown.rs +39 -162
- data/vendor/kreuzberg/src/extractors/mod.rs +9 -1
- data/vendor/kreuzberg/src/extractors/odt.rs +2 -0
- data/vendor/kreuzberg/src/extractors/opml/core.rs +165 -0
- data/vendor/kreuzberg/src/extractors/opml/mod.rs +31 -0
- data/vendor/kreuzberg/src/extractors/opml/parser.rs +479 -0
- data/vendor/kreuzberg/src/extractors/orgmode.rs +2 -0
- data/vendor/kreuzberg/src/extractors/pdf/extraction.rs +106 -0
- data/vendor/kreuzberg/src/extractors/{pdf.rs → pdf/mod.rs} +25 -324
- data/vendor/kreuzberg/src/extractors/pdf/ocr.rs +214 -0
- data/vendor/kreuzberg/src/extractors/pdf/pages.rs +51 -0
- data/vendor/kreuzberg/src/extractors/pptx.rs +9 -2
- data/vendor/kreuzberg/src/extractors/rst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/rtf/encoding.rs +116 -0
- data/vendor/kreuzberg/src/extractors/rtf/formatting.rs +24 -0
- data/vendor/kreuzberg/src/extractors/rtf/images.rs +72 -0
- data/vendor/kreuzberg/src/extractors/rtf/metadata.rs +216 -0
- data/vendor/kreuzberg/src/extractors/rtf/mod.rs +142 -0
- data/vendor/kreuzberg/src/extractors/rtf/parser.rs +259 -0
- data/vendor/kreuzberg/src/extractors/rtf/tables.rs +83 -0
- data/vendor/kreuzberg/src/extractors/structured.rs +2 -0
- data/vendor/kreuzberg/src/extractors/text.rs +4 -0
- data/vendor/kreuzberg/src/extractors/typst.rs +2 -0
- data/vendor/kreuzberg/src/extractors/xml.rs +2 -0
- data/vendor/kreuzberg/src/keywords/processor.rs +14 -0
- data/vendor/kreuzberg/src/language_detection/processor.rs +10 -0
- data/vendor/kreuzberg/src/lib.rs +2 -2
- data/vendor/kreuzberg/src/mcp/errors.rs +312 -0
- data/vendor/kreuzberg/src/mcp/format.rs +211 -0
- data/vendor/kreuzberg/src/mcp/mod.rs +9 -3
- data/vendor/kreuzberg/src/mcp/params.rs +196 -0
- data/vendor/kreuzberg/src/mcp/server.rs +39 -1438
- data/vendor/kreuzberg/src/mcp/tools/cache.rs +179 -0
- data/vendor/kreuzberg/src/mcp/tools/extraction.rs +403 -0
- data/vendor/kreuzberg/src/mcp/tools/mime.rs +150 -0
- data/vendor/kreuzberg/src/mcp/tools/mod.rs +11 -0
- data/vendor/kreuzberg/src/ocr/backends/easyocr.rs +96 -0
- data/vendor/kreuzberg/src/ocr/backends/mod.rs +7 -0
- data/vendor/kreuzberg/src/ocr/backends/paddleocr.rs +27 -0
- data/vendor/kreuzberg/src/ocr/backends/tesseract.rs +134 -0
- data/vendor/kreuzberg/src/ocr/hocr.rs +60 -16
- data/vendor/kreuzberg/src/ocr/language_registry.rs +11 -235
- data/vendor/kreuzberg/src/ocr/mod.rs +1 -0
- data/vendor/kreuzberg/src/ocr/processor/config.rs +203 -0
- data/vendor/kreuzberg/src/ocr/processor/execution.rs +494 -0
- data/vendor/kreuzberg/src/ocr/processor/mod.rs +265 -0
- data/vendor/kreuzberg/src/ocr/processor/validation.rs +145 -0
- data/vendor/kreuzberg/src/ocr/tesseract_backend.rs +41 -24
- data/vendor/kreuzberg/src/pdf/bindings.rs +21 -8
- data/vendor/kreuzberg/src/pdf/hierarchy/bounding_box.rs +289 -0
- data/vendor/kreuzberg/src/pdf/hierarchy/clustering.rs +199 -0
- data/vendor/kreuzberg/src/pdf/{hierarchy.rs → hierarchy/extraction.rs} +6 -346
- data/vendor/kreuzberg/src/pdf/hierarchy/mod.rs +18 -0
- data/vendor/kreuzberg/src/plugins/extractor/mod.rs +319 -0
- data/vendor/kreuzberg/src/plugins/extractor/registry.rs +434 -0
- data/vendor/kreuzberg/src/plugins/extractor/trait.rs +391 -0
- data/vendor/kreuzberg/src/plugins/mod.rs +13 -0
- data/vendor/kreuzberg/src/plugins/ocr.rs +11 -0
- data/vendor/kreuzberg/src/plugins/processor/mod.rs +365 -0
- data/vendor/kreuzberg/src/plugins/processor/registry.rs +37 -0
- data/vendor/kreuzberg/src/plugins/processor/trait.rs +284 -0
- data/vendor/kreuzberg/src/plugins/registry/extractor.rs +416 -0
- data/vendor/kreuzberg/src/plugins/registry/mod.rs +116 -0
- data/vendor/kreuzberg/src/plugins/registry/ocr.rs +293 -0
- data/vendor/kreuzberg/src/plugins/registry/processor.rs +304 -0
- data/vendor/kreuzberg/src/plugins/registry/validator.rs +238 -0
- data/vendor/kreuzberg/src/plugins/validator/mod.rs +424 -0
- data/vendor/kreuzberg/src/plugins/validator/registry.rs +355 -0
- data/vendor/kreuzberg/src/plugins/validator/trait.rs +276 -0
- data/vendor/kreuzberg/src/stopwords/languages/asian.rs +40 -0
- data/vendor/kreuzberg/src/stopwords/languages/germanic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/mod.rs +10 -0
- data/vendor/kreuzberg/src/stopwords/languages/other.rs +44 -0
- data/vendor/kreuzberg/src/stopwords/languages/romance.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/languages/slavic.rs +36 -0
- data/vendor/kreuzberg/src/stopwords/mod.rs +7 -33
- data/vendor/kreuzberg/src/text/quality.rs +1 -1
- data/vendor/kreuzberg/src/text/quality_processor.rs +10 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/analysis.rs +238 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/mod.rs +8 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/punctuation.rs +54 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/reducer.rs +384 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/sentence_selection.rs +68 -0
- data/vendor/kreuzberg/src/text/token_reduction/core/word_filtering.rs +156 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/general.rs +377 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/html.rs +51 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters/markdown.rs +285 -0
- data/vendor/kreuzberg/src/text/token_reduction/filters.rs +131 -246
- data/vendor/kreuzberg/src/types/djot.rs +209 -0
- data/vendor/kreuzberg/src/types/extraction.rs +301 -0
- data/vendor/kreuzberg/src/types/formats.rs +443 -0
- data/vendor/kreuzberg/src/types/metadata.rs +560 -0
- data/vendor/kreuzberg/src/types/mod.rs +281 -0
- data/vendor/kreuzberg/src/types/page.rs +182 -0
- data/vendor/kreuzberg/src/types/serde_helpers.rs +132 -0
- data/vendor/kreuzberg/src/types/tables.rs +39 -0
- data/vendor/kreuzberg/src/utils/quality/heuristics.rs +58 -0
- data/vendor/kreuzberg/src/utils/{quality.rs → quality/mod.rs} +168 -489
- data/vendor/kreuzberg/src/utils/quality/patterns.rs +117 -0
- data/vendor/kreuzberg/src/utils/quality/scoring.rs +178 -0
- data/vendor/kreuzberg/src/utils/string_pool/buffer_pool.rs +325 -0
- data/vendor/kreuzberg/src/utils/string_pool/interned.rs +102 -0
- data/vendor/kreuzberg/src/utils/string_pool/language_pool.rs +119 -0
- data/vendor/kreuzberg/src/utils/string_pool/mime_pool.rs +235 -0
- data/vendor/kreuzberg/src/utils/string_pool/mod.rs +41 -0
- data/vendor/kreuzberg/tests/api_chunk.rs +313 -0
- data/vendor/kreuzberg/tests/api_embed.rs +6 -9
- data/vendor/kreuzberg/tests/batch_orchestration.rs +1 -0
- data/vendor/kreuzberg/tests/concurrency_stress.rs +7 -0
- data/vendor/kreuzberg/tests/core_integration.rs +1 -0
- data/vendor/kreuzberg/tests/docx_metadata_extraction_test.rs +130 -0
- data/vendor/kreuzberg/tests/epub_native_extractor_tests.rs +5 -14
- data/vendor/kreuzberg/tests/format_integration.rs +2 -0
- data/vendor/kreuzberg/tests/helpers/mod.rs +1 -0
- data/vendor/kreuzberg/tests/html_table_test.rs +11 -11
- data/vendor/kreuzberg/tests/ocr_configuration.rs +16 -0
- data/vendor/kreuzberg/tests/ocr_errors.rs +18 -0
- data/vendor/kreuzberg/tests/ocr_quality.rs +9 -0
- data/vendor/kreuzberg/tests/ocr_stress.rs +1 -0
- data/vendor/kreuzberg/tests/pipeline_integration.rs +50 -0
- data/vendor/kreuzberg/tests/plugin_ocr_backend_test.rs +13 -0
- data/vendor/kreuzberg/tests/plugin_system.rs +12 -0
- data/vendor/kreuzberg/tests/registry_integration_tests.rs +2 -0
- data/vendor/kreuzberg-ffi/Cargo.toml +2 -1
- data/vendor/kreuzberg-ffi/benches/result_view_benchmark.rs +2 -0
- data/vendor/kreuzberg-ffi/kreuzberg.h +347 -178
- data/vendor/kreuzberg-ffi/src/config/html.rs +318 -0
- data/vendor/kreuzberg-ffi/src/config/loader.rs +154 -0
- data/vendor/kreuzberg-ffi/src/config/merge.rs +104 -0
- data/vendor/kreuzberg-ffi/src/config/mod.rs +385 -0
- data/vendor/kreuzberg-ffi/src/config/parse.rs +91 -0
- data/vendor/kreuzberg-ffi/src/config/serialize.rs +118 -0
- data/vendor/kreuzberg-ffi/src/config_builder.rs +598 -0
- data/vendor/kreuzberg-ffi/src/error.rs +46 -14
- data/vendor/kreuzberg-ffi/src/helpers.rs +10 -0
- data/vendor/kreuzberg-ffi/src/html_options.rs +421 -0
- data/vendor/kreuzberg-ffi/src/lib.rs +16 -0
- data/vendor/kreuzberg-ffi/src/panic_shield.rs +11 -0
- data/vendor/kreuzberg-ffi/src/plugins/ocr_backend.rs +2 -0
- data/vendor/kreuzberg-ffi/src/result.rs +148 -122
- data/vendor/kreuzberg-ffi/src/result_view.rs +4 -0
- data/vendor/kreuzberg-tesseract/Cargo.toml +2 -2
- metadata +200 -28
- data/vendor/kreuzberg/src/api/server.rs +0 -518
- data/vendor/kreuzberg/src/core/config.rs +0 -1914
- data/vendor/kreuzberg/src/core/config_validation.rs +0 -949
- data/vendor/kreuzberg/src/core/extractor.rs +0 -1200
- data/vendor/kreuzberg/src/core/pipeline.rs +0 -1223
- data/vendor/kreuzberg/src/core/server_config.rs +0 -1220
- data/vendor/kreuzberg/src/extraction/html.rs +0 -1830
- data/vendor/kreuzberg/src/extraction/pptx.rs +0 -3102
- data/vendor/kreuzberg/src/extractors/epub.rs +0 -696
- data/vendor/kreuzberg/src/extractors/latex.rs +0 -653
- data/vendor/kreuzberg/src/extractors/opml.rs +0 -635
- data/vendor/kreuzberg/src/extractors/rtf.rs +0 -809
- data/vendor/kreuzberg/src/ocr/processor.rs +0 -858
- data/vendor/kreuzberg/src/plugins/extractor.rs +0 -1042
- data/vendor/kreuzberg/src/plugins/processor.rs +0 -650
- data/vendor/kreuzberg/src/plugins/registry.rs +0 -1339
- data/vendor/kreuzberg/src/plugins/validator.rs +0 -967
- data/vendor/kreuzberg/src/text/token_reduction/core.rs +0 -832
- data/vendor/kreuzberg/src/types.rs +0 -1713
- data/vendor/kreuzberg/src/utils/string_pool.rs +0 -762
- data/vendor/kreuzberg-ffi/src/config.rs +0 -1341
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
//! Error handling and conversion to Ruby exceptions
|
|
2
|
+
//!
|
|
3
|
+
//! Provides error conversion from Kreuzberg errors to Magnus Ruby exceptions,
|
|
4
|
+
//! panic context retrieval, and error code utilities.
|
|
5
|
+
|
|
6
|
+
use kreuzberg::KreuzbergError;
|
|
7
|
+
use magnus::{Error, exception::ExceptionClass, Ruby};
|
|
8
|
+
use std::ffi::CStr;
|
|
9
|
+
|
|
10
|
+
pub use kreuzberg_ffi::{
|
|
11
|
+
kreuzberg_free_string, kreuzberg_last_error_code,
|
|
12
|
+
kreuzberg_last_panic_context,
|
|
13
|
+
};
|
|
14
|
+
|
|
15
|
+
/// Retrieve panic context from FFI if available
|
|
16
|
+
pub fn get_panic_context() -> Option<String> {
|
|
17
|
+
unsafe {
|
|
18
|
+
let ctx_ptr = kreuzberg_last_panic_context();
|
|
19
|
+
if ctx_ptr.is_null() {
|
|
20
|
+
return None;
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
let c_str = CStr::from_ptr(ctx_ptr);
|
|
24
|
+
let context = c_str.to_string_lossy().to_string();
|
|
25
|
+
kreuzberg_free_string(ctx_ptr as *mut std::ffi::c_char);
|
|
26
|
+
|
|
27
|
+
if context.is_empty() { None } else { Some(context) }
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
/// Retrieve error code from FFI
|
|
32
|
+
pub fn get_error_code() -> i32 {
|
|
33
|
+
unsafe { kreuzberg_last_error_code() }
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Convert Kreuzberg errors to Ruby exceptions
|
|
37
|
+
pub fn kreuzberg_error(err: KreuzbergError) -> Error {
|
|
38
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
39
|
+
|
|
40
|
+
let fetch_error_class = |name: &str| -> Option<ExceptionClass> {
|
|
41
|
+
ruby.eval::<ExceptionClass>(&format!("Kreuzberg::Errors::{}", name))
|
|
42
|
+
.ok()
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
match err {
|
|
46
|
+
KreuzbergError::Validation { message, .. } => {
|
|
47
|
+
if let Some(class) = fetch_error_class("ValidationError") {
|
|
48
|
+
Error::new(class, message)
|
|
49
|
+
} else {
|
|
50
|
+
Error::new(ruby.exception_arg_error(), message)
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
KreuzbergError::Parsing { message, .. } => {
|
|
54
|
+
if let Some(class) = fetch_error_class("ParsingError") {
|
|
55
|
+
Error::new(class, message)
|
|
56
|
+
} else {
|
|
57
|
+
Error::new(ruby.exception_runtime_error(), format!("ParsingError: {}", message))
|
|
58
|
+
}
|
|
59
|
+
}
|
|
60
|
+
KreuzbergError::Ocr { message, .. } => {
|
|
61
|
+
if let Some(class) = fetch_error_class("OCRError") {
|
|
62
|
+
Error::new(class, message)
|
|
63
|
+
} else {
|
|
64
|
+
Error::new(ruby.exception_runtime_error(), format!("OCRError: {}", message))
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
KreuzbergError::MissingDependency(message) => {
|
|
68
|
+
if let Some(class) = fetch_error_class("MissingDependencyError") {
|
|
69
|
+
Error::new(class, message)
|
|
70
|
+
} else {
|
|
71
|
+
Error::new(
|
|
72
|
+
ruby.exception_runtime_error(),
|
|
73
|
+
format!("MissingDependencyError: {}", message),
|
|
74
|
+
)
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
KreuzbergError::Plugin { message, plugin_name } => {
|
|
78
|
+
if let Some(class) = fetch_error_class("PluginError") {
|
|
79
|
+
Error::new(class, format!("{}: {}", plugin_name, message))
|
|
80
|
+
} else {
|
|
81
|
+
Error::new(
|
|
82
|
+
ruby.exception_runtime_error(),
|
|
83
|
+
format!("Plugin error in '{}': {}", plugin_name, message),
|
|
84
|
+
)
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
KreuzbergError::Io(err) => {
|
|
88
|
+
if let Some(class) = fetch_error_class("IOError") {
|
|
89
|
+
Error::new(class, err.to_string())
|
|
90
|
+
} else {
|
|
91
|
+
Error::new(ruby.exception_runtime_error(), format!("IO error: {}", err))
|
|
92
|
+
}
|
|
93
|
+
}
|
|
94
|
+
KreuzbergError::UnsupportedFormat(message) => {
|
|
95
|
+
if let Some(class) = fetch_error_class("UnsupportedFormatError") {
|
|
96
|
+
Error::new(class, message)
|
|
97
|
+
} else {
|
|
98
|
+
Error::new(
|
|
99
|
+
ruby.exception_runtime_error(),
|
|
100
|
+
format!("UnsupportedFormatError: {}", message),
|
|
101
|
+
)
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
other => Error::new(ruby.exception_runtime_error(), other.to_string()),
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/// Create a generic runtime error
|
|
109
|
+
pub fn runtime_error(message: impl Into<String>) -> Error {
|
|
110
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
111
|
+
Error::new(ruby.exception_runtime_error(), message.into())
|
|
112
|
+
}
|
|
113
|
+
|
|
114
|
+
/// Create a validation error (Kreuzberg::Errors::ValidationError)
|
|
115
|
+
pub fn validation_error(message: impl Into<String>) -> Error {
|
|
116
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
117
|
+
|
|
118
|
+
// Try to get the ValidationError class from Ruby
|
|
119
|
+
if let Ok(class) = ruby.eval::<ExceptionClass>("Kreuzberg::Errors::ValidationError") {
|
|
120
|
+
Error::new(class, message.into())
|
|
121
|
+
} else {
|
|
122
|
+
// Fall back to ArgumentError if the class doesn't exist
|
|
123
|
+
Error::new(ruby.exception_arg_error(), message.into())
|
|
124
|
+
}
|
|
125
|
+
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
//! File extraction functions
|
|
2
|
+
//!
|
|
3
|
+
//! Handles extraction from files and byte arrays (synchronous and asynchronous).
|
|
4
|
+
|
|
5
|
+
use crate::config::parse_extraction_config;
|
|
6
|
+
use crate::error_handling::kreuzberg_error;
|
|
7
|
+
use crate::result::extraction_result_to_ruby;
|
|
8
|
+
|
|
9
|
+
use magnus::{Error, RHash, RString, Ruby, Value, scan_args::scan_args};
|
|
10
|
+
|
|
11
|
+
/// Extract content from a file (synchronous)
|
|
12
|
+
pub fn extract_file_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
13
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
14
|
+
let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
|
|
15
|
+
let (path,) = args.required;
|
|
16
|
+
let (mime_type,) = args.optional;
|
|
17
|
+
let opts = Some(args.keywords);
|
|
18
|
+
|
|
19
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
20
|
+
|
|
21
|
+
let result = kreuzberg::extract_file_sync(&path, mime_type.as_deref(), &config).map_err(kreuzberg_error)?;
|
|
22
|
+
|
|
23
|
+
extraction_result_to_ruby(&ruby, result)
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
/// Extract content from bytes (synchronous)
|
|
27
|
+
pub fn extract_bytes_sync(args: &[Value]) -> Result<RHash, Error> {
|
|
28
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
29
|
+
let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
|
|
30
|
+
let (data, mime_type) = args.required;
|
|
31
|
+
let opts = Some(args.keywords);
|
|
32
|
+
|
|
33
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
34
|
+
|
|
35
|
+
let bytes = unsafe { data.as_slice() };
|
|
36
|
+
let result = kreuzberg::extract_bytes_sync(bytes, &mime_type, &config).map_err(kreuzberg_error)?;
|
|
37
|
+
|
|
38
|
+
extraction_result_to_ruby(&ruby, result)
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/// Extract content from a file (asynchronous)
|
|
42
|
+
pub fn extract_file(args: &[Value]) -> Result<RHash, Error> {
|
|
43
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
44
|
+
let args = scan_args::<(String,), (Option<String>,), (), (), RHash, ()>(args)?;
|
|
45
|
+
let (path,) = args.required;
|
|
46
|
+
let (mime_type,) = args.optional;
|
|
47
|
+
let opts = Some(args.keywords);
|
|
48
|
+
|
|
49
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
50
|
+
|
|
51
|
+
let runtime =
|
|
52
|
+
tokio::runtime::Runtime::new().map_err(|e| crate::error_handling::runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
53
|
+
|
|
54
|
+
let result = runtime
|
|
55
|
+
.block_on(async { kreuzberg::extract_file(&path, mime_type.as_deref(), &config).await })
|
|
56
|
+
.map_err(kreuzberg_error)?;
|
|
57
|
+
|
|
58
|
+
extraction_result_to_ruby(&ruby, result)
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/// Extract content from bytes (asynchronous)
|
|
62
|
+
pub fn extract_bytes(args: &[Value]) -> Result<RHash, Error> {
|
|
63
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
64
|
+
let args = scan_args::<(RString, String), (), (), (), RHash, ()>(args)?;
|
|
65
|
+
let (data, mime_type) = args.required;
|
|
66
|
+
let opts = Some(args.keywords);
|
|
67
|
+
|
|
68
|
+
let config = parse_extraction_config(&ruby, opts)?;
|
|
69
|
+
|
|
70
|
+
let runtime =
|
|
71
|
+
tokio::runtime::Runtime::new().map_err(|e| crate::error_handling::runtime_error(format!("Failed to create Tokio runtime: {}", e)))?;
|
|
72
|
+
|
|
73
|
+
let bytes = unsafe { data.as_slice() };
|
|
74
|
+
let result = runtime
|
|
75
|
+
.block_on(async { kreuzberg::extract_bytes(bytes, &mime_type, &config).await })
|
|
76
|
+
.map_err(kreuzberg_error)?;
|
|
77
|
+
|
|
78
|
+
extraction_result_to_ruby(&ruby, result)
|
|
79
|
+
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
//! GC-guarded Ruby value wrapper for plugin registrations
|
|
2
|
+
//!
|
|
3
|
+
//! Keeps Ruby values alive across plugin registrations by informing the Ruby GC.
|
|
4
|
+
|
|
5
|
+
use magnus::{Ruby, Value};
|
|
6
|
+
|
|
7
|
+
/// Keeps Ruby values alive across plugin registrations by informing the GC.
|
|
8
|
+
///
|
|
9
|
+
/// This prevents Ruby objects (like Procs) from being garbage collected while
|
|
10
|
+
/// they're being used as plugin callbacks.
|
|
11
|
+
pub struct GcGuardedValue {
|
|
12
|
+
value: Value,
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
impl GcGuardedValue {
|
|
16
|
+
/// Create a new GC-guarded value
|
|
17
|
+
pub fn new(value: Value) -> Self {
|
|
18
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
19
|
+
ruby.gc_register_address(&value);
|
|
20
|
+
Self { value }
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
/// Get the wrapped value
|
|
24
|
+
pub fn value(&self) -> Value {
|
|
25
|
+
self.value
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
impl Drop for GcGuardedValue {
|
|
30
|
+
fn drop(&mut self) {
|
|
31
|
+
if let Ok(ruby) = Ruby::get() {
|
|
32
|
+
ruby.gc_unregister_address(&self.value);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
//! Helper utilities for Ruby value conversion and manipulation
|
|
2
|
+
//!
|
|
3
|
+
//! Provides utilities for converting between Ruby and JSON values,
|
|
4
|
+
//! accessing keyword arguments, and managing cache directories.
|
|
5
|
+
|
|
6
|
+
use magnus::{Error, RArray, RHash, Ruby, Symbol, Value, TryConvert, IntoValue};
|
|
7
|
+
use magnus::value::ReprValue;
|
|
8
|
+
use std::fs;
|
|
9
|
+
use std::path::{Path, PathBuf};
|
|
10
|
+
|
|
11
|
+
use crate::error_handling::runtime_error;
|
|
12
|
+
|
|
13
|
+
/// Convert Ruby Symbol or String to Rust String
|
|
14
|
+
pub fn symbol_to_string(value: Value) -> Result<String, Error> {
|
|
15
|
+
if let Some(symbol) = Symbol::from_value(value) {
|
|
16
|
+
Ok(symbol.name()?.to_string())
|
|
17
|
+
} else {
|
|
18
|
+
String::try_convert(value)
|
|
19
|
+
}
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
/// Get keyword argument from hash (supports both symbol and string keys)
|
|
23
|
+
pub fn get_kw(ruby: &Ruby, hash: RHash, name: &str) -> Option<Value> {
|
|
24
|
+
hash.get(name).or_else(|| {
|
|
25
|
+
let sym = ruby.intern(name);
|
|
26
|
+
hash.get(sym)
|
|
27
|
+
})
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
/// Set a hash entry with a string key
|
|
31
|
+
pub fn set_hash_entry(_ruby: &Ruby, hash: &RHash, key: &str, value: Value) -> Result<(), Error> {
|
|
32
|
+
hash.aset(key, value)?;
|
|
33
|
+
Ok(())
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/// Convert serde_json Value to Ruby Value
|
|
37
|
+
pub fn json_value_to_ruby(ruby: &Ruby, value: &serde_json::Value) -> Result<Value, Error> {
|
|
38
|
+
Ok(match value {
|
|
39
|
+
serde_json::Value::Null => ruby.qnil().as_value(),
|
|
40
|
+
serde_json::Value::Bool(b) => {
|
|
41
|
+
if *b {
|
|
42
|
+
ruby.qtrue().as_value()
|
|
43
|
+
} else {
|
|
44
|
+
ruby.qfalse().as_value()
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
serde_json::Value::Number(num) => {
|
|
48
|
+
if let Some(i) = num.as_i64() {
|
|
49
|
+
ruby.integer_from_i64(i).into_value_with(ruby)
|
|
50
|
+
} else if let Some(u) = num.as_u64() {
|
|
51
|
+
ruby.integer_from_u64(u).into_value_with(ruby)
|
|
52
|
+
} else if let Some(f) = num.as_f64() {
|
|
53
|
+
ruby.float_from_f64(f).into_value_with(ruby)
|
|
54
|
+
} else {
|
|
55
|
+
ruby.qnil().as_value()
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
serde_json::Value::String(s) => ruby.str_new(s).into_value_with(ruby),
|
|
59
|
+
serde_json::Value::Array(items) => {
|
|
60
|
+
let ary = ruby.ary_new();
|
|
61
|
+
for item in items {
|
|
62
|
+
ary.push(json_value_to_ruby(ruby, item)?)?;
|
|
63
|
+
}
|
|
64
|
+
ary.into_value_with(ruby)
|
|
65
|
+
}
|
|
66
|
+
serde_json::Value::Object(map) => {
|
|
67
|
+
let hash = ruby.hash_new();
|
|
68
|
+
for (key, val) in map {
|
|
69
|
+
let key_value = ruby.str_new(key).into_value_with(ruby);
|
|
70
|
+
let val_value = json_value_to_ruby(ruby, val)?;
|
|
71
|
+
hash.aset(key_value, val_value)?;
|
|
72
|
+
}
|
|
73
|
+
hash.into_value_with(ruby)
|
|
74
|
+
}
|
|
75
|
+
})
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
/// Convert Ruby key (String or Symbol) to Rust String
|
|
79
|
+
pub fn ruby_key_to_string(value: Value) -> Result<String, Error> {
|
|
80
|
+
if let Ok(sym) = Symbol::try_convert(value) {
|
|
81
|
+
Ok(sym.name()?.to_string())
|
|
82
|
+
} else {
|
|
83
|
+
String::try_convert(value)
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
/// Convert Ruby Value to serde_json Value
|
|
88
|
+
pub fn ruby_value_to_json(value: Value) -> Result<serde_json::Value, Error> {
|
|
89
|
+
let ruby = Ruby::get().expect("Ruby not initialized");
|
|
90
|
+
|
|
91
|
+
if value.is_nil() {
|
|
92
|
+
return Ok(serde_json::Value::Null);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
if value.equal(ruby.qtrue())? {
|
|
96
|
+
return Ok(serde_json::Value::Bool(true));
|
|
97
|
+
}
|
|
98
|
+
|
|
99
|
+
if value.equal(ruby.qfalse())? {
|
|
100
|
+
return Ok(serde_json::Value::Bool(false));
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
if let Ok(integer) = i64::try_convert(value) {
|
|
104
|
+
return Ok(serde_json::Value::Number(integer.into()));
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
if let Ok(unsigned) = u64::try_convert(value) {
|
|
108
|
+
return Ok(serde_json::Value::Number(serde_json::Number::from(unsigned)));
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
if let Ok(float) = f64::try_convert(value)
|
|
112
|
+
&& let Some(num) = serde_json::Number::from_f64(float)
|
|
113
|
+
{
|
|
114
|
+
return Ok(serde_json::Value::Number(num));
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
if let Ok(sym) = Symbol::try_convert(value) {
|
|
118
|
+
return Ok(serde_json::Value::String(sym.name()?.to_string()));
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
if let Ok(string) = String::try_convert(value) {
|
|
122
|
+
return Ok(serde_json::Value::String(string));
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
if let Ok(array) = RArray::try_convert(value) {
|
|
126
|
+
let mut values = Vec::with_capacity(array.len());
|
|
127
|
+
for item in array.into_iter() {
|
|
128
|
+
values.push(ruby_value_to_json(item)?);
|
|
129
|
+
}
|
|
130
|
+
return Ok(serde_json::Value::Array(values));
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
if let Ok(hash) = RHash::try_convert(value) {
|
|
134
|
+
let mut map = serde_json::Map::new();
|
|
135
|
+
hash.foreach(|key: Value, val: Value| {
|
|
136
|
+
let key_string = ruby_key_to_string(key)?;
|
|
137
|
+
let json_value = ruby_value_to_json(val)?;
|
|
138
|
+
map.insert(key_string, json_value);
|
|
139
|
+
Ok(magnus::r_hash::ForEach::Continue)
|
|
140
|
+
})?;
|
|
141
|
+
|
|
142
|
+
return Ok(serde_json::Value::Object(map));
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
Err(runtime_error("Unsupported Ruby value for JSON conversion"))
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
/// Get the cache root directory
|
|
149
|
+
pub fn cache_root_dir() -> Result<PathBuf, Error> {
|
|
150
|
+
std::env::current_dir()
|
|
151
|
+
.map(|dir| dir.join(".kreuzberg"))
|
|
152
|
+
.map_err(|e| runtime_error(format!("Failed to get current directory: {}", e)))
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
/// Get all cache directories (root and subdirectories)
|
|
156
|
+
pub fn cache_directories(root: &Path) -> Result<Vec<PathBuf>, Error> {
|
|
157
|
+
if !root.exists() {
|
|
158
|
+
return Ok(vec![]);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
let mut dirs = vec![root.to_path_buf()];
|
|
162
|
+
let entries = fs::read_dir(root).map_err(|e| runtime_error(format!("Failed to read cache root: {}", e)))?;
|
|
163
|
+
|
|
164
|
+
for entry in entries {
|
|
165
|
+
let entry = entry.map_err(|e| runtime_error(format!("Failed to read cache directory entry: {}", e)))?;
|
|
166
|
+
if entry
|
|
167
|
+
.file_type()
|
|
168
|
+
.map_err(|e| runtime_error(format!("Failed to determine cache entry type: {}", e)))?
|
|
169
|
+
.is_dir()
|
|
170
|
+
{
|
|
171
|
+
dirs.push(entry.path());
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
Ok(dirs)
|
|
176
|
+
}
|